Commit b437d2d4 authored by Loren Merritt

faster lossless zigzag

parent 489555ed
......@@ -542,8 +542,12 @@ static void zigzag_scan_4x4ac_field( int16_t level[15], int16_t dct[4][4] )
int oe = x+y*FENC_STRIDE;\
int od = x+y*FDEC_STRIDE;\
level[i] = p_src[oe] - p_dst[od];\
p_dst[od] = p_src[oe];\
}
/* Copy a 4x4 block of bytes from p_src (row stride FENC_STRIDE) to p_dst
 * (row stride FDEC_STRIDE), moving one 32-bit word per row.
 * Expands to four complete statements, so it is invoked without a trailing
 * semicolon. Expects `p_src` and `p_dst` to be in scope at the point of use.
 * The final line deliberately has no line continuation, so the macro ends
 * here regardless of what follows it in the file.
 * NOTE(review): the word-sized accesses assume both pointers are suitably
 * aligned for uint32_t -- confirm against callers. */
#define COPY4x4\
    *(uint32_t*)(p_dst+0*FDEC_STRIDE) = *(const uint32_t*)(p_src+0*FENC_STRIDE);\
    *(uint32_t*)(p_dst+1*FDEC_STRIDE) = *(const uint32_t*)(p_src+1*FENC_STRIDE);\
    *(uint32_t*)(p_dst+2*FDEC_STRIDE) = *(const uint32_t*)(p_src+2*FENC_STRIDE);\
    *(uint32_t*)(p_dst+3*FDEC_STRIDE) = *(const uint32_t*)(p_src+3*FENC_STRIDE);
static void zigzag_sub_4x4_frame( int16_t level[16], const uint8_t *p_src, uint8_t *p_dst )
{
......@@ -551,6 +555,7 @@ static void zigzag_sub_4x4_frame( int16_t level[16], const uint8_t *p_src, uint8
ZIG( 4,1,1) ZIG( 5,0,2) ZIG( 6,0,3) ZIG( 7,1,2)
ZIG( 8,2,1) ZIG( 9,3,0) ZIG(10,3,1) ZIG(11,2,2)
ZIG(12,1,3) ZIG(13,2,3) ZIG(14,3,2) ZIG(15,3,3)
COPY4x4
}
static void zigzag_sub_4x4_field( int16_t level[16], const uint8_t *p_src, uint8_t *p_dst )
......@@ -559,6 +564,7 @@ static void zigzag_sub_4x4_field( int16_t level[16], const uint8_t *p_src, uint8
ZIG( 4,3,0) ZIG( 5,1,1) ZIG( 6,2,1) ZIG( 7,3,1)
ZIG( 8,0,2) ZIG( 9,1,2) ZIG(10,2,2) ZIG(11,3,2)
ZIG(12,0,3) ZIG(13,1,3) ZIG(14,2,3) ZIG(15,3,3)
COPY4x4
}
static void zigzag_sub_4x4ac_frame( int16_t level[15], const uint8_t *p_src, uint8_t *p_dst )
......@@ -567,6 +573,7 @@ static void zigzag_sub_4x4ac_frame( int16_t level[15], const uint8_t *p_src, uin
ZIG( 3,1,1) ZIG( 4,0,2) ZIG( 5,0,3) ZIG( 6,1,2)
ZIG( 7,2,1) ZIG( 8,3,0) ZIG( 9,3,1) ZIG(10,2,2)
ZIG(11,1,3) ZIG(12,2,3) ZIG(13,3,2) ZIG(14,3,3)
COPY4x4
}
static void zigzag_sub_4x4ac_field( int16_t level[15], const uint8_t *p_src, uint8_t *p_dst )
......@@ -575,9 +582,11 @@ static void zigzag_sub_4x4ac_field( int16_t level[15], const uint8_t *p_src, uin
ZIG( 3,3,0) ZIG( 4,1,1) ZIG( 5,2,1) ZIG( 6,3,1)
ZIG( 7,0,2) ZIG( 8,1,2) ZIG( 9,2,2) ZIG(10,3,2)
ZIG(11,0,3) ZIG(12,1,3) ZIG(13,2,3) ZIG(14,3,3)
COPY4x4
}
#undef ZIG
#undef COPY4x4
void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf, int b_interlaced )
{
......@@ -609,6 +618,11 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf, int b_interlaced )
pf->sub_4x4 = zigzag_sub_4x4_frame;
pf->sub_4x4ac = zigzag_sub_4x4ac_frame;
#ifdef HAVE_SSE3
if( cpu&X264_CPU_SSSE3 )
pf->sub_4x4 = x264_zigzag_sub_4x4_frame_ssse3;
#endif
#ifdef ARCH_PPC
if( cpu&X264_CPU_ALTIVEC )
{
......
......@@ -27,6 +27,7 @@
SECTION_RODATA
pw_1: times 8 dw 1
pw_32: times 8 dw 32
; Byte-shuffle mask for pshufb: output byte i takes input byte pb_zigzag4[i].
; x264_zigzag_sub_4x4_frame_ssse3 uses it to reorder the 16 pixels of a 4x4
; block, packed row by row (index = x + 4*y), into scan order.
pb_zigzag4: db 0,1,4,8,5,2,3,6,9,12,13,10,7,11,14,15
SECTION .text
......@@ -290,3 +291,43 @@ cglobal x264_zigzag_scan_4x4_field_mmxext, 2,3
mov [r0+12], r2d
RET
%ifdef HAVE_SSE3
;-----------------------------------------------------------------------------
; void x264_zigzag_sub_4x4_frame_ssse3( int16_t level[16], const uint8_t *src, uint8_t *dst )
;
; For a 4x4 pixel block: writes level[i] = src[zz[i]] - dst[zz[i]] for all 16
; pixels, where zz is the byte order stored in pb_zigzag4 and dst is read
; before it is modified, then leaves dst holding a copy of the src block.
; Registers as used below: r0 = level, r1 = src, r2 = dst (assumes the cglobal
; macro maps the C arguments to r0..r2 in order -- TODO confirm in x86inc).
; level is stored with movdqa, so it must be 16-byte aligned.
; NOTE(review): xmm6 and xmm7 are overwritten without being saved -- confirm
; this is acceptable for every target ABI.
;-----------------------------------------------------------------------------
cglobal x264_zigzag_sub_4x4_frame_ssse3, 3,3
; Load the four 4-byte rows of src (row stride FENC_STRIDE) into xmm0..xmm3.
movd xmm0, [r1+0*FENC_STRIDE]
movd xmm1, [r1+1*FENC_STRIDE]
movd xmm2, [r1+2*FENC_STRIDE]
movd xmm3, [r1+3*FENC_STRIDE]
; Load the four 4-byte rows of dst (row stride FDEC_STRIDE) into xmm4..xmm7.
; This must happen before the stores below overwrite dst.
movd xmm4, [r2+0*FDEC_STRIDE]
movd xmm5, [r2+1*FDEC_STRIDE]
movd xmm6, [r2+2*FDEC_STRIDE]
movd xmm7, [r2+3*FDEC_STRIDE]
; Copy the src rows over dst.
movd [r2+0*FDEC_STRIDE], xmm0
movd [r2+1*FDEC_STRIDE], xmm1
movd [r2+2*FDEC_STRIDE], xmm2
movd [r2+3*FDEC_STRIDE], xmm3
; r1 is no longer needed as the src pointer; presumably picgetgot loads the
; GOT base into it so that the [... GLOBAL] reference below works in PIC
; builds -- verify against the macro definition.
picgetgot r1
; Pack the four rows of each block into one register in row order:
; xmm0 = src rows 0..3 (16 bytes), xmm4 = old dst rows 0..3 (16 bytes).
punpckldq xmm0, xmm1
punpckldq xmm2, xmm3
punpckldq xmm4, xmm5
punpckldq xmm6, xmm7
movlhps xmm0, xmm2
movlhps xmm4, xmm6
; Reorder the 16 bytes of both blocks with the pb_zigzag4 shuffle mask.
movdqa xmm7, [pb_zigzag4 GLOBAL]
pshufb xmm0, xmm7
pshufb xmm4, xmm7
; Zero-extend bytes to 16-bit words: xmm0/xmm1 = low/high 8 src pixels,
; xmm4/xmm5 = low/high 8 old dst pixels.
pxor xmm6, xmm6
movdqa xmm1, xmm0
movdqa xmm5, xmm4
punpcklbw xmm0, xmm6
punpckhbw xmm1, xmm6
punpcklbw xmm4, xmm6
punpckhbw xmm5, xmm6
; level = src - old dst, as sixteen 16-bit words.
psubw xmm0, xmm4
psubw xmm1, xmm5
; Aligned stores: level[0..7] and level[8..15].
movdqa [r0], xmm0
movdqa [r0+16], xmm1
RET
%endif
......@@ -47,5 +47,6 @@ void x264_add8x8_idct8_sse2( uint8_t *dst, int16_t dct[8][8] );
void x264_add16x16_idct8_sse2( uint8_t *dst, int16_t dct[4][8][8] );
void x264_zigzag_scan_4x4_field_mmxext( int16_t level[16], int16_t dct[4][4] );
/* SSSE3 assembly routine: stores the 16 differences src - dst of a 4x4 block
 * into level in scan order and copies the src block over dst.
 * level is written with aligned 16-byte stores, so it must be 16-byte aligned. */
void x264_zigzag_sub_4x4_frame_ssse3( int16_t level[16], const uint8_t *src, uint8_t *dst );
#endif
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment