Commit ded3e28c authored by Fiona Glaser
Browse files

Faster 8x8dct+CAVLC interleave

Integrate array_non_zero with the CAVLC 8x8dct interleave function.
Roughly 1.5-2x faster than the original separate array_non_zero method.
parent 741e1f99
......@@ -608,12 +608,19 @@ static void zigzag_sub_8x8_field( int16_t level[64], const uint8_t *p_src, uint8
#undef ZIG
#undef COPY4x4
static void zigzag_interleave_8x8_cavlc( int16_t *dst, int16_t *src )
/* De-interleave 64 coefficients into four consecutive 16-entry lists and
 * record, per list, whether it holds any nonzero coefficient.
 *
 *   dst: receives the four lists back to back; list b occupies dst[16*b .. 16*b+15].
 *   src: 64 interleaved coefficients; list b takes every 4th entry starting at src[b].
 *   nnz: flag for list b is written to nnz[(b&1) + (b>>1)*8], i.e. offsets
 *        0, 1, 8 and 9. No other byte of nnz is touched.
 */
static void zigzag_interleave_8x8_cavlc( int16_t *dst, int16_t *src, uint8_t *nnz )
{
    int blk, k;
    for( blk = 0; blk < 4; blk++ )
    {
        int16_t *out = dst + 16*blk;
        int any = 0;
        for( k = 0; k < 16; k++ )
        {
            /* Read once, then both copy the value and fold it into the flag. */
            int16_t coef = src[4*k + blk];
            any |= coef;
            out[k] = coef;
        }
        /* Collapse the OR of all 16 coefficients to 0 or 1. */
        nnz[8*(blk>>1) + (blk&1)] = any != 0;
    }
}
void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf, int b_interlaced )
......
......@@ -119,7 +119,7 @@ typedef struct
void (*scan_4x4)( int16_t level[16], int16_t dct[4][4] );
void (*sub_8x8)( int16_t level[64], const uint8_t *p_src, uint8_t *p_dst );
void (*sub_4x4)( int16_t level[16], const uint8_t *p_src, uint8_t *p_dst );
void (*interleave_8x8_cavlc)( int16_t *dst, int16_t *src );
void (*interleave_8x8_cavlc)( int16_t *dst, int16_t *src, uint8_t *nnz );
} x264_zigzag_function_t;
......
......@@ -34,6 +34,7 @@ pb_scan4framea: db 12,13,6,7,14,15,0,1,8,9,2,3,4,5,10,11
pb_scan4frameb: db 0,1,8,9,2,3,4,5,10,11,12,13,6,7,14,15
pb_idctdc_unpack: db 0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3
pb_idctdc_unpack2: db 4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7
pb_1: times 8 db 1
SECTION .text
......@@ -737,19 +738,47 @@ cglobal x264_zigzag_sub_4x4_frame_ssse3, 3,3
movdqa [r0+16], xmm1
RET
INIT_MMX
cglobal x264_zigzag_interleave_8x8_cavlc_mmx, 2,3
mov r2d, 24
.loop:
movq m0, [r1+r2*4+ 0]
movq m1, [r1+r2*4+ 8]
movq m2, [r1+r2*4+16]
movq m3, [r1+r2*4+24]
;-----------------------------------------------------------------------------
; void x264_zigzag_interleave_8x8_cavlc_mmx( int16_t *dst, int16_t *src, uint8_t *nnz )
;-----------------------------------------------------------------------------
%macro INTERLEAVE 1
movq m0, [r1+%1*4+ 0]
movq m1, [r1+%1*4+ 8]
movq m2, [r1+%1*4+16]
movq m3, [r1+%1*4+24]
TRANSPOSE4x4W 0,1,2,3,4
movq [r0+r2+ 0], m0
movq [r0+r2+32], m1
movq [r0+r2+64], m2
movq [r0+r2+96], m3
sub r2d, 8
jge .loop
REP_RET
movq [r0+%1+ 0], m0
movq [r0+%1+32], m1
movq [r0+%1+64], m2
movq [r0+%1+96], m3
%if %1
packsswb m0, m1
por m6, m2
por m7, m3
por m5, m0
%else
packsswb m0, m1
SWAP m5, m0
SWAP m6, m2
SWAP m7, m3
%endif
%endmacro
INIT_MMX
; Fully unrolled MMX version: four INTERLEAVE expansions, then the flags
; accumulated in m5/m6/m7 are reduced to one byte per output list and stored
; through r2. Register roles as used below and in INTERLEAVE: r0 = store base
; (dst), r1 = load base (src), r2 = flag output (nnz).
cglobal x264_zigzag_interleave_8x8_cavlc_mmx, 3,3
; Each expansion loads 32 bytes from r1+%1*4, runs TRANSPOSE4x4W on m0-m3 and
; stores the four results at r0+%1+{0,32,64,96}. It also ORs what it stored
; into the accumulators: m5 collects packsswb(m0,m1), m6 collects m2 and m7
; collects m3. The %1 == 0 expansion initialises the accumulators instead of
; ORing, so it has to come first.
INTERLEAVE 0
INTERLEAVE 8
INTERLEAVE 16
INTERLEAVE 24
; Signed saturation maps every nonzero word to a nonzero byte, so packing
; narrows the accumulators without losing "is anything nonzero" information.
; m6 <- bytes from the m2 accumulator (low half) and m3 accumulator (high half).
packsswb m6, m7
; m5 <- two bytes per output list, in list order 0,0,1,1,2,2,3,3.
packsswb m5, m6
; m5 <- one byte per output list (0,1,2,3), duplicated in the high half.
packsswb m5, m5
pxor m0, m0
; Bytes equal to zero become 0xFF, nonzero bytes become 0x00.
pcmpeqb m5, m0
; pb_1 is eight bytes of 1: 0xFF+1 wraps to 0 and 0x00+1 gives 1, so each
; byte is now 1 if its list had a nonzero coefficient, otherwise 0.
paddb m5, [pb_1 GLOBAL]
; r0 is free for reuse here; every store through it has already been issued.
movd r0d, m5
; Flags for lists 0 and 1 go to nnz[0] and nnz[1].
mov [r2+0], r0w
shr r0d, 16
; Flags for lists 2 and 3 go to nnz[8] and nnz[9].
mov [r2+8], r0w
RET
......@@ -61,6 +61,6 @@ void x264_zigzag_scan_4x4_frame_ssse3 ( int16_t level[16], int16_t dct[4][4] );
void x264_zigzag_scan_4x4_frame_mmx ( int16_t level[16], int16_t dct[4][4] );
void x264_zigzag_scan_4x4_field_mmxext( int16_t level[16], int16_t dct[4][4] );
void x264_zigzag_sub_4x4_frame_ssse3 ( int16_t level[16], const uint8_t *src, uint8_t *dst );
void x264_zigzag_interleave_8x8_cavlc_mmx( int16_t *dst, int16_t *src );
void x264_zigzag_interleave_8x8_cavlc_mmx( int16_t *dst, int16_t *src, uint8_t *nnz );
#endif
......@@ -273,11 +273,7 @@ static inline void x264_macroblock_luma_write_cavlc( x264_t *h, bs_t *s, int i8s
/* shuffle 8x8 dct coeffs into 4x4 lists */
for( i8 = i8start; i8 <= i8end; i8++ )
if( h->mb.i_cbp_luma & (1 << i8) )
{
h->zigzagf.interleave_8x8_cavlc( h->dct.luma4x4[i8*4], h->dct.luma8x8[i8] );
for( i4 = 0; i4 < 4; i4++ )
h->mb.cache.non_zero_count[x264_scan8[i4+i8*4]] = array_non_zero( h->dct.luma4x4[i4+i8*4] );
}
h->zigzagf.interleave_8x8_cavlc( h->dct.luma4x4[i8*4], h->dct.luma8x8[i8], &h->mb.cache.non_zero_count[x264_scan8[i8*4]] );
}
for( i8 = i8start; i8 <= i8end; i8++ )
......
......@@ -635,6 +635,26 @@ static int check_dct( int cpu_ref, int cpu_new )
call_a2( zigzag_asm.name, t2, buf2, buf4 ); \
}
/* Compare zigzag_c.name against zigzag_asm.name on 100 randomly sparsified
 * inputs. The body is skipped when zigzag_asm.name and zigzag_ref.name are the
 * same pointer.
 *
 *   name   : member of the zigzag function tables to test
 *   t1, t2 : first argument passed to the C and asm calls respectively; their
 *            first `size` int16_t entries are compared afterwards
 *   dct    : input buffer, refilled from buf1 on every iteration
 *   size   : number of int16_t coefficients in dct / t1 / t2
 *
 * Per iteration:
 *   - each coefficient is kept only when the low five bits of rand() are all
 *     zero; otherwise it is set to 0;
 *   - the first 10 bytes of buf4 are copied into buf3 before the calls, so any
 *     byte neither implementation writes still compares equal afterwards;
 *   - a mismatch in the coefficient output or in the first 10 bytes of
 *     buf3/buf4 sets ok = 0; the loop continues regardless.
 *
 * Relies on i, j, ok, used_asm, interlace, buf1, buf3, buf4 and the
 * zigzag_c / zigzag_ref / zigzag_asm tables being in scope at the expansion
 * site. call_c, call_a and set_func_name are defined elsewhere; presumably
 * they invoke the given function with the remaining arguments -- confirm
 * against the harness.
 * NOTE(review): this expands to a bare `if` block, not do { } while (0), and
 * `size` is used unparenthesized; pass simple expressions only. */
#define TEST_INTERLEAVE( name, t1, t2, dct, size ) \
if( zigzag_asm.name != zigzag_ref.name ) \
{ \
for( j=0; j<100; j++ ) \
{ \
set_func_name( "zigzag_"#name"_%s", interlace?"field":"frame" );\
used_asm = 1; \
memcpy(dct, buf1, size*sizeof(int16_t));\
for( i=0; i<size; i++ ) \
dct[i] = rand()&0x1F ? 0 : dct[i]; \
memcpy(buf3, buf4, 10*sizeof(uint8_t)); \
call_c( zigzag_c.name, t1, dct, buf3 ); \
call_a( zigzag_asm.name, t2, dct, buf4 ); \
if( memcmp( t1, t2, size*sizeof(int16_t) ) || memcmp( buf3, buf4, 10*sizeof(uint8_t) ) ) \
{ \
ok = 0; \
} \
} \
}
interlace = 0;
x264_zigzag_init( 0, &zigzag_c, 0 );
x264_zigzag_init( cpu_ref, &zigzag_ref, 0 );
......@@ -643,7 +663,6 @@ static int check_dct( int cpu_ref, int cpu_new )
ok = 1; used_asm = 0;
TEST_ZIGZAG_SCAN( scan_8x8, level1, level2, (void*)dct1, 64 );
TEST_ZIGZAG_SCAN( scan_4x4, level1, level2, dct1[0], 16 );
TEST_ZIGZAG_SCAN( interleave_8x8_cavlc, level1, level2, (void*)dct1, 64 );
TEST_ZIGZAG_SUB( sub_4x4, level1, level2, 16 );
report( "zigzag_frame :" );
......@@ -657,6 +676,10 @@ static int check_dct( int cpu_ref, int cpu_new )
TEST_ZIGZAG_SCAN( scan_4x4, level1, level2, dct1[0], 16 );
TEST_ZIGZAG_SUB( sub_4x4, level1, level2, 16 );
report( "zigzag_field :" );
ok = 1; used_asm = 0;
TEST_INTERLEAVE( interleave_8x8_cavlc, level1, level2, dct1[0][0], 64 );
report( "zigzag_interleave :" );
#undef TEST_ZIGZAG_SCAN
#undef TEST_ZIGZAG_SUB
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment