Commit e9a6bd75, authored by Loren Merritt, committed by Fiona Glaser

Add assembly version of CAVLC 8x8dct interleave

Faster CAVLC encoding and RDO with 8x8dct
parent f151cc4b
......@@ -582,6 +582,14 @@ static void zigzag_sub_8x8_field( int16_t level[64], const uint8_t *p_src, uint8
#undef ZIG
#undef COPY4x4
/* Split the 64 coefficients of one 8x8 block into four consecutive
 * 16-coefficient lists: list k (stored at dst[k*16 .. k*16+15]) receives
 * src[k], src[k+4], src[k+8], ..., src[k+60]. */
static void zigzag_interleave_8x8_cavlc( int16_t *dst, int16_t *src )
{
    int list;
    for( list = 0; list < 4; list++ )
    {
        int16_t *out = dst + list * 16;   /* start of the list being filled */
        const int16_t *in = src + list;   /* first coefficient of this list */
        int n;
        for( n = 0; n < 16; n++ )
            out[n] = in[n * 4];           /* take every fourth coefficient */
    }
}
void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf, int b_interlaced )
{
if( b_interlaced )
......@@ -627,4 +635,10 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf, int b_interlaced )
pf->scan_4x4 = x264_zigzag_scan_4x4_frame_altivec;
#endif
}
pf->interleave_8x8_cavlc = zigzag_interleave_8x8_cavlc;
#ifdef HAVE_MMX
if( cpu&X264_CPU_MMX )
pf->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_mmx;
#endif
}
......@@ -120,6 +120,7 @@ typedef struct
void (*scan_4x4)( int16_t level[16], int16_t dct[4][4] );
void (*sub_8x8)( int16_t level[64], const uint8_t *p_src, uint8_t *p_dst );
void (*sub_4x4)( int16_t level[16], const uint8_t *p_src, uint8_t *p_dst );
void (*interleave_8x8_cavlc)( int16_t *dst, int16_t *src );
} x264_zigzag_function_t;
......
......@@ -548,3 +548,20 @@ cglobal x264_zigzag_sub_4x4_frame_ssse3, 3,3
movdqa [r0], xmm0
movdqa [r0+16], xmm1
RET
INIT_MMX
; void x264_zigzag_interleave_8x8_cavlc_mmx( int16_t *dst, int16_t *src )
; MMX version of the CAVLC 8x8dct interleave. Intended to match the C
; reference, dst[i*16+j] = src[i+j*4]: 64 16-bit coefficients are split into
; four 16-coefficient lists.
; r0 is the store base and r1 the load base, i.e. dst and src of the C
; prototype (assumes the usual cglobal argument-to-register mapping -- confirm).
cglobal x264_zigzag_interleave_8x8_cavlc_mmx, 2,3
; r2 is a byte offset into each output list; it steps 24, 16, 8, 0,
; so the loop body runs four times.
mov r2d, 24
.loop:
; Load 32 contiguous bytes (16 coefficients) starting at r1 + r2*4.
movq m0, [r1+r2*4+ 0]
movq m1, [r1+r2*4+ 8]
movq m2, [r1+r2*4+16]
movq m3, [r1+r2*4+24]
; Macro defined elsewhere; expected to transpose m0-m3 as a 4x4 matrix of
; words so that m<i> ends up holding every fourth coefficient starting at i.
; The fifth argument (m4) is presumably a scratch register -- confirm against
; the macro definition.
TRANSPOSE4x4W 0,1,2,3,4
; The four stores are 32 bytes (16 coefficients) apart: m<i> is written to
; output list i at byte offset r2.
movq [r0+r2+ 0], m0
movq [r0+r2+32], m1
movq [r0+r2+64], m2
movq [r0+r2+96], m3
sub r2d, 8
; Keep looping while the offset is still non-negative after the decrement.
jge .loop
REP_RET
......@@ -56,5 +56,6 @@ void x264_zigzag_scan_4x4_frame_ssse3 ( int16_t level[16], int16_t dct[4][4] );
void x264_zigzag_scan_4x4_frame_mmx ( int16_t level[16], int16_t dct[4][4] );
void x264_zigzag_scan_4x4_field_mmxext( int16_t level[16], int16_t dct[4][4] );
void x264_zigzag_sub_4x4_frame_ssse3 ( int16_t level[16], const uint8_t *src, uint8_t *dst );
/* MMX implementation of the CAVLC 8x8dct coefficient interleave; its signature
 * matches the interleave_8x8_cavlc member of x264_zigzag_function_t. */
void x264_zigzag_interleave_8x8_cavlc_mmx( int16_t *dst, int16_t *src );
#endif
......@@ -273,15 +273,13 @@ static void cavlc_mb8x8_mvd( x264_t *h, bs_t *s, int i_list, int i )
static inline void x264_macroblock_luma_write_cavlc( x264_t *h, bs_t *s, int i8start, int i8end )
{
int i8, i4, i;
int i8, i4;
if( h->mb.b_transform_8x8 )
{
/* shuffle 8x8 dct coeffs into 4x4 lists */
for( i8 = i8start; i8 <= i8end; i8++ )
if( h->mb.i_cbp_luma & (1 << i8) )
for( i4 = 0; i4 < 4; i4++ )
for( i = 0; i < 16; i++ )
h->dct.luma4x4[i4+i8*4][i] = h->dct.luma8x8[i8][i4+i*4];
h->zigzagf.interleave_8x8_cavlc( h->dct.luma4x4[i8*4], h->dct.luma8x8[i8] );
}
for( i8 = i8start; i8 <= i8end; i8++ )
......
......@@ -645,6 +645,7 @@ static int check_dct( int cpu_ref, int cpu_new )
ok = 1; used_asm = 0;
TEST_ZIGZAG_SCAN( scan_8x8, level1, level2, (void*)dct1, 64 );
TEST_ZIGZAG_SCAN( scan_4x4, level1, level2, dct1[0], 16 );
TEST_ZIGZAG_SCAN( interleave_8x8_cavlc, level1, level2, (void*)dct1, 64 );
TEST_ZIGZAG_SUB( sub_4x4, level1, level2, 16 );
report( "zigzag_frame :" );
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment