Commit ffd73767, authored by Fiona Glaser and committed by Loren Merritt

Much faster CAVLC RDO and bitstream writing

Pure asm version of level/run coding.  Over 2x faster than C.
Up to 40% faster CAVLC RDO.  Overall benefit up to ~7.5% with RDO or ~5% with fast encoding settings.
parent f33ba9e2
......@@ -50,6 +50,13 @@ typedef struct bs_s
int i_bits_encoded; /* RD only */
} bs_t;
/* Result of a coeff_level_run function: the nonzero coefficients of one block
 * and the zero runs between them, listed from the highest-index nonzero
 * coefficient downwards.
 * The assembly versions of coeff_level_run address these members by fixed byte
 * offset (last at 0, level at 4, run at 36), so member order and types must
 * not change without updating that code as well. */
typedef struct
{
int last; /* index of the highest-index nonzero coefficient in the block */
int16_t level[16]; /* nonzero coefficient values; level[0] is the one at index `last` */
uint8_t run[16]; /* run[i]: number of zero coefficients directly below level[i] */
} x264_run_level_t;
extern const vlc_t x264_coeff0_token[5];
extern const vlc_t x264_coeff_token[5][16*4];
extern const vlc_t x264_total_zeros[15][16];
......
......@@ -273,6 +273,27 @@ static int x264_coeff_last64( int16_t *l )
return x264_coeff_last_internal( l, 64 );
}
/* level_run(num) generates x264_coeff_level_run<num>(), the plain C version of
 * the level/run scan for a block of <num> coefficients.
 *
 * The generated function records the index of the last nonzero coefficient in
 * runlevel->last, then walks from that index down to index 0.  Each nonzero
 * coefficient is stored in runlevel->level[] and the number of zero
 * coefficients directly below it in runlevel->run[].
 * Returns the number of nonzero coefficients written.
 *
 * The coefficient at the index returned by x264_coeff_last<num>() is read
 * unconditionally, so the block is expected to hold at least one nonzero
 * coefficient. */
#define level_run(num)\
static int x264_coeff_level_run##num( int16_t *dct, x264_run_level_t *runlevel )\
{\
    int pos = x264_coeff_last##num(dct);\
    int count = 0;\
    runlevel->last = pos;\
    do\
    {\
        int zeros = 0;\
        runlevel->level[count] = dct[pos];\
        /* step down to the next nonzero coefficient, counting skipped zeros */\
        for( pos--; pos >= 0 && dct[pos] == 0; pos-- )\
            zeros++;\
        runlevel->run[count] = zeros;\
        count++;\
    } while( pos >= 0 );\
    return count;\
}
/* Instantiate the C level/run scanners for 4-, 15- and 16-coefficient blocks.
 * Each expansion calls the x264_coeff_last<num>() function of the same size. */
level_run(4)
level_run(15)
level_run(16)
void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
{
pf->quant_8x8 = quant_8x8;
......@@ -293,6 +314,9 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15;
pf->coeff_last[ DCT_LUMA_4x4] = x264_coeff_last16;
pf->coeff_last[ DCT_LUMA_8x8] = x264_coeff_last64;
pf->coeff_level_run[DCT_CHROMA_DC] = x264_coeff_level_run4;
pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15;
pf->coeff_level_run[ DCT_LUMA_4x4] = x264_coeff_level_run16;
#ifdef HAVE_MMX
if( cpu&X264_CPU_MMX )
......@@ -323,8 +347,11 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_mmxext;
pf->coeff_last[ DCT_LUMA_4x4] = x264_coeff_last16_mmxext;
pf->coeff_last[ DCT_LUMA_8x8] = x264_coeff_last64_mmxext;
pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_mmxext;
pf->coeff_level_run[ DCT_LUMA_4x4] = x264_coeff_level_run16_mmxext;
#endif
pf->coeff_last[DCT_CHROMA_DC] = x264_coeff_last4_mmxext;
pf->coeff_level_run[DCT_CHROMA_DC] = x264_coeff_level_run4_mmxext;
}
if( cpu&X264_CPU_SSE2 )
......@@ -347,6 +374,8 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_sse2;
pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_sse2;
pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_sse2;
pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_sse2;
pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_sse2;
}
if( cpu&X264_CPU_SSSE3 )
......@@ -375,4 +404,6 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
#endif
pf->coeff_last[ DCT_LUMA_DC] = pf->coeff_last[DCT_LUMA_4x4];
pf->coeff_last[DCT_CHROMA_AC] = pf->coeff_last[ DCT_LUMA_AC];
pf->coeff_level_run[ DCT_LUMA_DC] = pf->coeff_level_run[DCT_LUMA_4x4];
pf->coeff_level_run[DCT_CHROMA_AC] = pf->coeff_level_run[ DCT_LUMA_AC];
}
......@@ -40,6 +40,7 @@ typedef struct
int (*decimate_score16)( int16_t *dct );
int (*decimate_score64)( int16_t *dct );
int (*coeff_last[6])( int16_t *dct );
int (*coeff_level_run[5])( int16_t *dct, x264_run_level_t *runlevel );
} x264_quant_function_t;
void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf );
......
......@@ -662,6 +662,10 @@ INIT_XMM
DECIMATE8x8 sse2
DECIMATE8x8 ssse3
;-----------------------------------------------------------------------------
; int x264_coeff_last( int16_t *dct )
;-----------------------------------------------------------------------------
%macro LAST_MASK_SSE2 2-3
movdqa xmm0, [%2+ 0]
pxor xmm2, xmm2
......@@ -766,3 +770,63 @@ COEFF_LAST mmxext
%endif
%define LAST_MASK LAST_MASK_SSE2
COEFF_LAST sse2
;-----------------------------------------------------------------------------
; int x264_coeff_level_run( int16_t *dct, x264_run_level_t *runlevel )
;-----------------------------------------------------------------------------
; LAST_MASK4_MMX: zero mask for a block of four 16-bit coefficients.
; Arguments: 1 = destination general-purpose register, 2 = address of the
; coefficients, 3 = optional and not used by this variant.
; On exit, bit i (i = 0..3) of the destination is set when coefficient i is
; zero.  Bits 4-7 repeat bits 0-3 because both halves of the pack come from
; the same register.  Clobbers mm0 and mm2.
%macro LAST_MASK4_MMX 2-3
; load the four words
movq mm0, [%2]
; mm2 = 0, the value to compare against
pxor mm2, mm2
; narrow words to bytes with signed saturation, so a nonzero word stays nonzero
packsswb mm0, mm0
; each byte becomes 0xFF where the coefficient was zero, 0x00 otherwise
pcmpeqb mm0, mm2
; gather the top bit of every byte into the destination register
pmovmskb %1, mm0
%endmacro
; Register assignment for the temporaries t0-t6 used by COEFF_LEVELRUN.
; t6 = eax for return, t3 = ecx for shift, t[01] = r[01] for x86_64 args
; t3 is used as a variable shift count (shl with t3b), which x86 only accepts
; in cl.  The arguments of DECLARE_REG_TMP are register numbers; the mapping
; from those numbers to physical registers is defined outside this section.
; NOTE(review): confirm that mapping for every supported ABI.
%ifdef ARCH_X86_64
DECLARE_REG_TMP 0,1,2,3,4,5,6
%else
DECLARE_REG_TMP 6,3,2,1,4,5,0
%endif
; COEFF_LEVELRUN cpuname, num
; Emits x264_coeff_level_run<num>_<cpuname>( int16_t *dct, x264_run_level_t *runlevel ).
; The function stores the index of the last nonzero coefficient at
; runlevel+0 (last), then walks the nonzero coefficients from that index
; downwards.  Each value is stored at runlevel+4 (level[]) and the number of
; zero coefficients directly below it at runlevel+36 (run[]).
; The count of nonzero coefficients is accumulated in t6 and returned.
; The zero mask comes from whichever LAST_MASK variant is %define'd where the
; macro is instantiated.
; The offsets 0, 4 and 36 are hard-coded and must match x264_run_level_t.
; NOTE(review): once no set bit is left in t5d, bsr is given a zero source and
; its result is architecturally undefined.  The final run[] entry is then not
; meaningful (TEST_LEVELRUN compares only the first result-1 run entries), and
; leaving the loop depends on the value bsr leaves in t3d.  Confirm this holds
; on all targeted CPUs.
%macro COEFF_LEVELRUN 2
cglobal x264_coeff_level_run%2_%1,0,7
; bring the two arguments into t0 (dct) and t1 (runlevel); movifnidn is
; expected to emit a move only when source and destination differ
movifnidn t0d, r0m
movifnidn t1d, r1m
; t2d = mask with one bit per coefficient, set where the coefficient is zero
; (this is what LAST_MASK4_MMX produces; the other variants are presumed to
; behave the same).  For an odd coefficient count the load address is moved
; back by one coefficient (2 bytes).
LAST_MASK t2d, t0-(%2&1)*2, t4d
; invert the mask: a set bit now marks a nonzero coefficient
not t2d
; left-align the mask so the highest coefficient of the block sits in bit 31;
; this also shifts out the upper bits that the inversion set
shl t2d, 32-((%2+1)&~1)
; t4d = highest coefficient index of the block
mov t4d, %2-1
; t5d = working copy of the mask, consumed from the top while scanning
mov t5d, t2d
; t3d = bit position of the highest nonzero coefficient
bsr t3d, t2d
; t6d = number of coefficients written so far
xor t6d, t6d
shl t5d, 1
; 31 - bit position = number of zero coefficients above the last nonzero one
xor t3d, 0x1f
; t4d = index of the last nonzero coefficient
sub t4d, t3d
; together with the shift by one above, drop every bit down to and including
; the last nonzero coefficient; bit 31 now belongs to the coefficient below it
shl t5d, t3b
; runlevel->last
mov [t1], t4d
.loop:
; t3d = number of zero coefficients directly below the current coefficient
bsr t3d, t5d
xor t3d, 0x1f
; t2w = value of the current coefficient
mov t2w, [t0+t4*2]
; runlevel->run[t6] and runlevel->level[t6]
mov [t1+t6 +36], t3b
mov [t1+t6*2+ 4], t2w
; run + 1 = distance down to the next nonzero coefficient
inc t3d
; drop the zero bits and the bit of the next nonzero coefficient
shl t5d, t3b
inc t6d
; move the index to the next nonzero coefficient; negative once none is left
sub t4d, t3d
jge .loop
RET
%endmacro
; The 15- and 16-coefficient mmxext versions are assembled only when
; ARCH_X86_64 is not defined.
%ifndef ARCH_X86_64
%define LAST_MASK LAST_MASK_MMX
COEFF_LEVELRUN mmxext, 15
COEFF_LEVELRUN mmxext, 16
%endif
; The 4-coefficient mmxext version is assembled for every target.
%define LAST_MASK LAST_MASK4_MMX
COEFF_LEVELRUN mmxext, 4
; SSE2 versions for 15- and 16-coefficient blocks.
%define LAST_MASK LAST_MASK_SSE2
COEFF_LEVELRUN sse2, 15
COEFF_LEVELRUN sse2, 16
......@@ -64,5 +64,10 @@ int x264_coeff_last64_mmxext( int16_t *dct );
int x264_coeff_last15_sse2( int16_t *dct );
int x264_coeff_last16_sse2( int16_t *dct );
int x264_coeff_last64_sse2( int16_t *dct );
int x264_coeff_level_run16_mmxext( int16_t *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run16_sse2( int16_t *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run15_mmxext( int16_t *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run15_sse2( int16_t *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run4_mmxext( int16_t *dct, x264_run_level_t *runlevel );
#endif
......@@ -96,7 +96,7 @@ static inline int block_residual_write_cavlc_escape( x264_t *h, bs_t *s, int i_s
/* Weight highly against overflows. */
s->i_bits_encoded += 1000000;
#else
x264_log(h, X264_LOG_WARNING, "OVERFLOW levelcode=%d is only allowed in High Profile", i_level_code );
x264_log(h, X264_LOG_WARNING, "OVERFLOW levelcode=%d is only allowed in High Profile\n", i_level_code );
/* clip level, preserving sign */
i_level_code = (1<<12) - 2 + (i_level_code & 1);
#endif
......@@ -116,8 +116,8 @@ static void block_residual_write_cavlc( x264_t *h, bs_t *s, int i_ctxBlockCat, i
{
static const uint8_t ct_index[17] = {0,0,1,1,2,2,2,2,3,3,3,3,3,3,3,3,3};
static const uint8_t ctz_index[8] = {3,0,1,0,2,0,1,0};
int level[16], run[16];
int i_trailing, i_total_zero, i_last, i_suffix_length, i;
x264_run_level_t runlevel;
int i_trailing, i_total_zero, i_suffix_length, i;
int i_total = 0;
unsigned int i_sign;
/* x264_mb_predict_non_zero_code return 0 <-> (16+16+1)>>1 = 16 */
......@@ -129,32 +129,22 @@ static void block_residual_write_cavlc( x264_t *h, bs_t *s, int i_ctxBlockCat, i
return;
}
i_last = h->quantf.coeff_last[i_ctxBlockCat](l);
i_total_zero = i_last + 1;
/* level and run and total */
/* set these to 2 to allow branchless i_trailing calculation */
level[1] = 2;
level[2] = 2;
do
{
int r = 0;
level[i_total] = l[i_last];
while( --i_last >= 0 && l[i_last] == 0 )
r++;
run[i_total++] = r;
} while( i_last >= 0 );
runlevel.level[1] = 2;
runlevel.level[2] = 2;
i_total = h->quantf.coeff_level_run[i_ctxBlockCat]( l, &runlevel );
i_total_zero = runlevel.last + 1 - i_total;
h->mb.cache.non_zero_count[x264_scan8[i_idx]] = i_total;
i_total_zero -= i_total;
i_trailing = ((((level[0]+1) | (1-level[0])) >> 31) & 1) // abs(level[0])>1
| ((((level[1]+1) | (1-level[1])) >> 31) & 2)
| ((((level[2]+1) | (1-level[2])) >> 31) & 4);
i_trailing = ((((runlevel.level[0]+1) | (1-runlevel.level[0])) >> 31) & 1) // abs(runlevel.level[0])>1
| ((((runlevel.level[1]+1) | (1-runlevel.level[1])) >> 31) & 2)
| ((((runlevel.level[2]+1) | (1-runlevel.level[2])) >> 31) & 4);
i_trailing = ctz_index[i_trailing];
i_sign = ((level[2] >> 31) & 1)
| ((level[1] >> 31) & 2)
| ((level[0] >> 31) & 4);
i_sign = ((runlevel.level[2] >> 31) & 1)
| ((runlevel.level[1] >> 31) & 2)
| ((runlevel.level[0] >> 31) & 4);
i_sign >>= 3-i_trailing;
/* total/trailing */
......@@ -166,10 +156,10 @@ static void block_residual_write_cavlc( x264_t *h, bs_t *s, int i_ctxBlockCat, i
if( i_trailing < i_total )
{
int16_t val = level[i_trailing];
int16_t val_original = level[i_trailing]+LEVEL_TABLE_SIZE/2;
int16_t val = runlevel.level[i_trailing];
int16_t val_original = runlevel.level[i_trailing]+LEVEL_TABLE_SIZE/2;
if( i_trailing < 3 )
val -= (val>>15)|1; /* as level[i] can't be 1 for the first one if i_trailing < 3 */
val -= (val>>15)|1; /* as runlevel.level[i] can't be 1 for the first one if i_trailing < 3 */
val += LEVEL_TABLE_SIZE/2;
if( (unsigned)val_original < LEVEL_TABLE_SIZE )
......@@ -181,7 +171,7 @@ static void block_residual_write_cavlc( x264_t *h, bs_t *s, int i_ctxBlockCat, i
i_suffix_length = block_residual_write_cavlc_escape( h, s, i_suffix_length, val-LEVEL_TABLE_SIZE/2 );
for( i = i_trailing+1; i < i_total; i++ )
{
val = level[i] + LEVEL_TABLE_SIZE/2;
val = runlevel.level[i] + LEVEL_TABLE_SIZE/2;
if( (unsigned)val < LEVEL_TABLE_SIZE )
{
bs_write_vlc( s, x264_level_token[i_suffix_length][val] );
......@@ -203,8 +193,8 @@ static void block_residual_write_cavlc( x264_t *h, bs_t *s, int i_ctxBlockCat, i
for( i = 0; i < i_total-1 && i_total_zero > 0; i++ )
{
int i_zl = X264_MIN( i_total_zero - 1, 6 );
bs_write_vlc( s, x264_run_before[i_zl][run[i]] );
i_total_zero -= run[i];
bs_write_vlc( s, x264_run_before[i_zl][runlevel.run[i]] );
i_total_zero -= runlevel.run[i];
}
}
......
......@@ -1127,7 +1127,7 @@ static int check_quant( int cpu_ref, int cpu_new )
ok = oks[1]; used_asm = used_asms[1];
report( "dequant :" );
ok = 1;
ok = 1; used_asm = 0;
if( qf_a.denoise_dct != qf_ref.denoise_dct )
{
int size;
......@@ -1160,21 +1160,18 @@ static int check_quant( int cpu_ref, int cpu_new )
dct1[idx] = !(rand()&3) + (!(rand()&15))*(rand()&3); \
if( ac ) \
dct1[0] = 0; \
memcpy( dct2, dct1, w*w*2 ); \
result_c = call_c1( qf_c.decname, (void*)dct2 ); \
result_a = call_a1( qf_a.decname, (void*)dct2 ); \
result_c = call_c( qf_c.decname, (void*)dct1 ); \
result_a = call_a( qf_a.decname, (void*)dct1 ); \
if( X264_MIN(result_c,thresh) != X264_MIN(result_a,thresh) ) \
{ \
ok = 0; \
fprintf( stderr, #decname ": [FAILED]\n" ); \
break; \
} \
call_c2( qf_c.decname, (void*)dct2 ); \
call_a2( qf_a.decname, (void*)dct2 ); \
} \
}
ok = 1;
ok = 1; used_asm = 0;
TEST_DECIMATE( decimate_score64, 8, 0, 6 );
TEST_DECIMATE( decimate_score16, 4, 0, 6 );
TEST_DECIMATE( decimate_score15, 4, 1, 7 );
......@@ -1194,27 +1191,60 @@ static int check_quant( int cpu_ref, int cpu_new )
nnz |= dct1[idx] = !(rand()&3) + (!(rand()&15))*rand(); \
if( !nnz ) \
dct1[ac] = 1; \
memcpy( dct2, dct1, w*w*2 ); \
result_c = call_c1( qf_c.last, (void*)(dct2+ac) ); \
result_a = call_a1( qf_a.last, (void*)(dct2+ac) ); \
result_c = call_c( qf_c.last, (void*)(dct1+ac) ); \
result_a = call_a( qf_a.last, (void*)(dct1+ac) ); \
if( result_c != result_a ) \
{ \
ok = 0; \
fprintf( stderr, #lastname ": [FAILED]\n" ); \
break; \
} \
call_c2( qf_c.last, (void*)(dct2+ac) ); \
call_a2( qf_a.last, (void*)(dct2+ac) ); \
} \
}
ok = 1;
ok = 1; used_asm = 0;
TEST_LAST( coeff_last[DCT_CHROMA_DC], coeff_last4, 2, 0 );
TEST_LAST( coeff_last[ DCT_LUMA_AC], coeff_last15, 4, 1 );
TEST_LAST( coeff_last[ DCT_LUMA_4x4], coeff_last16, 4, 0 );
TEST_LAST( coeff_last[ DCT_LUMA_8x8], coeff_last64, 8, 0 );
report( "coeff_last :" );
/* TEST_LEVELRUN( lastname, name, w, ac )
 * Checks one coeff_level_run implementation against the C one.  The test runs
 * only when qf_a.lastname differs from qf_ref.lastname.
 *   lastname - member expression selecting the function pointer in the
 *              quant function tables (qf_a, qf_c, qf_ref)
 *   name     - function name passed to set_func_name and printed on failure
 *   w        - block width; w*w coefficients of dct1 are cleared, and w*w-1
 *              is used as a bit mask for the random fill limit
 *   ac       - index of the first coefficient that may be populated; the
 *              functions under test receive dct1+ac
 * Uses i, ok, used_asm, dct1 and buf1 from the enclosing scope.
 *
 * Each of the 100 iterations fills dct1[ac..max-1] with random values and
 * forces dct1[ac] to 1 when the block came out all zero.  Both result structs
 * are pre-filled with the same bytes from buf1+i before the calls.
 * An iteration passes when both calls agree on the return value, on `last`,
 * on the first <result> levels and on the first <result>-1 runs; the final
 * run entry is deliberately not compared.
 *
 * NOTE(review): max is at most w*w-1 and idx stays below max, so dct1[w*w-1]
 * is never populated; the top coefficient position is not exercised. */
#define TEST_LEVELRUN( lastname, name, w, ac ) \
if( qf_a.lastname != qf_ref.lastname ) \
{ \
set_func_name( #name ); \
used_asm = 1; \
for( i = 0; i < 100; i++ ) \
{ \
x264_run_level_t runlevel_c, runlevel_a; \
int result_c, result_a, idx, nnz=0; \
int max = rand() & (w*w-1); \
memset( dct1, 0, w*w*2 ); \
memcpy( &runlevel_a, buf1+i, sizeof(x264_run_level_t) ); \
memcpy( &runlevel_c, buf1+i, sizeof(x264_run_level_t) ); \
for( idx = ac; idx < max; idx++ ) \
nnz |= dct1[idx] = !(rand()&3) + (!(rand()&15))*rand(); \
if( !nnz ) \
dct1[ac] = 1; \
result_c = call_c( qf_c.lastname, (void*)(dct1+ac), &runlevel_c ); \
result_a = call_a( qf_a.lastname, (void*)(dct1+ac), &runlevel_a ); \
if( result_c != result_a || runlevel_c.last != runlevel_a.last || \
memcmp(runlevel_c.level, runlevel_a.level, sizeof(int16_t)*result_c) || \
memcmp(runlevel_c.run, runlevel_a.run, sizeof(uint8_t)*(result_c-1)) ) \
{ \
ok = 0; \
fprintf( stderr, #name ": [FAILED]\n" ); \
break; \
} \
} \
}
ok = 1; used_asm = 0;
TEST_LEVELRUN( coeff_level_run[DCT_CHROMA_DC], coeff_level_run4, 2, 0 );
TEST_LEVELRUN( coeff_level_run[ DCT_LUMA_AC], coeff_level_run15, 4, 1 );
TEST_LEVELRUN( coeff_level_run[ DCT_LUMA_4x4], coeff_level_run16, 4, 0 );
report( "coeff_level_run :" );
return ret;
}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment