Commit 9c0fa2d6 authored by Fiona Glaser

Use a large LUT for CAVLC zero-run bit codes

Helps the most with trellis and RD, but also helps with bitstream writing.
Seems at worst neutral even in the extreme case of a CPU with a small L2 cache (e.g. ARM Cortex A8).
parent de7aed78
......@@ -56,6 +56,7 @@ typedef struct bs_s
/* Output of the coeff_level_run functions for one block of coefficients.
 * NOTE: the assembly implementation of coeff_level_run writes into this
 * struct using hard-coded byte offsets (mask at +4, level at +8, run right
 * after level), so the order and sizes of these fields must not change
 * without updating the assembly as well. */
typedef struct
{
/* Value returned by x264_coeff_last for the block (presumably the index of
 * the highest nonzero coefficient -- confirm against x264_coeff_last). */
int last;
/* Bit i is set for every coefficient index i that was recorded in level[];
 * used directly as the index into the x264_run_before[] lookup table. */
int mask;
/* Recorded coefficient values, stored starting from index `last` and
 * moving toward index 0. */
dctcoef level[16];
/* run[k] is the number of zero coefficients skipped immediately below the
 * coefficient stored in level[k]. */
uint8_t run[16];
} x264_run_level_t;
......@@ -65,7 +66,6 @@ extern const vlc_t x264_coeff_token[6][16][4];
extern const vlc_t x264_total_zeros[15][16];
extern const vlc_t x264_total_zeros_2x2_dc[3][4];
extern const vlc_t x264_total_zeros_2x4_dc[7][8];
extern const vlc_t x264_run_before[7][16];
typedef struct
{
......@@ -82,6 +82,11 @@ void x264_bitstream_init( int cpu, x264_bitstream_function_t *pf );
#define LEVEL_TABLE_SIZE 128
extern vlc_large_t x264_level_token[7][LEVEL_TABLE_SIZE];
/* The longest possible set of zero run codes sums to 25 bits. This leaves
* plenty of room for both the code (25 bits) and size (5 bits) in a uint32_t. */
extern uint32_t x264_run_before[1<<16];
static inline void bs_init( bs_t *s, void *p_data, int i_data )
{
int offset = ((intptr_t)p_data & 3);
......
......@@ -236,7 +236,7 @@ void x264_log( x264_t *h, int i_level, const char *psz_fmt, ... );
void x264_reduce_fraction( uint32_t *n, uint32_t *d );
void x264_reduce_fraction64( uint64_t *n, uint64_t *d );
void x264_cavlc_init( void );
void x264_cavlc_init( x264_t *h );
void x264_cabac_init( x264_t *h );
static ALWAYS_INLINE pixel x264_clip_pixel( int x )
......
......@@ -373,14 +373,17 @@ static int x264_coeff_level_run##num( dctcoef *dct, x264_run_level_t *runlevel )
{\
int i_last = runlevel->last = x264_coeff_last##num(dct);\
int i_total = 0;\
int mask = 0;\
do\
{\
int r = 0;\
runlevel->level[i_total] = dct[i_last];\
mask |= 1 << (i_last);\
while( --i_last >= 0 && dct[i_last] == 0 )\
r++;\
runlevel->run[i_total++] = r;\
} while( i_last >= 0 );\
runlevel->mask = mask;\
return i_total;\
}
......
......@@ -738,7 +738,7 @@ const vlc_t x264_total_zeros_2x4_dc[7][8] =
};
/* [MIN( i_zero_left-1, 6 )][run_before] */
const vlc_t x264_run_before[7][16] =
static const vlc_t run_before[7][16] =
{
{ /* i_zero_left 1 */
{ 0x1, 1 }, /* str=1 */
......@@ -799,8 +799,9 @@ const vlc_t x264_run_before[7][16] =
};
vlc_large_t x264_level_token[7][LEVEL_TABLE_SIZE];
uint32_t x264_run_before[1<<16];
void x264_cavlc_init( void )
void x264_cavlc_init( x264_t *h )
{
for( int i_suffix = 0; i_suffix < 7; i_suffix++ )
for( int16_t level = -LEVEL_TABLE_SIZE/2; level < LEVEL_TABLE_SIZE/2; level++ )
......@@ -840,4 +841,27 @@ void x264_cavlc_init( void )
i_next++;
vlc->i_next = i_next;
}
for( int i = 1; i < (1<<16); i++ )
{
x264_run_level_t runlevel;
ALIGNED_ARRAY_16( dctcoef, dct, [16] );
int size = 0;
int bits = 0;
for( int j = 0; j < 16; j++ )
dct[j] = i&(1<<j);
int total = h->quantf.coeff_level_run[DCT_LUMA_4x4]( dct, &runlevel );
int zeros = runlevel.last + 1 - total;
for( int j = 0; j < total-1 && zeros > 0; j++ )
{
int idx = X264_MIN(zeros, 7) - 1;
int run = runlevel.run[j];
int len = run_before[idx][run].i_size;
size += len;
bits <<= len;
bits |= run_before[idx][run].i_bits;
zeros -= run;
}
x264_run_before[i] = (bits << 5) + size;
}
}
......@@ -1352,8 +1352,16 @@ cglobal coeff_level_run%1,0,7
movifnidn t1, r1mp
pxor m2, m2
LAST_MASK %1, t5d, t0-(%1&1)*SIZEOF_DCTCOEF, t4d
not t5d
shl t5d, 32-((%1+1)&~1)
%if %1==15
shr t5d, 1
%elif %1==8
and t5d, 0xff
%elif %1==4
and t5d, 0xf
%endif
xor t5d, (1<<%1)-1
mov [t1+4], t5d
shl t5d, 32-%1
mov t4d, %1-1
LZCOUNT t3d, t5d, 0x1f
xor t6d, t6d
......@@ -1365,12 +1373,12 @@ cglobal coeff_level_run%1,0,7
LZCOUNT t3d, t5d, 0x1f
%ifdef HIGH_BIT_DEPTH
mov t2d, [t0+t4*4]
mov [t1+t6 +4+16*4], t3b
mov [t1+t6*4+ 4], t2d
mov [t1+t6+8+16*4], t3b
mov [t1+t6*4+ 8], t2d
%else
mov t2w, [t0+t4*2]
mov [t1+t6 +4+16*2], t3b
mov [t1+t6*2+ 4], t2w
mov [t1+t6+8+16*2], t3b
mov [t1+t6*2+ 8], t2w
%endif
inc t3d
shl t5d, t3b
......
......@@ -132,6 +132,7 @@ static int x264_cavlc_block_residual_internal( x264_t *h, int ctx_block_cat, dct
runlevel.level[1] = 2;
runlevel.level[2] = 2;
i_total = h->quantf.coeff_level_run[ctx_block_cat]( l, &runlevel );
x264_prefetch( &x264_run_before[runlevel.mask] );
i_total_zero = runlevel.last + 1 - i_total;
i_trailing = ((((runlevel.level[0]+1) | (1-runlevel.level[0])) >> 31) & 1) // abs(runlevel.level[0])>1
......@@ -188,12 +189,8 @@ static int x264_cavlc_block_residual_internal( x264_t *h, int ctx_block_cat, dct
else if( (uint8_t)i_total < count_cat[ctx_block_cat] )
bs_write_vlc( s, x264_total_zeros[i_total-1][i_total_zero] );
for( int i = 0; i < i_total-1 && i_total_zero > 0; i++ )
{
int i_zl = X264_MIN( i_total_zero, 7 );
bs_write_vlc( s, x264_run_before[i_zl-1][runlevel.run[i]] );
i_total_zero -= runlevel.run[i];
}
int zero_run_code = x264_run_before[runlevel.mask];
bs_write( s, zero_run_code&0x1f, zero_run_code>>5 );
return i_total;
}
......
......@@ -1173,10 +1173,6 @@ x264_t *x264_encoder_open( x264_param_t *param )
x264_predict_8x16c_init( h->param.cpu, h->predict_8x16c );
x264_predict_8x8_init( h->param.cpu, h->predict_8x8, &h->predict_8x8_filter );
x264_predict_4x4_init( h->param.cpu, h->predict_4x4 );
if( h->param.b_cabac )
x264_cabac_init( h );
else
x264_cavlc_init();
x264_pixel_init( h->param.cpu, &h->pixf );
x264_dct_init( h->param.cpu, &h->dctf );
x264_zigzag_init( h->param.cpu, &h->zigzagf_progressive, &h->zigzagf_interlaced );
......@@ -1186,6 +1182,10 @@ x264_t *x264_encoder_open( x264_param_t *param )
x264_deblock_init( h->param.cpu, &h->loopf, PARAM_INTERLACED );
x264_bitstream_init( h->param.cpu, &h->bsf );
x264_dct_init_weights();
if( h->param.b_cabac )
x264_cabac_init( h );
else
x264_cavlc_init( h );
mbcmp_init( h );
chroma_dsp_init( h );
......
......@@ -2013,6 +2013,7 @@ static int check_quant( int cpu_ref, int cpu_new )
int result_c = call_c( qf_c.lastname, dct1+ac, &runlevel_c ); \
int result_a = call_a( qf_a.lastname, dct1+ac, &runlevel_a ); \
if( result_c != result_a || runlevel_c.last != runlevel_a.last || \
runlevel_c.mask != runlevel_a.mask || \
memcmp(runlevel_c.level, runlevel_a.level, sizeof(dctcoef)*result_c) || \
memcmp(runlevel_c.run, runlevel_a.run, sizeof(uint8_t)*(result_c-1)) ) \
{ \
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment