Commit 406a40dc authored by Fiona Glaser

Much faster CABAC RDO

Since RDO doesn't care about what order bit costs are calculated, merge sigmap and level coding into the same loop in RDO.
This is bit-exact for 4x4dct but slightly incorrect for 8x8dct due to the sigmap containing duplicated contexts.
However, the PSNR penalty of this is extremely small (~0.001 dB).
Speed benefit is about 15% in 4x4dct and 30% in 8x8dct residual bit cost calculation at QP20.
Overall encoding speed benefit is up to 5%, depending on encoding settings.
Also remove an old unnecessary CABAC table that hasn't been used for years.
parent 131d066e
......@@ -742,41 +742,6 @@ const uint8_t x264_cabac_renorm_shift[64]= {
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
};
/* 128 probability constants, each converted with FIX8().
 * The values fall monotonically from 0.9812 (index 0) to 0.0187 (index 127);
 * indices 63 and 64 both hold 0.5000, so the table mirrors around its centre.
 * NOTE(review): presumably one entry per CABAC context state -- confirm
 * against whatever still reads this table, if anything does. */
static const uint8_t x264_cabac_probability[128] =
{
    /*   0 */ FIX8(0.9812), FIX8(0.9802), FIX8(0.9792), FIX8(0.9781), FIX8(0.9769), FIX8(0.9757), FIX8(0.9744), FIX8(0.9730),
    /*   8 */ FIX8(0.9716), FIX8(0.9700), FIX8(0.9684), FIX8(0.9667), FIX8(0.9650), FIX8(0.9631), FIX8(0.9611), FIX8(0.9590),
    /*  16 */ FIX8(0.9568), FIX8(0.9545), FIX8(0.9521), FIX8(0.9495), FIX8(0.9468), FIX8(0.9440), FIX8(0.9410), FIX8(0.9378),
    /*  24 */ FIX8(0.9345), FIX8(0.9310), FIX8(0.9273), FIX8(0.9234), FIX8(0.9193), FIX8(0.9150), FIX8(0.9105), FIX8(0.9057),
    /*  32 */ FIX8(0.9006), FIX8(0.8953), FIX8(0.8897), FIX8(0.8838), FIX8(0.8776), FIX8(0.8710), FIX8(0.8641), FIX8(0.8569),
    /*  40 */ FIX8(0.8492), FIX8(0.8411), FIX8(0.8326), FIX8(0.8237), FIX8(0.8143), FIX8(0.8043), FIX8(0.7938), FIX8(0.7828),
    /*  48 */ FIX8(0.7712), FIX8(0.7590), FIX8(0.7461), FIX8(0.7325), FIX8(0.7182), FIX8(0.7031), FIX8(0.6872), FIX8(0.6705),
    /*  56 */ FIX8(0.6528), FIX8(0.6343), FIX8(0.6147), FIX8(0.5941), FIX8(0.5724), FIX8(0.5495), FIX8(0.5254), FIX8(0.5000),
    /*  64 */ FIX8(0.5000), FIX8(0.4746), FIX8(0.4505), FIX8(0.4276), FIX8(0.4059), FIX8(0.3853), FIX8(0.3657), FIX8(0.3472),
    /*  72 */ FIX8(0.3295), FIX8(0.3128), FIX8(0.2969), FIX8(0.2818), FIX8(0.2675), FIX8(0.2539), FIX8(0.2410), FIX8(0.2288),
    /*  80 */ FIX8(0.2172), FIX8(0.2062), FIX8(0.1957), FIX8(0.1857), FIX8(0.1763), FIX8(0.1674), FIX8(0.1589), FIX8(0.1508),
    /*  88 */ FIX8(0.1431), FIX8(0.1359), FIX8(0.1290), FIX8(0.1224), FIX8(0.1162), FIX8(0.1103), FIX8(0.1047), FIX8(0.0994),
    /*  96 */ FIX8(0.0943), FIX8(0.0895), FIX8(0.0850), FIX8(0.0807), FIX8(0.0766), FIX8(0.0727), FIX8(0.0690), FIX8(0.0655),
    /* 104 */ FIX8(0.0622), FIX8(0.0590), FIX8(0.0560), FIX8(0.0532), FIX8(0.0505), FIX8(0.0479), FIX8(0.0455), FIX8(0.0432),
    /* 112 */ FIX8(0.0410), FIX8(0.0389), FIX8(0.0369), FIX8(0.0350), FIX8(0.0333), FIX8(0.0316), FIX8(0.0300), FIX8(0.0284),
    /* 120 */ FIX8(0.0270), FIX8(0.0256), FIX8(0.0243), FIX8(0.0231), FIX8(0.0219), FIX8(0.0208), FIX8(0.0198), FIX8(0.0187)
};
/* -ln2(probability) */
#define F(a,b) {FIX8(a),FIX8(b)}
const uint16_t x264_cabac_entropy[128][2] =
......
......@@ -636,6 +636,7 @@ static const uint8_t coeff_abs_level_transition[2][8] = {
{ 4, 4, 4, 4, 5, 6, 7, 7 }
};
#if !RDO_SKIP_BS
static void block_residual_write_cabac( x264_t *h, x264_cabac_t *cb, int i_ctxBlockCat, int i_idx, int16_t *l, int i_count )
{
const int i_ctx_sig = significant_coeff_flag_offset[h->mb.b_interlaced][i_ctxBlockCat];
......@@ -692,9 +693,7 @@ static void block_residual_write_cabac( x264_t *h, x264_cabac_t *cb, int i_ctxBl
if( i == i_last )
{
i_coeff_abs_m1[i_coeff] = abs(l[i]) - 1;
#if !RDO_SKIP_BS
i_coeff_sign[i_coeff] = l[i] < 0;
#endif
i_coeff++;
}
......@@ -711,15 +710,10 @@ static void block_residual_write_cabac( x264_t *h, x264_cabac_t *cb, int i_ctxBl
{
x264_cabac_encode_decision( cb, ctx, 1 );
ctx = coeff_abs_levelgt1_ctx[node_ctx] + i_ctx_level;
#if RDO_SKIP_BS
cb->f8_bits_encoded += cabac_size_unary[i_prefix][cb->state[ctx]];
cb->state[ctx] = cabac_transition_unary[i_prefix][cb->state[ctx]];
#else
for( i = 0; i < i_prefix - 1; i++ )
x264_cabac_encode_decision( cb, ctx, 1 );
if( i_prefix < 14 )
x264_cabac_encode_decision( cb, ctx, 0 );
#endif
if( i_prefix >= 14 )
x264_cabac_encode_ue_bypass( cb, 0, i_coeff_abs_m1[i_coeff] - 14 );
......@@ -729,18 +723,110 @@ static void block_residual_write_cabac( x264_t *h, x264_cabac_t *cb, int i_ctxBl
{
x264_cabac_encode_decision( cb, ctx, 0 );
node_ctx = coeff_abs_level_transition[0][node_ctx];
#if RDO_SKIP_BS
x264_cabac_encode_bypass( cb, 0 ); // sign
#endif
}
#if !RDO_SKIP_BS
x264_cabac_encode_bypass( cb, i_coeff_sign[i_coeff] );
#endif
} while( i_coeff > 0 );
}
/* 8x8 luma residual: forwards to block_residual_write_cabac() with the
 * block category fixed to DCT_LUMA_8x8 and the coefficient count to 64. */
#define block_residual_write_cabac_8x8( h, cb, idx, l ) \
    block_residual_write_cabac( (h), (cb), DCT_LUMA_8x8, (idx), (l), 64 )
#else
/* Residual bit-cost accounting for RDO, with the significance map and the
 * level coding folded into a single backward pass over the coefficients
 * (from the last nonzero one down to index 0).
 *
 * For 8x8dct this is slightly incorrect: the sigmap is not reversible there
 * because contexts are repeated. The quality penalty is close to nil
 * (~0.001db) and the speed gain (~30%) is considered worth it.
 *
 *   h             encoder state (interlace flag, nnz cache, coeff_last functions)
 *   cb            CABAC state; context states and f8_bits_encoded are updated
 *   i_ctxBlockCat block category, selects the context offset tables
 *   i_idx         block index, used for the coded-block-flag lookup
 *   l             coefficient array
 *   i_count       number of coefficients in the block
 *   b_8x8         nonzero: use the 8x8 context maps and skip the coded block flag
 */
static void ALWAYS_INLINE block_residual_write_cabac_internal( x264_t *h, x264_cabac_t *cb, int i_ctxBlockCat, int i_idx, int16_t *l, int i_count, int b_8x8 )
{
    const int sig_base   = significant_coeff_flag_offset[h->mb.b_interlaced][i_ctxBlockCat];
    const int last_base  = last_coeff_flag_offset[h->mb.b_interlaced][i_ctxBlockCat];
    const int level_base = coeff_abs_level_m1_offset[i_ctxBlockCat];
    /* Per-position sigmap context map for 8x8 blocks; named distinctly so it
     * does not hide the file-level significant_coeff_flag_offset table. */
    const uint8_t *sig_map_8x8 = significant_coeff_flag_offset_8x8[h->mb.b_interlaced];
    int pos, last_pos, abs_m1, prefix, ctx, node_ctx;

    if( !b_8x8 )
    {
        /* coded block flag */
        int b_coded;
        ctx = 85 + x264_cabac_mb_cbf_ctxidxinc( h, i_ctxBlockCat, i_idx );
        b_coded = h->mb.cache.non_zero_count[x264_scan8[i_idx]] != 0;
        x264_cabac_encode_decision( cb, ctx, b_coded );
        if( !b_coded )
            return;
    }

    /* The last nonzero coefficient is handled up front, starting from level
     * node context 0. */
    last_pos = h->quantf.coeff_last[i_ctxBlockCat](l);
    abs_m1 = abs(l[last_pos]) - 1;
    prefix = X264_MIN( abs_m1, 14 );

    /* No sig/last flags are costed when the final position is the last
     * nonzero one. */
    if( last_pos != i_count - 1 )
    {
        x264_cabac_encode_decision( cb, sig_base + (b_8x8 ? sig_map_8x8[last_pos] : last_pos), 1 );
        x264_cabac_encode_decision( cb, last_base + (b_8x8 ? last_coeff_flag_offset_8x8[last_pos] : last_pos), 1 );
    }

    ctx = coeff_abs_level1_ctx[0] + level_base;
    if( !prefix )
    {
        /* |level| == 1 */
        x264_cabac_encode_decision( cb, ctx, 0 );
        node_ctx = coeff_abs_level_transition[0][0];
        x264_cabac_encode_bypass( cb, 0 ); // sign
    }
    else
    {
        x264_cabac_encode_decision( cb, ctx, 1 );
        ctx = coeff_abs_levelgt1_ctx[0] + level_base;
        /* Table-driven cost and state update for the rest of the unary prefix.
         * NOTE(review): no separate sign bypass on this path -- presumably
         * cabac_size_unary already accounts for it; confirm at the table's
         * initialisation. */
        cb->f8_bits_encoded += cabac_size_unary[prefix][cb->state[ctx]];
        cb->state[ctx] = cabac_transition_unary[prefix][cb->state[ctx]];
        if( prefix >= 14 )
            x264_cabac_encode_ue_bypass( cb, 0, abs_m1 - 14 );
        node_ctx = coeff_abs_level_transition[1][0];
    }

    /* Remaining positions, walking down to 0: significance flag first, then
     * (for nonzero coefficients) the last flag and the level. */
    pos = last_pos;
    while( --pos >= 0 )
    {
        const int sig_ctx = sig_base + (b_8x8 ? sig_map_8x8[pos] : pos);

        if( !l[pos] )
        {
            x264_cabac_encode_decision( cb, sig_ctx, 0 );
            continue;
        }

        x264_cabac_encode_decision( cb, sig_ctx, 1 );
        x264_cabac_encode_decision( cb, last_base + (b_8x8 ? last_coeff_flag_offset_8x8[pos] : pos), 0 );
        ctx = coeff_abs_level1_ctx[node_ctx] + level_base;

        /* l[pos] is nonzero here, so (unsigned)(l[pos]+1) <= 2 holds exactly
         * for -1 and +1. */
        if( (unsigned)(l[pos]+1) <= 2 )
        {
            x264_cabac_encode_decision( cb, ctx, 0 );
            node_ctx = coeff_abs_level_transition[0][node_ctx];
            x264_cabac_encode_bypass( cb, 0 );
        }
        else
        {
            abs_m1 = abs(l[pos]) - 1;
            prefix = X264_MIN( abs_m1, 14 );
            x264_cabac_encode_decision( cb, ctx, 1 );
            ctx = coeff_abs_levelgt1_ctx[node_ctx] + level_base;
            cb->f8_bits_encoded += cabac_size_unary[prefix][cb->state[ctx]];
            cb->state[ctx] = cabac_transition_unary[prefix][cb->state[ctx]];
            if( prefix >= 14 )
                x264_cabac_encode_ue_bypass( cb, 0, abs_m1 - 14 );
            node_ctx = coeff_abs_level_transition[1][node_ctx];
        }
    }
}
/* 8x8 luma entry point: calls the shared implementation with the
 * DCT_LUMA_8x8 category, 64 coefficients and the 8x8 flag set. */
static void block_residual_write_cabac_8x8( x264_t *h, x264_cabac_t *cb, int i_idx, int16_t *l )
{
    const int i_coeffs = 64;
    const int b_8x8 = 1;

    block_residual_write_cabac_internal( h, cb, DCT_LUMA_8x8, i_idx, l, i_coeffs, b_8x8 );
}
/* Generic entry point: passes the caller's block category and coefficient
 * count to the shared implementation with the 8x8 flag cleared, so the
 * coded block flag is costed and positions index the contexts directly. */
static void block_residual_write_cabac( x264_t *h, x264_cabac_t *cb, int i_ctxBlockCat, int i_idx, int16_t *l, int i_count )
{
    const int b_8x8 = 0;

    block_residual_write_cabac_internal( h, cb, i_ctxBlockCat, i_idx, l, i_count, b_8x8 );
}
#endif
void x264_macroblock_write_cabac( x264_t *h, x264_cabac_t *cb )
{
......@@ -959,7 +1045,7 @@ void x264_macroblock_write_cabac( x264_t *h, x264_cabac_t *cb )
{
for( i = 0; i < 4; i++ )
if( h->mb.i_cbp_luma & ( 1 << i ) )
block_residual_write_cabac( h, cb, DCT_LUMA_8x8, i, h->dct.luma8x8[i], 64 );
block_residual_write_cabac_8x8( h, cb, i, h->dct.luma8x8[i] );
}
else
{
......@@ -1024,7 +1110,7 @@ static void x264_partition_size_cabac( x264_t *h, x264_cabac_t *cb, int i8, int
if( h->mb.i_cbp_luma & (1 << i8) )
{
if( h->mb.b_transform_8x8 )
block_residual_write_cabac( h, cb, DCT_LUMA_8x8, i8, h->dct.luma8x8[i8], 64 );
block_residual_write_cabac_8x8( h, cb, i8, h->dct.luma8x8[i8] );
else
{
int i4;
......@@ -1063,7 +1149,7 @@ static void x264_partition_i8x8_size_cabac( x264_t *h, x264_cabac_t *cb, int i8,
{
*(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[i8*4]] = 0x0101;
*(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[i8*4+2]] = 0x0101;
block_residual_write_cabac( h, cb, DCT_LUMA_8x8, 4*i8, h->dct.luma8x8[i8], 64 );
block_residual_write_cabac_8x8( h, cb, 4*i8, h->dct.luma8x8[i8] );
}
else
{
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment