Commit c656d68f authored by Fiona Glaser's avatar Fiona Glaser

Much faster CABAC residual context selection

Up to ~17% faster CABAC RDO, ~36% faster intra-only CABAC RDO.
Up to 7% faster overall in extreme cases.
parent 5a7a1d14
......@@ -505,92 +505,43 @@ static inline void x264_cabac_mb8x8_mvd( x264_t *h, x264_cabac_t *cb, int i_list
}
}
static int x264_cabac_mb_cbf_ctxidxinc( x264_t *h, int i_cat, int i_idx )
/* i_ctxBlockCat: 0-> DC 16x16 i_idx = 0
* 1-> AC 16x16 i_idx = luma4x4idx
* 2-> Luma4x4 i_idx = luma4x4idx
* 3-> DC Chroma i_idx = iCbCr
* 4-> AC Chroma i_idx = 4 * iCbCr + chroma4x4idx
* 5-> Luma8x8 i_idx = luma8x8idx
*/
static int ALWAYS_INLINE x264_cabac_mb_cbf_ctxidxinc( x264_t *h, int i_cat, int i_idx )
{
/* i_ctxBlockCat: 0-> DC 16x16 i_idx = 0
* 1-> AC 16x16 i_idx = luma4x4idx
* 2-> Luma4x4 i_idx = luma4x4idx
* 3-> DC Chroma i_idx = iCbCr
* 4-> AC Chroma i_idx = 4 * iCbCr + chroma4x4idx
* 5-> Luma8x8 i_idx = luma8x8idx
*/
int i_mba_xy = -1;
int i_mbb_xy = -1;
int i_nza = 0;
int i_nzb = 0;
int i_nza;
int i_nzb;
int b_intra = IS_INTRA( h->mb.i_type );
switch( i_cat )
{
case DCT_LUMA_DC:
if( h->mb.i_neighbour & MB_LEFT )
{
i_mba_xy = h->mb.i_mb_xy - 1;
i_nza = h->mb.cbp[i_mba_xy] & 0x100;
}
if( h->mb.i_neighbour & MB_TOP )
{
i_mbb_xy = h->mb.i_mb_top_xy;
i_nzb = h->mb.cbp[i_mbb_xy] & 0x100;
}
break;
case DCT_LUMA_AC:
case DCT_LUMA_4x4:
if( i_idx & ~10 ) // block_idx_x > 0
i_mba_xy = h->mb.i_mb_xy;
else if( h->mb.i_neighbour & MB_LEFT )
i_mba_xy = h->mb.i_mb_xy - 1;
if( i_idx & ~5 ) // block_idx_y > 0
i_mbb_xy = h->mb.i_mb_xy;
else if( h->mb.i_neighbour & MB_TOP )
i_mbb_xy = h->mb.i_mb_top_xy;
case DCT_CHROMA_AC:
/* no need to test for skip/pcm */
if( i_mba_xy >= 0 )
i_nza = h->mb.cache.non_zero_count[x264_scan8[i_idx] - 1];
if( i_mbb_xy >= 0 )
i_nzb = h->mb.cache.non_zero_count[x264_scan8[i_idx] - 8];
break;
i_nza = h->mb.cache.non_zero_count[x264_scan8[i_idx] - 1];
i_nzb = h->mb.cache.non_zero_count[x264_scan8[i_idx] - 8];
i_nza &= 0x7f + (b_intra << 7);
i_nzb &= 0x7f + (b_intra << 7);
return 4*i_cat + 2*!!i_nzb + !!i_nza;
case DCT_LUMA_DC:
/* Note: this depends on the exact values of MB_LEFT and MB_TOP enums */
i_nza = ((h->mb.cbp[h->mb.i_mb_xy - 1] >> 8) | ~h->mb.i_neighbour) & 1;
i_nzb = ((h->mb.cbp[h->mb.i_mb_top_xy] >> 7) | ~h->mb.i_neighbour) & 2;
return 4*i_cat + i_nzb + i_nza;
case DCT_CHROMA_DC:
/* no need to test skip/pcm */
i_idx -= 25;
if( h->mb.i_neighbour & MB_LEFT )
{
i_mba_xy = h->mb.i_mb_xy - 1;
i_nza = h->mb.cbp[i_mba_xy] & (0x200 << i_idx);
}
if( h->mb.i_neighbour & MB_TOP )
{
i_mbb_xy = h->mb.i_mb_top_xy;
i_nzb = h->mb.cbp[i_mbb_xy] & (0x200 << i_idx);
}
break;
case DCT_CHROMA_AC:
if( i_idx & 1 )
i_mba_xy = h->mb.i_mb_xy;
else if( h->mb.i_neighbour & MB_LEFT )
i_mba_xy = h->mb.i_mb_xy - 1;
if( i_idx & 2 )
i_mbb_xy = h->mb.i_mb_xy;
else if( h->mb.i_neighbour & MB_TOP )
i_mbb_xy = h->mb.i_mb_top_xy;
/* no need to test skip/pcm */
if( i_mba_xy >= 0 )
i_nza = h->mb.cache.non_zero_count[x264_scan8[i_idx] - 1];
if( i_mbb_xy >= 0 )
i_nzb = h->mb.cache.non_zero_count[x264_scan8[i_idx] - 8];
i_nza = h->mb.i_neighbour & MB_LEFT ? (h->mb.cbp[h->mb.i_mb_xy - 1] >> (9 + i_idx)) & 1 : b_intra;
i_nzb = h->mb.i_neighbour & MB_TOP ? (h->mb.cbp[h->mb.i_mb_top_xy] >> (9 + i_idx)) & 1 : b_intra;
return 4*i_cat + 2*i_nzb + i_nza;
}
if( IS_INTRA( h->mb.i_type ) )
{
i_nza |= i_mba_xy < 0;
i_nzb |= i_mbb_xy < 0;
}
return 4*i_cat + 2*!!i_nzb + !!i_nza;
}
......@@ -637,7 +588,7 @@ static const uint8_t coeff_abs_level_transition[2][8] = {
};
#if !RDO_SKIP_BS
static void block_residual_write_cabac( x264_t *h, x264_cabac_t *cb, int i_ctxBlockCat, int i_idx, int16_t *l, int i_count )
static void block_residual_write_cabac( x264_t *h, x264_cabac_t *cb, int i_ctxBlockCat, int i_idx, int16_t *l, int i_count, int ctxidxinc )
{
const int i_ctx_sig = significant_coeff_flag_offset[h->mb.b_interlaced][i_ctxBlockCat];
const int i_ctx_last = last_coeff_flag_offset[h->mb.b_interlaced][i_ctxBlockCat];
......@@ -655,7 +606,7 @@ static void block_residual_write_cabac( x264_t *h, x264_cabac_t *cb, int i_ctxBl
if( i_count != 64 )
{
/* coded block flag */
int ctx = 85 + x264_cabac_mb_cbf_ctxidxinc( h, i_ctxBlockCat, i_idx );
int ctx = 85 + ctxidxinc;
if( h->mb.cache.non_zero_count[x264_scan8[i_idx]] )
x264_cabac_encode_decision( cb, ctx, 1 );
else
......@@ -736,7 +687,7 @@ static void block_residual_write_cabac( x264_t *h, x264_cabac_t *cb, int i_ctxBl
* this is slightly incorrect because the sigmap is not reversible
* (contexts are repeated). However, there is nearly no quality penalty
* for this (~0.001db) and the speed boost (~30%) is worth it. */
static void ALWAYS_INLINE block_residual_write_cabac_internal( x264_t *h, x264_cabac_t *cb, int i_ctxBlockCat, int i_idx, int16_t *l, int i_count, int b_8x8 )
static void ALWAYS_INLINE block_residual_write_cabac_internal( x264_t *h, x264_cabac_t *cb, int i_ctxBlockCat, int i_idx, int16_t *l, int i_count, int ctxidxinc, int b_8x8 )
{
const int i_ctx_sig = significant_coeff_flag_offset[h->mb.b_interlaced][i_ctxBlockCat];
const int i_ctx_last = last_coeff_flag_offset[h->mb.b_interlaced][i_ctxBlockCat];
......@@ -747,7 +698,7 @@ static void ALWAYS_INLINE block_residual_write_cabac_internal( x264_t *h, x264_c
if( !b_8x8 )
{
/* coded block flag */
ctx = 85 + x264_cabac_mb_cbf_ctxidxinc( h, i_ctxBlockCat, i_idx );
ctx = 85 + ctxidxinc;
if( h->mb.cache.non_zero_count[x264_scan8[i_idx]] )
x264_cabac_encode_decision( cb, ctx, 1 );
else
......@@ -820,14 +771,20 @@ static void ALWAYS_INLINE block_residual_write_cabac_internal( x264_t *h, x264_c
static void block_residual_write_cabac_8x8( x264_t *h, x264_cabac_t *cb, int i_idx, int16_t *l )
{
block_residual_write_cabac_internal( h, cb, DCT_LUMA_8x8, i_idx, l, 64, 1 );
block_residual_write_cabac_internal( h, cb, DCT_LUMA_8x8, i_idx, l, 64, 0, 1 );
}
static void block_residual_write_cabac( x264_t *h, x264_cabac_t *cb, int i_ctxBlockCat, int i_idx, int16_t *l, int i_count )
static void block_residual_write_cabac( x264_t *h, x264_cabac_t *cb, int i_ctxBlockCat, int i_idx, int16_t *l, int i_count, int ctxidxinc )
{
block_residual_write_cabac_internal( h, cb, i_ctxBlockCat, i_idx, l, i_count, 0 );
block_residual_write_cabac_internal( h, cb, i_ctxBlockCat, i_idx, l, i_count, ctxidxinc, 0 );
}
#endif
#define block_residual_write_cabac( h, cb, i_ctxBlockCat, i_idx, l, i_count ) \
{ \
int ctxidxinc = x264_cabac_mb_cbf_ctxidxinc( h, i_ctxBlockCat, i_idx); \
block_residual_write_cabac( h, cb, i_ctxBlockCat, i_idx, l, i_count, ctxidxinc ); \
}
void x264_macroblock_write_cabac( x264_t *h, x264_cabac_t *cb )
{
const int i_mb_type = h->mb.i_type;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment