Commit 2bcc39fd authored by Fiona Glaser

Various CABAC optimizations

Move calculation of b_intra out of the core residual loop and hardcode it where applicable.
Inlining cabac_mb_mvd was unnecessary and wasted a tremendous amount of code size. Inlining only cache_mvd is faster and significantly smaller.
parent bf749f76
......@@ -444,7 +444,7 @@ static inline void x264_cabac_mb_mvd_cpn( x264_t *h, x264_cabac_t *cb, int i_lis
}
}
static inline void x264_cabac_mb_mvd( x264_t *h, x264_cabac_t *cb, int i_list, int idx, int width, int height )
static NOINLINE uint32_t x264_cabac_mb_mvd( x264_t *h, x264_cabac_t *cb, int i_list, int idx, int width, int height )
{
DECLARE_ALIGNED_4( int16_t mvp[2] );
int mdx, mdy;
......@@ -458,8 +458,13 @@ static inline void x264_cabac_mb_mvd( x264_t *h, x264_cabac_t *cb, int i_list, i
x264_cabac_mb_mvd_cpn( h, cb, i_list, idx, 0, mdx );
x264_cabac_mb_mvd_cpn( h, cb, i_list, idx, 1, mdy );
/* save value */
x264_macroblock_cache_mvd( h, block_idx_x[idx], block_idx_y[idx], width, height, i_list, pack16to32_mask(mdx,mdy) );
return pack16to32_mask(mdx,mdy);
}
/* Statement-like wrapper around the x264_cabac_mb_mvd() function.
 * It calls the function (a macro's own name is not re-expanded inside its
 * expansion, so the inner call resolves to the function) and passes the
 * returned packed value to x264_macroblock_cache_mvd(), so only the cache
 * call is expanded at each call site.
 *
 * The body is wrapped in do { } while(0) so that "x264_cabac_mb_mvd(...);"
 * forms exactly one statement and remains valid as the unbraced body of an
 * if that is followed by an else.
 *
 * Note: i_list, idx, width and height are expanded more than once; pass
 * expressions without side effects. */
#define x264_cabac_mb_mvd(h,cb,i_list,idx,width,height)\
do\
{\
    uint32_t mvd = x264_cabac_mb_mvd(h,cb,i_list,idx,width,height);\
    x264_macroblock_cache_mvd( h, block_idx_x[idx], block_idx_y[idx], width, height, i_list, mvd );\
} while(0)
static inline void x264_cabac_mb8x8_mvd( x264_t *h, x264_cabac_t *cb, int i_list, int i )
......@@ -505,11 +510,10 @@ static inline void x264_cabac_mb8x8_mvd( x264_t *h, x264_cabac_t *cb, int i_list
* 5-> Luma8x8 i_idx = luma8x8idx
*/
static int ALWAYS_INLINE x264_cabac_mb_cbf_ctxidxinc( x264_t *h, int i_cat, int i_idx )
static int ALWAYS_INLINE x264_cabac_mb_cbf_ctxidxinc( x264_t *h, int i_cat, int i_idx, int b_intra )
{
int i_nza;
int i_nzb;
int b_intra = IS_INTRA( h->mb.i_type );
switch( i_cat )
{
......@@ -672,7 +676,7 @@ static void block_residual_write_cabac( x264_t *h, x264_cabac_t *cb, int i_ctxBl
x264_cabac_encode_bypass( cb, i_coeff_sign[i_coeff] );
} while( i_coeff > 0 );
}
#define block_residual_write_cabac_8x8( h, cb, idx, l ) block_residual_write_cabac( h, cb, DCT_LUMA_8x8, idx, l, 64 )
#define block_residual_write_cabac_8x8( h, cb, idx, l ) block_residual_write_cabac( h, cb, DCT_LUMA_8x8, idx, l, 64, 0 )
#else
......@@ -784,9 +788,9 @@ static void block_residual_write_cabac( x264_t *h, x264_cabac_t *cb, int i_ctxBl
}
#endif
#define block_residual_write_cabac( h, cb, i_ctxBlockCat, i_idx, l, i_count ) \
#define block_residual_write_cabac( h, cb, i_ctxBlockCat, i_idx, l, i_count, b_intra ) \
{ \
int ctxidxinc = x264_cabac_mb_cbf_ctxidxinc( h, i_ctxBlockCat, i_idx); \
int ctxidxinc = x264_cabac_mb_cbf_ctxidxinc( h, i_ctxBlockCat, i_idx, b_intra ); \
block_residual_write_cabac( h, cb, i_ctxBlockCat, i_idx, l, i_count, ctxidxinc ); \
}
......@@ -990,18 +994,19 @@ void x264_macroblock_write_cabac( x264_t *h, x264_cabac_t *cb )
if( h->mb.i_cbp_luma > 0 || h->mb.i_cbp_chroma > 0 || i_mb_type == I_16x16 )
{
const int b_intra = IS_INTRA( i_mb_type );
x264_cabac_mb_qp_delta( h, cb );
/* write residual */
if( i_mb_type == I_16x16 )
{
/* DC Luma */
block_residual_write_cabac( h, cb, DCT_LUMA_DC, 24, h->dct.luma16x16_dc, 16 );
block_residual_write_cabac( h, cb, DCT_LUMA_DC, 24, h->dct.luma16x16_dc, 16, 1 );
/* AC Luma */
if( h->mb.i_cbp_luma != 0 )
for( i = 0; i < 16; i++ )
block_residual_write_cabac( h, cb, DCT_LUMA_AC, i, h->dct.luma4x4[i]+1, 15 );
block_residual_write_cabac( h, cb, DCT_LUMA_AC, i, h->dct.luma4x4[i]+1, 15, 1 );
}
else if( h->mb.b_transform_8x8 )
{
......@@ -1013,18 +1018,18 @@ void x264_macroblock_write_cabac( x264_t *h, x264_cabac_t *cb )
{
for( i = 0; i < 16; i++ )
if( h->mb.i_cbp_luma & ( 1 << ( i / 4 ) ) )
block_residual_write_cabac( h, cb, DCT_LUMA_4x4, i, h->dct.luma4x4[i], 16 );
block_residual_write_cabac( h, cb, DCT_LUMA_4x4, i, h->dct.luma4x4[i], 16, b_intra );
}
if( h->mb.i_cbp_chroma &0x03 ) /* Chroma DC residual present */
{
block_residual_write_cabac( h, cb, DCT_CHROMA_DC, 25, h->dct.chroma_dc[0], 4 );
block_residual_write_cabac( h, cb, DCT_CHROMA_DC, 26, h->dct.chroma_dc[1], 4 );
block_residual_write_cabac( h, cb, DCT_CHROMA_DC, 25, h->dct.chroma_dc[0], 4, b_intra );
block_residual_write_cabac( h, cb, DCT_CHROMA_DC, 26, h->dct.chroma_dc[1], 4, b_intra );
}
if( h->mb.i_cbp_chroma&0x02 ) /* Chroma AC residual present */
{
for( i = 16; i < 24; i++ )
block_residual_write_cabac( h, cb, DCT_CHROMA_AC, i, h->dct.luma4x4[i]+1, 15 );
block_residual_write_cabac( h, cb, DCT_CHROMA_AC, i, h->dct.luma4x4[i]+1, 15, b_intra );
}
}
......@@ -1050,7 +1055,9 @@ static void x264_partition_size_cabac( x264_t *h, x264_cabac_t *cb, int i8, int
if( i_mb_type == P_8x8 )
x264_cabac_mb8x8_mvd( h, cb, 0, i8 );
else if( i_mb_type == P_L0 )
{
x264_cabac_mb_mvd( h, cb, 0, 4*i8, 4>>b_8x16, 2<<b_8x16 );
}
else if( i_mb_type > B_DIRECT && i_mb_type < B_8x8 )
{
if( x264_mb_type_list_table[ i_mb_type ][0][!!i8] ) x264_cabac_mb_mvd( h, cb, 0, 4*i8, 4>>b_8x16, 2<<b_8x16 );
......@@ -1077,12 +1084,12 @@ static void x264_partition_size_cabac( x264_t *h, x264_cabac_t *cb, int i8, int
{
int i4;
for( i4 = 0; i4 < 4; i4++ )
block_residual_write_cabac( h, cb, DCT_LUMA_4x4, i4+i8*4, h->dct.luma4x4[i4+i8*4], 16 );
block_residual_write_cabac( h, cb, DCT_LUMA_4x4, i4+i8*4, h->dct.luma4x4[i4+i8*4], 16, 0 );
}
}
block_residual_write_cabac( h, cb, DCT_CHROMA_AC, 16+i8, h->dct.luma4x4[16+i8]+1, 15 );
block_residual_write_cabac( h, cb, DCT_CHROMA_AC, 20+i8, h->dct.luma4x4[20+i8]+1, 15 );
block_residual_write_cabac( h, cb, DCT_CHROMA_AC, 16+i8, h->dct.luma4x4[16+i8]+1, 15, 0 );
block_residual_write_cabac( h, cb, DCT_CHROMA_AC, 20+i8, h->dct.luma4x4[20+i8]+1, 15, 0 );
i8 += x264_pixel_size[i_pixel].h >> 3;
}
......@@ -1091,13 +1098,15 @@ static void x264_partition_size_cabac( x264_t *h, x264_cabac_t *cb, int i8, int
static void x264_subpartition_size_cabac( x264_t *h, x264_cabac_t *cb, int i4, int i_pixel )
{
int b_8x4 = i_pixel == PIXEL_8x4;
block_residual_write_cabac( h, cb, DCT_LUMA_4x4, i4, h->dct.luma4x4[i4], 16 );
block_residual_write_cabac( h, cb, DCT_LUMA_4x4, i4, h->dct.luma4x4[i4], 16, 0 );
if( i_pixel == PIXEL_4x4 )
{
x264_cabac_mb_mvd( h, cb, 0, i4, 1, 1 );
}
else
{
x264_cabac_mb_mvd( h, cb, 0, i4, 1+b_8x4, 2-b_8x4 );
block_residual_write_cabac( h, cb, DCT_LUMA_4x4, i4+2-b_8x4, h->dct.luma4x4[i4+2-b_8x4], 16 );
block_residual_write_cabac( h, cb, DCT_LUMA_4x4, i4+2-b_8x4, h->dct.luma4x4[i4+2-b_8x4], 16, 0 );
}
}
......@@ -1116,7 +1125,7 @@ static void x264_partition_i4x4_size_cabac( x264_t *h, x264_cabac_t *cb, int i4,
const int i_pred = x264_mb_predict_intra4x4_mode( h, i4 );
i_mode = x264_mb_pred_mode4x4_fix( i_mode );
x264_cabac_mb_intra4x4_pred_mode( cb, i_pred, i_mode );
block_residual_write_cabac( h, cb, DCT_LUMA_4x4, i4, h->dct.luma4x4[i4], 16 );
block_residual_write_cabac( h, cb, DCT_LUMA_4x4, i4, h->dct.luma4x4[i4], 16, 1 );
}
static void x264_i8x8_chroma_size_cabac( x264_t *h, x264_cabac_t *cb )
......@@ -1125,14 +1134,14 @@ static void x264_i8x8_chroma_size_cabac( x264_t *h, x264_cabac_t *cb )
x264_cabac_mb_cbp_chroma( h, cb );
if( h->mb.i_cbp_chroma > 0 )
{
block_residual_write_cabac( h, cb, DCT_CHROMA_DC, 25, h->dct.chroma_dc[0], 4 );
block_residual_write_cabac( h, cb, DCT_CHROMA_DC, 26, h->dct.chroma_dc[1], 4 );
block_residual_write_cabac( h, cb, DCT_CHROMA_DC, 25, h->dct.chroma_dc[0], 4, 1 );
block_residual_write_cabac( h, cb, DCT_CHROMA_DC, 26, h->dct.chroma_dc[1], 4, 1 );
if( h->mb.i_cbp_chroma == 2 )
{
int i;
for( i = 16; i < 24; i++ )
block_residual_write_cabac( h, cb, DCT_CHROMA_AC, i, h->dct.luma4x4[i]+1, 15 );
block_residual_write_cabac( h, cb, DCT_CHROMA_AC, i, h->dct.luma4x4[i]+1, 15, 1 );
}
}
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment