Commit 9f027f4f authored by Fiona Glaser

Inline i4x4/i8x8 encode into intra analysis

Larger code size, but faster.
parent a5a6d0ee
......@@ -112,28 +112,6 @@ static ALWAYS_INLINE int array_non_zero( dctcoef *v, int i_count )
return 0;
}
/* Quantize one 4x4 coefficient block in place and report whether any
 * nonzero coefficients remain.  The quant matrix category is chosen from
 * intra vs. inter (b_intra) and plane index p (0 = luma, nonzero = chroma
 * CQM); optional DCT denoising runs first, then either the trellis or the
 * plain quantizer is dispatched. */
static ALWAYS_INLINE int x264_quant_4x4( x264_t *h, dctcoef dct[16], int i_qp, int ctx_block_cat, int b_intra, int p, int idx )
{
    int quant_cat;
    if( b_intra )
        quant_cat = p ? CQM_4IC : CQM_4IY;
    else
        quant_cat = p ? CQM_4PC : CQM_4PY;

    if( h->mb.b_noise_reduction )
        h->quantf.denoise_dct( dct, h->nr_residual_sum[0+!!p*2], h->nr_offset[0+!!p*2], 16 );

    if( h->mb.b_trellis )
        return x264_quant_4x4_trellis( h, dct, quant_cat, i_qp, ctx_block_cat, b_intra, !!p, idx+p*16 );

    return h->quantf.quant_4x4( dct, h->quant4_mf[quant_cat][i_qp], h->quant4_bias[quant_cat][i_qp] );
}
/* Quantize one 8x8 coefficient block in place; returns nonzero iff any
 * coefficient survives.  Mirrors x264_quant_4x4 but with the 8x8 CQM
 * categories, 64-coefficient denoising tables, and idx+p*4 block index
 * for the trellis path. */
static ALWAYS_INLINE int x264_quant_8x8( x264_t *h, dctcoef dct[64], int i_qp, int ctx_block_cat, int b_intra, int p, int idx )
{
    int quant_cat;
    if( b_intra )
        quant_cat = p ? CQM_8IC : CQM_8IY;
    else
        quant_cat = p ? CQM_8PC : CQM_8PY;

    if( h->mb.b_noise_reduction )
        h->quantf.denoise_dct( dct, h->nr_residual_sum[1+!!p*2], h->nr_offset[1+!!p*2], 64 );

    if( h->mb.b_trellis )
        return x264_quant_8x8_trellis( h, dct, quant_cat, i_qp, ctx_block_cat, b_intra, !!p, idx+p*4 );

    return h->quantf.quant_8x8( dct, h->quant8_mf[quant_cat][i_qp], h->quant8_bias[quant_cat][i_qp] );
}
/* All encoding functions must output the correct CBP and NNZ values.
* The entropy coding functions will check CBP first, then NNZ, before
* actually reading the DCT coefficients. NNZ still must be correct even
......@@ -145,99 +123,6 @@ static ALWAYS_INLINE int x264_quant_8x8( x264_t *h, dctcoef dct[64], int i_qp, i
/* This means that decimation can be done merely by adjusting the CBP and NNZ
* rather than memsetting the coefficients. */
/* Encode one intra 4x4 block of plane p: predict, transform, quantize,
 * and reconstruct into the decoded frame, updating the cached NNZ entry
 * and the luma CBP.
 * p      - plane index (0 = luma; nonzero plane uses the chroma CQM —
 *          NOTE(review): presumably only in 4:4:4 encoding; confirm callers)
 * idx    - 4x4 block index within the macroblock (0-15)
 * i_qp   - quantizer for this block
 * i_mode - intra prediction mode */
void x264_mb_encode_i4x4( x264_t *h, int p, int idx, int i_qp, int i_mode )
{
int nz;
pixel *p_src = &h->mb.pic.p_fenc[p][block_idx_xy_fenc[idx]];
pixel *p_dst = &h->mb.pic.p_fdec[p][block_idx_xy_fdec[idx]];
ALIGNED_ARRAY_16( dctcoef, dct4x4,[16] );
/* Run intra prediction into the reconstruction buffer first. */
if( h->mb.b_lossless )
x264_predict_lossless_4x4( h, p_dst, p, idx, i_mode );
else
h->predict_4x4[i_mode]( p_dst );
if( h->mb.b_lossless )
{
/* Lossless: zigzag the raw residual directly; no transform/quant. */
nz = h->zigzagf.sub_4x4( h->dct.luma4x4[p*16+idx], p_src, p_dst );
h->mb.cache.non_zero_count[x264_scan8[p*16+idx]] = nz;
h->mb.i_cbp_luma |= nz<<(idx>>2);
return;
}
h->dctf.sub4x4_dct( dct4x4, p_src, p_dst );
nz = x264_quant_4x4( h, dct4x4, i_qp, ctx_cat_plane[DCT_LUMA_4x4][p], 1, p, idx );
h->mb.cache.non_zero_count[x264_scan8[p*16+idx]] = nz;
if( nz )
{
/* idx>>2 maps the 4x4 block to its 8x8 group: luma CBP has one bit per 8x8. */
h->mb.i_cbp_luma |= 1<<(idx>>2);
h->zigzagf.scan_4x4( h->dct.luma4x4[p*16+idx], dct4x4 );
/* Dequantize and add the inverse transform back so p_dst holds the
 * reconstruction used for neighboring blocks' prediction. */
h->quantf.dequant_4x4( dct4x4, h->dequant4_mf[p?CQM_4IC:CQM_4IY], i_qp );
h->dctf.add4x4_idct( p_dst, dct4x4 );
}
}
/* Set the four cached NNZ entries covering 8x8 block 'idx' of plane 'p'
 * to (nz).  Each 16-bit store covers one row of two 4x4 scan8 slots;
 * (nz) is expected to be 0 or 1 so (nz)*0x0101 yields 0x0000 or 0x0101. */
#define STORE_8x8_NNZ( p, idx, nz )\
do\
{\
M16( &h->mb.cache.non_zero_count[x264_scan8[p*16+idx*4]+0] ) = (nz) * 0x0101;\
M16( &h->mb.cache.non_zero_count[x264_scan8[p*16+idx*4]+8] ) = (nz) * 0x0101;\
} while(0)
/* Zero all sixteen cached NNZ entries of plane 'p': four 32-bit stores,
 * each covering one row of four 4x4 scan8 slots. */
#define CLEAR_16x16_NNZ( p ) \
do\
{\
M32( &h->mb.cache.non_zero_count[x264_scan8[16*p+ 0]] ) = 0;\
M32( &h->mb.cache.non_zero_count[x264_scan8[16*p+ 2]] ) = 0;\
M32( &h->mb.cache.non_zero_count[x264_scan8[16*p+ 8]] ) = 0;\
M32( &h->mb.cache.non_zero_count[x264_scan8[16*p+10]] ) = 0;\
} while(0)
/* Encode one intra 8x8 block of plane p: build the filtered prediction
 * edge if the caller did not pass one, predict, transform, quantize, and
 * reconstruct, updating the four cached NNZ entries and the luma CBP bit.
 * idx  - 8x8 block index within the macroblock (0-3)
 * edge - prefiltered edge pixels, or NULL to filter locally */
void x264_mb_encode_i8x8( x264_t *h, int p, int idx, int i_qp, int i_mode, pixel *edge )
{
int x = idx&1;
int y = idx>>1;
int nz;
pixel *p_src = &h->mb.pic.p_fenc[p][8*x + 8*y*FENC_STRIDE];
pixel *p_dst = &h->mb.pic.p_fdec[p][8*x + 8*y*FDEC_STRIDE];
ALIGNED_ARRAY_16( dctcoef, dct8x8,[64] );
ALIGNED_ARRAY_32( pixel, edge_buf,[36] );
/* No caller-supplied edge: filter the neighboring pixels for this mode. */
if( !edge )
{
h->predict_8x8_filter( p_dst, edge_buf, h->mb.i_neighbour8[idx], x264_pred_i4x4_neighbors[i_mode] );
edge = edge_buf;
}
if( h->mb.b_lossless )
x264_predict_lossless_8x8( h, p_dst, p, idx, i_mode, edge );
else
h->predict_8x8[i_mode]( p_dst, edge );
if( h->mb.b_lossless )
{
/* Lossless: zigzag the raw residual; no transform/quant. */
nz = h->zigzagf.sub_8x8( h->dct.luma8x8[p*4+idx], p_src, p_dst );
STORE_8x8_NNZ( p, idx, nz );
h->mb.i_cbp_luma |= nz<<idx;
return;
}
h->dctf.sub8x8_dct8( dct8x8, p_src, p_dst );
nz = x264_quant_8x8( h, dct8x8, i_qp, ctx_cat_plane[DCT_LUMA_8x8][p], 1, p, idx );
if( nz )
{
h->mb.i_cbp_luma |= 1<<idx;
h->zigzagf.scan_8x8( h->dct.luma8x8[p*4+idx], dct8x8 );
/* Reconstruct so later blocks predict from the decoded pixels. */
h->quantf.dequant_8x8( dct8x8, h->dequant8_mf[p?CQM_8IC:CQM_8IY], i_qp );
h->dctf.add8x8_idct8( p_dst, dct8x8 );
STORE_8x8_NNZ( p, idx, 1 );
}
else
STORE_8x8_NNZ( p, idx, 0 );
}
static void x264_mb_encode_i16x16( x264_t *h, int p, int i_qp )
{
pixel *p_src = h->mb.pic.p_fenc[p];
......
......@@ -52,8 +52,6 @@ void x264_macroblock_write_cavlc ( x264_t *h );
void x264_macroblock_encode_p8x8( x264_t *h, int i8 );
void x264_macroblock_encode_p4x4( x264_t *h, int i4 );
void x264_mb_encode_i4x4( x264_t *h, int p, int idx, int i_qp, int i_mode );
void x264_mb_encode_i8x8( x264_t *h, int p, int idx, int i_qp, int i_mode, pixel *edge );
void x264_mb_encode_chroma( x264_t *h, int b_inter, int i_qp );
void x264_cabac_mb_skip( x264_t *h, int b_skip );
......@@ -68,5 +66,120 @@ int x264_quant_8x8_trellis( x264_t *h, dctcoef *dct, int i_quant_cat,
void x264_noise_reduction_update( x264_t *h );
/* Quantize a 4x4 residual block in place; returns nonzero iff any
 * coefficient remains after quantization.  The CQM category is picked
 * from the intra/inter flag and the plane (p nonzero selects chroma);
 * optional denoising is applied before the trellis or plain quantizer. */
static ALWAYS_INLINE int x264_quant_4x4( x264_t *h, dctcoef dct[16], int i_qp, int ctx_block_cat, int b_intra, int p, int idx )
{
    int cat = b_intra ? (p ? CQM_4IC : CQM_4IY)
                      : (p ? CQM_4PC : CQM_4PY);

    if( h->mb.b_noise_reduction )
        h->quantf.denoise_dct( dct, h->nr_residual_sum[0+!!p*2], h->nr_offset[0+!!p*2], 16 );

    if( !h->mb.b_trellis )
        return h->quantf.quant_4x4( dct, h->quant4_mf[cat][i_qp], h->quant4_bias[cat][i_qp] );

    return x264_quant_4x4_trellis( h, dct, cat, i_qp, ctx_block_cat, b_intra, !!p, idx+p*16 );
}
/* Quantize an 8x8 residual block in place; returns nonzero iff any
 * coefficient remains.  Same structure as x264_quant_4x4 but with the
 * 8x8 CQM categories, 64-coefficient denoise tables, and idx+p*4 for
 * the trellis block index. */
static ALWAYS_INLINE int x264_quant_8x8( x264_t *h, dctcoef dct[64], int i_qp, int ctx_block_cat, int b_intra, int p, int idx )
{
    int cat = b_intra ? (p ? CQM_8IC : CQM_8IY)
                      : (p ? CQM_8PC : CQM_8PY);

    if( h->mb.b_noise_reduction )
        h->quantf.denoise_dct( dct, h->nr_residual_sum[1+!!p*2], h->nr_offset[1+!!p*2], 64 );

    if( !h->mb.b_trellis )
        return h->quantf.quant_8x8( dct, h->quant8_mf[cat][i_qp], h->quant8_bias[cat][i_qp] );

    return x264_quant_8x8_trellis( h, dct, cat, i_qp, ctx_block_cat, b_intra, !!p, idx+p*4 );
}
/* Write (nz) into the four cached NNZ slots of 8x8 block 'idx', plane 'p'.
 * The 2x2 group of 4x4 scan8 entries is filled with two 16-bit stores;
 * (nz) must be 0 or 1 for the 0x0101 replication to be valid. */
#define STORE_8x8_NNZ( p, idx, nz )\
do\
{\
M16( &h->mb.cache.non_zero_count[x264_scan8[p*16+idx*4]+0] ) = (nz) * 0x0101;\
M16( &h->mb.cache.non_zero_count[x264_scan8[p*16+idx*4]+8] ) = (nz) * 0x0101;\
} while(0)
/* Clear the sixteen cached NNZ entries of plane 'p' in four 32-bit
 * stores, one per row of four 4x4 scan8 slots. */
#define CLEAR_16x16_NNZ( p ) \
do\
{\
M32( &h->mb.cache.non_zero_count[x264_scan8[16*p+ 0]] ) = 0;\
M32( &h->mb.cache.non_zero_count[x264_scan8[16*p+ 2]] ) = 0;\
M32( &h->mb.cache.non_zero_count[x264_scan8[16*p+ 8]] ) = 0;\
M32( &h->mb.cache.non_zero_count[x264_scan8[16*p+10]] ) = 0;\
} while(0)
/* Encode one intra 4x4 block of plane p (inlined into intra analysis for
 * speed): predict, transform, quantize, and reconstruct, updating the
 * cached NNZ entry and the luma CBP.
 * idx    - 4x4 block index within the macroblock (0-15)
 * i_mode - intra prediction mode */
static ALWAYS_INLINE void x264_mb_encode_i4x4( x264_t *h, int p, int idx, int i_qp, int i_mode )
{
int nz;
pixel *p_src = &h->mb.pic.p_fenc[p][block_idx_xy_fenc[idx]];
pixel *p_dst = &h->mb.pic.p_fdec[p][block_idx_xy_fdec[idx]];
ALIGNED_ARRAY_16( dctcoef, dct4x4,[16] );
/* Intra prediction into the reconstruction buffer. */
if( h->mb.b_lossless )
x264_predict_lossless_4x4( h, p_dst, p, idx, i_mode );
else
h->predict_4x4[i_mode]( p_dst );
if( h->mb.b_lossless )
{
/* Lossless path: zigzag the raw residual, skip transform/quant. */
nz = h->zigzagf.sub_4x4( h->dct.luma4x4[p*16+idx], p_src, p_dst );
h->mb.cache.non_zero_count[x264_scan8[p*16+idx]] = nz;
h->mb.i_cbp_luma |= nz<<(idx>>2);
return;
}
h->dctf.sub4x4_dct( dct4x4, p_src, p_dst );
nz = x264_quant_4x4( h, dct4x4, i_qp, ctx_cat_plane[DCT_LUMA_4x4][p], 1, p, idx );
h->mb.cache.non_zero_count[x264_scan8[p*16+idx]] = nz;
if( nz )
{
/* Luma CBP has one bit per 8x8 group; idx>>2 selects the group. */
h->mb.i_cbp_luma |= 1<<(idx>>2);
h->zigzagf.scan_4x4( h->dct.luma4x4[p*16+idx], dct4x4 );
/* Reconstruct so neighboring blocks predict from decoded pixels. */
h->quantf.dequant_4x4( dct4x4, h->dequant4_mf[p?CQM_4IC:CQM_4IY], i_qp );
h->dctf.add4x4_idct( p_dst, dct4x4 );
}
}
/* Encode one intra 8x8 block of plane p (inlined into intra analysis for
 * speed): build the filtered prediction edge if none was supplied,
 * predict, transform, quantize, and reconstruct, updating the four
 * cached NNZ entries and the per-8x8 luma CBP bit.
 * idx  - 8x8 block index within the macroblock (0-3)
 * edge - prefiltered edge pixels, or NULL to filter locally */
static ALWAYS_INLINE void x264_mb_encode_i8x8( x264_t *h, int p, int idx, int i_qp, int i_mode, pixel *edge )
{
int x = idx&1;
int y = idx>>1;
int nz;
pixel *p_src = &h->mb.pic.p_fenc[p][8*x + 8*y*FENC_STRIDE];
pixel *p_dst = &h->mb.pic.p_fdec[p][8*x + 8*y*FDEC_STRIDE];
ALIGNED_ARRAY_16( dctcoef, dct8x8,[64] );
ALIGNED_ARRAY_32( pixel, edge_buf,[36] );
/* Caller gave no edge: filter neighbors needed by this prediction mode. */
if( !edge )
{
h->predict_8x8_filter( p_dst, edge_buf, h->mb.i_neighbour8[idx], x264_pred_i4x4_neighbors[i_mode] );
edge = edge_buf;
}
if( h->mb.b_lossless )
x264_predict_lossless_8x8( h, p_dst, p, idx, i_mode, edge );
else
h->predict_8x8[i_mode]( p_dst, edge );
if( h->mb.b_lossless )
{
/* Lossless path: zigzag the raw residual, skip transform/quant. */
nz = h->zigzagf.sub_8x8( h->dct.luma8x8[p*4+idx], p_src, p_dst );
STORE_8x8_NNZ( p, idx, nz );
h->mb.i_cbp_luma |= nz<<idx;
return;
}
h->dctf.sub8x8_dct8( dct8x8, p_src, p_dst );
nz = x264_quant_8x8( h, dct8x8, i_qp, ctx_cat_plane[DCT_LUMA_8x8][p], 1, p, idx );
if( nz )
{
h->mb.i_cbp_luma |= 1<<idx;
h->zigzagf.scan_8x8( h->dct.luma8x8[p*4+idx], dct8x8 );
/* Reconstruct so later blocks predict from the decoded pixels. */
h->quantf.dequant_8x8( dct8x8, h->dequant8_mf[p?CQM_8IC:CQM_8IY], i_qp );
h->dctf.add8x8_idct8( p_dst, dct8x8 );
STORE_8x8_NNZ( p, idx, 1 );
}
else
STORE_8x8_NNZ( p, idx, 0 );
}
#endif
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment.