Commit a59f4a7b authored by Fiona Glaser

Optimizations and cosmetics in macroblock.c

If an i4x4 dct block has no coefficients, don't bother with dequant/zigzag/idct.  Not useful for larger sizes because the odds of an empty block are much lower.
Cosmetics in i16x16 to be more consistent with other similar functions.
Add an SSD threshold for chroma in probe_skip to improve speed and minimize time spent on chroma skip analysis.
Rename lambda arrays to lambda_tab for consistency.
parent ed9a9f31
......@@ -128,7 +128,7 @@ typedef struct
} x264_mb_analysis_t;
/* lambda = pow(2,qp/6-2) */
static const int i_qp0_cost_table[52] = {
const int x264_lambda_tab[52] = {
1, 1, 1, 1, 1, 1, 1, 1, /* 0-7 */
1, 1, 1, 1, /* 8-11 */
1, 1, 1, 1, 2, 2, 2, 2, /* 12-19 */
......@@ -139,7 +139,7 @@ static const int i_qp0_cost_table[52] = {
};
/* lambda2 = pow(lambda,2) * .9 * 256 */
static const int i_qp0_cost2_table[52] = {
const int x264_lambda2_tab[52] = {
14, 18, 22, 28, 36, 45, 57, 72, /* 0 - 7 */
91, 115, 145, 182, 230, 290, 365, 460, /* 8 - 15 */
580, 731, 921, 1161, 1462, 1843, 2322, 2925, /* 16 - 23 */
......@@ -205,8 +205,8 @@ static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int i_qp )
/* conduct the analysis using this lambda and QP */
a->i_qp = h->mb.i_qp = i_qp;
h->mb.i_chroma_qp = i_chroma_qp_table[x264_clip3( i_qp + h->pps->i_chroma_qp_index_offset, 0, 51 )];
a->i_lambda = i_qp0_cost_table[i_qp];
a->i_lambda2 = i_qp0_cost2_table[i_qp];
a->i_lambda = x264_lambda_tab[i_qp];
a->i_lambda2 = x264_lambda2_tab[i_qp];
a->b_mbrd = h->param.analyse.i_subpel_refine >= 6 &&
( h->sh.i_type != SLICE_TYPE_B || h->param.analyse.b_bframe_rdo );
......@@ -924,7 +924,7 @@ static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
if( i_max > 0 )
{
int i_chroma_lambda = i_qp0_cost2_table[h->mb.i_chroma_qp];
int i_chroma_lambda = x264_lambda2_tab[h->mb.i_chroma_qp];
/* the previous thing encoded was x264_intra_rd(), so the pixels and
* coefs for the current chroma mode are still around, so we only
* have to recount the bits. */
......
......@@ -100,11 +100,16 @@ void x264_mb_encode_i4x4( x264_t *h, int idx, int i_qscale )
else
h->quantf.quant_4x4( dct4x4, h->quant4_mf[CQM_4IY][i_qscale], h->quant4_bias[CQM_4IY][i_qscale] );
h->zigzagf.scan_4x4( h->dct.luma4x4[idx], dct4x4 );
h->quantf.dequant_4x4( dct4x4, h->dequant4_mf[CQM_4IY], i_qscale );
if( array_non_zero( dct4x4 ) )
{
h->zigzagf.scan_4x4( h->dct.luma4x4[idx], dct4x4 );
h->quantf.dequant_4x4( dct4x4, h->dequant4_mf[CQM_4IY], i_qscale );
/* output samples to fdec */
h->dctf.add4x4_idct( p_dst, dct4x4 );
/* output samples to fdec */
h->dctf.add4x4_idct( p_dst, dct4x4 );
}
else
memset( h->dct.luma4x4[idx], 0, sizeof(h->dct.luma4x4[idx]));
}
void x264_mb_encode_i8x8( x264_t *h, int idx, int i_qscale )
......@@ -132,7 +137,8 @@ static void x264_mb_encode_i16x16( x264_t *h, int i_qscale )
uint8_t *p_src = h->mb.pic.p_fenc[0];
uint8_t *p_dst = h->mb.pic.p_fdec[0];
DECLARE_ALIGNED_16( int16_t dct4x4[16+1][4][4] );
DECLARE_ALIGNED_16( int16_t dct4x4[16][4][4] );
DECLARE_ALIGNED_16( int16_t dct_dc4x4[4][4] );
int i;
......@@ -143,46 +149,46 @@ static void x264_mb_encode_i16x16( x264_t *h, int i_qscale )
int oe = block_idx_x[i]*4 + block_idx_y[i]*4*FENC_STRIDE;
int od = block_idx_x[i]*4 + block_idx_y[i]*4*FDEC_STRIDE;
h->zigzagf.sub_4x4( h->dct.luma4x4[i], p_src+oe, p_dst+od );
dct4x4[0][block_idx_x[i]][block_idx_y[i]] = h->dct.luma4x4[i][0];
dct_dc4x4[block_idx_x[i]][block_idx_y[i]] = h->dct.luma4x4[i][0];
h->dct.luma4x4[i][0] = 0;
}
h->zigzagf.scan_4x4( h->dct.luma16x16_dc, dct4x4[0] );
h->zigzagf.scan_4x4( h->dct.luma16x16_dc, dct_dc4x4 );
return;
}
h->dctf.sub16x16_dct( &dct4x4[1], p_src, p_dst );
h->dctf.sub16x16_dct( dct4x4, p_src, p_dst );
for( i = 0; i < 16; i++ )
{
/* copy dc coeff */
dct4x4[0][block_idx_y[i]][block_idx_x[i]] = dct4x4[1+i][0][0];
dct4x4[1+i][0][0] = 0;
dct_dc4x4[block_idx_y[i]][block_idx_x[i]] = dct4x4[i][0][0];
dct4x4[i][0][0] = 0;
/* quant/scan/dequant */
if( h->mb.b_trellis )
x264_quant_4x4_trellis( h, dct4x4[1+i], CQM_4IY, i_qscale, DCT_LUMA_AC, 1 );
x264_quant_4x4_trellis( h, dct4x4[i], CQM_4IY, i_qscale, DCT_LUMA_AC, 1 );
else
h->quantf.quant_4x4( dct4x4[1+i], h->quant4_mf[CQM_4IY][i_qscale], h->quant4_bias[CQM_4IY][i_qscale] );
h->quantf.quant_4x4( dct4x4[i], h->quant4_mf[CQM_4IY][i_qscale], h->quant4_bias[CQM_4IY][i_qscale] );
h->zigzagf.scan_4x4( h->dct.luma4x4[i], dct4x4[1+i] );
h->quantf.dequant_4x4( dct4x4[1+i], h->dequant4_mf[CQM_4IY], i_qscale );
h->zigzagf.scan_4x4( h->dct.luma4x4[i], dct4x4[i] );
h->quantf.dequant_4x4( dct4x4[i], h->dequant4_mf[CQM_4IY], i_qscale );
}
h->dctf.dct4x4dc( dct4x4[0] );
h->quantf.quant_4x4_dc( dct4x4[0], h->quant4_mf[CQM_4IY][i_qscale][0]>>1, h->quant4_bias[CQM_4IY][i_qscale][0]<<1 );
h->zigzagf.scan_4x4( h->dct.luma16x16_dc, dct4x4[0] );
h->dctf.dct4x4dc( dct_dc4x4 );
h->quantf.quant_4x4_dc( dct_dc4x4, h->quant4_mf[CQM_4IY][i_qscale][0]>>1, h->quant4_bias[CQM_4IY][i_qscale][0]<<1 );
h->zigzagf.scan_4x4( h->dct.luma16x16_dc, dct_dc4x4 );
/* output samples to fdec */
h->dctf.idct4x4dc( dct4x4[0] );
x264_mb_dequant_4x4_dc( dct4x4[0], h->dequant4_mf[CQM_4IY], i_qscale ); /* XXX not inversed */
h->dctf.idct4x4dc( dct_dc4x4 );
x264_mb_dequant_4x4_dc( dct_dc4x4, h->dequant4_mf[CQM_4IY], i_qscale ); /* XXX not inversed */
/* calculate dct coeffs */
for( i = 0; i < 16; i++ )
{
/* copy dc coeff */
dct4x4[1+i][0][0] = dct4x4[0][block_idx_y[i]][block_idx_x[i]];
dct4x4[i][0][0] = dct_dc4x4[block_idx_y[i]][block_idx_x[i]];
}
/* put pixels to fdec */
h->dctf.add16x16_idct( p_dst, &dct4x4[1] );
h->dctf.add16x16_idct( p_dst, dct4x4 );
}
void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qscale )
......@@ -617,7 +623,7 @@ int x264_macroblock_probe_skip( x264_t *h, const int b_bidir )
int i_qp = h->mb.i_qp;
int mvp[2];
int ch;
int ch, thresh;
int i8x8, i4x4;
int i_decimate_mb;
......@@ -656,6 +662,7 @@ int x264_macroblock_probe_skip( x264_t *h, const int b_bidir )
/* encode chroma */
i_qp = h->mb.i_chroma_qp;
thresh = (x264_lambda2_tab[i_qp] + 32) >> 6;
for( ch = 0; ch < 2; ch++ )
{
......@@ -669,6 +676,11 @@ int x264_macroblock_probe_skip( x264_t *h, const int b_bidir )
mvp[0], mvp[1], 8, 8 );
}
/* There is almost never a termination during chroma, but we can't avoid the check entirely, */
/* so instead we compute the SSD first and skip the full DCT-based check if the score is low enough. */
if( h->pixf.ssd[PIXEL_8x8]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE ) < thresh )
continue;
h->dctf.sub8x8_dct( dct4x4, p_src, p_dst );
/* calculate dct DC */
......
......@@ -26,6 +26,9 @@
#include "common/macroblock.h"
extern const int x264_lambda2_tab[52];
extern const int x264_lambda_tab[52];
void x264_rdo_init( );
int x264_macroblock_probe_skip( x264_t *h, int b_bidir );
......
......@@ -32,7 +32,7 @@
static void x264_lowres_context_init( x264_t *h, x264_mb_analysis_t *a )
{
a->i_qp = 12; // arbitrary, but low because SATD scores are 1/4 normal
a->i_lambda = i_qp0_cost_table[ a->i_qp ];
a->i_lambda = x264_lambda_tab[ a->i_qp ];
x264_mb_analyse_load_costs( h, a );
h->mb.i_me_method = X264_MIN( X264_ME_HEX, h->param.analyse.i_me_method ); // maybe dia?
h->mb.i_subpel_refine = 4; // 3 should be enough, but not tweaking for speed now
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment