Commit 32bd2d64, authored by Fiona Glaser, committed by Loren Merritt

force unroll macroblock_load_pic_pointers

and a few other minor optimizations
parent 2d816a51
......@@ -1011,6 +1011,42 @@ static NOINLINE void copy_column8( uint8_t *dst, uint8_t *src )
dst[i*FDEC_STRIDE] = src[i*FDEC_STRIDE];
}
/* Set up the per-macroblock picture pointers for plane i.
 *
 * For the macroblock at (i_mb_x, i_mb_y) this records the working stride in
 * h->mb.pic.i_stride[i], runs the copy routine from the fenc plane into
 * p_fenc[i], refreshes the border bytes around p_fdec[i], and aims every
 * p_fref entry at the matching offset inside each reference frame.
 *
 * Plane 0 works on a 16-wide block and additionally receives the
 * filtered[1..3] pointers; any other plane works on an 8-wide block
 * (presumably the chroma planes -- confirm against the callers). */
static void ALWAYS_INLINE x264_macroblock_load_pic_pointers( x264_t *h, int i_mb_x, int i_mb_y, int i)
{
    const int w = i ? 8 : 16;
    const int stride = h->fdec->i_stride[i];
    /* Stride actually walked per row: shifted left by b_interlaced. */
    const int row_stride = stride << h->mb.b_interlaced;
    const uint8_t *border = &h->mb.intra_border_backup[i_mb_y & h->sh.b_mbaff][i][(i_mb_x*16) >> (i != 0)];
    x264_frame_t **ref_lists[2] = { h->fref0, h->fref1 };
    int pix_off;
    int ref_off[2];
    int list, ref, row, k;

    if( h->mb.b_interlaced )
    {
        /* Even/odd macroblock rows share a base row and differ by one line. */
        pix_off = w * (i_mb_x + (i_mb_y&~1) * stride) + (i_mb_y&1) * stride;
        ref_off[0] = pix_off;
        /* Odd-indexed references are displaced by one line, towards the
         * other row of the pair: +stride when i_mb_y is even, -stride
         * when it is odd. */
        ref_off[1] = pix_off + (1-2*(i_mb_y&1)) * stride;
    }
    else
    {
        pix_off = w * (i_mb_x + i_mb_y * stride);
        ref_off[0] = pix_off;
        ref_off[1] = pix_off;
    }

    h->mb.pic.i_stride[i] = row_stride;

    h->mc.copy[i?PIXEL_8x8:PIXEL_16x16]( h->mb.pic.p_fenc[i], FENC_STRIDE,
                                         &h->fenc->plane[i][pix_off], row_stride, w );

    /* Row just above p_fdec[i], starting one byte to the left. */
    memcpy( &h->mb.pic.p_fdec[i][-1-FDEC_STRIDE], border-1, w*3/2+1 );

    if( h->mb.b_interlaced )
    {
        /* Column just left of p_fdec[i], taken from the fdec plane. */
        const uint8_t *plane_fdec = &h->fdec->plane[i][pix_off];
        for( row = 0; row < w; row++ )
            h->mb.pic.p_fdec[i][-1+row*FDEC_STRIDE] = plane_fdec[-1+row*row_stride];
    }

    for( list = 0; list < 2; list++ )
    {
        for( ref = 0; ref < h->mb.pic.i_fref[list]; ref++ )
        {
            x264_frame_t *frame = ref_lists[list][ref >> h->mb.b_interlaced];
            const int off = ref_off[ref&1];

            if( i )
            {
                /* Planes other than 0 live in slots i+3. */
                h->mb.pic.p_fref[list][ref][i+3] = &frame->plane[i][off];
            }
            else
            {
                /* Plane 0: slot 0 is the plane itself, 1..3 the filtered planes. */
                h->mb.pic.p_fref[list][ref][0] = &frame->plane[i][off];
                for( k = 1; k < 4; k++ )
                    h->mb.pic.p_fref[list][ref][k] = &frame->filtered[k][off];
            }
        }
    }
}
void x264_macroblock_cache_load( x264_t *h, int i_mb_x, int i_mb_y )
{
int i_mb_xy = i_mb_y * h->mb.i_mb_stride + i_mb_x;
......@@ -1189,45 +1225,9 @@ void x264_macroblock_cache_load( x264_t *h, int i_mb_x, int i_mb_y )
}
/* load picture pointers */
for( i = 0; i < 3; i++ )
{
const int w = (i == 0 ? 16 : 8);
const int i_stride = h->fdec->i_stride[i];
const int i_stride2 = i_stride << h->mb.b_interlaced;
const int i_pix_offset = h->mb.b_interlaced
? w * (i_mb_x + (i_mb_y&~1) * i_stride) + (i_mb_y&1) * i_stride
: w * (i_mb_x + i_mb_y * i_stride);
int ref_pix_offset[2] = { i_pix_offset, i_pix_offset };
const uint8_t *intra_fdec = &h->mb.intra_border_backup[i_mb_y & h->sh.b_mbaff][i][i_mb_x*16>>!!i];
x264_frame_t **fref[2] = { h->fref0, h->fref1 };
int j, k, l;
if( h->mb.b_interlaced )
ref_pix_offset[1] += (1-2*(i_mb_y&1)) * i_stride;
h->mb.pic.i_stride[i] = i_stride2;
h->mc.copy[i?PIXEL_8x8:PIXEL_16x16]( h->mb.pic.p_fenc[i], FENC_STRIDE,
&h->fenc->plane[i][i_pix_offset], i_stride2, w );
memcpy( &h->mb.pic.p_fdec[i][-1-FDEC_STRIDE], intra_fdec-1, w*3/2+1 );
if( h->mb.b_interlaced )
{
const uint8_t *plane_fdec = &h->fdec->plane[i][i_pix_offset];
for( j = 0; j < w; j++ )
h->mb.pic.p_fdec[i][-1+j*FDEC_STRIDE] = plane_fdec[-1+j*i_stride2];
}
for( l=0; l<2; l++ )
{
for( j=0; j<h->mb.pic.i_fref[l]; j++ )
{
h->mb.pic.p_fref[l][j][i==0 ? 0:i+3] = &fref[l][j >> h->mb.b_interlaced]->plane[i][ref_pix_offset[j&1]];
if( i == 0 )
for( k = 1; k < 4; k++ )
h->mb.pic.p_fref[l][j][k] = &fref[l][j >> h->mb.b_interlaced]->filtered[k][ref_pix_offset[j&1]];
}
}
}
x264_macroblock_load_pic_pointers( h, i_mb_x, i_mb_y, 0 );
x264_macroblock_load_pic_pointers( h, i_mb_x, i_mb_y, 1 );
x264_macroblock_load_pic_pointers( h, i_mb_x, i_mb_y, 2 );
if( h->fdec->integral )
{
......
......@@ -64,7 +64,7 @@ static int x264_mb_decimate_score( int16_t *dct, int i_max )
{
int i_run;
if( abs( dct[idx--] ) > 1 )
if( (unsigned)(dct[idx--] + 1) > 2 )
return 9;
i_run = 0;
......@@ -273,15 +273,9 @@ void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qscale )
/* Clear the coefficient bookkeeping of the current macroblock: both
 * coded-block-pattern fields, the cached non-zero counts, and the stored
 * cbp entry at h->mb.i_mb_xy (the skip path, judging by the name). */
static void x264_macroblock_encode_skip( x264_t *h )
{
    h->mb.i_cbp_luma = 0x00;
    h->mb.i_cbp_chroma = 0x00;
    /* A single memset clears the whole cache, so a separate per-entry pass
     * over x264_scan8[] beforehand is unnecessary.
     * NOTE(review): assumes non_zero_count has one-byte elements and
     * X264_SCAN8_SIZE entries -- confirm against its declaration. */
    memset( h->mb.cache.non_zero_count, 0, X264_SCAN8_SIZE );
    /* store cbp */
    h->mb.cbp[h->mb.i_mb_xy] = 0;
}
......@@ -500,8 +494,8 @@ void x264_macroblock_encode( x264_t *h )
h->quantf.quant_4x4( dct4x4[idx], h->quant4_mf[CQM_4PY][i_qp], h->quant4_bias[CQM_4PY][i_qp] );
h->zigzagf.scan_4x4( h->dct.luma4x4[idx], dct4x4[idx] );
if( b_decimate )
if( b_decimate && i_decimate_8x8 <= 6 )
i_decimate_8x8 += x264_mb_decimate_score( h->dct.luma4x4[idx], 16 );
}
......@@ -799,10 +793,8 @@ void x264_macroblock_encode_p8x8( x264_t *h, int i8 )
int i4;
DECLARE_ALIGNED_16( int16_t dct4x4[4][4][4] );
h->dctf.sub8x8_dct( dct4x4, p_fenc, p_fdec );
h->quantf.quant_4x4( dct4x4[0], h->quant4_mf[CQM_4PY][i_qp], h->quant4_bias[CQM_4PY][i_qp] );
h->quantf.quant_4x4( dct4x4[1], h->quant4_mf[CQM_4PY][i_qp], h->quant4_bias[CQM_4PY][i_qp] );
h->quantf.quant_4x4( dct4x4[2], h->quant4_mf[CQM_4PY][i_qp], h->quant4_bias[CQM_4PY][i_qp] );
h->quantf.quant_4x4( dct4x4[3], h->quant4_mf[CQM_4PY][i_qp], h->quant4_bias[CQM_4PY][i_qp] );
for( i4 = 0; i4 < 4; i4++ )
h->quantf.quant_4x4( dct4x4[i4], h->quant4_mf[CQM_4PY][i_qp], h->quant4_bias[CQM_4PY][i_qp] );
for( i4 = 0; i4 < 4; i4++ )
h->zigzagf.scan_4x4( h->dct.luma4x4[i8*4+i4], dct4x4[i4] );
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment