Commit 5e8645b3 authored by Anton Mitrofanov's avatar Anton Mitrofanov Committed by Fiona Glaser

Faster, more accurate psy-RD caching

Keep more variants of cached Hadamard scores and only calculate them when necessary.
Results in more calculation, but simpler lookups.
Slightly more accurate due to internal rounding in SATD and SA8D functions.
parent 5c767904
......@@ -583,11 +583,9 @@ struct x264_t
ALIGNED_16( int16_t fenc_dct8[4][64] );
ALIGNED_16( int16_t fenc_dct4[16][16] );
/* Psy RD SATD scores */
int fenc_satd[4][4];
int fenc_satd_sum;
int fenc_sa8d[2][2];
int fenc_sa8d_sum;
/* Psy RD SATD/SA8D scores cache */
ALIGNED_16( uint64_t fenc_hadamard_cache[9] );
ALIGNED_16( uint32_t fenc_satd_cache[32] );
/* pointer over mb of the frame to be compressed */
uint8_t *p_fenc[3];
......
......@@ -731,15 +731,17 @@ cglobal x264_memcpy_aligned_sse2, 3,3
;-----------------------------------------------------------------------------
; NOTE(review): this span is a diff rendered without +/- markers — the old and
; new loop-control schemes are BOTH present below.  Old scheme: count r1 down
; from the byte count with `sub`/`jg`.  New scheme: bias r0 by the byte count,
; negate r1, and count it up toward zero with `add`/`jl`, which saves a
; compare per iteration.
; Zero r1 (arg 2) bytes at r0 (arg 1); caller guarantees alignment and that
; the size is a multiple of mmsize*8 — TODO confirm against callers.
%macro MEMZERO 1
cglobal x264_memzero_aligned_%1, 2,2
add r0, r1
neg r1
; m0 = all-zero vector used as the store source
pxor m0, m0
.loop:
sub r1d, mmsize*8
; Unrolled 8x: one aligned vector store per mmsize chunk
%assign i 0
%rep 8
mova [r0 + r1 + i], m0
%assign i i+mmsize
%endrep
jg .loop
add r1d, mmsize*8
jl .loop
REP_RET
%endmacro
......
......@@ -578,34 +578,13 @@ static void inline x264_psy_trellis_init( x264_t *h, int do_both_dct )
h->dctf.sub16x16_dct( h->mb.pic.fenc_dct4, h->mb.pic.p_fenc[0], zero );
}
/* Pre-calculate fenc satd scores for psy RD, minus DC coefficients */
static inline void x264_mb_cache_fenc_satd( x264_t *h )
/* Reset fenc satd scores cache for psy RD */
static inline void x264_mb_init_fenc_cache( x264_t *h, int b_satd )
{
/* NOTE(review): diff rendered without +/- markers — the old function
 * (x264_mb_cache_fenc_satd, eager pre-computation) and its replacement
 * (x264_mb_init_fenc_cache, lazy cache reset) are interleaved here. */
/* --- old body: eagerly compute all psy-RD scores up front --- */
ALIGNED_16( static uint8_t zero[16] ) = {0};
uint8_t *fenc;
int x, y, satd_sum = 0, sa8d_sum = 0;
if( h->param.analyse.i_trellis == 2 && h->mb.i_psy_trellis )
x264_psy_trellis_init( h, h->param.analyse.b_transform_8x8 );
if( !h->mb.i_psy_rd )
return;
/* Per-4x4 SATD of the source block vs. zero, minus half the SAD
 * (i.e. minus the DC term), accumulated into a 16x16 total. */
for( y = 0; y < 4; y++ )
for( x = 0; x < 4; x++ )
{
fenc = h->mb.pic.p_fenc[0]+x*4+y*4*FENC_STRIDE;
h->mb.pic.fenc_satd[y][x] = h->pixf.satd[PIXEL_4x4]( zero, 0, fenc, FENC_STRIDE )
- (h->pixf.sad[PIXEL_4x4]( zero, 0, fenc, FENC_STRIDE )>>1);
satd_sum += h->mb.pic.fenc_satd[y][x];
}
/* Same for per-8x8 SA8D; the DC term of an 8x8 transform is SAD>>2. */
for( y = 0; y < 2; y++ )
for( x = 0; x < 2; x++ )
{
fenc = h->mb.pic.p_fenc[0]+x*8+y*8*FENC_STRIDE;
h->mb.pic.fenc_sa8d[y][x] = h->pixf.sa8d[PIXEL_8x8]( zero, 0, fenc, FENC_STRIDE )
- (h->pixf.sad[PIXEL_8x8]( zero, 0, fenc, FENC_STRIDE )>>2);
sa8d_sum += h->mb.pic.fenc_sa8d[y][x];
}
h->mb.pic.fenc_satd_sum = satd_sum;
h->mb.pic.fenc_sa8d_sum = sa8d_sum;
/* --- new body: just empty the caches; scores are computed on demand by
 * cached_hadamard()/cached_satd(), which use 0 as the "empty" sentinel.
 * The SATD cache is only cleared when the analysis mode will need it. --- */
/* Writes beyond the end of the array, but not a problem since fenc_satd_cache is right after. */
h->mc.memzero_aligned( h->mb.pic.fenc_hadamard_cache, sizeof(h->mb.pic.fenc_hadamard_cache) );
if( b_satd )
h->mc.memzero_aligned( h->mb.pic.fenc_satd_cache, sizeof(h->mb.pic.fenc_satd_cache) );
}
static void x264_mb_analyse_intra_chroma( x264_t *h, x264_mb_analysis_t *a )
......@@ -1193,7 +1172,7 @@ static void x264_mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a )
h->mb.i_type = P_L0;
if( a->i_mbrd )
{
x264_mb_cache_fenc_satd( h );
x264_mb_init_fenc_cache( h, a->i_mbrd >= 2 || h->param.analyse.inter & X264_ANALYSE_PSUB8x8 );
if( a->l0.me16x16.i_ref == 0 && M32( a->l0.me16x16.mv ) == M32( h->mb.cache.pskip_mv ) && !a->b_force_intra )
{
h->mb.i_partition = D_16x16;
......@@ -2432,7 +2411,7 @@ void x264_macroblock_analyse( x264_t *h )
{
intra_analysis:
if( analysis.i_mbrd )
x264_mb_cache_fenc_satd( h );
x264_mb_init_fenc_cache( h, analysis.i_mbrd >= 2 );
x264_mb_analyse_intra( h, &analysis, COST_MAX );
if( analysis.i_mbrd )
x264_intra_rd( h, &analysis, COST_MAX );
......@@ -2749,7 +2728,7 @@ intra_analysis:
int b_skip = 0;
if( analysis.i_mbrd )
x264_mb_cache_fenc_satd( h );
x264_mb_init_fenc_cache( h, analysis.i_mbrd >= 2 );
h->mb.i_type = B_SKIP;
if( h->mb.b_direct_auto_write )
......
......@@ -61,36 +61,44 @@ static uint16_t cabac_size_5ones[128];
#define COPY_CABAC h->mc.memcpy_aligned( &cabac_tmp.f8_bits_encoded, &h->cabac.f8_bits_encoded, \
sizeof(x264_cabac_t) - offsetof(x264_cabac_t,f8_bits_encoded) )
/* Sum the cached SATDs to avoid repeating them. */
static inline int sum_satd( x264_t *h, int pixel, int x, int y )
static inline uint64_t cached_hadamard( x264_t *h, int pixel, int x, int y )
{
/* NOTE(review): diff rendered without +/- markers — the old summing helper
 * (sum_satd over precomputed 4x4 scores) and its replacement
 * (cached_hadamard, lazy memoization of hadamard_ac) are interleaved. */
/* --- old body: sum precomputed per-4x4 SATDs covering the partition --- */
int satd = 0;
int min_x = x>>2;
int min_y = y>>2;
int max_x = (x>>2) + (x264_pixel_size[pixel].w>>2);
int max_y = (y>>2) + (x264_pixel_size[pixel].h>>2);
if( pixel == PIXEL_16x16 )
return h->mb.pic.fenc_satd_sum;
for( y = min_y; y < max_y; y++ )
for( x = min_x; x < max_x; x++ )
satd += h->mb.pic.fenc_satd[y][x];
return satd;
/* --- new body: memoized hadamard_ac for one partition of the source MB.
 * Tables are indexed by pixel size (16x16, 16x8, 8x16, 8x8).  shift_y is a
 * fused shift: y >> (a-b) == (y >> a) << b, i.e. it scales the row index by
 * the row width in one shift.  With offsets {0,1,3,5} the cache holds
 * 1 + 2 + 2 + 4 = 9 entries, matching fenc_hadamard_cache[9]. --- */
static const uint8_t hadamard_shift_x[4] = {4, 4, 3, 3};
static const uint8_t hadamard_shift_y[4] = {4-0, 3-0, 4-1, 3-1};
static const uint8_t hadamard_offset[4] = {0, 1, 3, 5};
int cache_index = (x >> hadamard_shift_x[pixel]) + (y >> hadamard_shift_y[pixel])
+ hadamard_offset[pixel];
uint64_t res = h->mb.pic.fenc_hadamard_cache[cache_index];
if( res )
return res - 1;
else
{
/* Miss: compute and store res+1, so 0 can mean "empty" even when the
 * true score is 0 (cache is zeroed by x264_mb_init_fenc_cache). */
uint8_t *fenc = h->mb.pic.p_fenc[0] + x + y*FENC_STRIDE;
res = h->pixf.hadamard_ac[pixel]( fenc, FENC_STRIDE );
h->mb.pic.fenc_hadamard_cache[cache_index] = res + 1;
return res;
}
}
static inline int sum_sa8d( x264_t *h, int pixel, int x, int y )
static inline int cached_satd( x264_t *h, int pixel, int x, int y )
{
/* NOTE(review): diff rendered without +/- markers — the old summing helper
 * (sum_sa8d over precomputed 8x8 scores) and its replacement
 * (cached_satd, lazy memoization of DC-less SATD) are interleaved. */
/* --- old body: sum precomputed per-8x8 SA8Ds covering the partition --- */
int sa8d = 0;
int min_x = x>>3;
int min_y = y>>3;
int max_x = (x>>3) + (x264_pixel_size[pixel].w>>3);
int max_y = (y>>3) + (x264_pixel_size[pixel].h>>3);
if( pixel == PIXEL_16x16 )
return h->mb.pic.fenc_sa8d_sum;
for( y = min_y; y < max_y; y++ )
for( x = min_x; x < max_x; x++ )
sa8d += h->mb.pic.fenc_sa8d[y][x];
return sa8d;
/* --- new body: memoized SATD-minus-DC for small partitions, tables indexed
 * by pixel - PIXEL_8x4 (8x4, 4x8, 4x4).  shift_y is a fused shift that
 * scales the row index by the row width (y >> (a-b) == (y >> a) << b).
 * With offsets {0,8,16} the cache holds 8 + 8 + 16 = 32 entries, matching
 * fenc_satd_cache[32]. --- */
static const uint8_t satd_shift_x[3] = {3, 2, 2};
static const uint8_t satd_shift_y[3] = {2-1, 3-2, 2-2};
static const uint8_t satd_offset[3] = {0, 8, 16};
ALIGNED_16( static uint8_t zero[16] );
int cache_index = (x >> satd_shift_x[pixel - PIXEL_8x4]) + (y >> satd_shift_y[pixel - PIXEL_8x4])
+ satd_offset[pixel - PIXEL_8x4];
int res = h->mb.pic.fenc_satd_cache[cache_index];
if( res )
return res - 1;
else
{
/* Miss: SATD vs. zero minus half the SAD removes the DC coefficient;
 * store res+1 so 0 can serve as the "empty" sentinel. */
uint8_t *fenc = h->mb.pic.p_fenc[0] + x + y*FENC_STRIDE;
int dc = h->pixf.sad[pixel]( fenc, FENC_STRIDE, zero, 0 ) >> 1;
res = h->pixf.satd[pixel]( fenc, FENC_STRIDE, zero, 0 ) - dc;
h->mb.pic.fenc_satd_cache[cache_index] = res + 1;
return res;
}
}
/* Psy RD distortion metric: SSD plus "Absolute Difference of Complexities" */
......@@ -113,15 +121,16 @@ static inline int ssd_plane( x264_t *h, int size, int p, int x, int y )
/* If the plane is smaller than 8x8, we can't do an SA8D; this probably isn't a big problem. */
if( size <= PIXEL_8x8 )
{
uint64_t acs = h->pixf.hadamard_ac[size]( fdec, FDEC_STRIDE );
satd = abs((int32_t)acs - sum_satd( h, size, x, y ))
+ abs((int32_t)(acs>>32) - sum_sa8d( h, size, x, y ));
uint64_t fdec_acs = h->pixf.hadamard_ac[size]( fdec, FDEC_STRIDE );
uint64_t fenc_acs = cached_hadamard( h, size, x, y );
satd = abs((int32_t)fdec_acs - (int32_t)fenc_acs)
+ abs((int32_t)(fdec_acs>>32) - (int32_t)(fenc_acs>>32));
satd >>= 1;
}
else
{
int dc = h->pixf.sad[size]( fdec, FDEC_STRIDE, zero, 0 ) >> 1;
satd = abs(h->pixf.satd[size]( fdec, FDEC_STRIDE, zero, 0 ) - dc - sum_satd( h, size, x, y ));
satd = abs(h->pixf.satd[size]( fdec, FDEC_STRIDE, zero, 0 ) - dc - cached_satd( h, size, x, y ));
}
satd = (satd * h->mb.i_psy_rd * h->mb.i_psy_rd_lambda + 128) >> 8;
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment