Commit 5e8645b3 authored by Anton Mitrofanov, committed by Fiona Glaser

Faster, more accurate psy-RD caching

Keep more variants of cached Hadamard scores and only calculate them when necessary.
Results in more calculation, but simpler lookups.
Slightly more accurate due to internal rounding in SATD and SA8D functions.
parent 5c767904
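The new cached_hadamard() and cached_satd() helpers introduced below share one pattern: the per-macroblock score cache is zeroed once up front, each entry is stored biased by +1 so that a raw value of 0 can mean "not computed yet", and the expensive Hadamard/SATD call only runs the first time a given block is requested. A minimal standalone sketch of that pattern, with illustrative names and a dummy score function standing in for x264's pixf.hadamard_ac():

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define NUM_ENTRIES 9            /* fenc_hadamard_cache has 9 slots in the real code */

static uint64_t cache[NUM_ENTRIES];
static int calls;

/* Stand-in for the expensive pixf.hadamard_ac() call. */
static uint64_t expensive_score( int idx )
{
    calls++;
    return (uint64_t)idx * 100;
}

/* Done once per macroblock; x264 uses mc.memzero_aligned for this. */
static void cache_reset( void )
{
    memset( cache, 0, sizeof(cache) );
}

static uint64_t cache_lookup( int idx )
{
    uint64_t v = cache[idx];
    if( v )
        return v - 1;            /* hit: undo the +1 bias */
    v = expensive_score( idx );
    cache[idx] = v + 1;          /* 0 stays reserved for "not computed" */
    return v;
}

int main( void )
{
    cache_reset();
    /* The second lookup of the same slot returns the cached value; calls stays at 1. */
    printf( "%llu %llu calls=%d\n", (unsigned long long)cache_lookup( 3 ),
            (unsigned long long)cache_lookup( 3 ), calls );
    return 0;
}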
@@ -583,11 +583,9 @@ struct x264_t
             ALIGNED_16( int16_t fenc_dct8[4][64] );
             ALIGNED_16( int16_t fenc_dct4[16][16] );
 
-            /* Psy RD SATD scores */
-            int fenc_satd[4][4];
-            int fenc_satd_sum;
-            int fenc_sa8d[2][2];
-            int fenc_sa8d_sum;
+            /* Psy RD SATD/SA8D scores cache */
+            ALIGNED_16( uint64_t fenc_hadamard_cache[9] );
+            ALIGNED_16( uint32_t fenc_satd_cache[32] );
 
             /* pointer over mb of the frame to be compressed */
             uint8_t *p_fenc[3];

@@ -731,15 +731,17 @@ cglobal x264_memcpy_aligned_sse2, 3,3
 ;-----------------------------------------------------------------------------
 %macro MEMZERO 1
 cglobal x264_memzero_aligned_%1, 2,2
+    add  r0, r1
+    neg  r1
     pxor m0, m0
 .loop:
-    sub r1d, mmsize*8
 %assign i 0
 %rep 8
     mova [r0 + r1 + i], m0
 %assign i i+mmsize
 %endrep
-    jg .loop
+    add r1d, mmsize*8
+    jl .loop
     REP_RET
 %endmacro

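The memzero_aligned change above rewrites the store loop so that r0 points just past the end of the buffer and r1 becomes a negative byte offset that counts up toward zero; the loop condition then comes straight from the final add/jl pair instead of a sub at the top of the loop. A rough C model of the new control flow (illustrative only; the real routine issues eight aligned vector stores of mmsize bytes per iteration):

#include <stddef.h>
#include <stdint.h>

#define CHUNK 128   /* stands in for mmsize*8: 8 SSE stores of 16 bytes each */

/* Rough C model of the restructured x264_memzero_aligned loop (n > 0).
 * If n is not a multiple of CHUNK, the last iteration writes past n bytes,
 * the same overshoot that the "Writes beyond the end of the array" comment
 * in x264_mb_init_fenc_cache below relies on being harmless. */
static void memzero_aligned_model( uint8_t *dst, size_t n )
{
    uint8_t *end = dst + n;                /* add r0, r1 */
    intptr_t i = -(intptr_t)n;             /* neg r1     */
    do
    {
        for( int j = 0; j < CHUNK; j++ )   /* the %rep 8 block of mova stores */
            end[i + j] = 0;
        i += CHUNK;                        /* add r1d, mmsize*8 */
    } while( i < 0 );                      /* jl .loop */
}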
@@ -578,34 +578,13 @@ static void inline x264_psy_trellis_init( x264_t *h, int do_both_dct )
         h->dctf.sub16x16_dct( h->mb.pic.fenc_dct4, h->mb.pic.p_fenc[0], zero );
 }
 
-/* Pre-calculate fenc satd scores for psy RD, minus DC coefficients */
-static inline void x264_mb_cache_fenc_satd( x264_t *h )
+/* Reset fenc satd scores cache for psy RD */
+static inline void x264_mb_init_fenc_cache( x264_t *h, int b_satd )
 {
-    ALIGNED_16( static uint8_t zero[16] ) = {0};
-    uint8_t *fenc;
-    int x, y, satd_sum = 0, sa8d_sum = 0;
-    if( h->param.analyse.i_trellis == 2 && h->mb.i_psy_trellis )
-        x264_psy_trellis_init( h, h->param.analyse.b_transform_8x8 );
-    if( !h->mb.i_psy_rd )
-        return;
-    for( y = 0; y < 4; y++ )
-        for( x = 0; x < 4; x++ )
-        {
-            fenc = h->mb.pic.p_fenc[0]+x*4+y*4*FENC_STRIDE;
-            h->mb.pic.fenc_satd[y][x] = h->pixf.satd[PIXEL_4x4]( zero, 0, fenc, FENC_STRIDE )
-                                      - (h->pixf.sad[PIXEL_4x4]( zero, 0, fenc, FENC_STRIDE )>>1);
-            satd_sum += h->mb.pic.fenc_satd[y][x];
-        }
-    for( y = 0; y < 2; y++ )
-        for( x = 0; x < 2; x++ )
-        {
-            fenc = h->mb.pic.p_fenc[0]+x*8+y*8*FENC_STRIDE;
-            h->mb.pic.fenc_sa8d[y][x] = h->pixf.sa8d[PIXEL_8x8]( zero, 0, fenc, FENC_STRIDE )
-                                      - (h->pixf.sad[PIXEL_8x8]( zero, 0, fenc, FENC_STRIDE )>>2);
-            sa8d_sum += h->mb.pic.fenc_sa8d[y][x];
-        }
-    h->mb.pic.fenc_satd_sum = satd_sum;
-    h->mb.pic.fenc_sa8d_sum = sa8d_sum;
+    /* Writes beyond the end of the array, but not a problem since fenc_satd_cache is right after. */
+    h->mc.memzero_aligned( h->mb.pic.fenc_hadamard_cache, sizeof(h->mb.pic.fenc_hadamard_cache) );
+    if( b_satd )
+        h->mc.memzero_aligned( h->mb.pic.fenc_satd_cache, sizeof(h->mb.pic.fenc_satd_cache) );
 }
 
 static void x264_mb_analyse_intra_chroma( x264_t *h, x264_mb_analysis_t *a )
@@ -1193,7 +1172,7 @@ static void x264_mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a )
     h->mb.i_type = P_L0;
     if( a->i_mbrd )
     {
-        x264_mb_cache_fenc_satd( h );
+        x264_mb_init_fenc_cache( h, a->i_mbrd >= 2 || h->param.analyse.inter & X264_ANALYSE_PSUB8x8 );
         if( a->l0.me16x16.i_ref == 0 && M32( a->l0.me16x16.mv ) == M32( h->mb.cache.pskip_mv ) && !a->b_force_intra )
         {
             h->mb.i_partition = D_16x16;
@@ -2432,7 +2411,7 @@ void x264_macroblock_analyse( x264_t *h )
     {
 intra_analysis:
         if( analysis.i_mbrd )
-            x264_mb_cache_fenc_satd( h );
+            x264_mb_init_fenc_cache( h, analysis.i_mbrd >= 2 );
         x264_mb_analyse_intra( h, &analysis, COST_MAX );
         if( analysis.i_mbrd )
             x264_intra_rd( h, &analysis, COST_MAX );
@@ -2749,7 +2728,7 @@ intra_analysis:
         int b_skip = 0;
 
         if( analysis.i_mbrd )
-            x264_mb_cache_fenc_satd( h );
+            x264_mb_init_fenc_cache( h, analysis.i_mbrd >= 2 );
 
         h->mb.i_type = B_SKIP;
         if( h->mb.b_direct_auto_write )

@@ -61,36 +61,44 @@ static uint16_t cabac_size_5ones[128];
 #define COPY_CABAC h->mc.memcpy_aligned( &cabac_tmp.f8_bits_encoded, &h->cabac.f8_bits_encoded, \
                                          sizeof(x264_cabac_t) - offsetof(x264_cabac_t,f8_bits_encoded) )
 
-/* Sum the cached SATDs to avoid repeating them. */
-static inline int sum_satd( x264_t *h, int pixel, int x, int y )
+static inline uint64_t cached_hadamard( x264_t *h, int pixel, int x, int y )
 {
-    int satd = 0;
-    int min_x = x>>2;
-    int min_y = y>>2;
-    int max_x = (x>>2) + (x264_pixel_size[pixel].w>>2);
-    int max_y = (y>>2) + (x264_pixel_size[pixel].h>>2);
-    if( pixel == PIXEL_16x16 )
-        return h->mb.pic.fenc_satd_sum;
-    for( y = min_y; y < max_y; y++ )
-        for( x = min_x; x < max_x; x++ )
-            satd += h->mb.pic.fenc_satd[y][x];
-    return satd;
+    static const uint8_t hadamard_shift_x[4] = {4,   4,   3,   3};
+    static const uint8_t hadamard_shift_y[4] = {4-0, 3-0, 4-1, 3-1};
+    static const uint8_t hadamard_offset[4]  = {0,   1,   3,   5};
+    int cache_index = (x >> hadamard_shift_x[pixel]) + (y >> hadamard_shift_y[pixel])
+                    + hadamard_offset[pixel];
+    uint64_t res = h->mb.pic.fenc_hadamard_cache[cache_index];
+    if( res )
+        return res - 1;
+    else
+    {
+        uint8_t *fenc = h->mb.pic.p_fenc[0] + x + y*FENC_STRIDE;
+        res = h->pixf.hadamard_ac[pixel]( fenc, FENC_STRIDE );
+        h->mb.pic.fenc_hadamard_cache[cache_index] = res + 1;
+        return res;
+    }
 }
 
-static inline int sum_sa8d( x264_t *h, int pixel, int x, int y )
+static inline int cached_satd( x264_t *h, int pixel, int x, int y )
 {
-    int sa8d = 0;
-    int min_x = x>>3;
-    int min_y = y>>3;
-    int max_x = (x>>3) + (x264_pixel_size[pixel].w>>3);
-    int max_y = (y>>3) + (x264_pixel_size[pixel].h>>3);
-    if( pixel == PIXEL_16x16 )
-        return h->mb.pic.fenc_sa8d_sum;
-    for( y = min_y; y < max_y; y++ )
-        for( x = min_x; x < max_x; x++ )
-            sa8d += h->mb.pic.fenc_sa8d[y][x];
-    return sa8d;
+    static const uint8_t satd_shift_x[3] = {3,   2,   2};
+    static const uint8_t satd_shift_y[3] = {2-1, 3-2, 2-2};
+    static const uint8_t satd_offset[3]  = {0,   8,   16};
+    ALIGNED_16( static uint8_t zero[16] );
+    int cache_index = (x >> satd_shift_x[pixel - PIXEL_8x4]) + (y >> satd_shift_y[pixel - PIXEL_8x4])
+                    + satd_offset[pixel - PIXEL_8x4];
+    int res = h->mb.pic.fenc_satd_cache[cache_index];
+    if( res )
+        return res - 1;
+    else
+    {
+        uint8_t *fenc = h->mb.pic.p_fenc[0] + x + y*FENC_STRIDE;
+        int dc = h->pixf.sad[pixel]( fenc, FENC_STRIDE, zero, 0 ) >> 1;
+        res = h->pixf.satd[pixel]( fenc, FENC_STRIDE, zero, 0 ) - dc;
+        h->mb.pic.fenc_satd_cache[cache_index] = res + 1;
+        return res;
+    }
 }
 
 /* Psy RD distortion metric: SSD plus "Absolute Difference of Complexities" */
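The shift and offset tables above map every (pixel size, x, y) combination to its own cache slot: fenc_hadamard_cache[9] holds 1 slot for the 16x16 block, 2 each for the 16x8 and 8x16 halves, and 4 for the 8x8 quarters (1+2+2+4 = 9), while fenc_satd_cache[32] holds 8 slots for 8x4, 8 for 4x8 and 16 for 4x4 (8+8+16 = 32). A small self-check that enumerates the index formula over all block positions in a macroblock; the PIXEL_* ordering (16x16, 16x8, 8x16, 8x8, 8x4, 4x8, 4x4) is assumed from x264's pixel enum, and the table constants are copied from the hunk above:

#include <assert.h>
#include <stdio.h>

int main( void )
{
    /* Block dimensions per PIXEL_* value, assumed ordering:
     * 0:16x16 1:16x8 2:8x16 3:8x8 4:8x4 5:4x8 6:4x4 */
    static const int bw[7] = {16, 16, 8, 8, 8, 4, 4};
    static const int bh[7] = {16, 8, 16, 8, 4, 8, 4};
    /* Constants copied from cached_hadamard()/cached_satd() above. */
    static const int had_sx[4] = {4, 4, 3, 3}, had_sy[4] = {4, 3, 3, 2}, had_of[4] = {0, 1, 3, 5};
    static const int sat_sx[3] = {3, 2, 2},    sat_sy[3] = {1, 1, 0},    sat_of[3] = {0, 8, 16};

    for( int pixel = 0; pixel < 7; pixel++ )
        for( int y = 0; y < 16; y += bh[pixel] )
            for( int x = 0; x < 16; x += bw[pixel] )
            {
                int idx;
                if( pixel <= 3 )   /* 16x16 .. 8x8 -> fenc_hadamard_cache[9] */
                {
                    idx = (x >> had_sx[pixel]) + (y >> had_sy[pixel]) + had_of[pixel];
                    assert( idx < 9 );
                }
                else               /* 8x4, 4x8, 4x4 -> fenc_satd_cache[32] */
                {
                    idx = (x >> sat_sx[pixel-4]) + (y >> sat_sy[pixel-4]) + sat_of[pixel-4];
                    assert( idx < 32 );
                }
                printf( "pixel %d block (%2d,%2d) -> slot %2d\n", pixel, x, y, idx );
            }
    return 0;
}

Every block position gets a distinct slot, so a lookup never aliases a different partition's score.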
@@ -113,15 +121,16 @@ static inline int ssd_plane( x264_t *h, int size, int p, int x, int y )
         /* If the plane is smaller than 8x8, we can't do an SA8D; this probably isn't a big problem. */
         if( size <= PIXEL_8x8 )
         {
-            uint64_t acs = h->pixf.hadamard_ac[size]( fdec, FDEC_STRIDE );
-            satd = abs((int32_t)acs - sum_satd( h, size, x, y ))
-                 + abs((int32_t)(acs>>32) - sum_sa8d( h, size, x, y ));
+            uint64_t fdec_acs = h->pixf.hadamard_ac[size]( fdec, FDEC_STRIDE );
+            uint64_t fenc_acs = cached_hadamard( h, size, x, y );
+            satd = abs((int32_t)fdec_acs - (int32_t)fenc_acs)
+                 + abs((int32_t)(fdec_acs>>32) - (int32_t)(fenc_acs>>32));
             satd >>= 1;
         }
         else
         {
             int dc = h->pixf.sad[size]( fdec, FDEC_STRIDE, zero, 0 ) >> 1;
-            satd = abs(h->pixf.satd[size]( fdec, FDEC_STRIDE, zero, 0 ) - dc - sum_satd( h, size, x, y ));
+            satd = abs(h->pixf.satd[size]( fdec, FDEC_STRIDE, zero, 0 ) - dc - cached_satd( h, size, x, y ));
         }
         satd = (satd * h->mb.i_psy_rd * h->mb.i_psy_rd_lambda + 128) >> 8;
     }