Commit 63f71477 authored by Fiona Glaser

Faster weightp analysis

Modify pixel_var slightly to return the necessary information and use it for weight analysis instead of sad/ssd.
Various minor cosmetics.
parent 118dc81e
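
The core of the change: pixel_var no longer returns a finished variance. Every implementation (C, NEON, AltiVec, MMX/SSE2) now returns a uint64_t with the raw pixel sum in the low 32 bits and the sum of squares in the high 32 bits, so one pass over the block serves every caller: ratecontrol derives AC energy, and weightp analysis derives mean and variance without separate sad/ssd passes. A minimal C sketch of the convention (the unpacking mirrors ac_energy_plane below; function names here are illustrative):

#include <stdint.h>

/* What the new pixel_var implementations return. */
static uint64_t pack_var( uint32_t sum, uint32_t sqr )
{
    return sum + ((uint64_t)sqr << 32);
}

/* Recover N*variance for a block of N = 1<<shift pixels
 * (shift = 6 for 8x8, 8 for 16x16). */
static uint32_t ac_energy( uint64_t res, int shift )
{
    uint32_t sum = (uint32_t)res;
    uint32_t sqr = (uint32_t)(res >> 32);
    /* sum*sum still fits in 32 bits even for an all-255 16x16 block */
    return sqr - (sum * sum >> shift);
}
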
@@ -512,8 +512,6 @@ function x264_pixel_var_8x8_neon
VAR_SQR_SUM q1, q9, q14, d24
vld1.64 {d26}, [r0,:64], r1
VAR_SQR_SUM q2, q10, q15, d26
mov r2, #6
b x264_var_end
.endfunc
@@ -529,7 +527,6 @@ function x264_pixel_var_16x16_neon
VAR_SQR_SUM q2, q13, q15, d19, vpaddl.u16
mov ip, #7
mov r2, #8
var16_loop:
subs ip, ip, #1
vld1.64 {d16-d17}, [r0,:128], r1
@@ -554,8 +551,6 @@ function x264_var_end
vpadd.u32 d0, d0, d2
vmov r0, r1, d0
mul r0, r0, r0
sub r0, r1, r0, lsr r2
bx lr
.endfunc
......
@@ -52,8 +52,8 @@ DECL_X1( ssd, neon )
int x264_pixel_sa8d_8x8_neon( uint8_t *, int, uint8_t *, int );
int x264_pixel_sa8d_16x16_neon( uint8_t *, int, uint8_t *, int );
int x264_pixel_var_8x8_neon( uint8_t *, int );
int x264_pixel_var_16x16_neon( uint8_t *, int );
uint64_t x264_pixel_var_8x8_neon( uint8_t *, int );
uint64_t x264_pixel_var_16x16_neon( uint8_t *, int );
int x264_pixel_var2_8x8_neon( uint8_t *, int, uint8_t *, int, int * );
uint64_t x264_pixel_hadamard_ac_8x8_neon( uint8_t *, int );
......
@@ -142,10 +142,10 @@ int64_t x264_pixel_ssd_wxh( x264_pixel_function_t *pf, uint8_t *pix1, int i_pix1
/****************************************************************************
* pixel_var_wxh
****************************************************************************/
#define PIXEL_VAR_C( name, w, shift ) \
static int name( uint8_t *pix, int i_stride ) \
#define PIXEL_VAR_C( name, w ) \
static uint64_t name( uint8_t *pix, int i_stride ) \
{ \
uint32_t var = 0, sum = 0, sqr = 0; \
uint32_t sum = 0, sqr = 0; \
int x, y; \
for( y = 0; y < w; y++ ) \
{ \
@@ -156,12 +156,11 @@ static int name( uint8_t *pix, int i_stride ) \
} \
pix += i_stride; \
} \
var = sqr - (sum * sum >> shift); \
return var; \
return sum + ((uint64_t)sqr << 32); \
}
PIXEL_VAR_C( x264_pixel_var_16x16, 16, 8 )
PIXEL_VAR_C( x264_pixel_var_8x8, 8, 6 )
PIXEL_VAR_C( x264_pixel_var_16x16, 16 )
PIXEL_VAR_C( x264_pixel_var_8x8, 8 )
/****************************************************************************
* pixel_var2_wxh
......
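
Dropping the shift argument from PIXEL_VAR_C above is safe because shift was only ever log2 of the pixel count (6 for 8x8 = 64 pixels, 8 for 16x16 = 256), which the caller already knows from the block size. A quick sanity check of the caller-side formula on hypothetical values: a flat block has zero AC energy.

#include <assert.h>
#include <stdint.h>

int main( void )
{
    /* flat 8x8 block, every pixel equal to 100 */
    uint32_t sum = 64 * 100;            /* 6400   */
    uint32_t sqr = 64 * 100 * 100;      /* 640000 */
    /* sum*sum >> 6 == sum^2 / 64 == 640000, so the AC term vanishes */
    assert( sqr - (sum * sum >> 6) == 0 );
    return 0;
}
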
@@ -75,7 +75,7 @@ typedef struct
x264_pixel_cmp_t sad_aligned[7]; /* Aligned SAD for mbcmp */
int (*var2_8x8)( uint8_t *, int, uint8_t *, int, int * );
int (*var[4])( uint8_t *pix, int stride );
uint64_t (*var[4])( uint8_t *pix, int stride );
uint64_t (*hadamard_ac[4])( uint8_t *pix, int stride );
void (*ssim_4x4x2_core)( const uint8_t *pix1, int stride1,
......
@@ -1636,7 +1636,7 @@ static int pixel_ssd_8x8_altivec ( uint8_t *pix1, int i_stride_pix1,
/****************************************************************************
* variance
****************************************************************************/
static int x264_pixel_var_16x16_altivec( uint8_t *pix, int i_stride )
static uint64_t x264_pixel_var_16x16_altivec( uint8_t *pix, int i_stride )
{
ALIGNED_16(uint32_t sum_tab[4]);
ALIGNED_16(uint32_t sqr_tab[4]);
@@ -1661,11 +1661,10 @@ static int x264_pixel_var_16x16_altivec( uint8_t *pix, int i_stride )
uint32_t sum = sum_tab[3];
uint32_t sqr = sqr_tab[3];
uint32_t var = sqr - (sum * sum >> 8);
return var;
return sum + ((uint64_t)sqr<<32);
}
static int x264_pixel_var_8x8_altivec( uint8_t *pix, int i_stride )
static uint64_t x264_pixel_var_8x8_altivec( uint8_t *pix, int i_stride )
{
ALIGNED_16(uint32_t sum_tab[4]);
ALIGNED_16(uint32_t sqr_tab[4]);
@@ -1700,8 +1699,7 @@ static int x264_pixel_var_8x8_altivec( uint8_t *pix, int i_stride )
uint32_t sum = sum_tab[3];
uint32_t sqr = sqr_tab[3];
uint32_t var = sqr - (sum * sum >> 6);
return var;
return sum + ((uint64_t)sqr<<32);
}
......
@@ -316,14 +316,15 @@ SSD 4, 8, ssse3
%endif
%endmacro
%macro VAR_END 1
%macro VAR_END 0
HADDW m5, m7
movd r1d, m5
imul r1d, r1d
movd eax, m5
HADDD m6, m1
shr r1d, %1
movd eax, m6
sub eax, r1d ; sqr - (sum * sum >> shift)
movd edx, m6
%ifdef ARCH_X86_64
shl rdx, 32
add rax, rdx
%endif
RET
%endmacro
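
The VAR_END rewrite also shows why the macro lost its shift parameter: eax now holds the sum (from m5) and edx the sum of squares (from m6). On x86_32 that edx:eax pair is already how a uint64_t is returned under the standard calling convention, so only x86_64 needs the explicit shl/add to merge the halves into rax. In C terms the tail of the macro reduces to this sketch, matching the PIXEL_VAR_C return:

#include <stdint.h>

static uint64_t var_end( uint32_t sum, uint32_t sqr )
{
    /* low half (eax) = sum, high half (edx, or upper rax) = sqr */
    return sum + ((uint64_t)sqr << 32);
}
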
@@ -370,12 +371,12 @@ INIT_MMX
cglobal x264_pixel_var_16x16_mmxext, 2,3
VAR_START 0
VAR_2ROW 8, 16
VAR_END 8
VAR_END
cglobal x264_pixel_var_8x8_mmxext, 2,3
VAR_START 0
VAR_2ROW r1, 4
VAR_END 6
VAR_END
INIT_XMM
cglobal x264_pixel_var_16x16_sse2, 2,3,8
@@ -389,7 +390,7 @@ cglobal x264_pixel_var_16x16_sse2, 2,3,8
VAR_CORE
dec r2d
jg .loop
VAR_END 8
VAR_END
cglobal x264_pixel_var_8x8_sse2, 2,4,8
VAR_START 1
@@ -405,7 +406,7 @@ cglobal x264_pixel_var_8x8_sse2, 2,4,8
VAR_CORE
dec r2d
jg .loop
VAR_END 6
VAR_END
%macro VAR2_END 0
HADDW m5, m7
......
@@ -69,8 +69,8 @@ DECL_X4( sad, cache64_mmxext );
DECL_X4( sad, cache64_sse2 );
DECL_X4( sad, cache64_ssse3 );
DECL_PIXELS( int, var, mmxext, ( uint8_t *pix, int i_stride ))
DECL_PIXELS( int, var, sse2, ( uint8_t *pix, int i_stride ))
DECL_PIXELS( uint64_t, var, mmxext, ( uint8_t *pix, int i_stride ))
DECL_PIXELS( uint64_t, var, sse2, ( uint8_t *pix, int i_stride ))
DECL_PIXELS( uint64_t, hadamard_ac, mmxext, ( uint8_t *pix, int i_stride ))
DECL_PIXELS( uint64_t, hadamard_ac, sse2, ( uint8_t *pix, int i_stride ))
DECL_PIXELS( uint64_t, hadamard_ac, ssse3, ( uint8_t *pix, int i_stride ))
......
@@ -179,6 +179,22 @@ static inline double qscale2bits(ratecontrol_entry_t *rce, double qscale)
+ rce->misc_bits;
}
static ALWAYS_INLINE uint32_t ac_energy_plane( x264_t *h, int mb_x, int mb_y, x264_frame_t *frame, int i )
{
int w = i ? 8 : 16;
int shift = i ? 6 : 8;
int stride = frame->i_stride[i];
int offset = h->mb.b_interlaced
? w * (mb_x + (mb_y&~1) * stride) + (mb_y&1) * stride
: w * (mb_x + mb_y * stride);
int pix = i ? PIXEL_8x8 : PIXEL_16x16;
stride <<= h->mb.b_interlaced;
uint64_t res = h->pixf.var[pix]( frame->plane[i] + offset, stride );
uint32_t sum = (uint32_t)res;
uint32_t sqr = res >> 32;
return sqr - (sum * sum >> shift);
}
// Find the total AC energy of the block in all planes.
static NOINLINE uint32_t ac_energy_mb( x264_t *h, int mb_x, int mb_y, x264_frame_t *frame )
{
@@ -186,18 +202,9 @@ static NOINLINE uint32_t ac_energy_mb( x264_t *h, int mb_x, int mb_y, x264_frame
* and putting it after floating point ops. As a result, we put the emms at the end of the
function and make sure that it's always called before the float math. Noinline makes
* sure no reordering goes on. */
uint32_t var = 0, i;
for( i = 0; i < 3; i++ )
{
int w = i ? 8 : 16;
int stride = frame->i_stride[i];
int offset = h->mb.b_interlaced
? w * (mb_x + (mb_y&~1) * stride) + (mb_y&1) * stride
: w * (mb_x + mb_y * stride);
int pix = i ? PIXEL_8x8 : PIXEL_16x16;
stride <<= h->mb.b_interlaced;
var += h->pixf.var[pix]( frame->plane[i]+offset, stride );
}
uint32_t var = ac_energy_plane( h, mb_x, mb_y, frame, 0 );
var += ac_energy_plane( h, mb_x, mb_y, frame, 1 );
var += ac_energy_plane( h, mb_x, mb_y, frame, 2 );
x264_emms();
return var;
}
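
The interlaced offset in ac_energy_plane is the subtle part: (mb_y&~1)*stride addresses the top of the macroblock pair, (mb_y&1)*stride selects which field, and doubling the stride afterwards makes each row step skip the opposite field. A small illustration with hypothetical numbers (w = 16, stride = 640):

#include <stdio.h>

int main( void )
{
    int w = 16, stride = 640, mb_x = 0;
    for( int mb_y = 2; mb_y <= 3; mb_y++ )
    {
        int offset = w * (mb_x + (mb_y&~1) * stride) + (mb_y&1) * stride;
        int field_stride = stride << 1;
        /* first two rows this field macroblock touches */
        printf( "mb_y=%d: rows %d, %d, ...\n", mb_y,
                offset / stride, (offset + field_stride) / stride );
    }
    return 0; /* mb_y=2 reads rows 32, 34, ...; mb_y=3 reads rows 33, 35, ... */
}
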
......
@@ -56,24 +56,23 @@ static void get_h264_weight( unsigned int weight_nonh264, int offset, x264_weigh
}
w->i_scale = X264_MIN( w->i_scale, 127 );
}
/* due to a GCC bug on some platforms (win32), flat[16] may not actually be aligned. */
ALIGNED_16( static uint8_t flat[17] ) = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1};
static NOINLINE void weights_plane_analyse( x264_t *h, uint8_t *plane, int width, int height, int stride, unsigned int *sum, uint64_t *var )
{
int x,y;
unsigned int sad = 0;
uint64_t sad = 0;
uint64_t ssd = 0;
uint8_t *p = plane;
for( y = 0; y < height>>4; y++, p += stride*16 )
for( x = 0; x < width; x+=16 )
{
sad += h->pixf.sad_aligned[PIXEL_16x16]( p + x, stride, flat, 0 );
ssd += h->pixf.ssd[PIXEL_16x16]( p + x, stride, flat, 0 );
uint64_t res = h->pixf.var[PIXEL_16x16]( p + x, stride );
sad += (uint32_t)res;
ssd += res >> 32;
}
*sum = sad;
*var = ssd - (uint64_t) sad * sad / (width * height);
*var = ssd - ((uint64_t)sad * sad + width * height / 2) / (width * height);
x264_emms();
}
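
Besides replacing the separate sad/ssd calls with one pixel_var pass, the *var line changes the mean-square correction from truncating division to rounded division: adding half the divisor before dividing rounds to nearest instead of always down. The idiom, with illustrative values:

#include <stdint.h>

static uint64_t div_round( uint64_t num, uint64_t den )
{
    return (num + den / 2) / den;   /* round to nearest; halves round up */
}
/* e.g. 1004/8 = 125.5: truncation gives 125, div_round gives 126 */
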
@@ -126,24 +125,19 @@ static NOINLINE unsigned int x264_weight_cost( x264_t *h, x264_frame_t *fenc, ui
int i_lines = fenc->i_lines_lowres;
int i_width = fenc->i_width_lowres;
uint8_t *fenc_plane = fenc->lowres[0];
ALIGNED_ARRAY_16( uint8_t, buf, [8*8] );
ALIGNED_8( uint8_t buf[8*8] );
int pixoff = 0;
int i_mb = 0;
if( w )
{
for( y = 0; y < i_lines; y += 8, pixoff = y*i_stride )
for( x = 0; x < i_width; x += 8, i_mb++, pixoff += 8)
{
w->weightfn[8>>2]( buf, 8, &src[pixoff], i_stride, w, 8 );
cost += X264_MIN( h->pixf.mbcmp[PIXEL_8x8]( buf, 8, &fenc_plane[pixoff], i_stride ), fenc->i_intra_cost[i_mb] );
}
else
for( y = 0; y < i_lines; y += 8, pixoff = y*i_stride )
for( x = 0; x < i_width; x += 8, i_mb++, pixoff += 8 )
cost += X264_MIN( h->pixf.mbcmp[PIXEL_8x8]( &src[pixoff], i_stride, &fenc_plane[pixoff], i_stride ), fenc->i_intra_cost[i_mb] );
if( w )
{
/* Add cost of weights in the slice header. */
int numslices;
if( h->param.i_slice_count )
numslices = h->param.i_slice_count;
@@ -151,11 +145,15 @@
numslices = (h->sps->i_mb_width * h->sps->i_mb_height + h->param.i_slice_max_mbs-1) / h->param.i_slice_max_mbs;
else
numslices = 1;
/* FIXME: find a way to account for --slice-max-size?
 * Multiply by 2 as there will be a duplicate. 10 bits added as if there is a weighted frame, then an additional duplicate is used.
 * Since using lowres frames, assume lambda = 1. */
cost += numslices * ( 10 + 2 * ( bs_size_ue( w[0].i_denom ) + bs_size_se( w[0].i_scale ) + bs_size_se( w[0].i_offset ) ) );
}
else
for( y = 0; y < i_lines; y += 8, pixoff = y*i_stride )
for( x = 0; x < i_width; x += 8, i_mb++, pixoff += 8 )
cost += X264_MIN( h->pixf.mbcmp[PIXEL_8x8]( &src[pixoff], i_stride, &fenc_plane[pixoff], i_stride ), fenc->i_intra_cost[i_mb] );
x264_emms();
return cost;
}
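
The per-slice constant in the cost line comes from how the pred-weight table is coded: the denominator is written as an Exp-Golomb ue(v) and scale/offset as se(v), and bs_size_ue/bs_size_se (from x264's bs.h) return those bit lengths. A hedged reimplementation of the sizes, with a hypothetical example:

/* ue(v) takes 2*floor(log2(v+1)) + 1 bits */
static int size_ue( unsigned int v )
{
    int lz = 0;
    while( (v + 1) >> (lz + 1) )
        lz++;
    return 2 * lz + 1;
}

/* se(v) maps v>0 -> 2v-1, v<=0 -> -2v, then codes that as ue */
static int size_se( int v )
{
    unsigned int u = v > 0 ? 2 * (unsigned int)v - 1 : 2 * (unsigned int)(-v);
    return size_ue( u );
}
/* hypothetical weights denom=6, scale=64, offset=0:
 * 10 + 2*(size_ue(6) + size_se(64) + size_se(0)) = 10 + 2*(5 + 15 + 1) = 52 bits per slice */
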
@@ -171,17 +169,16 @@ void x264_weights_analyse( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, int
int i_delta_index = fenc->i_frame - ref->i_frame - 1;
/* epsilon is chosen to require at least a numerator of 127 (with denominator = 128) */
const float epsilon = 1.0/128.0;
float guess_scale;
int found;
x264_weight_t *weights = fenc->weight[0];
weights_plane_analyse( h, fenc->plane[0], fenc->i_width[0], fenc->i_lines[0], fenc->i_stride[0], &fenc_sum, &fenc_var );
weights_plane_analyse( h, ref->plane[0], ref->i_width[0], ref->i_lines[0], ref->i_stride[0], &ref_sum, &ref_var );
fenc_var = round( sqrt( fenc_var ) );
ref_var = round( sqrt( ref_var ) );
fenc_mean = (float)fenc_sum / (fenc->i_lines[0] * fenc->i_width[0]);
ref_mean = (float)ref_sum / (fenc->i_lines[0] * fenc->i_width[0]);
//early termination
if( fabs( ref_mean - fenc_mean ) < 0.5 && fabsf( 1 - (float)fenc_var / ref_var ) < epsilon )
@@ -220,7 +217,7 @@ void x264_weights_analyse( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, int
x264_emms();
/* FIXME: More analysis can be done here on SAD vs. SATD termination. */
/* 0.2% termination derived experimentally to avoid weird weights in frames that are mostly intra. */
if( !found || (minscale == 1<<mindenom && minoff == 0) || (float)minscore / origscore > 0.998 )
{
SET_WEIGHT( weights[0], 0, 1, 0, 0 );
......
@@ -344,16 +344,20 @@ static int check_pixel( int cpu_ref, int cpu_new )
#define TEST_PIXEL_VAR( i ) \
if( pixel_asm.var[i] != pixel_ref.var[i] ) \
{ \
int res_c, res_asm; \
set_func_name( "%s_%s", "var", pixel_names[i] ); \
used_asm = 1; \
res_c = call_c( pixel_c.var[i], buf1, 16 ); \
res_asm = call_a( pixel_asm.var[i], buf1, 16 ); \
/* abi-check wrapper can't return uint64_t, so separate it from return value check */\
call_c1( pixel_c.var[i], buf1, 16 ); \
call_a1( pixel_asm.var[i], buf1, 16 ); \
uint64_t res_c = pixel_c.var[i]( buf1, 16 ); \
uint64_t res_asm = pixel_asm.var[i]( buf1, 16 ); \
if( res_c != res_asm ) \
{ \
ok = 0; \
fprintf( stderr, "var[%d]: %d != %d [FAILED]\n", i, res_c, res_asm ); \
fprintf( stderr, "var[%d]: %d %d != %d %d [FAILED]\n", i, (int)res_c, (int)(res_c>>32), (int)res_asm, (int)(res_asm>>32) ); \
} \
call_c2( pixel_c.var[i], buf1, 16 ); \
call_a2( pixel_asm.var[i], buf1, 16 ); \
}
ok = 1; used_asm = 0;
@@ -386,6 +390,8 @@ static int check_pixel( int cpu_ref, int cpu_new )
for( j=0; j<32; j++ )
{
uint8_t *pix = (j&16 ? buf1 : buf3) + (j&15)*256;
call_c1( pixel_c.hadamard_ac[i], buf1, 16 );
call_a1( pixel_asm.hadamard_ac[i], buf1, 16 );
uint64_t rc = pixel_c.hadamard_ac[i]( pix, 16 );
uint64_t ra = pixel_asm.hadamard_ac[i]( pix, 16 );
if( rc != ra )
......