Commit 45e36790 authored by David Pethes, committed by Fiona Glaser

Add dedicated variance function instead of using SAD+SSD

Faster variance calculation: one pass over the block instead of the two passes (SAD, then SSD) it replaces.
parent 25976441
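What the change boils down to: ac_energy_mb previously derived each block's AC energy from two separate kernels, a SAD and an SSD against an all-zero block, walking the pixels twice; the new pixel_var kernels accumulate the pixel sum and the sum of squares in a single pass. A minimal scalar sketch of the before and after (function names here are illustrative, not x264 API):

#include <stdint.h>

/* Before: two full passes over the block. */
static uint32_t block_sum( const uint8_t *pix, int stride, int w )
{
    uint32_t s = 0;
    for( int y = 0; y < w; y++, pix += stride )
        for( int x = 0; x < w; x++ )
            s += pix[x];                 /* SAD against an all-zero block */
    return s;
}

static uint32_t block_ssd( const uint8_t *pix, int stride, int w )
{
    uint32_t s = 0;
    for( int y = 0; y < w; y++, pix += stride )
        for( int x = 0; x < w; x++ )
            s += pix[x] * pix[x];        /* SSD against an all-zero block */
    return s;
}

/* After: one pass accumulates both, as PIXEL_VAR_C below does. */
static uint32_t block_var( const uint8_t *pix, int stride, int w, int shift )
{
    uint32_t sum = 0, sqr = 0;
    for( int y = 0; y < w; y++, pix += stride )
        for( int x = 0; x < w; x++ )
        {
            sum += pix[x];
            sqr += pix[x] * pix[x];
        }
    return sqr - (sum * sum >> shift);   /* shift = log2(w*w) */
}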
@@ -152,6 +152,33 @@ static inline void pixel_sub_wxh( int16_t *diff, int i_size,
}
/****************************************************************************
* pixel_var_wxh
****************************************************************************/
#define PIXEL_VAR_C( name, w, shift ) \
static int name( uint8_t *pix, int i_stride, uint32_t *sad ) \
{ \
uint32_t var = 0, sum = 0, sqr = 0; \
int x, y; \
for( y = 0; y < w; y++ ) \
{ \
for( x = 0; x < w; x++ ) \
{ \
sum += pix[x]; \
sqr += pix[x] * pix[x]; \
} \
pix += i_stride; \
} \
var = sqr - (sum * sum >> shift); \
*sad = sum; \
return var; \
}
PIXEL_VAR_C( x264_pixel_var_16x16, 16, 8 )
PIXEL_VAR_C( x264_pixel_var_8x8, 8, 6 )
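/* Exposition note (not part of the original commit): for an N-pixel block the
 * value returned above is N times the variance, i.e. the block's AC energy:
 *     N*Var = sum(x^2) - (sum(x))^2 / N,   with N = w*w = 2^shift,
 * hence shift = 8 for 16x16 (256 pixels) and 6 for 8x8 (64 pixels). */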
/****************************************************************************
* pixel_satd_WxH: sum of 4x4 Hadamard transformed differences
****************************************************************************/
@@ -532,6 +559,9 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
INIT4( sa8d, );
INIT_ADS( );
pixf->var[PIXEL_16x16] = x264_pixel_var_16x16;
pixf->var[PIXEL_8x8] = x264_pixel_var_8x8;
pixf->ssim_4x4x2_core = ssim_4x4x2_core;
pixf->ssim_end4 = ssim_end4;
@@ -550,7 +580,8 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
INIT7( satd_x3, _mmxext );
INIT7( satd_x4, _mmxext );
INIT_ADS( _mmxext );
pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_mmxext;
pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_mmxext;
#ifdef ARCH_X86
pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_mmxext;
pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_mmxext;
@@ -592,6 +623,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
INIT2( sad_x3, _sse2 );
INIT2( sad_x4, _sse2 );
INIT_ADS( _sse2 );
pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_sse2;
#ifdef ARCH_X86
if( cpu&X264_CPU_CACHELINE_64 )
@@ -608,6 +640,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
INIT5( satd, _sse2 );
INIT5( satd_x3, _sse2 );
INIT5( satd_x4, _sse2 );
pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_sse2;
pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_sse2;
pixf->ssim_end4 = x264_pixel_ssim_end4_sse2;
pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_sse2;
......
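Dispatch order note: x264_pixel_init installs the portable C kernels unconditionally, then each CPU-feature block above (MMXEXT, then SSE2) overwrites the same var[] slots with its faster implementation, so the last assignment that runs for the detected CPU wins.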
@@ -26,6 +26,7 @@
typedef int (*x264_pixel_cmp_t) ( uint8_t *, int, uint8_t *, int );
typedef void (*x264_pixel_cmp_x3_t) ( uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int[3] );
typedef void (*x264_pixel_cmp_x4_t) ( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int[4] );
typedef int (*x264_pixel_var_t) ( uint8_t *, int, uint32_t * );
enum
{
@@ -71,6 +72,7 @@ typedef struct
x264_pixel_cmp_t fpelcmp[7]; /* either satd or sad for fullpel motion search */
x264_pixel_cmp_x3_t fpelcmp_x3[7];
x264_pixel_cmp_x4_t fpelcmp_x4[7];
x264_pixel_var_t var[4];
void (*ssim_4x4x2_core)( const uint8_t *pix1, int stride1,
const uint8_t *pix2, int stride2, int sums[2][4] );
......
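The new x264_pixel_var_t slot is consumed the same way as the other pixel function pointers: look the kernel up once, call it per block. A minimal usage sketch (the pared-down struct and helper are illustrative, not part of x264):

#include <stdint.h>

typedef int (*x264_pixel_var_t)( uint8_t *, int, uint32_t * );

/* stand-in for x264_pixel_function_t, keeping only the var[] member */
typedef struct { x264_pixel_var_t var[4]; } pixf_t;

/* idx is one of the PIXEL_* enum values above, e.g. PIXEL_16x16 */
static int block_energy( pixf_t *pixf, int idx, uint8_t *pix, int stride,
                         uint32_t *sum )
{
    return pixf->var[idx]( pix, stride, sum );  /* N*variance; pixel sum in *sum */
}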
@@ -162,6 +162,112 @@ SSD 8, 8, sse2
SSD 8, 4, sse2
;=============================================================================
; variance
;=============================================================================
%macro VAR_START 0
pxor m5, m5 ; sum
pxor m6, m6 ; sum squared
pxor m7, m7 ; zero
%ifdef ARCH_X86_64
%define t3d r3d
%else
%define t3d r2d
%endif
%endmacro
%macro VAR_END 1
%if mmsize == 16
movhlps m0, m5
paddw m5, m0
%endif
movifnidn r2d, r2m
movd r1d, m5
movd [r2], m5 ; return sum
imul r1d, r1d
HADDD m6, m1
shr r1d, %1
movd eax, m6
sub eax, r1d ; sqr - (sum * sum >> shift)
RET
%endmacro
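; Exposition note (not in the original commit): VAR_2ROW below does two
; mmsize-wide loads per iteration, at [r0] and [r0+%1]; %1 is either the
; stride r1 (two rows at once) or an offset into a row wider than mmsize
; (the 16x16 mmx case, where %1 is 8). m7 stays zero, so psadbw against it
; horizontally sums the bytes and m5 accumulates the pixel sum, while
; punpck{l,h}bw widens bytes to words and pmaddwd of each register with
; itself accumulates the sum of squares in m6. VAR_END then reduces m5/m6
; and computes sqr - (sum*sum >> shift).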
%macro VAR_2ROW 2
mov t3d, %2
.loop:
mova m0, [r0]
mova m1, m0
mova m3, [r0+%1]
mova m2, m0
punpcklbw m0, m7
mova m4, m3
punpckhbw m1, m7
%ifidn %1, r1
lea r0, [r0+%1*2]
%else
add r0, r1
%endif
punpckhbw m4, m7
psadbw m2, m7
paddw m5, m2
mova m2, m3
punpcklbw m3, m7
dec t3d
psadbw m2, m7
pmaddwd m0, m0
paddw m5, m2
pmaddwd m1, m1
paddd m6, m0
pmaddwd m3, m3
paddd m6, m1
pmaddwd m4, m4
paddd m6, m3
paddd m6, m4
jg .loop
%endmacro
;-----------------------------------------------------------------------------
; int x264_pixel_var_wxh_mmxext( uint8_t *, int, uint32_t * )
;-----------------------------------------------------------------------------
INIT_MMX
cglobal x264_pixel_var_16x16_mmxext, 2,3
VAR_START
VAR_2ROW 8, 16
VAR_END 8
cglobal x264_pixel_var_8x8_mmxext, 2,3
VAR_START
VAR_2ROW r1, 4
VAR_END 6
INIT_XMM
cglobal x264_pixel_var_16x16_sse2, 2,3
VAR_START
VAR_2ROW r1, 8
VAR_END 8
cglobal x264_pixel_var_8x8_sse2, 2,3
VAR_START
mov t3d, 4
.loop:
movh m0, [r0] ; load one 8-pixel row into the low qword
movhps m0, [r0+r1] ; and the next row into the high qword
lea r0, [r0+r1*2]
mova m1, m0
punpcklbw m0, m7
mova m2, m1
punpckhbw m1, m7
dec t3d
pmaddwd m0, m0
pmaddwd m1, m1
psadbw m2, m7
paddw m5, m2
paddd m6, m0
paddd m6, m1
jnz .loop
VAR_END 6
;=============================================================================
; SATD
......
@@ -68,6 +68,11 @@ DECL_X4( sad, cache64_ssse3 );
#undef DECL_X1
#undef DECL_X4
int x264_pixel_var_16x16_mmxext( uint8_t *pix, int i_stride, uint32_t *sad );
int x264_pixel_var_16x16_sse2 ( uint8_t *pix, int i_stride, uint32_t *sad );
int x264_pixel_var_8x8_mmxext ( uint8_t *pix, int i_stride, uint32_t *sad );
int x264_pixel_var_8x8_sse2 ( uint8_t *pix, int i_stride, uint32_t *sad );
void x264_intra_satd_x3_4x4_mmxext( uint8_t *, uint8_t *, int * );
void x264_intra_satd_x3_4x4_ssse3( uint8_t *, uint8_t *, int * );
void x264_intra_satd_x3_8x8c_mmxext( uint8_t *, uint8_t *, int * );
......
@@ -187,7 +187,7 @@ static NOINLINE int ac_energy_mb( x264_t *h, int mb_x, int mb_y, int *satd )
/* FIXME: This array is larger than necessary because a bug in GCC causes an all-zero
* array to be placed in .bss despite .bss not being correctly aligned on some platforms (win32?) */
DECLARE_ALIGNED_16( static uint8_t zero[17] ) = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1};
- unsigned int var=0, sad, ssd, i;
+ unsigned int var=0, sad, i;
if( satd || h->param.rc.i_aq_mode == X264_AQ_GLOBAL )
{
for( i=0; i<3; i++ )
@@ -199,9 +199,7 @@ static NOINLINE int ac_energy_mb( x264_t *h, int mb_x, int mb_y, int *satd )
: w * (mb_x + mb_y * stride);
int pix = i ? PIXEL_8x8 : PIXEL_16x16;
stride <<= h->mb.b_interlaced;
- sad = h->pixf.sad[pix]( zero, 0, h->fenc->plane[i]+offset, stride );
- ssd = h->pixf.ssd[pix]( zero, 0, h->fenc->plane[i]+offset, stride );
- var += ssd - (sad * sad >> (i?6:8));
+ var += h->pixf.var[pix]( h->fenc->plane[i]+offset, stride, &sad );
// SATD to represent the block's overall complexity (bit cost) for intra encoding.
// exclude the DC coef, because nothing short of an actual intra prediction will estimate DC cost.
if( var && satd )
......
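Equivalence note: the pixel sum the new kernel writes through &sad is numerically identical to what the removed h->pixf.sad call produced, since a SAD against an all-zero block is simply the sum of the pixels, so any later use of sad in the elided remainder of ac_energy_mb behaves as before.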
@@ -302,6 +302,27 @@ static int check_pixel( int cpu_ref, int cpu_new )
TEST_PIXEL_X(3);
TEST_PIXEL_X(4);
#define TEST_PIXEL_VAR( i ) \
if( pixel_asm.var[i] != pixel_ref.var[i] ) \
{ \
uint32_t res_c, res_asm; \
uint32_t sad_c, sad_asm; \
set_func_name( "%s_%s", "var", pixel_names[i] ); \
used_asm = 1; \
res_c = call_c( pixel_c.var[i], buf1, 16, &sad_c ); \
res_asm = call_a( pixel_asm.var[i], buf1, 16, &sad_asm ); \
if( (res_c != res_asm) || (sad_c != sad_asm) ) \
{ \
ok = 0; \
fprintf( stderr, "var[%d]: %d,%d != %d,%d [FAILED]\n", i, res_c, sad_c, res_asm, sad_asm ); \
} \
}
ok = 1; used_asm = 0;
TEST_PIXEL_VAR( PIXEL_16x16 );
TEST_PIXEL_VAR( PIXEL_8x8 );
report( "pixel var :" );
#define TEST_INTRA_SATD( name, pred, satd, i8x8, ... ) \
if( pixel_asm.name && pixel_asm.name != pixel_ref.name ) \
{ \
......