Commit fea48871 authored by Simon Horlick, committed by Fiona Glaser

MBAFF: Create a VSAD DSP function

x86 assembly by Fiona Glaser. This gives a roughly 30x speed
increase over the C version.
parent 3a7194f1
......@@ -641,6 +641,14 @@ float x264_pixel_ssim_wxh( x264_pixel_function_t *pf,
return ssim;
}
/* Vertical SAD of a 16-wide, 16-row block: for each of the 15 adjacent row
 * pairs, sum the absolute differences between vertically neighbouring
 * samples.  stride is the distance between rows, in pixels. */
int pixel_vsad( pixel *src, int stride )
{
    int sum = 0;
    pixel *upper = src;
    for( int row = 0; row < 15; row++ )
    {
        /* Compare this row against the one directly below it. */
        pixel *lower = upper + stride;
        for( int x = 0; x < 16; x++ )
            sum += abs( upper[x] - lower[x] );
        upper = lower;
    }
    return sum;
}
/****************************************************************************
* successive elimination
......@@ -746,6 +754,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
pixf->ssim_4x4x2_core = ssim_4x4x2_core;
pixf->ssim_end4 = ssim_end4;
pixf->var2_8x8 = pixel_var2_8x8;
pixf->vsad = pixel_vsad;
pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4;
pixf->intra_satd_x3_4x4 = x264_intra_satd_x3_4x4;
......@@ -873,6 +882,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_mmxext;
pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_mmxext;
pixf->var2_8x8 = x264_pixel_var2_8x8_mmxext;
pixf->vsad = x264_pixel_vsad_mmxext;
if( cpu&X264_CPU_CACHELINE_32 )
{
......@@ -921,6 +931,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_sse2;
#endif
pixf->var2_8x8 = x264_pixel_var2_8x8_sse2;
pixf->vsad = x264_pixel_vsad_sse2;
}
if( (cpu&X264_CPU_SSE2) && !(cpu&X264_CPU_SSE2_IS_SLOW) )
......
......@@ -82,6 +82,7 @@ typedef struct
x264_pixel_cmp_x3_t fpelcmp_x3[7];
x264_pixel_cmp_x4_t fpelcmp_x4[7];
x264_pixel_cmp_t sad_aligned[7]; /* Aligned SAD for mbcmp */
int (*vsad)( pixel *, int );
int (*var2_8x8)( pixel *, int, pixel *, int, int * );
uint64_t (*var[4])( pixel *pix, int stride );
......
......@@ -138,6 +138,8 @@ float x264_pixel_ssim_end4_avx( int sum0[5][4], int sum1[5][4], int width );
int x264_pixel_var2_8x8_mmxext( pixel *, int, pixel *, int, int * );
int x264_pixel_var2_8x8_sse2( pixel *, int, pixel *, int, int * );
int x264_pixel_var2_8x8_ssse3( uint8_t *, int, uint8_t *, int, int * );
int x264_pixel_vsad_mmxext( pixel *src, int stride );
int x264_pixel_vsad_sse2( pixel *src, int stride );
#define DECL_ADS( size, suffix ) \
int x264_pixel_ads##size##_##suffix( int enc_dc[size], uint16_t *sums, int delta,\
......
......@@ -272,6 +272,65 @@ cglobal pixel_sad_8x16_sse2, 4,4
SAD_END_SSE2
RET
;-----------------------------------------------------------------------------
; int pixel_vsad( pixel *src, int stride );
;-----------------------------------------------------------------------------
; pixel_vsad_mmxext: vertical SAD over 16 rows of 16 bytes.
;   In:  r0 = src, r1 = stride (bytes between rows)
;   Out: eax = sum over the 15 adjacent row pairs of |row[y][x] - row[y+1][x]|
; Only assembled when ARCH_X86_64 is not defined (32-bit builds).
; Each row is handled as two 8-byte halves ([r0+0] and [r0+8]); m0 and m1
; hold the running totals for the left and right halves respectively.
; NOTE(review): the function does not issue emms itself; presumably callers
; are responsible for clearing MMX state - confirm against the call sites.
%ifndef ARCH_X86_64
INIT_MMX
cglobal pixel_vsad_mmxext, 2,3
; Rows 0 and 1: the first pair is done outside the loop so that m0/m1 start
; out holding a SAD instead of needing to be zeroed.
mova m0, [r0+0]
mova m1, [r0+8]
mova m2, [r0+r1+0]
mova m3, [r0+r1+8]
lea r0, [r0+r1*2]
psadbw m0, m2
psadbw m1, m3
; 7 iterations x 2 row pairs each = the remaining 14 pairs (rows 1..15).
mov r2d, 7
.loop:
; Invariant at loop entry: m2/m3 hold the most recently loaded row and
; r0 points at the row after it.
mova m4, [r0+0]
mova m5, [r0+8]
; psadbw overwrites its destination, so the old row in m2/m3 is consumed
; here while the new row stays intact in m4/m5 for the next pair.
psadbw m2, m4
psadbw m3, m5
paddw m0, m2
paddw m1, m3
mova m2, [r0+r1+0]
mova m3, [r0+r1+8]
lea r0, [r0+r1*2]
; Second pair of the iteration: m4/m5 are consumed, m2/m3 carry the
; newest row into the next iteration.
psadbw m4, m2
psadbw m5, m3
paddw m0, m4
paddw m1, m5
dec r2d
jg .loop
; Combine the two half-row totals.  Worst case is 15*16*255 = 61200, which
; still fits in the 16-bit lane that paddw operates on.
paddw m0, m1
movd eax, m0
RET
%endif
; pixel_vsad_sse2: vertical SAD over 16 rows of 16 bytes.
;   In:  r0 = src, r1 = stride (bytes between rows; a 32-bit int in C)
;   Out: eax = sum over the 15 adjacent row pairs of |row[y][x] - row[y+1][x]|
; Rows are loaded with mova.
; NOTE(review): under INIT_XMM this is presumably an aligned 16-byte load, so
; src and stride are assumed to be 16-byte aligned - confirm against callers.
; The row loop is fully unrolled at assembly time (%rep 15); SWAP is used to
; rotate register names between steps (x86inc macro - presumably an
; assembly-time rename that emits no instructions).
INIT_XMM
cglobal pixel_vsad_sse2, 2,2
%ifdef ARCH_X86_64
    ; stride arrives as a 32-bit int, but r1 is used below as a full-width
    ; index register.  The 64-bit calling conventions do not guarantee the
    ; contents of the upper 32 bits, so sign-extend before addressing.
    movsxd    r1, r1d
%endif
    mova      m1, [r0]              ; m1 = row 0 ("previous row")
%assign i 1
%rep 15
    ; Odd i reads the row at r0+stride and then advances r0 by two rows;
    ; even i reads the row at r0.  Together this walks rows 1..15 in order.
    mova      m2, [r0+r1*(i&1)]     ; m2 = row i ("current row")
%if i&1
    lea       r0, [r0+r1*2]
%endif
    psadbw    m1, m2                ; m1 = SAD(previous row, current row)
%if i>1
    paddw     m0, m1                ; accumulate into the running total
%else
    SWAP 0, 1                       ; first pair: its SAD becomes the accumulator m0
%endif
    SWAP 1, 2                       ; current row becomes the previous row for the next step
%assign i i+1
%endrep
    ; psadbw leaves one partial sum in each 64-bit half; fold high into low.
    ; Worst case total is 15*16*255 = 61200, which fits in a 16-bit lane.
    movhlps   m1, m0
    paddw     m0, m1
    movd     eax, m0
    RET
;-----------------------------------------------------------------------------
; void intra_sad_x3_4x4( uint8_t *fenc, uint8_t *fdec, int res[3] );
;-----------------------------------------------------------------------------
......
......@@ -417,6 +417,22 @@ static int check_pixel( int cpu_ref, int cpu_new )
}
report( "pixel hadamard_ac :" );
ok = 1; used_asm = 0;
if( pixel_asm.vsad != pixel_ref.vsad )
{
int res_c, res_asm;
set_func_name( "vsad" );
used_asm = 1;
res_c = call_c( pixel_c.vsad, pbuf1, 16 );
res_asm = call_a( pixel_asm.vsad, pbuf1, 16 );
if( res_c != res_asm )
{
ok = 0;
fprintf( stderr, "vsad: %d != %d\n", res_c, res_asm );
}
}
report( "pixel vsad :" );
#define TEST_INTRA_MBCMP( name, pred, satd, i8x8, ... ) \
if( pixel_asm.name && pixel_asm.name != pixel_ref.name ) \
{ \
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment