Commit 14b45a81 authored by Loren Merritt's avatar Loren Merritt

new ssd_8x*_sse2

align ssd_16x*_sse2
unroll ssd_4x*_mmx
parent 72869f76
......@@ -99,14 +99,17 @@ int64_t x264_pixel_ssd_wxh( x264_pixel_function_t *pf, uint8_t *pix1, int i_pix1
{
int64_t i_ssd = 0;
int x, y;
int align = !(((long)pix1 | (long)pix2 | i_pix1 | i_pix2) & 15);
#define SSD(size) i_ssd += pf->ssd[size]( pix1 + y*i_pix1 + x, i_pix1, \
pix2 + y*i_pix2 + x, i_pix2 );
for( y = 0; y < i_height-15; y += 16 )
{
for( x = 0; x < i_width-15; x += 16 )
SSD(PIXEL_16x16);
if( x < i_width-7 )
x = 0;
if( align )
for( ; x < i_width-15; x += 16 )
SSD(PIXEL_16x16);
for( ; x < i_width-7; x += 8 )
SSD(PIXEL_8x16);
}
if( y < i_height-7 )
......@@ -610,7 +613,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
// these are faster on both Intel and AMD
if( cpu&X264_CPU_SSE2 )
{
INIT2( ssd, _sse2 );
INIT5( ssd, _sse2 );
pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_sse2;
pixf->ssim_end4 = x264_pixel_ssim_end4_sse2;
pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_sse2;
......
......@@ -24,6 +24,8 @@
#ifndef _PIXEL_H
#define _PIXEL_H 1
// SSD assumes all args aligned
// other cmp functions assume first arg aligned
typedef int (*x264_pixel_cmp_t) ( uint8_t *, int, uint8_t *, int );
typedef void (*x264_pixel_cmp_x3_t) ( uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int[3] );
typedef void (*x264_pixel_cmp_x4_t) ( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int[4] );
......
......@@ -83,41 +83,64 @@ SECTION .text
paddd mm0, mm4
%endmacro
%macro SSD_INC_1x8P 0
; Accumulate SSD over two 16-pixel-wide rows by expanding the
; single-row helper twice. SSD_INC_1x16P presumably advances the
; row pointers (r0/r2) itself each time -- TODO confirm against
; its full definition (only its tail is visible in this hunk).
%macro SSD_INC_2x16P 0
SSD_INC_1x16P
SSD_INC_1x16P
%endmacro
; NOTE(review): this span was captured from a unified diff with the
; +/- markers stripped, so removed (old) and added (new) lines are
; interleaved below -- see the duplicated `por mm1, mm2`, the
; duplicated `punpckhbw mm2, mm7`, and the presence of BOTH the
; one-row `add` and the two-row `lea` pointer advances. Restore from
; the upstream commit before assembling.
; Intent: accumulate SSD over two 8-pixel rows into mm0, using the
; psubusb-both-ways trick to form 8-bit absolute differences, then
; widening to 16-bit with mm7 (assumed zero) and squaring/summing
; via pmaddwd.
%macro SSD_INC_2x8P 0
movq mm1, [r0]                  ; row 0 of pix1
movq mm2, [r2]                  ; row 0 of pix2
movq mm3, [r0+r1]               ; row 1 of pix1
movq mm4, [r2+r3]               ; row 1 of pix2
movq mm5, mm2                   ; keep copies for the reverse subtraction
movq mm6, mm4
psubusb mm2, mm1                ; max(pix2-pix1,0) per byte (saturating)
psubusb mm4, mm3
psubusb mm1, mm5                ; max(pix1-pix2,0) per byte
por mm1, mm2 ; mm1 = 8bit abs diff
psubusb mm3, mm6
por mm1, mm2                    ; NOTE(review): duplicate of the por above (diff artifact)
por mm3, mm4                    ; mm3 = 8-bit abs diff, row 1
movq mm2, mm1
movq mm4, mm3
punpcklbw mm1, mm7              ; widen abs diffs to 16 bit (mm7 assumed 0)
punpckhbw mm2, mm7 ; (mm1,mm2) = 16bit abs diff
punpcklbw mm3, mm7
punpckhbw mm2, mm7              ; NOTE(review): old/new pair with the punpckhbw above (diff artifact)
punpckhbw mm4, mm7
pmaddwd mm1, mm1                ; square each word and add adjacent pairs
pmaddwd mm2, mm2
pmaddwd mm3, mm3
pmaddwd mm4, mm4
add r0, r1                      ; NOTE(review): old single-row advance (removed lines)...
add r2, r3
lea r0, [r0+2*r1]               ; ...replaced by a two-row advance in the new version
lea r2, [r2+2*r3]
paddd mm0, mm1                  ; accumulate 32-bit partial sums into mm0
paddd mm0, mm2
paddd mm0, mm3
paddd mm0, mm4
%endmacro
%macro SSD_INC_1x4P 0
movd mm1, [r0]
movd mm2, [r2]
movq mm5, mm2
psubusb mm2, mm1
psubusb mm1, mm5
por mm1, mm2
punpcklbw mm1, mm7
pmaddwd mm1, mm1
add r0, r1
add r2, r3
paddd mm0, mm1
; Accumulate SSD over two 4-pixel-wide rows into mm0.
; A 4-pixel row fits in a dword, so the bytes are zero-extended to
; words and subtracted directly -- no abs-diff trick is needed since
; (a-b)^2 == (b-a)^2 and pmaddwd squares the signed differences.
; Assumes: mm7 = 0 (zero for byte->word widening), mm0 = running sum,
; r0/r2 = pixel pointers, r1/r3 = strides (per the cglobal callers).
%macro SSD_INC_2x4P 0
movd mm1, [r0]                  ; 4 pixels, row 0 of pix1
movd mm2, [r2]                  ; 4 pixels, row 0 of pix2
movd mm3, [r0+r1]               ; row 1 of pix1
movd mm4, [r2+r3]               ; row 1 of pix2
punpcklbw mm1, mm7              ; zero-extend bytes to 16-bit words
punpcklbw mm2, mm7
punpcklbw mm3, mm7
punpcklbw mm4, mm7
psubw mm1, mm2                  ; signed 16-bit differences, row 0
psubw mm3, mm4                  ; row 1
pmaddwd mm1, mm1                ; square and pairwise-add -> 32-bit partial sums
pmaddwd mm3, mm3
lea r0, [r0+2*r1]               ; advance both sources by two rows
lea r2, [r2+2*r3]
paddd mm0, mm1                  ; accumulate into mm0
paddd mm0, mm3
%endmacro
;-----------------------------------------------------------------------------
......@@ -127,8 +150,8 @@ SECTION .text
cglobal x264_pixel_ssd_%1x%2_mmx, 4,4
pxor mm7, mm7 ; zero
pxor mm0, mm0 ; mm0 holds the sum
%rep %2
SSD_INC_1x%1P
%rep %2/2
SSD_INC_2x%1P
%endrep
movq mm1, mm0
psrlq mm1, 32
......@@ -146,10 +169,10 @@ SSD_MMX 4, 8
SSD_MMX 4, 4
%macro SSD_INC_2x16P_SSE2 0
movdqu xmm1, [r0]
movdqu xmm2, [r2]
movdqu xmm3, [r0+r1]
movdqu xmm4, [r2+r3]
movdqa xmm1, [r0]
movdqa xmm2, [r2]
movdqa xmm3, [r0+r1]
movdqa xmm4, [r2+r3]
movdqa xmm5, xmm1
movdqa xmm6, xmm3
......@@ -180,6 +203,27 @@ SSD_MMX 4, 4
paddd xmm0, xmm3
%endmacro
; Accumulate SSD over two 8-pixel-wide rows into xmm0 (SSE2).
; movq loads 8 bytes into the low half of each xmm register; the
; bytes are zero-extended to words and subtracted directly, so no
; abs-diff trick is needed ((a-b)^2 == (b-a)^2 under pmaddwd).
; Assumes: xmm7 = 0, xmm0 = running sum, r0/r2 = pixel pointers,
; r1/r3 = strides (per the cglobal callers).
%macro SSD_INC_2x8P_SSE2 0
movq xmm1, [r0]                 ; 8 pixels, row 0 of pix1
movq xmm2, [r2]                 ; 8 pixels, row 0 of pix2
movq xmm3, [r0+r1]              ; row 1 of pix1
movq xmm4, [r2+r3]              ; row 1 of pix2
punpcklbw xmm1,xmm7             ; zero-extend bytes to 16-bit words
punpcklbw xmm2,xmm7
punpcklbw xmm3,xmm7
punpcklbw xmm4,xmm7
psubw xmm1,xmm2                 ; signed 16-bit differences, row 0
psubw xmm3,xmm4                 ; row 1
pmaddwd xmm1,xmm1               ; square and pairwise-add -> 32-bit partial sums
pmaddwd xmm3,xmm3
lea r0, [r0+r1*2]               ; advance both sources by two rows
lea r2, [r2+r3*2]
paddd xmm0, xmm1                ; accumulate into xmm0
paddd xmm0, xmm3
%endmacro
;-----------------------------------------------------------------------------
; int x264_pixel_ssd_16x16_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
......@@ -188,7 +232,7 @@ cglobal x264_pixel_ssd_%1x%2_sse2, 4,4
pxor xmm7, xmm7
pxor xmm0, xmm0
%rep %2/2
SSD_INC_2x16P_SSE2
SSD_INC_2x%1P_SSE2
%endrep
HADDD xmm0, xmm1
movd eax, xmm0
......@@ -197,6 +241,9 @@ cglobal x264_pixel_ssd_%1x%2_sse2, 4,4
SSD_SSE2 16, 16
SSD_SSE2 16, 8
SSD_SSE2 8, 16
SSD_SSE2 8, 8
SSD_SSE2 8, 4
......
......@@ -48,7 +48,7 @@ static int check_pixel( int cpu_ref, int cpu_new )
x264_predict_4x4_init( 0, predict_4x4 );
x264_predict_8x8_filter( buf2+40, edge, ALL_NEIGHBORS, ALL_NEIGHBORS );
#define TEST_PIXEL( name ) \
#define TEST_PIXEL( name, align ) \
for( i = 0, ok = 1, used_asm = 0; i < 7; i++ ) \
{ \
int res_c, res_asm; \
......@@ -57,8 +57,8 @@ static int check_pixel( int cpu_ref, int cpu_new )
for( j=0; j<64; j++ ) \
{ \
used_asm = 1; \
res_c = call_c( pixel_c.name[i], buf1, 32, buf2+j, 16 ); \
res_asm = call_a( pixel_asm.name[i], buf1, 32, buf2+j, 16 ); \
res_c = call_c( pixel_c.name[i], buf1, 32, buf2+j*!align, 16 ); \
res_asm = call_a( pixel_asm.name[i], buf1, 32, buf2+j*!align, 16 ); \
if( res_c != res_asm ) \
{ \
ok = 0; \
......@@ -70,10 +70,10 @@ static int check_pixel( int cpu_ref, int cpu_new )
} \
report( "pixel " #name " :" );
TEST_PIXEL( sad );
TEST_PIXEL( ssd );
TEST_PIXEL( satd );
TEST_PIXEL( sa8d );
TEST_PIXEL( sad, 0 );
TEST_PIXEL( ssd, 1 );
TEST_PIXEL( satd, 0 );
TEST_PIXEL( sa8d, 0 );
#define TEST_PIXEL_X( N ) \
for( i = 0, ok = 1, used_asm = 0; i < 7; i++ ) \
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment