Commit 6b577361 authored by Loren Merritt

mmx implementation of x264_pixel_sa8d



git-svn-id: svn://svn.videolan.org/x264/trunk@507 df754926-b1dd-0310-bc7b-ec298dee348c
parent af751ac3
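
For orientation (not part of the commit): sa8d is the sum of absolute values of an 8x8 Hadamard transform applied to the block of pixel differences, which is what the HADAMARD1x8/TRANSPOSE8x8 sequences below compute. A minimal C sketch follows; the name sa8d_8x8_ref is made up for illustration, and x264's exact normalization/rounding of the final sum is deliberately glossed over (the assembly applies its own scaling).

#include <stdint.h>
#include <stdlib.h>

/* Sketch only: plain sum of |coefficients|; x264's exact scaling differs. */
static int sa8d_8x8_ref( const uint8_t *pix1, int stride1,
                         const uint8_t *pix2, int stride2 )
{
    int d[8][8], sum = 0;
    for( int i = 0; i < 8; i++ )
        for( int j = 0; j < 8; j++ )
            d[i][j] = pix1[i*stride1 + j] - pix2[i*stride2 + j];
    /* two passes of a 1D 8-point Hadamard (3 butterfly stages each),
       with a transpose in between so the second pass covers columns */
    for( int pass = 0; pass < 2; pass++ )
    {
        for( int i = 0; i < 8; i++ )
            for( int step = 1; step < 8; step <<= 1 )
                for( int j = 0; j < 8; j += 2*step )
                    for( int k = j; k < j + step; k++ )
                    {
                        int a = d[i][k], b = d[i][k + step];
                        d[i][k]        = a + b;
                        d[i][k + step] = a - b;
                    }
        for( int i = 0; i < 8; i++ )
            for( int j = 0; j < i; j++ )
            {
                int t = d[i][j];
                d[i][j] = d[j][i];
                d[j][i] = t;
            }
    }
    for( int i = 0; i < 8; i++ )
        for( int j = 0; j < 8; j++ )
            sum += abs( d[i][j] );
    return sum;
}
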
@@ -45,6 +45,8 @@ cglobal x264_pixel_satd_8x8_sse2
cglobal x264_pixel_satd_16x8_sse2
cglobal x264_pixel_satd_8x16_sse2
cglobal x264_pixel_satd_16x16_sse2
cglobal x264_pixel_sa8d_8x8_sse2
cglobal x264_pixel_sa8d_16x16_sse2
%macro SAD_INC_4x16P_SSE2 0
movdqu xmm1, [rdx]
@@ -506,3 +508,110 @@ x264_pixel_satd_8x4_sse2:
SATD_END
%macro LOAD_DIFF_8P 4 ; MMP, MMT, [pix1], [pix2]
movq %1, %3
movq %2, %4
punpcklbw %1, %2 ; interleave pix1 bytes with pix2 bytes
punpcklbw %2, %2 ; interleave pix2 bytes with themselves
psubw %1, %2 ; per word: the duplicated high bytes cancel, leaving pix1-pix2
%endmacro
%macro SBUTTERFLY 5 ; mov type, unpack size, a, b, tmp: a = punpckl(a,b), tmp = punpckh(a,b)
mov%1 %5, %3
punpckl%2 %3, %4
punpckh%2 %5, %4
%endmacro
;-----------------------------------------------------------------------------
; input ABCDEFGH output AFHDTECB
;-----------------------------------------------------------------------------
%macro TRANSPOSE8x8 9
SBUTTERFLY dqa, wd, %1, %2, %9
SBUTTERFLY dqa, wd, %3, %4, %2
SBUTTERFLY dqa, wd, %5, %6, %4
SBUTTERFLY dqa, wd, %7, %8, %6
SBUTTERFLY dqa, dq, %1, %3, %8
SBUTTERFLY dqa, dq, %9, %2, %3
SBUTTERFLY dqa, dq, %5, %7, %2
SBUTTERFLY dqa, dq, %4, %6, %7
SBUTTERFLY dqa, qdq, %1, %5, %6
SBUTTERFLY dqa, qdq, %9, %4, %5
SBUTTERFLY dqa, qdq, %8, %2, %4
SBUTTERFLY dqa, qdq, %3, %7, %2
%endmacro
%macro SUMSUB_BADC 4 ; a, b, c, d -> a=a+b, b=b-a, c=c+d, d=d-c (no temp register)
paddw %1, %2 ; a = a+b
paddw %3, %4 ; c = c+d
paddw %2, %2 ; b = 2b
paddw %4, %4 ; d = 2d
psubw %2, %1 ; b = 2b-(a+b) = b-a
psubw %4, %3 ; d = 2d-(c+d) = d-c
%endmacro
%macro HADAMARD1x8 8 ; three butterfly stages: a 1D 8-point Hadamard across the 8 registers
SUMSUB_BADC %1, %5, %2, %6
SUMSUB_BADC %3, %7, %4, %8
SUMSUB_BADC %1, %3, %2, %4
SUMSUB_BADC %5, %7, %6, %8
SUMSUB_BADC %1, %2, %3, %4
SUMSUB_BADC %5, %6, %7, %8
%endmacro
ALIGN 16
;-----------------------------------------------------------------------------
; int x264_pixel_sa8d_8x8_sse2( uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
x264_pixel_sa8d_8x8_sse2:
lea r10, [3*parm2q]
lea r11, [3*parm4q]
LOAD_DIFF_8P xmm0, xmm8, [parm1q], [parm3q]
LOAD_DIFF_8P xmm1, xmm9, [parm1q+parm2q], [parm3q+parm4q]
LOAD_DIFF_8P xmm2, xmm8, [parm1q+2*parm2q], [parm3q+2*parm4q]
LOAD_DIFF_8P xmm3, xmm9, [parm1q+r10], [parm3q+r11]
lea parm1q, [parm1q+4*parm2q]
lea parm3q, [parm3q+4*parm4q]
LOAD_DIFF_8P xmm4, xmm8, [parm1q], [parm3q]
LOAD_DIFF_8P xmm5, xmm9, [parm1q+parm2q], [parm3q+parm4q]
LOAD_DIFF_8P xmm6, xmm8, [parm1q+2*parm2q], [parm3q+2*parm4q]
LOAD_DIFF_8P xmm7, xmm9, [parm1q+r10], [parm3q+r11]
HADAMARD1x8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
TRANSPOSE8x8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8
HADAMARD1x8 xmm0, xmm5, xmm7, xmm3, xmm8, xmm4, xmm2, xmm1 ; registers in the transposed order (AFHDTECB)
pxor xmm10, xmm10
SUM4x4_TWO_SSE2 xmm0, xmm1, xmm6, xmm2, xmm3, xmm9, xmm10
SUM4x4_TWO_SSE2 xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10
SUM_MM_SSE2 xmm10, xmm0
add r8d, eax ; preserve rounding for 16x16
add eax, 1
shr eax, 1
ret
ALIGN 16
;-----------------------------------------------------------------------------
; int x264_pixel_sa8d_16x16_sse2( uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
;; violates calling convention
x264_pixel_sa8d_16x16_sse2:
xor r8d, r8d
call x264_pixel_sa8d_8x8_sse2 ; pix[0]
lea parm1q, [parm1q+4*parm2q]
lea parm3q, [parm3q+4*parm4q]
call x264_pixel_sa8d_8x8_sse2 ; pix[8*stride]
lea r10, [3*parm2q-2]
lea r11, [3*parm4q-2]
shl r10, 2
shl r11, 2
sub parm1q, r10
sub parm3q, r11
call x264_pixel_sa8d_8x8_sse2 ; pix[8]
lea parm1q, [parm1q+4*parm2q]
lea parm3q, [parm3q+4*parm4q]
call x264_pixel_sa8d_8x8_sse2 ; pix[8*stride+8]
mov eax, r8d
add eax, 1
shr eax, 1
ret
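
The 16x16 version above simply runs the 8x8 kernel on the four sub-blocks and defers the rounded halving to the very end (the "preserve rounding" accumulation in r8d). A hedged C sketch of that structure, reusing sa8d_8x8_ref from the sketch near the top of this page and ignoring x264's exact output scale:

#include <stdint.h>

/* From the sketch near the top of this page; returns an unrounded 8x8 sum,
 * playing the role of the raw value the assembly accumulates in r8d / ecx. */
static int sa8d_8x8_ref( const uint8_t *pix1, int stride1,
                         const uint8_t *pix2, int stride2 );

static int sa8d_16x16_sketch( const uint8_t *pix1, int stride1,
                              const uint8_t *pix2, int stride2 )
{
    int sum = 0;
    for( int y = 0; y < 16; y += 8 )
        for( int x = 0; x < 16; x += 8 )
            sum += sa8d_8x8_ref( pix1 + y*stride1 + x, stride1,
                                 pix2 + y*stride2 + x, stride2 );
    return (sum + 1) >> 1; /* halve with rounding once at the end: "preserve rounding for 16x16" */
}
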
@@ -340,7 +340,7 @@ BITS 32
; satd
%macro HADAMARD4_SUB_BADC 4
%macro SUMSUB_BADC 4
paddw %1, %2
paddw %3, %4
paddw %2, %2
@@ -350,8 +350,8 @@ BITS 32
%endmacro
%macro HADAMARD4x4 4
HADAMARD4_SUB_BADC %1, %2, %3, %4
HADAMARD4_SUB_BADC %1, %3, %2, %4
SUMSUB_BADC %1, %2, %3, %4
SUMSUB_BADC %1, %3, %2, %4
%endmacro
%macro SBUTTERFLYwd 3
@@ -373,6 +373,12 @@ BITS 32
SBUTTERFLYdq %5, %2, %3
%endmacro
%macro MMX_ABS 2 ; mma, tmp
pxor %2, %2
psubw %2, %1 ; tmp = -a
pmaxsw %1, %2 ; a = max(a, -a) = |a|
%endmacro
%macro MMX_ABS_TWO 4 ; mma, mmb, tmp0, tmp1
pxor %3, %3
pxor %4, %4
@@ -393,12 +399,12 @@ BITS 32
pavgw %1, mm6
%endmacro
%macro LOAD_DIFF_4P 3 ; mmp, dx, dy
movd %1, [eax+ebx*%3+%2]
movd mm3, [ecx+edx*%3+%2]
punpcklbw %1, mm3
punpcklbw mm3, mm3
psubw %1, mm3
%macro LOAD_DIFF_4P 4 ; mmp, mmt, dx, dy
movd %1, [eax+ebx*%4+%3]
movd %2, [ecx+edx*%4+%3]
punpcklbw %1, %2
punpcklbw %2, %2
psubw %1, %2
%endmacro
; in: %2 = horizontal offset
@@ -407,21 +413,21 @@ BITS 32
; out: %1 = satd
%macro LOAD_DIFF_HADAMARD_SUM 3
%if %3
LOAD_DIFF_4P mm4, %2, 0
LOAD_DIFF_4P mm5, %2, 1
LOAD_DIFF_4P mm4, mm3, %2, 0
LOAD_DIFF_4P mm5, mm3, %2, 1
lea eax, [eax+2*ebx]
lea ecx, [ecx+2*edx]
LOAD_DIFF_4P mm6, %2, 0
LOAD_DIFF_4P mm7, %2, 1
LOAD_DIFF_4P mm6, mm3, %2, 0
LOAD_DIFF_4P mm7, mm3, %2, 1
lea eax, [eax+2*ebx]
lea ecx, [ecx+2*edx]
%else
LOAD_DIFF_4P mm4, %2, 0
LOAD_DIFF_4P mm6, %2, 2
LOAD_DIFF_4P mm4, mm3, %2, 0
LOAD_DIFF_4P mm6, mm3, %2, 2
add eax, ebx
add ecx, edx
LOAD_DIFF_4P mm5, %2, 0
LOAD_DIFF_4P mm7, %2, 2
LOAD_DIFF_4P mm5, mm3, %2, 0
LOAD_DIFF_4P mm7, mm3, %2, 2
%endif
HADAMARD4x4_SUM %1
%endmacro
@@ -476,6 +482,9 @@ cglobal x264_pixel_satd_16x8_mmxext
cglobal x264_pixel_satd_8x16_mmxext
cglobal x264_pixel_satd_16x16_mmxext
cglobal x264_pixel_sa8d_16x16_mmxext
cglobal x264_pixel_sa8d_8x8_mmxext
%macro SAD_START 0
push ebx
@@ -808,3 +817,162 @@ x264_pixel_satd_16x16_mmxext:
pop ebx
ret
%macro LOAD_DIFF_4x8P 1 ; dx
LOAD_DIFF_4P mm0, mm7, %1, 0
LOAD_DIFF_4P mm1, mm7, %1, 1
lea eax, [eax+2*ebx]
lea ecx, [ecx+2*edx]
LOAD_DIFF_4P mm2, mm7, %1, 0
LOAD_DIFF_4P mm3, mm7, %1, 1
lea eax, [eax+2*ebx]
lea ecx, [ecx+2*edx]
LOAD_DIFF_4P mm4, mm7, %1, 0
LOAD_DIFF_4P mm5, mm7, %1, 1
lea eax, [eax+2*ebx]
lea ecx, [ecx+2*edx]
LOAD_DIFF_4P mm6, mm7, %1, 0
movq [spill], mm6
LOAD_DIFF_4P mm7, mm6, %1, 1
movq mm6, [spill]
%endmacro
%macro HADAMARD1x8 8
SUMSUB_BADC %1, %5, %2, %6
SUMSUB_BADC %3, %7, %4, %8
SUMSUB_BADC %1, %3, %2, %4
SUMSUB_BADC %5, %7, %6, %8
SUMSUB_BADC %1, %2, %3, %4
SUMSUB_BADC %5, %6, %7, %8
%endmacro
%macro SUM4x8_MM 0
movq [spill], mm7
MMX_ABS mm0, mm7
MMX_ABS mm1, mm7
MMX_ABS mm2, mm7
MMX_ABS mm3, mm7
paddw mm0, mm2
paddw mm1, mm3
movq mm7, [spill]
MMX_ABS_TWO mm4, mm5, mm2, mm3
MMX_ABS_TWO mm6, mm7, mm2, mm3
paddw mm4, mm6
paddw mm5, mm7
paddw mm0, mm4
paddw mm1, mm5
paddw mm0, mm1
%endmacro
ALIGN 16
;-----------------------------------------------------------------------------
; int __cdecl x264_pixel_sa8d_8x8_mmxext( uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
x264_pixel_sa8d_8x8_mmxext:
SATD_START
sub esp, 0x68
%define args esp+0x6c
%define spill esp+0x60
LOAD_DIFF_4x8P 0
HADAMARD1x8 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7
movq [spill], mm0
TRANSPOSE4x4 mm4, mm5, mm6, mm7, mm0 ; abcd-t -> adtc
movq [esp+0x00], mm4
movq [esp+0x08], mm7
movq [esp+0x10], mm0
movq [esp+0x18], mm6
movq mm0, [spill]
TRANSPOSE4x4 mm0, mm1, mm2, mm3, mm4
movq [esp+0x20], mm0
movq [esp+0x28], mm3
movq [esp+0x30], mm4
movq [esp+0x38], mm2
mov eax, [args+4]
mov ecx, [args+12]
LOAD_DIFF_4x8P 4
HADAMARD1x8 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7
movq [spill], mm4
TRANSPOSE4x4 mm0, mm1, mm2, mm3, mm4
movq [esp+0x40], mm0
movq [esp+0x48], mm3
movq [esp+0x50], mm4
movq [esp+0x58], mm2
movq mm4, [spill]
TRANSPOSE4x4 mm4, mm5, mm6, mm7, mm0
movq mm5, [esp+0x00]
movq mm1, [esp+0x08]
movq mm2, [esp+0x10]
movq mm3, [esp+0x18]
HADAMARD1x8 mm5, mm1, mm2, mm3, mm4, mm7, mm0, mm6
SUM4x8_MM
movq [esp], mm0
movq mm0, [esp+0x20]
movq mm1, [esp+0x28]
movq mm2, [esp+0x30]
movq mm3, [esp+0x38]
movq mm4, [esp+0x40]
movq mm5, [esp+0x48]
movq mm6, [esp+0x50]
movq mm7, [esp+0x58]
HADAMARD1x8 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7
SUM4x8_MM
pavgw mm0, [esp] ; per-word (a+b+1)>>1 of the two half-block sums
pshufw mm1, mm0, 01001110b ; swap the two 32-bit halves
paddw mm0, mm1
pshufw mm1, mm0, 10110001b ; swap adjacent words
paddw mm0, mm1
movd eax, mm0
and eax, 0xffff ; every word now holds the total; keep one
mov ecx, eax ; preserve rounding for 16x16
add eax, 1
shr eax, 1
add esp, 0x68
pop ebx
ret
%undef args
%undef spill
ALIGN 16
;-----------------------------------------------------------------------------
; int __cdecl x264_pixel_sa8d_16x16_mmxext( uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
;; violates calling convention
x264_pixel_sa8d_16x16_mmxext:
push esi
push edi
push ebp
mov esi, [esp+28] ; stride2
mov edi, [esp+20] ; stride1
push esi
push dword [esp+28] ; pix2
push edi
push dword [esp+28] ; pix1
call x264_pixel_sa8d_8x8_mmxext
mov ebp, ecx
shl edi, 3
shl esi, 3
add [esp+0], edi ; pix1+8*stride1
add [esp+8], esi ; pix2+8*stride2
call x264_pixel_sa8d_8x8_mmxext
add ebp, ecx
add dword [esp+0], 8 ; pix1+8*stride1+8
add dword [esp+8], 8 ; pix2+8*stride2+8
call x264_pixel_sa8d_8x8_mmxext
add ebp, ecx
sub [esp+0], edi ; pix1+8
sub [esp+8], esi ; pix2+8
call x264_pixel_sa8d_8x8_mmxext
lea eax, [ebp+ecx+1]
shr eax, 1
add esp, 16
pop ebp
pop edi
pop esi
ret
@@ -67,6 +67,9 @@ int x264_pixel_satd_8x4_mmxext( uint8_t *, int, uint8_t *, int );
int x264_pixel_satd_4x8_mmxext( uint8_t *, int, uint8_t *, int );
int x264_pixel_satd_4x4_mmxext( uint8_t *, int, uint8_t *, int );
int x264_pixel_sa8d_16x16_mmxext( uint8_t *, int, uint8_t *, int );
int x264_pixel_sa8d_8x8_mmxext( uint8_t *, int, uint8_t *, int );
int x264_pixel_sad_16x16_sse2( uint8_t *, int, uint8_t *, int );
int x264_pixel_sad_16x8_sse2( uint8_t *, int, uint8_t *, int );
@@ -84,4 +87,7 @@ int x264_pixel_satd_8x16_sse2( uint8_t *, int, uint8_t *, int );
int x264_pixel_satd_8x8_sse2( uint8_t *, int, uint8_t *, int );
int x264_pixel_satd_8x4_sse2( uint8_t *, int, uint8_t *, int );
int x264_pixel_sa8d_16x16_sse2( uint8_t *, int, uint8_t *, int );
int x264_pixel_sa8d_8x8_sse2( uint8_t *, int, uint8_t *, int );
#endif
@@ -365,6 +365,11 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
pixf->sad_pde[PIXEL_16x16] = x264_pixel_sad_pde_16x16_mmxext;
pixf->sad_pde[PIXEL_16x8 ] = x264_pixel_sad_pde_16x8_mmxext;
pixf->sad_pde[PIXEL_8x16 ] = x264_pixel_sad_pde_8x16_mmxext;
#ifdef ARCH_X86
pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_mmxext;
pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_mmxext;
#endif
}
#endif
@@ -381,7 +386,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
pixf->satd[PIXEL_8x8] = x264_pixel_satd_8x8_sse2;
pixf->satd[PIXEL_8x4] = x264_pixel_satd_8x4_sse2;
#ifndef ARCH_X86_64
#ifdef ARCH_X86
pixf->sad_x3[PIXEL_16x16] = x264_pixel_sad_x3_16x16_sse2;
pixf->sad_x3[PIXEL_16x8 ] = x264_pixel_sad_x3_16x8_sse2;
@@ -394,6 +399,11 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
{
pixf->ssd[PIXEL_16x16] = x264_pixel_ssd_16x16_sse2;
pixf->ssd[PIXEL_16x8] = x264_pixel_ssd_16x8_sse2;
#ifdef ARCH_X86_64
pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_sse2;
pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_sse2;
#endif
}
#endif
@@ -60,6 +60,7 @@ static int check_pixel( int cpu_ref, int cpu_new )
TEST_PIXEL( sad );
TEST_PIXEL( ssd );
TEST_PIXEL( satd );
TEST_PIXEL( sa8d );
#define TEST_PIXEL_X( N ) \
for( i = 0, ok = 1, used_asm = 0; i < 7; i++ ) \