Commit 8947b51f authored by Loren Merritt

interleave multiple calls to SAD.

15% faster fullpel motion estimation.



git-svn-id: svn://svn.videolan.org/x264/trunk@490 df754926-b1dd-0310-bc7b-ec298dee348c
parent bddf5f03
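
For context on what "interleave multiple calls to SAD" buys: instead of scoring each candidate motion vector with its own SAD call, the new x3/x4 kernels score three or four candidates against the same source block in a single call, so the fenc rows are loaded once and the psadbw work for all candidates is interleaved. A minimal caller-side sketch (the helper function and the value of FENC_STRIDE are illustrative assumptions; only the exported prototypes come from this patch):

    #include <stdint.h>

    #define FENC_STRIDE 16  /* assumed: fixed stride of x264's cached source block */

    int  x264_pixel_sad_16x16_mmxext( uint8_t *, int, uint8_t *, int );
    void x264_pixel_sad_x3_16x16_mmxext( uint8_t *, uint8_t *, uint8_t *, uint8_t *,
                                         int, int * );

    /* Hypothetical helper: score three horizontally adjacent candidates. */
    static void score_three_candidates( uint8_t *fenc, uint8_t *ref, int i_stride,
                                        int costs[3] )
    {
        /* Before this patch: three independent calls, each re-reading fenc. */
        costs[0] = x264_pixel_sad_16x16_mmxext( fenc, FENC_STRIDE, ref + 0, i_stride );
        costs[1] = x264_pixel_sad_16x16_mmxext( fenc, FENC_STRIDE, ref + 1, i_stride );
        costs[2] = x264_pixel_sad_16x16_mmxext( fenc, FENC_STRIDE, ref + 2, i_stride );

        /* With this patch: one interleaved call that shares the fenc loads. */
        x264_pixel_sad_x3_16x16_mmxext( fenc, ref + 0, ref + 1, ref + 2,
                                        i_stride, costs );
    }
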
@@ -29,6 +29,8 @@ BITS 64
%include "amd64inc.asm"
; sad
%macro SAD_INC_2x16P 0
movq mm1, [parm1q]
movq mm2, [parm1q+8]
@@ -72,6 +74,177 @@ BITS 64
lea parm3q, [parm3q+2*parm4q]
%endmacro
; sad x3 / x4
%macro SAD_X3_START_1x8P 1
mov%1 mm3, [parm1q]
mov%1 mm0, [parm2q]
mov%1 mm1, [parm3q]
mov%1 mm2, [parm4q]
psadbw mm0, mm3
psadbw mm1, mm3
psadbw mm2, mm3
%endmacro
%macro SAD_X3_1x8P 3
mov%1 mm3, [parm1q+%2]
mov%1 mm4, [parm2q+%3]
mov%1 mm5, [parm3q+%3]
mov%1 mm6, [parm4q+%3]
psadbw mm4, mm3
psadbw mm5, mm3
psadbw mm6, mm3
paddw mm0, mm4
paddw mm1, mm5
paddw mm2, mm6
%endmacro
%macro SAD_X3_2x16P 1
%if %1
SAD_X3_START_1x8P q
%else
SAD_X3_1x8P q, 0, 0
%endif
SAD_X3_1x8P q, 8, 8
SAD_X3_1x8P q, FENC_STRIDE, parm5q
SAD_X3_1x8P q, FENC_STRIDE+8, parm5q+8
add parm1q, 2*FENC_STRIDE
lea parm2q, [parm2q+2*parm5q]
lea parm3q, [parm3q+2*parm5q]
lea parm4q, [parm4q+2*parm5q]
%endmacro
%macro SAD_X3_2x8P 1
%if %1
SAD_X3_START_1x8P q
%else
SAD_X3_1x8P q, 0, 0
%endif
SAD_X3_1x8P q, FENC_STRIDE, parm5q
add parm1q, 2*FENC_STRIDE
lea parm2q, [parm2q+2*parm5q]
lea parm3q, [parm3q+2*parm5q]
lea parm4q, [parm4q+2*parm5q]
%endmacro
%macro SAD_X3_2x4P 1
%if %1
SAD_X3_START_1x8P d
%else
SAD_X3_1x8P d, 0, 0
%endif
SAD_X3_1x8P d, FENC_STRIDE, parm5q
add parm1q, 2*FENC_STRIDE
lea parm2q, [parm2q+2*parm5q]
lea parm3q, [parm3q+2*parm5q]
lea parm4q, [parm4q+2*parm5q]
%endmacro
%macro SAD_X4_START_1x8P 1
mov%1 mm7, [parm1q]
mov%1 mm0, [parm2q]
mov%1 mm1, [parm3q]
mov%1 mm2, [parm4q]
mov%1 mm3, [parm5q]
psadbw mm0, mm7
psadbw mm1, mm7
psadbw mm2, mm7
psadbw mm3, mm7
%endmacro
%macro SAD_X4_1x8P 2
movq mm7, [parm1q+%1]
movq mm4, [parm2q+%2]
movq mm5, [parm3q+%2]
movq mm6, [parm4q+%2]
psadbw mm4, mm7
psadbw mm5, mm7
psadbw mm6, mm7
psadbw mm7, [parm5q+%2]
paddw mm0, mm4
paddw mm1, mm5
paddw mm2, mm6
paddw mm3, mm7
%endmacro
%macro SAD_X4_1x4P 2
movd mm7, [parm1q+%1]
movd mm4, [parm2q+%2]
movd mm5, [parm3q+%2]
movd mm6, [parm4q+%2]
psadbw mm4, mm7
psadbw mm5, mm7
paddw mm0, mm4
psadbw mm6, mm7
movd mm4, [parm5q+%2]
paddw mm1, mm5
psadbw mm4, mm7
paddw mm2, mm6
paddw mm3, mm4
%endmacro
%macro SAD_X4_2x16P 1
%if %1
SAD_X4_START_1x8P q
%else
SAD_X4_1x8P 0, 0
%endif
SAD_X4_1x8P 8, 8
SAD_X4_1x8P FENC_STRIDE, parm6q
SAD_X4_1x8P FENC_STRIDE+8, parm6q+8
add parm1q, 2*FENC_STRIDE
lea parm2q, [parm2q+2*parm6q]
lea parm3q, [parm3q+2*parm6q]
lea parm4q, [parm4q+2*parm6q]
lea parm5q, [parm5q+2*parm6q]
%endmacro
%macro SAD_X4_2x8P 1
%if %1
SAD_X4_START_1x8P q
%else
SAD_X4_1x8P 0, 0
%endif
SAD_X4_1x8P FENC_STRIDE, parm6q
add parm1q, 2*FENC_STRIDE
lea parm2q, [parm2q+2*parm6q]
lea parm3q, [parm3q+2*parm6q]
lea parm4q, [parm4q+2*parm6q]
lea parm5q, [parm5q+2*parm6q]
%endmacro
%macro SAD_X4_2x4P 1
%if %1
SAD_X4_START_1x8P d
%else
SAD_X4_1x4P 0, 0
%endif
SAD_X4_1x4P FENC_STRIDE, parm6q
add parm1q, 2*FENC_STRIDE
lea parm2q, [parm2q+2*parm6q]
lea parm3q, [parm3q+2*parm6q]
lea parm4q, [parm4q+2*parm6q]
lea parm5q, [parm5q+2*parm6q]
%endmacro
%macro SAD_X3_END 0
movd [parm6q+0], mm0
movd [parm6q+4], mm1
movd [parm6q+8], mm2
ret
%endmacro
%macro SAD_X4_END 0
mov rax, parm7q
movd [rax+0], mm0
movd [rax+4], mm1
movd [rax+8], mm2
movd [rax+12], mm3
ret
%endmacro
; ssd
%macro SSD_INC_1x16P 0
movq mm1, [rax]
movq mm2, [rcx]
@@ -168,6 +341,8 @@ BITS 64
SSD_INC_1x4P
%endmacro
; satd
%macro LOAD_DIFF_4P 4 ; MMP, MMT, [pix1], [pix2]
movd %1, %3
movd %2, %4
@@ -262,6 +437,22 @@ cglobal x264_pixel_sad_8x4_mmxext
cglobal x264_pixel_sad_4x8_mmxext
cglobal x264_pixel_sad_4x4_mmxext
cglobal x264_pixel_sad_x3_16x16_mmxext
cglobal x264_pixel_sad_x3_16x8_mmxext
cglobal x264_pixel_sad_x3_8x16_mmxext
cglobal x264_pixel_sad_x3_8x8_mmxext
cglobal x264_pixel_sad_x3_8x4_mmxext
cglobal x264_pixel_sad_x3_4x8_mmxext
cglobal x264_pixel_sad_x3_4x4_mmxext
cglobal x264_pixel_sad_x4_16x16_mmxext
cglobal x264_pixel_sad_x4_16x8_mmxext
cglobal x264_pixel_sad_x4_8x16_mmxext
cglobal x264_pixel_sad_x4_8x8_mmxext
cglobal x264_pixel_sad_x4_8x4_mmxext
cglobal x264_pixel_sad_x4_4x8_mmxext
cglobal x264_pixel_sad_x4_4x4_mmxext
cglobal x264_pixel_sad_pde_16x16_mmxext
cglobal x264_pixel_sad_pde_16x8_mmxext
cglobal x264_pixel_sad_pde_8x16_mmxext
@@ -380,6 +571,35 @@ x264_pixel_sad_4x4_mmxext:
SAD_END
;-----------------------------------------------------------------------------
; void x264_pixel_sad_x3_16x16_mmxext( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1,
; uint8_t *pix2, int i_stride, int scores[3] )
;-----------------------------------------------------------------------------
%macro SAD_X 3
ALIGN 16
x264_pixel_sad_x%1_%2x%3_mmxext:
SAD_X%1_2x%2P 1
%rep %3/2-1
SAD_X%1_2x%2P 0
%endrep
SAD_X%1_END
%endmacro
SAD_X 3, 16, 16
SAD_X 3, 16, 8
SAD_X 3, 8, 16
SAD_X 3, 8, 8
SAD_X 3, 8, 4
SAD_X 3, 4, 8
SAD_X 3, 4, 4
SAD_X 4, 16, 16
SAD_X 4, 16, 8
SAD_X 4, 8, 16
SAD_X 4, 8, 8
SAD_X 4, 8, 4
SAD_X 4, 4, 8
SAD_X 4, 4, 4
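
For reference, a plain-C model of what the x3 kernels above compute (not part of the patch; FENC_STRIDE is assumed to be the fixed stride of the fenc buffer, matching the constant used by the asm): the source pointer advances by FENC_STRIDE per row while the three reference pointers advance by i_stride.

    #include <stdint.h>
    #include <stdlib.h>

    #define FENC_STRIDE 16  /* assumption: matches the constant used by the asm */

    /* C sketch of x264_pixel_sad_x3_16x16: three SADs of the same source block
     * against three reference blocks, sharing the fenc reads. */
    static void sad_x3_16x16_c( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1,
                                uint8_t *pix2, int i_stride, int scores[3] )
    {
        scores[0] = scores[1] = scores[2] = 0;
        for( int y = 0; y < 16; y++ )
        {
            for( int x = 0; x < 16; x++ )
            {
                scores[0] += abs( fenc[x] - pix0[x] );
                scores[1] += abs( fenc[x] - pix1[x] );
                scores[2] += abs( fenc[x] - pix2[x] );
            }
            fenc += FENC_STRIDE;
            pix0 += i_stride;
            pix1 += i_stride;
            pix2 += i_stride;
        }
    }
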
%macro PDE_CHECK 0
movd eax, mm0
......
@@ -29,6 +29,8 @@ BITS 32
%include "i386inc.asm"
; sad
%macro SAD_INC_2x16P 0
movq mm1, [eax]
movq mm2, [eax+8]
@@ -72,6 +74,199 @@ BITS 32
lea ecx, [ecx+2*edx]
%endmacro
; sad x3 / x4
%macro SAD_X3_START_1x8P 1
push edi
push esi
mov edi, [esp+12] ; first arg is at esp+12: 4-byte return address + the two pushes above
mov eax, [esp+16]
mov ecx, [esp+20]
mov edx, [esp+24]
mov esi, [esp+28]
mov%1 mm3, [edi]
mov%1 mm0, [eax]
mov%1 mm1, [ecx]
mov%1 mm2, [edx]
psadbw mm0, mm3
psadbw mm1, mm3
psadbw mm2, mm3
%endmacro
%macro SAD_X3_1x8P 3
mov%1 mm3, [edi+%2]
mov%1 mm4, [eax+%3]
mov%1 mm5, [ecx+%3]
mov%1 mm6, [edx+%3]
psadbw mm4, mm3
psadbw mm5, mm3
psadbw mm6, mm3
paddw mm0, mm4
paddw mm1, mm5
paddw mm2, mm6
%endmacro
%macro SAD_X3_2x16P 1
%if %1
SAD_X3_START_1x8P q
%else
SAD_X3_1x8P q, 0, 0
%endif
SAD_X3_1x8P q, 8, 8
SAD_X3_1x8P q, FENC_STRIDE, esi
SAD_X3_1x8P q, FENC_STRIDE+8, esi+8
add edi, 2*FENC_STRIDE
lea eax, [eax+2*esi]
lea ecx, [ecx+2*esi]
lea edx, [edx+2*esi]
%endmacro
%macro SAD_X3_2x8P 1
%if %1
SAD_X3_START_1x8P q
%else
SAD_X3_1x8P q, 0, 0
%endif
SAD_X3_1x8P q, FENC_STRIDE, esi
add edi, 2*FENC_STRIDE
lea eax, [eax+2*esi]
lea ecx, [ecx+2*esi]
lea edx, [edx+2*esi]
%endmacro
%macro SAD_X3_2x4P 1
%if %1
SAD_X3_START_1x8P d
%else
SAD_X3_1x8P d, 0, 0
%endif
SAD_X3_1x8P d, FENC_STRIDE, esi
add edi, 2*FENC_STRIDE
lea eax, [eax+2*esi]
lea ecx, [ecx+2*esi]
lea edx, [edx+2*esi]
%endmacro
%macro SAD_X4_START_1x8P 1
push edi
push esi
push ebx
mov edi, [esp+16]
mov eax, [esp+20]
mov ebx, [esp+24]
mov ecx, [esp+28]
mov edx, [esp+32]
mov esi, [esp+36]
mov%1 mm7, [edi]
mov%1 mm0, [eax]
mov%1 mm1, [ebx]
mov%1 mm2, [ecx]
mov%1 mm3, [edx]
psadbw mm0, mm7
psadbw mm1, mm7
psadbw mm2, mm7
psadbw mm3, mm7
%endmacro
%macro SAD_X4_1x8P 2
movq mm7, [edi+%1]
movq mm4, [eax+%2]
movq mm5, [ebx+%2]
movq mm6, [ecx+%2]
psadbw mm4, mm7
psadbw mm5, mm7
psadbw mm6, mm7
psadbw mm7, [edx+%2]
paddw mm0, mm4
paddw mm1, mm5
paddw mm2, mm6
paddw mm3, mm7
%endmacro
%macro SAD_X4_1x4P 2
movd mm7, [edi+%1]
movd mm4, [eax+%2]
movd mm5, [ebx+%2]
movd mm6, [ecx+%2]
psadbw mm4, mm7
psadbw mm5, mm7
paddw mm0, mm4
psadbw mm6, mm7
movd mm4, [edx+%2]
paddw mm1, mm5
psadbw mm4, mm7
paddw mm2, mm6
paddw mm3, mm4
%endmacro
%macro SAD_X4_2x16P 1
%if %1
SAD_X4_START_1x8P q
%else
SAD_X4_1x8P 0, 0
%endif
SAD_X4_1x8P 8, 8
SAD_X4_1x8P FENC_STRIDE, esi
SAD_X4_1x8P FENC_STRIDE+8, esi+8
add edi, 2*FENC_STRIDE
lea eax, [eax+2*esi]
lea ebx, [ebx+2*esi]
lea ecx, [ecx+2*esi]
lea edx, [edx+2*esi]
%endmacro
%macro SAD_X4_2x8P 1
%if %1
SAD_X4_START_1x8P q
%else
SAD_X4_1x8P 0, 0
%endif
SAD_X4_1x8P FENC_STRIDE, esi
add edi, 2*FENC_STRIDE
lea eax, [eax+2*esi]
lea ebx, [ebx+2*esi]
lea ecx, [ecx+2*esi]
lea edx, [edx+2*esi]
%endmacro
%macro SAD_X4_2x4P 1
%if %1
SAD_X4_START_1x8P d
%else
SAD_X4_1x4P 0, 0
%endif
SAD_X4_1x4P FENC_STRIDE, esi
add edi, 2*FENC_STRIDE
lea eax, [eax+2*esi]
lea ebx, [ebx+2*esi]
lea ecx, [ecx+2*esi]
lea edx, [edx+2*esi]
%endmacro
%macro SAD_X3_END 0
mov eax, [esp+32]
movd [eax+0], mm0
movd [eax+4], mm1
movd [eax+8], mm2
pop esi
pop edi
ret
%endmacro
%macro SAD_X4_END 0
mov eax, [esp+40]
movd [eax+0], mm0
movd [eax+4], mm1
movd [eax+8], mm2
movd [eax+12], mm3
pop ebx
pop esi
pop edi
ret
%endmacro
; ssd
%macro SSD_INC_1x16P 0
movq mm1, [eax]
movq mm2, [ecx]
@@ -168,6 +363,8 @@ BITS 32
SSD_INC_1x4P
%endmacro
; satd
%macro LOAD_DIFF_4P 4 ; MMP, MMT, [pix1], [pix2]
movd %1, %3
movd %2, %4
@@ -262,6 +459,22 @@ cglobal x264_pixel_sad_8x4_mmxext
cglobal x264_pixel_sad_4x8_mmxext
cglobal x264_pixel_sad_4x4_mmxext
cglobal x264_pixel_sad_x3_16x16_mmxext
cglobal x264_pixel_sad_x3_16x8_mmxext
cglobal x264_pixel_sad_x3_8x16_mmxext
cglobal x264_pixel_sad_x3_8x8_mmxext
cglobal x264_pixel_sad_x3_8x4_mmxext
cglobal x264_pixel_sad_x3_4x8_mmxext
cglobal x264_pixel_sad_x3_4x4_mmxext
cglobal x264_pixel_sad_x4_16x16_mmxext
cglobal x264_pixel_sad_x4_16x8_mmxext
cglobal x264_pixel_sad_x4_8x16_mmxext
cglobal x264_pixel_sad_x4_8x8_mmxext
cglobal x264_pixel_sad_x4_8x4_mmxext
cglobal x264_pixel_sad_x4_4x8_mmxext
cglobal x264_pixel_sad_x4_4x4_mmxext
cglobal x264_pixel_sad_pde_16x16_mmxext
cglobal x264_pixel_sad_pde_16x8_mmxext
cglobal x264_pixel_sad_pde_8x16_mmxext
@@ -388,6 +601,36 @@ x264_pixel_sad_4x4_mmxext:
SAD_END
;-----------------------------------------------------------------------------
; void x264_pixel_sad_x3_16x16_mmxext( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1,
; uint8_t *pix2, int i_stride, int scores[3] )
;-----------------------------------------------------------------------------
%macro SAD_X 3
ALIGN 16
x264_pixel_sad_x%1_%2x%3_mmxext:
SAD_X%1_2x%2P 1
%rep %3/2-1
SAD_X%1_2x%2P 0
%endrep
SAD_X%1_END
%endmacro
SAD_X 3, 16, 16
SAD_X 3, 16, 8
SAD_X 3, 8, 16
SAD_X 3, 8, 8
SAD_X 3, 8, 4
SAD_X 3, 4, 8
SAD_X 3, 4, 4
SAD_X 4, 16, 16
SAD_X 4, 16, 8
SAD_X 4, 8, 16
SAD_X 4, 8, 8
SAD_X 4, 8, 4
SAD_X 4, 4, 8
SAD_X 4, 4, 4
%macro PDE_CHECK 0
movd ebx, mm0
cmp ebx, [esp+24] ; prev_score
......
@@ -38,6 +38,10 @@ SECTION .text
cglobal x264_pixel_sad_16x16_sse2
cglobal x264_pixel_sad_16x8_sse2
cglobal x264_pixel_sad_x3_16x16_sse2
cglobal x264_pixel_sad_x3_16x8_sse2
cglobal x264_pixel_sad_x4_16x16_sse2
cglobal x264_pixel_sad_x4_16x8_sse2
cglobal x264_pixel_ssd_16x16_sse2
cglobal x264_pixel_ssd_16x8_sse2
cglobal x264_pixel_satd_8x4_sse2
@@ -164,6 +168,158 @@ x264_pixel_sad_16x8_sse2:
SAD_INC_4x16P_SSE2
SAD_END_SSE2
%macro SAD_X3_START_1x16P 0
push edi
push esi
mov edi, [esp+12]
mov eax, [esp+16]
mov ecx, [esp+20]
mov edx, [esp+24]
mov esi, [esp+28]
movdqa xmm3, [edi]
movdqu xmm0, [eax]
movdqu xmm1, [ecx]
movdqu xmm2, [edx]
psadbw xmm0, xmm3
psadbw xmm1, xmm3
psadbw xmm2, xmm3
%endmacro
%macro SAD_X3_1x16P 2
movdqa xmm3, [edi+%1]
movdqu xmm4, [eax+%2]
movdqu xmm5, [ecx+%2]
movdqu xmm6, [edx+%2]
psadbw xmm4, xmm3
psadbw xmm5, xmm3
psadbw xmm6, xmm3
paddw xmm0, xmm4
paddw xmm1, xmm5
paddw xmm2, xmm6
%endmacro
%macro SAD_X3_2x16P 1
%if %1
SAD_X3_START_1x16P
%else
SAD_X3_1x16P 0, 0
%endif
SAD_X3_1x16P FENC_STRIDE, esi
add edi, 2*FENC_STRIDE
lea eax, [eax+2*esi]
lea ecx, [ecx+2*esi]
lea edx, [edx+2*esi]
%endmacro
%macro SAD_X4_START_1x16P 0
push edi
push esi
push ebx
mov edi, [esp+16]
mov eax, [esp+20]
mov ebx, [esp+24]
mov ecx, [esp+28]
mov edx, [esp+32]
mov esi, [esp+36]
movdqa xmm7, [edi]
movdqu xmm0, [eax]
movdqu xmm1, [ebx]
movdqu xmm2, [ecx]
movdqu xmm3, [edx]
psadbw xmm0, xmm7
psadbw xmm1, xmm7
psadbw xmm2, xmm7
psadbw xmm3, xmm7
%endmacro
%macro SAD_X4_1x16P 2
movdqa xmm7, [edi+%1]
movdqu xmm4, [eax+%2]
movdqu xmm5, [ebx+%2]
movdqu xmm6, [ecx+%2]
psadbw xmm4, xmm7
psadbw xmm5, xmm7
paddw xmm0, xmm4
psadbw xmm6, xmm7
movdqu xmm4, [edx+%2]
paddw xmm1, xmm5
psadbw xmm4, xmm7
paddw xmm2, xmm6
paddw xmm3, xmm4
%endmacro
%macro SAD_X4_2x16P 1
%if %1
SAD_X4_START_1x16P
%else
SAD_X4_1x16P 0, 0
%endif
SAD_X4_1x16P FENC_STRIDE, esi
add edi, 2*FENC_STRIDE
lea eax, [eax+2*esi]
lea ebx, [ebx+2*esi]
lea ecx, [ecx+2*esi]
lea edx, [edx+2*esi]
%endmacro
%macro SAD_X3_END 0
mov eax, [esp+32]
; psadbw on 16-byte rows leaves two partial sums per xmm (dwords 0 and 2);
; bring dword 2 down and fold it in before storing each score
pshufd xmm4, xmm0, 2
pshufd xmm5, xmm1, 2
pshufd xmm6, xmm2, 2
paddw xmm0, xmm4
paddw xmm1, xmm5
paddw xmm2, xmm6
movd [eax+0], xmm0
movd [eax+4], xmm1
movd [eax+8], xmm2
pop esi
pop edi
ret
%endmacro
%macro SAD_X4_END 0
mov eax, [esp+40]
pshufd xmm4, xmm0, 2
pshufd xmm5, xmm1, 2
pshufd xmm6, xmm2, 2
pshufd xmm7, xmm3, 2
paddw xmm0, xmm4
paddw xmm1, xmm5
paddw xmm2, xmm6
paddw xmm3, xmm7
movd [eax+0], xmm0
movd [eax+4], xmm1
movd [eax+8], xmm2
movd [eax+12], xmm3
pop ebx
pop esi
pop edi
ret
%endmacro
ALIGN 16
;-----------------------------------------------------------------------------
; void x264_pixel_sad_x3_16x16_sse2( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1,
; uint8_t *pix2, int i_stride, int scores[3] )
;-----------------------------------------------------------------------------
%macro SAD_X 3
ALIGN 16
x264_pixel_sad_x%1_%2x%3_sse2:
SAD_X%1_2x%2P 1
%rep %3/2-1
SAD_X%1_2x%2P 0
%endrep
SAD_X%1_END
%endmacro
SAD_X 3, 16, 16
SAD_X 3, 16, 8
SAD_X 4, 16, 16
SAD_X 4, 16, 8
%macro SSD_INC_2x16P_SSE2 0
movdqu xmm1, [eax]
movdqu xmm2, [ecx]
......
@@ -32,6 +32,21 @@ int x264_pixel_sad_8x4_mmxext( uint8_t *, int, uint8_t *, int );
int x264_pixel_sad_4x8_mmxext( uint8_t *, int, uint8_t *, int );
int x264_pixel_sad_4x4_mmxext( uint8_t *, int, uint8_t *, int );
void x264_pixel_sad_x3_16x16_mmxext( uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int * );
void x264_pixel_sad_x3_16x8_mmxext( uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int * );
void x264_pixel_sad_x3_8x16_mmxext( uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int * );
void x264_pixel_sad_x3_8x8_mmxext( uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int * );
void x264_pixel_sad_x3_8x4_mmxext( uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int * );
void x264_pixel_sad_x3_4x8_mmxext( uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int * );
void x264_pixel_sad_x3_4x4_mmxext( uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int * );
void x264_pixel_sad_x4_16x16_mmxext( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int * );
void x264_pixel_sad_x4_16x8_mmxext( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int * );
void x264_pixel_sad_x4_8x16_mmxext( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int * );
void x264_pixel_sad_x4_8x8_mmxext( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int * );
void x264_pixel_sad_x4_8x4_mmxext( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int * );
void x264_pixel_sad_x4_4x8_mmxext( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int * );
void x264_pixel_sad_x4_4x4_mmxext( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int * );
int x264_pixel_sad_pde_16x16_mmxext( uint8_t *, int, uint8_t *, int, int );
int x264_pixel_sad_pde_16x8_mmxext( uint8_t *, int, uint8_t *, int, int );
int x264_pixel_sad_pde_8x16_mmxext( uint8_t *, int, uint8_t *, int, int );
@@ -55,6 +70,11 @@ int x264_pixel_satd_4x4_mmxext( uint8_t *, int, uint8_t *, int );
int x264_pixel_sad_16x16_sse2( uint8_t *, int, uint8_t *, int );
int x264_pixel_sad_16x8_sse2( uint8_t *, int, uint8_t *, int );
void x264_pixel_sad_x3_16x16_sse2( uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int * );
void x264_pixel_sad_x3_16x8_sse2( uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int * );
void x264_pixel_sad_x4_16x16_sse2( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int * );
void x264_pixel_sad_x4_16x8_sse2( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int * );
int x264_pixel_ssd_16x16_sse2( uint8_t *, int, uint8_t *, int );
int x264_pixel_ssd_16x8_sse2( uint8_t *, int, uint8_t *, int );
......
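
Given the prototypes added to pixel.h above, a caller can check several candidates per call and keep the cheapest. A hedged sketch (the diamond offsets and the min-selection loop are illustrative, not taken from this patch):

    #include <stdint.h>

    void x264_pixel_sad_x4_16x16_mmxext( uint8_t *, uint8_t *, uint8_t *, uint8_t *,
                                         uint8_t *, int, int * );

    /* Hypothetical: score the four diamond neighbours of the current position and
     * return the index of the cheapest one, or -1 if none beats best_cost. */
    static int check_diamond( uint8_t *fenc, uint8_t *ref, int i_stride,
                              int best_cost, int *out_cost )
    {
        int costs[4];
        int best = -1;
        x264_pixel_sad_x4_16x16_mmxext( fenc,
                                        ref - i_stride,   /* up    */
                                        ref + i_stride,   /* down  */
                                        ref - 1,          /* left  */
                                        ref + 1,          /* right */
                                        i_stride, costs );
        for( int i = 0; i < 4; i++ )
            if( costs[i] < best_cost )
            {
                best_cost = costs[i];
                best = i;
            }
        *out_cost = best_cost;
        return best;
    }
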