Commit ff41804e authored by Henrik Gramner, committed by Fiona Glaser

x86: Remove X264_CPU_SSE_MISALIGN functions

Prevents a crash if the misaligned exception mask bit is cleared for some reason.

Misaligned SSE functions are only used on AMD Phenom CPUs and the benefit is minuscule.
They also require modifying the MXCSR control register; by removing those functions
we can get rid of that complexity altogether.

VEX-encoded instructions also support unaligned memory operands. I tried adding AVX
implementations of all the removed functions, but there were no performance improvements on
Ivy Bridge. pixel_sad_x3 and pixel_sad_x4 did see significant code size reductions, though,
so I kept them and added some minor cosmetic fixes and tweaks.
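To illustrate why VEX encoding helps here (a sketch in C intrinsics, not code from this
commit; the function name is made up): legacy psadbw with a memory source requires 16-byte
alignment, so an unaligned reference block needs a separate movdqu/movu first, while the
VEX form vpsadbw accepts an unaligned memory operand directly, which is what shrinks the
AVX sad_x3/sad_x4 code in the diff below.

    #include <emmintrin.h>
    #include <stdint.h>

    /* Illustration only: SAD of one 16-byte row, aligned source block (like fenc)
     * vs. unaligned reference block. Built for plain SSE2 this needs an explicit
     * movdqu before psadbw; built with AVX enabled, a compiler may fold the
     * unaligned load into vpsadbw's memory operand, since VEX-encoded instructions
     * do not require aligned memory operands. */
    static int sad_16x1_sketch( const uint8_t *fenc, const uint8_t *ref )
    {
        __m128i a   = _mm_load_si128( (const __m128i *)fenc ); /* aligned block */
        __m128i b   = _mm_loadu_si128( (const __m128i *)ref ); /* unaligned block */
        __m128i sad = _mm_sad_epu8( a, b );  /* two 64-bit partial sums of |a-b| */
        return _mm_cvtsi128_si32( sad ) + _mm_extract_epi16( sad, 4 );
    }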
parent 01087fdb
@@ -74,7 +74,6 @@ const x264_cpu_name_t x264_cpu_names[] =
#undef MMX2
{"Cache32", X264_CPU_CACHELINE_32},
{"Cache64", X264_CPU_CACHELINE_64},
{"SSEMisalign", X264_CPU_SSE_MISALIGN},
{"LZCNT", X264_CPU_LZCNT},
{"BMI1", X264_CPU_BMI1},
{"BMI2", X264_CPU_BMI1|X264_CPU_BMI2},
@@ -210,12 +209,6 @@ uint32_t x264_cpu_detect( void )
}
}
if( ecx&0x00000080 ) /* Misalign SSE */
{
cpu |= X264_CPU_SSE_MISALIGN;
x264_cpu_mask_misalign_sse();
}
if( cpu & X264_CPU_AVX )
{
if( ecx&0x00000800 ) /* XOP */
......
@@ -45,7 +45,6 @@ void x264_cpu_sfence( void );
#define x264_emms()
#endif
#define x264_sfence x264_cpu_sfence
void x264_cpu_mask_misalign_sse( void );
void x264_safe_intel_cpu_indicator_init( void );
/* kludge:
......
@@ -1119,12 +1119,6 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
pixf->sad_x4[PIXEL_8x16] = x264_pixel_sad_x4_8x16_cache64_sse2;
}
}
if( cpu&X264_CPU_SSE_MISALIGN )
{
INIT2( sad_x3, _sse2_misalign );
INIT2( sad_x4, _sse2_misalign );
}
}
if( cpu&X264_CPU_SSE2_IS_FAST && !(cpu&X264_CPU_CACHELINE_64) )
@@ -1237,6 +1231,8 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
if( cpu&X264_CPU_AVX )
{
INIT2_NAME( sad_aligned, sad, _sse2 ); /* AVX-capable CPUs doesn't benefit from an aligned version */
INIT2( sad_x3, _avx );
INIT2( sad_x4, _avx );
INIT8( satd, _avx );
INIT7( satd_x3, _avx );
INIT7( satd_x4, _avx );
......
@@ -146,17 +146,6 @@ cglobal cpu_sfence
sfence
ret
;-----------------------------------------------------------------------------
; void cpu_mask_misalign_sse( void )
;-----------------------------------------------------------------------------
cglobal cpu_mask_misalign_sse
sub rsp, 4
stmxcsr [rsp]
or dword [rsp], 1<<17
ldmxcsr [rsp]
add rsp, 4
ret
cextern intel_cpu_indicator_init
;-----------------------------------------------------------------------------
......
@@ -1029,59 +1029,48 @@ cglobal pixel_avg2_w20_mmx2, 6,7
jg .height_loop
RET
INIT_XMM
cglobal pixel_avg2_w16_sse2, 6,7
sub r4, r2
lea r6, [r4+r3]
.height_loop:
movdqu xmm0, [r2]
movdqu xmm2, [r2+r3]
movdqu xmm1, [r2+r4]
movdqu xmm3, [r2+r6]
movu m0, [r2]
movu m2, [r2+r3]
movu m1, [r2+r4]
movu m3, [r2+r6]
lea r2, [r2+r3*2]
pavgb xmm0, xmm1
pavgb xmm2, xmm3
movdqa [r0], xmm0
movdqa [r0+r1], xmm2
pavgb m0, m1
pavgb m2, m3
mova [r0], m0
mova [r0+r1], m2
lea r0, [r0+r1*2]
sub r5d, 2
jg .height_loop
sub r5d, 2
jg .height_loop
RET
%macro AVG2_W20 1
cglobal pixel_avg2_w20_%1, 6,7
cglobal pixel_avg2_w20_sse2, 6,7
sub r2, r4
lea r6, [r2+r3]
.height_loop:
movdqu xmm0, [r4]
movdqu xmm2, [r4+r3]
%ifidn %1, sse2_misalign
movd mm4, [r4+16]
movd mm5, [r4+r3+16]
pavgb xmm0, [r4+r2]
pavgb xmm2, [r4+r6]
%else
movdqu xmm1, [r4+r2]
movdqu xmm3, [r4+r6]
movd mm4, [r4+16]
movd mm5, [r4+r3+16]
pavgb xmm0, xmm1
pavgb xmm2, xmm3
%endif
pavgb mm4, [r4+r2+16]
pavgb mm5, [r4+r6+16]
movu m0, [r4]
movu m2, [r4+r3]
movu m1, [r4+r2]
movu m3, [r4+r6]
movd mm4, [r4+16]
movd mm5, [r4+r3+16]
pavgb m0, m1
pavgb m2, m3
pavgb mm4, [r4+r2+16]
pavgb mm5, [r4+r6+16]
lea r4, [r4+r3*2]
movdqa [r0], xmm0
movd [r0+16], mm4
movdqa [r0+r1], xmm2
movd [r0+r1+16], mm5
mova [r0], m0
mova [r0+r1], m2
movd [r0+16], mm4
movd [r0+r1+16], mm5
lea r0, [r0+r1*2]
sub r5d, 2
jg .height_loop
sub r5d, 2
jg .height_loop
RET
%endmacro
AVG2_W20 sse2
AVG2_W20 sse2_misalign
INIT_YMM avx2
cglobal pixel_avg2_w20, 6,7
@@ -1524,7 +1513,7 @@ cglobal prefetch_ref, 3,3
%endmacro
%else ; !HIGH_BIT_DEPTH
%macro UNPACK_UNALIGNED 3
%if mmsize == 8 || cpuflag(misalign)
%if mmsize == 8
punpcklwd %1, %3
%else
movh %2, %3
@@ -2130,8 +2119,6 @@ MC_CHROMA
%else ; !HIGH_BIT_DEPTH
INIT_MMX mmx2
MC_CHROMA
INIT_XMM sse2, misalign
MC_CHROMA
INIT_XMM sse2
MC_CHROMA
INIT_XMM ssse3
......
@@ -482,7 +482,7 @@ cglobal hpel_filter_c, 3,3,9
%define pw_rnd [pw_32]
%endif
; This doesn't seem to be faster (with AVX) on Sandy Bridge or Bulldozer...
%if cpuflag(misalign) || mmsize==32
%if mmsize==32
.loop:
movu m4, [src-4]
movu m5, [src-2]
@@ -630,8 +630,6 @@ INIT_MMX mmx2
HPEL_V 0
INIT_XMM sse2
HPEL_V 8
INIT_XMM sse2, misalign
HPEL_C
%if ARCH_X86_64 == 0
INIT_XMM sse2
HPEL_C
......
@@ -158,7 +158,6 @@ void x264_mc_chroma_##cpu( pixel *dstu, pixel *dstv, intptr_t i_dst, pixel *src,
int dx, int dy, int i_width, int i_height );
MC_CHROMA(mmx2)
MC_CHROMA(sse2)
MC_CHROMA(sse2_misalign)
MC_CHROMA(ssse3)
MC_CHROMA(ssse3_cache64)
MC_CHROMA(avx)
@@ -186,7 +185,6 @@ PIXEL_AVG_WALL(cache32_mmx2)
PIXEL_AVG_WALL(cache64_mmx2)
PIXEL_AVG_WALL(cache64_sse2)
PIXEL_AVG_WALL(sse2)
PIXEL_AVG_WALL(sse2_misalign)
PIXEL_AVG_WALL(cache64_ssse3)
PIXEL_AVG_WALL(avx2)
@@ -227,7 +225,6 @@ PIXEL_AVG_WTAB(cache32_mmx2, mmx2, cache32_mmx2, cache32_mmx2, cache32_mmx2, cac
PIXEL_AVG_WTAB(cache64_mmx2, mmx2, cache64_mmx2, cache64_mmx2, cache64_mmx2, cache64_mmx2)
#endif
PIXEL_AVG_WTAB(sse2, mmx2, mmx2, sse2, sse2, sse2)
PIXEL_AVG_WTAB(sse2_misalign, mmx2, mmx2, sse2, sse2, sse2_misalign)
PIXEL_AVG_WTAB(cache64_sse2, mmx2, cache64_mmx2, cache64_sse2, cache64_sse2, cache64_sse2)
PIXEL_AVG_WTAB(cache64_ssse3, mmx2, cache64_mmx2, cache64_ssse3, cache64_ssse3, cache64_sse2)
PIXEL_AVG_WTAB(cache64_ssse3_atom, mmx2, mmx2, cache64_ssse3, cache64_ssse3, sse2)
@@ -429,7 +426,6 @@ GET_REF(avx2)
GET_REF(cache32_mmx2)
GET_REF(cache64_mmx2)
#endif
GET_REF(sse2_misalign)
GET_REF(cache64_sse2)
GET_REF(cache64_ssse3)
GET_REF(cache64_ssse3_atom)
@@ -477,7 +473,6 @@ HPEL(16, ssse3, ssse3, ssse3, ssse3)
HPEL(16, avx, avx, avx, avx)
HPEL(32, avx2, avx2, avx2, avx2)
#endif
HPEL(16, sse2_misalign, sse2, sse2_misalign, sse2)
#endif // HIGH_BIT_DEPTH
static void x264_plane_copy_mmx2( pixel *dst, intptr_t i_dst, pixel *src, intptr_t i_src, int w, int h )
@@ -696,8 +691,6 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
pf->avg[PIXEL_8x8] = x264_pixel_avg_8x8_sse2;
pf->avg[PIXEL_8x4] = x264_pixel_avg_8x4_sse2;
pf->hpel_filter = x264_hpel_filter_sse2;
if( cpu&X264_CPU_SSE_MISALIGN )
pf->hpel_filter = x264_hpel_filter_sse2_misalign;
pf->frame_init_lowres_core = x264_frame_init_lowres_core_sse2;
if( !(cpu&X264_CPU_STACK_MOD4) )
pf->mc_chroma = x264_mc_chroma_sse2;
@@ -716,12 +709,6 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
pf->mc_luma = mc_luma_cache64_sse2;
pf->get_ref = get_ref_cache64_sse2;
}
if( cpu&X264_CPU_SSE_MISALIGN )
{
pf->get_ref = get_ref_sse2_misalign;
if( !(cpu&X264_CPU_STACK_MOD4) )
pf->mc_chroma = x264_mc_chroma_sse2_misalign;
}
}
}
......
@@ -47,7 +47,6 @@
DECL_X1( sad, mmx2 )
DECL_X1( sad, sse2 )
DECL_X4( sad, sse2_misalign )
DECL_X1( sad, sse3 )
DECL_X1( sad, sse2_aligned )
DECL_X1( sad, ssse3 )
@@ -57,6 +56,7 @@ DECL_X4( sad, mmx2 )
DECL_X4( sad, sse2 )
DECL_X4( sad, sse3 )
DECL_X4( sad, ssse3 )
DECL_X4( sad, avx )
DECL_X4( sad, avx2 )
DECL_X1( ssd, mmx )
DECL_X1( ssd, mmx2 )
......
@@ -1009,62 +1009,56 @@ SAD_X 4, 4, 4
;=============================================================================
%macro SAD_X3_START_1x16P_SSE2 0
%if cpuflag(misalign)
mova xmm2, [r0]
movu xmm0, [r1]
movu xmm1, [r2]
psadbw xmm0, xmm2
psadbw xmm1, xmm2
psadbw xmm2, [r3]
mova m2, [r0]
%if cpuflag(avx)
psadbw m0, m2, [r1]
psadbw m1, m2, [r2]
psadbw m2, [r3]
%else
mova xmm3, [r0]
movu xmm0, [r1]
movu xmm1, [r2]
movu xmm2, [r3]
psadbw xmm0, xmm3
psadbw xmm1, xmm3
psadbw xmm2, xmm3
movu m0, [r1]
movu m1, [r2]
movu m3, [r3]
psadbw m0, m2
psadbw m1, m2
psadbw m2, m3
%endif
%endmacro
%macro SAD_X3_1x16P_SSE2 2
%if cpuflag(misalign)
mova xmm3, [r0+%1]
movu xmm4, [r1+%2]
movu xmm5, [r2+%2]
psadbw xmm4, xmm3
psadbw xmm5, xmm3
psadbw xmm3, [r3+%2]
paddw xmm0, xmm4
paddw xmm1, xmm5
paddw xmm2, xmm3
mova m3, [r0+%1]
%if cpuflag(avx)
psadbw m4, m3, [r1+%2]
psadbw m5, m3, [r2+%2]
psadbw m3, [r3+%2]
%else
mova xmm3, [r0+%1]
movu xmm4, [r1+%2]
movu xmm5, [r2+%2]
movu xmm6, [r3+%2]
psadbw xmm4, xmm3
psadbw xmm5, xmm3
psadbw xmm6, xmm3
paddw xmm0, xmm4
paddw xmm1, xmm5
paddw xmm2, xmm6
movu m4, [r1+%2]
movu m5, [r2+%2]
movu m6, [r3+%2]
psadbw m4, m3
psadbw m5, m3
psadbw m3, m6
%endif
paddw m0, m4
paddw m1, m5
paddw m2, m3
%endmacro
%if ARCH_X86_64
DECLARE_REG_TMP 6
%else
DECLARE_REG_TMP 5
%endif
%macro SAD_X3_4x16P_SSE2 2
%if %1==0
%if UNIX64
mov r6, r5
%endif
lea r5, [r4*3]
lea t0, [r4*3]
SAD_X3_START_1x16P_SSE2
%else
SAD_X3_1x16P_SSE2 FENC_STRIDE*(0+(%1&1)*4), r4*0
%endif
SAD_X3_1x16P_SSE2 FENC_STRIDE*(1+(%1&1)*4), r4*1
SAD_X3_1x16P_SSE2 FENC_STRIDE*(2+(%1&1)*4), r4*2
SAD_X3_1x16P_SSE2 FENC_STRIDE*(3+(%1&1)*4), r5
SAD_X3_1x16P_SSE2 FENC_STRIDE*(3+(%1&1)*4), t0
%if %1 != %2-1
%if (%1&1) != 0
add r0, 8*FENC_STRIDE
@@ -1076,156 +1070,117 @@ SAD_X 4, 4, 4
%endmacro
%macro SAD_X3_START_2x8P_SSE2 0
movq xmm7, [r0]
movq xmm0, [r1]
movq xmm1, [r2]
movq xmm2, [r3]
movhps xmm7, [r0+FENC_STRIDE]
movhps xmm0, [r1+r4]
movhps xmm1, [r2+r4]
movhps xmm2, [r3+r4]
psadbw xmm0, xmm7
psadbw xmm1, xmm7
psadbw xmm2, xmm7
movq m3, [r0]
movq m0, [r1]
movq m1, [r2]
movq m2, [r3]
movhps m3, [r0+FENC_STRIDE]
movhps m0, [r1+r4]
movhps m1, [r2+r4]
movhps m2, [r3+r4]
psadbw m0, m3
psadbw m1, m3
psadbw m2, m3
%endmacro
%macro SAD_X3_2x8P_SSE2 4
movq xmm7, [r0+%1]
movq xmm3, [r1+%2]
movq xmm4, [r2+%2]
movq xmm5, [r3+%2]
movhps xmm7, [r0+%3]
movhps xmm3, [r1+%4]
movhps xmm4, [r2+%4]
movhps xmm5, [r3+%4]
psadbw xmm3, xmm7
psadbw xmm4, xmm7
psadbw xmm5, xmm7
paddw xmm0, xmm3
paddw xmm1, xmm4
paddw xmm2, xmm5
movq m6, [r0+%1]
movq m3, [r1+%2]
movq m4, [r2+%2]
movq m5, [r3+%2]
movhps m6, [r0+%3]
movhps m3, [r1+%4]
movhps m4, [r2+%4]
movhps m5, [r3+%4]
psadbw m3, m6
psadbw m4, m6
psadbw m5, m6
paddw m0, m3
paddw m1, m4
paddw m2, m5
%endmacro
%macro SAD_X4_START_2x8P_SSE2 0
movq xmm7, [r0]
movq xmm0, [r1]
movq xmm1, [r2]
movq xmm2, [r3]
movq xmm3, [r4]
movhps xmm7, [r0+FENC_STRIDE]
movhps xmm0, [r1+r5]
movhps xmm1, [r2+r5]
movhps xmm2, [r3+r5]
movhps xmm3, [r4+r5]
psadbw xmm0, xmm7
psadbw xmm1, xmm7
psadbw xmm2, xmm7
psadbw xmm3, xmm7
movq m4, [r0]
movq m0, [r1]
movq m1, [r2]
movq m2, [r3]
movq m3, [r4]
movhps m4, [r0+FENC_STRIDE]
movhps m0, [r1+r5]
movhps m1, [r2+r5]
movhps m2, [r3+r5]
movhps m3, [r4+r5]
psadbw m0, m4
psadbw m1, m4
psadbw m2, m4
psadbw m3, m4
%endmacro
%macro SAD_X4_2x8P_SSE2 4
movq xmm7, [r0+%1]
movq xmm4, [r1+%2]
movq xmm5, [r2+%2]
%if ARCH_X86_64
movq xmm6, [r3+%2]
movq xmm8, [r4+%2]
movhps xmm7, [r0+%3]
movhps xmm4, [r1+%4]
movhps xmm5, [r2+%4]
movhps xmm6, [r3+%4]
movhps xmm8, [r4+%4]
psadbw xmm4, xmm7
psadbw xmm5, xmm7
psadbw xmm6, xmm7
psadbw xmm8, xmm7
paddw xmm0, xmm4
paddw xmm1, xmm5
paddw xmm2, xmm6
paddw xmm3, xmm8
%else
movhps xmm7, [r0+%3]
movhps xmm4, [r1+%4]
movhps xmm5, [r2+%4]
psadbw xmm4, xmm7
psadbw xmm5, xmm7
paddw xmm0, xmm4
paddw xmm1, xmm5
movq xmm6, [r3+%2]
movq xmm4, [r4+%2]
movhps xmm6, [r3+%4]
movhps xmm4, [r4+%4]
psadbw xmm6, xmm7
psadbw xmm4, xmm7
paddw xmm2, xmm6
paddw xmm3, xmm4
%endif
movq m6, [r0+%1]
movq m4, [r1+%2]
movq m5, [r2+%2]
movhps m6, [r0+%3]
movhps m4, [r1+%4]
movhps m5, [r2+%4]
psadbw m4, m6
psadbw m5, m6
paddw m0, m4
paddw m1, m5
movq m4, [r3+%2]
movq m5, [r4+%2]
movhps m4, [r3+%4]
movhps m5, [r4+%4]
psadbw m4, m6
psadbw m5, m6
paddw m2, m4
paddw m3, m5
%endmacro
%macro SAD_X4_START_1x16P_SSE2 0
%if cpuflag(misalign)
mova xmm3, [r0]
movu xmm0, [r1]
movu xmm1, [r2]
movu xmm2, [r3]
psadbw xmm0, xmm3
psadbw xmm1, xmm3
psadbw xmm2, xmm3
psadbw xmm3, [r4]
mova m3, [r0]
%if cpuflag(avx)
psadbw m0, m3, [r1]
psadbw m1, m3, [r2]
psadbw m2, m3, [r3]
psadbw m3, [r4]
%else
mova xmm7, [r0]
movu xmm0, [r1]
movu xmm1, [r2]
movu xmm2, [r3]
movu xmm3, [r4]
psadbw xmm0, xmm7
psadbw xmm1, xmm7
psadbw xmm2, xmm7
psadbw xmm3, xmm7
movu m0, [r1]
movu m1, [r2]
movu m2, [r3]
movu m4, [r4]
psadbw m0, m3
psadbw m1, m3
psadbw m2, m3
psadbw m3, m4
%endif
%endmacro
%macro SAD_X4_1x16P_SSE2 2
%if cpuflag(misalign)
mova xmm7, [r0+%1]
movu xmm4, [r1+%2]
movu xmm5, [r2+%2]
movu xmm6, [r3+%2]
psadbw xmm4, xmm7
psadbw xmm5, xmm7
psadbw xmm6, xmm7
psadbw xmm7, [r4+%2]
paddw xmm0, xmm4
paddw xmm1, xmm5
paddw xmm2, xmm6
paddw xmm3, xmm7
mova m6, [r0+%1]
%if cpuflag(avx)
psadbw m4, m6, [r1+%2]
psadbw m5, m6, [r2+%2]
%else
mova xmm7, [r0+%1]
movu xmm4, [r1+%2]
movu xmm5, [r2+%2]
movu xmm6, [r3+%2]
%if ARCH_X86_64
movu xmm8, [r4+%2]
psadbw xmm4, xmm7
psadbw xmm5, xmm7
psadbw xmm6, xmm7
psadbw xmm8, xmm7
paddw xmm0, xmm4
paddw xmm1, xmm5
paddw xmm2, xmm6
paddw xmm3, xmm8
%else
psadbw xmm4, xmm7
psadbw xmm5, xmm7
paddw xmm0, xmm4
psadbw xmm6, xmm7
movu xmm4, [r4+%2]
paddw xmm1, xmm5
psadbw xmm4, xmm7
paddw xmm2, xmm6
paddw xmm3, xmm4
movu m4, [r1+%2]
movu m5, [r2+%2]
psadbw m4, m6
psadbw m5, m6
%endif
paddw m0, m4
paddw m1, m5
%if cpuflag(avx)
psadbw m4, m6, [r3+%2]
psadbw m5, m6, [r4+%2]
%else
movu m4, [r3+%2]
movu m5, [r4+%2]
psadbw m4, m6
psadbw m5, m6
%endif
paddw m2, m4
paddw m3, m5
%endmacro
%macro SAD_X4_4x16P_SSE2 2
@@ -1251,15 +1206,12 @@ SAD_X 4, 4, 4
%macro SAD_X3_4x8P_SSE2 2
%if %1==0
%if UNIX64
mov r6, r5
%endif
lea r5, [r4*3]
lea t0, [r4*3]
SAD_X3_START_2x8P_SSE2
%else
SAD_X3_2x8P_SSE2 FENC_STRIDE*(0+(%1&1)*4), r4*0, FENC_STRIDE*(1+(%1&1)*4), r4*1
%endif
SAD_X3_2x8P_SSE2 FENC_STRIDE*(2+(%1&1)*4), r4*2, FENC_STRIDE*(3+(%1&1)*4), r5
SAD_X3_2x8P_SSE2 FENC_STRIDE*(2+(%1&1)*4), r4*2, FENC_STRIDE*(3+(%1&1)*4), t0
%if %1 != %2-1
%if (%1&1) != 0
add r0, 8*FENC_STRIDE
@@ -1290,78 +1242,72 @@ SAD_X 4, 4, 4
%endmacro
%macro SAD_X3_END_SSE2 0
movhlps xmm4, xmm0
movhlps xmm5, xmm1
movhlps xmm6, xmm2
paddw xmm0, xmm4
paddw xmm1, xmm5
paddw xmm2, xmm6
%if UNIX64
movd [r6+0], xmm0
movd [r6+4], xmm1
movd [r6+8], xmm2
%else
mov r0, r5mp
movd [r0+0], xmm0
movd [r0+4], xmm1
movd [r0+8], xmm2
%endif
movhlps m3, m0
movhlps m4, m1
movhlps m5, m2
paddw m0, m3
paddw m1, m4
paddw m2, m5
movifnidn r5, r5mp
movd [r5+0], m0
movd [r5+4], m1
movd [r5+8], m2
RET
%endmacro
%macro SAD_X4_END_SSE2 0
mov r0, r6mp
psllq xmm1, 32
psllq xmm3, 32
paddw xmm0, xmm1
paddw xmm2, xmm3
movhlps xmm1, xmm0
movhlps xmm3, xmm2
paddw xmm0, xmm1
paddw xmm2, xmm3
movq [r0+0], xmm0
movq [r0+8], xmm2
mov r0, r6mp
psllq m1, 32
psllq m3, 32
paddw m0, m1
paddw m2, m3
movhlps m1, m0
movhlps m3, m2
paddw m0, m1
paddw m2, m3
movq [r0+0], m0
movq [r0+8], m2
RET
%endmacro
%macro SAD_X4_START_2x8P_SSSE3 0
movddup xmm4, [r0]
movq xmm0, [r1]
movq xmm1, [r3]
movhps xmm0, [r2]
movhps xmm1, [r4]
movddup xmm5, [r0+FENC_STRIDE]
movq xmm2, [r1+r5]
movq xmm3, [r3+r5]
movhps xmm2, [r2+r5]
movhps xmm3, [r4+r5]
psadbw xmm0, xmm4
psadbw xmm1, xmm4
psadbw xmm2, xmm5
psadbw xmm3, xmm5
paddw xmm0, xmm2
paddw xmm1, xmm3
movddup m4, [r0]
movq m0, [r1]
movq m1, [r3]
movhps m0, [r2]
movhps m1, [r4]
movddup m5, [r0+FENC_STRIDE]
movq m2, [r1+r5]
movq m3, [r3+r5]
movhps m2, [r2+r5]
movhps m3, [r4+r5]
psadbw m0, m4
psadbw m1, m4
psadbw m2, m5