Commit e33aac9a, authored by Henrik Gramner, committed by Fiona Glaser

x86: SSSE3 implementation of pixel_sad_x3 and pixel_sad_x4

parent 4becc3e9
......@@ -1195,9 +1195,8 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
}
else
{
pixf->sad_x4[PIXEL_8x4] = x264_pixel_sad_x4_8x4_ssse3;
pixf->sad_x4[PIXEL_8x8] = x264_pixel_sad_x4_8x8_ssse3;
pixf->sad_x4[PIXEL_8x16] = x264_pixel_sad_x4_8x16_ssse3;
INIT2( sad_x3, _ssse3 );
INIT5( sad_x4, _ssse3 );
}
if( (cpu&X264_CPU_SLOW_ATOM) || (cpu&X264_CPU_SLOW_SHUFFLE) )
{
......
......@@ -1241,21 +1241,34 @@ SAD_X 4, 4, 4
%endmacro
;-----------------------------------------------------------------------------
; SAD_X3_END_SSE2
;
; Epilogue macro: reduces the three vector accumulators m0, m1, m2 and writes
; the results through the pointer taken from r5mp, then returns via RET.
;
; In:    m0, m1, m2 = accumulators; the fallback path adds each register's
;        high 64 bits onto its low 64 bits, so each is presumably holding two
;        partial sums, one per 64-bit half -- TODO confirm against the caller.
;        r5mp       = destination pointer argument
; Out:   non-SSSE3: three dwords at [r5+0], [r5+4], [r5+8]
;        SSSE3:     one full-register store at [r5]
; Clobb: m0, m2 (SSSE3 path); m0-m5 (fallback path); r5
;-----------------------------------------------------------------------------
%macro SAD_X3_END_SSE2 0
    movifnidn r5, r5mp          ; destination pointer, loaded once for both paths
%if cpuflag(ssse3)
    ; Narrow the dwords of each accumulator to words (signed saturation), so
    ; that every 64-bit half of an input becomes one pair of adjacent dwords.
    packssdw  m0, m1            ; low half of m0 <- old m0, high half <- m1
    packssdw  m2, m2            ; both halves of m2 <- old m2
    phaddd    m0, m2            ; add adjacent dword pairs of m0, then of m2
    ; NOTE(review): mova is presumably an aligned full-register store, i.e. it
    ; writes one dword more than the three movd stores of the fallback path
    ; (the extra lane repeats the m2 sum). Confirm the destination is aligned
    ; and has room for four dwords.
    mova    [r5], m0
%else
    ; Fold the high 64 bits of each accumulator onto its low 64 bits.
    movhlps   m3, m0
    movhlps   m4, m1
    movhlps   m5, m2
    paddw     m0, m3
    paddw     m1, m4
    paddw     m2, m5
    ; r5 is still valid from the load at the top of the macro: nothing above
    ; writes it, so it is not reloaded here.
    movd [r5+0], m0
    movd [r5+4], m1
    movd [r5+8], m2
%endif
    RET
%endmacro
%macro SAD_X4_END_SSE2 0
mov r0, r6mp
%if cpuflag(ssse3)
packssdw m0, m1
packssdw m2, m3
phaddd m0, m2
mova [r0], m0
%else
psllq m1, 32
psllq m3, 32
paddw m0, m1
......@@ -1266,6 +1279,7 @@ SAD_X 4, 4, 4
paddw m2, m3
movq [r0+0], m0
movq [r0+8], m2
%endif
RET
%endmacro
......@@ -1504,9 +1518,13 @@ cglobal pixel_sad_x%1_%2x%3, 2+%1,3+%1,8
%endmacro
; SSSE3 instantiations of the multi-reference SAD functions.
; Arguments are (reference count, width, height[, extra]); the meaning of the
; fourth SAD_X_SSE2 argument is not visible here -- see the macro definition.
; Each (count, width, height) combination is listed exactly once: expanding
; the same macro twice with identical arguments would presumably define the
; same function symbol twice.
INIT_XMM ssse3
SAD_X_SSE2  3, 16, 16, 7
SAD_X_SSE2  3, 16,  8, 7
SAD_X_SSE2  4, 16, 16, 7
SAD_X_SSE2  4, 16,  8, 7
SAD_X_SSSE3 4,  8, 16
SAD_X_SSSE3 4,  8,  8
SAD_X_SSSE3 4,  8,  4
INIT_XMM avx
SAD_X_SSE2 3, 16, 16, 6
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment