Commit a99183d3 authored by Fiona Glaser

Nehalem optimization part 2: SSE2 width-8 SAD

Helps a bit on Phenom as well
~25% faster width8 multiSAD on Nehalem
parent 4975e818
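
For reference (not part of this diff), the new SSE2 routines compute the standard sum of absolute differences over 8-pixel-wide blocks. A plain-C equivalent, with hypothetical naming rather than x264's actual C fallback, looks like this; the assembly below gains its speed by letting one psadbw instruction cover two such rows at a time.

/* Illustrative sketch of what a width-8 SAD computes; the name and
 * signature are hypothetical, not taken from x264. */
#include <stdint.h>
#include <stdlib.h>

static int sad_8xN_ref( const uint8_t *pix1, int stride1,
                        const uint8_t *pix2, int stride2, int height )
{
    int sum = 0;
    for( int y = 0; y < height; y++ )
    {
        for( int x = 0; x < 8; x++ )
            sum += abs( pix1[x] - pix2[x] );
        pix1 += stride1;
        pix2 += stride2;
    }
    return sum;
}
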
@@ -707,6 +707,18 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
#endif
}
if( cpu&X264_CPU_SSE2_IS_FAST && !(cpu&X264_CPU_CACHELINE_64) )
{
pixf->sad_aligned[PIXEL_8x16] = x264_pixel_sad_8x16_sse2;
pixf->sad[PIXEL_8x16] = x264_pixel_sad_8x16_sse2;
pixf->sad_x3[PIXEL_8x16] = x264_pixel_sad_x3_8x16_sse2;
pixf->sad_x3[PIXEL_8x8] = x264_pixel_sad_x3_8x8_sse2;
pixf->sad_x3[PIXEL_8x4] = x264_pixel_sad_x3_8x4_sse2;
pixf->sad_x4[PIXEL_8x16] = x264_pixel_sad_x4_8x16_sse2;
pixf->sad_x4[PIXEL_8x8] = x264_pixel_sad_x4_8x8_sse2;
pixf->sad_x4[PIXEL_8x4] = x264_pixel_sad_x4_8x4_sse2;
}
if( (cpu&X264_CPU_SSE3) && (cpu&X264_CPU_CACHELINE_64) )
{
INIT2( sad, _sse3 );
@@ -102,9 +102,9 @@ SAD 4, 4
;=============================================================================
%macro SAD_END_SSE2 0
movhlps xmm1, xmm0
paddw xmm0, xmm1
movd eax, xmm0
movhlps m1, m0
paddw m0, m1
movd eax, m0
RET
%endmacro
@@ -113,115 +113,146 @@ SAD 4, 4
; int x264_pixel_sad_16x16_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
cglobal x264_pixel_sad_16x16_%1, 4,4
movdqu xmm0, [r2]
movdqu xmm1, [r2+r3]
lea r2, [r2+2*r3]
movdqu xmm2, [r2]
movdqu xmm3, [r2+r3]
lea r2, [r2+2*r3]
psadbw xmm0, [r0]
psadbw xmm1, [r0+r1]
lea r0, [r0+2*r1]
movdqu xmm4, [r2]
paddw xmm0, xmm1
psadbw xmm2, [r0]
psadbw xmm3, [r0+r1]
lea r0, [r0+2*r1]
movdqu xmm5, [r2+r3]
lea r2, [r2+2*r3]
paddw xmm2, xmm3
movdqu xmm6, [r2]
movdqu xmm7, [r2+r3]
lea r2, [r2+2*r3]
paddw xmm0, xmm2
psadbw xmm4, [r0]
psadbw xmm5, [r0+r1]
lea r0, [r0+2*r1]
movdqu xmm1, [r2]
paddw xmm4, xmm5
psadbw xmm6, [r0]
psadbw xmm7, [r0+r1]
lea r0, [r0+2*r1]
movdqu xmm2, [r2+r3]
lea r2, [r2+2*r3]
paddw xmm6, xmm7
movdqu xmm3, [r2]
paddw xmm0, xmm4
movdqu xmm4, [r2+r3]
lea r2, [r2+2*r3]
paddw xmm0, xmm6
psadbw xmm1, [r0]
psadbw xmm2, [r0+r1]
lea r0, [r0+2*r1]
movdqu xmm5, [r2]
paddw xmm1, xmm2
psadbw xmm3, [r0]
psadbw xmm4, [r0+r1]
lea r0, [r0+2*r1]
movdqu xmm6, [r2+r3]
lea r2, [r2+2*r3]
paddw xmm3, xmm4
movdqu xmm7, [r2]
paddw xmm0, xmm1
movdqu xmm1, [r2+r3]
paddw xmm0, xmm3
psadbw xmm5, [r0]
psadbw xmm6, [r0+r1]
lea r0, [r0+2*r1]
paddw xmm5, xmm6
psadbw xmm7, [r0]
psadbw xmm1, [r0+r1]
paddw xmm7, xmm1
paddw xmm0, xmm5
paddw xmm0, xmm7
movdqu m0, [r2]
movdqu m1, [r2+r3]
lea r2, [r2+2*r3]
movdqu m2, [r2]
movdqu m3, [r2+r3]
lea r2, [r2+2*r3]
psadbw m0, [r0]
psadbw m1, [r0+r1]
lea r0, [r0+2*r1]
movdqu m4, [r2]
paddw m0, m1
psadbw m2, [r0]
psadbw m3, [r0+r1]
lea r0, [r0+2*r1]
movdqu m5, [r2+r3]
lea r2, [r2+2*r3]
paddw m2, m3
movdqu m6, [r2]
movdqu m7, [r2+r3]
lea r2, [r2+2*r3]
paddw m0, m2
psadbw m4, [r0]
psadbw m5, [r0+r1]
lea r0, [r0+2*r1]
movdqu m1, [r2]
paddw m4, m5
psadbw m6, [r0]
psadbw m7, [r0+r1]
lea r0, [r0+2*r1]
movdqu m2, [r2+r3]
lea r2, [r2+2*r3]
paddw m6, m7
movdqu m3, [r2]
paddw m0, m4
movdqu m4, [r2+r3]
lea r2, [r2+2*r3]
paddw m0, m6
psadbw m1, [r0]
psadbw m2, [r0+r1]
lea r0, [r0+2*r1]
movdqu m5, [r2]
paddw m1, m2
psadbw m3, [r0]
psadbw m4, [r0+r1]
lea r0, [r0+2*r1]
movdqu m6, [r2+r3]
lea r2, [r2+2*r3]
paddw m3, m4
movdqu m7, [r2]
paddw m0, m1
movdqu m1, [r2+r3]
paddw m0, m3
psadbw m5, [r0]
psadbw m6, [r0+r1]
lea r0, [r0+2*r1]
paddw m5, m6
psadbw m7, [r0]
psadbw m1, [r0+r1]
paddw m7, m1
paddw m0, m5
paddw m0, m7
SAD_END_SSE2
;-----------------------------------------------------------------------------
; int x264_pixel_sad_16x8_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
cglobal x264_pixel_sad_16x8_%1, 4,4
movdqu xmm0, [r2]
movdqu xmm2, [r2+r3]
lea r2, [r2+2*r3]
movdqu xmm3, [r2]
movdqu xmm4, [r2+r3]
psadbw xmm0, [r0]
psadbw xmm2, [r0+r1]
lea r0, [r0+2*r1]
psadbw xmm3, [r0]
psadbw xmm4, [r0+r1]
lea r0, [r0+2*r1]
lea r2, [r2+2*r3]
paddw xmm0, xmm2
paddw xmm3, xmm4
paddw xmm0, xmm3
movdqu xmm1, [r2]
movdqu xmm2, [r2+r3]
lea r2, [r2+2*r3]
movdqu xmm3, [r2]
movdqu xmm4, [r2+r3]
psadbw xmm1, [r0]
psadbw xmm2, [r0+r1]
lea r0, [r0+2*r1]
psadbw xmm3, [r0]
psadbw xmm4, [r0+r1]
lea r0, [r0+2*r1]
lea r2, [r2+2*r3]
paddw xmm1, xmm2
paddw xmm3, xmm4
paddw xmm0, xmm1
paddw xmm0, xmm3
movdqu m0, [r2]
movdqu m2, [r2+r3]
lea r2, [r2+2*r3]
movdqu m3, [r2]
movdqu m4, [r2+r3]
psadbw m0, [r0]
psadbw m2, [r0+r1]
lea r0, [r0+2*r1]
psadbw m3, [r0]
psadbw m4, [r0+r1]
lea r0, [r0+2*r1]
lea r2, [r2+2*r3]
paddw m0, m2
paddw m3, m4
paddw m0, m3
movdqu m1, [r2]
movdqu m2, [r2+r3]
lea r2, [r2+2*r3]
movdqu m3, [r2]
movdqu m4, [r2+r3]
psadbw m1, [r0]
psadbw m2, [r0+r1]
lea r0, [r0+2*r1]
psadbw m3, [r0]
psadbw m4, [r0+r1]
lea r0, [r0+2*r1]
lea r2, [r2+2*r3]
paddw m1, m2
paddw m3, m4
paddw m0, m1
paddw m0, m3
SAD_END_SSE2
%endmacro
INIT_XMM
SAD_W16 sse2
%define movdqu lddqu
SAD_W16 sse3
%define movdqu movdqa
SAD_W16 sse2_aligned
%define movdqu movups
%undef movdqu
%macro SAD_INC_4x8P_SSE 1
movq m1, [r0]
movq m2, [r0+r1]
lea r0, [r0+2*r1]
movq m3, [r2]
movq m4, [r2+r3]
lea r2, [r2+2*r3]
movhps m1, [r0]
movhps m2, [r0+r1]
movhps m3, [r2]
movhps m4, [r2+r3]
lea r0, [r0+2*r1]
psadbw m1, m3
psadbw m2, m4
lea r2, [r2+2*r3]
%if %1
paddw m0, m1
%else
SWAP m0, m1
%endif
paddw m0, m2
%endmacro
;Even on Nehalem, no sizes other than 8x16 benefit from this method.
cglobal x264_pixel_sad_8x16_sse2, 4,4
SAD_INC_4x8P_SSE 0
SAD_INC_4x8P_SSE 1
SAD_INC_4x8P_SSE 1
SAD_INC_4x8P_SSE 1
SAD_END_SSE2
RET
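
The SAD_INC_4x8P_SSE macro above is the core of the new width-8 path: movq/movhps pack two 8-byte rows into a single XMM register, so one psadbw covers both rows. An equivalent intrinsics sketch (hypothetical function name, not part of x264) of the 8x16 case, ending with the same horizontal fold that SAD_END_SSE2 performs:

/* Illustrative C/SSE2-intrinsics sketch of the row-packing trick used by
 * x264_pixel_sad_8x16_sse2 above; name and signature are hypothetical. */
#include <emmintrin.h>
#include <stdint.h>

static int sad_8x16_sse2_sketch( const uint8_t *pix1, intptr_t stride1,
                                 const uint8_t *pix2, intptr_t stride2 )
{
    __m128i sum = _mm_setzero_si128();
    for( int y = 0; y < 16; y += 2 )
    {
        /* like movq + movhps: pack rows y and y+1 into one 16-byte register */
        __m128i a = _mm_unpacklo_epi64(
            _mm_loadl_epi64( (const __m128i*)(pix1 + (y+0)*stride1) ),
            _mm_loadl_epi64( (const __m128i*)(pix1 + (y+1)*stride1) ) );
        __m128i b = _mm_unpacklo_epi64(
            _mm_loadl_epi64( (const __m128i*)(pix2 + (y+0)*stride2) ),
            _mm_loadl_epi64( (const __m128i*)(pix2 + (y+1)*stride2) ) );
        /* one psadbw now handles two 8-pixel rows at once */
        sum = _mm_add_epi16( sum, _mm_sad_epu8( a, b ) );
    }
    /* as in SAD_END_SSE2: fold the high 64-bit partial sum into the low one */
    sum = _mm_add_epi16( sum, _mm_srli_si128( sum, 8 ) );
    return _mm_cvtsi128_si32( sum );
}
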
;-----------------------------------------------------------------------------
; void intra_sad_x3_16x16 ( uint8_t *fenc, uint8_t *fdec, int res[3] );
@@ -598,6 +629,93 @@ SAD_X 4, 4, 4
lea r3, [r3+2*r4]
%endmacro
%macro SAD_X3_START_2x8P_SSE2 0
movq xmm7, [r0]
movq xmm0, [r1]
movq xmm1, [r2]
movq xmm2, [r3]
movhps xmm7, [r0+FENC_STRIDE]
movhps xmm0, [r1+r4]
movhps xmm1, [r2+r4]
movhps xmm2, [r3+r4]
psadbw xmm0, xmm7
psadbw xmm1, xmm7
psadbw xmm2, xmm7
%endmacro
%macro SAD_X3_2x8P_SSE2 0
movq xmm7, [r0]
movq xmm3, [r1]
movq xmm4, [r2]
movq xmm5, [r3]
movhps xmm7, [r0+FENC_STRIDE]
movhps xmm3, [r1+r4]
movhps xmm4, [r2+r4]
movhps xmm5, [r3+r4]
psadbw xmm3, xmm7
psadbw xmm4, xmm7
psadbw xmm5, xmm7
paddw xmm0, xmm3
paddw xmm1, xmm4
paddw xmm2, xmm5
%endmacro
%macro SAD_X4_START_2x8P_SSE2 0
movq xmm7, [r0]
movq xmm0, [r1]
movq xmm1, [r2]
movq xmm2, [r3]
movq xmm3, [r4]
movhps xmm7, [r0+FENC_STRIDE]
movhps xmm0, [r1+r5]
movhps xmm1, [r2+r5]
movhps xmm2, [r3+r5]
movhps xmm3, [r4+r5]
psadbw xmm0, xmm7
psadbw xmm1, xmm7
psadbw xmm2, xmm7
psadbw xmm3, xmm7
%endmacro
%macro SAD_X4_2x8P_SSE2 0
movq xmm7, [r0]
movq xmm4, [r1]
movq xmm5, [r2]
%ifdef ARCH_X86_64
movq xmm6, [r3]
movq xmm8, [r4]
movhps xmm7, [r0+FENC_STRIDE]
movhps xmm4, [r1+r5]
movhps xmm5, [r2+r5]
movhps xmm6, [r3+r5]
movhps xmm8, [r4+r5]
psadbw xmm4, xmm7
psadbw xmm5, xmm7
psadbw xmm6, xmm7
psadbw xmm8, xmm7
paddw xmm0, xmm4
paddw xmm1, xmm5
paddw xmm2, xmm6
paddw xmm3, xmm8
%else
movhps xmm7, [r0+FENC_STRIDE]
movhps xmm4, [r1+r5]
movhps xmm5, [r2+r5]
psadbw xmm4, xmm7
psadbw xmm5, xmm7
paddw xmm0, xmm4
paddw xmm1, xmm5
movq xmm6, [r3]
movq xmm4, [r4]
movhps xmm6, [r3+r5]
movhps xmm4, [r4+r5]
psadbw xmm6, xmm7
psadbw xmm4, xmm7
paddw xmm2, xmm6
paddw xmm3, xmm4
%endif
%endmacro
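
These SAD_X3/SAD_X4 macros apply the same row packing to the multi-SAD case the commit message cites (~25% faster width-8 multiSAD): the packed fenc rows are held in xmm7 and reused against every candidate reference. A rough intrinsics sketch of the x3 case follows (hypothetical names; FENC_STRIDE, 16 in x264, is the fixed stride of the encode buffer). The x4 variant is identical apart from a fourth reference pointer and accumulator.

/* Illustrative sketch (hypothetical helper, not the x264 entry point) of the
 * width-8 multi-SAD pattern: fenc rows are packed once per row pair and
 * compared against all three references. */
#include <emmintrin.h>
#include <stdint.h>

#define FENC_STRIDE 16  /* fixed encode-buffer stride in x264 */

static __m128i pack2x8( const uint8_t *p, intptr_t stride )
{
    return _mm_unpacklo_epi64(
        _mm_loadl_epi64( (const __m128i*)p ),
        _mm_loadl_epi64( (const __m128i*)(p + stride) ) );
}

static void sad_x3_8x8_sketch( const uint8_t *fenc, const uint8_t *p0,
                               const uint8_t *p1, const uint8_t *p2,
                               intptr_t stride, int scores[3] )
{
    __m128i s0 = _mm_setzero_si128(), s1 = s0, s2 = s0;
    for( int y = 0; y < 8; y += 2 )
    {
        __m128i e = pack2x8( fenc + y*FENC_STRIDE, FENC_STRIDE );
        s0 = _mm_add_epi16( s0, _mm_sad_epu8( pack2x8( p0 + y*stride, stride ), e ) );
        s1 = _mm_add_epi16( s1, _mm_sad_epu8( pack2x8( p1 + y*stride, stride ), e ) );
        s2 = _mm_add_epi16( s2, _mm_sad_epu8( pack2x8( p2 + y*stride, stride ), e ) );
    }
    /* fold the two 64-bit partial sums of each accumulator */
    scores[0] = _mm_cvtsi128_si32( _mm_add_epi16( s0, _mm_srli_si128( s0, 8 ) ) );
    scores[1] = _mm_cvtsi128_si32( _mm_add_epi16( s1, _mm_srli_si128( s1, 8 ) ) );
    scores[2] = _mm_cvtsi128_si32( _mm_add_epi16( s2, _mm_srli_si128( s2, 8 ) ) );
}
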
%macro SAD_X4_START_1x16P_SSE2 0
movdqa xmm7, [r0]
movdqu xmm0, [r1]
@@ -652,6 +770,31 @@ SAD_X 4, 4, 4
lea r4, [r4+2*r5]
%endmacro
%macro SAD_X3_2x8P_SSE2 1
%if %1
SAD_X3_START_2x8P_SSE2
%else
SAD_X3_2x8P_SSE2
%endif
add r0, 2*FENC_STRIDE
lea r1, [r1+2*r4]
lea r2, [r2+2*r4]
lea r3, [r3+2*r4]
%endmacro
%macro SAD_X4_2x8P_SSE2 1
%if %1
SAD_X4_START_2x8P_SSE2
%else
SAD_X4_2x8P_SSE2
%endif
add r0, 2*FENC_STRIDE
lea r1, [r1+2*r5]
lea r2, [r2+2*r5]
lea r3, [r3+2*r5]
lea r4, [r4+2*r5]
%endmacro
%macro SAD_X3_END_SSE2 0
movhlps xmm4, xmm0
movhlps xmm5, xmm1
@@ -702,8 +845,14 @@ cglobal x264_pixel_sad_x%1_%2x%3_%4, 2+%1,2+%1
SAD_X_SSE2 3, 16, 16, sse2
SAD_X_SSE2 3, 16, 8, sse2
SAD_X_SSE2 3, 8, 16, sse2
SAD_X_SSE2 3, 8, 8, sse2
SAD_X_SSE2 3, 8, 4, sse2
SAD_X_SSE2 4, 16, 16, sse2
SAD_X_SSE2 4, 16, 8, sse2
SAD_X_SSE2 4, 8, 16, sse2
SAD_X_SSE2 4, 8, 8, sse2
SAD_X_SSE2 4, 8, 4, sse2
%define movdqu lddqu
SAD_X_SSE2 3, 16, 16, sse3