Commit b597966b authored by Loren Merritt, committed by Fiona Glaser

Optimize x86 intra_sa8d_x3_8x8

~40% faster.
Also some other minor asm cosmetics.
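
For context: intra_sa8d_x3_8x8 returns the SA8D costs of the V, H and DC intra predictions for one 8x8 luma block. Because the Hadamard transform is linear, sa8d(fenc, pred) can be evaluated as sum|H(fenc) - H(pred)|, and for these three predictions H(pred) is nonzero only in the first row, the first column, or the DC coefficient, so H(fenc) is computed once and shared by all three costs. The plain-C sketch below only illustrates that structure; it is not the committed asm. hadamard8() and intra_sa8d_x3_8x8_ref() are made-up names, the edge layout (left = edge[14-i], top = edge[16+i]) and the result order (V, H, DC) follow the removed C wrapper, and FENC_STRIDE is assumed to be x264's value of 16.

#include <stdint.h>
#include <stdlib.h>

#define FENC_STRIDE 16  /* assumed: x264's fenc buffer stride */

/* In-place unnormalized 8-point Hadamard transform. */
static void hadamard8( int16_t d[8] )
{
    for( int s = 1; s < 8; s <<= 1 )
        for( int i = 0; i < 8; i += 2*s )
            for( int j = i; j < i+s; j++ )
            {
                int a = d[j], b = d[j+s];
                d[j]   = a + b;
                d[j+s] = a - b;
            }
}

/* Illustrative reference only: V/H/DC SA8D costs sharing one H(fenc). */
static void intra_sa8d_x3_8x8_ref( const uint8_t *fenc, const uint8_t edge[36], int res[3] )
{
    int16_t h[8][8], left[8], top[8];

    /* 2D Hadamard of the source block: rows, then columns. */
    for( int i = 0; i < 8; i++ )
        for( int j = 0; j < 8; j++ )
            h[i][j] = fenc[i*FENC_STRIDE + j];
    for( int i = 0; i < 8; i++ )
        hadamard8( h[i] );
    for( int j = 0; j < 8; j++ )
    {
        int16_t col[8];
        for( int i = 0; i < 8; i++ ) col[i] = h[i][j];
        hadamard8( col );
        for( int i = 0; i < 8; i++ ) h[i][j] = col[i];
    }

    /* 1D Hadamard of the edges; this is the step the commit moves from C into asm. */
    for( int i = 0; i < 8; i++ )
    {
        left[i] = edge[14-i];
        top[i]  = edge[16+i];
    }
    hadamard8( left );
    hadamard8( top );
    int dc = 64 * ((left[0] + top[0] + 8) >> 4); /* only nonzero coefficient of H(pred_DC) */

    int sum_all = 0, sum_row0 = 0, sum_col0 = 0;
    for( int i = 0; i < 8; i++ )
        for( int j = 0; j < 8; j++ )
            sum_all += abs( h[i][j] );
    for( int k = 0; k < 8; k++ )
    {
        sum_row0 += abs( h[0][k] );
        sum_col0 += abs( h[k][0] );
    }

    /* H(pred_V) lives in row 0, H(pred_H) in column 0, H(pred_DC) at [0][0]. */
    int sad_v  = sum_all - sum_row0;
    int sad_h  = sum_all - sum_col0;
    int sad_dc = sum_all - abs( h[0][0] );
    for( int k = 0; k < 8; k++ )
    {
        sad_v += abs( h[0][k] - 8*top[k] );
        sad_h += abs( h[k][0] - 8*left[k] );
    }
    sad_dc += abs( h[0][0] - dc );

    /* Scale like sa8d: divide the Hadamard sum by 4 with rounding. */
    res[0] = (sad_v + 2) >> 2;
    res[1] = (sad_h + 2) >> 2;
    res[2] = (sad_dc + 2) >> 2;
}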
parent f3fc0c44
@@ -48,6 +48,9 @@ const pw_32_0, times 4 dw 32,
const pw_8000, times 8 dw 0x8000
const pw_3fff, times 8 dw 0x3fff
const pw_pixel_max,times 8 dw ((1 << BIT_DEPTH)-1)
const pw_ppppmmmm, dw 1,1,1,1,-1,-1,-1,-1
const pw_ppmmppmm, dw 1,1,-1,-1,1,1,-1,-1
const pw_pmpmpmpm, dw 1,-1,1,-1,1,-1,1,-1
const pd_1, times 4 dd 1
const pd_32, times 4 dd 32
......
@@ -27,6 +27,9 @@
%include "x86inc.asm"
%include "x86util.asm"
cextern pw_ppmmppmm
cextern pw_pmpmpmpm
SECTION .text
INIT_MMX mmx2
@@ -151,37 +154,71 @@ cglobal pixel_sa8d_8x8_internal
%macro LOAD_4x8P 1 ; dx
pxor m7, m7
movd m6, [eax+%1+7*FENC_STRIDE]
movd m0, [eax+%1+0*FENC_STRIDE]
movd m1, [eax+%1+1*FENC_STRIDE]
movd m2, [eax+%1+2*FENC_STRIDE]
movd m3, [eax+%1+3*FENC_STRIDE]
movd m4, [eax+%1+4*FENC_STRIDE]
movd m5, [eax+%1+5*FENC_STRIDE]
movd m6, [r0+%1+7*FENC_STRIDE]
movd m0, [r0+%1+0*FENC_STRIDE]
movd m1, [r0+%1+1*FENC_STRIDE]
movd m2, [r0+%1+2*FENC_STRIDE]
movd m3, [r0+%1+3*FENC_STRIDE]
movd m4, [r0+%1+4*FENC_STRIDE]
movd m5, [r0+%1+5*FENC_STRIDE]
punpcklbw m6, m7
punpcklbw m0, m7
punpcklbw m1, m7
movq [spill], m6
punpcklbw m2, m7
punpcklbw m3, m7
movd m6, [eax+%1+6*FENC_STRIDE]
movd m6, [r0+%1+6*FENC_STRIDE]
punpcklbw m4, m7
punpcklbw m5, m7
punpcklbw m6, m7
movq m7, [spill]
%endmacro
%macro HSUMSUB2 4
pshufw m4, %1, %3
pshufw m5, %2, %3
pmullw %1, %4
pmullw m5, %4
paddw %1, m4
paddw %2, m5
%endmacro
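; HSUMSUB2: %1/%2 each hold four words of an edge row; pshufw with %3 fetches
; each word's butterfly partner and %4 is a +1/-1 mask, so every output word
; becomes a sum or difference of a word and its partner, i.e. one stage of the
; 1D Hadamard, applied to both registers at once.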
;-----------------------------------------------------------------------------
; void intra_sa8d_x3_8x8_core( uint8_t *fenc, int16_t edges[2][8], int *res )
; void intra_sa8d_x3_8x8( uint8_t *fenc, uint8_t edge[36], int *res )
;-----------------------------------------------------------------------------
cglobal intra_sa8d_x3_8x8_core
mov eax, [esp+4]
mov ecx, [esp+8]
sub esp, 0x70
%define args esp+0x74
cglobal intra_sa8d_x3_8x8, 2,3
SUB esp, 0x94
%define edge esp+0x70 ; +32
%define spill esp+0x60 ; +16
%define trans esp+0 ; +96
%define sum esp+0 ; +32
pxor m7, m7
movq m0, [r1+7]
movq m2, [r1+16]
movq m1, m0
movq m3, m2
punpcklbw m0, m7
punpckhbw m1, m7
punpcklbw m2, m7
punpckhbw m3, m7
movq m6, [pw_ppmmppmm]
HSUMSUB2 m0, m2, q1032, m6
HSUMSUB2 m1, m3, q1032, m6
movq m6, [pw_pmpmpmpm]
HSUMSUB2 m0, m2, q2301, m6
HSUMSUB2 m1, m3, q2301, m6
movq m4, m0
movq m5, m2
paddw m0, m1
paddw m2, m3
psubw m4, m1
psubw m3, m5
movq [edge+0], m0
movq [edge+8], m4
movq [edge+16], m2
movq [edge+24], m3
LOAD_4x8P 0
HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7
@@ -231,7 +268,7 @@ cglobal intra_sa8d_x3_8x8_core
ABSW m1, m1, m4
paddw m2, m1 ; 7x4 sum
movq m7, m0
movq m1, [ecx+8] ; left bottom
movq m1, [edge+8] ; left bottom
psllw m1, 3
psubw m7, m1
ABSW2 m0, m7, m0, m7, m5, m3
@@ -276,14 +313,14 @@ cglobal intra_sa8d_x3_8x8_core
paddw m2, m1 ; 7x4 sum
movq m1, m0
movq m7, [ecx+0]
movq m7, [edge+0]
psllw m7, 3 ; left top
movzx edx, word [ecx+0]
add dx, [ecx+16]
lea edx, [4*edx+32]
and edx, -64
movd m6, edx ; dc
mov r2, [edge+0]
add r2, [edge+16]
lea r2, [4*r2+32]
and r2, 0xffc0
movd m6, r2 ; dc
psubw m1, m7
psubw m0, m6
@@ -297,8 +334,8 @@ cglobal intra_sa8d_x3_8x8_core
psrlq m2, 16
paddw m2, m3
movq m3, [ecx+16] ; top left
movq m4, [ecx+24] ; top right
movq m3, [edge+16] ; top left
movq m4, [edge+24] ; top right
psllw m3, 3
psllw m4, 3
psubw m3, [sum+16]
@@ -307,24 +344,17 @@ cglobal intra_sa8d_x3_8x8_core
paddw m2, m3
paddw m2, m4 ; v
SUM_MM_X3 m0, m1, m2, m3, m4, m5, m6, paddd
mov eax, [args+8]
movd ecx, m2
movd edx, m1
add ecx, 2
add edx, 2
shr ecx, 2
shr edx, 2
mov [eax+0], ecx ; i8x8_v satd
mov [eax+4], edx ; i8x8_h satd
movd ecx, m0
add ecx, 2
shr ecx, 2
mov [eax+8], ecx ; i8x8_dc satd
add esp, 0x70
ret
%undef args
SUM_MM_X3 m0, m1, m2, m3, m4, m5, m6, pavgw
mov r2, r2m
pxor m7, m7
punpckldq m2, m1
pavgw m0, m7
pavgw m2, m7
movd [r2+8], m0 ; dc
movq [r2+0], m2 ; v, h
ADD esp, 0x94
RET
%undef edge
%undef spill
%undef trans
%undef sum
@@ -335,25 +365,23 @@ cglobal intra_sa8d_x3_8x8_core
; void pixel_ssim_4x4x2_core( const uint8_t *pix1, int stride1,
; const uint8_t *pix2, int stride2, int sums[2][4] )
;-----------------------------------------------------------------------------
cglobal pixel_ssim_4x4x2_core
push ebx
push edi
mov ebx, [esp+16]
mov edx, [esp+24]
mov edi, 4
cglobal pixel_ssim_4x4x2_core, 0,5
mov r1, r1m
mov r3, r3m
mov r4, 4
pxor m0, m0
.loop:
mov eax, [esp+12]
mov ecx, [esp+20]
add eax, edi
add ecx, edi
mov r0, r0m
mov r2, r2m
add r0, r4
add r2, r4
pxor m1, m1
pxor m2, m2
pxor m3, m3
pxor m4, m4
%rep 4
movd m5, [eax]
movd m6, [ecx]
movd m5, [r0]
movd m6, [r2]
punpcklbw m5, m0
punpcklbw m6, m0
paddw m1, m5
@@ -365,11 +393,11 @@ cglobal pixel_ssim_4x4x2_core
paddd m3, m5
paddd m4, m7
paddd m3, m6
add eax, ebx
add ecx, edx
add r0, r1
add r2, r3
%endrep
mov eax, [esp+28]
lea eax, [eax+edi*4]
mov r0, r4m
lea r0, [r0+r4*4]
pshufw m5, m1, q0032
pshufw m6, m2, q0032
paddusw m1, m5
@@ -383,12 +411,10 @@ cglobal pixel_ssim_4x4x2_core
paddd m4, m6
punpcklwd m1, m0
punpckldq m3, m4
movq [eax+0], m1
movq [eax+8], m3
sub edi, 4
movq [r0+0], m1
movq [r0+8], m3
sub r4, 4
jge .loop
pop edi
pop ebx
emms
ret
RET
@@ -54,16 +54,22 @@ hmul_8p: times 8 db 1
times 4 db 1, -1
mask_10: times 4 dw 0, -1
mask_1100: times 2 dd 0, -1
pb_pppm: times 4 db 1,1,1,-1
deinterleave_shuf: db 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15
intrax3_shuf: db 7,6,7,6,5,4,5,4,3,2,3,2,1,0,1,0
sw_f0: dq 0xfff0, 0
pd_f0: times 4 dd 0xffff0000
sq_0f: times 1 dq 0xffffffff
SECTION .text
cextern pw_1
cextern pw_8
cextern pw_00ff
cextern pw_ppppmmmm
cextern pw_ppmmppmm
cextern pw_pmpmpmpm
cextern hsub_mul
;=============================================================================
@@ -1525,12 +1531,21 @@ cglobal pixel_sa8d_16x16, 4,7
; INTRA SATD
;=============================================================================
%macro HSUMSUB2 8
pshufd %4, %2, %7
pshufd %5, %3, %7
%1 %2, %8
%1 %6, %8
paddw %2, %4
paddw %3, %5
%endmacro
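; HSUMSUB2 (xmm version): same butterfly idea as the mmx HSUMSUB2, with pshufd
; picking the partner lanes and %1 (pmullw for sse2, psignw for ssse3) applying
; the +/- mask %8.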
%macro INTRA_SA8D_SSE2 0
%ifdef ARCH_X86_64
;-----------------------------------------------------------------------------
; void intra_sa8d_x3_8x8_core( uint8_t *fenc, int16_t edges[2][8], int *res )
; void intra_sa8d_x3_8x8( uint8_t *fenc, uint8_t edge[36], int *res )
;-----------------------------------------------------------------------------
cglobal intra_sa8d_x3_8x8_core, 3,3,16
cglobal intra_sa8d_x3_8x8, 3,3,16
; 8x8 hadamard
pxor m8, m8
movq m0, [r0+0*FENC_STRIDE]
@@ -1550,39 +1565,57 @@ cglobal intra_sa8d_x3_8x8_core, 3,3,16
punpcklbw m6, m8
punpcklbw m7, m8
HADAMARD8_2D 0, 1, 2, 3, 4, 5, 6, 7, 8
HADAMARD8_2D 0, 1, 2, 3, 4, 5, 6, 7, 8
; dc
movzx r0d, word [r1+0]
add r0w, word [r1+16]
add r0d, 8
and r0d, -16
shl r0d, 2
pxor m15, m15
movdqa m8, m2
movdqa m9, m3
movdqa m10, m4
movdqa m11, m5
ABSW2 m8, m9, m8, m9, m12, m13
ABSW2 m10, m11, m10, m11, m12, m13
ABSW2 m8, m9, m2, m3, m2, m3
ABSW2 m10, m11, m4, m5, m4, m5
paddusw m8, m10
paddusw m9, m11
ABSW2 m10, m11, m6, m7, m6, m7
ABSW2 m10, m11, m6, m7, m6, m7
ABSW m15, m1, m1
paddusw m10, m11
paddusw m8, m9
paddusw m15, m10
paddusw m15, m8
movdqa m8, [r1+0] ; left edge
movd m9, r0d
psllw m8, 3
; 1D hadamard of edges
movq m8, [r1+7]
movq m9, [r1+16]
%if cpuflag(ssse3)
punpcklwd m8, m8
pshufb m9, [intrax3_shuf]
pmaddubsw m8, [pb_pppm]
pmaddubsw m9, [pb_pppm]
HSUMSUB2 psignw, m8, m9, m10, m11, m9, q1032, [pw_ppppmmmm]
HSUMSUB2 psignw, m8, m9, m10, m11, m9, q2301, [pw_ppmmppmm]
%else ; sse2
pxor m10, m10
punpcklbw m8, m10
punpcklbw m9, m10
HSUMSUB2 pmullw, m8, m9, m10, m11, m11, q1032, [pw_ppppmmmm]
HSUMSUB2 pmullw, m8, m9, m10, m11, m11, q2301, [pw_ppmmppmm]
pshuflw m10, m8, q2301
pshuflw m11, m9, q2301
pshufhw m10, m10, q2301
pshufhw m11, m11, q2301
pmullw m8, [pw_pmpmpmpm]
pmullw m11, [pw_pmpmpmpm]
paddw m8, m10
paddw m9, m11
%endif
; differences
paddw m10, m8, m9
paddw m10, [pw_8]
pand m10, [sw_f0]
psllw m10, 2 ; dc
psllw m8, 3 ; left edge
psubw m8, m0
psubw m9, m0
ABSW2 m8, m9, m8, m9, m10, m11 ; 1x8 sum
paddusw m14, m15, m8
paddusw m15, m9
psubw m10, m0
ABSW2 m8, m10, m8, m10, m11, m12 ; 1x8 sum
paddusw m14, m8, m15
paddusw m15, m10
punpcklwd m0, m1
punpcklwd m2, m3
punpcklwd m4, m5
@@ -1590,11 +1623,10 @@ cglobal intra_sa8d_x3_8x8_core, 3,3,16
punpckldq m0, m2
punpckldq m4, m6
punpcklqdq m0, m4 ; transpose
movdqa m1, [r1+16] ; top edge
psllw m1, 3
psrldq m2, m15, 2 ; 8x7 sum
psubw m0, m1 ; 8x1 sum
ABSW m0, m0, m1
psllw m9, 3 ; top edge
psrldq m2, m15, 2 ; 8x7 sum
psubw m0, m9 ; 8x1 sum
ABSW m0, m0, m9
paddusw m2, m0
; 3x HADDW
@@ -2424,8 +2456,8 @@ SA8D
INIT_XMM sse2
SA8D
SATDS_SSE2
INTRA_SA8D_SSE2
%ifndef HIGH_BIT_DEPTH
INTRA_SA8D_SSE2
INIT_MMX mmx2
INTRA_SATDS_MMX
%endif
@@ -2446,9 +2478,11 @@ HADAMARD_AC_SSE2
%undef movdqa ; nehalem doesn't like movaps
%undef movdqu ; movups
%undef punpcklqdq ; or movlhps
%ifndef HIGH_BIT_DEPTH
INTRA_SA8D_SSE2
INIT_MMX ssse3
INTRA_SATDS_MMX
%endif
%define TRANS TRANS_SSE4
%define LOAD_DUP_4x8P LOAD_DUP_4x8P_PENRYN
@@ -2460,7 +2494,9 @@ HADAMARD_AC_SSE2
INIT_XMM avx
SATDS_SSE2
SA8D
%ifndef HIGH_BIT_DEPTH
INTRA_SA8D_SSE2
%endif
HADAMARD_AC_SSE2
;=============================================================================
......
@@ -113,10 +113,6 @@ void x264_intra_sad_x3_8x8_mmx2 ( pixel *, pixel *, int * );
void x264_intra_sad_x3_8x8_sse2 ( pixel *, pixel *, int * );
void x264_intra_sad_x3_8x8_ssse3 ( pixel *, pixel *, int * );
void x264_intra_sad_x3_8x8_avx ( pixel *, pixel *, int * );
void x264_intra_sa8d_x3_8x8_core_mmx2 ( uint8_t *, int16_t [2][8], int * );
void x264_intra_sa8d_x3_8x8_core_sse2 ( uint8_t *, int16_t [2][8], int * );
void x264_intra_sa8d_x3_8x8_core_ssse3( uint8_t *, int16_t [2][8], int * );
void x264_intra_sa8d_x3_8x8_core_avx ( uint8_t *, int16_t [2][8], int * );
void x264_pixel_ssd_nv12_core_mmx2( pixel *pixuv1, int stride1,
pixel *pixuv2, int stride2, int width,
......
@@ -322,8 +322,8 @@ static void x264_predict_8x8c_p_ssse3( uint8_t *src )
#endif
}
#endif
#if !HIGH_BIT_DEPTH
#if ARCH_X86_64
#if ARCH_X86_64 && !HIGH_BIT_DEPTH
static void x264_predict_8x8c_dc_left( uint8_t *src )
{
int y;
@@ -350,63 +350,7 @@ static void x264_predict_8x8c_dc_left( uint8_t *src )
}
}
#endif
#define PL(y) \
UNUSED int l##y = edge[14-y];
#define PT(x) \
UNUSED int t##x = edge[16+x];
#define PREDICT_8x8_LOAD_LEFT \
PL(0) PL(1) PL(2) PL(3) PL(4) PL(5) PL(6) PL(7)
#define PREDICT_8x8_LOAD_TOP \
PT(0) PT(1) PT(2) PT(3) PT(4) PT(5) PT(6) PT(7)
#define SUMSUB(a,b,c,d,e,f,g,h)\
t=a; a+=b; b-=t;\
t=c; c+=d; d-=t;\
t=e; e+=f; f-=t;\
t=g; g+=h; h-=t;
#define INTRA_SA8D_X3(cpu)\
void x264_intra_sa8d_x3_8x8_##cpu( uint8_t *fenc, uint8_t edge[36], int res[3] )\
{\
PREDICT_8x8_LOAD_TOP\
PREDICT_8x8_LOAD_LEFT\
int t;\
ALIGNED_16( int16_t sa8d_1d[2][8] );\
SUMSUB(l0,l4,l1,l5,l2,l6,l3,l7);\
SUMSUB(l0,l2,l1,l3,l4,l6,l5,l7);\
SUMSUB(l0,l1,l2,l3,l4,l5,l6,l7);\
sa8d_1d[0][0] = l0;\
sa8d_1d[0][1] = l1;\
sa8d_1d[0][2] = l2;\
sa8d_1d[0][3] = l3;\
sa8d_1d[0][4] = l4;\
sa8d_1d[0][5] = l5;\
sa8d_1d[0][6] = l6;\
sa8d_1d[0][7] = l7;\
SUMSUB(t0,t4,t1,t5,t2,t6,t3,t7);\
SUMSUB(t0,t2,t1,t3,t4,t6,t5,t7);\
SUMSUB(t0,t1,t2,t3,t4,t5,t6,t7);\
sa8d_1d[1][0] = t0;\
sa8d_1d[1][1] = t1;\
sa8d_1d[1][2] = t2;\
sa8d_1d[1][3] = t3;\
sa8d_1d[1][4] = t4;\
sa8d_1d[1][5] = t5;\
sa8d_1d[1][6] = t6;\
sa8d_1d[1][7] = t7;\
x264_intra_sa8d_x3_8x8_core_##cpu( fenc, sa8d_1d, res );\
}
#if ARCH_X86_64
INTRA_SA8D_X3(sse2)
INTRA_SA8D_X3(ssse3)
INTRA_SA8D_X3(avx)
#else
INTRA_SA8D_X3(mmx2)
#endif
#endif // !HIGH_BIT_DEPTH
#endif // ARCH_X86_64 && !HIGH_BIT_DEPTH
/****************************************************************************
* Exported functions:
......