Commit 8a9608bb authored by Henrik Gramner, committed by Fiona Glaser

x86inc: Use VEX-encoded instructions in AVX functions

Automatically use VEX encoding in AVX/AVX2/XOP/FMA3/FMA4 functions for all instructions that exist in a VEX-encoded version.
This change makes it easier to extend existing code to use AVX2.
Also add support for AVX emulation of a few instructions that were missing before.
parent 4cf27285
......@@ -1274,7 +1274,7 @@ cglobal zigzag_scan_8x8_field, 2,3,8
mova m1, [r1+ 4*SIZEOF_DCTCOEF] ; 07 06 05 04
mova m2, [r1+ 8*SIZEOF_DCTCOEF] ; 11 10 09 08
pshuf%1 m3, m0, q3333 ; 03 03 03 03
movd r2, m2 ; 09 08
movd r2d, m2 ; 09 08
pshuf%1 m2, m2, q0321 ; 08 11 10 09
punpckl%2 m3, m1 ; 05 03 04 03
pinsr%1 m0, r2d, 3 ; 08 02 01 00
......
......@@ -171,7 +171,7 @@ cglobal deblock_v_luma, 5,5,8
%define bm [rsp+mmsize*4]
SUB rsp, pad
add r1, r1
LOAD_AB m4, m5, r2, r3
LOAD_AB m4, m5, r2d, r3d
mov r3, 32/mmsize
mov r2, r0
sub r0, r1
......@@ -227,7 +227,7 @@ cglobal deblock_h_luma, 5,6,8
%define bm [rsp+mmsize*6]
SUB rsp, pad
add r1, r1
LOAD_AB m4, m5, r2, r3
LOAD_AB m4, m5, r2d, r3d
mov r3, r1
mova am, m4
add r3, r1
......@@ -355,7 +355,7 @@ cglobal deblock_v_luma, 5,5,15
%define mask1 m10
%define mask2 m11
add r1, r1
LOAD_AB m12, m13, r2, r3
LOAD_AB m12, m13, r2d, r3d
mov r2, r0
sub r0, r1
sub r0, r1
......@@ -382,7 +382,7 @@ cglobal deblock_v_luma, 5,5,15
cglobal deblock_h_luma, 5,7,15
add r1, r1
LOAD_AB m12, m13, r2, r3
LOAD_AB m12, m13, r2d, r3d
mov r2, r1
add r2, r1
add r2, r1
......@@ -1216,7 +1216,7 @@ cglobal deblock_%1_luma, 5,5,8,2*%2
mova m1, [r4+2*r1] ; p0
mova m2, [r0] ; q0
mova m3, [r0+r1] ; q1
LOAD_MASK r2, r3
LOAD_MASK r2d, r3d
mov r3, r4mp
movd m4, [r3] ; tc0
......@@ -1660,7 +1660,7 @@ DEBLOCK_LUMA_INTRA v8
%macro DEBLOCK_CHROMA 0
cglobal deblock_inter_body
LOAD_AB m4, m5, r2, r3
LOAD_AB m4, m5, r2d, r3d
LOAD_MASK m0, m1, m2, m3, m4, m5, m7, m6, m4
pxor m4, m4
LOAD_TC m6, r4
......@@ -1710,7 +1710,7 @@ cglobal deblock_h_chroma, 5,7,8
cglobal deblock_intra_body
LOAD_AB m4, m5, r2, r3
LOAD_AB m4, m5, r2d, r3d
LOAD_MASK m0, m1, m2, m3, m4, m5, m7, m6, m4
CHROMA_DEBLOCK_P0_Q0_INTRA m1, m2, m0, m3, m7, m5, m6
ret
......@@ -1721,7 +1721,7 @@ cglobal deblock_intra_body
cglobal deblock_v_chroma_intra, 4,6,8
add r1, r1
mov r5, 32/mmsize
movd m5, r3
movd m5, r3d
mov r4, r0
sub r0, r1
sub r0, r1
......@@ -1766,7 +1766,7 @@ cglobal deblock_h_chroma_intra_mbaff, 4,6,8
lea r5, [r1*3]
%endif
CHROMA_H_LOAD r5
LOAD_AB m4, m5, r2, r3
LOAD_AB m4, m5, r2d, r3d
LOAD_MASK m0, m1, m2, m3, m4, m5, m7, m6, m4
CHROMA_DEBLOCK_P0_Q0_INTRA m1, m2, m0, m3, m7, m5, m6
CHROMA_H_STORE r5
......@@ -1788,7 +1788,7 @@ cglobal deblock_h_chroma_mbaff, 5,7,8
.loop:
%endif
CHROMA_H_LOAD r6
LOAD_AB m4, m5, r2, r3
LOAD_AB m4, m5, r2d, r3d
LOAD_MASK m0, m1, m2, m3, m4, m5, m7, m6, m4
movd m6, [r4]
punpcklbw m6, m6
......@@ -1832,7 +1832,7 @@ cglobal deblock_h_chroma_422, 5,7,8
lea r6, [r1*3]
.loop:
CHROMA_H_LOAD r6
LOAD_AB m4, m5, r2m, r3
LOAD_AB m4, m5, r2m, r3d
LOAD_MASK m0, m1, m2, m3, m4, m5, m7, m6, m4
pxor m4, m4
movd m6, [r4-1]
......
......@@ -746,7 +746,7 @@ SECTION .note.GNU-stack noalloc noexec nowrite progbits
%if cpuflag(avx)
%assign avx_enabled 1
%endif
%if mmsize == 16 && notcpuflag(sse2)
%if (mmsize == 16 && notcpuflag(sse2)) || (mmsize == 32 && notcpuflag(avx2))
%define mova movaps
%define movu movups
%define movnta movntps
......@@ -825,10 +825,10 @@ SECTION .note.GNU-stack noalloc noexec nowrite progbits
%if ARCH_X86_64
%define num_mmregs 16
%endif
%define mova vmovaps
%define movu vmovups
%define mova movdqa
%define movu movdqu
%undef movh
%define movnta vmovntps
%define movnta movntdq
%assign %%i 0
%rep num_mmregs
CAT_XDEFINE m, %%i, ymm %+ %%i
......@@ -989,101 +989,107 @@ INIT_XMM
;%1 == instruction
;%2 == 1 if float, 0 if int
;%3 == 1 if 4-operand (xmm, xmm, xmm, imm), 0 if 2- or 3-operand (xmm, xmm, xmm)
;%4 == number of operands given
;%3 == 1 if non-destructive or 4-operand (xmm, xmm, xmm, imm), 0 otherwise
;%4 == 1 if commutative (i.e. doesn't matter which src arg is which), 0 if not
;%5+: operands
%macro RUN_AVX_INSTR 6-7+
%ifid %6
%define %%sizeofreg sizeof%6
%elifid %5
%define %%sizeofreg sizeof%5
%macro RUN_AVX_INSTR 5-8+
%ifnum sizeof%6
%assign %%sizeofreg sizeof%6
%elifnum sizeof%5
%assign %%sizeofreg sizeof%5
%else
%define %%sizeofreg mmsize
%assign %%sizeofreg mmsize
%endif
%if %%sizeofreg==32
%if %4>=3
v%1 %5, %6, %7
%else
v%1 %5, %6
%endif
%assign %%emulate_avx 0
%if avx_enabled && %%sizeofreg >= 16
%xdefine %%instr v%1
%else
%if %%sizeofreg==8
%define %%regmov movq
%elif %2
%define %%regmov movaps
%else
%define %%regmov movdqa
%xdefine %%instr %1
%if %0 >= 7+%3
%assign %%emulate_avx 1
%endif
%endif
%if %4>=3+%3
%ifnidn %5, %6
%if avx_enabled && %%sizeofreg==16
v%1 %5, %6, %7
%else
CHECK_AVX_INSTR_EMU {%1 %5, %6, %7}, %5, %7
%%regmov %5, %6
%1 %5, %7
%if %%emulate_avx
%xdefine %%src1 %6
%xdefine %%src2 %7
%ifnidn %5, %6
%if %0 >= 8
CHECK_AVX_INSTR_EMU {%1 %5, %6, %7, %8}, %5, %7, %8
%else
CHECK_AVX_INSTR_EMU {%1 %5, %6, %7}, %5, %7
%endif
%if %4 && %3 == 0
%ifnid %7
; 3-operand AVX instructions with a memory arg can only have it in src2,
; whereas SSE emulation prefers to have it in src1 (i.e. the mov).
; So, if the instruction is commutative with a memory arg, swap them.
%xdefine %%src1 %7
%xdefine %%src2 %6
%endif
%endif
%if %%sizeofreg == 8
MOVQ %5, %%src1
%elif %2
MOVAPS %5, %%src1
%else
%1 %5, %7
MOVDQA %5, %%src1
%endif
%elif %4>=3
%1 %5, %6, %7
%else
%1 %5, %6
%endif
%endif
%endmacro
; 3arg AVX ops with a memory arg can only have it in src2,
; whereas SSE emulation of 3arg prefers to have it in src1 (i.e. the mov).
; So, if the op is symmetric and the wrong one is memory, swap them.
%macro RUN_AVX_INSTR1 8
%assign %%swap 0
%if avx_enabled
%ifnid %6
%assign %%swap 1
%endif
%elifnidn %5, %6
%ifnid %7
%assign %%swap 1
%if %0 >= 8
%1 %5, %%src2, %8
%else
%1 %5, %%src2
%endif
%endif
%if %%swap && %3 == 0 && %8 == 1
RUN_AVX_INSTR %1, %2, %3, %4, %5, %7, %6
%elif %0 >= 8
%%instr %5, %6, %7, %8
%elif %0 == 7
%%instr %5, %6, %7
%elif %0 == 6
%%instr %5, %6
%else
RUN_AVX_INSTR %1, %2, %3, %4, %5, %6, %7
%%instr %5
%endif
%endmacro
;%1 == instruction
;%2 == 1 if float, 0 if int
;%3 == 1 if 4-operand (xmm, xmm, xmm, imm), 0 if 2- or 3-operand (xmm, xmm, xmm)
;%4 == 1 if symmetric (i.e. doesn't matter which src arg is which), 0 if not
%macro AVX_INSTR 4
%macro %1 2-9 fnord, fnord, fnord, %1, %2, %3, %4
%ifidn %3, fnord
RUN_AVX_INSTR %6, %7, %8, 2, %1, %2
;%3 == 1 if non-destructive or 4-operand (xmm, xmm, xmm, imm), 0 otherwise
;%4 == 1 if commutative (i.e. doesn't matter which src arg is which), 0 if not
%macro AVX_INSTR 1-4 0, 1, 0
%macro %1 1-9 fnord, fnord, fnord, fnord, %1, %2, %3, %4
%ifidn %2, fnord
RUN_AVX_INSTR %6, %7, %8, %9, %1
%elifidn %3, fnord
RUN_AVX_INSTR %6, %7, %8, %9, %1, %2
%elifidn %4, fnord
RUN_AVX_INSTR1 %6, %7, %8, 3, %1, %2, %3, %9
RUN_AVX_INSTR %6, %7, %8, %9, %1, %2, %3
%elifidn %5, fnord
RUN_AVX_INSTR %6, %7, %8, 4, %1, %2, %3, %4
RUN_AVX_INSTR %6, %7, %8, %9, %1, %2, %3, %4
%else
RUN_AVX_INSTR %6, %7, %8, 5, %1, %2, %3, %4, %5
RUN_AVX_INSTR %6, %7, %8, %9, %1, %2, %3, %4, %5
%endif
%endmacro
%endmacro
; Instructions with both VEX and non-VEX encodings
; Non-destructive instructions are written without parameters
AVX_INSTR addpd, 1, 0, 1
AVX_INSTR addps, 1, 0, 1
AVX_INSTR addsd, 1, 0, 1
AVX_INSTR addss, 1, 0, 1
AVX_INSTR addsubpd, 1, 0, 0
AVX_INSTR addsubps, 1, 0, 0
AVX_INSTR andpd, 1, 0, 1
AVX_INSTR andps, 1, 0, 1
AVX_INSTR aesdec, 0, 0, 0
AVX_INSTR aesdeclast, 0, 0, 0
AVX_INSTR aesenc, 0, 0, 0
AVX_INSTR aesenclast, 0, 0, 0
AVX_INSTR aesimc
AVX_INSTR aeskeygenassist
AVX_INSTR andnpd, 1, 0, 0
AVX_INSTR andnps, 1, 0, 0
AVX_INSTR andpd, 1, 0, 1
AVX_INSTR andps, 1, 0, 1
AVX_INSTR blendpd, 1, 0, 0
AVX_INSTR blendps, 1, 0, 0
AVX_INSTR blendvpd, 1, 0, 0
......@@ -1092,18 +1098,39 @@ AVX_INSTR cmppd, 1, 0, 0
AVX_INSTR cmpps, 1, 0, 0
AVX_INSTR cmpsd, 1, 0, 0
AVX_INSTR cmpss, 1, 0, 0
AVX_INSTR cvtdq2ps, 1, 0, 0
AVX_INSTR cvtps2dq, 1, 0, 0
AVX_INSTR comisd
AVX_INSTR comiss
AVX_INSTR cvtdq2pd
AVX_INSTR cvtdq2ps
AVX_INSTR cvtpd2dq
AVX_INSTR cvtpd2ps
AVX_INSTR cvtps2dq
AVX_INSTR cvtps2pd
AVX_INSTR cvtsd2si
AVX_INSTR cvtsd2ss
AVX_INSTR cvtsi2sd
AVX_INSTR cvtsi2ss
AVX_INSTR cvtss2sd
AVX_INSTR cvtss2si
AVX_INSTR cvttpd2dq
AVX_INSTR cvttps2dq
AVX_INSTR cvttsd2si
AVX_INSTR cvttss2si
AVX_INSTR divpd, 1, 0, 0
AVX_INSTR divps, 1, 0, 0
AVX_INSTR divsd, 1, 0, 0
AVX_INSTR divss, 1, 0, 0
AVX_INSTR dppd, 1, 1, 0
AVX_INSTR dpps, 1, 1, 0
AVX_INSTR extractps
AVX_INSTR haddpd, 1, 0, 0
AVX_INSTR haddps, 1, 0, 0
AVX_INSTR hsubpd, 1, 0, 0
AVX_INSTR hsubps, 1, 0, 0
AVX_INSTR insertps, 1, 1, 0
AVX_INSTR lddqu
AVX_INSTR ldmxcsr
AVX_INSTR maskmovdqu
AVX_INSTR maxpd, 1, 0, 1
AVX_INSTR maxps, 1, 0, 1
AVX_INSTR maxsd, 1, 0, 1
......@@ -1112,10 +1139,31 @@ AVX_INSTR minpd, 1, 0, 1
AVX_INSTR minps, 1, 0, 1
AVX_INSTR minsd, 1, 0, 1
AVX_INSTR minss, 1, 0, 1
AVX_INSTR movapd
AVX_INSTR movaps
AVX_INSTR movd
AVX_INSTR movddup
AVX_INSTR movdqa
AVX_INSTR movdqu
AVX_INSTR movhlps, 1, 0, 0
AVX_INSTR movhpd, 1, 0, 0
AVX_INSTR movhps, 1, 0, 0
AVX_INSTR movlhps, 1, 0, 0
AVX_INSTR movlpd, 1, 0, 0
AVX_INSTR movlps, 1, 0, 0
AVX_INSTR movmskpd
AVX_INSTR movmskps
AVX_INSTR movntdq
AVX_INSTR movntdqa
AVX_INSTR movntpd
AVX_INSTR movntps
AVX_INSTR movq
AVX_INSTR movsd, 1, 0, 0
AVX_INSTR movshdup
AVX_INSTR movsldup
AVX_INSTR movss, 1, 0, 0
AVX_INSTR movupd
AVX_INSTR movups
AVX_INSTR mpsadbw, 0, 1, 0
AVX_INSTR mulpd, 1, 0, 1
AVX_INSTR mulps, 1, 0, 1
......@@ -1123,9 +1171,9 @@ AVX_INSTR mulsd, 1, 0, 1
AVX_INSTR mulss, 1, 0, 1
AVX_INSTR orpd, 1, 0, 1
AVX_INSTR orps, 1, 0, 1
AVX_INSTR pabsb, 0, 0, 0
AVX_INSTR pabsw, 0, 0, 0
AVX_INSTR pabsd, 0, 0, 0
AVX_INSTR pabsb
AVX_INSTR pabsd
AVX_INSTR pabsw
AVX_INSTR packsswb, 0, 0, 0
AVX_INSTR packssdw, 0, 0, 0
AVX_INSTR packuswb, 0, 0, 0
......@@ -1145,10 +1193,11 @@ AVX_INSTR pavgb, 0, 0, 1
AVX_INSTR pavgw, 0, 0, 1
AVX_INSTR pblendvb, 0, 0, 0
AVX_INSTR pblendw, 0, 1, 0
AVX_INSTR pcmpestri, 0, 0, 0
AVX_INSTR pcmpestrm, 0, 0, 0
AVX_INSTR pcmpistri, 0, 0, 0
AVX_INSTR pcmpistrm, 0, 0, 0
AVX_INSTR pclmulqdq, 0, 1, 0
AVX_INSTR pcmpestri
AVX_INSTR pcmpestrm
AVX_INSTR pcmpistri
AVX_INSTR pcmpistrm
AVX_INSTR pcmpeqb, 0, 0, 1
AVX_INSTR pcmpeqw, 0, 0, 1
AVX_INSTR pcmpeqd, 0, 0, 1
......@@ -1157,12 +1206,21 @@ AVX_INSTR pcmpgtb, 0, 0, 0
AVX_INSTR pcmpgtw, 0, 0, 0
AVX_INSTR pcmpgtd, 0, 0, 0
AVX_INSTR pcmpgtq, 0, 0, 0
AVX_INSTR pextrb
AVX_INSTR pextrd
AVX_INSTR pextrq
AVX_INSTR pextrw
AVX_INSTR phaddw, 0, 0, 0
AVX_INSTR phaddd, 0, 0, 0
AVX_INSTR phaddsw, 0, 0, 0
AVX_INSTR phminposuw
AVX_INSTR phsubw, 0, 0, 0
AVX_INSTR phsubd, 0, 0, 0
AVX_INSTR phsubsw, 0, 0, 0
AVX_INSTR pinsrb, 0, 1, 0
AVX_INSTR pinsrd, 0, 1, 0
AVX_INSTR pinsrq, 0, 1, 0
AVX_INSTR pinsrw, 0, 1, 0
AVX_INSTR pmaddwd, 0, 0, 1
AVX_INSTR pmaddubsw, 0, 0, 0
AVX_INSTR pmaxsb, 0, 0, 1
......@@ -1177,20 +1235,32 @@ AVX_INSTR pminsd, 0, 0, 1
AVX_INSTR pminub, 0, 0, 1
AVX_INSTR pminuw, 0, 0, 1
AVX_INSTR pminud, 0, 0, 1
AVX_INSTR pmovmskb, 0, 0, 0
AVX_INSTR pmulhuw, 0, 0, 1
AVX_INSTR pmovmskb
AVX_INSTR pmovsxbw
AVX_INSTR pmovsxbd
AVX_INSTR pmovsxbq
AVX_INSTR pmovsxwd
AVX_INSTR pmovsxwq
AVX_INSTR pmovsxdq
AVX_INSTR pmovzxbw
AVX_INSTR pmovzxbd
AVX_INSTR pmovzxbq
AVX_INSTR pmovzxwd
AVX_INSTR pmovzxwq
AVX_INSTR pmovzxdq
AVX_INSTR pmuldq, 0, 0, 1
AVX_INSTR pmulhrsw, 0, 0, 1
AVX_INSTR pmulhuw, 0, 0, 1
AVX_INSTR pmulhw, 0, 0, 1
AVX_INSTR pmullw, 0, 0, 1
AVX_INSTR pmulld, 0, 0, 1
AVX_INSTR pmuludq, 0, 0, 1
AVX_INSTR pmuldq, 0, 0, 1
AVX_INSTR por, 0, 0, 1
AVX_INSTR psadbw, 0, 0, 1
AVX_INSTR pshufb, 0, 0, 0
AVX_INSTR pshufd, 0, 1, 0
AVX_INSTR pshufhw, 0, 1, 0
AVX_INSTR pshuflw, 0, 1, 0
AVX_INSTR pshufd
AVX_INSTR pshufhw
AVX_INSTR pshuflw
AVX_INSTR psignb, 0, 0, 0
AVX_INSTR psignw, 0, 0, 0
AVX_INSTR psignd, 0, 0, 0
......@@ -1212,7 +1282,7 @@ AVX_INSTR psubsb, 0, 0, 0
AVX_INSTR psubsw, 0, 0, 0
AVX_INSTR psubusb, 0, 0, 0
AVX_INSTR psubusw, 0, 0, 0
AVX_INSTR ptest, 0, 0, 0
AVX_INSTR ptest
AVX_INSTR punpckhbw, 0, 0, 0
AVX_INSTR punpckhwd, 0, 0, 0
AVX_INSTR punpckhdq, 0, 0, 0
......@@ -1222,11 +1292,27 @@ AVX_INSTR punpcklwd, 0, 0, 0
AVX_INSTR punpckldq, 0, 0, 0
AVX_INSTR punpcklqdq, 0, 0, 0
AVX_INSTR pxor, 0, 0, 1
AVX_INSTR rcpps, 1, 0, 0
AVX_INSTR rcpss, 1, 0, 0
AVX_INSTR roundpd
AVX_INSTR roundps
AVX_INSTR roundsd
AVX_INSTR roundss
AVX_INSTR rsqrtps, 1, 0, 0
AVX_INSTR rsqrtss, 1, 0, 0
AVX_INSTR shufpd, 1, 1, 0
AVX_INSTR shufps, 1, 1, 0
AVX_INSTR sqrtpd, 1, 0, 0
AVX_INSTR sqrtps, 1, 0, 0
AVX_INSTR sqrtsd, 1, 0, 0
AVX_INSTR sqrtss, 1, 0, 0
AVX_INSTR stmxcsr
AVX_INSTR subpd, 1, 0, 0
AVX_INSTR subps, 1, 0, 0
AVX_INSTR subsd, 1, 0, 0
AVX_INSTR subss, 1, 0, 0
AVX_INSTR ucomisd
AVX_INSTR ucomiss
AVX_INSTR unpckhpd, 1, 0, 0
AVX_INSTR unpckhps, 1, 0, 0
AVX_INSTR unpcklpd, 1, 0, 0
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment