Skip to content
Snippets Groups Projects

x86inc: Support memory operands in src1 in 3-operand instructions

Merged Henrik Gramner requested to merge gramner/x264:x86inc_vex_mem_operands into master
All threads resolved!
+ 41
12
@@ -86,6 +86,11 @@
%define mangle(x) x
%endif
; Use VEX-encoding even in non-AVX functions
%ifndef FORCE_VEX_ENCODING
%define FORCE_VEX_ENCODING 0
%endif
%macro SECTION_RODATA 0-1 16
%ifidn __OUTPUT_FORMAT__,win32
SECTION .rdata align=%1
@@ -1014,7 +1019,7 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
%endmacro
%macro INIT_XMM 0-1+
%assign avx_enabled 0
%assign avx_enabled FORCE_VEX_ENCODING
%define RESET_MM_PERMUTATION INIT_XMM %1
%define mmsize 16
%define mova movdqa
@@ -1339,26 +1344,50 @@ INIT_XMM
%elif %0 >= 9
__instr %6, %7, %8, %9
%elif %0 == 8
%if avx_enabled && %5
%if avx_enabled && __sizeofreg >= 16 && %4 == 0
%xdefine __src1 %7
%xdefine __src2 %8
%ifnum regnumof%7
%ifnum regnumof%8
%if regnumof%7 < 8 && regnumof%8 >= 8 && regnumof%8 < 16 && sizeof%8 <= 32
; Most VEX-encoded instructions require an additional byte to encode when
; src2 is a high register (e.g. m8..15). If the instruction is commutative
; we can swap src1 and src2 when doing so reduces the instruction length.
%xdefine __src1 %8
%xdefine __src2 %7
%if %5
%ifnum regnumof%7
%ifnum regnumof%8
%if regnumof%7 < 8 && regnumof%8 >= 8 && regnumof%8 < 16 && sizeof%8 <= 32
; Most VEX-encoded instructions require an additional byte to encode when
; src2 is a high register (e.g. m8..15). If the instruction is commutative
; we can swap src1 and src2 when doing so reduces the instruction length.
%xdefine __src1 %8
%xdefine __src2 %7
%endif
%endif
%elifnum regnumof%8 ; put memory operands in src2 when possible
%xdefine __src1 %8
%xdefine __src2 %7
%else
%assign __emulate_avx 1
%endif
+3
%elifnnum regnumof%7
; EVEX allows imm8 shift instructions to be used with memory operands,
; but VEX does not. This handles those special cases.
%ifnnum %8
%assign __emulate_avx 1
%elif notcpuflag(avx512)
%assign __emulate_avx 1
%endif
%endif
__instr %6, __src1, __src2
%if __emulate_avx ; a separate load is required
%if %3
vmovaps %6, %7
%else
vmovdqa %6, %7
%endif
__instr %6, %8
%else
__instr %6, __src1, __src2
%endif
%else
__instr %6, %7, %8
%endif
%elif %0 == 7
%if avx_enabled && %5
%if avx_enabled && __sizeofreg >= 16 && %5
%xdefine __src1 %6
%xdefine __src2 %7
%ifnum regnumof%6
Loading