diff --git a/common/x86/x86inc.asm b/common/x86/x86inc.asm
index 0e4f4f9dfa427e1f9522fb953b749019802ba868..606782c5aadc3cd9eca7e1dbb5d66a3cdd53ea2b 100644
--- a/common/x86/x86inc.asm
+++ b/common/x86/x86inc.asm
@@ -1339,26 +1339,50 @@ INIT_XMM
     %elif %0 >= 9
         __instr %6, %7, %8, %9
     %elif %0 == 8
-        %if avx_enabled && %5
+        %if avx_enabled && __sizeofreg >= 16 && %4 == 0
             %xdefine __src1 %7
             %xdefine __src2 %8
-            %ifnum regnumof%7
-                %ifnum regnumof%8
-                    %if regnumof%7 < 8 && regnumof%8 >= 8 && regnumof%8 < 16 && sizeof%8 <= 32
-                        ; Most VEX-encoded instructions require an additional byte to encode when
-                        ; src2 is a high register (e.g. m8..15). If the instruction is commutative
-                        ; we can swap src1 and src2 when doing so reduces the instruction length.
-                        %xdefine __src1 %8
-                        %xdefine __src2 %7
+            %if %5
+                %ifnum regnumof%7
+                    %ifnum regnumof%8
+                        %if regnumof%7 < 8 && regnumof%8 >= 8 && regnumof%8 < 16 && sizeof%8 <= 32
+                            ; Most VEX-encoded instructions require an additional byte to encode when
+                            ; src2 is a high register (e.g. m8..15). If the instruction is commutative
+                            ; we can swap src1 and src2 when doing so reduces the instruction length.
+                            %xdefine __src1 %8
+                            %xdefine __src2 %7
+                        %endif
                     %endif
+                %elifnum regnumof%8 ; put memory operands in src2 when possible
+                    %xdefine __src1 %8
+                    %xdefine __src2 %7
+                %else
+                    %assign __emulate_avx 1
+                %endif
+            %elifnnum regnumof%7
+                ; EVEX allows imm8 shift instructions to be used with memory operands,
+                ; but VEX does not. This handles those special cases.
+                %ifnnum %8
+                    %assign __emulate_avx 1
+                %elif notcpuflag(avx512)
+                    %assign __emulate_avx 1
                 %endif
             %endif
-            __instr %6, __src1, __src2
+            %if __emulate_avx ; a separate load is required
+                %if %3
+                    vmovaps %6, %7
+                %else
+                    vmovdqa %6, %7
+                %endif
+                __instr %6, %8
+            %else
+                __instr %6, __src1, __src2
+            %endif
         %else
             __instr %6, %7, %8
         %endif
     %elif %0 == 7
-        %if avx_enabled && %5
+        %if avx_enabled && __sizeofreg >= 16 && %5
             %xdefine __src1 %6
             %xdefine __src2 %7
             %ifnum regnumof%6
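
; Illustrative expansions of the new code paths (my own sketch, not part of
; the patch; the pseudo-instructions, registers, and INIT_XMM modes below
; are assumed for the example). With INIT_XMM avx, a commutative op with a
; memory src1 is now handled by swapping the sources, while non-commutative
; ops and imm8 shifts without AVX-512 take the __emulate_avx load path:
;
;     paddw m0, [r0], m1   ; commutative: memory operand moved to src2
;         -> vpaddw xmm0, xmm1, [r0]
;
;     psubw m0, [r0], m1   ; non-commutative: a separate load is required
;         -> vmovdqa xmm0, [r0]
;            vpsubw  xmm0, xmm1
;
;     psllw m0, [r0], 4    ; imm8 shift with a memory operand
;         -> vpsllw  xmm0, [r0], 4   ; with avx512: EVEX encodes it directly
;         -> vmovdqa xmm0, [r0]      ; without avx512: load, then shift
;            vpsllw  xmm0, 4
;
; Note that the emulation path clobbers the destination with the loaded
; src1 before issuing the op, so it assumes dst does not alias src2.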