diff --git a/common/x86/x86inc.asm b/common/x86/x86inc.asm
index 0e4f4f9dfa427e1f9522fb953b749019802ba868..606782c5aadc3cd9eca7e1dbb5d66a3cdd53ea2b 100644
--- a/common/x86/x86inc.asm
+++ b/common/x86/x86inc.asm
@@ -1339,26 +1339,50 @@ INIT_XMM
     %elif %0 >= 9
         __instr %6, %7, %8, %9
     %elif %0 == 8
-        %if avx_enabled && %5
+        %if avx_enabled && __sizeofreg >= 16 && %4 == 0
             %xdefine __src1 %7
             %xdefine __src2 %8
-            %ifnum regnumof%7
-                %ifnum regnumof%8
-                    %if regnumof%7 < 8 && regnumof%8 >= 8 && regnumof%8 < 16 && sizeof%8 <= 32
-                        ; Most VEX-encoded instructions require an additional byte to encode when
-                        ; src2 is a high register (e.g. m8..15). If the instruction is commutative
-                        ; we can swap src1 and src2 when doing so reduces the instruction length.
-                        %xdefine __src1 %8
-                        %xdefine __src2 %7
+            %if %5
+                %ifnum regnumof%7
+                    %ifnum regnumof%8
+                        %if regnumof%7 < 8 && regnumof%8 >= 8 && regnumof%8 < 16 && sizeof%8 <= 32
+                            ; Most VEX-encoded instructions require an additional byte to encode when
+                            ; src2 is a high register (e.g. m8..15). If the instruction is commutative
+                            ; we can swap src1 and src2 when doing so reduces the instruction length.
+                            %xdefine __src1 %8
+                            %xdefine __src2 %7
+                        %endif
                     %endif
+                %elifnum regnumof%8 ; put memory operands in src2 when possible
+                    %xdefine __src1 %8
+                    %xdefine __src2 %7
+                %else
+                    %assign __emulate_avx 1
+                %endif
+            %elifnnum regnumof%7
+                ; EVEX allows imm8 shift instructions to be used with memory operands,
+                ; but VEX does not. This handles those special cases.
+                %ifnnum %8
+                    %assign __emulate_avx 1
+                %elif notcpuflag(avx512)
+                    %assign __emulate_avx 1
                 %endif
             %endif
-            __instr %6, __src1, __src2
+            %if __emulate_avx ; a separate load is required
+                %if %3
+                    vmovaps %6, %7
+                %else
+                    vmovdqa %6, %7
+                %endif
+                __instr %6, %8
+            %else
+                __instr %6, __src1, __src2
+            %endif
         %else
             __instr %6, %7, %8
         %endif
     %elif %0 == 7
-        %if avx_enabled && %5
+        %if avx_enabled && __sizeofreg >= 16 && %5
             %xdefine __src1 %6
             %xdefine __src2 %7
             %ifnum regnumof%6