From 0d6a31f275dc73dabf9fd8742a4adaa9ecc13da6 Mon Sep 17 00:00:00 2001
From: Henrik Gramner <gramner@twoorioles.com>
Date: Thu, 12 Jan 2023 01:49:39 +0100
Subject: [PATCH] x86: Remove stack alignment compiler flags

The intent was good, but in practice it results in a significant
number of problems due to various compiler bugs for negligible gains.
---
 meson.build                 | 34 +++-------------------------------
 src/meson.build             | 27 +++++----------------------
 src/x86/cdef_avx2.asm       | 32 ++++++++------------------------
 src/x86/ipred16_avx2.asm    |  4 ----
 src/x86/mc16_avx2.asm       | 25 ++++---------------------
 src/x86/mc_avx2.asm         | 23 ++++-------------------
 src/x86/msac.asm            | 10 ----------
 tests/libfuzzer/meson.build |  5 ++---
 tests/meson.build           |  3 +--
 9 files changed, 27 insertions(+), 136 deletions(-)

diff --git a/meson.build b/meson.build
index cee8b791e..0402f73d3 100644
--- a/meson.build
+++ b/meson.build
@@ -319,45 +319,17 @@ if fuzzing_engine == 'libfuzzer'
     add_project_arguments(cc.first_supported_argument(fuzzer_args), language : 'c')
 endif
 
-# Stack alignments flags
-
-stackalign_flag = []
-stackrealign_flag = []
-
 cdata.set10('ENDIANNESS_BIG', host_machine.endian() == 'big')
 
 if host_machine.cpu_family().startswith('x86')
     if get_option('stack_alignment') > 0
         stack_alignment = get_option('stack_alignment')
-    elif host_machine.cpu_family() == 'x86_64'
-        if cc.has_argument('-mpreferred-stack-boundary=6')
-            stackalign_flag = ['-mpreferred-stack-boundary=6']
-            stackrealign_flag = ['-mincoming-stack-boundary=4']
-            stack_alignment = 32
-        elif cc.has_argument('-mstack-alignment=64')
-            stackalign_flag = ['-mstack-alignment=64']
-            stackrealign_flag = ['-mstackrealign']
-            stack_alignment = 32
-        else
-            stack_alignment = 16
-        endif
+    elif host_machine.cpu_family() == 'x86_64' or host_machine.system() in ['linux', 'darwin', 'ios', 'tvos']
+        stack_alignment = 16
     else
-        if host_machine.system() == 'linux' or host_machine.system() in ['darwin', 'ios', 'tvos']
-            stack_alignment = 16
-        elif cc.has_argument('-mpreferred-stack-boundary=4')
-            stackalign_flag = ['-mpreferred-stack-boundary=4']
-            stackrealign_flag = ['-mincoming-stack-boundary=2']
-            stack_alignment = 16
-        elif cc.has_argument('-mstack-alignment=16')
-            stackalign_flag = ['-mstack-alignment=16']
-            stackrealign_flag = ['-mstackrealign']
-            stack_alignment = 16
-        else
-            stack_alignment = 4
-        endif
+        stack_alignment = 4
     endif
     cdata_asm.set('STACK_ALIGNMENT', stack_alignment)
-    cdata.set('STACK_ALIGNMENT', stack_alignment)
 endif
 
 cdata.set10('ARCH_AARCH64', host_machine.cpu_family() == 'aarch64' or host_machine.cpu() == 'arm64')
diff --git a/src/meson.build b/src/meson.build
index 719015496..2ace55e1a 100644
--- a/src/meson.build
+++ b/src/meson.build
@@ -37,6 +37,7 @@ libdav1d_sources = files(
     'intra_edge.c',
     'itx_1d.c',
     'lf_mask.c',
+    'lib.c',
     'log.c',
     'mem.c',
     'msac.c',
@@ -47,6 +48,7 @@ libdav1d_sources = files(
     'refmvs.c',
     'scan.c',
     'tables.c',
+    'thread_task.c',
     'warpmv.c',
     'wedge.c',
 )
@@ -74,14 +76,6 @@ libdav1d_arch_tmpl_sources = []
 
 libdav1d_bitdepth_objs = []
 
-# libdav1d entrypoint source files
-# These source files contain library entry points and are
-# built with the stack-realign flag set, where necessary.
-libdav1d_entrypoints_sources = files(
-    'lib.c',
-    'thread_task.c'
-)
-
 # ASM specific sources
 libdav1d_asm_objs = []
 # Arch-specific flags
@@ -245,7 +239,7 @@ endif
 
 
 libdav1d_rc_obj = []
-libdav1d_flags = [stackalign_flag]
+libdav1d_flags = []
 api_export_flags = []
 
 #
@@ -280,18 +274,6 @@ endif
 # Library definitions
 #
 
-# Helper library for dav1d entrypoints
-libdav1d_entrypoints_objs = static_library('dav1d_entrypoint',
-    libdav1d_entrypoints_sources,
-    rev_target, config_h_target,
-
-    include_directories : dav1d_inc_dirs,
-    dependencies: [stdatomic_dependencies],
-    c_args : [libdav1d_flags, stackrealign_flag, api_export_flags],
-    install : false,
-    build_by_default : false,
-).extract_all_objects(recursive: true)
-
 # Helper library for each bitdepth
 libdav1d_bitdepth_objs = []
 foreach bitdepth : dav1d_bitdepths
@@ -330,10 +312,11 @@ libdav1d = library('dav1d',
     libdav1d_sources,
     libdav1d_asm_objs,
     libdav1d_rc_obj,
+    rev_target,
+    config_h_target,
 
     objects : [
         libdav1d_bitdepth_objs,
-        libdav1d_entrypoints_objs
         ],
 
     include_directories : dav1d_inc_dirs,
diff --git a/src/x86/cdef_avx2.asm b/src/x86/cdef_avx2.asm
index c5c66c759..1f30f8a3b 100644
--- a/src/x86/cdef_avx2.asm
+++ b/src/x86/cdef_avx2.asm
@@ -396,21 +396,17 @@ SECTION .text
 
 %macro CDEF_FILTER 2 ; w, h
 INIT_YMM avx2
-cglobal cdef_filter_%1x%2_8bpc, 5, 10, 0, dst, stride, left, top, bot, \
+cglobal cdef_filter_%1x%2_8bpc, 5, 11, 0, dst, stride, left, top, bot, \
                                           pri, sec, dir, damping, edge
 %assign stack_offset_entry stack_offset
     mov          edged, edgem
     cmp          edged, 0xf
     jne .border_block
 
-    PUSH           r10
     PUSH           r11
+    PUSH           r12
 %if %2 == 4
- %assign regs_used 12
- %if STACK_ALIGNMENT < 32
-    PUSH  r%+regs_used
-  %assign regs_used regs_used+1
- %endif
+%assign regs_used 13
     ALLOC_STACK   0x60, 16
     pmovzxbw       xm0, [leftq+1]
     vpermq          m0, m0, q0110
@@ -420,23 +416,15 @@ cglobal cdef_filter_%1x%2_8bpc, 5, 10, 0, dst, stride, left, top, bot, \
     movu    [rsp+0x28], m1
     movu    [rsp+0x40], m2
 %elif %1 == 4
-    PUSH           r12
- %assign regs_used 13
- %if STACK_ALIGNMENT < 32
-    PUSH  r%+regs_used
-   %assign regs_used regs_used+1
- %endif
+%assign regs_used 14
+    PUSH           r13
     ALLOC_STACK 8*2+%1*%2*1, 16
     pmovzxwd        m0, [leftq]
     mova    [rsp+0x10], m0
 %else
-    PUSH           r12
+%assign regs_used 15
     PUSH           r13
- %assign regs_used 14
- %if STACK_ALIGNMENT < 32
-    PUSH  r%+regs_used
-  %assign regs_used regs_used+1
- %endif
+    PUSH           r14
     ALLOC_STACK 8*4+%1*%2*2+32, 16
     lea            r11, [strideq*3]
     movu           xm4, [dstq+strideq*2]
@@ -1209,11 +1197,7 @@ cglobal cdef_filter_%1x%2_8bpc, 5, 10, 0, dst, stride, left, top, bot, \
  DEFINE_ARGS dst, stride, left, top, bot, pri, sec, stride3, dst4, edge
 %define rstk rsp
 %assign stack_offset stack_offset_entry
-%assign regs_used 10
-%if STACK_ALIGNMENT < 32
-    PUSH  r%+regs_used
- %assign regs_used regs_used+1
-%endif
+%assign regs_used 11
     ALLOC_STACK 2*16+(%2+4)*32, 16
 %define px rsp+2*16+2*32
 
diff --git a/src/x86/ipred16_avx2.asm b/src/x86/ipred16_avx2.asm
index 72300c2a4..7ddb18991 100644
--- a/src/x86/ipred16_avx2.asm
+++ b/src/x86/ipred16_avx2.asm
@@ -1936,11 +1936,7 @@ ALIGN function_align
 .upsample_left: ; h4/h8
     mova                xm0, [tlq-16]            ; 8 7 6 5 4 3 2 1
     movu                xm1, [tlq-14]            ; 7 6 5 4 3 2 1 0
-%if STACK_ALIGNMENT < 32
     vpbroadcastw        xm4, r8m ; pixel_max
-%else
-    vpbroadcastw        xm4, r9m ; r8m -> r9m due to call
-%endif
     cmp                  hd, 8
     je .upsample_left_h8
     pshufhw             xm2, xm0, q2100          ; _ _ _ _ 4 4 3 2
diff --git a/src/x86/mc16_avx2.asm b/src/x86/mc16_avx2.asm
index 8b2ec4fa9..61eeaa100 100644
--- a/src/x86/mc16_avx2.asm
+++ b/src/x86/mc16_avx2.asm
@@ -2650,23 +2650,14 @@ cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, stride, w, h, mx, my
 %ifidn %1, put
  %assign isput  1
  %assign isprep 0
- %if required_stack_alignment <= STACK_ALIGNMENT
-cglobal put_8tap_scaled_16bpc, 4, 15, 16, 0xe0, dst, ds, src, ss, w, h, mx, my, dx, dy, pxmax
- %else
 cglobal put_8tap_scaled_16bpc, 4, 14, 16, 0xe0, dst, ds, src, ss, w, h, mx, my, dx, dy, pxmax
- %endif
  %xdefine base_reg r12
     mov                 r7d, pxmaxm
 %else
  %assign isput  0
  %assign isprep 1
- %if required_stack_alignment <= STACK_ALIGNMENT
-cglobal prep_8tap_scaled_16bpc, 4, 15, 16, 0xe0, tmp, src, ss, w, h, mx, my, dx, dy, pxmax
-  %xdefine tmp_stridem r14q
- %else
 cglobal prep_8tap_scaled_16bpc, 4, 14, 16, 0xe0, tmp, src, ss, w, h, mx, my, dx, dy, pxmax
   %define tmp_stridem qword [rsp+0xd0]
- %endif
  %xdefine base_reg r11
 %endif
     lea            base_reg, [%1_8tap_scaled_16bpc_avx2]
@@ -2698,15 +2689,9 @@ cglobal prep_8tap_scaled_16bpc, 4, 14, 16, 0xe0, tmp, src, ss, w, h, mx, my, dx,
   DEFINE_ARGS dst, ds, src, ss, w, h, _, my, dx, dy, ss3
   %define hm r6m
  %endif
- %if required_stack_alignment > STACK_ALIGNMENT
-  %define dsm [rsp+0x98]
-  %define rX r1
-  %define rXd r1d
- %else
-  %define dsm dsq
-  %define rX r14
-  %define rXd r14d
- %endif
+ %define dsm [rsp+0x98]
+ %define rX r1
+ %define rXd r1d
 %else ; prep
  %if WIN64
     mov                 r7d, hm
@@ -3580,9 +3565,7 @@ cglobal prep_8tap_scaled_16bpc, 4, 14, 16, 0xe0, tmp, src, ss, w, h, mx, my, dx,
     ; m1=mx, m7=pxmax, m10=h_rnd, m11=h_sh, m12=free
     mov                 myd, mym
 %if isput
- %if required_stack_alignment > STACK_ALIGNMENT
-  %define dsm [rsp+0xb8]
- %endif
+ %define dsm [rsp+0xb8]
     movifnidn           dsm, dsq
     mova         [rsp+0xc0], xm7
 %else
diff --git a/src/x86/mc_avx2.asm b/src/x86/mc_avx2.asm
index 2719ef361..3b208033b 100644
--- a/src/x86/mc_avx2.asm
+++ b/src/x86/mc_avx2.asm
@@ -2721,22 +2721,13 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3
 %macro MC_8TAP_SCALED 1
 %ifidn %1, put
  %assign isprep 0
- %if required_stack_alignment <= STACK_ALIGNMENT
-cglobal put_8tap_scaled_8bpc, 4, 15, 16, 112, dst, ds, src, ss, w, h, mx, my, dx, dy
- %else
 cglobal put_8tap_scaled_8bpc, 4, 14, 16, 128, dst, ds, src, ss, w, h, mx, my, dx, dy
- %endif
  %xdefine base_reg r12
  %define rndshift 10
 %else
  %assign isprep 1
- %if required_stack_alignment <= STACK_ALIGNMENT
-cglobal prep_8tap_scaled_8bpc, 4, 15, 16, 128, tmp, src, ss, w, h, mx, my, dx, dy
-  %xdefine tmp_stridem r14q
- %else
 cglobal prep_8tap_scaled_8bpc, 4, 14, 16, 128, tmp, src, ss, w, h, mx, my, dx, dy
-  %define tmp_stridem qword [rsp+120]
- %endif
+ %define tmp_stridem qword [rsp+120]
  %xdefine base_reg r11
  %define rndshift 6
 %endif
@@ -2763,15 +2754,9 @@ cglobal prep_8tap_scaled_8bpc, 4, 14, 16, 128, tmp, src, ss, w, h, mx, my, dx, d
   DEFINE_ARGS dst, ds, src, ss, w, h, _, my, dx, dy, ss3
   %define hm r6m
  %endif
- %if required_stack_alignment > STACK_ALIGNMENT
-  %define dsm [rsp+112]
-  %define rX r1
-  %define rXd r1d
- %else
-  %define dsm dsq
-  %define rX r14
-  %define rXd r14d
- %endif
+ %define dsm [rsp+112]
+ %define rX r1
+ %define rXd r1d
 %else ; prep
  %if WIN64
     mov                 r7d, hm
diff --git a/src/x86/msac.asm b/src/x86/msac.asm
index 92a3a731d..9f05c921a 100644
--- a/src/x86/msac.asm
+++ b/src/x86/msac.asm
@@ -619,7 +619,6 @@ cglobal msac_decode_symbol_adapt16, 3, 6, 6
     mov           t3d, [t0+msac.update_cdf]
     mov           t4d, t2d
     not            t2
-%if STACK_ALIGNMENT < 32
     mov            r5, rsp
 %if WIN64
     and           rsp, ~31
@@ -627,11 +626,6 @@ cglobal msac_decode_symbol_adapt16, 3, 6, 6
 %else
     and            r5, ~31
     %define buf r5-32
-%endif
-%elif WIN64
-    sub           rsp, 64
-%else
-    %define buf rsp-56
 %endif
     psrlw          m1, m0, 6
     movd      [buf-4], xm2
@@ -666,11 +660,7 @@ cglobal msac_decode_symbol_adapt16, 3, 6, 6
     movzx         t2d, word [buf+rax-2]
     shr           eax, 1
 %if WIN64
-%if STACK_ALIGNMENT < 32
     mov           rsp, r5
-%else
-    add           rsp, 64
-%endif
 %endif
     vzeroupper
     jmp m(msac_decode_symbol_adapt4, _sse2).renorm2
diff --git a/tests/libfuzzer/meson.build b/tests/libfuzzer/meson.build
index 35914033d..45d28562c 100644
--- a/tests/libfuzzer/meson.build
+++ b/tests/libfuzzer/meson.build
@@ -51,7 +51,6 @@ endif
 dav1d_fuzzer = executable('dav1d_fuzzer',
     dav1d_fuzzer_sources,
     include_directories: dav1d_inc_dirs,
-    c_args: [stackalign_flag, stackrealign_flag],
     link_args: fuzzer_ldflags,
     link_with : libdav1d,
     build_by_default: true,
@@ -62,7 +61,7 @@ dav1d_fuzzer = executable('dav1d_fuzzer',
 dav1d_fuzzer_mt = executable('dav1d_fuzzer_mt',
     dav1d_fuzzer_sources,
     include_directories: dav1d_inc_dirs,
-    c_args: [stackalign_flag, stackrealign_flag, '-DDAV1D_MT_FUZZING'],
+    c_args: ['-DDAV1D_MT_FUZZING'],
     link_args: fuzzer_ldflags,
     link_with : libdav1d,
     build_by_default: true,
@@ -92,7 +91,7 @@ if (objcopy.found() and
     dav1d_fuzzer_mem = executable('dav1d_fuzzer_mem',
         dav1d_fuzzer_sources + ['alloc_fail.c'],
         include_directories: dav1d_inc_dirs,
-        c_args: [stackalign_flag, stackrealign_flag, '-DDAV1D_ALLOC_FAIL'],
+        c_args: ['-DDAV1D_ALLOC_FAIL'],
         link_args: fuzzer_ldflags + [join_paths(libdav1d_af.full_path())],
         link_depends: libdav1d_af,
         build_by_default: false,
diff --git a/tests/meson.build b/tests/meson.build
index 30358d237..f0f0eb713 100644
--- a/tests/meson.build
+++ b/tests/meson.build
@@ -54,7 +54,7 @@ if is_asm_enabled
             'checkasm_bitdepth_@0@'.format(bitdepth),
             checkasm_tmpl_sources,
             include_directories: dav1d_inc_dirs,
-            c_args: ['-DBITDEPTH=@0@'.format(bitdepth), stackalign_flag],
+            c_args: ['-DBITDEPTH=@0@'.format(bitdepth)],
             install: false,
             build_by_default: false,
         )
@@ -87,7 +87,6 @@ if is_asm_enabled
             ],
 
         include_directories: dav1d_inc_dirs,
-        c_args: [stackalign_flag, stackrealign_flag],
         build_by_default: false,
         dependencies : [
             thread_dependency,
-- 
GitLab