From 0d6a31f275dc73dabf9fd8742a4adaa9ecc13da6 Mon Sep 17 00:00:00 2001 From: Henrik Gramner <gramner@twoorioles.com> Date: Thu, 12 Jan 2023 01:49:39 +0100 Subject: [PATCH] x86: Remove stack alignment compiler flags The intent was good, but in practice it results in a significant number of problems due to various compiler bugs for negligible gains. --- meson.build | 34 +++------------------------------- src/meson.build | 27 +++++---------------------- src/x86/cdef_avx2.asm | 32 ++++++++------------------------ src/x86/ipred16_avx2.asm | 4 ---- src/x86/mc16_avx2.asm | 25 ++++--------------------- src/x86/mc_avx2.asm | 23 ++++------------------- src/x86/msac.asm | 10 ---------- tests/libfuzzer/meson.build | 5 ++--- tests/meson.build | 3 +-- 9 files changed, 27 insertions(+), 136 deletions(-) diff --git a/meson.build b/meson.build index cee8b791e..0402f73d3 100644 --- a/meson.build +++ b/meson.build @@ -319,45 +319,17 @@ if fuzzing_engine == 'libfuzzer' add_project_arguments(cc.first_supported_argument(fuzzer_args), language : 'c') endif -# Stack alignments flags - -stackalign_flag = [] -stackrealign_flag = [] - cdata.set10('ENDIANNESS_BIG', host_machine.endian() == 'big') if host_machine.cpu_family().startswith('x86') if get_option('stack_alignment') > 0 stack_alignment = get_option('stack_alignment') - elif host_machine.cpu_family() == 'x86_64' - if cc.has_argument('-mpreferred-stack-boundary=6') - stackalign_flag = ['-mpreferred-stack-boundary=6'] - stackrealign_flag = ['-mincoming-stack-boundary=4'] - stack_alignment = 32 - elif cc.has_argument('-mstack-alignment=64') - stackalign_flag = ['-mstack-alignment=64'] - stackrealign_flag = ['-mstackrealign'] - stack_alignment = 32 - else - stack_alignment = 16 - endif + elif host_machine.cpu_family() == 'x86_64' or host_machine.system() in ['linux', 'darwin', 'ios', 'tvos'] + stack_alignment = 16 else - if host_machine.system() == 'linux' or host_machine.system() in ['darwin', 'ios', 'tvos'] - stack_alignment = 16 - elif 
cc.has_argument('-mpreferred-stack-boundary=4') - stackalign_flag = ['-mpreferred-stack-boundary=4'] - stackrealign_flag = ['-mincoming-stack-boundary=2'] - stack_alignment = 16 - elif cc.has_argument('-mstack-alignment=16') - stackalign_flag = ['-mstack-alignment=16'] - stackrealign_flag = ['-mstackrealign'] - stack_alignment = 16 - else - stack_alignment = 4 - endif + stack_alignment = 4 endif cdata_asm.set('STACK_ALIGNMENT', stack_alignment) - cdata.set('STACK_ALIGNMENT', stack_alignment) endif cdata.set10('ARCH_AARCH64', host_machine.cpu_family() == 'aarch64' or host_machine.cpu() == 'arm64') diff --git a/src/meson.build b/src/meson.build index 719015496..2ace55e1a 100644 --- a/src/meson.build +++ b/src/meson.build @@ -37,6 +37,7 @@ libdav1d_sources = files( 'intra_edge.c', 'itx_1d.c', 'lf_mask.c', + 'lib.c', 'log.c', 'mem.c', 'msac.c', @@ -47,6 +48,7 @@ libdav1d_sources = files( 'refmvs.c', 'scan.c', 'tables.c', + 'thread_task.c', 'warpmv.c', 'wedge.c', ) @@ -74,14 +76,6 @@ libdav1d_arch_tmpl_sources = [] libdav1d_bitdepth_objs = [] -# libdav1d entrypoint source files -# These source files contain library entry points and are -# built with the stack-realign flag set, where necessary. 
-libdav1d_entrypoints_sources = files( - 'lib.c', - 'thread_task.c' -) - # ASM specific sources libdav1d_asm_objs = [] # Arch-specific flags @@ -245,7 +239,7 @@ endif libdav1d_rc_obj = [] -libdav1d_flags = [stackalign_flag] +libdav1d_flags = [] api_export_flags = [] # @@ -280,18 +274,6 @@ endif # Library definitions # -# Helper library for dav1d entrypoints -libdav1d_entrypoints_objs = static_library('dav1d_entrypoint', - libdav1d_entrypoints_sources, - rev_target, config_h_target, - - include_directories : dav1d_inc_dirs, - dependencies: [stdatomic_dependencies], - c_args : [libdav1d_flags, stackrealign_flag, api_export_flags], - install : false, - build_by_default : false, -).extract_all_objects(recursive: true) - # Helper library for each bitdepth libdav1d_bitdepth_objs = [] foreach bitdepth : dav1d_bitdepths @@ -330,10 +312,11 @@ libdav1d = library('dav1d', libdav1d_sources, libdav1d_asm_objs, libdav1d_rc_obj, + rev_target, + config_h_target, objects : [ libdav1d_bitdepth_objs, - libdav1d_entrypoints_objs ], include_directories : dav1d_inc_dirs, diff --git a/src/x86/cdef_avx2.asm b/src/x86/cdef_avx2.asm index c5c66c759..1f30f8a3b 100644 --- a/src/x86/cdef_avx2.asm +++ b/src/x86/cdef_avx2.asm @@ -396,21 +396,17 @@ SECTION .text %macro CDEF_FILTER 2 ; w, h INIT_YMM avx2 -cglobal cdef_filter_%1x%2_8bpc, 5, 10, 0, dst, stride, left, top, bot, \ +cglobal cdef_filter_%1x%2_8bpc, 5, 11, 0, dst, stride, left, top, bot, \ pri, sec, dir, damping, edge %assign stack_offset_entry stack_offset mov edged, edgem cmp edged, 0xf jne .border_block - PUSH r10 PUSH r11 + PUSH r12 %if %2 == 4 - %assign regs_used 12 - %if STACK_ALIGNMENT < 32 - PUSH r%+regs_used - %assign regs_used regs_used+1 - %endif +%assign regs_used 13 ALLOC_STACK 0x60, 16 pmovzxbw xm0, [leftq+1] vpermq m0, m0, q0110 @@ -420,23 +416,15 @@ cglobal cdef_filter_%1x%2_8bpc, 5, 10, 0, dst, stride, left, top, bot, \ movu [rsp+0x28], m1 movu [rsp+0x40], m2 %elif %1 == 4 - PUSH r12 - %assign regs_used 13 - %if 
STACK_ALIGNMENT < 32 - PUSH r%+regs_used - %assign regs_used regs_used+1 - %endif +%assign regs_used 14 + PUSH r13 ALLOC_STACK 8*2+%1*%2*1, 16 pmovzxwd m0, [leftq] mova [rsp+0x10], m0 %else - PUSH r12 +%assign regs_used 15 PUSH r13 - %assign regs_used 14 - %if STACK_ALIGNMENT < 32 - PUSH r%+regs_used - %assign regs_used regs_used+1 - %endif + PUSH r14 ALLOC_STACK 8*4+%1*%2*2+32, 16 lea r11, [strideq*3] movu xm4, [dstq+strideq*2] @@ -1209,11 +1197,7 @@ cglobal cdef_filter_%1x%2_8bpc, 5, 10, 0, dst, stride, left, top, bot, \ DEFINE_ARGS dst, stride, left, top, bot, pri, sec, stride3, dst4, edge %define rstk rsp %assign stack_offset stack_offset_entry -%assign regs_used 10 -%if STACK_ALIGNMENT < 32 - PUSH r%+regs_used - %assign regs_used regs_used+1 -%endif +%assign regs_used 11 ALLOC_STACK 2*16+(%2+4)*32, 16 %define px rsp+2*16+2*32 diff --git a/src/x86/ipred16_avx2.asm b/src/x86/ipred16_avx2.asm index 72300c2a4..7ddb18991 100644 --- a/src/x86/ipred16_avx2.asm +++ b/src/x86/ipred16_avx2.asm @@ -1936,11 +1936,7 @@ ALIGN function_align .upsample_left: ; h4/h8 mova xm0, [tlq-16] ; 8 7 6 5 4 3 2 1 movu xm1, [tlq-14] ; 7 6 5 4 3 2 1 0 -%if STACK_ALIGNMENT < 32 vpbroadcastw xm4, r8m ; pixel_max -%else - vpbroadcastw xm4, r9m ; r8m -> r9m due to call -%endif cmp hd, 8 je .upsample_left_h8 pshufhw xm2, xm0, q2100 ; _ _ _ _ 4 4 3 2 diff --git a/src/x86/mc16_avx2.asm b/src/x86/mc16_avx2.asm index 8b2ec4fa9..61eeaa100 100644 --- a/src/x86/mc16_avx2.asm +++ b/src/x86/mc16_avx2.asm @@ -2650,23 +2650,14 @@ cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, stride, w, h, mx, my %ifidn %1, put %assign isput 1 %assign isprep 0 - %if required_stack_alignment <= STACK_ALIGNMENT -cglobal put_8tap_scaled_16bpc, 4, 15, 16, 0xe0, dst, ds, src, ss, w, h, mx, my, dx, dy, pxmax - %else cglobal put_8tap_scaled_16bpc, 4, 14, 16, 0xe0, dst, ds, src, ss, w, h, mx, my, dx, dy, pxmax - %endif %xdefine base_reg r12 mov r7d, pxmaxm %else %assign isput 0 %assign isprep 1 - %if required_stack_alignment <= 
STACK_ALIGNMENT -cglobal prep_8tap_scaled_16bpc, 4, 15, 16, 0xe0, tmp, src, ss, w, h, mx, my, dx, dy, pxmax - %xdefine tmp_stridem r14q - %else cglobal prep_8tap_scaled_16bpc, 4, 14, 16, 0xe0, tmp, src, ss, w, h, mx, my, dx, dy, pxmax %define tmp_stridem qword [rsp+0xd0] - %endif %xdefine base_reg r11 %endif lea base_reg, [%1_8tap_scaled_16bpc_avx2] @@ -2698,15 +2689,9 @@ cglobal prep_8tap_scaled_16bpc, 4, 14, 16, 0xe0, tmp, src, ss, w, h, mx, my, dx, DEFINE_ARGS dst, ds, src, ss, w, h, _, my, dx, dy, ss3 %define hm r6m %endif - %if required_stack_alignment > STACK_ALIGNMENT - %define dsm [rsp+0x98] - %define rX r1 - %define rXd r1d - %else - %define dsm dsq - %define rX r14 - %define rXd r14d - %endif + %define dsm [rsp+0x98] + %define rX r1 + %define rXd r1d %else ; prep %if WIN64 mov r7d, hm @@ -3580,9 +3565,7 @@ cglobal prep_8tap_scaled_16bpc, 4, 14, 16, 0xe0, tmp, src, ss, w, h, mx, my, dx, ; m1=mx, m7=pxmax, m10=h_rnd, m11=h_sh, m12=free mov myd, mym %if isput - %if required_stack_alignment > STACK_ALIGNMENT - %define dsm [rsp+0xb8] - %endif + %define dsm [rsp+0xb8] movifnidn dsm, dsq mova [rsp+0xc0], xm7 %else diff --git a/src/x86/mc_avx2.asm b/src/x86/mc_avx2.asm index 2719ef361..3b208033b 100644 --- a/src/x86/mc_avx2.asm +++ b/src/x86/mc_avx2.asm @@ -2721,22 +2721,13 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3 %macro MC_8TAP_SCALED 1 %ifidn %1, put %assign isprep 0 - %if required_stack_alignment <= STACK_ALIGNMENT -cglobal put_8tap_scaled_8bpc, 4, 15, 16, 112, dst, ds, src, ss, w, h, mx, my, dx, dy - %else cglobal put_8tap_scaled_8bpc, 4, 14, 16, 128, dst, ds, src, ss, w, h, mx, my, dx, dy - %endif %xdefine base_reg r12 %define rndshift 10 %else %assign isprep 1 - %if required_stack_alignment <= STACK_ALIGNMENT -cglobal prep_8tap_scaled_8bpc, 4, 15, 16, 128, tmp, src, ss, w, h, mx, my, dx, dy - %xdefine tmp_stridem r14q - %else cglobal prep_8tap_scaled_8bpc, 4, 14, 16, 128, tmp, src, ss, w, h, mx, my, dx, dy - %define 
tmp_stridem qword [rsp+120] - %endif + %define tmp_stridem qword [rsp+120] %xdefine base_reg r11 %define rndshift 6 %endif @@ -2763,15 +2754,9 @@ cglobal prep_8tap_scaled_8bpc, 4, 14, 16, 128, tmp, src, ss, w, h, mx, my, dx, d DEFINE_ARGS dst, ds, src, ss, w, h, _, my, dx, dy, ss3 %define hm r6m %endif - %if required_stack_alignment > STACK_ALIGNMENT - %define dsm [rsp+112] - %define rX r1 - %define rXd r1d - %else - %define dsm dsq - %define rX r14 - %define rXd r14d - %endif + %define dsm [rsp+112] + %define rX r1 + %define rXd r1d %else ; prep %if WIN64 mov r7d, hm diff --git a/src/x86/msac.asm b/src/x86/msac.asm index 92a3a731d..9f05c921a 100644 --- a/src/x86/msac.asm +++ b/src/x86/msac.asm @@ -619,7 +619,6 @@ cglobal msac_decode_symbol_adapt16, 3, 6, 6 mov t3d, [t0+msac.update_cdf] mov t4d, t2d not t2 -%if STACK_ALIGNMENT < 32 mov r5, rsp %if WIN64 and rsp, ~31 @@ -627,11 +626,6 @@ cglobal msac_decode_symbol_adapt16, 3, 6, 6 %else and r5, ~31 %define buf r5-32 -%endif -%elif WIN64 - sub rsp, 64 -%else - %define buf rsp-56 %endif psrlw m1, m0, 6 movd [buf-4], xm2 @@ -666,11 +660,7 @@ cglobal msac_decode_symbol_adapt16, 3, 6, 6 movzx t2d, word [buf+rax-2] shr eax, 1 %if WIN64 -%if STACK_ALIGNMENT < 32 mov rsp, r5 -%else - add rsp, 64 -%endif %endif vzeroupper jmp m(msac_decode_symbol_adapt4, _sse2).renorm2 diff --git a/tests/libfuzzer/meson.build b/tests/libfuzzer/meson.build index 35914033d..45d28562c 100644 --- a/tests/libfuzzer/meson.build +++ b/tests/libfuzzer/meson.build @@ -51,7 +51,6 @@ endif dav1d_fuzzer = executable('dav1d_fuzzer', dav1d_fuzzer_sources, include_directories: dav1d_inc_dirs, - c_args: [stackalign_flag, stackrealign_flag], link_args: fuzzer_ldflags, link_with : libdav1d, build_by_default: true, @@ -62,7 +61,7 @@ dav1d_fuzzer = executable('dav1d_fuzzer', dav1d_fuzzer_mt = executable('dav1d_fuzzer_mt', dav1d_fuzzer_sources, include_directories: dav1d_inc_dirs, - c_args: [stackalign_flag, stackrealign_flag, '-DDAV1D_MT_FUZZING'], + c_args: 
['-DDAV1D_MT_FUZZING'], link_args: fuzzer_ldflags, link_with : libdav1d, build_by_default: true, @@ -92,7 +91,7 @@ if (objcopy.found() and dav1d_fuzzer_mem = executable('dav1d_fuzzer_mem', dav1d_fuzzer_sources + ['alloc_fail.c'], include_directories: dav1d_inc_dirs, - c_args: [stackalign_flag, stackrealign_flag, '-DDAV1D_ALLOC_FAIL'], + c_args: ['-DDAV1D_ALLOC_FAIL'], link_args: fuzzer_ldflags + [join_paths(libdav1d_af.full_path())], link_depends: libdav1d_af, build_by_default: false, diff --git a/tests/meson.build b/tests/meson.build index 30358d237..f0f0eb713 100644 --- a/tests/meson.build +++ b/tests/meson.build @@ -54,7 +54,7 @@ if is_asm_enabled 'checkasm_bitdepth_@0@'.format(bitdepth), checkasm_tmpl_sources, include_directories: dav1d_inc_dirs, - c_args: ['-DBITDEPTH=@0@'.format(bitdepth), stackalign_flag], + c_args: ['-DBITDEPTH=@0@'.format(bitdepth)], install: false, build_by_default: false, ) @@ -87,7 +87,6 @@ if is_asm_enabled ], include_directories: dav1d_inc_dirs, - c_args: [stackalign_flag, stackrealign_flag], build_by_default: false, dependencies : [ thread_dependency, -- GitLab