Commit b073e870 authored by Ronald S. Bultje, committed by Fiona Glaser

x86inc: support stack mem allocation and re-alignment in PROLOGUE

Use this in 8-bit loopfilter functions so they can be used if
there is no aligned stack (e.g. x86-32 MSVC or ICC 10.x).
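As an illustrative sketch (hypothetical function, not part of this commit),
the new optional stack-size parameter is used like this:

    cglobal example, 3,4,8,32 ; 3 args, 4 gprs, 8 xmm regs, 32 bytes of stack
        mova [rsp], m0        ; rsp is suitably aligned for mova here
        mova m1, [rsp]
        RET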
parent 9d5ec55b
common/deblock.c
@@ -779,13 +779,13 @@ void x264_deblock_init( int cpu, x264_deblock_function_t *pf, int b_mbaff )
pf->deblock_h_chroma_422 = x264_deblock_h_chroma_422_sse2;
pf->deblock_h_chroma_422_intra = x264_deblock_h_chroma_422_intra_sse2;
pf->deblock_chroma_420_mbaff = x264_deblock_h_chroma_mbaff_sse2;
+pf->deblock_luma[1] = x264_deblock_v_luma_sse2;
+pf->deblock_luma[0] = x264_deblock_h_luma_sse2;
+pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_sse2;
+pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_sse2;
if( !(cpu&X264_CPU_STACK_MOD4) )
{
-pf->deblock_luma[1] = x264_deblock_v_luma_sse2;
-pf->deblock_luma[0] = x264_deblock_h_luma_sse2;
pf->deblock_chroma[1] = x264_deblock_v_chroma_sse2;
-pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_sse2;
-pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_sse2;
pf->deblock_chroma_intra[1] = x264_deblock_v_chroma_intra_sse2;
pf->deblock_h_chroma_420_intra = x264_deblock_h_chroma_intra_sse2;
#if HIGH_BIT_DEPTH
@@ -801,13 +801,13 @@ void x264_deblock_init( int cpu, x264_deblock_function_t *pf, int b_mbaff )
pf->deblock_h_chroma_420 = x264_deblock_h_chroma_avx;
pf->deblock_h_chroma_422 = x264_deblock_h_chroma_422_avx;
pf->deblock_h_chroma_422_intra = x264_deblock_h_chroma_422_intra_avx;
+pf->deblock_luma[1] = x264_deblock_v_luma_avx;
+pf->deblock_luma[0] = x264_deblock_h_luma_avx;
+pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_avx;
+pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_avx;
if( !(cpu&X264_CPU_STACK_MOD4) )
{
-pf->deblock_luma[1] = x264_deblock_v_luma_avx;
-pf->deblock_luma[0] = x264_deblock_h_luma_avx;
pf->deblock_chroma[1] = x264_deblock_v_chroma_avx;
-pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_avx;
-pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_avx;
pf->deblock_chroma_intra[1] = x264_deblock_v_chroma_intra_avx;
pf->deblock_h_chroma_420_intra = x264_deblock_h_chroma_intra_avx;
#if HIGH_BIT_DEPTH
common/x86/deblock-a.asm
@@ -1205,14 +1205,12 @@ DEBLOCK_LUMA
;-----------------------------------------------------------------------------
; void deblock_v8_luma( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
-cglobal deblock_%1_luma, 5,5
+cglobal deblock_%1_luma, 5,5,8,2*%2
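; 5 args, 5 gprs, 8 xmm regs, 2*%2 bytes of stack: PROLOGUE now does the
; allocation and alignment previously handled by the removed SUB esp, pad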
lea r4, [r1*3]
dec r2 ; alpha-1
neg r4
dec r3 ; beta-1
add r4, r0 ; pix-3*stride
-%assign pad 2*%2+12-(stack_offset&15)
-SUB esp, pad
mova m0, [r4+r1] ; p1
mova m1, [r4+2*r1] ; p0
@@ -1251,22 +1249,19 @@ cglobal deblock_%1_luma, 5,5
DEBLOCK_P0_Q0
mova [r4+2*r1], m1
mova [r0], m2
-ADD esp, pad
RET
;-----------------------------------------------------------------------------
; void deblock_h_luma( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
INIT_MMX cpuname
-cglobal deblock_h_luma, 0,5
+cglobal deblock_h_luma, 0,5,8,0x60+HAVE_ALIGNED_STACK*12
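; 0x60 bytes of transpose scratch; the 12 extra bytes on aligned-stack
; targets keep pix_tmp at its old esp+12 offset (see the pix_tmp define below)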
mov r0, r0mp
mov r3, r1m
lea r4, [r3*3]
sub r0, 4
lea r1, [r0+r4]
-%assign pad 0x78-(stack_offset&15)
-SUB esp, pad
-%define pix_tmp esp+12
+%define pix_tmp esp+12*HAVE_ALIGNED_STACK
; transpose 6x16 -> tmp space
TRANSPOSE6x8_MEM PASS8ROWS(r0, r1, r3, r4), pix_tmp
@@ -1308,7 +1303,6 @@ cglobal deblock_h_luma, 0,5
movq m3, [pix_tmp+0x48]
TRANSPOSE8x4B_STORE PASS8ROWS(r0, r1, r3, r4)
-ADD esp, pad
RET
%endmacro ; DEBLOCK_LUMA
@@ -1439,7 +1433,7 @@ DEBLOCK_LUMA v, 16
%define mpb_0 m14
%define mpb_1 m15
%else
-%define spill(x) [esp+16*x+((stack_offset+4)&15)]
+%define spill(x) [esp+16*x]
%define p2 [r4+r1]
%define q2 [r0+2*r1]
%define t4 spill(0)
@@ -1454,10 +1448,7 @@ DEBLOCK_LUMA v, 16
;-----------------------------------------------------------------------------
; void deblock_v_luma_intra( uint8_t *pix, intptr_t stride, int alpha, int beta )
;-----------------------------------------------------------------------------
-cglobal deblock_%1_luma_intra, 4,6,16
-%if ARCH_X86_64 == 0
-sub esp, 0x60
-%endif
+cglobal deblock_%1_luma_intra, 4,6,16,ARCH_X86_64*0x50-0x50
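; stack size evaluates to 0 on x86-64 and -0x50 on x86-32; the negative value
; requests stack space with the original rsp spilled to the stack instead of
; tying up an extra gpr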
lea r4, [r1*4]
lea r5, [r1*3] ; 3*stride
dec r2d ; alpha-1
@@ -1506,9 +1497,6 @@ cglobal deblock_%1_luma_intra, 4,6,16
LUMA_INTRA_SWAP_PQ
LUMA_INTRA_P012 [r0], [r0+r1], [r0+2*r1], [r0+r5]
.end:
-%if ARCH_X86_64 == 0
-add esp, 0x60
-%endif
RET
INIT_MMX cpuname
@@ -1545,12 +1533,10 @@ cglobal deblock_h_luma_intra, 4,9
add rsp, 0x88
RET
%else
-cglobal deblock_h_luma_intra, 2,4
+cglobal deblock_h_luma_intra, 2,4,8,0x80
lea r3, [r1*3]
sub r0, 4
lea r2, [r0+r3]
-%assign pad 0x8c-(stack_offset&15)
-SUB rsp, pad
%define pix_tmp rsp
; transpose 8x16 -> tmp space
@@ -1581,7 +1567,6 @@ cglobal deblock_h_luma_intra, 2,4
lea r0, [r0+r1*8]
lea r2, [r2+r1*8]
TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3)
-ADD rsp, pad
RET
%endif ; ARCH_X86_64
%endmacro ; DEBLOCK_LUMA_INTRA
common/x86/x86inc.asm
@@ -105,7 +105,12 @@ CPU amdnop
; %1 = number of arguments. loads them from stack if needed.
; %2 = number of registers used. pushes callee-saved regs if needed.
; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed.
-; %4 = list of names to define to registers
+; %4 = (optional) stack size to be allocated. If not aligned (x86-32 ICC 10.x,
+;      MSVC or YMM), the stack will be manually aligned (to 16 or 32 bytes),
+;      and an extra register will be allocated to hold the original stack
+;      pointer (to not invalidate r0m etc.). To prevent the use of an extra
+;      register as stack pointer, request a negative stack size.
+; %4+/%5+ = list of names to define to registers
; PROLOGUE can also be invoked by adding the same options to cglobal
; e.g.
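; (illustrative, hypothetical names)
; cglobal foo, 3,4,8,32  ; 32 bytes of stack; a spare gpr may hold the old rsp
; cglobal bar, 3,4,8,-32 ; 32 bytes of stack; old rsp is saved on the stack itself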
@@ -140,11 +145,11 @@ CPU amdnop
%define r%1m %2d
%define r%1mp %2
%elif ARCH_X86_64 ; memory
-%define r%1m [rsp + stack_offset + %3]
-%define r%1mp qword r %+ %1m
+%define r%1m [rstk + stack_offset + %3]
+%define r%1mp qword r %+ %1 %+ m
%else
-%define r%1m [esp + stack_offset + %3]
-%define r%1mp dword r %+ %1m
+%define r%1m [rstk + stack_offset + %3]
+%define r%1mp dword r %+ %1 %+ m
%endif
%define r%1 %2
%endmacro
@@ -205,12 +210,16 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
%macro PUSH 1
push %1
-%assign stack_offset stack_offset+gprsize
+%ifidn rstk, rsp
+%assign stack_offset stack_offset+gprsize
+%endif
%endmacro
%macro POP 1
pop %1
-%assign stack_offset stack_offset-gprsize
+%ifidn rstk, rsp
+%assign stack_offset stack_offset-gprsize
+%endif
%endmacro
%macro PUSH_IF_USED 1-*
@@ -242,14 +251,14 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
%macro SUB 2
sub %1, %2
-%ifidn %1, rsp
+%ifidn %1, rstk
%assign stack_offset stack_offset+(%2)
%endif
%endmacro
%macro ADD 2
add %1, %2
-%ifidn %1, rsp
+%ifidn %1, rstk
%assign stack_offset stack_offset-(%2)
%endif
%endmacro
@@ -307,6 +316,79 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
%assign n_arg_names %0
%endmacro
+%macro ALLOC_STACK 1-2 0 ; stack_size, n_xmm_regs (for win64 only)
+%ifnum %1
+%if %1 != 0
+%assign %%stack_alignment ((mmsize + 15) & ~15)
+%assign stack_size %1
+%if stack_size < 0
+%assign stack_size -stack_size
+%endif
+%if mmsize != 8
+%assign xmm_regs_used %2
+%endif
+%if mmsize <= 16 && HAVE_ALIGNED_STACK
+%assign stack_size_padded stack_size + %%stack_alignment - gprsize - (stack_offset & (%%stack_alignment - 1))
+%if xmm_regs_used > 6
+%assign stack_size_padded stack_size_padded + (xmm_regs_used - 6) * 16
+%endif
+SUB rsp, stack_size_padded
+%else
+%assign %%reg_num (regs_used - 1)
+%xdefine rstk r %+ %%reg_num
+; align stack, and save original stack location directly above
+; it, i.e. in [rsp+stack_size_padded], so we can restore the
+; stack in a single instruction (i.e. mov rsp, rstk or mov
+; rsp, [rsp+stack_size_padded])
+mov rstk, rsp
+%assign stack_size_padded stack_size
+%if xmm_regs_used > 6
+%assign stack_size_padded stack_size_padded + (xmm_regs_used - 6) * 16
+%if mmsize == 32 && xmm_regs_used & 1
+; re-align to 32 bytes
+%assign stack_size_padded (stack_size_padded + 16)
+%endif
+%endif
+%if %1 < 0 ; need to store rsp on stack
+sub rsp, gprsize+stack_size_padded
+and rsp, ~(%%stack_alignment-1)
+%xdefine rstkm [rsp+stack_size_padded]
+mov rstkm, rstk
+%else ; can keep rsp in rstk during whole function
+sub rsp, stack_size_padded
+and rsp, ~(%%stack_alignment-1)
+%xdefine rstkm rstk
+%endif
+%endif
+%if xmm_regs_used > 6
+WIN64_PUSH_XMM
+%endif
+%endif
+%endif
+%endmacro
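; rough sketch (not from this commit) of what ALLOC_STACK emits for a positive
; stack size on a target whose stack needs manual re-alignment:
;     mov  rstk, rsp                   ; rstk = gpr reserved by SETUP_STACK_POINTER
;     sub  rsp, stack_size_padded
;     and  rsp, ~(%%stack_alignment-1) ; 16 bytes, or 32 when mmsize == 32
; a negative stack size parks the old rsp at [rsp+stack_size_padded] instead,
; leaving no gpr permanently reserved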
+%macro SETUP_STACK_POINTER 1
+%ifnum %1
+%if %1 != 0 && (HAVE_ALIGNED_STACK == 0 || mmsize == 32)
+%if %1 > 0
+%assign regs_used (regs_used + 1)
+%elif ARCH_X86_64 && regs_used == num_args && num_args <= 4 + UNIX64 * 2
+%warning "Stack pointer will overwrite register argument"
+%endif
+%endif
+%endif
+%endmacro
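; the warning triggers for e.g. (hypothetical) "cglobal foo, 4,4,8,-32" on
; win64: regs_used == num_args, so ALLOC_STACK must clobber a gpr that still
; holds a register argument while shuffling the original rsp onto the stack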
+%macro DEFINE_ARGS_INTERNAL 3+
+%ifnum %2
+DEFINE_ARGS %3
+%elif %1 == 4
+DEFINE_ARGS %2
+%elif %1 > 4
+DEFINE_ARGS %2, %3
+%endif
+%endmacro
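; %1 is PROLOGUE's %0, %2 its %4 and %3 its %5+: if %2 is numeric it was a
; stack size and the register names start at %3; otherwise %2 is already the
; first name (%3 exists only when PROLOGUE got more than 4 parameters)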
%if WIN64 ; Windows x64 ;=================================================
DECLARE_REG 0, rcx
@@ -325,19 +407,27 @@ DECLARE_REG 12, R13, 104
DECLARE_REG 13, R14, 112
DECLARE_REG 14, R15, 120
-%macro PROLOGUE 2-4+ 0 ; #args, #regs, #xmm_regs, arg_names...
+%macro PROLOGUE 2-5+ 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
%assign num_args %1
%assign regs_used %2
ASSERT regs_used >= num_args
+SETUP_STACK_POINTER %4
ASSERT regs_used <= 15
PUSH_IF_USED 7, 8, 9, 10, 11, 12, 13, 14
-%if mmsize == 8
-%assign xmm_regs_used 0
-%else
+ALLOC_STACK %4, %3
+%if mmsize != 8 && stack_size == 0
WIN64_SPILL_XMM %3
%endif
LOAD_IF_USED 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14
-DEFINE_ARGS %4
+DEFINE_ARGS_INTERNAL %0, %4, %5
%endmacro
+%macro WIN64_PUSH_XMM 0
+%assign %%i xmm_regs_used
+%rep (xmm_regs_used-6)
+%assign %%i %%i-1
+movdqa [rsp + (%%i-6)*16 + stack_size + (~stack_offset&8)], xmm %+ %%i
+%endrep
+%endmacro
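; callee-saved xmm regs (xmm6 and up) are spilled above the stack mem area;
; stack_size skips past it and (~stack_offset&8) pads the base so each
; 16-byte movdqa slot stays aligned for any rsp parity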
%macro WIN64_SPILL_XMM 1
@@ -345,11 +435,7 @@ DECLARE_REG 14, R15, 120
ASSERT xmm_regs_used <= 16
%if xmm_regs_used > 6
SUB rsp, (xmm_regs_used-6)*16+16
-%assign %%i xmm_regs_used
-%rep (xmm_regs_used-6)
-%assign %%i %%i-1
-movdqa [rsp + (%%i-6)*16+(~stack_offset&8)], xmm %+ %%i
-%endrep
+WIN64_PUSH_XMM
%endif
%endmacro
@@ -358,19 +444,28 @@ DECLARE_REG 14, R15, 120
%assign %%i xmm_regs_used
%rep (xmm_regs_used-6)
%assign %%i %%i-1
-movdqa xmm %+ %%i, [%1 + (%%i-6)*16+(~stack_offset&8)]
+movdqa xmm %+ %%i, [%1 + (%%i-6)*16+stack_size+(~stack_offset&8)]
%endrep
-add %1, (xmm_regs_used-6)*16+16
+%if stack_size_padded == 0
+add %1, (xmm_regs_used-6)*16+16
+%endif
%endif
+%if stack_size_padded > 0
+%if stack_size > 0 && (mmsize == 32 || HAVE_ALIGNED_STACK == 0)
+mov rsp, rstkm
+%else
+add %1, stack_size_padded
+%endif
+%endif
%endmacro
%macro WIN64_RESTORE_XMM 1
WIN64_RESTORE_XMM_INTERNAL %1
-%assign stack_offset stack_offset-(xmm_regs_used-6)*16+16
+%assign stack_offset (stack_offset-stack_size_padded)
%assign xmm_regs_used 0
%endmacro
-%define has_epilogue regs_used > 7 || xmm_regs_used > 6 || mmsize == 32
+%define has_epilogue regs_used > 7 || xmm_regs_used > 6 || mmsize == 32 || stack_size > 0
%macro RET 0
WIN64_RESTORE_XMM_INTERNAL rsp
@@ -399,19 +494,28 @@ DECLARE_REG 12, R13, 56
DECLARE_REG 13, R14, 64
DECLARE_REG 14, R15, 72
-%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names...
+%macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
%assign num_args %1
%assign regs_used %2
ASSERT regs_used >= num_args
+SETUP_STACK_POINTER %4
ASSERT regs_used <= 15
PUSH_IF_USED 9, 10, 11, 12, 13, 14
+ALLOC_STACK %4
LOAD_IF_USED 6, 7, 8, 9, 10, 11, 12, 13, 14
-DEFINE_ARGS %4
+DEFINE_ARGS_INTERNAL %0, %4, %5
%endmacro
-%define has_epilogue regs_used > 9 || mmsize == 32
+%define has_epilogue regs_used > 9 || mmsize == 32 || stack_size > 0
%macro RET 0
+%if stack_size_padded > 0
+%if mmsize == 32 || HAVE_ALIGNED_STACK == 0
+mov rsp, rstkm
+%else
+add rsp, stack_size_padded
+%endif
+%endif
POP_IF_USED 14, 13, 12, 11, 10, 9
%if mmsize == 32
vzeroupper
@@ -432,7 +536,7 @@ DECLARE_REG 6, ebp, 28
%macro DECLARE_ARG 1-*
%rep %0
-%define r%1m [esp + stack_offset + 4*%1 + 4]
+%define r%1m [rstk + stack_offset + 4*%1 + 4]
%define r%1mp dword r%1m
%rotate 1
%endrep
@@ -440,21 +544,34 @@ DECLARE_REG 6, ebp, 28
DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
-%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names...
+%macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
%assign num_args %1
%assign regs_used %2
-ASSERT regs_used >= num_args
%if num_args > 7
%assign num_args 7
%endif
%if regs_used > 7
%assign regs_used 7
%endif
+ASSERT regs_used >= num_args
+SETUP_STACK_POINTER %4
ASSERT regs_used <= 7
PUSH_IF_USED 3, 4, 5, 6
+ALLOC_STACK %4
LOAD_IF_USED 0, 1, 2, 3, 4, 5, 6
-DEFINE_ARGS %4
+DEFINE_ARGS_INTERNAL %0, %4, %5
%endmacro
-%define has_epilogue regs_used > 3 || mmsize == 32
+%define has_epilogue regs_used > 3 || mmsize == 32 || stack_size > 0
%macro RET 0
+%if stack_size_padded > 0
+%if mmsize == 32 || HAVE_ALIGNED_STACK == 0
+mov rsp, rstkm
+%else
+add rsp, stack_size_padded
+%endif
+%endif
POP_IF_USED 6, 5, 4, 3
%if mmsize == 32
vzeroupper
@@ -469,6 +586,8 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
%endmacro
%macro WIN64_RESTORE_XMM 1
%endmacro
+%macro WIN64_PUSH_XMM 0
+%endmacro
%endif
%macro REP_RET 0
@@ -498,12 +617,10 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
; Applies any symbol mangling needed for C linkage, and sets up a define such that
; subsequent uses of the function name automatically refer to the mangled version.
; Appends cpuflags to the function name if cpuflags has been specified.
-%macro cglobal 1-2+ ; name, [PROLOGUE args]
-%if %0 == 1
-cglobal_internal %1 %+ SUFFIX
-%else
+%macro cglobal 1-2+ "" ; name, [PROLOGUE args]
+; the "" is a workaround for nasm, which fails if SUFFIX is empty
+; and we call cglobal_internal with just %1 %+ SUFFIX (without %2)
cglobal_internal %1 %+ SUFFIX, %2
-%endif
%endmacro
%macro cglobal_internal 1-2+
%ifndef cglobaled_%1
@@ -520,8 +637,12 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
align function_align
%1:
RESET_MM_PERMUTATION ; not really needed, but makes disassembly somewhat nicer
+%xdefine rstk rsp
%assign stack_offset 0
-%if %0 > 1
+%assign stack_size 0
+%assign stack_size_padded 0
+%assign xmm_regs_used 0
+%ifnidn %2, ""
PROLOGUE %2
%endif
%endmacro
configure
@@ -528,6 +528,7 @@ esac
LDFLAGS="$LDFLAGS $libm"
+aligned_stack=1
case $host_cpu in
i*86)
ARCH="X86"
@@ -548,6 +549,7 @@ case $host_cpu in
# < 11 is completely incapable of keeping a mod16 stack
if cpp_check "" "" "__INTEL_COMPILER < 1100" ; then
define BROKEN_STACK_ALIGNMENT
+aligned_stack=0
# 11 <= x < 12 is capable of keeping a mod16 stack, but defaults to not doing so.
elif cpp_check "" "" "__INTEL_COMPILER < 1200" ; then
CFLAGS="$CFLAGS -falign-stack=assume-16-byte"
@@ -555,7 +557,7 @@ case $host_cpu in
# >= 12 defaults to a mod16 stack
fi
# icl on windows has no mod16 stack support
-[ $SYS = WINDOWS ] && define BROKEN_STACK_ALIGNMENT
+[ $SYS = WINDOWS ] && define BROKEN_STACK_ALIGNMENT && aligned_stack=0
fi
if [ "$SYS" = MACOSX ]; then
ASFLAGS="$ASFLAGS -f macho -DPREFIX"
@@ -648,6 +650,7 @@ case $host_cpu in
ARCH="$(echo $host_cpu | tr a-z A-Z)"
;;
esac
+ASFLAGS="$ASFLAGS -DHAVE_ALIGNED_STACK=${aligned_stack}"
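# x86inc uses HAVE_ALIGNED_STACK to decide at assembly time whether PROLOGUE
# must re-align the stack manually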
if [ $SYS = WINDOWS ]; then
if ! rc_check "0 RCDATA {0}" ; then