Commit 275ef533 authored by Henrik Gramner's avatar Henrik Gramner Committed by Anton Mitrofanov

x86: Always use PIC in x86-64 asm

Most x86-64 operating systems nowadays don't even allow .text relocations
in object files any more, and there is no measurable overall performance
difference from using RIP-relative addressing in x264 asm.

Enforcing PIC reduces complexity and simplifies testing.
parent 72db4377
......@@ -36,11 +36,7 @@ SECTION_RODATA 64
%xdefine %%funccpu2 %3 ; last64
%xdefine %%funccpu3 %4 ; last15/last16
coeff_last_%1:
%ifdef PIC
%xdefine %%base coeff_last_%1 ; offset relative to the start of the table
%else
%xdefine %%base 0 ; absolute address
%endif
%xdefine %%base coeff_last_%1
%rep 14
%ifidn %5, 4
dd mangle(private_prefix %+ _coeff_last%5_ %+ %%funccpu1) - %%base
......@@ -121,15 +117,13 @@ struc cb
endstruc
%macro LOAD_GLOBAL 3-5 0 ; dst, base, off1, off2, tmp
%ifdef PIC
%ifidn %4, 0
movzx %1, byte [%2+%3+r7-$$]
%else
lea %5, [r7+%4]
movzx %1, byte [%2+%3+%5-$$]
%endif
%else
%if ARCH_X86_64 == 0
movzx %1, byte [%2+%3+%4]
%elifidn %4, 0
movzx %1, byte [%2+%3+r7-$$]
%else
lea %5, [r7+%4]
movzx %1, byte [%2+%3+%5-$$]
%endif
%endmacro
......@@ -154,9 +148,9 @@ cglobal cabac_encode_decision_%1, 1,7
shr t5d, 6
movifnidn t2d, r2m
%if WIN64
PUSH r7
PUSH r7
%endif
%ifdef PIC
%if ARCH_X86_64
lea r7, [$$]
%endif
LOAD_GLOBAL t5d, cabac_range_lps-4, t5, t4*2, t4
......@@ -183,7 +177,7 @@ cglobal cabac_encode_decision_%1, 1,7
shl t6d, t3b
%endif
%if WIN64
POP r7
POP r7
%endif
mov [t0+cb.range], t4d
add t3d, [t0+cb.queue]
......@@ -278,6 +272,7 @@ cabac_putbyte_%1:
CABAC asm
CABAC bmi2
%if ARCH_X86_64
; %1 = label name
; %2 = node_ctx init?
%macro COEFF_ABS_LEVEL_GT1 2
......@@ -409,13 +404,9 @@ CABAC bmi2
%endmacro
%macro COEFF_LAST 2 ; table, ctx_block_cat
%ifdef PIC
lea r1, [%1 GLOBAL]
movsxd r6, [r1+4*%2]
add r6, r1
%else
movsxd r6, [%1+4*%2]
%endif
call r6
%endmacro
......@@ -436,15 +427,9 @@ CABAC bmi2
%define dct r4
%endif
%ifdef PIC
cglobal func, 4,13,6,-maxcoeffs*SIZEOF_DCTCOEF
cglobal func, 4,13,6,-maxcoeffs*SIZEOF_DCTCOEF
lea r12, [$$]
%define GLOBAL +r12-$$
%else
cglobal func, 4,12,6,-maxcoeffs*SIZEOF_DCTCOEF
%define GLOBAL
%endif
shl r1d, 4 ; MB_INTERLACED*16
%if %1
lea r4, [significant_coeff_flag_offset_8x8+r1*4 GLOBAL] ; r12 = sig offset 8x8
......@@ -554,7 +539,6 @@ CABAC bmi2
RET
%endmacro
%if ARCH_X86_64
INIT_XMM sse2
CABAC_RESIDUAL_RD 0, coeff_last_sse2
CABAC_RESIDUAL_RD 1, coeff_last_sse2
......@@ -575,7 +559,6 @@ INIT_YMM avx512
CABAC_RESIDUAL_RD 0, coeff_last_avx512
INIT_ZMM avx512
CABAC_RESIDUAL_RD 1, coeff_last_avx512
%endif
;-----------------------------------------------------------------------------
; void x264_cabac_block_residual_internal_sse2 ( dctcoef *l, int b_interlaced,
......@@ -653,15 +636,10 @@ CABAC_RESIDUAL_RD 1, coeff_last_avx512
%macro CABAC_RESIDUAL 1
cglobal cabac_block_residual_internal, 4,15,0,-4*64
%ifdef PIC
; if we use the same r7 as in cabac_encode_decision, we can cheat and save a register.
lea r7, [$$]
%define lastm [rsp+4*1]
%define GLOBAL +r7-$$
%else
%define lastm r7d
%define GLOBAL
%endif
shl r1d, 4
%define sigoffq r8
......@@ -779,7 +757,6 @@ cglobal cabac_block_residual_internal, 4,15,0,-4*64
RET
%endmacro
%if ARCH_X86_64
INIT_XMM sse2
CABAC_RESIDUAL coeff_last_sse2
INIT_XMM lzcnt
......
......@@ -1331,7 +1331,7 @@ cglobal pixel_avg2_w16_cache64_ssse3
sub r4, r2
shl r6, 4 ;jump = (offset + align*2)*48
%define avg_w16_addr avg_w16_align1_1_ssse3-(avg_w16_align2_2_ssse3-avg_w16_align1_1_ssse3)
%ifdef PIC
%if ARCH_X86_64
lea r7, [avg_w16_addr]
add r6, r7
%else
......@@ -2020,7 +2020,7 @@ cglobal mc_chroma
%if cpuflag(cache64)
mov t0d, r3d
and t0d, 7
%ifdef PIC
%if ARCH_X86_64
lea t1, [ch_shuf_adj]
movddup xm5, [t1 + t0*4]
%else
......
......@@ -2866,7 +2866,7 @@ cglobal intra_satd_x3_8x8c, 0,6
; output the predicted samples
mov r3d, eax
shr r3d, 16
%ifdef PIC
%if ARCH_X86_64
lea r2, [%2_lut]
movzx r2d, byte [r2+r3]
%else
......@@ -5103,7 +5103,7 @@ cglobal pixel_ssim_end4, 2,3
je .skip ; faster only if this is the common case; remove branch if we use ssim on a macroblock level
neg r2
%ifdef PIC
%if ARCH_X86_64
lea r3, [mask_ff + 16]
%xdefine %%mask r3
%else
......@@ -5553,7 +5553,7 @@ ads_mvs_ssse3:
add r5, r6
xor r0d, r0d ; nmv
mov [r5], r0d
%ifdef PIC
%if ARCH_X86_64
lea r1, [$$]
%define GLOBAL +r1-$$
%else
......
......@@ -688,7 +688,7 @@ INIT_XMM cpuname
je .fix_lt_2
.do_top:
and r2d, 4
%ifdef PIC
%if ARCH_X86_64
lea r3, [shuf_fixtr]
pshufb m3, [r3+r2*4]
%else
......
......@@ -673,7 +673,7 @@ cglobal dequant_%1x%1_flat16, 0,3
sub t2d, t0d
sub t2d, t1d ; i_mf = i_qp % 6
shl t2d, %2
%ifdef PIC
%if ARCH_X86_64
lea r1, [dequant%1_scale]
add r1, t2
%else
......@@ -761,7 +761,7 @@ DEQUANT 8, 6, 4
sub t2d, t1d ; i_mf = i_qp % 6
shl t2d, %1
%if %2
%ifdef PIC
%if ARCH_X86_64
%define dmf r1+t2
lea r1, [dequant8_scale]
%else
......@@ -1449,7 +1449,7 @@ cglobal decimate_score%1, 1,3
shr edx, 1
%endif
%endif
%ifdef PIC
%if ARCH_X86_64
lea r4, [decimate_mask_table4]
%define mask_table r4
%else
......@@ -1580,16 +1580,11 @@ cglobal decimate_score64, 1,5
add eax, r3d
jnz .ret9
%endif
%ifdef PIC
lea r4, [decimate_table8]
%define table r4
%else
%define table decimate_table8
%endif
lea r4, [decimate_table8]
mov al, -6
.loop:
tzcnt rcx, r1
add al, byte [table + rcx]
add al, byte [r4 + rcx]
jge .ret9
shr r1, 1
SHRX r1, rcx
......@@ -2165,7 +2160,7 @@ COEFF_LEVELRUN 16
%macro COEFF_LEVELRUN_LUT 1
cglobal coeff_level_run%1,2,4+(%1/9)
%ifdef PIC
%if ARCH_X86_64
lea r5, [$$]
%define GLOBAL +r5-$$
%else
......
......@@ -1920,7 +1920,7 @@ cglobal pixel_sad_16x%2_cache64_%1
shl r4d, 4 ; code size = 80
%endif
%define sad_w16_addr (sad_w16_align1_%1 + (sad_w16_align1_%1 - sad_w16_align2_%1))
%ifdef PIC
%if ARCH_X86_64
lea r5, [sad_w16_addr]
add r5, r4
%else
......
......@@ -202,7 +202,6 @@ cglobal %1, 4,15,9
paddd m6, m6
%define unquant_mf m6
%endif
%ifdef PIC
%if dc == 0
mov unquant_mfm, unquant_mfq
%endif
......@@ -212,9 +211,6 @@ cglobal %1, 4,15,9
; (Any address in .text would work, this one was just convenient.)
lea r0, [$$]
%define GLOBAL +r0-$$
%else
%define GLOBAL
%endif
TRELLIS_LOOP 0 ; node_ctx 0..3
TRELLIS_LOOP 1 ; node_ctx 1..7
......@@ -304,12 +300,8 @@ cglobal %1, 4,15,9
mov r10, cabac_state_sigm
%if num_coefs == 64
mov r6d, b_interlacedm
%ifdef PIC
add r6d, iid
movzx r6d, byte [significant_coeff_flag_offset_8x8 + r6 GLOBAL]
%else
movzx r6d, byte [significant_coeff_flag_offset_8x8 + r6 + iiq]
%endif
movzx r10, byte [r10 + r6]
%elif num_coefs == 8
movzx r13, byte [coeff_flag_offset_chroma_422_dc + iiq GLOBAL]
......@@ -408,12 +400,8 @@ cglobal %1, 4,15,9
%if dc
pmuludq m0, unquant_mf
%else
%ifdef PIC
mov r10, unquant_mfm
LOAD_DUP m3, [r10 + zigzagiq*4]
%else
LOAD_DUP m3, [unquant_mfq + zigzagiq*4]
%endif
pmuludq m0, m3
%endif
paddd m0, [pq_128]
......
......@@ -734,11 +734,11 @@ case $host_cpu in
ARCH="X86_64"
AS="${AS-nasm}"
AS_EXT=".asm"
ASFLAGS="$ASFLAGS -DARCH_X86_64=1 -I\$(SRCPATH)/common/x86/"
ASFLAGS="$ASFLAGS -DARCH_X86_64=1 -DPIC -I\$(SRCPATH)/common/x86/"
stack_alignment=16
[ $compiler = GNU ] && CFLAGS="-m64 $CFLAGS" && LDFLAGS="-m64 $LDFLAGS"
if [ "$SYS" = MACOSX ]; then
ASFLAGS="$ASFLAGS -f macho64 -DPIC -DPREFIX"
ASFLAGS="$ASFLAGS -f macho64 -DPREFIX"
if cc_check '' "-arch x86_64"; then
CFLAGS="$CFLAGS -arch x86_64"
LDFLAGS="$LDFLAGS -arch x86_64"
......@@ -1253,7 +1253,7 @@ cc_check "stdint.h" "" "uint32_t test_vec __attribute__ ((vector_size (16))) = {
if [ "$pic" = "yes" ] ; then
[ "$SYS" != WINDOWS -a "$SYS" != CYGWIN ] && CFLAGS="$CFLAGS -fPIC"
ASFLAGS="$ASFLAGS -DPIC"
[[ "$ASFLAGS" != *"-DPIC"* ]] && ASFLAGS="$ASFLAGS -DPIC"
# resolve textrels in the x86 asm
cc_check stdio.h "-shared -Wl,-Bsymbolic" && SOFLAGS="$SOFLAGS -Wl,-Bsymbolic"
[ $SYS = SunOS -a "$ARCH" = "X86" ] && SOFLAGS="$SOFLAGS -mimpure-text"
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment