Commit 0d7a9100 authored by Henrik Gramner, committed by Fiona Glaser

x86inc improvements for 64-bit

Add support for all x86-64 registers
Prefer caller-saved registers over callee-saved on WIN64
Support up to 15 function arguments
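The expanded register set is easiest to see in a standalone sketch. The following function is hypothetical and assumes x86inc.asm is included; it illustrates the new convention and is not code from this commit:

    ; "4,9" = 4 arguments, 9 GPRs: r7 and r8 are now legal register
    ; names, so code no longer hardcodes the physical r10/r11.  On
    ; WIN64 the new numbering maps r0-r6 to caller-saved registers,
    ; so functions that stay within seven GPRs emit no saves at all;
    ; declaring 9 makes the prologue spill rdi/rsi automatically.
    cglobal example_sum4, 4,9
        lea  r7, [r0+r1]
        lea  r8, [r2+r3]
        lea  rax, [r7+r8] ; return a+b+c+d
        RET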
parent 8a6a062e
@@ -35,7 +35,7 @@ cextern cabac_renorm_shift
 ; t3 must be ecx, since it's used for shift.
 %ifdef WIN64
-DECLARE_REG_TMP 3,1,2,0,4,5,6,2
+DECLARE_REG_TMP 3,1,2,0,6,5,4,2
 %define pointer resq
 %elifdef ARCH_X86_64
 DECLARE_REG_TMP 0,1,2,3,4,5,6,6
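DECLARE_REG_TMP aliases the abstract temp names tN onto numbered registers. Roughly how it works in x86inc.asm (paraphrased, so treat the exact spelling as an assumption):

    %macro DECLARE_REG_TMP 1-*
        %assign %%i 0
        %rep %0
            CAT_XDEFINE t, %%i, r%1 ; e.g. a first argument of 3 makes t0 an alias of r3
            %assign %%i %%i+1
            %rotate 1
        %endrep
    %endmacro

So the WIN64 change above merely swaps which registers t4 and t6 resolve to; every tN reference in the cabac code itself is untouched.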
@@ -61,11 +61,11 @@ endstruc
 %macro LOAD_GLOBAL 4
 %ifdef PIC
 ; this would be faster if the arrays were declared in asm, so that I didn't have to duplicate the lea
-lea r11, [%2]
+lea r7, [%2]
 %ifnidn %3, 0
-add r11, %3
+add r7, %3
 %endif
-movzx %1, byte [r11+%4]
+movzx %1, byte [r7+%4]
 %else
 movzx %1, byte [%2+%3+%4]
 %endif
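The lea indirection exists because x86-64 RIP-relative addressing cannot carry an index register, so a PIC reference to a table indexed at run time has to materialize the base address first. A sketch of the two forms, with illustrative registers:

    ; not encodable as a RIP-relative (PIC) access:
    ;     movzx eax, byte [cabac_range_lps + rcx*2]
    ; encodable: load the base RIP-relatively, then index off it:
    lea   r7, [cabac_range_lps]
    movzx eax, byte [r7+rcx*2]

The switch from r11 to r7 is also what forces the PUSH r7/POP r7 added below: under the new numbering, r7 maps to caller-saved R10 on UNIX64 but to callee-saved rdi on WIN64.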
@@ -81,6 +81,9 @@ cglobal cabac_encode_decision_asm, 0,7
 and t4d, t6d
 shr t5d, 6
 movifnidn t2d, r2m
+%ifdef WIN64
+PUSH r7
+%endif
 LOAD_GLOBAL t5d, cabac_range_lps-4, t5, t4*2
 LOAD_GLOBAL t4d, cabac_transition, t2, t6*2
 and t6d, 1
@@ -95,6 +98,9 @@ cglobal cabac_encode_decision_asm, 0,7
 mov t4d, t3d
 shr t3d, 3
 LOAD_GLOBAL t3d, cabac_renorm_shift, 0, t3
+%ifdef WIN64
+POP r7
+%endif
 shl t4d, t3b
 shl t6d, t3b
 mov [t0+cb.range], t4d
@@ -144,12 +150,11 @@ cglobal cabac_encode_terminal_asm, 0,3
 PROLOGUE 0,7
 mov t3d, [t0+cb.queue]
 mov t6d, [t0+cb.low]
-jmp cabac_putbyte
 cabac_putbyte:
 ; alive: t0=cb t3=queue t6=low
 %ifdef WIN64
-DECLARE_REG_TMP 3,4,1,0,2,5,6,10
+DECLARE_REG_TMP 3,6,1,0,2,5,4
 %endif
 mov t1d, -1
 add t3d, 10
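Re-declaring DECLARE_REG_TMP at this point simply retargets the tN names for the byte-output tail that the functions above reach; the eighth temp disappears because it aliased the raw register 10, which no longer exists as a name under the new scheme, and the tail now gets by with t0-t6.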
@@ -366,9 +366,6 @@ cglobal %1, 3,3,%7
 %endif
 %endif ; !HIGH_BIT_DEPTH
 .skip_prologue:
-%ifdef WIN64
-sub rsp, 8
-%endif
 call %2.skip_prologue
 add r0, %3
 add r1, %4-%5-%6*FENC_STRIDE
@@ -383,7 +380,6 @@ cglobal %1, 3,3,%7
 add r2, %4-%5-%6*FDEC_STRIDE
 %ifdef WIN64
 call %2.skip_prologue
-add rsp, 8
 RET
 %else
 jmp %2.skip_prologue
@@ -407,9 +403,6 @@ cglobal %1, 2,2,11
 add r0, 4*FDEC_STRIDE
 %endif
 .skip_prologue:
-%ifdef WIN64
-sub rsp, 8
-%endif
 call %2.skip_prologue
 add r0, %4-%5-%6*FDEC_STRIDE
 add r1, %3
@@ -421,7 +414,6 @@ cglobal %1, 2,2,11
 add r1, %3
 %ifdef WIN64
 call %2.skip_prologue
-add rsp, 8
 RET
 %else
 jmp %2.skip_prologue
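Background on the deleted padding, with the alignment arithmetic spelled out (the ABI facts are standard; the final sentence is an inference about this commit, not something the diff states):

    ; Both 64-bit ABIs keep rsp 16-byte aligned at every call site, so
    ; after the call instruction pushes an 8-byte return address, a
    ; function body runs with rsp % 16 == 8.  The removed WIN64-only
    ;     sub rsp, 8              ; rsp % 16 == 0 again
    ;     call %2.skip_prologue
    ;     add rsp, 8
    ; re-established alignment around the nested call; with the
    ; reworked x86inc prologues the two ABI paths now match, so the
    ; manual adjustment is presumably no longer needed.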
@@ -1138,11 +1138,11 @@ cglobal deblock_v_luma, 5,5,10
 ; void deblock_h_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
 ;-----------------------------------------------------------------------------
 INIT_MMX cpuname
-cglobal deblock_h_luma, 5,7
-movsxd r10, r1d
-lea r11, [r10+r10*2]
+cglobal deblock_h_luma, 5,9
+movsxd r7, r1d
+lea r8, [r7*3]
 lea r6, [r0-4]
-lea r5, [r0-4+r11]
+lea r5, [r0-4+r8]
 %ifdef WIN64
 sub rsp, 0x98
 %define pix_tmp rsp+0x30
@@ -1152,14 +1152,14 @@ cglobal deblock_h_luma, 5,7
 %endif
 ; transpose 6x16 -> tmp space
-TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r10, r11), pix_tmp
-lea r6, [r6+r10*8]
-lea r5, [r5+r10*8]
-TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r10, r11), pix_tmp+8
+TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r7, r8), pix_tmp
+lea r6, [r6+r7*8]
+lea r5, [r5+r7*8]
+TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r7, r8), pix_tmp+8
 ; vertical filter
 ; alpha, beta, tc0 are still in r2d, r3d, r4
-; don't backup r6, r5, r10, r11 because deblock_v_luma_sse2 doesn't use them
+; don't backup r6, r5, r7, r8 because deblock_v_luma_sse2 doesn't use them
 lea r0, [pix_tmp+0x30]
 mov r1d, 0x10
 %ifdef WIN64
@@ -1174,17 +1174,17 @@ cglobal deblock_h_luma, 5,7
 movq m1, [pix_tmp+0x28]
 movq m2, [pix_tmp+0x38]
 movq m3, [pix_tmp+0x48]
-TRANSPOSE8x4B_STORE PASS8ROWS(r6, r5, r10, r11)
+TRANSPOSE8x4B_STORE PASS8ROWS(r6, r5, r7, r8)
-shl r10, 3
-sub r6, r10
-sub r5, r10
-shr r10, 3
+shl r7, 3
+sub r6, r7
+sub r5, r7
+shr r7, 3
 movq m0, [pix_tmp+0x10]
 movq m1, [pix_tmp+0x20]
 movq m2, [pix_tmp+0x30]
 movq m3, [pix_tmp+0x40]
-TRANSPOSE8x4B_STORE PASS8ROWS(r6, r5, r10, r11)
+TRANSPOSE8x4B_STORE PASS8ROWS(r6, r5, r7, r8)
 %ifdef WIN64
 add rsp, 0x98
@@ -1516,32 +1516,32 @@ INIT_MMX cpuname
 ;-----------------------------------------------------------------------------
 ; void deblock_h_luma_intra( uint8_t *pix, int stride, int alpha, int beta )
 ;-----------------------------------------------------------------------------
-cglobal deblock_h_luma_intra, 4,7
-movsxd r10, r1d
-lea r11, [r10*3]
+cglobal deblock_h_luma_intra, 4,9
+movsxd r7, r1d
+lea r8, [r7*3]
 lea r6, [r0-4]
-lea r5, [r0-4+r11]
+lea r5, [r0-4+r8]
 sub rsp, 0x88
 %define pix_tmp rsp
 ; transpose 8x16 -> tmp space
-TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r10, r11), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
-lea r6, [r6+r10*8]
-lea r5, [r5+r10*8]
-TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r10, r11), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)
+TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r7, r8), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
+lea r6, [r6+r7*8]
+lea r5, [r5+r7*8]
+TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r7, r8), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)
 lea r0, [pix_tmp+0x40]
 mov r1, 0x10
 call deblock_v_luma_intra
 ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8)
-lea r5, [r6+r11]
-TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r6, r5, r10, r11)
-shl r10, 3
-sub r6, r10
-sub r5, r10
-shr r10, 3
-TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r6, r5, r10, r11)
+lea r5, [r6+r8]
+TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r6, r5, r7, r8)
+shl r7, 3
+sub r6, r7
+sub r5, r7
+shr r7, 3
+TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r6, r5, r7, r8)
 add rsp, 0x88
 RET
 %else
@@ -2008,9 +2008,9 @@ DEBLOCK_H_CHROMA_420_MBAFF
 %endif
 %macro DEBLOCK_H_CHROMA_422 0
-cglobal deblock_h_chroma_422, 5,7,8
+cglobal deblock_h_chroma_422, 5,8,8
 %ifdef ARCH_X86_64
-%define cntr r11
+%define cntr r7
 %else
 %define cntr dword r0m
 %endif
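The bump from "5,7,8" to "5,8,8" is what makes the bare register define safe: cglobal's second argument declares how many GPRs the function touches, and the generated prologue automatically saves any callee-saved registers in that range (on WIN64 that includes rdi, which r7 now maps to). This describes general x86inc behavior rather than a line of this diff.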
@@ -58,13 +58,16 @@ cextern pd_32
 ; implicit weighted biprediction
 ;=============================================================================
 ; assumes log2_denom = 5, offset = 0, weight1 + weight2 = 64
-%ifdef ARCH_X86_64
-DECLARE_REG_TMP 0,1,2,3,4,5,10,11
-%macro AVG_START 0-1 0
-PROLOGUE 6,7,%1
 %ifdef WIN64
-movsxd r5, r5d
-%endif
+DECLARE_REG_TMP 0,1,2,3,4,5,4,5
+%macro AVG_START 0-1 0
+PROLOGUE 5,7,%1
+movsxd r5, dword r5m
 %endmacro
+%elifdef UNIX64
+DECLARE_REG_TMP 0,1,2,3,4,5,7,8
+%macro AVG_START 0-1 0
+PROLOGUE 6,9,%1
+%endmacro
 %else
 DECLARE_REG_TMP 1,2,3,4,5,6,1,2
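Why the 64-bit branch had to split in two (the calling-convention facts below are standard; the register choices mirror the diff):

    ; SysV x86-64 passes the first six integer args in rdi,rsi,rdx,rcx,r8,r9,
    ; so PROLOGUE 6,9,%1 already has all of them, and the temps t6/t7 land
    ; in caller-saved r7/r8 (R10/R11).  WIN64 passes only four in
    ; rcx,rdx,r8,r9; the sixth argument lives on the stack and is fetched,
    ; sign-extended, by hand:
    ;     PROLOGUE 5,7,%1
    ;     movsxd r5, dword r5m   ; r5m = the stack slot of argument six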
@@ -1157,7 +1160,9 @@ avg_w16_align%1_%2_ssse3:
 jg avg_w16_align%1_%2_ssse3
 ret
 %if %1==0
-times 13 db 0x90 ; make sure the first ones don't end up short
+; make sure the first ones don't end up short
+ALIGN 16
+times (48-($-avg_w16_align%1_%2_ssse3))>>4 nop
 %endif
 %endmacro
@@ -1171,7 +1176,7 @@ cglobal pixel_avg2_w16_cache64_ssse3
 and eax, 7
 jz x264_pixel_avg2_w16_sse2
 %endif
-PROLOGUE 6, 7
+PROLOGUE 6, 8
 lea r6, [r4+r2]
 and r4, ~0xf
 and r6, 0x1f
@@ -1181,8 +1186,8 @@ cglobal pixel_avg2_w16_cache64_ssse3
 shl r6, 4 ;jump = (offset + align*2)*48
 %define avg_w16_addr avg_w16_align1_1_ssse3-(avg_w16_align2_2_ssse3-avg_w16_align1_1_ssse3)
 %ifdef PIC
-lea r11, [avg_w16_addr]
-add r6, r11
+lea r7, [avg_w16_addr]
+add r6, r7
 %else
 lea r6, [avg_w16_addr + r6]
 %endif
@@ -1393,17 +1398,22 @@ cglobal prefetch_ref, 3,3
 ;=============================================================================
 %ifdef ARCH_X86_64
-DECLARE_REG_TMP 10,11,6
+DECLARE_REG_TMP 6,7,8
 %else
 DECLARE_REG_TMP 0,1,2
 %endif
-%macro MC_CHROMA_START 0
+%macro MC_CHROMA_START 1
+%ifdef ARCH_X86_64
+PROLOGUE 0,9,%1
+%else
+PROLOGUE 0,6,%1
+%endif
 movifnidn r3, r3mp
 movifnidn r4d, r4m
 movifnidn r5d, r5m
-movifnidn t2d, r6m
-mov t0d, t2d
+movifnidn t0d, r6m
+mov t2d, t0d
 mov t1d, r5d
 sar t0d, 3
 sar t1d, 3
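For reference, PROLOGUE's arguments as used in the new macro (general x86inc convention, not specific to this commit): the first is how many arguments to load into registers up front (0 here, since the movifnidn lines fetch them lazily), the second is how many GPRs the function may clobber (9 on 64-bit unlocks r7/r8), and the third, passed through as %1, is the number of XMM registers, from which the WIN64 prologue decides how many of the callee-saved xmm6 and up to spill.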
@@ -1447,8 +1457,8 @@ cglobal prefetch_ref, 3,3
 ; int width, int height )
 ;-----------------------------------------------------------------------------
 %macro MC_CHROMA 0
-cglobal mc_chroma, 0,6
-MC_CHROMA_START
+cglobal mc_chroma
+MC_CHROMA_START 0
 FIX_STRIDES r4
 and r5d, 7
 %ifdef ARCH_X86_64
@@ -1726,8 +1736,8 @@ ALIGN 4
 movifnidn r5d, r8m
 cmp dword r7m, 4
 jg .mc1d_w8
-mov r10, r2
-mov r11, r4
+mov r7, r2
+mov r8, r4
 %if mmsize!=8
 shr r5d, 1
 %endif
@@ -1741,7 +1751,7 @@ ALIGN 4
 %else
 movu m0, [r3]
 movu m1, [r3+r6]
-add r3, r11
+add r3, r8
 movu m2, [r3]
 movu m3, [r3+r6]
 %endif
@@ -1757,7 +1767,7 @@ ALIGN 4
 movq m0, [r3]
 movq m1, [r3+r6]
 %if mmsize!=8
-add r3, r11
+add r3, r8
 movhps m0, [r3]
 movhps m1, [r3+r6]
 %endif
@@ -1778,22 +1788,22 @@ ALIGN 4
 psrlw m2, 3
 %ifdef HIGH_BIT_DEPTH
 %if mmsize == 8
-xchg r4, r11
-xchg r2, r10
+xchg r4, r8
+xchg r2, r7
 %endif
 movq [r0], m0
 movq [r1], m2
 %if mmsize == 16
-add r0, r10
-add r1, r10
+add r0, r7
+add r1, r7
 movhps [r0], m0
 movhps [r1], m2
 %endif
 %else ; !HIGH_BIT_DEPTH
 packuswb m0, m2
 %if mmsize==8
-xchg r4, r11
-xchg r2, r10
+xchg r4, r8
+xchg r2, r7
 movd [r0], m0
 psrlq m0, 32
 movd [r1], m0
@@ -1801,8 +1811,8 @@ ALIGN 4
 movhlps m1, m0
 movd [r0], m0
 movd [r1], m1
-add r0, r10
-add r1, r10
+add r0, r7
+add r1, r7
 psrldq m0, 4
 psrldq m1, 4
 movd [r0], m0
@@ -1818,8 +1828,8 @@ ALIGN 4
 .mc1d_w8:
 sub r2, 4*SIZEOF_PIXEL
 sub r4, 8*SIZEOF_PIXEL
-mov r10, 4*SIZEOF_PIXEL
-mov r11, 8*SIZEOF_PIXEL
+mov r7, 4*SIZEOF_PIXEL
+mov r8, 8*SIZEOF_PIXEL
 %if mmsize==8
 shl r5d, 1
 %endif
@@ -1827,10 +1837,9 @@ ALIGN 4
 %endif ; ARCH_X86_64
 %endmacro ; MC_CHROMA
 %macro MC_CHROMA_SSSE3 0
-cglobal mc_chroma, 0,6,9
-MC_CHROMA_START
+cglobal mc_chroma
+MC_CHROMA_START 9
 and r5d, 7
-and t2d, 7
 mov t0d, r5d
@@ -660,7 +660,7 @@ HPEL_V 0
 mova %1, m1
 mova %2, m4
 FILT_PACK m1, m4, 5, m15
-movntps [r11+r4+%5], m1
+movntps [r8+r4+%5], m1
 %endmacro
 %macro FILT_C 4
@@ -728,26 +728,26 @@ HPEL_V 0
 ; void hpel_filter( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
 ; uint8_t *src, int stride, int width, int height)
 ;-----------------------------------------------------------------------------
-cglobal hpel_filter, 7,7,16
+cglobal hpel_filter, 7,9,16
 %ifdef WIN64
 movsxd r4, r4d
 movsxd r5, r5d
 %endif
-mov r10, r3
+mov r7, r3
 sub r5, 16
-mov r11, r1
-and r10, 15
-sub r3, r10
+mov r8, r1
+and r7, 15
+sub r3, r7
 add r0, r5
-add r11, r5
-add r10, r5
+add r8, r5
+add r7, r5
 add r5, r2
 mov r2, r4
-neg r10
+neg r7
 lea r1, [r3+r2]
 sub r3, r2
 sub r3, r2
-mov r4, r10
+mov r4, r7
 mova m15, [pw_16]
 %if cpuflag(ssse3)
 mova m0, [filt_mul51]
@@ -774,14 +774,14 @@ cglobal hpel_filter, 7,7,16
 cmp r4, 16
 jl .lastx
 ; setup regs for next y
-sub r4, r10
+sub r4, r7
 sub r4, r2
 sub r1, r4
 sub r3, r4
 add r0, r2
-add r11, r2
+add r8, r2
 add r5, r2
-mov r4, r10
+mov r4, r7
 sub r6d, 1
 jg .loopy
 sfence
@@ -950,7 +950,7 @@ cglobal plane_copy_core_mmx2, 6,7
 ; uint8_t *srcv, int i_srcv, int w, int h )
 ;-----------------------------------------------------------------------------
 ; assumes i_dst and w are multiples of 16, and i_dst>2*w
-cglobal plane_copy_interleave_core, 7,7
+cglobal plane_copy_interleave_core, 7,9
 FIX_STRIDES r1d, r3d, r5d, r6d
 %ifdef HIGH_BIT_DEPTH
 mov r1m, r1d
@@ -965,7 +965,7 @@ cglobal plane_copy_interleave_core, 7,7
 add r2, r6
 add r4, r6
 %ifdef ARCH_X86_64
-DECLARE_REG_TMP 10,11
+DECLARE_REG_TMP 7,8
 %else
 DECLARE_REG_TMP 1,3
 %endif
@@ -1267,15 +1267,21 @@ cglobal pixel_satd_4x4, 4,6
 %macro BACKUP_POINTERS 0
 %ifdef ARCH_X86_64
-mov r10, r0
-mov r11, r2
+%ifdef WIN64
+PUSH r7
+%endif
+mov r6, r0
+mov r7, r2
 %endif
 %endmacro
 %macro RESTORE_AND_INC_POINTERS 0
 %ifdef ARCH_X86_64
-lea r0, [r10+8]
-lea r2, [r11+8]
+lea r0, [r6+8]
+lea r2, [r7+8]
+%ifdef WIN64
+POP r7
+%endif
 %else
 mov r0, r0mp
 mov r2, r2mp
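The PUSH/POP pair is WIN64-only because of where the new numbering places r6-r8 physically (register map as introduced by this commit's x86inc changes, stated here for reference):

    ; UNIX64: r6=rax  r7=R10  r8=R11  - all caller-saved, nothing to save
    ; WIN64:  r6=rax  r7=rdi  r8=rsi  - rdi/rsi are callee-saved, so r7 is
    ;                                   preserved by hand here instead of
    ;                                   growing every satd prologue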
@@ -1473,10 +1479,10 @@ cglobal pixel_satd_8x4, 4,6,8
 ; int pixel_sa8d_8x8( uint8_t *, int, uint8_t *, int )
 ;-----------------------------------------------------------------------------
 cglobal pixel_sa8d_8x8_internal
-lea r10, [r0+4*r1]
-lea r11, [r2+4*r3]
+lea r6, [r0+4*r1]
+lea r7, [r2+4*r3]
 LOAD_SUMSUB_8x4P 0, 1, 2, 8, 5, 6, 7, r0, r2
-LOAD_SUMSUB_8x4P 4, 5, 3, 9, 11, 6, 7, r10, r11
+LOAD_SUMSUB_8x4P 4, 5, 3, 9, 11, 6, 7, r6, r7
 %if vertical
 HADAMARD8_2D 0, 1, 2, 8, 4, 5, 3, 9, 6, amax
 %else ; non-sse2
@@ -1488,7 +1494,7 @@ cglobal pixel_sa8d_8x8_internal
 SAVE_MM_PERMUTATION
 ret
-cglobal pixel_sa8d_8x8, 4,6,12
+cglobal pixel_sa8d_8x8, 4,8,12
 FIX_STRIDES r1, r3
 lea r4, [3*r1]
 lea r5, [3*r3]
@@ -1506,7 +1512,7 @@ cglobal pixel_sa8d_8x8, 4,6,12
 shr eax, 1
 RET
-cglobal pixel_sa8d_16x16, 4,6,12
+cglobal pixel_sa8d_16x16, 4,8,12
 FIX_STRIDES r1, r3
 lea r4, [3*r1]
 lea r5, [3*r3]
@@ -1942,14 +1948,6 @@ cglobal intra_satd_x3_4x4, 3,3
 %endif
 RET
-%ifdef ARCH_X86_64
-%define t0 r10
-%define t2 r11
-%else
-%define t0 r0
-%define t2 r2
-%endif
 ;-----------------------------------------------------------------------------
 ; void intra_satd_x3_16x16( uint8_t *fenc, uint8_t *fdec, int *res )
 ;-----------------------------------------------------------------------------
@@ -1974,14 +1972,14 @@ cglobal intra_satd_x3_16x16, 0,5
 %endif
 ; 1D hadamards
-mov t0d, 12
+mov r3d, 12
 movd m6, [pw_32]
 .loop_edge:
-SCALAR_HADAMARD left, t0, m0, m1
-SCALAR_HADAMARD top, t0, m1, m2, m3
+SCALAR_HADAMARD left, r3, m0, m1
+SCALAR_HADAMARD top, r3, m1, m2, m3
 pavgw m0, m1
 paddw m6, m0
-sub t0d, 4
+sub r3d, 4
 jge .loop_edge
 psrlw m6, 2
 pand m6, [sw_f0] ; dc
@@ -2060,6 +2058,12 @@ cglobal intra_satd_x3_16x16, 0,5
 ADD rsp, stack_pad
 RET
+%ifdef ARCH_X86_64
+%define t0 r6
+%else
+%define t0 r2
+%endif
 ;-----------------------------------------------------------------------------
 ; void intra_satd_x3_8x8c( uint8_t *fenc, uint8_t *fdec, int *res )
 ;-----------------------------------------------------------------------------
@@ -2077,29 +2081,29 @@ cglobal intra_satd_x3_8x8c, 0,6
 mova [sums+16], m7
 ; 1D hadamards
-mov t0d, 4
+mov r3d, 4
 .loop_edge:
-SCALAR_HADAMARD left, t0, m0, m1
-SCALAR_HADAMARD top, t0, m0, m1, m2
-sub t0d, 4
+SCALAR_HADAMARD left, r3, m0, m1
+SCALAR_HADAMARD top, r3, m0, m1, m2
+sub r3d, 4
 jge .loop_edge
 ; dc
-movzx t2d, word [left_1d+0]
+movzx t0d, word [left_1d+0]
 movzx r3d, word [top_1d+0]
 movzx r4d, word [left_1d+8]
 movzx r5d, word [top_1d+8]
-lea t2d, [t2 + r3 + 16]
+lea t0d, [t0 + r3 + 16]
 lea r3d, [r4 + r5 + 16]
-shr t2d, 1
+shr t0d, 1
 shr r3d, 1
 add r4d, 8
 add r5d, 8
-and t2d, -16 ; tl
+and t0d, -16 ; tl
 and r3d, -16 ; br
 and r4d, -16 ; bl
 and r5d, -16 ; tr
-mov [dc_1d+ 0], t2d ; tl
+mov [dc_1d+ 0], t0d ; tl
 mov [dc_1d+ 4], r5d ; tr
 mov [dc_1d+ 8], r4d ; bl
 mov [dc_1d+12], r3d ; br
@@ -921,10 +921,10 @@ cextern decimate_table8
 ;This is not true for score64.
 cglobal decimate_score%1, 1,3
 %ifdef PIC
-lea r10, [decimate_table4]
-lea r11, [decimate_mask_table4]
-%define table r10
-%define mask_table r11
+lea r4, [decimate_table4]
+lea r5, [decimate_mask_table4]
+%define table r4
+%define mask_table r5
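r4 and r5 serve as scratch here without raising the declared register count because both map to caller-saved registers under the new 64-bit numbering (R8/R9 on UNIX64, R10/R11 on WIN64), and the PIC table pointers are only needed on the 64-bit builds where x264 enables PIC; the common path keeps its minimal three-register prologue. (This is an inference from the register maps noted earlier, not a statement in the diff.)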