Commit 0d7a9100 authored by Henrik Gramner, committed by Fiona Glaser

x86inc improvements for 64-bit

Add support for all x86-64 registers
Prefer caller-saved registers over callee-saved ones on WIN64
Support up to 15 function arguments
parent 8a6a062e
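
For context, x86inc's `cglobal`/`PROLOGUE` hide the platform calling conventions behind numbered registers: r0, r1, ... map to physical GPRs in an ABI-specific order, arguments first, and the prologue saves any declared register that the ABI marks callee-saved. A minimal sketch of the convention this commit extends (the function and values are hypothetical, not part of the patch):

    ; Illustrative only. After this commit, WIN64 orders the first seven GPRs
    ; as rcx,rdx,r8,r9,r10,r11,rax (all caller-saved), so a function declaring
    ; at most 7 registers gets an empty prologue; declaring r7 (rdi, which is
    ; callee-saved on WIN64) would make PROLOGUE push/pop it automatically.
    cglobal reg_demo, 2,7       ; 2 arguments, 7 GPRs used in total
        mov  r6, r0
        add  r6, r1
        mov  rax, r6            ; return value
        RET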
@@ -35,7 +35,7 @@ cextern cabac_renorm_shift
 ; t3 must be ecx, since it's used for shift.
 %ifdef WIN64
-    DECLARE_REG_TMP 3,1,2,0,4,5,6,2
+    DECLARE_REG_TMP 3,1,2,0,6,5,4,2
     %define pointer resq
 %elifdef ARCH_X86_64
     DECLARE_REG_TMP 0,1,2,3,4,5,6,6
@@ -61,11 +61,11 @@ endstruc
 %macro LOAD_GLOBAL 4
 %ifdef PIC
     ; this would be faster if the arrays were declared in asm, so that I didn't have to duplicate the lea
-    lea   r11, [%2]
+    lea   r7, [%2]
     %ifnidn %3, 0
-    add   r11, %3
+    add   r7, %3
     %endif
-    movzx %1, byte [r11+%4]
+    movzx %1, byte [r7+%4]
 %else
     movzx %1, byte [%2+%3+%4]
 %endif
@@ -81,6 +81,9 @@ cglobal cabac_encode_decision_asm, 0,7
     and   t4d, t6d
     shr   t5d, 6
     movifnidn t2d, r2m
+%ifdef WIN64
+    PUSH  r7
+%endif
     LOAD_GLOBAL t5d, cabac_range_lps-4, t5, t4*2
     LOAD_GLOBAL t4d, cabac_transition, t2, t6*2
     and   t6d, 1
@@ -95,6 +98,9 @@ cglobal cabac_encode_decision_asm, 0,7
     mov   t4d, t3d
     shr   t3d, 3
     LOAD_GLOBAL t3d, cabac_renorm_shift, 0, t3
+%ifdef WIN64
+    POP   r7
+%endif
     shl   t4d, t3b
     shl   t6d, t3b
     mov   [t0+cb.range], t4d
@@ -144,12 +150,11 @@ cglobal cabac_encode_terminal_asm, 0,3
     PROLOGUE 0,7
     mov   t3d, [t0+cb.queue]
     mov   t6d, [t0+cb.low]
-    jmp   cabac_putbyte

 cabac_putbyte:
     ; alive: t0=cb t3=queue t6=low
 %ifdef WIN64
-    DECLARE_REG_TMP 3,4,1,0,2,5,6,10
+    DECLARE_REG_TMP 3,6,1,0,2,5,4
 %endif
     mov   t1d, -1
     add   t3d, 10
...
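A note on the added PUSH/POP pair: `cabac_encode_decision_asm` declares only seven GPRs, and the PIC path of LOAD_GLOBAL now clobbers r7. Under the new numbering r7 lands on callee-saved rdi on WIN64, outside the prologue's save list, while on UNIX64 it lands on caller-saved r10, which is why the save is WIN64-only. The pattern in isolation (hypothetical function and data symbol):

    cglobal pic_load_demo, 0,7      ; only r0-r6 are declared and saved
    %ifdef WIN64
        PUSH  r7                    ; r7 = rdi on WIN64: callee-saved, preserve by hand
    %endif
        lea   r7, [some_table]      ; some_table is an assumed data symbol
        movzx eax, byte [r7+2]
    %ifdef WIN64
        POP   r7
    %endif
        RET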
@@ -366,9 +366,6 @@ cglobal %1, 3,3,%7
 %endif
 %endif ; !HIGH_BIT_DEPTH
 .skip_prologue:
-%ifdef WIN64
-    sub  rsp, 8
-%endif
     call %2.skip_prologue
     add  r0, %3
     add  r1, %4-%5-%6*FENC_STRIDE
@@ -383,7 +380,6 @@ cglobal %1, 3,3,%7
     add  r2, %4-%5-%6*FDEC_STRIDE
 %ifdef WIN64
     call %2.skip_prologue
-    add  rsp, 8
     RET
 %else
     jmp  %2.skip_prologue
@@ -407,9 +403,6 @@ cglobal %1, 2,2,11
     add  r0, 4*FDEC_STRIDE
 %endif
 .skip_prologue:
-%ifdef WIN64
-    sub  rsp, 8
-%endif
     call %2.skip_prologue
     add  r0, %4-%5-%6*FDEC_STRIDE
     add  r1, %3
@@ -421,7 +414,6 @@ cglobal %1, 2,2,11
     add  r1, %3
 %ifdef WIN64
     call %2.skip_prologue
-    add  rsp, 8
     RET
 %else
     jmp  %2.skip_prologue
...
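The deleted `sub rsp, 8`/`add rsp, 8` pairs were WIN64 stack-alignment fixups: a function entered via `call` sees rsp ≡ 8 (mod 16) and must restore 16-byte alignment before calling out again. With the updated x86inc, the WIN64 prologue's own stack adjustment appears to leave rsp correctly aligned here, making the manual fixup redundant. The underlying arithmetic as a hypothetical standalone snippet:

    align_demo:                 ; reached via call, so rsp % 16 == 8 on entry
        sub  rsp, 8             ; 8 + 8 = 16: rsp is now 16-byte aligned
        call external_func      ; external_func is an assumed symbol; it may
                                ; rely on an aligned stack for SSE spills
        add  rsp, 8
        ret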
@@ -1138,28 +1138,28 @@ cglobal deblock_v_luma, 5,5,10
 ; void deblock_h_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
 ;-----------------------------------------------------------------------------
 INIT_MMX cpuname
-cglobal deblock_h_luma, 5,7
-    movsxd r10, r1d
-    lea    r11, [r10+r10*2]
+cglobal deblock_h_luma, 5,9
+    movsxd r7, r1d
+    lea    r8, [r7*3]
     lea    r6, [r0-4]
-    lea    r5, [r0-4+r11]
+    lea    r5, [r0-4+r8]
 %ifdef WIN64
     sub    rsp, 0x98
     %define pix_tmp rsp+0x30
 %else
     sub    rsp, 0x68
     %define pix_tmp rsp
 %endif

     ; transpose 6x16 -> tmp space
-    TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r10, r11), pix_tmp
-    lea    r6, [r6+r10*8]
-    lea    r5, [r5+r10*8]
-    TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r10, r11), pix_tmp+8
+    TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r7, r8), pix_tmp
+    lea    r6, [r6+r7*8]
+    lea    r5, [r5+r7*8]
+    TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r7, r8), pix_tmp+8

     ; vertical filter
     ; alpha, beta, tc0 are still in r2d, r3d, r4
-    ; don't backup r6, r5, r10, r11 because deblock_v_luma_sse2 doesn't use them
+    ; don't backup r6, r5, r7, r8 because deblock_v_luma_sse2 doesn't use them
     lea    r0, [pix_tmp+0x30]
     mov    r1d, 0x10
 %ifdef WIN64
@@ -1174,17 +1174,17 @@ cglobal deblock_h_luma, 5,7
     movq   m1, [pix_tmp+0x28]
     movq   m2, [pix_tmp+0x38]
     movq   m3, [pix_tmp+0x48]
-    TRANSPOSE8x4B_STORE PASS8ROWS(r6, r5, r10, r11)
+    TRANSPOSE8x4B_STORE PASS8ROWS(r6, r5, r7, r8)

-    shl    r10, 3
-    sub    r6, r10
-    sub    r5, r10
-    shr    r10, 3
+    shl    r7, 3
+    sub    r6, r7
+    sub    r5, r7
+    shr    r7, 3
     movq   m0, [pix_tmp+0x10]
     movq   m1, [pix_tmp+0x20]
     movq   m2, [pix_tmp+0x30]
     movq   m3, [pix_tmp+0x40]
-    TRANSPOSE8x4B_STORE PASS8ROWS(r6, r5, r10, r11)
+    TRANSPOSE8x4B_STORE PASS8ROWS(r6, r5, r7, r8)

 %ifdef WIN64
     add    rsp, 0x98
@@ -1516,33 +1516,33 @@ INIT_MMX cpuname
 ;-----------------------------------------------------------------------------
 ; void deblock_h_luma_intra( uint8_t *pix, int stride, int alpha, int beta )
 ;-----------------------------------------------------------------------------
-cglobal deblock_h_luma_intra, 4,7
-    movsxd r10, r1d
-    lea    r11, [r10*3]
+cglobal deblock_h_luma_intra, 4,9
+    movsxd r7, r1d
+    lea    r8, [r7*3]
     lea    r6, [r0-4]
-    lea    r5, [r0-4+r11]
+    lea    r5, [r0-4+r8]
     sub    rsp, 0x88
     %define pix_tmp rsp

     ; transpose 8x16 -> tmp space
-    TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r10, r11), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
-    lea    r6, [r6+r10*8]
-    lea    r5, [r5+r10*8]
-    TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r10, r11), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)
+    TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r7, r8), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
+    lea    r6, [r6+r7*8]
+    lea    r5, [r5+r7*8]
+    TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r7, r8), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)

     lea    r0, [pix_tmp+0x40]
     mov    r1, 0x10
     call   deblock_v_luma_intra

     ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8)
-    lea    r5, [r6+r11]
-    TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r6, r5, r10, r11)
-    shl    r10, 3
-    sub    r6, r10
-    sub    r5, r10
-    shr    r10, 3
-    TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r6, r5, r10, r11)
+    lea    r5, [r6+r8]
+    TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r6, r5, r7, r8)
+    shl    r7, 3
+    sub    r6, r7
+    sub    r5, r7
+    shr    r7, 3
+    TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r6, r5, r7, r8)
     add    rsp, 0x88
     RET
 %else
 cglobal deblock_h_luma_intra, 2,4
@@ -2008,9 +2008,9 @@ DEBLOCK_H_CHROMA_420_MBAFF
 %endif

 %macro DEBLOCK_H_CHROMA_422 0
-cglobal deblock_h_chroma_422, 5,7,8
+cglobal deblock_h_chroma_422, 5,8,8
 %ifdef ARCH_X86_64
-    %define cntr r11
+    %define cntr r7
 %else
     %define cntr dword r0m
 %endif
...
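On the address arithmetic that the renaming preserves: r7 holds the stride and r8 = stride*3, so PASS8ROWS can reach eight consecutive rows from two base pointers, and the shl/sub/shr sequence temporarily scales the stride by 8 to rewind those pointers over the rows just written. The same instructions, isolated for clarity:

    lea  r8, [r7*3]     ; stride*3 in one instruction via SIB scaling
    shl  r7, 3          ; stride*8
    sub  r6, r7         ; rewind both row pointers by 8 rows
    sub  r5, r7
    shr  r7, 3          ; restore the plain stride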
@@ -58,13 +58,16 @@ cextern pd_32
 ; implicit weighted biprediction
 ;=============================================================================
 ; assumes log2_denom = 5, offset = 0, weight1 + weight2 = 64
-%ifdef ARCH_X86_64
-    DECLARE_REG_TMP 0,1,2,3,4,5,10,11
-    %macro AVG_START 0-1 0
-        PROLOGUE 6,7,%1
-%ifdef WIN64
-        movsxd r5, r5d
-%endif
-    %endmacro
+%ifdef WIN64
+    DECLARE_REG_TMP 0,1,2,3,4,5,4,5
+    %macro AVG_START 0-1 0
+        PROLOGUE 5,7,%1
+        movsxd r5, dword r5m
+    %endmacro
+%elifdef UNIX64
+    DECLARE_REG_TMP 0,1,2,3,4,5,7,8
+    %macro AVG_START 0-1 0
+        PROLOGUE 6,9,%1
+    %endmacro
 %else
     DECLARE_REG_TMP 1,2,3,4,5,6,1,2
@@ -1157,7 +1160,9 @@ avg_w16_align%1_%2_ssse3:
     jg    avg_w16_align%1_%2_ssse3
     ret
 %if %1==0
-    times 13 db 0x90 ; make sure the first ones don't end up short
+    ; make sure the first ones don't end up short
+    ALIGN 16
+    times (48-($-avg_w16_align%1_%2_ssse3))>>4 nop
 %endif
 %endmacro
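The padding change matters because `pixel_avg2_w16_cache64_ssse3` dispatches into these blocks by arithmetic rather than through a jump table: every `avg_w16_align%1_%2_ssse3` variant is assumed to occupy exactly 48 bytes, so `(offset + align*2)*48` added to the base lands on the right entry (see the `shl r6, 4` hunk below). A sketch of that dispatch shape, with illustrative names:

    ; Hypothetical computed dispatch into fixed-size 48-byte stubs:
    lea  r6, [r6+r6*2]      ; index *= 3
    shl  r6, 4              ; index *= 16, so *48 in total: the per-stub size
%ifdef PIC
    lea  r7, [stub_base]    ; RIP-relative base; stub_base is illustrative.
    add  r6, r7             ; r7 is caller-saved on WIN64 after this patch,
%endif                      ; so no spill is needed around this use
    jmp  r6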
@@ -1171,7 +1176,7 @@ cglobal pixel_avg2_w16_cache64_ssse3
     and    eax, 7
     jz     x264_pixel_avg2_w16_sse2
 %endif
-    PROLOGUE 6, 7
+    PROLOGUE 6, 8
     lea    r6, [r4+r2]
     and    r4, ~0xf
     and    r6, 0x1f
@@ -1181,8 +1186,8 @@ cglobal pixel_avg2_w16_cache64_ssse3
     shl    r6, 4         ;jump = (offset + align*2)*48
 %define avg_w16_addr avg_w16_align1_1_ssse3-(avg_w16_align2_2_ssse3-avg_w16_align1_1_ssse3)
 %ifdef PIC
-    lea    r11, [avg_w16_addr]
-    add    r6, r11
+    lea    r7, [avg_w16_addr]
+    add    r6, r7
 %else
     lea    r6, [avg_w16_addr + r6]
 %endif
@@ -1393,17 +1398,22 @@ cglobal prefetch_ref, 3,3
 ;=============================================================================
 %ifdef ARCH_X86_64
-    DECLARE_REG_TMP 10,11,6
+    DECLARE_REG_TMP 6,7,8
 %else
     DECLARE_REG_TMP 0,1,2
 %endif

-%macro MC_CHROMA_START 0
+%macro MC_CHROMA_START 1
+%ifdef ARCH_X86_64
+    PROLOGUE 0,9,%1
+%else
+    PROLOGUE 0,6,%1
+%endif
     movifnidn r3, r3mp
     movifnidn r4d, r4m
     movifnidn r5d, r5m
-    movifnidn t2d, r6m
-    mov       t0d, t2d
+    movifnidn t0d, r6m
+    mov       t2d, t0d
     mov       t1d, r5d
     sar       t0d, 3
     sar       t1d, 3
@@ -1447,8 +1457,8 @@ cglobal prefetch_ref, 3,3
 ;                  int width, int height )
 ;-----------------------------------------------------------------------------
 %macro MC_CHROMA 0
-cglobal mc_chroma, 0,6
-    MC_CHROMA_START
+cglobal mc_chroma
+    MC_CHROMA_START 0
     FIX_STRIDES r4
     and       r5d, 7
 %ifdef ARCH_X86_64
@@ -1726,8 +1736,8 @@ ALIGN 4
     movifnidn r5d, r8m
     cmp dword r7m, 4
     jg        .mc1d_w8
-    mov       r10, r2
-    mov       r11, r4
+    mov       r7, r2
+    mov       r8, r4
 %if mmsize!=8
     shr       r5d, 1
 %endif
@@ -1741,7 +1751,7 @@ ALIGN 4
 %else
     movu      m0, [r3]
     movu      m1, [r3+r6]
-    add       r3, r11
+    add       r3, r8
     movu      m2, [r3]
     movu      m3, [r3+r6]
 %endif
@@ -1757,7 +1767,7 @@ ALIGN 4
     movq      m0, [r3]
     movq      m1, [r3+r6]
 %if mmsize!=8
-    add       r3, r11
+    add       r3, r8
     movhps    m0, [r3]
     movhps    m1, [r3+r6]
 %endif
@@ -1778,22 +1788,22 @@ ALIGN 4
     psrlw     m2, 3
 %ifdef HIGH_BIT_DEPTH
 %if mmsize == 8
-    xchg      r4, r11
-    xchg      r2, r10
+    xchg      r4, r8
+    xchg      r2, r7
 %endif
     movq      [r0], m0
     movq      [r1], m2
 %if mmsize == 16
-    add       r0, r10
-    add       r1, r10
+    add       r0, r7
+    add       r1, r7
     movhps    [r0], m0
     movhps    [r1], m2
 %endif
 %else ; !HIGH_BIT_DEPTH
     packuswb  m0, m2
 %if mmsize==8
-    xchg      r4, r11
-    xchg      r2, r10
+    xchg      r4, r8
+    xchg      r2, r7
     movd      [r0], m0
     psrlq     m0, 32
     movd      [r1], m0
@@ -1801,8 +1811,8 @@ ALIGN 4
     movhlps   m1, m0
     movd      [r0], m0
     movd      [r1], m1
-    add       r0, r10
-    add       r1, r10
+    add       r0, r7
+    add       r1, r7
     psrldq    m0, 4
     psrldq    m1, 4
     movd      [r0], m0
@@ -1818,8 +1828,8 @@ ALIGN 4
 .mc1d_w8:
     sub       r2, 4*SIZEOF_PIXEL
     sub       r4, 8*SIZEOF_PIXEL
-    mov       r10, 4*SIZEOF_PIXEL
-    mov       r11, 8*SIZEOF_PIXEL
+    mov       r7, 4*SIZEOF_PIXEL
+    mov       r8, 8*SIZEOF_PIXEL
 %if mmsize==8
     shl       r5d, 1
 %endif
@@ -1827,10 +1837,9 @@ ALIGN 4
 %endif ; ARCH_X86_64
 %endmacro ; MC_CHROMA

 %macro MC_CHROMA_SSSE3 0
-cglobal mc_chroma, 0,6,9
-    MC_CHROMA_START
+cglobal mc_chroma
+    MC_CHROMA_START 9
     and       r5d, 7
-    and       t2d, 7
     mov       t0d, r5d
...
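The MC_CHROMA_START change moves the PROLOGUE into the macro and threads the XMM register count through as a parameter, so the plain and SSSE3 variants can share their startup code while declaring different XMM usage; a bare `cglobal mc_chroma` emits just the label, with no implicit prologue. The pattern with hypothetical names:

    %macro START_DEMO 1
    %ifdef ARCH_X86_64
        PROLOGUE 0,9,%1     ; 9 GPRs, %1 XMM registers
    %else
        PROLOGUE 0,6,%1
    %endif
    %endmacro

    cglobal variant_plain   ; bare cglobal: label only
        START_DEMO 0
        RET

    cglobal variant_ssse3
        START_DEMO 9
        RET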
@@ -660,7 +660,7 @@ HPEL_V 0
     mova      %1, m1
     mova      %2, m4
     FILT_PACK m1, m4, 5, m15
-    movntps   [r11+r4+%5], m1
+    movntps   [r8+r4+%5], m1
 %endmacro

 %macro FILT_C 4
@@ -728,26 +728,26 @@ HPEL_V 0
 ; void hpel_filter( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
 ;                   uint8_t *src, int stride, int width, int height)
 ;-----------------------------------------------------------------------------
-cglobal hpel_filter, 7,7,16
+cglobal hpel_filter, 7,9,16
 %ifdef WIN64
     movsxd    r4, r4d
     movsxd    r5, r5d
 %endif
-    mov       r10, r3
+    mov       r7, r3
     sub       r5, 16
-    mov       r11, r1
-    and       r10, 15
-    sub       r3, r10
+    mov       r8, r1
+    and       r7, 15
+    sub       r3, r7
     add       r0, r5
-    add       r11, r5
-    add       r10, r5
+    add       r8, r5
+    add       r7, r5
     add       r5, r2
     mov       r2, r4
-    neg       r10
+    neg       r7
     lea       r1, [r3+r2]
     sub       r3, r2
     sub       r3, r2
-    mov       r4, r10
+    mov       r4, r7
     mova      m15, [pw_16]
 %if cpuflag(ssse3)
     mova      m0, [filt_mul51]
@@ -774,14 +774,14 @@ cglobal hpel_filter, 7,7,16
     cmp       r4, 16
     jl        .lastx
     ; setup regs for next y
...