Commit c2292efc authored by Henrik Gramner

Implement support for PIC in x86-32 asm

Convert all existing 32-bit SSSE3 asm to use PIC.
parent 7cb756ea
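Background, not part of the commit message: x86-64 code can reach data PC-relatively (x86inc's `default rel`), but 32-bit x86 has no PC-relative data addressing, so position-independent code must first obtain its own address at run time and then reach data at a link-time-constant offset from it. A minimal standalone sketch of that idiom, with illustrative names (`my_table`, `load_table_entry`); assemble with nasm -f elf32:

; pic32.asm: the call/pop idiom this commit builds on (illustrative)
BITS 32

SECTION .rodata
my_table:   dd 1, 2, 3, 4       ; illustrative data

SECTION .text
global load_table_entry
load_table_entry:               ; eax = my_table[ecx], position-independent
    call    .pic                ; pushes return address = &.pic
.pic:
    pop     eax                 ; eax = runtime address of .pic
    add     eax, my_table-.pic  ; PC-relative delta, fixed at link time
    mov     eax, [eax+ecx*4]    ; no absolute relocation (no textrel)
    ret

The commit packages this sequence into a `LEA` macro in x86inc.asm and threads an anchor register through the 32-bit SSSE3 functions.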
@@ -256,13 +256,12 @@ if host_machine.cpu_family().startswith('x86')
         cdata.set10('ARCH_X86_64', true)
         cdata_asm.set10('ARCH_X86_32', false)
         cdata.set10('ARCH_X86_32', false)
-        cdata_asm.set10('PIC', true)
     else
         cdata_asm.set10('ARCH_X86_64', false)
         cdata.set10('ARCH_X86_64', false)
         cdata_asm.set10('ARCH_X86_32', true)
         cdata.set10('ARCH_X86_32', true)
+        cdata_asm.set10('PIC', true)
     endif
 else
     cdata.set10('ARCH_X86', false)
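My reading of the build change, not stated in the diff: x86inc now hardwires PIC on x86-64, so meson stops passing it there and instead passes it for 32-bit builds, where the asm has to be told. The `cdata_asm` entries land in the generated config.asm that the asm files include, so a 32-bit PIC build would plausibly contain something like:

; hypothetical config.asm fragment for a 32-bit build (generated by meson)
%define ARCH_X86_64 0
%define ARCH_X86_32 1
%define PIC 1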
@@ -89,16 +89,13 @@
 %endif
 %endmacro

-%if WIN64
-    %define PIC
-%elif ARCH_X86_64 == 0
-; x86_32 doesn't require PIC.
-; Some distros prefer shared objects to be PIC, but nothing breaks if
-; the code contains a few textrels, so we'll skip that complexity.
-    %undef PIC
-%endif
-%ifdef PIC
+%if ARCH_X86_64
+    %define PIC 1 ; always use PIC on x86-64
     default rel
+%elifidn __OUTPUT_FORMAT__,win32
+    %define PIC 0 ; PIC isn't used on 32-bit Windows
+%elifndef PIC
+    %define PIC 0
 %endif

 %ifdef __NASM_VER__
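A detail worth spelling out (my reading of the block above): PIC changes from an ifdef-style switch to a value that is always defined as 0 or 1, with the `%elifndef` arm only supplying the default when the build system passed nothing. Downstream tests therefore become value tests:

; PIC is now a 0/1 value rather than an on/off define
%define PIC 1        ; x86-64: forced to 1; x86-32: from the build system, else 0
%if PIC              ; value test; the old %ifdef PIC would also fire for PIC=0
    ; emit position-independent addressing here
%endif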
@@ -220,6 +217,18 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
     %define gprsize 4
 %endif

+%macro LEA 2
+%if ARCH_X86_64
+    lea %1, [%2]
+%elif PIC
+    call $+5 ; special-cased to not affect the RSB on most CPUs
+    pop %1
+    add %1, (%2)-$+1
+%else
+    mov %1, %2
+%endif
+%endmacro
+
 %macro PUSH 1
     push %1
     %ifidn rstk, rsp
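Why the PIC branch of `LEA` lands on the right address: `call $+5` is a 5-byte call targeting the very next instruction, so it pushes the address of the `pop`; on most CPUs this exact call/pop pairing is special-cased so it does not unbalance the return-stack predictor (the RSB in the comment). After the `pop`, the destination holds the address of the 1-byte `pop` itself, and at the `add` the assembler's `$` equals that address plus one, so `(%2)-$+1` collapses to `%2` minus the popped address. A standalone expansion for a hypothetical `LEA eax, some_label` (nasm -f elf32):

; lea32.asm: what "LEA eax, some_label" expands to under PIC (illustrative)
BITS 32

SECTION .rodata
some_label: dd 0

SECTION .text
global get_addr
get_addr:
    call    $+5                 ; 5-byte call to the next instruction:
                                ; pushes the address of the pop below
    pop     eax                 ; eax = address of this 1-byte pop
    add     eax, some_label-$+1 ; at this add, $ = &pop+1, so the immediate
                                ; is some_label-&pop; eax = &some_label
    ret

In the remaining branches the macro is cheap: a plain RIP-relative `lea` on x86-64, and a plain absolute `mov` when a 32-bit build does not request PIC.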
@@ -93,7 +93,7 @@ SECTION .text
 INIT_XMM ssse3
 cglobal ipred_h, 3, 6, 2, dst, stride, tl, w, h, stride3
-    lea r5, [ipred_h_ssse3_table]
+    LEA r5, ipred_h_ssse3_table
     tzcnt wd, wm
     movifnidn hd, hm
     movsxd wq, [r5+wq*4]
@@ -55,9 +55,15 @@ SECTION .text
 %define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX)

+%if ARCH_X86_64
+%define o(x) x
+%else
+%define o(x) r5-$$+x ; PIC
+%endif
+
 %macro ITX4_END 4-5 2048 ; row[1-4], rnd
 %if %5
-    mova m2, [pw_%5]
+    mova m2, [o(pw_%5)]
     pmulhrsw m0, m2
     pmulhrsw m1, m2
 %endif
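How the 32-bit `o()` form resolves, in my words: the transform entry points load r5 with the runtime address of `$$`, the start of the current section (the `LEA r5, $$` added further down), so `o(x)` = `r5-$$+x` is the anchor plus the link-time-constant offset of `x` from the section start. A standalone equivalent with `ebp` standing in for x86inc's r5 and an illustrative constant (nasm -f elf32):

; pic_o.asm: the o() idiom in isolation (illustrative names)
BITS 32

SECTION .rodata
align 16
pw_2048: times 8 dw 2048

SECTION .text
%define o(x) ebp-$$+x           ; ebp stands in for x86inc's r5

load_const:
    call    .pic
.pic:
    pop     ebp
    add     ebp, $$-.pic        ; ebp = runtime address of $$
    movdqa  xmm3, [o(pw_2048)]  ; = [ebp+(pw_2048-$$)]: the displacement
                                ; crosses sections but is a link-time
                                ; constant, so no textrel is emitted
    ret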
@@ -100,18 +106,17 @@ SECTION .text
     ret
 %endmacro

 ; flags: 1 = swap, 2: coef_regs
 %macro ITX_MUL2X_PACK 5-6 0 ; dst/src, tmp[1], rnd, coef[1-2], flags
 %if %6 & 2
     pmaddwd m%2, m%4, m%1
     pmaddwd m%1, m%5
 %elif %6 & 1
-    pmaddwd m%2, m%1, [pw_%5_%4]
+    pmaddwd m%2, m%1, [o(pw_%5_%4)]
     pmaddwd m%1, [pw_%4_m%5]
 %else
-    pmaddwd m%2, m%1, [pw_%4_m%5]
-    pmaddwd m%1, [pw_%5_%4]
+    pmaddwd m%2, m%1, [o(pw_%4_m%5)]
+    pmaddwd m%1, [o(pw_%5_%4)]
 %endif
     paddd m%2, m%3
     paddd m%1, m%3
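Background on what `ITX_MUL2X_PACK` computes, not in the diff itself: with word pairs (x, y) packed in a register, `pmaddwd` against interleaved constants produces a*x + b*y per dword lane, so two such multiplies apply a butterfly rotation. The constants are 12-bit fixed point, 1567 ≈ 4096·sin(π/8) and 3784 ≈ 4096·cos(π/8), which is why adding `pd_2048` (half of 4096) and shifting right by 12 performs the rounding. A standalone sketch of one rotation, using my own label names and the plain non-PIC addressing that `o()` exists to replace (nasm -f elf32):

; rot.asm: one pmaddwd butterfly rotation (illustrative)
BITS 32

SECTION .rodata
align 16
pw_1567_m3784: times 4 dw 1567, -3784   ; interleaved (a, b) word pairs
pw_3784_1567:  times 4 dw 3784, 1567
pd_2048:       times 4 dd 2048          ; 0.5 in .12 fixed point

SECTION .text
; xmm0 = four packed (x, y) word pairs; results in xmm1/xmm0
rotate_pi8:
    movdqa  xmm1, xmm0
    pmaddwd xmm1, [pw_1567_m3784]       ; per lane: 1567*x - 3784*y
    pmaddwd xmm0, [pw_3784_1567]        ; per lane: 3784*x + 1567*y
    paddd   xmm1, [pd_2048]             ; round ...
    paddd   xmm0, [pd_2048]
    psrad   xmm1, 12                    ; ... and drop the .12 fraction
    psrad   xmm0, 12
    ret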
@@ -126,13 +131,13 @@ SECTION .text
     paddw m0, m1
     punpcklqdq m0, m3 ;high: in0-in2 ;low: in0+in2
-    mova m3, [pd_2048]
+    mova m3, [o(pd_2048)]
     ITX_MUL2X_PACK 2, 1, 3, 1567, 3784
 %if %0 == 1
     pmulhrsw m0, m%1
 %else
-    pmulhrsw m0, [pw_2896x8] ;high: t1 ;low: t0
+    pmulhrsw m0, [o(pw_2896x8)] ;high: t1 ;low: t0
 %endif
     psubsw m1, m0, m2 ;high: out2 ;low: out3
@@ -146,15 +151,14 @@ SECTION .text
     punpckhqdq m1, m1 ;
     paddw m1, m0 ;low: in0 - in2 + in3
-    pmaddwd m0, m2, [pw_1321_3803] ;1321 * in0 + 3803 * in2
-    pmaddwd m2, [pw_2482_m1321] ;2482 * in0 - 1321 * in2
-    pmaddwd m4, m3, [pw_3344_2482] ;3344 * in1 + 2482 * in3
-    pmaddwd m5, m3, [pw_3344_m3803] ;3344 * in1 - 3803 * in3
+    pmaddwd m0, m2, [o(pw_1321_3803)] ;1321 * in0 + 3803 * in2
+    pmaddwd m2, [o(pw_2482_m1321)] ;2482 * in0 - 1321 * in2
+    pmaddwd m4, m3, [o(pw_3344_2482)] ;3344 * in1 + 2482 * in3
+    pmaddwd m5, m3, [o(pw_3344_m3803)];3344 * in1 - 3803 * in3
     paddd m4, m0 ;t0 + t3
-    pmaddwd m3, [pw_m6688_m3803] ;-2 * 3344 * in1 - 3803 * in3
-    pmulhrsw m1, [pw_3344x8] ;low: out2
-    mova m0, [pd_2048]
+    pmaddwd m3, [o(pw_m6688_m3803)] ;-2 * 3344 * in1 - 3803 * in3
+    pmulhrsw m1, [o(pw_3344x8)] ;low: out2
+    mova m0, [o(pd_2048)]
     paddd m2, m0
     paddd m0, m4 ;t0 + t3 + 2048
     paddd m5, m2 ;t1 + t3 + 2048
@@ -169,9 +173,11 @@ SECTION .text
 %endmacro

 %macro INV_TXFM_FN 4 ; type1, type2, fast_thresh, size
-cglobal inv_txfm_add_%1_%2_%4, 4, 5, 0, dst, stride, coeff, eob, tx2
+cglobal inv_txfm_add_%1_%2_%4, 4, 6, 0, dst, stride, coeff, eob, tx2
     %undef cmp
-    lea tx2q, [m(i%2_%4_internal).pass2]
+%if ARCH_X86_32
+    LEA r5, $$
+%endif
 %if %3 > 0
     cmp eobd, %3
     jle %%end
@@ -179,7 +185,8 @@ cglobal inv_txfm_add_%1_%2_%4, 4, 5, 0, dst, stride, coeff, eob, tx2
     test eobd, eobd
     jz %%end
 %endif
-    call i%1_%4_internal
+    lea tx2q, [o(m(i%2_%4_internal).pass2)]
+    call m(i%1_%4_internal)
     RET
 ALIGN function_align
 %%end:
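The dispatch change is the same anchor trick applied to a code address: `lea tx2q, [o(...pass2)]` computes anchor plus constant instead of loading a 32-bit absolute immediate, and it now sits below the eob checks so the early-out paths skip it. The shape of that trick in isolation, with illustrative labels (nasm -f elf32):

; pic_jmp.asm: computing a code address PIC-safely (illustrative)
BITS 32

SECTION .text
dispatch_pass2:
    call    .pic
.pic:
    pop     edx
    add     edx, $$-.pic         ; edx = &$$, the PIC anchor (like r5)
    lea     ecx, [edx-$$+.pass2] ; runtime address of .pass2, no textrel
    jmp     ecx
.pass2:
    mov     eax, 2
    ret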
@@ -188,10 +195,10 @@ ALIGN function_align
 %macro INV_TXFM_4X4_FN 2-3 -1 ; type1, type2, fast_thresh
     INV_TXFM_FN %1, %2, %3, 4x4
 %ifidn %1_%2, dct_identity
-    mova m0, [pw_2896x8]
+    mova m0, [o(pw_2896x8)]
     pmulhrsw m0, [coeffq]
     paddw m0, m0
-    pmulhrsw m0, [pw_5793x4]
+    pmulhrsw m0, [o(pw_5793x4)]
     punpcklwd m0, m0
     punpckhdq m1, m0, m0
     punpckldq m0, m0
@@ -205,8 +212,8 @@ ALIGN function_align
     punpcklwd m0, m1
     punpcklqdq m0, m0
     paddw m0, m0
-    pmulhrsw m0, [pw_5793x4]
-    pmulhrsw m0, [pw_2896x8]
+    pmulhrsw m0, [o(pw_5793x4)]
+    pmulhrsw m0, [o(pw_2896x8)]
     mova m1, m0
     call m(iadst_4x4_internal).end
     RET
@@ -214,17 +221,17 @@ ALIGN function_align
     pshuflw m0, [coeffq], q0000
     punpcklqdq m0, m0
 %ifidn %1, dct
-    mova m1, [pw_2896x8]
+    mova m1, [o(pw_2896x8)]
     pmulhrsw m0, m1
 %elifidn %1, adst
-    pmulhrsw m0, [iadst4_dconly1a]
+    pmulhrsw m0, [o(iadst4_dconly1a)]
 %elifidn %1, flipadst
-    pmulhrsw m0, [iadst4_dconly1b]
+    pmulhrsw m0, [o(iadst4_dconly1b)]
 %endif
     mov [coeffq], eobd ;0
 %ifidn %2, dct
 %ifnidn %1, dct
-    pmulhrsw m0, [pw_2896x8]
+    pmulhrsw m0, [o(pw_2896x8)]
 %else
     pmulhrsw m0, m1
 %endif
@@ -232,24 +239,28 @@ ALIGN function_align
     call m(iadst_4x4_internal).end2
     RET
 %else ; adst / flipadst
-    pmulhrsw m1, m0, [iadst4_dconly2b]
-    pmulhrsw m0, [iadst4_dconly2a]
+    pmulhrsw m1, m0, [o(iadst4_dconly2b)]
+    pmulhrsw m0, [o(iadst4_dconly2a)]
     call m(i%2_4x4_internal).end2
     RET
 %endif
 %endif
 %endmacro

 INIT_XMM ssse3
+INV_TXFM_4X4_FN dct, dct, 0
+INV_TXFM_4X4_FN dct, adst, 0
+INV_TXFM_4X4_FN dct, flipadst, 0
+INV_TXFM_4X4_FN dct, identity, 3

 cglobal idct_4x4_internal, 0, 0, 4, dst, stride, coeff, eob, tx2
     mova m0, [coeffq+16*0] ;high: in1 ;low: in0
     mova m1, [coeffq+16*1] ;high: in3 ;low in2
     IDCT4_1D_PACKED
-    mova m2, [deint_shuf]
+    mova m2, [o(deint_shuf)]
     shufps m3, m0, m1, q1331
     shufps m0, m1, q0220
     pshufb m0, m2 ;high: in1 ;low: in0
@@ -265,7 +276,10 @@ cglobal idct_4x4_internal, 0, 0, 4, dst, stride, coeff, eob, tx2
     ITX4_END 0, 1, 3, 2

-INV_TXFM_4X4_FN dct, dct, 0
+INV_TXFM_4X4_FN adst, dct, 0
+INV_TXFM_4X4_FN adst, adst, 0
+INV_TXFM_4X4_FN adst, flipadst, 0
+INV_TXFM_4X4_FN adst, identity

 cglobal iadst_4x4_internal, 0, 0, 6, dst, stride, coeff, eob, tx2
     mova m0, [coeffq+16*0]
@@ -294,9 +308,10 @@ ALIGN function_align
     IADST4_1D_PACKED
     ret

-INV_TXFM_4X4_FN adst, adst, 0
-INV_TXFM_4X4_FN dct, adst, 0
-INV_TXFM_4X4_FN adst, dct, 0
+INV_TXFM_4X4_FN flipadst, dct, 0
+INV_TXFM_4X4_FN flipadst, adst, 0
+INV_TXFM_4X4_FN flipadst, flipadst, 0
+INV_TXFM_4X4_FN flipadst, identity

 cglobal iflipadst_4x4_internal, 0, 0, 6, dst, stride, coeff, eob, tx2
     mova m0, [coeffq+16*0]
.end2: .end2:
ITX4_END 3, 2, 1, 0 ITX4_END 3, 2, 1, 0
INV_TXFM_4X4_FN flipadst, flipadst, 0 INV_TXFM_4X4_FN identity, dct, 3
INV_TXFM_4X4_FN flipadst, dct, 0 INV_TXFM_4X4_FN identity, adst
INV_TXFM_4X4_FN flipadst, adst, 0 INV_TXFM_4X4_FN identity, flipadst
INV_TXFM_4X4_FN dct, flipadst, 0 INV_TXFM_4X4_FN identity, identity
INV_TXFM_4X4_FN adst, flipadst, 0
cglobal iidentity_4x4_internal, 0, 0, 6, dst, stride, coeff, eob, tx2 cglobal iidentity_4x4_internal, 0, 0, 6, dst, stride, coeff, eob, tx2
mova m0, [coeffq+16*0] mova m0, [coeffq+16*0]
mova m1, [coeffq+16*1] mova m1, [coeffq+16*1]
mova m2, [pw_5793x4] mova m2, [o(pw_5793x4)]
paddw m0, m0 paddw m0, m0
paddw m1, m1 paddw m1, m1
pmulhrsw m0, m2 pmulhrsw m0, m2
@@ -343,21 +357,13 @@ cglobal iidentity_4x4_internal, 0, 0, 6, dst, stride, coeff, eob, tx2
     jmp tx2q
 .pass2:
-    mova m2, [pw_5793x4]
+    mova m2, [o(pw_5793x4)]
     paddw m0, m0
     paddw m1, m1
     pmulhrsw m0, m2
     pmulhrsw m1, m2
     jmp m(iadst_4x4_internal).end

-INV_TXFM_4X4_FN identity, identity
-INV_TXFM_4X4_FN identity, dct, 3
-INV_TXFM_4X4_FN identity, adst
-INV_TXFM_4X4_FN identity, flipadst
-INV_TXFM_4X4_FN dct, identity, 3
-INV_TXFM_4X4_FN adst, identity
-INV_TXFM_4X4_FN flipadst, identity
-
 %macro IWHT4_1D_PACKED 0
     punpckhqdq m3, m0, m1 ;low: in1 high: in3
     punpcklqdq m0, m1 ;low: in0 high: in2
@@ -186,7 +186,7 @@ DECLARE_REG_TMP 6, 7
 %endmacro

 cglobal avg, 4, 7, 3, dst, stride, tmp1, tmp2, w, h, stride3
-    lea r6, [avg_ssse3_table]
+    LEA r6, avg_ssse3_table
     tzcnt wd, wm ; leading zeros
     movifnidn hd, hm ; move h(stack) to h(register) if not already that register
     movsxd wq, dword [r6+wq*4] ; push table entry matching the tile width (tzcnt) in widen reg
@@ -216,7 +216,7 @@ cglobal avg, 4, 7, 3, dst, stride, tmp1, tmp2, w, h, stride3
 %define W_AVG_INC_PTR AVG_INC_PTR

 cglobal w_avg, 4, 7, 6, dst, stride, tmp1, tmp2, w, h, stride3
-    lea r6, [w_avg_ssse3_table]
+    LEA r6, w_avg_ssse3_table
     tzcnt wd, wm
     movifnidn hd, hm
     movd m0, r6m
@@ -269,11 +269,12 @@ cglobal mask, 4, 8, 7, dst, stride, tmp1, tmp2, w, h, mask, stride3
 cglobal mask, 4, 7, 7, dst, stride, tmp1, tmp2, w, mask, stride3
 %define hd dword r5m
 %endif
-    lea r6, [mask_ssse3_table]
+%define base r6-mask_ssse3_table
+    LEA r6, mask_ssse3_table
     tzcnt wd, wm
     movsxd wq, dword [r6+wq*4]
     pxor m4, m4
-    mova m5, [pw_2048+r6-mask_ssse3_table]
+    mova m5, [base+pw_2048]
     add wq, r6
     mov maskq, r6m
     BIDIR_FN MASK
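The `base` define squeezes a second use out of the register `LEA` just loaded: with r6 holding the runtime address of `mask_ssse3_table`, any nearby constant is `[r6 + (constant - table)]`, a link-time-constant displacement, and the jump table can store table-relative entries that `add wq, r6` turns absolute. A standalone sketch of the combined pattern, all names illustrative, with `esi`/`ecx` standing in for r6/wq (nasm -f elf32):

; base_idiom.asm: one anchor for constants and a relative jump table
BITS 32

SECTION .rodata
align 16
pw_2048:   times 8 dw 2048
the_table: dd w4_entry-the_table, w8_entry-the_table ; table-relative entries

SECTION .text
%define base esi-the_table      ; esi stands in for x86inc's r6

dispatch_w:                     ; ecx = index (0 for w4, 1 for w8)
    call    .pic
.pic:
    pop     esi
    add     esi, the_table-.pic ; esi = &the_table (what LEA produces)
    movdqa  xmm5, [base+pw_2048]; constant reached via the same anchor
    mov     eax, [esi+ecx*4]    ; relative entry...
    add     eax, esi            ; ...made absolute
    jmp     eax
w4_entry:
    mov     eax, 4
    ret
w8_entry:
    mov     eax, 8
    ret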
@@ -284,9 +285,9 @@ cglobal mask, 4, 7, 7, dst, stride, tmp1, tmp2, w, mask, stride3
 %define reg_pw_27 m9
 %define reg_pw_2048 m10
 %else
-%define reg_pw_8 [pw_8]
-%define reg_pw_27 [pw_26] ; 64 - 38
-%define reg_pw_2048 [pw_2048]
+%define reg_pw_8 [base+pw_8]
+%define reg_pw_27 [base+pw_26] ; 64 - 38
+%define reg_pw_2048 [base+pw_2048]
 %endif

 %macro W_MASK_420_B 2 ; src_offset in bytes, mask_out
@@ -323,63 +324,60 @@ cglobal mask, 4, 7, 7, dst, stride, tmp1, tmp2, w, mask, stride3
     W_MASK_420_B (%1*16), %2
 %endmacro

+%define base r6-w_mask_420_ssse3_table
 %if ARCH_X86_64
 ; args: dst, stride, tmp1, tmp2, w, h, mask, sign
-cglobal w_mask_420, 4, 9, 11, dst, stride, tmp1, tmp2, w, h, mask, stride3
-    lea r7, [w_mask_420_ssse3_table]
+cglobal w_mask_420, 4, 8, 11, dst, stride, tmp1, tmp2, w, h, mask
+    lea r6, [w_mask_420_ssse3_table]
     mov wd, wm
-    tzcnt r8d, wd
+    tzcnt r7d, wd
     movifnidn hd, hm
-    mov maskq, maskmp
     movd m0, r7m
     pshuflw m0, m0, q0000 ; sign
     punpcklqdq m0, m0
-    movsxd r8, dword [r7+r8*4]
-    mova reg_pw_8, [pw_8]
-    mova reg_pw_27, [pw_26] ; 64 - 38
-    mova reg_pw_2048, [pw_2048]
-    mova m6, [pw_258] ; 64 * 4 + 2
+    movsxd r7, [r6+r7*4]
+    mova reg_pw_8, [base+pw_8]
+    mova reg_pw_27, [base+pw_26] ; 64 - 38
+    mova reg_pw_2048, [base+pw_2048]
+    mova m6, [base+pw_258] ; 64 * 4 + 2
+    add r7, r6
+    mov maskq, maskmp
     psubw m6, m0
-    add r8, r7
     W_MASK_420 0, 4
-    lea stride3q, [strideq*3]
-    jmp r8
-%define dst_bak r8
-%define loop_w r7
-%define orig_w wq
+    jmp r7
+%define loop_w r7d
 %else
-cglobal w_mask_420, 4, 7, 8, dst, stride, tmp1, tmp2, w, mask, stride3
-    tzcnt r6d, r4m
-    mov wd, w_mask_420_ssse3_table
-    add wd, [wq+r6*4]
+cglobal w_mask_420, 4, 7, 8, dst, stride, tmp1, tmp2, w, mask
+    tzcnt wd, wm
+    LEA r6, w_mask_420_ssse3_table
+    mov wd, [r6+wq*4]
     mov maskq, r6mp
     movd m0, r7m
     pshuflw m0, m0, q0000 ; sign
     punpcklqdq m0, m0
-    mova m6, [pw_258] ; 64 * 4 + 2
+    mova m6, [base+pw_258] ; 64 * 4 + 2
+    add wq, r6
     psubw m6, m0
     W_MASK_420 0, 4
-    lea stride3q, [strideq*3]
     jmp wd
-%define dst_bak r0m
-%define loop_w r6q
-%define orig_w r4m
-%define hd dword r5m
+%define loop_w dword r0m
+%define hd dword r5m
 %endif
 .w4_loop:
     add tmp1q, 2*16
     add tmp2q, 2*16
     W_MASK_420 0, 4
-    lea dstq, [dstq+strideq*4]
+    lea dstq, [dstq+strideq*2]
     add maskq, 4
 .w4:
     movd [dstq ], m0 ; copy m0[0]
     pshuflw m1, m0, q1032
     movd [dstq+strideq*1], m1 ; copy m0[1]
+    lea dstq, [dstq+strideq*2]
     punpckhqdq m0, m0
-    movd [dstq+strideq*2], m0 ; copy m0[2]
+    movd [dstq+strideq*0], m0 ; copy m0[2]
     psrlq m0, 32
-    movd [dstq+stride3q ], m0 ; copy m0[3]
+    movd [dstq+strideq*1], m0 ; copy m0[3]
     pshufd m5, m4, q3131; DBDB even lines repeated
     pshufd m4, m4, q2020; CACA odd lines repeated
     psubw m1, m6, m4 ; m9 == 64 * 4 + 2
@@ -409,20 +407,19 @@ cglobal w_mask_420, 4, 7, 8, dst, stride, tmp1, tmp2, w, mask, stride3
     jg .w8_loop
     RET
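The `.w4` rewrite above is a register-pressure fix rather than a PIC one: dropping `stride3q` frees a GPR on x86-32, so the four-row store walks `dstq` in two-row steps instead of addressing rows 0 to 3 through a precomputed stride*3. The addressing pattern in isolation, with `eax`/`edx` standing in for dstq/strideq (nasm -f elf32):

; store4.asm: four-row column store without a stride*3 register
BITS 32

SECTION .text
; eax = dst, edx = stride; xmm0 packs four 4-byte rows (two per qword)
store_4x4:
    movd    [eax], xmm0          ; row 0
    pshuflw xmm1, xmm0, 0x4e     ; swap dwords in the low qword (q1032)
    movd    [eax+edx], xmm1      ; row 1
    lea     eax, [eax+edx*2]     ; step two rows: no stride*3 needed
    punpckhqdq xmm0, xmm0        ; bring the high qword down
    movd    [eax], xmm0          ; row 2
    psrlq   xmm0, 32
    movd    [eax+edx], xmm0      ; row 3
    ret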