Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in / Register
Toggle navigation
Menu
Open sidebar
VideoLAN
dav1d
Commits
c2292efc
Commit
c2292efc
authored
Dec 16, 2018
by
Henrik Gramner
Browse files
Implement support for PIC in x86-32 asm
Convert all existing 32-bit SSSE3 asm to use PIC.
parent
7cb756ea
Changes
6
Hide whitespace changes
Inline
Side-by-side
meson.build
View file @
c2292efc
...
...
@@ -256,13 +256,12 @@ if host_machine.cpu_family().startswith('x86')
cdata.set10('ARCH_X86_64', true)
cdata_asm.set10('ARCH_X86_32', false)
cdata.set10('ARCH_X86_32', false)
cdata_asm.set10('PIC', true)
else
cdata_asm.set10('ARCH_X86_64', false)
cdata.set10('ARCH_X86_64', false)
cdata_asm.set10('ARCH_X86_32', true)
cdata.set10('ARCH_X86_32', true)
cdata_asm.set10('PIC', true)
endif
else
cdata.set10('ARCH_X86', false)
...
...
src/ext/x86/x86inc.asm
View file @
c2292efc
...
...
@@ -89,16 +89,13 @@
%endif
%endmacro
%if WIN64
%define PIC
%elif ARCH_X86_64 == 0
; x86_32 doesn't require PIC.
; Some distros prefer shared objects to be PIC, but nothing breaks if
; the code contains a few textrels, so we'll skip that complexity.
%undef PIC
%endif
%ifdef PIC
%if ARCH_X86_64
%define PIC 1
; always use PIC on x86-64
default
rel
%elifidn __OUTPUT_FORMAT__,win32
%define PIC 0
; PIC isn't used on 32-bit Windows
%elifndef PIC
%define PIC 0
%endif
%ifdef __NASM_VER__
...
...
@@ -220,6 +217,18 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
%define gprsize 4
%endif
%macro LEA 2
%if ARCH_X86_64
lea
%
1
,
[
%
2
]
%elif PIC
call
$
+
5
; special-cased to not affect the RSB on most CPU:s
pop
%
1
add
%
1
,
(
%
2
)
-
$
+
1
%else
mov
%
1
,
%
2
%endif
%endmacro
%macro PUSH 1
push
%
1
%ifidn rstk, rsp
...
...
src/x86/ipred_ssse3.asm
View file @
c2292efc
...
...
@@ -93,7 +93,7 @@ SECTION .text
INIT_XMM
ss
se3
cglobal
ipred_h
,
3
,
6
,
2
,
ds
t
,
stride
,
tl
,
w
,
h
,
stride3
lea
r5
,
[
ipred_h_ssse3_table
]
LEA
r5
,
ipred_h_ssse3_table
tzcnt
wd
,
wm
movifnidn
hd
,
hm
movsxd
wq
,
[
r5
+
wq
*
4
]
...
...
src/x86/itx_ssse3.asm
View file @
c2292efc
...
...
@@ -55,9 +55,15 @@ SECTION .text
%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX)
%if ARCH_X86_64
%define o(x) x
%else
%define o(x) r5-$$+x
; PIC
%endif
%macro ITX4_END 4-5 2048
; row[1-4], rnd
%if %5
mova
m2
,
[
pw_
%
5
]
mova
m2
,
[
o
(
pw_
%
5
)
]
pmulhrsw
m0
,
m2
pmulhrsw
m1
,
m2
%endif
...
...
@@ -100,18 +106,17 @@ SECTION .text
ret
%endmacro
; flags: 1 = swap, 2: coef_regs
%macro ITX_MUL2X_PACK 5-6 0
; dst/src, tmp[1], rnd, coef[1-2], flags
%if %6 & 2
pmaddwd
m
%
2
,
m
%
4
,
m
%
1
pmaddwd
m
%
1
,
m
%
5
%elif %6 & 1
pmaddwd
m
%
2
,
m
%
1
,
[
pw_
%
5
_
%
4
]
pmaddwd
m
%
2
,
m
%
1
,
[
o
(
pw_
%
5
_
%
4
)
]
pmaddwd
m
%
1
,
[
pw_
%
4
_m
%
5
]
%else
pmaddwd
m
%
2
,
m
%
1
,
[
pw_
%
4
_m
%
5
]
pmaddwd
m
%
1
,
[
pw_
%
5
_
%
4
]
pmaddwd
m
%
2
,
m
%
1
,
[
o
(
pw_
%
4
_m
%
5
)
]
pmaddwd
m
%
1
,
[
o
(
pw_
%
5
_
%
4
)
]
%endif
paddd
m
%
2
,
m
%
3
paddd
m
%
1
,
m
%
3
...
...
@@ -126,13 +131,13 @@ SECTION .text
paddw
m0
,
m1
punpcklqdq
m0
,
m3
;high: in0-in2 ;low: in0+in2
mova
m3
,
[
pd_2048
]
mova
m3
,
[
o
(
pd_2048
)
]
ITX_MUL2X_PACK
2
,
1
,
3
,
1567
,
3784
%if %0 == 1
pmulhrsw
m0
,
m
%
1
%else
pmulhrsw
m0
,
[
pw_2896x8
]
;high: t1 ;low: t0
pmulhrsw
m0
,
[
o
(
pw_2896x8
)
]
;high: t1 ;low: t0
%endif
psubsw
m1
,
m0
,
m2
;high: out2 ;low: out3
...
...
@@ -146,15 +151,14 @@ SECTION .text
punpckhqdq
m1
,
m1
;
paddw
m1
,
m0
;low: in0 - in2 + in3
pmaddwd
m0
,
m2
,
[
pw_1321_3803
]
;1321 * in0 + 3803 * in2
pmaddwd
m2
,
[
pw_2482_m1321
]
;2482 * in0 - 1321 * in2
pmaddwd
m4
,
m3
,
[
pw_3344_2482
]
;3344 * in1 + 2482 * in3
pmaddwd
m5
,
m3
,
[
pw_3344_m3803
]
;3344 * in1 - 3803 * in3
pmaddwd
m0
,
m2
,
[
o
(
pw_1321_3803
)
]
;1321 * in0 + 3803 * in2
pmaddwd
m2
,
[
o
(
pw_2482_m1321
)
]
;2482 * in0 - 1321 * in2
pmaddwd
m4
,
m3
,
[
o
(
pw_3344_2482
)
]
;3344 * in1 + 2482 * in3
pmaddwd
m5
,
m3
,
[
o
(
pw_3344_m3803
)
]
;3344 * in1 - 3803 * in3
paddd
m4
,
m0
;t0 + t3
pmaddwd
m3
,
[
pw_m6688_m3803
]
;-2 * 3344 * in1 - 3803 * in3
pmulhrsw
m1
,
[
pw_3344x8
]
;low: out2
mova
m0
,
[
pd_2048
]
pmaddwd
m3
,
[
o
(
pw_m6688_m3803
)]
;-2 * 3344 * in1 - 3803 * in3
pmulhrsw
m1
,
[
o
(
pw_3344x8
)]
;low: out2
mova
m0
,
[
o
(
pd_2048
)]
paddd
m2
,
m0
paddd
m0
,
m4
;t0 + t3 + 2048
paddd
m5
,
m2
;t1 + t3 + 2048
...
...
@@ -169,9 +173,11 @@ SECTION .text
%endmacro
%macro INV_TXFM_FN 4
; type1, type2, fast_thresh, size
cglobal
inv_txfm_add_
%
1
_
%
2
_
%
4
,
4
,
5
,
0
,
ds
t
,
stride
,
coeff
,
eob
,
tx2
cglobal
inv_txfm_add_
%
1
_
%
2
_
%
4
,
4
,
6
,
0
,
ds
t
,
stride
,
coeff
,
eob
,
tx2
%undef cmp
lea
tx2q
,
[
m
(
i
%
2
_
%
4
_internal
)
.pass2
]
%if ARCH_X86_32
LEA
r5
,
$$
%endif
%if %3 > 0
cmp
eobd
,
%
3
jle
%%
end
...
...
@@ -179,7 +185,8 @@ cglobal inv_txfm_add_%1_%2_%4, 4, 5, 0, dst, stride, coeff, eob, tx2
test
eobd
,
eobd
jz
%%
end
%endif
call
i
%
1
_
%
4
_internal
lea
tx2q
,
[
o
(
m
(
i
%
2
_
%
4
_internal
)
.pass2
)]
call
m
(
i
%
1
_
%
4
_internal
)
RET
ALIGN
function_align
%%end:
...
...
@@ -188,10 +195,10 @@ ALIGN function_align
%macro INV_TXFM_4X4_FN 2-3 -1
; type1, type2, fast_thresh
INV_TXFM_FN
%
1
,
%
2
,
%
3
,
4
x4
%ifidn %1_%2, dct_identity
mova
m0
,
[
pw_2896x8
]
mova
m0
,
[
o
(
pw_2896x8
)
]
pmulhrsw
m0
,
[
coeffq
]
paddw
m0
,
m0
pmulhrsw
m0
,
[
pw_5793x4
]
pmulhrsw
m0
,
[
o
(
pw_5793x4
)
]
punpcklwd
m0
,
m0
punpckhdq
m1
,
m0
,
m0
punpckldq
m0
,
m0
...
...
@@ -205,8 +212,8 @@ ALIGN function_align
punpcklwd
m0
,
m1
punpcklqdq
m0
,
m0
paddw
m0
,
m0
pmulhrsw
m0
,
[
pw_5793x4
]
pmulhrsw
m0
,
[
pw_2896x8
]
pmulhrsw
m0
,
[
o
(
pw_5793x4
)
]
pmulhrsw
m0
,
[
o
(
pw_2896x8
)
]
mova
m1
,
m0
call
m
(
iadst_4x4_internal
)
.end
RET
...
...
@@ -214,17 +221,17 @@ ALIGN function_align
pshuflw
m0
,
[
coeffq
],
q0000
punpcklqdq
m0
,
m0
%ifidn %1, dct
mova
m1
,
[
pw_2896x8
]
mova
m1
,
[
o
(
pw_2896x8
)
]
pmulhrsw
m0
,
m1
%elifidn %1, adst
pmulhrsw
m0
,
[
iadst4_dconly1a
]
pmulhrsw
m0
,
[
o
(
iadst4_dconly1a
)
]
%elifidn %1, flipadst
pmulhrsw
m0
,
[
iadst4_dconly1b
]
pmulhrsw
m0
,
[
o
(
iadst4_dconly1b
)
]
%endif
mov
[
coeffq
],
eobd
;0
%ifidn %2, dct
%ifnidn %1, dct
pmulhrsw
m0
,
[
pw_2896x8
]
pmulhrsw
m0
,
[
o
(
pw_2896x8
)
]
%else
pmulhrsw
m0
,
m1
%endif
...
...
@@ -232,24 +239,28 @@ ALIGN function_align
call
m
(
iadst_4x4_internal
)
.end2
RET
%else
; adst / flipadst
pmulhrsw
m1
,
m0
,
[
iadst4_dconly2b
]
pmulhrsw
m0
,
[
iadst4_dconly2a
]
pmulhrsw
m1
,
m0
,
[
o
(
iadst4_dconly2b
)
]
pmulhrsw
m0
,
[
o
(
iadst4_dconly2a
)
]
call
m
(
i
%
2
_4x4_internal
)
.end2
RET
%endif
%endif
%endmacro
INIT_XMM
ss
se3
INV_TXFM_4X4_FN
dct
,
dct
,
0
INV_TXFM_4X4_FN
dct
,
adst
,
0
INV_TXFM_4X4_FN
dct
,
flipadst
,
0
INV_TXFM_4X4_FN
dct
,
identity
,
3
cglobal
idct_4x4_internal
,
0
,
0
,
4
,
ds
t
,
stride
,
coeff
,
eob
,
tx2
mova
m0
,
[
coeffq
+
16
*
0
]
;high: in1 ;low: in0
mova
m1
,
[
coeffq
+
16
*
1
]
;high: in3 ;low in2
IDCT4_1D_PACKED
mova
m2
,
[
deint_shuf
]
mova
m2
,
[
o
(
deint_shuf
)
]
shufps
m3
,
m0
,
m1
,
q1331
shufps
m0
,
m1
,
q0220
pshufb
m0
,
m2
;high: in1 ;low: in0
...
...
@@ -265,7 +276,10 @@ cglobal idct_4x4_internal, 0, 0, 4, dst, stride, coeff, eob, tx2
ITX4_END
0
,
1
,
3
,
2
INV_TXFM_4X4_FN
dct
,
dct
,
0
INV_TXFM_4X4_FN
adst
,
dct
,
0
INV_TXFM_4X4_FN
adst
,
adst
,
0
INV_TXFM_4X4_FN
adst
,
flipadst
,
0
INV_TXFM_4X4_FN
adst
,
identity
cglobal
iadst_4x4_internal
,
0
,
0
,
6
,
ds
t
,
stride
,
coeff
,
eob
,
tx2
mova
m0
,
[
coeffq
+
16
*
0
]
...
...
@@ -294,9 +308,10 @@ ALIGN function_align
IADST4_1D_PACKED
ret
INV_TXFM_4X4_FN
adst
,
adst
,
0
INV_TXFM_4X4_FN
dct
,
adst
,
0
INV_TXFM_4X4_FN
adst
,
dct
,
0
INV_TXFM_4X4_FN
flipadst
,
dct
,
0
INV_TXFM_4X4_FN
flipadst
,
adst
,
0
INV_TXFM_4X4_FN
flipadst
,
flipadst
,
0
INV_TXFM_4X4_FN
flipadst
,
identity
cglobal
iflipadst_4x4_internal
,
0
,
0
,
6
,
ds
t
,
stride
,
coeff
,
eob
,
tx2
mova
m0
,
[
coeffq
+
16
*
0
]
...
...
@@ -321,16 +336,15 @@ cglobal iflipadst_4x4_internal, 0, 0, 6, dst, stride, coeff, eob, tx2
.end2:
ITX4_END
3
,
2
,
1
,
0
INV_TXFM_4X4_FN
flipadst
,
flipadst
,
0
INV_TXFM_4X4_FN
flipadst
,
dct
,
0
INV_TXFM_4X4_FN
flipadst
,
adst
,
0
INV_TXFM_4X4_FN
dct
,
flipadst
,
0
INV_TXFM_4X4_FN
adst
,
flipadst
,
0
INV_TXFM_4X4_FN
identity
,
dct
,
3
INV_TXFM_4X4_FN
identity
,
adst
INV_TXFM_4X4_FN
identity
,
flipadst
INV_TXFM_4X4_FN
identity
,
identity
cglobal
iidentity_4x4_internal
,
0
,
0
,
6
,
ds
t
,
stride
,
coeff
,
eob
,
tx2
mova
m0
,
[
coeffq
+
16
*
0
]
mova
m1
,
[
coeffq
+
16
*
1
]
mova
m2
,
[
pw_5793x4
]
mova
m2
,
[
o
(
pw_5793x4
)
]
paddw
m0
,
m0
paddw
m1
,
m1
pmulhrsw
m0
,
m2
...
...
@@ -343,21 +357,13 @@ cglobal iidentity_4x4_internal, 0, 0, 6, dst, stride, coeff, eob, tx2
jmp
tx2q
.pass2:
mova
m2
,
[
pw_5793x4
]
mova
m2
,
[
o
(
pw_5793x4
)
]
paddw
m0
,
m0
paddw
m1
,
m1
pmulhrsw
m0
,
m2
pmulhrsw
m1
,
m2
jmp
m
(
iadst_4x4_internal
)
.end
INV_TXFM_4X4_FN
identity
,
identity
INV_TXFM_4X4_FN
identity
,
dct
,
3
INV_TXFM_4X4_FN
identity
,
adst
INV_TXFM_4X4_FN
identity
,
flipadst
INV_TXFM_4X4_FN
dct
,
identity
,
3
INV_TXFM_4X4_FN
adst
,
identity
INV_TXFM_4X4_FN
flipadst
,
identity
%macro IWHT4_1D_PACKED 0
punpckhqdq
m3
,
m0
,
m1
;low: in1 high: in3
punpcklqdq
m0
,
m1
;low: in0 high: in2
...
...
src/x86/mc_ssse3.asm
View file @
c2292efc
...
...
@@ -186,7 +186,7 @@ DECLARE_REG_TMP 6, 7
%endmacro
cglobal
avg
,
4
,
7
,
3
,
ds
t
,
stride
,
tmp1
,
tmp2
,
w
,
h
,
stride3
lea
r6
,
[
avg_ssse3_table
]
LEA
r6
,
avg_ssse3_table
tzcnt
wd
,
wm
; leading zeros
movifnidn
hd
,
hm
; move h(stack) to h(register) if not already that register
movsxd
wq
,
dword
[
r6
+
wq
*
4
]
; push table entry matching the tile width (tzcnt) in widen reg
...
...
@@ -216,7 +216,7 @@ cglobal avg, 4, 7, 3, dst, stride, tmp1, tmp2, w, h, stride3
%define W_AVG_INC_PTR AVG_INC_PTR
cglobal
w_avg
,
4
,
7
,
6
,
ds
t
,
stride
,
tmp1
,
tmp2
,
w
,
h
,
stride3
lea
r6
,
[
w_avg_ssse3_table
]
LEA
r6
,
w_avg_ssse3_table
tzcnt
wd
,
wm
movifnidn
hd
,
hm
movd
m0
,
r6m
...
...
@@ -269,11 +269,12 @@ cglobal mask, 4, 8, 7, dst, stride, tmp1, tmp2, w, h, mask, stride3
cglobal
mask
,
4
,
7
,
7
,
ds
t
,
stride
,
tmp1
,
tmp2
,
w
,
mask
,
stride3
%define hd dword r5m
%endif
lea
r6
,
[
mask_ssse3_table
]
%define base r6-mask_ssse3_table
LEA
r6
,
mask_ssse3_table
tzcnt
wd
,
wm
movsxd
wq
,
dword
[
r6
+
wq
*
4
]
pxor
m4
,
m4
mova
m5
,
[
pw_2048
+
r6
-
mask_ssse3_table
]
mova
m5
,
[
base
+
pw_2048
]
add
wq
,
r6
mov
maskq
,
r6m
BIDIR_FN
MASK
...
...
@@ -284,9 +285,9 @@ cglobal mask, 4, 7, 7, dst, stride, tmp1, tmp2, w, mask, stride3
%define reg_pw_27 m9
%define reg_pw_2048 m10
%else
%define reg_pw_8 [pw_8]
%define reg_pw_27 [pw_26]
; 64 - 38
%define reg_pw_2048 [pw_2048]
%define reg_pw_8 [
base+
pw_8]
%define reg_pw_27 [
base+
pw_26]
; 64 - 38
%define reg_pw_2048 [
base+
pw_2048]
%endif
%macro W_MASK_420_B 2
; src_offset in bytes, mask_out
...
...
@@ -323,63 +324,60 @@ cglobal mask, 4, 7, 7, dst, stride, tmp1, tmp2, w, mask, stride3
W_MASK_420_B
(
%
1
*
16
),
%
2
%endmacro
%define base r6-w_mask_420_ssse3_table
%if ARCH_X86_64
; args: dst, stride, tmp1, tmp2, w, h, mask, sign
cglobal
w_mask_420
,
4
,
9
,
11
,
ds
t
,
stride
,
tmp1
,
tmp2
,
w
,
h
,
mask
,
stride3
lea
r
7
,
[
w_mask_420_ssse3_table
]
cglobal
w_mask_420
,
4
,
8
,
11
,
ds
t
,
stride
,
tmp1
,
tmp2
,
w
,
h
,
mask
lea
r
6
,
[
w_mask_420_ssse3_table
]
mov
wd
,
wm
tzcnt
r
8
d
,
wd
tzcnt
r
7
d
,
wd
movifnidn
hd
,
hm
mov
maskq
,
maskmp
movd
m0
,
r7m
pshuflw
m0
,
m0
,
q0000
; sign
punpcklqdq
m0
,
m0
movsxd
r8
,
dword
[
r7
+
r8
*
4
]
mova
reg_pw_8
,
[
pw_8
]
mova
reg_pw_27
,
[
pw_26
]
; 64 - 38
mova
reg_pw_2048
,
[
pw_2048
]
mova
m6
,
[
pw_258
]
; 64 * 4 + 2
movsxd
r7
,
[
r6
+
r7
*
4
]
mova
reg_pw_8
,
[
base
+
pw_8
]
mova
reg_pw_27
,
[
base
+
pw_26
]
; 64 - 38
mova
reg_pw_2048
,
[
base
+
pw_2048
]
mova
m6
,
[
base
+
pw_258
]
; 64 * 4 + 2
add
r7
,
r6
mov
maskq
,
maskmp
psubw
m6
,
m0
add
r8
,
r7
W_MASK_420
0
,
4
lea
stride3q
,
[
strideq
*
3
]
jmp
r8
%define dst_bak r8
%define loop_w r7
%define orig_w wq
jmp
r7
%define loop_w r7d
%else
cglobal
w_mask_420
,
4
,
7
,
8
,
ds
t
,
stride
,
tmp1
,
tmp2
,
w
,
mask
,
stride3
tzcnt
r6
d
,
r4
m
mov
wd
,
w_mask_420_ssse3_table
add
wd
,
[
wq
+
r6
*
4
]
cglobal
w_mask_420
,
4
,
7
,
8
,
ds
t
,
stride
,
tmp1
,
tmp2
,
w
,
mask
tzcnt
w
d
,
w
m
LEA
r6
,
w_mask_420_ssse3_table
mov
wd
,
[
r6
+
wq
*
4
]
mov
maskq
,
r6mp
movd
m0
,
r7m
pshuflw
m0
,
m0
,
q0000
; sign
punpcklqdq
m0
,
m0
mova
m6
,
[
pw_258
]
; 64 * 4 + 2
mova
m6
,
[
base
+
pw_258
]
; 64 * 4 + 2
add
wq
,
r6
psubw
m6
,
m0
W_MASK_420
0
,
4
lea
stride3q
,
[
strideq
*
3
]
jmp
wd
%define dst_bak r0m
%define loop_w r6q
%define orig_w r4m
%define hd dword r5m
%define loop_w dword r0m
%define hd dword r5m
%endif
.w4_loop:
add
tmp1q
,
2
*
16
add
tmp2q
,
2
*
16
W_MASK_420
0
,
4
lea
ds
tq
,
[
ds
tq
+
strideq
*
4
]
lea
ds
tq
,
[
ds
tq
+
strideq
*
2
]
add
maskq
,
4
.w4:
movd
[
ds
tq
],
m0
; copy m0[0]
pshuflw
m1
,
m0
,
q1032
movd
[
ds
tq
+
strideq
*
1
],
m1
; copy m0[1]
lea
ds
tq
,
[
ds
tq
+
strideq
*
2
]
punpckhqdq
m0
,
m0
movd
[
ds
tq
+
strideq
*
2
],
m0
; copy m0[2]
movd
[
ds
tq
+
strideq
*
0
],
m0
; copy m0[2]
psrlq
m0
,
32
movd
[
ds
tq
+
stride
3q
],
m0
; copy m0[3]
movd
[
ds
tq
+
stride
q
*
1
],
m0
; copy m0[3]
pshufd
m5
,
m4
,
q3131
; DBDB even lines repeated
pshufd
m4
,
m4
,
q2020
; CACA odd lines repeated
psubw
m1
,
m6
,
m4
; m9 == 64 * 4 + 2
...
...
@@ -409,20 +407,19 @@ cglobal w_mask_420, 4, 7, 8, dst, stride, tmp1, tmp2, w, mask, stride3
jg
.w8_loop
RET
.w16:
; w32/64/128
mov
ds
t_bak
,
ds
tq
mov
loop_w
,
orig_w
; use width as counter
%if ARCH_X86_32
mov
w
q
,
orig_w
; because we altered it in 32bit setup
mov
w
d
,
wm
; because we altered it in 32bit setup
%endif
mov
loop_w
,
wd
; use width as counter
jmp
.w16ge_inner_loop_first
.w16ge_loop:
lea
tmp1q
,
[
tmp1q
+
wq
*
2
]
; skip even line pixels
lea
tmp2q
,
[
tmp2q
+
wq
*
2
]
; skip even line pixels
sub
ds
tq
,
wq
mov
loop_w
,
wd
lea
ds
tq
,
[
ds
tq
+
strideq
*
2
]
mov
ds
t_bak
,
ds
tq
mov
loop_w
,
orig_w
.w16ge_inner_loop:
W_MASK_420_B
0
,
4
W_MASK_420_B
0
,
4
.w16ge_inner_loop_first:
mova
[
ds
tq
],
m0
W_MASK_420_B
wq
*
2
,
5
; load matching even line (offset = widthpx * (16+16))
...
...
@@ -438,7 +435,6 @@ cglobal w_mask_420, 4, 7, 8, dst, stride, tmp1, tmp2, w, mask, stride3
add
ds
tq
,
16
sub
loop_w
,
16
jg
.w16ge_inner_loop
mov
ds
tq
,
ds
t_bak
sub
hd
,
2
jg
.w16ge_loop
RET
...
...
@@ -470,7 +466,7 @@ cglobal w_mask_420, 4, 7, 8, dst, stride, tmp1, tmp2, w, mask, stride3
cglobal
bl
end
,
3
,
7
,
7
,
ds
t
,
ds
,
tmp
,
w
,
h
,
mask
%define base r6-blend_ssse3_table
lea
r6
,
[
bl
end_ssse3_table
]
LEA
r6
,
bl
end_ssse3_table
tzcnt
wd
,
wm
movifnidn
hd
,
hm
movifnidn
maskq
,
maskmp
...
...
@@ -546,7 +542,7 @@ cglobal blend, 3, 7, 7, dst, ds, tmp, w, h, mask
cglobal
bl
end_v
,
3
,
6
,
8
,
ds
t
,
ds
,
tmp
,
w
,
h
,
mask
%define base r5-blend_v_ssse3_table
lea
r5
,
[
bl
end_v_ssse3_table
]
LEA
r5
,
bl
end_v_ssse3_table
tzcnt
wd
,
wm
movifnidn
hd
,
hm
movsxd
wq
,
dword
[
r5
+
wq
*
4
]
...
...
@@ -646,15 +642,21 @@ cglobal blend_v, 3, 6, 8, dst, ds, tmp, w, h, mask
jg
.w32_loop
RET
cglobal
bl
end_h
,
4
,
7
,
6
,
ds
t
,
ds
,
tmp
,
w
,
h
,
mask
%define base r5-blend_h_ssse3_table
lea
r5
,
[
bl
end_h_ssse3_table
]
cglobal
bl
end_h
,
3
,
7
,
6
,
ds
t
,
ds
,
tmp
,
w
,
h
,
mask
%define base t0-blend_h_ssse3_table
%if ARCH_X86_32
; We need to keep the PIC pointer for w4, reload wd from stack instead
DECLARE_REG_TMP
6
%else
DECLARE_REG_TMP
5
mov
r6d
,
wd
tzcnt
wd
,
wd
%endif
LEA
t0
,
bl
end_h_ssse3_table
tzcnt
wd
,
wm
mov
hd
,
hm
movsxd
wq
,
dword
[
r5
+
wq
*
4
]
movsxd
wq
,
dword
[
t0
+
wq
*
4
]
mova
m5
,
[
base
+
pw_512
]
add
wq
,
r5
add
wq
,
t0
lea
maskq
,
[
base
+
obmc_masks
+
hq
*
4
]
neg
hq
jmp
wq
...
...
@@ -678,7 +680,11 @@ cglobal blend_h, 4, 7, 6, dst, ds, tmp, w, h, mask
jl
.w2
RET
.w4:
%if ARCH_X86_32
mova
m3
,
[
base
+
bl
end_shuf
]
%else
mova
m3
,
[
bl
end_shuf
]
%endif
.w4_loop:
movd
m0
,
[
ds
tq
+
ds
q
*
0
]
movd
m2
,
[
ds
tq
+
ds
q
*
1
]
...
...
@@ -716,6 +722,9 @@ cglobal blend_h, 4, 7, 6, dst, ds, tmp, w, h, mask
RET
; w16/w32/w64/w128
.w16:
%if ARCH_X86_32
mov
r6d
,
wm
%endif
sub
ds
q
,
r6
.w16_loop0:
movd
m3
,
[
maskq
+
hq
*
2
]
...
...
tests/checkasm/x86/checkasm.asm
View file @
c2292efc
...
...
@@ -200,7 +200,7 @@ cglobal checked_call, 1,7
jz
.ok
mov
r3
,
eax
mov
r4
,
edx
lea
r0
,
[
error_message
]
LEA
r0
,
error_message
mov
[
esp
],
r0
call
fail_func
mov
edx
,
r4
...
...
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment