VideoLAN / dav1d / Commits

Commit 007fd651 authored Feb 11, 2019 by Henrik Gramner

x86: Optimize MC w_mask

parent 3dda2dd6

2 changed files
src/x86/mc.asm
@@ -62,13 +62,12 @@ deint_shuf4: db 0, 4, 1, 5, 2, 6, 3, 7, 4, 8, 5, 9, 6, 10, 7, 11
 blend_shuf:    db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3
 pb_64:   times 4 db 64
 pw_8:    times 2 dw 8
-pw_26:   times 2 dw 26
 pw_34:   times 2 dw 34
 pw_258:  times 2 dw 258
 pw_512:  times 2 dw 512
 pw_1024: times 2 dw 1024
 pw_2048: times 2 dw 2048
+pw_6903: times 2 dw 6903
 pw_8192: times 2 dw 8192
 pd_32:   dd 32
 pd_512:  dd 512
@@ -3060,9 +3059,8 @@ cglobal mask, 4, 8, 6, dst, stride, tmp1, tmp2, w, h, mask, stride3
     mova                 m1, [tmp2q+(%1+0)*mmsize]
     psubw                m1, m0
     pabsw               m%2, m1
-    paddw               m%2, m6
-    psrlw               m%2, 8 ; (abs(tmp1 - tmp2) + 8) >> 8
-    psubusw             m%2, m7, m%2 ; 64 - min(m, 64)
+    psubusw             m%2, m6, m%2
+    psrlw               m%2, 8 ; 64 - m
     psllw                m2, m%2, 10
     pmulhw               m1, m2
     paddw                m0, m1
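What changes in this half of the W_MASK_420 macro is the computation of the blend-weight complement 64 - m, where the 4:2:0 weight is m = 38 + min((|tmp1 - tmp2| + 8) >> 8, 26). The old sequence materialized (|d| + 8) >> 8 with an add and a shift, then clamped with a saturating subtract from pw_26; the new one folds the rounding add and the clamp into a single psubusw against pw_6903 = ((64 - 38) << 8) + 255 - 8, leaving just one shift. A minimal C model of the identity (illustrative only, not code from the repository; the helper names are invented):

#include <assert.h>
#include <stdlib.h>

/* Old flow: pabsw; paddw pw_8; psrlw 8; saturating psubusw from pw_26. */
static int old_64_minus_m(int diff)
{
    int m = (abs(diff) + 8) >> 8;    /* quantized |tmp1 - tmp2|      */
    return m > 26 ? 0 : 26 - m;      /* 26 - min(m, 26)              */
}

/* New flow: pabsw; psubusw from pw_6903; psrlw 8.
 * 6903 == ((64 - 38) << 8) + 255 - 8: the +8 rounding and the clamp
 * at 26 are absorbed into one unsigned saturating subtract. */
static int new_64_minus_m(int diff)
{
    int d = abs(diff);
    int s = d > 6903 ? 0 : 6903 - d; /* psubusw                      */
    return s >> 8;                   /* psrlw                        */
}

int main(void)
{
    for (int d = -32767; d <= 32767; d++) /* whole pabsw output range */
        assert(old_64_minus_m(d) == new_64_minus_m(d));
    return 0;
}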
@@ -3070,32 +3068,32 @@ cglobal mask, 4, 8, 6, dst, stride, tmp1, tmp2, w, h, mask, stride3
     mova                 m2, [tmp2q+(%1+1)*mmsize]
     psubw                m2, m1
     pabsw                m3, m2
-    paddw                m3, m6
+    psubusw              m3, m6, m3
     psrlw                m3, 8
-    psubusw              m3, m7, m3
     phaddw              m%2, m3
     psllw                m3, 10
     pmulhw               m2, m3
     paddw                m1, m2
-    pmulhrsw             m0, m8
-    pmulhrsw             m1, m8
+    pmulhrsw             m0, m7
+    pmulhrsw             m1, m7
     packuswb             m0, m1
 %endmacro

-cglobal w_mask_420, 4, 8, 15, dst, stride, tmp1, tmp2, w, h, mask, stride3
+cglobal w_mask_420, 4, 8, 14, dst, stride, tmp1, tmp2, w, h, mask, stride3
+%define base r7-w_mask_420_avx2_table
     lea                  r7, [w_mask_420_avx2_table]
     tzcnt                wd, wm
     movifnidn            hd, hm
     mov               maskq, maskmp
-    vpbroadcastw         m0, r7m ; sign
+    movd                xm0, r7m ; sign
     movsxd               wq, dword [r7+wq*4]
-    vpbroadcastd         m6, [pw_8+r7-w_mask_420_avx2_table]
-    vpbroadcastd         m7, [pw_26+r7-w_mask_420_avx2_table] ; 64 - 38
-    vpbroadcastd         m8, [pw_2048+r7-w_mask_420_avx2_table]
-    vpbroadcastd         m9, [pw_258+r7-w_mask_420_avx2_table] ; 64 * 4 + 2
-    pmovzxbd            m10, [deint_shuf4+r7-w_mask_420_avx2_table]
-    psubw                m9, m0
+    vpbroadcastd         m6, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8
+    vpbroadcastd         m7, [base+pw_2048]
+    movd                xm8, [base+pw_258]  ; 64 * 4 + 2
+    pmovzxbd             m9, [base+deint_shuf4]
+    psubw               xm8, xm0
     add                  wq, r7
+    vpbroadcastw         m8, xm8
     W_MASK_420            0, 4
     lea            stride3q, [strideq*3]
     jmp                  wq
@@ -3105,21 +3103,20 @@ cglobal w_mask_420, 4, 8, 15, dst, stride, tmp1, tmp2, w, h, mask, stride3
     pextrd [dstq+strideq*1], xm0, 1
     movd   [dstq+strideq*2], xm1
     pextrd [dstq+stride3q ], xm1, 1
-    cmp                  hd, 4
-    je .w4_end
+    cmp                  hd, 8
+    jl .w4_end
     lea                dstq, [dstq+strideq*4]
     pextrd           [dstq], xm0, 2
     pextrd [dstq+strideq*1], xm0, 3
     pextrd [dstq+strideq*2], xm1, 2
     pextrd [dstq+stride3q ], xm1, 3
-    cmp                  hd, 8
     jg .w4_h16
 .w4_end:
     vextracti128        xm0, m4, 1
     vpblendd            xm1, xm4, xm0, 0x05
     vpblendd            xm4, xm4, xm0, 0x0a
     pshufd              xm1, xm1, q2301
-    psubw               xm4, xm9, xm4
+    psubw               xm4, xm8, xm4
     psubw               xm4, xm1
     psrlw               xm4, 2
     packuswb            xm4, xm4
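In the per-width tails like .w4_end above, m8 (previously m9) holds 64 * 4 + 2 - sign, i.e. pw_258 minus the broadcast sign argument. Subtracting the accumulated 64 - m complements from it recovers the spec's 2x2-averaged mask value, because the four complements of a cell sum to 256 minus the sum of its weights. A scalar sketch of that bookkeeping (illustrative C rather than dav1d code; mask_420_cell is a made-up name):

#include <assert.h>

/* One 2x2 cell of the 4:2:0 mask, accumulated the way the asm does it:
 * only the complements c = 64 - m are ever on hand. */
static int mask_420_cell(const int m[4], int sign)
{
    int csum = 0;
    for (int i = 0; i < 4; i++)
        csum += 64 - m[i];           /* phaddw/psubw accumulation    */
    /* 258 == 64 * 4 + 2, and csum == 256 - (m0+m1+m2+m3). */
    return (258 - sign - csum) >> 2;
}

int main(void)
{
    const int m[4] = { 38, 64, 40, 52 };
    /* Same thing as the spec's ((m0+m1+m2+m3) + 2 - sign) >> 2. */
    assert(mask_420_cell(m, 0) == (38 + 64 + 40 + 52 + 2) >> 2);
    assert(mask_420_cell(m, 1) == (38 + 64 + 40 + 52 + 1) >> 2);
    return 0;
}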
@@ -3130,9 +3127,9 @@ cglobal w_mask_420, 4, 8, 15, dst, stride, tmp1, tmp2, w, h, mask, stride3
     lea                dstq, [dstq+strideq*4]
     phaddd               m4, m5
     vextracti128        xm1, m0, 1
-    psubw                m4, m9, m4
+    psubw                m4, m8, m4
     psrlw                m4, 2
-    vpermd               m4, m10, m4
+    vpermd               m4, m9, m4
     vextracti128        xm5, m4, 1
     packuswb            xm4, xm5
     movd             [dstq], xm0
@@ -3155,7 +3152,7 @@ cglobal w_mask_420, 4, 8, 15, dst, stride, tmp1, tmp2, w, h, mask, stride3
 .w8:
     vextracti128        xm2, m4, 1
     vextracti128        xm1, m0, 1
-    psubw               xm4, xm9, xm4
+    psubw               xm4, xm8, xm4
     psubw               xm4, xm2
     psrlw               xm4, 2
     packuswb            xm4, xm4
@@ -3180,12 +3177,12 @@ cglobal w_mask_420, 4, 8, 15, dst, stride, tmp1, tmp2, w, h, mask, stride3
     W_MASK_420            2, 5
     punpckhqdq           m1, m4, m5
     punpcklqdq           m4, m5
-    psubw                m1, m9, m1
+    psubw                m1, m8, m1
     psubw                m1, m4
     psrlw                m1, 2
     vpermq               m0, m0, q3120
     packuswb             m1, m1
-    vpermd               m1, m10, m1
+    vpermd               m1, m9, m1
     mova [dstq+strideq*2], xm0
     vextracti128 [dstq+stride3q], m0, 1
     mova            [maskq], xm1
@@ -3202,20 +3199,20 @@ cglobal w_mask_420, 4, 8, 15, dst, stride, tmp1, tmp2, w, h, mask, stride3
     vpermq               m0, m0, q3120
     mova             [dstq], m0
     W_MASK_420            2, 5
-    psubw                m4, m9, m4
+    psubw                m4, m8, m4
     psubw                m4, m5
     psrlw                m4, 2
     vpermq               m0, m0, q3120
     packuswb             m4, m4
-    vpermd               m4, m10, m4
+    vpermd               m4, m9, m4
     mova [dstq+strideq*1], m0
     mova            [maskq], xm4
     sub                  hd, 2
     jg .w32_loop
     RET
 .w64_loop_even:
-    psubw               m11, m9, m4
-    psubw               m12, m9, m5
+    psubw               m10, m8, m4
+    psubw               m11, m8, m5
     dec                  hd
 .w64_loop:
     add               tmp1q, 4*32
@@ -3230,20 +3227,20 @@ cglobal w_mask_420, 4, 8, 15, dst, stride, tmp1, tmp2, w, h, mask, stride3
     mova          [dstq+32], m0
     test                 hd, 1
     jz .w64_loop_even
-    psubw                m4, m11, m4
-    psubw                m5, m12, m5
+    psubw                m4, m10, m4
+    psubw                m5, m11, m5
     psrlw                m4, 2
     psrlw                m5, 2
     packuswb             m4, m5
-    vpermd               m4, m10, m4
+    vpermd               m4, m9, m4
     mova            [maskq], m4
     add               maskq, 32
     dec                  hd
     jg .w64_loop
     RET
 .w128_loop_even:
-    psubw               m13, m9, m4
-    psubw               m14, m9, m5
+    psubw               m12, m8, m4
+    psubw               m13, m8, m5
     dec                  hd
 .w128_loop:
     W_MASK_420            0, 4
@@ -3258,17 +3255,17 @@ cglobal w_mask_420, 4, 8, 15, dst, stride, tmp1, tmp2, w, h, mask, stride3
     add               tmp2q, 8*32
     test                 hd, 1
     jz .w128_even
-    psubw                m4, m11, m4
-    psubw                m5, m12, m5
+    psubw                m4, m10, m4
+    psubw                m5, m11, m5
     psrlw                m4, 2
     psrlw                m5, 2
     packuswb             m4, m5
-    vpermd               m4, m10, m4
+    vpermd               m4, m9, m4
     mova            [maskq], m4
     jmp .w128_odd
 .w128_even:
-    psubw               m11, m9, m4
-    psubw               m12, m9, m5
+    psubw               m10, m8, m4
+    psubw               m11, m8, m5
 .w128_odd:
     W_MASK_420           -4, 4
     vpermq               m0, m0, q3120
@@ -3278,12 +3275,12 @@ cglobal w_mask_420, 4, 8, 15, dst, stride, tmp1, tmp2, w, h, mask, stride3
     mova        [dstq+3*32], m0
     test                 hd, 1
     jz .w128_loop_even
-    psubw                m4, m13, m4
-    psubw                m5, m14, m5
+    psubw                m4, m12, m4
+    psubw                m5, m13, m5
     psrlw                m4, 2
     psrlw                m5, 2
     packuswb             m4, m5
-    vpermd               m4, m10, m4
+    vpermd               m4, m9, m4
     mova         [maskq+32], m4
     add               maskq, 64
     dec                  hd
src/x86/mc_ssse3.asm
@@ -51,12 +51,12 @@ bilin_h_shuf8: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7
 blend_shuf:    db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3
 pb_64:   times 16 db 64
 pw_8:    times 8 dw 8
-pw_26:   times 8 dw 26
-pw_258:  times 8 dw 258
 pw_512:  times 8 dw 512
 pw_1024: times 8 dw 1024
 pw_2048: times 8 dw 2048
+pw_6903: times 8 dw 6903
+pw_258:  times 2 dw 258

 %macro BIDIR_JMP_TABLE 1-* ; evaluated at definition time (in loop below)
@@ -918,41 +918,34 @@ cglobal mask, 4, 7, 7, dst, stride, tmp1, tmp2, w, mask, stride3
     BIDIR_FN MASK
 %undef hd

-%if ARCH_X86_64
-%define reg_pw_8    m8
-%define reg_pw_27   m9
-%define reg_pw_2048 m10
-%else
-%define reg_pw_8    [base+pw_8]
-%define reg_pw_27   [base+pw_26] ; 64 - 38
-%define reg_pw_2048 [base+pw_2048]
-%endif
 %macro W_MASK_420_B 2 ; src_offset in bytes, mask_out
     ;**** do m0 = u16.dst[7..0], m%2 = u16.m[7..0] ****
     mova                 m0, [tmp1q+(%1)]
     mova                 m1, [tmp2q+(%1)]
-    psubw                m1, m0 ; tmp1 - tmp2
-    pabsw                m3, m1 ; abs(tmp1 - tmp2)
-    paddw                m3, reg_pw_8 ; abs(tmp1 - tmp2) + 8
-    psrlw                m3, 8 ; (abs(tmp1 - tmp2) + 8) >> 8
-    psubusw             m%2, reg_pw_27, m3 ; 64 - min(m, 64)
-    psllw                m2, m%2, 10
+    mova                 m2, reg_pw_6903
+    psubw                m1, m0
+    pabsw               m%2, m1 ; abs(tmp1 - tmp2)
+    mova                 m3, m2
+    psubusw              m2, m%2
+    psrlw                m2, 8 ; 64 - m
+    mova                m%2, m2
+    psllw                m2, 10
     pmulhw               m1, m2 ; tmp2 * ()
     paddw                m0, m1 ; tmp1 + ()
     ;**** do m1 = u16.dst[7..0], m%2 = u16.m[7..0] ****
     mova                 m1, [tmp1q+(%1)+mmsize]
     mova                 m2, [tmp2q+(%1)+mmsize]
-    psubw                m2, m1 ; tmp1 - tmp2
+    psubw                m2, m1
     pabsw                m7, m2 ; abs(tmp1 - tmp2)
-    paddw                m7, reg_pw_8 ; abs(tmp1 - tmp2) + 8
-    psrlw                m7, 8 ; (abs(tmp1 - tmp2) + 8) >> 8
-    psubusw              m3, reg_pw_27, m7 ; 64 - min(m, 64)
+    psubusw              m3, m7
+    psrlw                m3, 8 ; 64 - m
     phaddw              m%2, m3 ; pack both u16.m[8..0] runs as u8.m[15..0]
     psllw                m3, 10
     pmulhw               m2, m3
+%if ARCH_X86_32
+    mova        reg_pw_2048, [base+pw_2048]
+%endif
     paddw                m1, m2
     ;********
     pmulhrsw             m0, reg_pw_2048 ; round/scale 2048
     pmulhrsw             m1, reg_pw_2048 ; round/scale 2048
     packuswb             m0, m1 ; concat m0 = u8.dst[15..0]
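The surviving core of the macro is the blend itself: with the complement c = 64 - m in a register (c is at most 26, so c << 10 still fits in a signed word), psllw positions c such that pmulhw's implicit arithmetic >> 16 yields (tmp2 - tmp1) * c / 64, paddw adds tmp1 back, and the final pmulhrsw against pw_2048 rounds the scaled intermediate (roughly pixel << 4 at 8 bpc) down to a pixel. A scalar C model of that instruction sequence (an illustration under those assumptions, not dav1d's code):

#include <assert.h>
#include <stdint.h>

/* Scalar model of the blend core; tmp1/tmp2 are the scaled
 * intermediate predictions, c = 64 - m with c <= 26. */
static int blend_w_mask(int tmp1, int tmp2, int c)
{
    int16_t diff = (int16_t)(tmp2 - tmp1);
    /* pmulhw: high word of diff * (c << 10), i.e. diff * c / 64 */
    int16_t mul  = (int16_t)(((int32_t)diff * (c << 10)) >> 16);
    int16_t sum  = (int16_t)(tmp1 + mul);                /* paddw */
    /* pmulhrsw with 2048: (x * 2048 + (1 << 14)) >> 15 */
    return ((int32_t)sum * 2048 + (1 << 14)) >> 15;
}

int main(void)
{
    /* m = 38 keeps 38/64 of tmp1 and 26/64 of tmp2:
     * (38*16 + 26*100) / 64 rounds down to 50. */
    assert(blend_w_mask(16 << 4, 100 << 4, 26) == 50);
    /* c == 0 (m == 64) keeps tmp1. */
    assert(blend_w_mask(16 << 4, 100 << 4, 0) == 16);
    return 0;
}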
@@ -964,38 +957,41 @@ cglobal mask, 4, 7, 7, dst, stride, tmp1, tmp2, w, mask, stride3
 %define base r6-w_mask_420_ssse3_table
 %if ARCH_X86_64
+%define reg_pw_6903 m8
+%define reg_pw_2048 m9
 ; args: dst, stride, tmp1, tmp2, w, h, mask, sign
-cglobal w_mask_420, 4, 8, 11, dst, stride, tmp1, tmp2, w, h, mask
+cglobal w_mask_420, 4, 8, 10, dst, stride, tmp1, tmp2, w, h, mask
     lea                  r6, [w_mask_420_ssse3_table]
     mov                  wd, wm
     tzcnt               r7d, wd
+    movd                 m0, r7m ; sign
     movifnidn            hd, hm
-    movd                 m0, r7m
-    pshuflw              m0, m0, q0000 ; sign
-    punpcklqdq           m0, m0
     movsxd               r7, [r6+r7*4]
-    mova           reg_pw_8, [base+pw_8]
-    mova          reg_pw_27, [base+pw_26] ; 64 - 38
+    mova        reg_pw_6903, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8
     mova        reg_pw_2048, [base+pw_2048]
-    mova                 m6, [base+pw_258] ; 64 * 4 + 2
+    movd                 m6, [base+pw_258] ; 64 * 4 + 2
     add                  r7, r6
     mov               maskq, maskmp
     psubw                m6, m0
+    pshuflw              m6, m6, q0000
+    punpcklqdq           m6, m6
     W_MASK_420            0, 4
     jmp                  r7
 %define loop_w r7d
 %else
+%define reg_pw_6903 [base+pw_6903]
+%define reg_pw_2048 m3
 cglobal w_mask_420, 4, 7, 8, dst, stride, tmp1, tmp2, w, mask
     tzcnt                wd, wm
     LEA                  r6, w_mask_420_ssse3_table
-    mov                  wd, [r6+wq*4]
+    movd                 m0, r7m ; sign
     mov               maskq, r6mp
-    movd                 m0, r7m
-    pshuflw              m0, m0, q0000 ; sign
-    punpcklqdq           m0, m0
-    mova                 m6, [base+pw_258] ; 64 * 4 + 2
+    mov                  wd, [r6+wq*4]
+    movd                 m6, [base+pw_258]
     add                  wq, r6
     psubw                m6, m0
+    pshuflw              m6, m6, q0000
+    punpcklqdq           m6, m6
     W_MASK_420            0, 4
     jmp                  wd
 %define loop_w dword r0m
@@ -1016,12 +1012,12 @@ cglobal w_mask_420, 4, 7, 8, dst, stride, tmp1, tmp2, w, mask
     movd   [dstq+strideq*0], m0 ; copy m0[2]
     psrlq                m0, 32
     movd   [dstq+strideq*1], m0 ; copy m0[3]
-    pshufd               m5, m4, q3131 ; DBDB even lines repeated
-    pshufd               m4, m4, q2020 ; CACA odd lines repeated
-    psubw                m1, m6, m4 ; m9 == 64 * 4 + 2
-    psubw                m1, m5 ; C-D A-B C-D A-B
-    psrlw                m1, 2 ; >> 2
+    psubw                m1, m6, m4 ; a _ c _
+    psrlq                m4, 32     ; b _ d _
+    psubw                m1, m4
+    psrlw                m1, 2
     packuswb             m1, m1
+    pshuflw              m1, m1, q2020
     movd            [maskq], m1
     sub                  hd, 4
     jg .w4_loop
@@ -1035,9 +1031,9 @@ cglobal w_mask_420, 4, 7, 8, dst, stride, tmp1, tmp2, w, mask
 .w8:
     movq             [dstq], m0
     movhps [dstq+strideq*1], m0
-    pshufd               m1, m4, q3232
     psubw                m0, m6, m4
-    psubw                m0, m1
+    punpckhqdq           m4, m4
+    psubw                m0, m4
     psrlw                m0, 2
     packuswb             m0, m0
     movd            [maskq], m0
@@ -1077,8 +1073,7 @@ cglobal w_mask_420, 4, 7, 8, dst, stride, tmp1, tmp2, w, mask
     jg .w16ge_loop
     RET
-%undef reg_pw_8
-%undef reg_pw_27
+%undef reg_pw_6903
 %undef reg_pw_2048
 %undef dst_bak
 %undef loop_w