VideoLAN / dav1d

Commit 4c21c931 (parent 37093f98), authored Feb 11, 2019 by Henrik Gramner
Pipeline #4531 passed with stages in 7 minutes and 9 seconds
2 changed files: src/x86/mc.asm, src/x86/mc_init_tmpl.c

x86: Add w_mask_444 AVX2 asm
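For context: dav1d's w_mask functions blend two intermediate predictions, tmp1 and tmp2, with a per-pixel weight derived from their difference, and also emit that weight as a mask for use on the chroma planes. The 4:2:0 and 4:2:2 variants store the mask subsampled; the 4:4:4 variant added by this commit stores it at full resolution. The following C sketch approximates the 8bpc behavior the new asm implements (modeled on dav1d's reference w_mask_c; rounding constants assume 8bpc, so treat the details as illustrative rather than exact):

#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>

/* Approximate 8bpc reference for w_mask 4:4:4: blend two int16_t
 * intermediate predictions and write the per-pixel blend mask. */
static void w_mask_444_ref(uint8_t *dst, const ptrdiff_t dst_stride,
                           const int16_t *tmp1, const int16_t *tmp2,
                           const int w, const int h, uint8_t *mask)
{
    for (int y = 0; y < h; y++) {
        for (int x = 0; x < w; x++) {
            /* weight grows from 38 to 64 with the prediction difference */
            int m = 38 + ((abs(tmp1[x] - tmp2[x]) + 8) >> 8);
            if (m > 64) m = 64;
            const int px = (tmp1[x] * m + tmp2[x] * (64 - m) + 512) >> 10;
            dst[x] = px < 0 ? 0 : px > 255 ? 255 : px; /* clip to pixel */
            mask[x] = m; /* 4:4:4: full-resolution mask, no subsampling */
        }
        tmp1 += w; tmp2 += w; mask += w;
        dst += dst_stride;
    }
}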
src/x86/mc.asm
...
@@ -92,6 +92,7 @@ BIDIR_JMP_TABLE w_avg_avx2, 4, 8, 16, 32, 64, 128
 BIDIR_JMP_TABLE mask_avx2,       4, 8, 16, 32, 64, 128
 BIDIR_JMP_TABLE w_mask_420_avx2, 4, 8, 16, 32, 64, 128
 BIDIR_JMP_TABLE w_mask_422_avx2, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE w_mask_444_avx2, 4, 8, 16, 32, 64, 128
 BIDIR_JMP_TABLE blend_avx2,      4, 8, 16, 32
 BIDIR_JMP_TABLE blend_v_avx2, 2, 4, 8, 16, 32
 BIDIR_JMP_TABLE blend_h_avx2, 2, 4, 8, 16, 32, 32, 32
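Each BIDIR_JMP_TABLE line above declares per-width entry points for one function; the function prologue converts the block width into a table index with tzcnt and jumps through the table (see the movsxd/add/jmp sequence in w_mask_444 below). A minimal C analogue of this dispatch pattern, with hypothetical names (the asm stores base-relative offsets rather than raw pointers):

#include <stdio.h>

/* tzcnt maps the power-of-two width onto an index: 4 -> 2, 8 -> 3, ...
 * The asm biases the table base so the tzcnt result indexes it
 * directly; this sketch subtracts 2 instead. */
static void w4(void) { puts("4-wide path"); }
static void w8(void) { puts("8-wide path"); }
static void (*const jmp_table[])(void) = { w4, w8 /* ..., w128 */ };

static void dispatch(const int w) {
    jmp_table[__builtin_ctz(w) - 2](); /* __builtin_ctz: GCC/Clang */
}

int main(void) { dispatch(8); return 0; } /* prints "8-wide path" */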
...
@@ -3055,7 +3056,7 @@ cglobal mask, 4, 8, 6, dst, stride, tmp1, tmp2, w, h, mask, stride3
     add                  wq, r7
     BIDIR_FN           MASK
-%macro W_MASK 2 ; src_offset, mask_out
+%macro W_MASK 3 ; src_offset, mask_out, 4:4:4
     mova                 m0, [tmp1q+(%1+0)*mmsize]
     mova                 m1, [tmp2q+(%1+0)*mmsize]
     psubw                m1, m0
...
@@ -3071,7 +3072,13 @@ cglobal mask, 4, 8, 6, dst, stride, tmp1, tmp2, w, h, mask, stride3
     pabsw                m3, m2
     psubusw              m3, m6, m3
     psrlw                m3, 8
+%if %3
+    packuswb            m%2, m3
+    psubb               m%2, m5, m%2
+    vpermq              m%2, m%2, q3120
+%else
     phaddw              m%2, m3
+%endif
     psllw                m3, 10
     pmulhw               m2, m3
     paddw                m1, m2
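The psubusw/psrlw pair above folds the mask computation's clamp into one saturating subtract: pw_6903 = ((64 - 38) << 8) + 255 - 8 = 6903, and max(6903 - |diff|, 0) >> 8 equals 64 - min(38 + ((|diff| + 8) >> 8), 64) for every non-negative 16-bit difference. A throwaway self-check of that identity (illustrative, not part of the commit):

#include <assert.h>

int main(void) {
    for (int d = 0; d < (1 << 16); d++) {
        int m = 38 + ((d + 8) >> 8); /* reference mask weight */
        if (m > 64) m = 64;
        int sat = 6903 - d; /* psubusw: subtract with unsigned saturation */
        if (sat < 0) sat = 0;
        assert((sat >> 8) == 64 - m); /* psrlw m3, 8 yields 64 - m */
    }
    return 0;
}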
...
@@ -3463,6 +3470,136 @@ cglobal w_mask_422, 4, 8, 11, dst, stride, tmp1, tmp2, w, h, mask, stride3
     jg .w128_loop
     RET
+
+cglobal w_mask_444, 4, 8, 8, dst, stride, tmp1, tmp2, w, h, mask, stride3
+%define base r7-w_mask_444_avx2_table
+    lea                  r7, [w_mask_444_avx2_table]
+    tzcnt                wd, wm
+    movifnidn            hd, hm
+    mov               maskq, maskmp
+    movsxd               wq, dword [r7+wq*4]
+    vpbroadcastd         m6, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8
+    vpbroadcastd         m7, [base+pw_2048]
+    vpbroadcastd         m5, [base+pb_64]
+    add                  wq, r7
+    W_MASK                0, 4, 1
+    lea            stride3q, [strideq*3]
+    jmp                  wq
+.w4:
+    vextracti128        xm1, m0, 1
+    movd   [dstq+strideq*0], xm0
+    pextrd [dstq+strideq*1], xm0, 1
+    movd   [dstq+strideq*2], xm1
+    pextrd [dstq+stride3q ], xm1, 1
+    mova       [maskq+32*0], m4
+    cmp                  hd, 8
+    jl .w4_end
+    lea                dstq, [dstq+strideq*4]
+    pextrd [dstq+strideq*0], xm0, 2
+    pextrd [dstq+strideq*1], xm0, 3
+    pextrd [dstq+strideq*2], xm1, 2
+    pextrd [dstq+stride3q ], xm1, 3
+    je .w4_end
+    W_MASK                2, 4, 1
+    lea                dstq, [dstq+strideq*4]
+    vextracti128        xm1, m0, 1
+    movd   [dstq+strideq*0], xm0
+    pextrd [dstq+strideq*1], xm0, 1
+    movd   [dstq+strideq*2], xm1
+    pextrd [dstq+stride3q ], xm1, 1
+    lea                dstq, [dstq+strideq*4]
+    pextrd [dstq+strideq*0], xm0, 2
+    pextrd [dstq+strideq*1], xm0, 3
+    pextrd [dstq+strideq*2], xm1, 2
+    pextrd [dstq+stride3q ], xm1, 3
+    mova       [maskq+32*1], m4
+.w4_end:
+    RET
+.w8_loop:
+    add               tmp1q, 32*2
+    add               tmp2q, 32*2
+    W_MASK                0, 4, 1
+    lea                dstq, [dstq+strideq*4]
+    add               maskq, 32
+.w8:
+    vextracti128        xm1, m0, 1
+    movq   [dstq+strideq*0], xm0
+    movq   [dstq+strideq*1], xm1
+    movhps [dstq+strideq*2], xm0
+    movhps [dstq+stride3q ], xm1
+    mova            [maskq], m4
+    sub                  hd, 4
+    jg .w8_loop
+    RET
+.w16_loop:
+    add               tmp1q, 32*2
+    add               tmp2q, 32*2
+    W_MASK                0, 4, 1
+    lea                dstq, [dstq+strideq*2]
+    add               maskq, 32
+.w16:
+    vpermq               m0, m0, q3120
+    mova   [dstq+strideq*0], xm0
+    vextracti128 [dstq+strideq*1], m0, 1
+    mova            [maskq], m4
+    sub                  hd, 2
+    jg .w16_loop
+    RET
+.w32_loop:
+    add               tmp1q, 32*2
+    add               tmp2q, 32*2
+    W_MASK                0, 4, 1
+    add                dstq, strideq
+    add               maskq, 32
+.w32:
+    vpermq               m0, m0, q3120
+    mova             [dstq], m0
+    mova            [maskq], m4
+    dec                  hd
+    jg .w32_loop
+    RET
+.w64_loop:
+    add               tmp1q, 32*4
+    add               tmp2q, 32*4
+    W_MASK                0, 4, 1
+    add                dstq, strideq
+    add               maskq, 32*2
+.w64:
+    vpermq               m0, m0, q3120
+    mova        [dstq+32*0], m0
+    mova       [maskq+32*0], m4
+    W_MASK                2, 4, 1
+    vpermq               m0, m0, q3120
+    mova        [dstq+32*1], m0
+    mova       [maskq+32*1], m4
+    dec                  hd
+    jg .w64_loop
+    RET
+.w128_loop:
+    add               tmp1q, 32*8
+    add               tmp2q, 32*8
+    W_MASK                0, 4, 1
+    add                dstq, strideq
+    add               maskq, 32*4
+.w128:
+    vpermq               m0, m0, q3120
+    mova        [dstq+32*0], m0
+    mova       [maskq+32*0], m4
+    W_MASK                2, 4, 1
+    vpermq               m0, m0, q3120
+    mova        [dstq+32*1], m0
+    mova       [maskq+32*1], m4
+    W_MASK                4, 4, 1
+    vpermq               m0, m0, q3120
+    mova        [dstq+32*2], m0
+    mova       [maskq+32*2], m4
+    W_MASK                6, 4, 1
+    vpermq               m0, m0, q3120
+    mova        [dstq+32*3], m0
+    mova       [maskq+32*3], m4
+    dec                  hd
+    jg .w128_loop
+    RET
 
 cglobal blend, 3, 7, 7, dst, ds, tmp, w, h, mask
 %define base r6-blend_avx2_table
     lea                  r6, [blend_avx2_table]
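The new entry point follows the w_mask calling convention declared via decl_w_mask_fn in the C changes below: dst/dst_stride describe the output plane, tmp1/tmp2 are w*h int16_t intermediates from the prep path, and mask receives w*h bytes at full resolution for 4:4:4 (the sign argument only affects the subsampled variants' rounding). A hedged caller sketch, with the prototype written out under that assumption:

#include <stddef.h>
#include <stdint.h>

/* Prototype per dav1d's decl_w_mask_fn convention (8bpc); treat the
 * exact parameter types as an assumption of this sketch. */
void dav1d_w_mask_444_avx2(uint8_t *dst, ptrdiff_t dst_stride,
                           const int16_t *tmp1, const int16_t *tmp2,
                           int w, int h, uint8_t *mask, int sign);

void blend_32x32(uint8_t *dst, const ptrdiff_t stride,
                 const int16_t tmp1[32 * 32], const int16_t tmp2[32 * 32])
{
    uint8_t mask[32 * 32]; /* 4:4:4 keeps one mask byte per pixel */
    dav1d_w_mask_444_avx2(dst, stride, tmp1, tmp2, 32, 32, mask, 0);
}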
...
src/x86/mc_init_tmpl.c
...
@@ -60,6 +60,7 @@ decl_mask_fn(dav1d_mask_ssse3);
 decl_w_mask_fn(dav1d_w_mask_420_avx2);
 decl_w_mask_fn(dav1d_w_mask_420_ssse3);
 decl_w_mask_fn(dav1d_w_mask_422_avx2);
+decl_w_mask_fn(dav1d_w_mask_444_avx2);
 decl_blend_fn(dav1d_blend_avx2);
 decl_blend_fn(dav1d_blend_ssse3);
 decl_blend_dir_fn(dav1d_blend_v_avx2);
...
@@ -126,6 +127,7 @@ void bitfn(dav1d_mc_dsp_init_x86)(Dav1dMCDSPContext *const c) {
     c->avg = dav1d_avg_avx2;
     c->w_avg = dav1d_w_avg_avx2;
     c->mask = dav1d_mask_avx2;
+    c->w_mask[0] = dav1d_w_mask_444_avx2;
     c->w_mask[1] = dav1d_w_mask_422_avx2;
     c->w_mask[2] = dav1d_w_mask_420_avx2;
     c->blend = dav1d_blend_avx2;
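The w_mask[] slots are ordered 4:4:4, 4:2:2, 4:2:0, i.e. indexed by the number of subsampled chroma dimensions. A hedged sketch of how a caller could select the entry point (helper name hypothetical; types as declared in src/mc.h):

#include "src/mc.h" /* dav1d-internal header: Dav1dMCDSPContext, w_mask_fn */

/* ss_hor/ss_ver are the chroma subsampling flags (0 or 1 each):
 * 4:4:4 -> 0+0 = 0, 4:2:2 -> 1+0 = 1, 4:2:0 -> 1+1 = 2 */
static inline w_mask_fn select_w_mask(const Dav1dMCDSPContext *const c,
                                      const int ss_hor, const int ss_ver)
{
    return c->w_mask[ss_hor + ss_ver];
}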
...