François Cartegnie / dav1d / Commits / 793c5048

Commit 793c5048 authored Oct 16, 2018 by David Michael Barr
x86: Add chroma-from-luma intra prediction AVX2 asm
Helped-by: Henrik Gramner <gramner@twoorioles.com>

parent acfa495a
Showing 2 changed files with 295 additions and 0 deletions:

src/x86/ipred.asm    +285 −0
src/x86/ipred_init.c  +10 −0
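For orientation: chroma-from-luma (CfL) prediction forms each chroma pixel as a DC predictor plus a scaled, zero-mean luma contribution. A minimal scalar sketch of what these kernels vectorize, assuming ac[] already holds the zero-mean luma contribution in the Q3 fixed-point form the AV1 spec uses, and alpha is the signed scale signalled in the bitstream (names here are illustrative, not dav1d's C API):

    #include <stdint.h>
    #include <stddef.h>

    static void cfl_pred_sketch(uint8_t *dst, ptrdiff_t stride,
                                const int16_t *ac, int alpha,
                                int dc, int w, int h)
    {
        for (int y = 0; y < h; y++) {
            for (int x = 0; x < w; x++) {
                const int diff = alpha * ac[x];
                /* round |alpha * ac| / 64, then restore the sign */
                const int adj = diff < 0 ? -((32 - diff) >> 6)
                                         :  ((32 + diff) >> 6);
                const int px  = dc + adj;
                dst[x] = px < 0 ? 0 : px > 255 ? 255 : px; /* 8-bit clip */
            }
            ac  += w;
            dst += stride;
        }
    }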
src/x86/ipred.asm
@@ -80,6 +80,7 @@ pb_127_m127: times 2 db 127, -127
 %endmacro
 
 %define ipred_dc_splat_avx2_table (ipred_dc_avx2_table + 10*4)
+%define ipred_cfl_splat_avx2_table (ipred_cfl_avx2_table + 8*4)
 
 JMP_TABLE ipred_smooth,   avx2, w4, w8, w16, w32, w64
 JMP_TABLE ipred_smooth_v, avx2, w4, w8, w16, w32, w64
@@ -89,6 +90,9 @@ JMP_TABLE ipred_dc, avx2, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \
                                 s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4
 JMP_TABLE ipred_dc_left,  avx2, h4, h8, h16, h32, h64
 JMP_TABLE ipred_h,        avx2, w4, w8, w16, w32, w64
+JMP_TABLE ipred_cfl,      avx2, h4, h8, h16, h32, w4, w8, w16, w32, \
+                                s4-8*4, s8-8*4, s16-8*4, s32-8*4
+JMP_TABLE ipred_cfl_left, avx2, h4, h8, h16, h32
 
 SECTION .text
@@ -1230,4 +1234,285 @@ ALIGN function_align
     sub                  r3, hq
     ret
 
+%if WIN64
+DECLARE_REG_TMP 5
+%else
+DECLARE_REG_TMP 7
+%endif
+
+%macro IPRED_CFL 1 ; ac in, unpacked pixels out
+    psignw               m3, m%1, m1
+    pabsw               m%1, m%1
+    pmulhrsw            m%1, m2
+    psignw              m%1, m3
+    paddw               m%1, m0
+%endmacro
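A note on the math: pmulhrsw computes (a*b + 0x4000) >> 15 per 16-bit lane, so with m2 = |alpha| << 9 (set up in the .s* blocks below) the product reduces to (|ac|*|alpha| + 32) >> 6. The pabsw/psignw dance exists because pmulhrsw rounding is not symmetric around zero; the macro rounds the magnitude, then reapplies the sign of ac*alpha. A per-lane C model (a sketch of the macro, not dav1d code):

    /* m0 = dc, m1 = alpha, m2 = |alpha| << 9, all broadcast */
    static inline int ipred_cfl_lane(int ac, int alpha, int dc)
    {
        const int mag = (abs(ac) * abs(alpha) + 32) >> 6; /* pabsw + pmulhrsw */
        /* the two psignw ops net out to the sign of ac * alpha */
        return dc + ((ac < 0) != (alpha < 0) ? -mag : mag); /* paddw m%1, m0 */
    }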
+
+cglobal ipred_cfl_top, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
+    lea                  t0, [ipred_cfl_left_avx2_table]
+    tzcnt                wd, wm
+    inc                 tlq
+    movu                 m0, [tlq]
+    movifnidn            hd, hm
+    mov                 r6d, 0x8000
+    shrx                r6d, r6d, wd
+    movd                xm3, r6d
+    movsxd               r6, [t0+wq*4]
+    pcmpeqd              m2, m2
+    pmaddubsw            m0, m2
+    add                  r6, t0
+    add                  t0, ipred_cfl_splat_avx2_table-ipred_cfl_left_avx2_table
+    movsxd               wq, [t0+wq*4]
+    add                  wq, t0
+    movifnidn           acq, acmp
+    jmp                  r6
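ipred_cfl_top takes the DC term from the row above the block only. pcmpeqd fills m2 with all-ones bytes (−1 signed), so pmaddubsw yields negated pairwise sums of the edge, and xm3 = 0x8000 >> log2(w) later turns pmulhrsw into a rounded division by w in the shared .h* code below. Roughly, in C (illustrative helper; w is a power of two):

    static int cfl_dc_top(const uint8_t *topleft, int w)
    {
        int sum = 0;
        for (int x = 0; x < w; x++)
            sum += topleft[1 + x];     /* the w pixels above the block */
        return (sum + (w >> 1)) / w;   /* rounded average */
    }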
+
+cglobal ipred_cfl_left, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
+    mov                  hd, hm ; zero upper half
+    tzcnt               r6d, hd
+    sub                 tlq, hq
+    tzcnt                wd, wm
+    movu                 m0, [tlq]
+    mov                 t0d, 0x8000
+    shrx                t0d, t0d, r6d
+    movd                xm3, t0d
+    lea                  t0, [ipred_cfl_left_avx2_table]
+    movsxd               r6, [t0+r6*4]
+    pcmpeqd              m2, m2
+    pmaddubsw            m0, m2
+    add                  r6, t0
+    add                  t0, ipred_cfl_splat_avx2_table-ipred_cfl_left_avx2_table
+    movsxd               wq, [t0+wq*4]
+    add                  wq, t0
+    movifnidn           acq, acmp
+    jmp                  r6
+.h32:
+    vextracti128        xm1, m0, 1
+    paddw               xm0, xm1
+.h16:
+    punpckhqdq          xm1, xm0, xm0
+    paddw               xm0, xm1
+.h8:
+    psrlq               xm1, xm0, 32
+    paddw               xm0, xm1
+.h4:
+    pmaddwd             xm0, xm2
+    pmulhrsw            xm0, xm3
+    vpbroadcastw         m0, xm0
+    jmp                  wq
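These labels form a fall-through reduction shared by both edge-only entry points: each step halves the number of live lanes (128-bit extract, quadword swap, 32-bit shift), so an edge is summed in log2 steps, and the jump tables enter the chain at the right depth for the edge length. The final pmulhrsw against 0x8000 >> log2(n) is the rounded average. The same shape in C (a sketch):

    /* Horizontal sum by repeated halving, then rounded average; mirrors
     * .h32 -> .h16 -> .h8 -> .h4. Entering with a smaller `half` models
     * the jump table skipping the early steps for short edges. */
    static int cfl_dc_reduce(const int *lanes, int n)
    {
        int buf[32];
        for (int i = 0; i < n; i++) buf[i] = lanes[i];
        for (int half = n >> 1; half; half >>= 1)   /* log2(n) passes */
            for (int i = 0; i < half; i++)
                buf[i] += buf[i + half];
        return (buf[0] + (n >> 1)) / n;  /* pmulhrsw with 0x8000 >> log2(n) */
    }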
+
+cglobal ipred_cfl, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
+    movifnidn            hd, hm
+    movifnidn            wd, wm
+    tzcnt               r6d, hd
+    lea                 t0d, [wq+hq]
+    movd                xm4, t0d
+    tzcnt               t0d, t0d
+    movd                xm5, t0d
+    lea                  t0, [ipred_cfl_avx2_table]
+    tzcnt                wd, wd
+    movsxd               r6, [t0+r6*4]
+    movsxd               wq, [t0+wq*4+4*4]
+    pcmpeqd              m3, m3
+    psrlw               xm4, 1
+    add                  r6, t0
+    add                  wq, t0
+    movifnidn           acq, acmp
+    jmp                  r6
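The entry sequence is a double table dispatch: tzcnt turns h and w into log2 indices, r6 selects the .h* label that loads and sums the correct left-edge length, and wq (offset by 4*4 to skip past the four height entries in the table) selects the .w*/.s* code that the .h* block later reaches with jmp wq. A rough C model of the control flow (function-pointer tables standing in for the jump tables; the index bias is folded into the table base in the real code):

    typedef void (*cfl_label_fn)(void);          /* stand-ins for asm labels */

    static void ipred_cfl_dispatch(int w, int h,
                                   const cfl_label_fn h_tab[4], /* .h4 .. .h32 */
                                   const cfl_label_fn w_tab[4]) /* .w4 .. .w32 */
    {
        const int hidx = __builtin_ctz(h) - 2;   /* tzcnt: 4 -> 0, 8 -> 1, ... */
        const int widx = __builtin_ctz(w) - 2;
        h_tab[hidx]();   /* jmp r6: load + sum the edge for this height   */
        w_tab[widx]();   /* jmp wq: finish the DC, splat, run the ac loop */
    }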
+.h4:
+    movd                xm0, [tlq-4]
+    pmaddubsw           xm0, xm3
+    jmp                  wq
+.w4:
+    movd                xm1, [tlq+1]
+    pmaddubsw           xm1, xm3
+    psubw               xm0, xm4
+    paddw               xm0, xm1
+    pmaddwd             xm0, xm3
+    cmp                  hd, 4
+    jg .w4_mul
+    psrlw               xm0, 3
+    jmp .w4_end
+.w4_mul:
+    punpckhqdq          xm1, xm0, xm0
+    lea                 r2d, [hq*2]
+    mov                 r6d, 0x55563334
+    paddw               xm0, xm1
+    shrx                r6d, r6d, r2d
+    psrlq               xm1, xm0, 32
+    paddw               xm0, xm1
+    movd                xm1, r6d
+    psrlw               xm0, 2
+    pmulhuw             xm0, xm1
+.w4_end:
+    vpbroadcastw         m0, xm0
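For rectangular blocks the DC sum must be divided by w + h, which for w = 4 is 12 (h = 8) or 20 (h = 16), not a power of two. The power-of-two factor is shifted out with psrlw, and the remaining /3 or /5 becomes a 16-bit fixed-point multiply via pmulhuw (high half of the unsigned product). Both reciprocals sit in one immediate: shrx of 0x55563334 by 2*h leaves 0x5556 (about 2^16/3) in the low word for h = 8, and for h = 16 the shift count 32 wraps to 0, leaving 0x3334 (about 2^16/5). The rounding bias was already folded in by the earlier psubw xm0, xm4: the edge sums are kept negated, so subtracting (w+h)/2 adds the bias. The arithmetic as a sketch:

    /* Divide the bias-adjusted DC sum by w + h = 12 or 20 without a div. */
    static int cfl_dc_div_w4(int sum, int h)         /* w == 4; h == 8 or 16 */
    {
        const unsigned recip = h == 8 ? 0x5556       /* ceil(2^16 / 3) */
                                      : 0x3334;      /* ceil(2^16 / 5) */
        return ((sum >> 2) * recip) >> 16;           /* psrlw 2 + pmulhuw */
    }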
+.s4:
+    vpbroadcastw         m1, alpham
+    lea                  r6, [strideq*3]
+    pabsw                m2, m1
+    psllw                m2, 9
+.s4_loop:
+    mova                 m4, [acq]
+    IPRED_CFL             4
+    packuswb             m4, m4
+    vextracti128        xm5, m4, 1
+    movd   [dstq+strideq*0], xm4
+    pextrd [dstq+strideq*1], xm4, 1
+    movd   [dstq+strideq*2], xm5
+    pextrd [dstq+r6       ], xm5, 1
+    lea                dstq, [dstq+strideq*4]
+    add                 acq, 32
+    sub                  hd, 4
+    jg .s4_loop
+    RET
+ALIGN function_align
+.h8:
+    movq                xm0, [tlq-8]
+    pmaddubsw           xm0, xm3
+    jmp                  wq
+.w8:
+    movq                xm1, [tlq+1]
+    vextracti128        xm2, m0, 1
+    pmaddubsw           xm1, xm3
+    psubw               xm0, xm4
+    paddw               xm0, xm2
+    punpckhqdq          xm2, xm0, xm0
+    paddw               xm0, xm2
+    paddw               xm0, xm1
+    psrlq               xm1, xm0, 32
+    paddw               xm0, xm1
+    pmaddwd             xm0, xm3
+    psrlw               xm0, xm5
+    cmp                  hd, 8
+    je .w8_end
+    mov                 r6d, 0x5556
+    mov                 r2d, 0x3334
+    cmp                  hd, 32
+    cmovz               r6d, r2d
+    movd                xm1, r6d
+    pmulhuw             xm0, xm1
+.w8_end:
+    vpbroadcastw         m0, xm0
+.s8:
+    vpbroadcastw         m1, alpham
+    lea                  r6, [strideq*3]
+    pabsw                m2, m1
+    psllw                m2, 9
+.s8_loop:
+    mova                 m4, [acq]
+    mova                 m5, [acq+32]
+    IPRED_CFL             4
+    IPRED_CFL             5
+    packuswb             m4, m5
+    vextracti128        xm5, m4, 1
+    movq   [dstq+strideq*0], xm4
+    movq   [dstq+strideq*1], xm5
+    movhps [dstq+strideq*2], xm4
+    movhps [dstq+r6       ], xm5
+    lea                dstq, [dstq+strideq*4]
+    add                 acq, 64
+    sub                  hd, 4
+    jg .s8_loop
+    RET
+ALIGN function_align
+.h16:
+    mova                xm0, [tlq-16]
+    pmaddubsw           xm0, xm3
+    jmp                  wq
+.w16:
+    movu                xm1, [tlq+1]
+    vextracti128        xm2, m0, 1
+    pmaddubsw           xm1, xm3
+    psubw               xm0, xm4
+    paddw               xm0, xm2
+    paddw               xm0, xm1
+    punpckhqdq          xm1, xm0, xm0
+    paddw               xm0, xm1
+    psrlq               xm1, xm0, 32
+    paddw               xm0, xm1
+    pmaddwd             xm0, xm3
+    psrlw               xm0, xm5
+    cmp                  hd, 16
+    je .w16_end
+    mov                 r6d, 0x5556
+    mov                 r2d, 0x3334
+    test                 hb, 8|32
+    cmovz               r6d, r2d
+    movd                xm1, r6d
+    pmulhuw             xm0, xm1
+.w16_end:
+    vpbroadcastw         m0, xm0
+.s16:
+    vpbroadcastw         m1, alpham
+    pabsw                m2, m1
+    psllw                m2, 9
+.s16_loop:
+    mova                 m4, [acq]
+    mova                 m5, [acq+32]
+    IPRED_CFL             4
+    IPRED_CFL             5
+    packuswb             m4, m5
+    vpermq               m4, m4, q3120
+    mova   [dstq+strideq*0], xm4
+    vextracti128 [dstq+strideq*1], m4, 1
+    lea                dstq, [dstq+strideq*2]
+    add                 acq, 64
+    sub                  hd, 2
+    jg .s16_loop
+    RET
+ALIGN function_align
+.h32:
+    mova                 m0, [tlq-32]
+    pmaddubsw            m0, m3
+    jmp                  wq
+.w32:
+    movu                 m1, [tlq+1]
+    pmaddubsw            m1, m3
+    paddw                m0, m1
+    vextracti128        xm1, m0, 1
+    psubw               xm0, xm4
+    paddw               xm0, xm1
+    punpckhqdq          xm1, xm0, xm0
+    paddw               xm0, xm1
+    psrlq               xm1, xm0, 32
+    paddw               xm0, xm1
+    pmaddwd             xm0, xm3
+    psrlw               xm0, xm5
+    cmp                  hd, 32
+    je .w32_end
+    lea                 r2d, [hq*2]
+    mov                 r6d, 0x33345556
+    shrx                r6d, r6d, r2d
+    movd                xm1, r6d
+    pmulhuw             xm0, xm1
+.w32_end:
+    vpbroadcastw         m0, xm0
+.s32:
+    vpbroadcastw         m1, alpham
+    pabsw                m2, m1
+    psllw                m2, 9
+.s32_loop:
+    mova                 m4, [acq]
+    mova                 m5, [acq+32]
+    IPRED_CFL             4
+    IPRED_CFL             5
+    packuswb             m4, m5
+    vpermq               m4, m4, q3120
+    mova             [dstq], m4
+    add                dstq, strideq
+    add                 acq, 64
+    dec                  hd
+    jg .s32_loop
+    RET
+
+cglobal ipred_cfl_128, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
+    lea                  t0, [ipred_cfl_splat_avx2_table]
+    tzcnt                wd, wm
+    movifnidn            hd, hm
+    movsxd               wq, [t0+wq*4]
+    vpbroadcastd         m0, [t0-ipred_cfl_splat_avx2_table+pw_128]
+    add                  wq, t0
+    movifnidn           acq, acmp
+    jmp                  wq
 %endif
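ipred_cfl_128 needs no neighbours at all: the DC term is fixed mid-grey, broadcast from the pw_128 constant (addressed relative to t0 to spare a second lea), and the function jumps straight into the shared .s* loops. In terms of the cfl_pred_sketch helper from the first note above:

    /* Same splat path, constant DC; 128 is 1 << (bitdepth - 1) for 8-bit. */
    static void ipred_cfl_128_sketch(uint8_t *dst, ptrdiff_t stride,
                                     const int16_t *ac, int alpha, int w, int h)
    {
        cfl_pred_sketch(dst, stride, ac, alpha, 128, w, h);
    }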
src/x86/ipred_init.c
@@ -39,6 +39,11 @@ decl_angular_ipred_fn(dav1d_ipred_smooth_avx2);
 decl_angular_ipred_fn(dav1d_ipred_smooth_v_avx2);
 decl_angular_ipred_fn(dav1d_ipred_smooth_h_avx2);
 
+decl_cfl_pred_fn(dav1d_ipred_cfl_avx2);
+decl_cfl_pred_fn(dav1d_ipred_cfl_128_avx2);
+decl_cfl_pred_fn(dav1d_ipred_cfl_top_avx2);
+decl_cfl_pred_fn(dav1d_ipred_cfl_left_avx2);
+
 void bitfn(dav1d_intra_pred_dsp_init_x86)(Dav1dIntraPredDSPContext *const c) {
     const unsigned flags = dav1d_get_cpu_flags();
@@ -55,5 +60,10 @@ void bitfn(dav1d_intra_pred_dsp_init_x86)(Dav1dIntraPredDSPContext *const c) {
     c->intra_pred[SMOOTH_PRED]   = dav1d_ipred_smooth_avx2;
     c->intra_pred[SMOOTH_V_PRED] = dav1d_ipred_smooth_v_avx2;
     c->intra_pred[SMOOTH_H_PRED] = dav1d_ipred_smooth_h_avx2;
+
+    c->cfl_pred[DC_PRED]      = dav1d_ipred_cfl_avx2;
+    c->cfl_pred[DC_128_PRED]  = dav1d_ipred_cfl_128_avx2;
+    c->cfl_pred[TOP_DC_PRED]  = dav1d_ipred_cfl_top_avx2;
+    c->cfl_pred[LEFT_DC_PRED] = dav1d_ipred_cfl_left_avx2;
 #endif
 }
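After initialization, callers reach these kernels only through the DSP table. An illustrative call, with the cfl_pred signature paraphrased from dav1d's headers of this period (treat it as an assumption; note also that bitfn() expands to a bitdepth-suffixed symbol, shown unsuffixed here for brevity):

    /* Assumed prototype behind decl_cfl_pred_fn:
     * void fn(pixel *dst, ptrdiff_t stride, const pixel *topleft,
     *         int width, int height, const int16_t *ac, int alpha); */
    uint8_t  dst[16 * 16];
    uint8_t  edge[33];            /* left edge, corner, top edge */
    int16_t  ac[16 * 16];         /* zero-mean luma, filled elsewhere */

    Dav1dIntraPredDSPContext c;
    dav1d_intra_pred_dsp_init_x86(&c); /* fills cfl_pred[] when AVX2 is present */
    c.cfl_pred[DC_PRED](dst, 16, edge + 16, 16, 16, ac, -3);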