Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
D
dav1d
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Service Desk
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Operations
Operations
Environments
Analytics
Analytics
CI / CD
Repository
Value Stream
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
François Cartegnie
dav1d
Commits
e4d0bd41
Commit
e4d0bd41
authored
Oct 08, 2018
by
Henrik Gramner
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
x86: Add paeth intra prediction AVX2 asm
parent
701f88b9
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
154 additions
and
5 deletions
+154
-5
src/x86/ipred.asm
src/x86/ipred.asm
+152
-5
src/x86/ipred_init.c
src/x86/ipred_init.c
+2
-0
No files found.
src/x86/ipred.asm
View file @
e4d0bd41
...
...
@@ -28,8 +28,12 @@
%if ARCH_X86_64
SECTION
_RODATA
SECTION
_RODATA
32
paeth_shuf:
db
7
,
7
,
7
,
7
,
3
,
3
,
3
,
3
,
6
,
6
,
6
,
6
,
2
,
2
,
2
,
2
db
5
,
5
,
5
,
5
,
1
,
1
,
1
,
1
,
4
,
4
,
4
,
4
,
0
,
0
,
0
,
0
pb_1:
times
4
db
1
pb_128:
times
4
db
128
%macro JMP_TABLE 3-*
...
...
@@ -44,10 +48,11 @@ pb_128: times 4 db 128
%define ipred_dc_splat_avx2_table (ipred_dc_avx2_table + 10*4)
JMP_TABLE
ipred_dc
,
avx2
,
h4
,
h8
,
h16
,
h32
,
h64
,
w4
,
w8
,
w16
,
w32
,
w64
,
\
s4
-
10
*
4
,
s8
-
10
*
4
,
s16
-
10
*
4
,
s32
-
10
*
4
,
s64
-
10
*
4
JMP_TABLE
ipred_dc_left
,
avx2
,
h4
,
h8
,
h16
,
h32
,
h64
JMP_TABLE
ipred_h
,
avx2
,
w4
,
w8
,
w16
,
w32
,
w64
JMP_TABLE
ipred_paeth
,
avx2
,
w4
,
w8
,
w16
,
w32
,
w64
JMP_TABLE
ipred_dc
,
avx2
,
h4
,
h8
,
h16
,
h32
,
h64
,
w4
,
w8
,
w16
,
w32
,
w64
,
\
s4
-
10
*
4
,
s8
-
10
*
4
,
s16
-
10
*
4
,
s32
-
10
*
4
,
s64
-
10
*
4
JMP_TABLE
ipred_dc_left
,
avx2
,
h4
,
h8
,
h16
,
h32
,
h64
JMP_TABLE
ipred_h
,
avx2
,
w4
,
w8
,
w16
,
w32
,
w64
SECTION
.text
...
...
@@ -396,4 +401,146 @@ INIT_YMM avx2
jg
.w64
RET
%macro PAETH 2
; top, ldiff
pavgb
m1
,
m
%
1
,
m3
; Calculating tldiff normally requires
pxor
m0
,
m
%
1
,
m3
; 10-bit intermediates, but we can do it
pand
m0
,
m4
; in 8-bit with some tricks which avoids
psubusb
m2
,
m5
,
m1
; having to unpack everything to 16-bit.
psubb
m1
,
m0
psubusb
m1
,
m5
por
m1
,
m2
paddusb
m1
,
m1
por
m1
,
m0
; min(tldiff, 255)
psubusb
m2
,
m5
,
m3
psubusb
m0
,
m3
,
m5
por
m2
,
m0
; tdiff
pminub
m2
,
m
%
2
pcmpeqb
m0
,
m
%
2
,
m2
; ldiff <= tdiff
vpblendvb
m0
,
m
%
1
,
m3
,
m0
pminub
m1
,
m2
pcmpeqb
m1
,
m2
; ldiff <= tldiff || tdiff <= tldiff
vpblendvb
m0
,
m5
,
m0
,
m1
%endmacro
cglobal
ipred_paeth
,
3
,
6
,
9
,
ds
t
,
stride
,
tl
,
w
,
h
lea
r5
,
[
ipred_paeth_avx2_table
]
tzcnt
wd
,
wm
vpbroadcastb
m5
,
[
tlq
]
; topleft
movifnidn
hd
,
hm
movsxd
wq
,
[
r5
+
wq
*
4
]
vpbroadcastd
m4
,
[
r5
-
ipred_paeth_avx2_table
+
pb_1
]
add
wq
,
r5
jmp
wq
.w4:
vpbroadcastd
m6
,
[
tlq
+
1
]
; top
mova
m8
,
[
r5
-
ipred_paeth_avx2_table
+
paeth_shuf
]
lea
r3
,
[
strideq
*
3
]
psubusb
m7
,
m5
,
m6
psubusb
m0
,
m6
,
m5
por
m7
,
m0
; ldiff
.w4_loop:
sub
tlq
,
8
vpbroadcastq
m3
,
[
tlq
]
pshufb
m3
,
m8
; left
PAETH
6
,
7
vextracti128
xm1
,
m0
,
1
movd
[
ds
tq
+
strideq
*
0
],
xm0
pextrd
[
ds
tq
+
strideq
*
1
],
xm0
,
2
movd
[
ds
tq
+
strideq
*
2
],
xm1
pextrd
[
ds
tq
+
r3
],
xm1
,
2
cmp
hd
,
4
je
.ret
lea
ds
tq
,
[
ds
tq
+
strideq
*
4
]
pextrd
[
ds
tq
+
strideq
*
0
],
xm0
,
1
pextrd
[
ds
tq
+
strideq
*
1
],
xm0
,
3
pextrd
[
ds
tq
+
strideq
*
2
],
xm1
,
1
pextrd
[
ds
tq
+
r3
],
xm1
,
3
lea
ds
tq
,
[
ds
tq
+
strideq
*
4
]
sub
hd
,
8
jg
.w4_loop
.ret:
RET
ALIGN
function_align
.w8:
vpbroadcastq
m6
,
[
tlq
+
1
]
mova
m8
,
[
r5
-
ipred_paeth_avx2_table
+
paeth_shuf
]
lea
r3
,
[
strideq
*
3
]
psubusb
m7
,
m5
,
m6
psubusb
m0
,
m6
,
m5
por
m7
,
m0
.w8_loop:
sub
tlq
,
4
vpbroadcastd
m3
,
[
tlq
]
pshufb
m3
,
m8
PAETH
6
,
7
vextracti128
xm1
,
m0
,
1
movq
[
ds
tq
+
strideq
*
0
],
xm0
movhps
[
ds
tq
+
strideq
*
1
],
xm0
movq
[
ds
tq
+
strideq
*
2
],
xm1
movhps
[
ds
tq
+
r3
],
xm1
lea
ds
tq
,
[
ds
tq
+
strideq
*
4
]
sub
hd
,
4
jg
.w8_loop
RET
ALIGN
function_align
.w16:
vbroadcasti128
m6
,
[
tlq
+
1
]
mova
xm8
,
xm4
; lower half = 1, upper half = 0
psubusb
m7
,
m5
,
m6
psubusb
m0
,
m6
,
m5
por
m7
,
m0
.w16_loop:
sub
tlq
,
2
vpbroadcastd
m3
,
[
tlq
]
pshufb
m3
,
m8
PAETH
6
,
7
mova
[
ds
tq
+
strideq
*
0
],
xm0
vextracti128
[
ds
tq
+
strideq
*
1
],
m0
,
1
lea
ds
tq
,
[
ds
tq
+
strideq
*
2
]
sub
hd
,
2
jg
.w16_loop
RET
ALIGN
function_align
.w32:
movu
m6
,
[
tlq
+
1
]
psubusb
m7
,
m5
,
m6
psubusb
m0
,
m6
,
m5
por
m7
,
m0
.w32_loop:
dec
tlq
vpbroadcastb
m3
,
[
tlq
]
PAETH
6
,
7
mova
[
ds
tq
],
m0
add
ds
tq
,
strideq
dec
hd
jg
.w32_loop
RET
ALIGN
function_align
.w64:
movu
m6
,
[
tlq
+
1
]
movu
m7
,
[
tlq
+
33
]
%if WIN64
movaps
r4m
,
xmm9
%endif
psubusb
m8
,
m5
,
m6
psubusb
m0
,
m6
,
m5
psubusb
m9
,
m5
,
m7
psubusb
m1
,
m7
,
m5
por
m8
,
m0
por
m9
,
m1
.w64_loop:
dec
tlq
vpbroadcastb
m3
,
[
tlq
]
PAETH
6
,
8
mova
[
ds
tq
+
32
*
0
],
m0
PAETH
7
,
9
mova
[
ds
tq
+
32
*
1
],
m0
add
ds
tq
,
strideq
dec
hd
jg
.w64_loop
%if WIN64
movaps
xmm9
,
r4m
%endif
RET
%endif
src/x86/ipred_init.c
View file @
e4d0bd41
...
...
@@ -34,6 +34,7 @@ decl_angular_ipred_fn(dav1d_ipred_dc_top_avx2);
decl_angular_ipred_fn
(
dav1d_ipred_dc_left_avx2
);
decl_angular_ipred_fn
(
dav1d_ipred_h_avx2
);
decl_angular_ipred_fn
(
dav1d_ipred_v_avx2
);
decl_angular_ipred_fn
(
dav1d_ipred_paeth_avx2
);
void
bitfn
(
dav1d_intra_pred_dsp_init_x86
)(
Dav1dIntraPredDSPContext
*
const
c
)
{
const
unsigned
flags
=
dav1d_get_cpu_flags
();
...
...
@@ -47,5 +48,6 @@ void bitfn(dav1d_intra_pred_dsp_init_x86)(Dav1dIntraPredDSPContext *const c) {
c
->
intra_pred
[
LEFT_DC_PRED
]
=
dav1d_ipred_dc_left_avx2
;
c
->
intra_pred
[
HOR_PRED
]
=
dav1d_ipred_h_avx2
;
c
->
intra_pred
[
VERT_PRED
]
=
dav1d_ipred_v_avx2
;
c
->
intra_pred
[
PAETH_PRED
]
=
dav1d_ipred_paeth_avx2
;
#endif
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment