François Cartegnie / dav1d / Commits / 59c3370e

Commit 59c3370e authored Oct 11, 2018 by Henrik Gramner

x86: Add smooth intra prediction AVX2 asm

parent 613ef787

Showing 2 changed files with 704 additions and 11 deletions:
  src/x86/ipred.asm     +698  -11
  src/x86/ipred_init.c    +6   -0

src/x86/ipred.asm
@@ -30,11 +30,44 @@

SECTION_RODATA 32

paeth_shuf:     db  7,  7,  7,  7,  3,  3,  3,  3,  6,  6,  6,  6,  2,  2,  2,  2
                db  5,  5,  5,  5,  1,  1,  1,  1,  4,  4,  4,  4,  0,  0,  0,  0
%macro SMOOTH_WEIGHT_TABLE 1-*
    %rep %0
        db %1-128, 127-%1
        %rotate 1
    %endrep
%endmacro
; sm_weights[], but modified to precalculate x and 256-x with offsets to
; enable efficient use of pmaddubsw (which requires signed values)
smooth_weights: SMOOTH_WEIGHT_TABLE         \
      0,   0, 255, 128, 255, 149,  85,  64, \
    255, 197, 146, 105,  73,  50,  37,  32, \
    255, 225, 196, 170, 145, 123, 102,  84, \
     68,  54,  43,  33,  26,  20,  17,  16, \
    255, 240, 225, 210, 196, 182, 169, 157, \
    145, 133, 122, 111, 101,  92,  83,  74, \
     66,  59,  52,  45,  39,  34,  29,  25, \
     21,  17,  14,  12,  10,   9,   8,   8, \
    255, 248, 240, 233, 225, 218, 210, 203, \
    196, 189, 182, 176, 169, 163, 156, 150, \
    144, 138, 133, 127, 121, 116, 111, 106, \
    101,  96,  91,  86,  82,  77,  73,  69, \
     65,  61,  57,  54,  50,  47,  44,  41, \
     38,  35,  32,  29,  27,  25,  22,  20, \
     18,  16,  15,  13,  12,  10,   9,   8, \
      7,   6,   6,   5,   5,   4,   4,   4
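The comment above describes the layout: each AV1 sm_weights[] entry w is stored as the signed byte pair (w - 128, 127 - w), so a single pmaddubsw of two unsigned pixels against that pair yields (w - 128)*a + (127 - w)*b. A minimal C sketch of the transform, purely for illustration (sm_weights here stands for the scalar weight table from the AV1 spec / dav1d's C code, not a symbol in this file):

    #include <stdint.h>

    /* Illustration of SMOOTH_WEIGHT_TABLE: store each weight w as the
     * signed pair (w - 128, 127 - w) so pmaddubsw can consume it. */
    static void build_smooth_weights(int8_t *dst, const uint8_t *sm_weights, int n)
    {
        for (int i = 0; i < n; i++) {
            dst[2 * i + 0] = (int8_t)(sm_weights[i] - 128);
            dst[2 * i + 1] = (int8_t)(127 - sm_weights[i]);
        }
    }

The missing 128*a + 129*b part of the blend is added separately as a precalculated term; see the SMOOTH macro further down.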
ipred_v_shuf:   db  0,  1,  0,  1,  4,  5,  4,  5,  8,  9,  8,  9, 12, 13, 12, 13
                db  2,  3,  2,  3,  6,  7,  6,  7, 10, 11, 10, 11, 14, 15, 14, 15
ipred_h_shuf:   db  7,  7,  7,  7,  3,  3,  3,  3,  5,  5,  5,  5,  1,  1,  1,  1
                db  6,  6,  6,  6,  2,  2,  2,  2,  4,  4,  4,  4,  0,  0,  0,  0
pb_1:   times 4 db 1
pb_128: times 4 db 128
pw_128: times 2 dw 128
pw_255: times 2 dw 255
pb_127_m127: times 2 db 127, -127

%macro JMP_TABLE 3-*
    %xdefine %1_%2_table (%%table - 2*4)
@@ -48,6 +81,9 @@ pb_128: times 4 db 128

%define ipred_dc_splat_avx2_table (ipred_dc_avx2_table + 10*4)

JMP_TABLE ipred_smooth,   avx2, w4, w8, w16, w32, w64
JMP_TABLE ipred_smooth_v, avx2, w4, w8, w16, w32, w64
JMP_TABLE ipred_smooth_h, avx2, w4, w8, w16, w32, w64
JMP_TABLE ipred_paeth,    avx2, w4, w8, w16, w32, w64
JMP_TABLE ipred_dc,       avx2, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \
                                s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4
@@ -423,17 +459,18 @@ INIT_YMM avx2

%endmacro

cglobal ipred_paeth, 3, 6, 9, dst, stride, tl, w, h
%define base r5-ipred_paeth_avx2_table
    lea                  r5, [ipred_paeth_avx2_table]
    tzcnt                wd, wm
    vpbroadcastb         m5, [tlq]   ; topleft
    movifnidn            hd, hm
    movsxd               wq, [r5+wq*4]
    vpbroadcastd         m4, [r5-ipred_paeth_avx2_table+pb_1]
    vpbroadcastd         m4, [base+pb_1]
    add                  wq, r5
    jmp                  wq
.w4:
    vpbroadcastd         m6, [tlq+1]   ; top
    mova                 m8, [r5-ipred_paeth_avx2_table+paeth_shuf]
    mova                 m8, [base+ipred_h_shuf]
    lea                  r3, [strideq*3]
    psubusb              m7, m5, m6
    psubusb              m0, m6, m5
@@ -445,15 +482,15 @@ cglobal ipred_paeth, 3, 6, 9, dst, stride, tl, w, h

    PAETH                 6, 7
    vextracti128        xm1, m0, 1
    movd   [dstq+strideq*0], xm0
    pextrd [dstq+strideq*1], xm0, 2
    movd   [dstq+strideq*2], xm1
    movd   [dstq+strideq*1], xm1
    pextrd [dstq+strideq*2], xm0, 2
    pextrd [dstq+r3       ], xm1, 2
    cmp                  hd, 4
    je .ret
    lea                dstq, [dstq+strideq*4]
    pextrd [dstq+strideq*0], xm0, 1
    pextrd [dstq+strideq*1], xm0, 3
    pextrd [dstq+strideq*2], xm1, 1
    pextrd [dstq+strideq*1], xm1, 1
    pextrd [dstq+strideq*2], xm0, 3
    pextrd [dstq+r3       ], xm1, 3
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 8
@@ -463,7 +500,7 @@ cglobal ipred_paeth, 3, 6, 9, dst, stride, tl, w, h

ALIGN function_align
.w8:
    vpbroadcastq         m6, [tlq+1]
    mova                 m8, [r5-ipred_paeth_avx2_table+paeth_shuf]
    mova                 m8, [base+ipred_h_shuf]
    lea                  r3, [strideq*3]
    psubusb              m7, m5, m6
    psubusb              m0, m6, m5
@@ -475,8 +512,8 @@ ALIGN function_align

    PAETH                 6, 7
    vextracti128        xm1, m0, 1
    movq   [dstq+strideq*0], xm0
    movhps [dstq+strideq*1], xm0
    movq   [dstq+strideq*2], xm1
    movq   [dstq+strideq*1], xm1
    movhps [dstq+strideq*2], xm0
    movhps [dstq+r3       ], xm1
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4
@@ -543,4 +580,654 @@ ALIGN function_align

%endif
    RET

%macro SMOOTH 6 ; src[1-2], mul[1-2], add[1-2]
    ; w * a         = (w - 128) * a + 128 * a
    ; (256 - w) * b = (127 - w) * b + 129 * b
    pmaddubsw            m0, m%3, m%1
    pmaddubsw            m1, m%4, m%2
    paddw                m0, m%5
    paddw                m1, m%6
    psrlw                m0, 8
    psrlw                m1, 8
    packuswb             m0, m1
%endmacro
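The two comments spell out the decomposition the macro relies on: w*a = (w - 128)*a + 128*a and (256 - w)*b = (127 - w)*b + 129*b, so the pmaddubsw result plus a precalculated 128*a + 129*b (+ rounding) term equals the full blend. A scalar C sketch of one lane, for illustration only (the real code keeps these as packed 16-bit words, where the documented wrap-around is harmless):

    #include <stdint.h>

    /* One lane of SMOOTH: a = top/left pixel, b = bottom/right pixel,
     * w = 0..255 weight from smooth_weights. */
    static uint8_t smooth_blend(uint8_t a, uint8_t b, uint8_t w)
    {
        int mad = (w - 128) * a + (127 - w) * b; /* what pmaddubsw computes */
        int add = 128 * a + 129 * b + 128;       /* precalculated add term  */
        /* identical to (w*a + (256 - w)*b + 128) >> 8 */
        return (uint8_t)((mad + add) >> 8);
    }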
cglobal ipred_smooth_v, 3, 7, 0, dst, stride, tl, w, h, weights
%define base r6-ipred_smooth_v_avx2_table
    lea                  r6, [ipred_smooth_v_avx2_table]
    tzcnt                wd, wm
    mov                  hd, hm
    movsxd               wq, [r6+wq*4]
    vpbroadcastd         m0, [base+pb_127_m127]
    vpbroadcastd         m1, [base+pw_128]
    lea            weightsq, [base+smooth_weights+hq*4]
    neg                  hq
    vpbroadcastb         m5, [tlq+hq]   ; bottom
    add                  wq, r6
    jmp                  wq
.w4:
    vpbroadcastd         m2, [tlq+1]
    punpcklbw            m2, m5   ; top, bottom
    mova                 m5, [base+ipred_v_shuf]
    lea                  r3, [strideq*3]
    punpckldq            m4, m5, m5
    punpckhdq            m5, m5
    pmaddubsw            m3, m2, m0
    paddw                m1, m2   ; 1 * top + 256 * bottom + 128, overflow is ok
    paddw                m3, m1   ; 128 * top + 129 * bottom + 128
.w4_loop:
    vbroadcasti128       m1, [weightsq+hq*2]
    pshufb               m0, m1, m4
    pshufb               m1, m5
    SMOOTH                0, 1, 2, 2, 3, 3
    vextracti128        xm1, m0, 1
    movd   [dstq+strideq*0], xm0
    movd   [dstq+strideq*1], xm1
    pextrd [dstq+strideq*2], xm0, 1
    pextrd [dstq+r3       ], xm1, 1
    cmp                  hd, -4
    je .ret
    lea                dstq, [dstq+strideq*4]
    pextrd [dstq+strideq*0], xm0, 2
    pextrd [dstq+strideq*1], xm1, 2
    pextrd [dstq+strideq*2], xm0, 3
    pextrd [dstq+r3       ], xm1, 3
    lea                dstq, [dstq+strideq*4]
    add                  hq, 8
    jl .w4_loop
.ret:
    RET
ALIGN function_align
.w8:
    vpbroadcastq         m2, [tlq+1]
    punpcklbw            m2, m5
    mova                 m5, [base+ipred_v_shuf]
    lea                  r3, [strideq*3]
    pshufd               m4, m5, q0000
    pshufd               m5, m5, q1111
    pmaddubsw            m3, m2, m0
    paddw                m1, m2
    paddw                m3, m1
.w8_loop:
    vpbroadcastq         m1, [weightsq+hq*2]
    pshufb               m0, m1, m4
    pshufb               m1, m5
    SMOOTH                0, 1, 2, 2, 3, 3
    vextracti128        xm1, m0, 1
    movq   [dstq+strideq*0], xm0
    movq   [dstq+strideq*1], xm1
    movhps [dstq+strideq*2], xm0
    movhps [dstq+r3       ], xm1
    lea                dstq, [dstq+strideq*4]
    add                  hq, 4
    jl .w8_loop
    RET
ALIGN function_align
.w16:
    WIN64_SPILL_XMM       7
    vbroadcasti128       m3, [tlq+1]
    mova                 m6, [base+ipred_v_shuf]
    punpcklbw            m2, m3, m5
    punpckhbw            m3, m5
    pmaddubsw            m4, m2, m0
    pmaddubsw            m5, m3, m0
    paddw                m0, m1, m2
    paddw                m1, m3
    paddw                m4, m0
    paddw                m5, m1
.w16_loop:
    vpbroadcastd         m1, [weightsq+hq*2]
    pshufb               m1, m6
    SMOOTH                1, 1, 2, 3, 4, 5
    mova   [dstq+strideq*0], xm0
    vextracti128 [dstq+strideq*1], m0, 1
    lea                dstq, [dstq+strideq*2]
    add                  hq, 2
    jl .w16_loop
    RET
ALIGN function_align
.w32:
%assign stack_offset stack_offset - stack_size_padded
    WIN64_SPILL_XMM       6
    movu                 m3, [tlq+1]
    punpcklbw            m2, m3, m5
    punpckhbw            m3, m5
    pmaddubsw            m4, m2, m0
    pmaddubsw            m5, m3, m0
    paddw                m0, m1, m2
    paddw                m1, m3
    paddw                m4, m0
    paddw                m5, m1
.w32_loop:
    vpbroadcastw         m1, [weightsq+hq*2]
    SMOOTH                1, 1, 2, 3, 4, 5
    mova             [dstq], m0
    add                dstq, strideq
    inc                  hq
    jl .w32_loop
    RET
ALIGN function_align
.w64:
    WIN64_SPILL_XMM      11
    movu                 m4, [tlq+1]
    movu                 m8, [tlq+33]
    punpcklbw            m3, m4, m5
    punpckhbw            m4, m5
    punpcklbw            m7, m8, m5
    punpckhbw            m8, m5
    pmaddubsw            m5, m3, m0
    pmaddubsw            m6, m4, m0
    pmaddubsw            m9, m7, m0
    pmaddubsw           m10, m8, m0
    paddw                m2, m1, m3
    paddw                m5, m2
    paddw                m2, m1, m4
    paddw                m6, m2
    paddw                m0, m1, m7
    paddw                m9, m0
    paddw                m1, m8
    paddw               m10, m1
.w64_loop:
    vpbroadcastw         m2, [weightsq+hq*2]
    SMOOTH                2, 2, 3, 4, 5, 6
    mova        [dstq+32*0], m0
    SMOOTH                2, 2, 7, 8, 9, 10
    mova        [dstq+32*1], m0
    add                dstq, strideq
    inc                  hq
    jl .w64_loop
    RET
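Per the comments in the setup above ("1 * top + 256 * bottom + 128" / "128 * top + 129 * bottom + 128"), every width path computes the same per-pixel blend. A hedged scalar sketch of what ipred_smooth_v produces (compare dav1d's C reference, ipred_smooth_v_c; parameter names here are illustrative, not the asm ABI):

    #include <stddef.h>
    #include <stdint.h>

    static void smooth_v_ref(uint8_t *dst, ptrdiff_t stride,
                             const uint8_t *topleft, int w, int h,
                             const uint8_t *weights /* h entries, 0..255 */)
    {
        const uint8_t *top    = &topleft[1];
        const uint8_t  bottom = topleft[-h]; /* same pixel as [tlq+hq] after neg hq */
        for (int y = 0; y < h; y++) {
            for (int x = 0; x < w; x++)
                dst[x] = (weights[y] * top[x] + (256 - weights[y]) * bottom + 128) >> 8;
            dst += stride;
        }
    }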
%macro SETUP_STACK_FRAME 3 ; stack_size, regs_used, xmm_regs_used
    %assign stack_offset 0
    %assign stack_size_padded 0
    %assign regs_used %2
    %xdefine rstk rsp
    SETUP_STACK_POINTER %1
    %if regs_used != %2 && WIN64
        PUSH r%2
    %endif
    ALLOC_STACK %1, %3
%endmacro
cglobal ipred_smooth_h, 3, 7, 0, dst, stride, tl, w, h
%define base r6-ipred_smooth_h_avx2_table
    lea                  r6, [ipred_smooth_h_avx2_table]
    mov                  wd, wm
    vpbroadcastb         m3, [tlq+wq]   ; right
    tzcnt                wd, wd
    mov                  hd, hm
    movsxd               wq, [r6+wq*4]
    vpbroadcastd         m4, [base+pb_127_m127]
    vpbroadcastd         m5, [base+pw_128]
    add                  wq, r6
    jmp                  wq
.w4:
    WIN64_SPILL_XMM       8
    vpbroadcastq         m6, [base+smooth_weights+4*2]
    mova                 m7, [base+ipred_h_shuf]
    sub                 tlq, 8
    sub                 tlq, hq
    lea                  r3, [strideq*3]
.w4_loop:
    vpbroadcastq         m2, [tlq+hq]
    pshufb               m2, m7
    punpcklbw            m1, m2, m3   ; left, right
    punpckhbw            m2, m3
    pmaddubsw            m0, m1, m4   ; 127 * left - 127 * right
    paddw                m0, m1       ; 128 * left + 129 * right
    pmaddubsw            m1, m6
    paddw                m1, m5
    paddw                m0, m1
    pmaddubsw            m1, m2, m4
    paddw                m1, m2
    pmaddubsw            m2, m6
    paddw                m2, m5
    paddw                m1, m2
    psrlw                m0, 8
    psrlw                m1, 8
    packuswb             m0, m1
    vextracti128        xm1, m0, 1
    movd   [dstq+strideq*0], xm0
    movd   [dstq+strideq*1], xm1
    pextrd [dstq+strideq*2], xm0, 2
    pextrd [dstq+r3       ], xm1, 2
    cmp                  hd, 4
    je .ret
    lea                dstq, [dstq+strideq*4]
    pextrd [dstq+strideq*0], xm0, 1
    pextrd [dstq+strideq*1], xm1, 1
    pextrd [dstq+strideq*2], xm0, 3
    pextrd [dstq+r3       ], xm1, 3
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 8
    jg .w4_loop
.ret:
    RET
ALIGN function_align
.w8:
%assign stack_offset stack_offset - stack_size_padded
    WIN64_SPILL_XMM       8
    vbroadcasti128       m6, [base+smooth_weights+8*2]
    mova                 m7, [base+ipred_h_shuf]
    sub                 tlq, 4
    lea                  r3, [strideq*3]
    sub                 tlq, hq
.w8_loop:
    vpbroadcastd         m2, [tlq+hq]
    pshufb               m2, m7
    punpcklbw            m1, m2, m3
    punpckhbw            m2, m3
    pmaddubsw            m0, m1, m4
    paddw                m0, m1
    pmaddubsw            m1, m6
    paddw                m1, m5
    paddw                m0, m1
    pmaddubsw            m1, m2, m4
    paddw                m1, m2
    pmaddubsw            m2, m6
    paddw                m2, m5
    paddw                m1, m2
    psrlw                m0, 8
    psrlw                m1, 8
    packuswb             m0, m1
    vextracti128        xm1, m0, 1
    movq   [dstq+strideq*0], xm0
    movq   [dstq+strideq*1], xm1
    movhps [dstq+strideq*2], xm0
    movhps [dstq+r3       ], xm1
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4
    jg .w8_loop
    RET
ALIGN function_align
.w16:
    SETUP_STACK_FRAME 32*4, 7, 8
    lea                  r3, [rsp+64*2-4]
    call .prep   ; only worthwhile for w16 and above
    sub                 tlq, 2
    vpbroadcastd        xm6, [base+pb_1]
    mova                xm7, [base+ipred_v_shuf+16]
    vinserti128          m7, [base+ipred_v_shuf+0], 1
    vbroadcasti128       m4, [base+smooth_weights+16*2]
    vbroadcasti128       m5, [base+smooth_weights+16*3]
.w16_loop:
    vpbroadcastd         m1, [tlq+hq]
    vpbroadcastd         m2, [r3+hq*2]
    pshufb               m1, m6
    punpcklbw            m1, m3
    pshufb               m2, m7
    SMOOTH                4, 5, 1, 1, 2, 2
    mova   [dstq+strideq*0], xm0
    vextracti128 [dstq+strideq*1], m0, 1
    lea                dstq, [dstq+strideq*2]
    sub                  hd, 2
    jg .w16_loop
    RET
ALIGN function_align
.w32:
    SETUP_STACK_FRAME 32*4, 7, 6
    lea                  r3, [rsp+64*2-2]
    call .prep
    dec                 tlq
    mova                xm4, [base+smooth_weights+16*4]
    vinserti128          m4, [base+smooth_weights+16*6], 1
    mova                xm5, [base+smooth_weights+16*5]
    vinserti128          m5, [base+smooth_weights+16*7], 1
.w32_loop:
    vpbroadcastb         m1, [tlq+hq]
    punpcklbw            m1, m3
    vpbroadcastw         m2, [r3+hq*2]
    SMOOTH                4, 5, 1, 1, 2, 2
    mova             [dstq], m0
    add                dstq, strideq
    dec                  hd
    jg .w32_loop
    RET
ALIGN function_align
.w64:
    SETUP_STACK_FRAME 32*4, 7, 9
    lea                  r3, [rsp+64*2-2]
    call .prep
    add                  r6, smooth_weights+16*15-ipred_smooth_h_avx2_table
    dec                 tlq
    mova                xm5, [r6-16*7]
    vinserti128          m5, [r6-16*5], 1
    mova                xm6, [r6-16*6]
    vinserti128          m6, [r6-16*4], 1
    mova                xm7, [r6-16*3]
    vinserti128          m7, [r6-16*1], 1
    mova                xm8, [r6-16*2]
    vinserti128          m8, [r6-16*0], 1
.w64_loop:
    vpbroadcastb         m2, [tlq+hq]
    punpcklbw            m2, m3
    vpbroadcastw         m4, [r3+hq*2]
    SMOOTH                5, 6, 2, 2, 4, 4
    mova        [dstq+32*0], m0
    SMOOTH                7, 8, 2, 2, 4, 4
    mova        [dstq+32*1], m0
    add                dstq, strideq
    dec                  hd
    jg .w64_loop
    RET
ALIGN function_align
.prep:
    vpermq               m2, [tlq-32*1], q3120
    punpckhbw            m1, m2, m3
    punpcklbw            m2, m3
    pmaddubsw            m0, m1, m4   ; 127 * left - 127 * right
    paddw                m1, m5       ; 1 * left + 256 * right + 128
    paddw                m0, m1       ; 128 * left + 129 * right + 128
    pmaddubsw            m1, m2, m4
    paddw                m2, m5
    paddw                m1, m2
    vpermq               m2, [tlq-32*2], q3120
    mova [rsp+gprsize+32*3], m0
    mova [rsp+gprsize+32*2], m1
    punpckhbw            m1, m2, m3
    punpcklbw            m2, m3
    pmaddubsw            m0, m1, m4
    paddw                m1, m5
    paddw                m0, m1
    pmaddubsw            m1, m2, m4
    paddw                m2, m5
    paddw                m1, m2
    mova [rsp+gprsize+32*1], m0
    mova [rsp+gprsize+32*0], m1
    sub                  r3, hq
    sub                 tlq, hq
    sub                  r3, hq
    ret
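The horizontal predictor mirrors the vertical one: the .w4_loop/.prep comments ("127 * left - 127 * right", "128 * left + 129 * right + 128") describe a per-column blend of the current left pixel against the top-right pixel. A hedged scalar sketch (compare dav1d's C reference, ipred_smooth_h_c; names are illustrative only):

    #include <stddef.h>
    #include <stdint.h>

    static void smooth_h_ref(uint8_t *dst, ptrdiff_t stride,
                             const uint8_t *topleft, int w, int h,
                             const uint8_t *weights /* w entries, 0..255 */)
    {
        const uint8_t right = topleft[w]; /* same pixel as [tlq+wq] */
        for (int y = 0; y < h; y++) {
            const uint8_t left = topleft[-(1 + y)];
            for (int x = 0; x < w; x++)
                dst[x] = (weights[x] * left + (256 - weights[x]) * right + 128) >> 8;
            dst += stride;
        }
    }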
%macro SMOOTH_2D_END 6 ; src[1-2], mul[1-2], add[1-2]
    pmaddubsw            m0, m%3, m%1
    pmaddubsw            m1, m%4, m%2
%ifnum %5
    paddw                m0, m%5
%else
    paddw                m0, %5
%endif
%ifnum %6
    paddw                m1, m%6
%else
    paddw                m1, %6
%endif
    pavgw                m0, m2
    pavgw                m1, m3
    psrlw                m0, 8
    psrlw                m1, 8
    packuswb             m0, m1
%endmacro
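SMOOTH_2D_END differs from SMOOTH in the pavgw step, which folds the vertical and horizontal partial sums together before the final shift. With the vertical add term biased by 255 (see the "+ 255" comments in ipred_smooth below), the sequence appears to work out to the usual (vert + horz + 256) >> 9 rounding; a scalar sketch of one pixel, for illustration only:

    #include <stdint.h>

    static uint8_t smooth_2d_pixel(uint8_t top, uint8_t bottom,
                                   uint8_t left, uint8_t right,
                                   uint8_t w_v, uint8_t w_h)
    {
        int v = w_v * top  + (256 - w_v) * bottom + 255; /* vertical sum, +255 bias */
        int h = w_h * left + (256 - w_h) * right;        /* horizontal sum          */
        int avg = (v + h + 1) >> 1;                      /* pavgw                   */
        return avg >> 8;                                 /* psrlw 8                 */
    }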
cglobal ipred_smooth, 3, 7, 0, dst, stride, tl, w, h, v_weights
%define base r6-ipred_smooth_avx2_table
    lea                  r6, [ipred_smooth_avx2_table]
    mov                  wd, wm
    vpbroadcastb         m4, [tlq+wq]   ; right
    tzcnt                wd, wd
    mov                  hd, hm
    mov                  r5, tlq
    sub                  r5, hq
    movsxd               wq, [r6+wq*4]
    vpbroadcastd         m5, [base+pb_127_m127]
    vpbroadcastb         m0, [r5]   ; bottom
    vpbroadcastd         m3, [base+pw_255]
    add                  wq, r6
    lea          v_weightsq, [base+smooth_weights+hq*2]
    jmp                  wq
.w4:
    WIN64_SPILL_XMM      12
    mova                m10, [base+ipred_h_shuf]
    vpbroadcastq        m11, [base+smooth_weights+4*2]
    mova                 m7, [base+ipred_v_shuf]
    vpbroadcastd         m8, [tlq+1]
    sub                 tlq, 8
    lea                  r3, [strideq*3]
    sub                 tlq, hq
    punpcklbw            m8, m0   ; top, bottom
    pshufd               m6, m7, q2200
    pshufd               m7, m7, q3311
    pmaddubsw            m9, m8, m5
    paddw                m3, m8   ; 1 * top + 255 * bottom + 255
    paddw                m9, m3   ; 128 * top + 129 * bottom + 255
.w4_loop:
    vpbroadcastq         m1, [tlq+hq]
    pshufb               m1, m10
    punpcklbw            m0, m1, m4   ; left, right
    punpckhbw            m1, m4
    pmaddubsw            m2, m0, m5   ; 127 * left - 127 * right
    pmaddubsw            m3, m1, m5
    paddw                m2, m0       ; 128 * left + 129 * right
    paddw                m3, m1
    pmaddubsw            m0, m11
    pmaddubsw            m1, m11
    paddw                m2, m0
    paddw                m3, m1
    vbroadcasti128       m1, [v_weightsq]
    add          v_weightsq, 16
    pshufb               m0, m1, m6
    pshufb               m1, m7
    SMOOTH_2D_END         0, 1, 8, 8, 9, 9
    vextracti128        xm1, m0, 1
    movd   [dstq+strideq*0], xm0
    movd   [dstq+strideq*1], xm1
    pextrd [dstq+strideq*2], xm0, 2
    pextrd [dstq+r3       ], xm1, 2
    cmp                  hd, 4
    je .ret
    lea                dstq, [dstq+strideq*4]
    pextrd [dstq+strideq*0], xm0, 1
    pextrd [dstq+strideq*1], xm1, 1
    pextrd [dstq+strideq*2], xm0, 3
    pextrd [dstq+r3       ], xm1, 3
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 8
    jg .w4_loop
.ret:
    RET
ALIGN function_align
.w8:
%assign stack_offset stack_offset - stack_size_padded
    WIN64_SPILL_XMM      12
    mova                m10, [base+ipred_h_shuf]
    vbroadcasti128      m11, [base+smooth_weights+8*2]
    mova                 m7, [base+ipred_v_shuf]
    vpbroadcastq         m8, [tlq+1]
    sub                 tlq, 4
    lea                  r3, [strideq*3]
    sub                 tlq, hq
    punpcklbw            m8, m0
    pshufd               m6, m7, q0000
    pshufd               m7, m7, q1111
    pmaddubsw            m9, m8, m5
    paddw                m3, m8
    paddw                m9, m3
.w8_loop:
    vpbroadcastd         m1, [tlq+hq]
    pshufb               m1, m10
    punpcklbw            m0, m1, m4
    punpckhbw            m1, m4
    pmaddubsw            m2, m0, m5
    pmaddubsw            m3, m1, m5
    paddw                m2, m0
    paddw                m3, m1
    pmaddubsw            m0, m11
    pmaddubsw            m1, m11
    paddw                m2, m0
    paddw                m3, m1
    vpbroadcastq         m1, [v_weightsq]
    add          v_weightsq, 8
    pshufb               m0, m1, m6
    pshufb               m1, m7
    SMOOTH_2D_END         0, 1, 8, 8, 9, 9
    vextracti128        xm1, m0, 1
    movq   [dstq+strideq*0], xm0
    movq   [dstq+strideq*1], xm1
    movhps [dstq+strideq*2], xm0
    movhps [dstq+r3       ], xm1
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4
    jg .w8_loop
    RET
ALIGN function_align
.w16:
    SETUP_STACK_FRAME 32*4, 7, 14
    vbroadcasti128      m11, [tlq+1]
    lea                  r3, [rsp+64*2-4]
    punpcklbw           m10, m11, m0   ; top, bottom
    punpckhbw           m11, m0
    call .prep_v
    sub                 tlq, 2
    pmaddubsw           m12, m10, m5
    pmaddubsw           m13, m11, m5
    vpbroadcastd        xm5, [base+pb_1]
    mova                 m9, [base+ipred_v_shuf]
    vbroadcasti128       m6, [base+smooth_weights+16*2]
    vbroadcasti128       m7, [base+smooth_weights+16*3]
    vpermq               m8, m9, q1032
    paddw                m0, m10, m3
    paddw                m3, m11
    paddw               m12, m0
    paddw               m13, m3
.w16_loop:
    vpbroadcastd         m3, [tlq+hq]
    vpbroadcastd         m0, [r3+hq*2]
    vpbroadcastd         m1, [v_weightsq]
    add          v_weightsq, 4
    pshufb               m3, m5
    punpcklbw            m3, m4   ; left, right
    pmaddubsw            m2, m3, m6
    pmaddubsw            m3, m7
    pshufb               m0, m8
    pshufb               m1, m9
    paddw                m2, m0
    paddw                m3, m0
    SMOOTH_2D_END         1, 1, 10, 11, 12, 13
    mova   [dstq+strideq*0], xm0
    vextracti128 [dstq+strideq*1], m0, 1
    lea                dstq, [dstq+strideq*2]
    sub                  hd, 2
    jg .w16_loop
    RET
ALIGN function_align
.w32:
    SETUP_STACK_FRAME 32*4, 7, 11
    movu                 m8, [tlq+1]
    lea                  r3, [rsp+64*2-2]
    punpcklbw            m7, m8, m0
    punpckhbw            m8, m0
    call .prep_v
    dec                 tlq
    pmaddubsw            m9, m7, m5
    pmaddubsw           m10, m8, m5
    mova                xm5, [base+smooth_weights+16*4]
    vinserti128          m5, [base+smooth_weights+16*6], 1
    mova                xm6, [base+smooth_weights+16*5]
    vinserti128          m6, [base+smooth_weights+16*7], 1
    paddw                m0, m7, m3
    paddw                m3, m8
    paddw                m9, m0
    paddw               m10, m3
.w32_loop:
    vpbroadcastb         m3, [tlq+hq]
    punpcklbw            m3, m4
    vpbroadcastw         m0, [r3+hq*2]
    vpbroadcastw         m1, [v_weightsq]
    add          v_weightsq, 2
    pmaddubsw            m2, m3, m5
    pmaddubsw            m3, m6
    paddw                m2, m0
    paddw                m3, m0
    SMOOTH_2D_END         1, 1, 7, 8, 9, 10
    mova             [dstq], m0
    add                dstq, strideq
    dec                  hd
    jg .w32_loop
    RET
ALIGN function_align
.w64:
    SETUP_STACK_FRAME 32*8, 7, 16
    movu                m13, [tlq+1]
    movu                m15, [tlq+33]
    add                  r6, smooth_weights+16*15-ipred_smooth_avx2_table
    lea                  r3, [rsp+64*2-2]
    punpcklbw           m12, m13, m0
    punpckhbw           m13, m0
    punpcklbw           m14, m15, m0
    punpckhbw           m15, m0
    call .prep_v
    dec                 tlq
    pmaddubsw            m0, m12, m5
    pmaddubsw            m1, m13, m5
    pmaddubsw            m2, m14, m5
    pmaddubsw            m5, m15, m5
    mova                xm8, [r6-16*7]
    vinserti128          m8, [r6-16*5], 1
    mova                xm9, [r6-16*6]
    vinserti128          m9, [r6-16*4], 1
    mova               xm10, [r6-16*3]
    vinserti128         m10, [r6-16*1], 1
    mova               xm11, [r6-16*2]