dav1d
Commit 59c3370e authored Oct 11, 2018 by Henrik Gramner
x86: Add smooth intra prediction AVX2 asm
parent 613ef787

Showing 2 changed files with 704 additions and 11 deletions
    src/x86/ipred.asm      +698  -11
    src/x86/ipred_init.c     +6   -0
src/x86/ipred.asm
@@ -30,11 +30,44 @@
SECTION_RODATA 32

paeth_shuf:   db  7,  7,  7,  7,  3,  3,  3,  3,  6,  6,  6,  6,  2,  2,  2,  2
              db  5,  5,  5,  5,  1,  1,  1,  1,  4,  4,  4,  4,  0,  0,  0,  0

%macro SMOOTH_WEIGHT_TABLE 1-*
    %rep %0
        db %1-128, 127-%1
        %rotate 1
    %endrep
%endmacro

; sm_weights[], but modified to precalculate x and 256-x with offsets to
; enable efficient use of pmaddubsw (which requires signed values)
smooth_weights: SMOOTH_WEIGHT_TABLE         \
      0,   0, 255, 128, 255, 149,  85,  64, \
    255, 197, 146, 105,  73,  50,  37,  32, \
    255, 225, 196, 170, 145, 123, 102,  84, \
     68,  54,  43,  33,  26,  20,  17,  16, \
    255, 240, 225, 210, 196, 182, 169, 157, \
    145, 133, 122, 111, 101,  92,  83,  74, \
     66,  59,  52,  45,  39,  34,  29,  25, \
     21,  17,  14,  12,  10,   9,   8,   8, \
    255, 248, 240, 233, 225, 218, 210, 203, \
    196, 189, 182, 176, 169, 163, 156, 150, \
    144, 138, 133, 127, 121, 116, 111, 106, \
    101,  96,  91,  86,  82,  77,  73,  69, \
     65,  61,  57,  54,  50,  47,  44,  41, \
     38,  35,  32,  29,  27,  25,  22,  20, \
     18,  16,  15,  13,  12,  10,   9,   8, \
      7,   6,   6,   5,   5,   4,   4,   4
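
Editor's sketch (not part of the commit): a minimal C model of what SMOOTH_WEIGHT_TABLE emits. Each raw AV1 smooth weight w is stored as the signed byte pair (w-128, 127-w), so pmaddubsw can multiply an unsigned pixel pair by a weight pair in one instruction; the missing 128*a + 129*b part is added back separately by the SMOOTH macro further down. Judging from the addressing later in the file (smooth_weights+hq*4 before hq is negated), the leading 0, 0 entries appear to pad the table so the pairs for size n start at byte offset 2*n. Function and variable names here are illustrative only.

#include <stdint.h>
#include <stdio.h>

/* Mirror of SMOOTH_WEIGHT_TABLE: raw weight w -> signed pair (w-128, 127-w). */
static void pack_smooth_weights(const uint8_t *w, int n, int8_t *out)
{
    for (int i = 0; i < n; i++) {
        out[2 * i + 0] = (int8_t)(w[i] - 128);
        out[2 * i + 1] = (int8_t)(127 - w[i]);
    }
}

int main(void)
{
    static const uint8_t w4[4] = { 255, 149, 85, 64 }; /* size-4 row above */
    int8_t packed[8];
    pack_smooth_weights(w4, 4, packed);
    for (int i = 0; i < 4; i++)
        printf("db %4d, %4d\n", packed[2 * i], packed[2 * i + 1]);
    return 0;
}
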
ipred_v_shuf: db  0,  1,  0,  1,  4,  5,  4,  5,  8,  9,  8,  9, 12, 13, 12, 13
              db  2,  3,  2,  3,  6,  7,  6,  7, 10, 11, 10, 11, 14, 15, 14, 15
ipred_h_shuf: db  7,  7,  7,  7,  3,  3,  3,  3,  5,  5,  5,  5,  1,  1,  1,  1
              db  6,  6,  6,  6,  2,  2,  2,  2,  4,  4,  4,  4,  0,  0,  0,  0

pb_1:        times 4 db 1
pb_128:      times 4 db 128
pw_128:      times 2 dw 128
pw_255:      times 2 dw 255
pb_127_m127: times 2 db 127, -127

%macro JMP_TABLE 3-*
    %xdefine %1_%2_table (%%table - 2*4)
@@ -48,6 +81,9 @@ pb_128: times 4 db 128
%define ipred_dc_splat_avx2_table (ipred_dc_avx2_table + 10*4)

JMP_TABLE ipred_smooth,   avx2, w4, w8, w16, w32, w64
JMP_TABLE ipred_smooth_v, avx2, w4, w8, w16, w32, w64
JMP_TABLE ipred_smooth_h, avx2, w4, w8, w16, w32, w64
JMP_TABLE ipred_paeth,    avx2, w4, w8, w16, w32, w64
JMP_TABLE ipred_dc,       avx2, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \
                          s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4
@@ -423,17 +459,18 @@ INIT_YMM avx2
%endmacro

cglobal ipred_paeth, 3, 6, 9, dst, stride, tl, w, h
%define base r5-ipred_paeth_avx2_table
    lea                  r5, [ipred_paeth_avx2_table]
    tzcnt                wd, wm
    vpbroadcastb         m5, [tlq] ; topleft
    movifnidn            hd, hm
    movsxd               wq, [r5+wq*4]
    vpbroadcastd         m4, [r5-ipred_paeth_avx2_table+pb_1]
    vpbroadcastd         m4, [base+pb_1]
    add                  wq, r5
    jmp                  wq
.w4:
    vpbroadcastd         m6, [tlq+1] ; top
    mova                 m8, [r5-ipred_paeth_avx2_table+paeth_shuf]
    mova                 m8, [base+ipred_h_shuf]
    lea                  r3, [strideq*3]
    psubusb              m7, m5, m6
    psubusb              m0, m6, m5
@@ -445,15 +482,15 @@ cglobal ipred_paeth, 3, 6, 9, dst, stride, tl, w, h
    PAETH                 6, 7
    vextracti128        xm1, m0, 1
    movd   [dstq+strideq*0], xm0
    pextrd [dstq+strideq*1], xm0, 2
    movd   [dstq+strideq*2], xm1
    movd   [dstq+strideq*1], xm1
    pextrd [dstq+strideq*2], xm0, 2
    pextrd [dstq+r3       ], xm1, 2
    cmp                  hd, 4
    je .ret
    lea                dstq, [dstq+strideq*4]
    pextrd [dstq+strideq*0], xm0, 1
    pextrd [dstq+strideq*1], xm0, 3
    pextrd [dstq+strideq*2], xm1, 1
    pextrd [dstq+strideq*1], xm1, 1
    pextrd [dstq+strideq*2], xm0, 3
    pextrd [dstq+r3       ], xm1, 3
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 8
@@ -463,7 +500,7 @@ cglobal ipred_paeth, 3, 6, 9, dst, stride, tl, w, h
ALIGN function_align
.w8:
    vpbroadcastq         m6, [tlq+1]
    mova                 m8, [r5-ipred_paeth_avx2_table+paeth_shuf]
    mova                 m8, [base+ipred_h_shuf]
    lea                  r3, [strideq*3]
    psubusb              m7, m5, m6
    psubusb              m0, m6, m5
@@ -475,8 +512,8 @@ ALIGN function_align
    PAETH                 6, 7
    vextracti128        xm1, m0, 1
    movq   [dstq+strideq*0], xm0
    movhps [dstq+strideq*1], xm0
    movq   [dstq+strideq*2], xm1
    movq   [dstq+strideq*1], xm1
    movhps [dstq+strideq*2], xm0
    movhps [dstq+r3       ], xm1
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4
@@ -543,4 +580,654 @@ ALIGN function_align
%endif
    RET

%macro SMOOTH 6 ; src[1-2], mul[1-2], add[1-2]
    ; w * a         = (w - 128) * a + 128 * a
    ; (256 - w) * b = (127 - w) * b + 129 * b
    pmaddubsw            m0, m%3, m%1
    pmaddubsw            m1, m%4, m%2
    paddw                m0, m%5
    paddw                m1, m%6
    psrlw                m0, 8
    psrlw                m1, 8
    packuswb             m0, m1
%endmacro
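
Editor's sketch: the scalar arithmetic behind the SMOOTH macro, following the identity in the comments above. pmaddubsw of the unsigned pixel pair (a, b) against the signed weight pair (w-128, 127-w), plus a precomputed 128*a + 129*b + 128 term (the m%5/m%6 operands in ipred_smooth_v and ipred_smooth_h), equals the plain rounded blend (w*a + (256-w)*b + 128) >> 8; the 2-D ipred_smooth variant rounds differently via SMOOTH_2D_END below. Names are illustrative and the exhaustive loop is only a self-test.

#include <assert.h>
#include <stdint.h>

/* Scalar model of one SMOOTH lane: madd result + precomputed add, then >> 8. */
static uint8_t smooth_blend(uint8_t a, uint8_t b, uint8_t w)
{
    int madd = (int8_t)(w - 128) * a + (int8_t)(127 - w) * b; /* pmaddubsw  */
    int add  = 128 * a + 129 * b + 128;                       /* m%5 / m%6  */
    int ref  = (w * a + (256 - w) * b + 128) >> 8;            /* plain form */
    assert(((madd + add) >> 8) == ref);
    return (uint8_t)ref;
}

int main(void)
{
    for (int w = 0; w < 256; w++)
        for (int a = 0; a < 256; a++)
            for (int b = 0; b < 256; b++)
                (void)smooth_blend((uint8_t)a, (uint8_t)b, (uint8_t)w);
    return 0;
}
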
cglobal ipred_smooth_v, 3, 7, 0, dst, stride, tl, w, h, weights
%define base r6-ipred_smooth_v_avx2_table
    lea                  r6, [ipred_smooth_v_avx2_table]
    tzcnt                wd, wm
    mov                  hd, hm
    movsxd               wq, [r6+wq*4]
    vpbroadcastd         m0, [base+pb_127_m127]
    vpbroadcastd         m1, [base+pw_128]
    lea            weightsq, [base+smooth_weights+hq*4]
    neg                  hq
    vpbroadcastb         m5, [tlq+hq] ; bottom
    add                  wq, r6
    jmp                  wq
.w4:
    vpbroadcastd         m2, [tlq+1]
    punpcklbw            m2, m5 ; top, bottom
    mova                 m5, [base+ipred_v_shuf]
    lea                  r3, [strideq*3]
    punpckldq            m4, m5, m5
    punpckhdq            m5, m5
    pmaddubsw            m3, m2, m0
    paddw                m1, m2 ; 1 * top + 256 * bottom + 128, overflow is ok
    paddw                m3, m1 ; 128 * top + 129 * bottom + 128
.w4_loop:
    vbroadcasti128       m1, [weightsq+hq*2]
    pshufb               m0, m1, m4
    pshufb               m1, m5
    SMOOTH                0, 1, 2, 2, 3, 3
    vextracti128        xm1, m0, 1
    movd   [dstq+strideq*0], xm0
    movd   [dstq+strideq*1], xm1
    pextrd [dstq+strideq*2], xm0, 1
    pextrd [dstq+r3       ], xm1, 1
    cmp                  hd, -4
    je .ret
    lea                dstq, [dstq+strideq*4]
    pextrd [dstq+strideq*0], xm0, 2
    pextrd [dstq+strideq*1], xm1, 2
    pextrd [dstq+strideq*2], xm0, 3
    pextrd [dstq+r3       ], xm1, 3
    lea                dstq, [dstq+strideq*4]
    add                  hq, 8
    jl .w4_loop
.ret:
    RET
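
Editor's sketch: a scalar model of what the SMOOTH_V path above computes per pixel. Each row blends the top edge against the single 'bottom' byte loaded from [tlq+hq] in the prologue, using one raw weight per row (the asm reads the packed (w-128, 127-w) pairs instead); ipred_smooth_h further down is the transposed case, blending the left column against the 'right' byte with one weight per column. Names and the exact buffer layout are illustrative.

#include <stddef.h>
#include <stdint.h>

/* Scalar model of SMOOTH_V for a w x h block: top is the row above the
 * block (tl + 1), bottom is the byte at tl[-h], and weights holds the h
 * raw smooth weights for this block height. */
static void smooth_v_scalar(uint8_t *dst, ptrdiff_t stride,
                            const uint8_t *top, uint8_t bottom,
                            const uint8_t *weights, int w, int h)
{
    for (int y = 0; y < h; y++) {
        const int wy = weights[y];
        for (int x = 0; x < w; x++)
            dst[x] = (uint8_t)((wy * top[x] + (256 - wy) * bottom + 128) >> 8);
        dst += stride;
    }
}
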
ALIGN function_align
.w8:
    vpbroadcastq         m2, [tlq+1]
    punpcklbw            m2, m5
    mova                 m5, [base+ipred_v_shuf]
    lea                  r3, [strideq*3]
    pshufd               m4, m5, q0000
    pshufd               m5, m5, q1111
    pmaddubsw            m3, m2, m0
    paddw                m1, m2
    paddw                m3, m1
.w8_loop:
    vpbroadcastq         m1, [weightsq+hq*2]
    pshufb               m0, m1, m4
    pshufb               m1, m5
    SMOOTH                0, 1, 2, 2, 3, 3
    vextracti128        xm1, m0, 1
    movq   [dstq+strideq*0], xm0
    movq   [dstq+strideq*1], xm1
    movhps [dstq+strideq*2], xm0
    movhps [dstq+r3       ], xm1
    lea                dstq, [dstq+strideq*4]
    add                  hq, 4
    jl .w8_loop
    RET
ALIGN function_align
.w16:
    WIN64_SPILL_XMM       7
    vbroadcasti128       m3, [tlq+1]
    mova                 m6, [base+ipred_v_shuf]
    punpcklbw            m2, m3, m5
    punpckhbw            m3, m5
    pmaddubsw            m4, m2, m0
    pmaddubsw            m5, m3, m0
    paddw                m0, m1, m2
    paddw                m1, m3
    paddw                m4, m0
    paddw                m5, m1
.w16_loop:
    vpbroadcastd         m1, [weightsq+hq*2]
    pshufb               m1, m6
    SMOOTH                1, 1, 2, 3, 4, 5
    mova         [dstq+strideq*0], xm0
    vextracti128 [dstq+strideq*1], m0, 1
    lea                dstq, [dstq+strideq*2]
    add                  hq, 2
    jl .w16_loop
    RET
ALIGN function_align
.w32:
%assign stack_offset stack_offset - stack_size_padded
    WIN64_SPILL_XMM       6
    movu                 m3, [tlq+1]
    punpcklbw            m2, m3, m5
    punpckhbw            m3, m5
    pmaddubsw            m4, m2, m0
    pmaddubsw            m5, m3, m0
    paddw                m0, m1, m2
    paddw                m1, m3
    paddw                m4, m0
    paddw                m5, m1
.w32_loop:
    vpbroadcastw         m1, [weightsq+hq*2]
    SMOOTH                1, 1, 2, 3, 4, 5
    mova             [dstq], m0
    add                dstq, strideq
    inc                  hq
    jl .w32_loop
    RET
ALIGN function_align
.w64:
    WIN64_SPILL_XMM      11
    movu                 m4, [tlq+1]
    movu                 m8, [tlq+33]
    punpcklbw            m3, m4, m5
    punpckhbw            m4, m5
    punpcklbw            m7, m8, m5
    punpckhbw            m8, m5
    pmaddubsw            m5, m3, m0
    pmaddubsw            m6, m4, m0
    pmaddubsw            m9, m7, m0
    pmaddubsw           m10, m8, m0
    paddw                m2, m1, m3
    paddw                m5, m2
    paddw                m2, m1, m4
    paddw                m6, m2
    paddw                m0, m1, m7
    paddw                m9, m0
    paddw                m1, m8
    paddw               m10, m1
.w64_loop:
    vpbroadcastw         m2, [weightsq+hq*2]
    SMOOTH                2, 2, 3, 4, 5, 6
    mova        [dstq+32*0], m0
    SMOOTH                2, 2, 7, 8, 9, 10
    mova        [dstq+32*1], m0
    add                dstq, strideq
    inc                  hq
    jl .w64_loop
    RET
%macro SETUP_STACK_FRAME 3 ; stack_size, regs_used, xmm_regs_used
    %assign stack_offset 0
    %assign stack_size_padded 0
    %assign regs_used %2
    %xdefine rstk rsp
    SETUP_STACK_POINTER %1
    %if regs_used != %2 && WIN64
        PUSH r%2
    %endif
    ALLOC_STACK %1, %3
%endmacro
cglobal ipred_smooth_h, 3, 7, 0, dst, stride, tl, w, h
%define base r6-ipred_smooth_h_avx2_table
    lea                  r6, [ipred_smooth_h_avx2_table]
    mov                  wd, wm
    vpbroadcastb         m3, [tlq+wq] ; right
    tzcnt                wd, wd
    mov                  hd, hm
    movsxd               wq, [r6+wq*4]
    vpbroadcastd         m4, [base+pb_127_m127]
    vpbroadcastd         m5, [base+pw_128]
    add                  wq, r6
    jmp                  wq
.w4:
    WIN64_SPILL_XMM       8
    vpbroadcastq         m6, [base+smooth_weights+4*2]
    mova                 m7, [base+ipred_h_shuf]
    sub                 tlq, 8
    sub                 tlq, hq
    lea                  r3, [strideq*3]
.w4_loop:
    vpbroadcastq         m2, [tlq+hq]
    pshufb               m2, m7
    punpcklbw            m1, m2, m3 ; left, right
    punpckhbw            m2, m3
    pmaddubsw            m0, m1, m4 ; 127 * left - 127 * right
    paddw                m0, m1     ; 128 * left + 129 * right
    pmaddubsw            m1, m6
    paddw                m1, m5
    paddw                m0, m1
    pmaddubsw            m1, m2, m4
    paddw                m1, m2
    pmaddubsw            m2, m6
    paddw                m2, m5
    paddw                m1, m2
    psrlw                m0, 8
    psrlw                m1, 8
    packuswb             m0, m1
    vextracti128        xm1, m0, 1
    movd   [dstq+strideq*0], xm0
    movd   [dstq+strideq*1], xm1
    pextrd [dstq+strideq*2], xm0, 2
    pextrd [dstq+r3       ], xm1, 2
    cmp                  hd, 4
    je .ret
    lea                dstq, [dstq+strideq*4]
    pextrd [dstq+strideq*0], xm0, 1
    pextrd [dstq+strideq*1], xm1, 1
    pextrd [dstq+strideq*2], xm0, 3
    pextrd [dstq+r3       ], xm1, 3
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 8
    jg .w4_loop
.ret:
    RET
ALIGN function_align
.w8:
%assign stack_offset stack_offset - stack_size_padded
    WIN64_SPILL_XMM       8
    vbroadcasti128       m6, [base+smooth_weights+8*2]
    mova                 m7, [base+ipred_h_shuf]
    sub                 tlq, 4
    lea                  r3, [strideq*3]
    sub                 tlq, hq
.w8_loop:
    vpbroadcastd         m2, [tlq+hq]
    pshufb               m2, m7
    punpcklbw            m1, m2, m3
    punpckhbw            m2, m3
    pmaddubsw            m0, m1, m4
    paddw                m0, m1
    pmaddubsw            m1, m6
    paddw                m1, m5
    paddw                m0, m1
    pmaddubsw            m1, m2, m4
    paddw                m1, m2
    pmaddubsw            m2, m6
    paddw                m2, m5
    paddw                m1, m2
    psrlw                m0, 8
    psrlw                m1, 8
    packuswb             m0, m1
    vextracti128        xm1, m0, 1
    movq   [dstq+strideq*0], xm0
    movq   [dstq+strideq*1], xm1
    movhps [dstq+strideq*2], xm0
    movhps [dstq+r3       ], xm1
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4
    jg .w8_loop
    RET
ALIGN function_align
.w16:
    SETUP_STACK_FRAME 32*4, 7, 8
    lea                  r3, [rsp+64*2-4]
    call .prep ; only worthwhile for w16 and above
    sub                 tlq, 2
    vpbroadcastd        xm6, [base+pb_1]
    mova                xm7, [base+ipred_v_shuf+16]
    vinserti128          m7, [base+ipred_v_shuf+0], 1
    vbroadcasti128       m4, [base+smooth_weights+16*2]
    vbroadcasti128       m5, [base+smooth_weights+16*3]
.w16_loop:
    vpbroadcastd         m1, [tlq+hq]
    vpbroadcastd         m2, [r3+hq*2]
    pshufb               m1, m6
    punpcklbw            m1, m3
    pshufb               m2, m7
    SMOOTH                4, 5, 1, 1, 2, 2
    mova         [dstq+strideq*0], xm0
    vextracti128 [dstq+strideq*1], m0, 1
    lea                dstq, [dstq+strideq*2]
    sub                  hd, 2
    jg .w16_loop
    RET
ALIGN function_align
.w32:
    SETUP_STACK_FRAME 32*4, 7, 6
    lea                  r3, [rsp+64*2-2]
    call .prep
    dec                 tlq
    mova                xm4, [base+smooth_weights+16*4]
    vinserti128          m4, [base+smooth_weights+16*6], 1
    mova                xm5, [base+smooth_weights+16*5]
    vinserti128          m5, [base+smooth_weights+16*7], 1
.w32_loop:
    vpbroadcastb         m1, [tlq+hq]
    punpcklbw            m1, m3
    vpbroadcastw         m2, [r3+hq*2]
    SMOOTH                4, 5, 1, 1, 2, 2
    mova             [dstq], m0
    add                dstq, strideq
    dec                  hd
    jg .w32_loop
    RET
ALIGN function_align
.w64:
    SETUP_STACK_FRAME 32*4, 7, 9
    lea                  r3, [rsp+64*2-2]
    call .prep
    add                  r6, smooth_weights+16*15-ipred_smooth_h_avx2_table
    dec                 tlq
    mova                xm5, [r6-16*7]
    vinserti128          m5, [r6-16*5], 1
    mova                xm6, [r6-16*6]
    vinserti128          m6, [r6-16*4], 1
    mova                xm7, [r6-16*3]
    vinserti128          m7, [r6-16*1], 1
    mova                xm8, [r6-16*2]
    vinserti128          m8, [r6-16*0], 1
.w64_loop:
    vpbroadcastb         m2, [tlq+hq]
    punpcklbw            m2, m3
    vpbroadcastw         m4, [r3+hq*2]
    SMOOTH                5, 6, 2, 2, 4, 4
    mova        [dstq+32*0], m0
    SMOOTH                7, 8, 2, 2, 4, 4
    mova        [dstq+32*1], m0
    add                dstq, strideq
    dec                  hd
    jg .w64_loop
    RET
ALIGN function_align
.prep:
    vpermq               m2, [tlq-32*1], q3120
    punpckhbw            m1, m2, m3
    punpcklbw            m2, m3
    pmaddubsw            m0, m1, m4 ; 127 * left - 127 * right
    paddw                m1, m5     ; 1 * left + 256 * right + 128
    paddw                m0, m1     ; 128 * left + 129 * right + 128
    pmaddubsw            m1, m2, m4
    paddw                m2, m5
    paddw                m1, m2
    vpermq               m2, [tlq-32*2], q3120
    mova [rsp+gprsize+32*3], m0
    mova [rsp+gprsize+32*2], m1
    punpckhbw            m1, m2, m3
    punpcklbw            m2, m3
    pmaddubsw            m0, m1, m4
    paddw                m1, m5
    paddw                m0, m1
    pmaddubsw            m1, m2, m4
    paddw                m2, m5
    paddw                m1, m2
    mova [rsp+gprsize+32*1], m0
    mova [rsp+gprsize+32*0], m1
    sub                  r3, hq
    sub                 tlq, hq
    sub                  r3, hq
    ret
%macro SMOOTH_2D_END 6 ; src[1-2], mul[1-2], add[1-2]
    pmaddubsw            m0, m%3, m%1
    pmaddubsw            m1, m%4, m%2
%ifnum %5
    paddw                m0, m%5
%else
    paddw                m0, %5
%endif
%ifnum %6
    paddw                m1, m%6
%else
    paddw                m1, %6
%endif
    pavgw                m0, m2
    pavgw                m1, m3
    psrlw                m0, 8
    psrlw                m1, 8
    packuswb             m0, m1
%endmacro
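
Editor's sketch: SMOOTH_2D_END averages the 16-bit vertical and horizontal blend sums with pavgw before the final shift, i.e. it produces (v + h + 1) >> 9 of the two raw sums. Combined with the pw_255 bias that ipred_smooth folds into its vertical add term (see the "1 * top + 255 * bottom + 255" comment below), this appears to match a +256 rounding followed by >> 9, as in the scalar model here. Names are illustrative.

#include <stddef.h>
#include <stdint.h>

/* Scalar model of the 2-D SMOOTH predictor: per-row weights wh blend
 * top against bottom, per-column weights ww blend left against right,
 * and the two 8.8 fixed-point sums are combined with +256 rounding. */
static void smooth_scalar(uint8_t *dst, ptrdiff_t stride,
                          const uint8_t *top, const uint8_t *left,
                          uint8_t right, uint8_t bottom,
                          const uint8_t *wh, const uint8_t *ww,
                          int w, int h)
{
    for (int y = 0; y < h; y++) {
        for (int x = 0; x < w; x++) {
            const int v  = wh[y] * top[x]  + (256 - wh[y]) * bottom;
            const int hz = ww[x] * left[y] + (256 - ww[x]) * right;
            dst[x] = (uint8_t)((v + hz + 256) >> 9);
        }
        dst += stride;
    }
}
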
cglobal ipred_smooth, 3, 7, 0, dst, stride, tl, w, h, v_weights
%define base r6-ipred_smooth_avx2_table
    lea                  r6, [ipred_smooth_avx2_table]
    mov                  wd, wm
    vpbroadcastb         m4, [tlq+wq] ; right
    tzcnt                wd, wd
    mov                  hd, hm
    mov                  r5, tlq
    sub                  r5, hq
    movsxd               wq, [r6+wq*4]
    vpbroadcastd         m5, [base+pb_127_m127]
    vpbroadcastb         m0, [r5] ; bottom
    vpbroadcastd         m3, [base+pw_255]
    add                  wq, r6
    lea          v_weightsq, [base+smooth_weights+hq*2]
    jmp                  wq
.w4:
    WIN64_SPILL_XMM      12
    mova                m10, [base+ipred_h_shuf]
    vpbroadcastq        m11, [base+smooth_weights+4*2]
    mova                 m7, [base+ipred_v_shuf]
    vpbroadcastd         m8, [tlq+1]
    sub                 tlq, 8
    lea                  r3, [strideq*3]
    sub                 tlq, hq
    punpcklbw            m8, m0 ; top, bottom
    pshufd               m6, m7, q2200
    pshufd               m7, m7, q3311
    pmaddubsw            m9, m8, m5
    paddw                m3, m8 ; 1 * top + 255 * bottom + 255
    paddw                m9, m3 ; 128 * top + 129 * bottom + 255
.w4_loop:
    vpbroadcastq         m1, [tlq+hq]
    pshufb               m1, m10
    punpcklbw            m0, m1, m4 ; left, right
    punpckhbw            m1, m4
    pmaddubsw            m2, m0, m5 ; 127 * left - 127 * right
    pmaddubsw            m3, m1, m5
    paddw                m2, m0     ; 128 * left + 129 * right
    paddw                m3, m1
    pmaddubsw            m0, m11
    pmaddubsw            m1, m11
    paddw                m2, m0
    paddw                m3, m1
    vbroadcasti128       m1, [v_weightsq]
    add          v_weightsq, 16
    pshufb               m0, m1, m6
    pshufb               m1, m7
    SMOOTH_2D_END         0, 1, 8, 8, 9, 9
    vextracti128        xm1, m0, 1
    movd   [dstq+strideq*0], xm0
    movd   [dstq+strideq*1], xm1
    pextrd [dstq+strideq*2], xm0, 2
    pextrd [dstq+r3       ], xm1, 2
    cmp                  hd, 4
    je .ret
    lea                dstq, [dstq+strideq*4]
    pextrd [dstq+strideq*0], xm0, 1
    pextrd [dstq+strideq*1], xm1, 1
    pextrd [dstq+strideq*2], xm0, 3
    pextrd [dstq+r3       ], xm1, 3
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 8
    jg .w4_loop
.ret:
    RET
ALIGN function_align
.w8:
%assign stack_offset stack_offset - stack_size_padded
    WIN64_SPILL_XMM      12
    mova                m10, [base+ipred_h_shuf]
    vbroadcasti128      m11, [base+smooth_weights+8*2]
    mova                 m7, [base+ipred_v_shuf]
    vpbroadcastq         m8, [tlq+1]
    sub                 tlq, 4
    lea                  r3, [strideq*3]
    sub                 tlq, hq
    punpcklbw            m8, m0
    pshufd               m6, m7, q0000
    pshufd               m7, m7, q1111
    pmaddubsw            m9, m8, m5
    paddw                m3, m8
    paddw                m9, m3
.w8_loop:
    vpbroadcastd         m1, [tlq+hq]
    pshufb               m1, m10
    punpcklbw            m0, m1, m4
    punpckhbw            m1, m4
    pmaddubsw            m2, m0, m5
    pmaddubsw            m3, m1, m5
    paddw                m2, m0
    paddw                m3, m1
    pmaddubsw            m0, m11
    pmaddubsw            m1, m11
    paddw                m2, m0
    paddw                m3, m1
    vpbroadcastq         m1, [v_weightsq]
    add          v_weightsq, 8
    pshufb               m0, m1, m6
    pshufb               m1, m7
    SMOOTH_2D_END         0, 1, 8, 8, 9, 9
    vextracti128        xm1, m0, 1
    movq   [dstq+strideq*0], xm0
    movq   [dstq+strideq*1], xm1
    movhps [dstq+strideq*2], xm0
    movhps [dstq+r3       ], xm1