VideoLAN / dav1d, commit 205b723e

Add SGR optimizations

Authored Jan 18, 2019 by Henrik Gramner; committed by Henrik Gramner, Jan 19, 2019.
Parent: 33ce3829
Pipeline #4030 passed with stages in 5 minutes and 24 seconds.
Changes: 4 files
src/looprestoration_tmpl.c
@@ -446,11 +446,11 @@ static void selfguided_filter(coef *dst, const pixel *src,
             const unsigned p = imax(a * n - b * b, 0);
             const unsigned z = (p * s + (1 << 19)) >> 20;
 
-            const int x = dav1d_sgr_x_by_xplus1[imin(z, 255)];
+            const unsigned x = dav1d_sgr_x_by_x[imin(z, 255)];
 
             // This is where we invert A and B, so that B is of size coef.
-            AA[i] = (((1U << 8) - x) * BB[i] * sgr_one_by_x + (1 << 11)) >> 12;
-            BB[i] = x;
+            AA[i] = (x * BB[i] * sgr_one_by_x + (1 << 11)) >> 12;
+            BB[i] = 256 - x;
         }
         AA += step * REST_UNIT_STRIDE;
         BB += step * REST_UNIT_STRIDE;
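The C-level change is a pure refactor around the renamed table: since every entry satisfies dav1d_sgr_x_by_x[z] == 256 - dav1d_sgr_x_by_xplus1[z] (compare the two tables in src/tables.c below), substituting the complemented value leaves both outputs bit-identical. A minimal standalone check of that equivalence, offered as a sketch rather than code from the commit (455 is used here as sgr_one_by_x for the 3x3 box; it matches the 0x1c7 packed into pd_0xf00801c7 further down):

#include <assert.h>
#include <stdint.h>

/* Old formulation: x comes from dav1d_sgr_x_by_xplus1 (range 1..256). */
static void calc_old(unsigned x, unsigned bb, unsigned one_by_x,
                     unsigned *aa_out, unsigned *bb_out)
{
    *aa_out = (((1U << 8) - x) * bb * one_by_x + (1 << 11)) >> 12;
    *bb_out = x;
}

/* New formulation: x comes from dav1d_sgr_x_by_x (range 255..0). */
static void calc_new(unsigned x, unsigned bb, unsigned one_by_x,
                     unsigned *aa_out, unsigned *bb_out)
{
    *aa_out = (x * bb * one_by_x + (1 << 11)) >> 12;
    *bb_out = 256 - x;
}

int main(void) {
    for (unsigned x_old = 1; x_old <= 256; x_old++)     /* old table range */
        for (unsigned bb = 0; bb < 9 * 256; bb += 7) {  /* sample 3x3 box sums */
            unsigned a0, b0, a1, b1;
            calc_old(x_old, bb, 455, &a0, &b0);
            calc_new(256 - x_old, bb, 455, &a1, &b1);   /* x_new = 256 - x_old */
            assert(a0 == a1 && b0 == b1);
        }
    return 0;
}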
src/tables.c
@@ -502,25 +502,25 @@ const int16_t dav1d_sgr_params[16][4] = { // r0, r1, e0, e1
   { 2, 0, 22, -1 },
 };
 
-const int dav1d_sgr_x_by_xplus1[256] = {
-    1, 128, 171, 192, 205, 213, 219, 224, 228, 230, 233, 235, 236, 238, 239,
-  240, 241, 242, 243, 243, 244, 244, 245, 245, 246, 246, 247, 247, 247, 247,
-  248, 248, 248, 248, 249, 249, 249, 249, 249, 250, 250, 250, 250, 250, 250,
-  250, 251, 251, 251, 251, 251, 251, 251, 251, 251, 251, 252, 252, 252, 252,
-  252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 253, 253,
-  253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253,
-  253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 254, 254, 254, 254,
-  254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
-  254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
-  254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
-  254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
-  254, 254, 254, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
-  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
-  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
-  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
-  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
-  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
-  256,
-};
+const uint8_t dav1d_sgr_x_by_x[256] = {
+  255, 128,  85,  64,  51,  43,  37,  32,  28,  26,  23,  21,  20,  18,  17,
+   16,  15,  14,  13,  13,  12,  12,  11,  11,  10,  10,   9,   9,   9,   9,
+    8,   8,   8,   8,   7,   7,   7,   7,   7,   6,   6,   6,   6,   6,   6,
+    6,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   4,   4,   4,   4,
+    4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   3,   3,
+    3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,
+    3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   2,   2,   2,   2,
+    2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,
+    2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,
+    2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,
+    2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,
+    2,   2,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
+    1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
+    1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
+    1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
+    1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
+    1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   0
+};
 
 const int8_t ALIGN(dav1d_mc_subpel_filters[5][15][8], 8) = {
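Beyond the narrower uint8_t storage, the new table is the element-wise complement of the removed one, dav1d_sgr_x_by_x[z] == 256 - dav1d_sgr_x_by_xplus1[z], which is what allows the filter above to drop its explicit (1 << 8) - x step. A hedged sketch of how the listed values can be regenerated (the rule is inferred from the values themselves and is not stated in the commit): interior entries follow round(256 * z / (z + 1)), complemented, with the two endpoints pinned to 255 and 0.

#include <stdint.h>
#include <stdio.h>

/* Regenerates the 256-entry dav1d_sgr_x_by_x table listed above (sketch).
 * x_by_xplus1(z) ~= round(256 * z / (z + 1)); the new table stores 256 minus
 * that, with z == 0 and z == 255 pinned to match the old 1 and 256 entries. */
int main(void) {
    uint8_t tbl[256];
    for (int z = 0; z < 256; z++) {
        const int x_by_xplus1 = (z * 256 + (z + 1) / 2) / (z + 1); /* rounded */
        tbl[z] = (uint8_t)(256 - x_by_xplus1);
    }
    tbl[0]   = 255; /* old table pins entry 0 to 1, so 256 - 1     */
    tbl[255] = 0;   /* old table pins entry 255 to 256, so 256 - 256 */

    for (int z = 0; z < 256; z++)
        printf("%3d,%c", tbl[z], (z % 15) == 14 ? '\n' : ' ');
    putchar('\n');
    return 0;
}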
src/tables.h
@@ -107,7 +107,7 @@ static const unsigned interintra_allowed_mask =
 extern const Dav1dWarpedMotionParams dav1d_default_wm_params;
 
 extern const int16_t dav1d_sgr_params[16][4];
-extern const int dav1d_sgr_x_by_xplus1[256];
+extern const uint8_t dav1d_sgr_x_by_x[256];
 
 extern const int8_t dav1d_mc_subpel_filters[5][15][8];
 extern const int8_t dav1d_mc_warp_filter[193][8];
src/x86/looprestoration.asm
@@ -42,14 +42,12 @@ pw_2048: times 2 dw 2048
 pw_16380:      times 2 dw 16380
-pw_0_128:      dw 0, 128
+pw_5_6:        dw 5, 6
-pw_82:         times 2 dw 82
-pw_91_5:       dw 91, 5
 pd_6:          dd 6
-pd_255:        dd 255
 pd_1024:       dd 1024
-pd_0x80000:    dd 0x80000
+pd_0xf0080029: dd 0xf0080029
+pd_0xf00801c7: dd 0xf00801c7
 
-cextern sgr_x_by_xplus1
+cextern sgr_x_by_x
 
 SECTION .text
@@ -477,76 +475,65 @@ cglobal sgr_box3_v, 5, 10, 9, sumsq, sum, w, h, edge, x, y, sumsq_ptr, sum_ptr,
     RET
 
 INIT_YMM avx2
-cglobal sgr_calc_ab1, 4, 6, 14, a, b, w, h, s
+cglobal sgr_calc_ab1, 4, 6, 11, a, b, w, h, s
     sub             aq, (384+16-1)*4
     sub             bq, (384+16-1)*2
     add             hd, 2
-    lea             r5, [sgr_x_by_xplus1]
-    pxor            m6, m6
-    vpbroadcastd    m7, [pw_91_5]
+    lea             r5, [sgr_x_by_x-0xf03]
 %ifidn sd, sm
-    movd           xm8, sd
-    vpbroadcastd    m8, xm8
+    movd           xm6, sd
+    vpbroadcastd    m6, xm6
 %else
-    vpbroadcastd    m8, sm
+    vpbroadcastd    m6, sm
 %endif
-    vpbroadcastd    m9, [pd_0x80000]
-    vpbroadcastd   m10, [pd_255]
-    psrad          m12, m9, 8       ; pd_2048
-    psrad          m11, m9, 11      ; pd_256
-    pcmpeqb        m13, m13
+    vpbroadcastd    m8, [pd_0xf00801c7]
+    vpbroadcastd    m9, [pw_256]
+    pcmpeqb         m7, m7
+    psrld          m10, m9, 13      ; pd_2048
     DEFINE_ARGS a, b, w, h, x
 .loop_y:
     mov             xq, -2
 .loop_x:
-    movu           xm0, [aq+xq*4+ 0]
-    movu           xm1, [aq+xq*4+16]
-    vinserti128     m0, [aq+xq*4+ 0+(384+16)*4], 1
-    vinserti128     m1, [aq+xq*4+16+(384+16)*4], 1
-    movu           xm2, [bq+xq*2]
-    vinserti128     m2, [bq+xq*2+(384+16)*2], 1
-    pslld           m3, m0, 3
-    pslld           m4, m1, 3
-    paddd           m3, m0          ; aa * 9 [first half]
-    paddd           m4, m1          ; aa * 9 [second half]
-    punpcklwd       m0, m6, m2
-    punpckhwd       m2, m6, m2
-    pmaddwd         m1, m0, m0
-    pmaddwd         m5, m2, m2
-    pmaddwd         m0, m7
-    pmaddwd         m2, m7
-    psubd           m3, m1          ; p = aa * 9 - bb * bb [first half]
-    psubd           m4, m5          ; p = aa * 9 - bb * bb [second half]
-    pmulld          m3, m8
-    pmulld          m4, m8
-    paddd           m3, m9
-    paddd           m4, m9
-    psrld           m3, 20          ; z [first half]
-    psrld           m4, 20          ; z [second half]
-    pminsd          m3, m10
-    pminsd          m4, m10
-    mova            m5, m13
-    vpgatherdd      m1, [r5+m3*4], m5 ; xx [first half]
-    mova            m5, m13
-    vpgatherdd      m3, [r5+m4*4], m5 ; xx [second half]
-    psubd           m5, m11, m1
-    psubd           m4, m11, m3
-    packssdw        m1, m3
-    pmullw          m5, m7
-    pmullw          m4, m7
-    pmaddwd         m5, m0
-    pmaddwd         m4, m2
-    paddd           m5, m12
-    paddd           m4, m12
-    psrad           m5, 12
-    psrad           m4, 12
-    movu            [bq+xq*2], xm1
-    vextracti128    [bq+xq*2+(384+16)*2], m1, 1
-    movu            [aq+xq*4+ 0], xm5
-    movu            [aq+xq*4+16], xm4
-    vextracti128    [aq+xq*4+ 0+(384+16)*4], m5, 1
-    vextracti128    [aq+xq*4+16+(384+16)*4], m4, 1
+    pmovzxwd        m0, [bq+xq*2]
+    pmovzxwd        m1, [bq+xq*2+(384+16)*2]
+    movu            m2, [aq+xq*4]
+    movu            m3, [aq+xq*4+(384+16)*4]
+    pslld           m4, m2, 3
+    pslld           m5, m3, 3
+    paddd           m2, m4          ; aa * 9
+    paddd           m3, m5
+    pmaddwd         m4, m0, m0
+    pmaddwd         m5, m1, m1
+    pmaddwd         m0, m8
+    pmaddwd         m1, m8
+    psubd           m2, m4          ; p = aa * 9 - bb * bb
+    psubd           m3, m5
+    pmulld          m2, m6
+    pmulld          m3, m6
+    paddusw         m2, m8
+    paddusw         m3, m8
+    psrld           m2, 20          ; z
+    psrld           m3, 20
+    mova            m5, m7
+    vpgatherdd      m4, [r5+m2], m5 ; xx
+    mova            m5, m7
+    vpgatherdd      m2, [r5+m3], m5
+    psrld           m4, 24
+    psrld           m2, 24
+    pmulld          m0, m4
+    pmulld          m1, m2
+    packssdw        m4, m2
+    psubw           m4, m9, m4
+    vpermq          m4, m4, q3120
+    paddd           m0, m10
+    paddd           m1, m10
+    psrld           m0, 12
+    psrld           m1, 12
+    movu            [bq+xq*2], xm4
+    vextracti128    [bq+xq*2+(384+16)*2], m4, 1
+    movu            [aq+xq*4], m0
+    movu            [aq+xq*4+(384+16)*4], m1
     add             xd, 8
     cmp             xd, wd
     jl .loop_x
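The reworked sgr_calc_ab1 gets by with three fewer vector registers (14 down to 11) largely because it folds the "+ (1 << 19)" rounding and the imin(z, 255) clamp into a single paddusw against pd_0xf00801c7, whose low word 0x1c7 = 455 also serves as the one_by_x multiplier for the 3x3 box; the byte-sized table is then gathered at sgr_x_by_x - 0xf03 and the top byte of each loaded dword is kept (psrld by 24), which lands exactly on sgr_x_by_x[z]. A scalar model of that z computation, as I read the code (not part of the commit):

#include <assert.h>
#include <stdint.h>

/* Reference: z = imin((p * s + (1 << 19)) >> 20, 255), as in the C filter. */
static unsigned z_ref(uint32_t p, uint32_t s) {
    const uint64_t z = ((uint64_t)p * s + (1 << 19)) >> 20;
    return z < 255 ? (unsigned)z : 255;
}

/* Model of the AVX2 path: paddusw adds 0xf008 to the high 16-bit lane of p*s
 * with unsigned saturation (the +8 is the rounding, the +0xf000 biases z into
 * [0xf00, 0xfff] so that saturation caps z at 255); the low-lane add of 0x1c7
 * is discarded by the >> 20. The bias is undone by addressing the table at
 * sgr_x_by_x - 0xf03 and taking byte 3 of the gathered dword. */
static unsigned z_avx2_model(uint32_t p, uint32_t s) {
    const uint32_t v = p * s;              /* low 32 bits, as in the asm */
    uint32_t hi = (v >> 16) + 0xf008;
    if (hi > 0xffff) hi = 0xffff;          /* paddusw saturation */
    const uint32_t z_biased = ((hi << 16) | (v & 0xffff)) >> 20;
    return z_biased - 0xf00;
}

int main(void) {
    /* s is the per-plane strength; p is the n*a - b*b box statistic. */
    for (uint32_t p = 0; p < (1u << 20); p += 997)
        for (uint32_t s = 1; s < 4096; s += 61)
            if ((uint64_t)p * s + (1 << 19) < (1ull << 32)) /* stay in range */
                assert(z_avx2_model(p, s) == z_ref(p, s));
    return 0;
}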
@@ -903,78 +890,67 @@ cglobal sgr_box5_v, 5, 10, 15, sumsq, sum, w, h, edge, x, y, sumsq_ptr, sum_ptr,
     jmp .loop_y_noload
 
 INIT_YMM avx2
-cglobal sgr_calc_ab2, 4, 6, 14, a, b, w, h, s
+cglobal sgr_calc_ab2, 4, 6, 11, a, b, w, h, s
     sub             aq, (384+16-1)*4
     sub             bq, (384+16-1)*2
     add             hd, 2
-    lea             r5, [sgr_x_by_xplus1]
-    pxor            m6, m6
-    vpbroadcastd    m7, [pw_82]
+    lea             r5, [sgr_x_by_x-0xf03]
 %ifidn sd, sm
-    movd           xm8, sd
-    vpbroadcastd    m8, xm8
+    movd           xm6, sd
+    vpbroadcastd    m6, xm6
 %else
-    vpbroadcastd    m8, sm
+    vpbroadcastd    m6, sm
 %endif
-    vpbroadcastd    m9, [pd_0x80000]
-    vpbroadcastd   m10, [pd_255]
-    psrad          m12, m9, 8       ; pd_2048
-    psrad          m11, m9, 11      ; pd_256
-    pcmpeqb        m13, m13
+    vpbroadcastd    m8, [pd_0xf0080029]
+    vpbroadcastd    m9, [pw_256]
+    pcmpeqb         m7, m7
+    psrld          m10, m9, 15      ; pd_512
     DEFINE_ARGS a, b, w, h, x
 .loop_y:
     mov             xq, -2
 .loop_x:
-    movu           xm0, [aq+xq*4+ 0]
-    movu           xm1, [aq+xq*4+16]
-    vinserti128     m0, [aq+xq*4+32], 1
-    vinserti128     m1, [aq+xq*4+48], 1
-    movu            m2, [bq+xq*2]
-    pslld           m3, m0, 5       ; aa * 32 [first half]
-    pslld           m4, m1, 5       ; aa * 32 [second half]
-    paddd           m3, m0          ; aa * 33 [first half]
-    paddd           m4, m1          ; aa * 33 [first half]
-    pslld           m0, 3           ; aa * 8 [first half]
-    pslld           m1, 3           ; aa * 8 [second half]
-    psubd           m3, m0          ; aa * 25 [first half]
-    psubd           m4, m1          ; aa * 25 [second half]
-    punpcklwd       m0, m2, m6
-    punpckhwd       m2, m6
-    pmaddwd         m1, m0, m0
-    pmaddwd         m5, m2, m2
-    paddw           m0, m0
-    paddw           m2, m2
-    psubd           m3, m1          ; p = aa * 25 - bb * bb [first half]
-    psubd           m4, m5          ; p = aa * 25 - bb * bb [second half]
-    pmulld          m3, m8
-    pmulld          m4, m8
-    paddd           m3, m9
-    paddd           m4, m9
-    psrld           m3, 20          ; z [first half]
-    psrld           m4, 20          ; z [second half]
-    pminsd          m3, m10
-    pminsd          m4, m10
-    mova            m5, m13
-    vpgatherdd      m1, [r5+m3*4], m5 ; xx [first half]
-    mova            m5, m13
-    vpgatherdd      m3, [r5+m4*4], m5 ; xx [second half]
-    psubd           m5, m11, m1
-    psubd           m4, m11, m3
-    packssdw        m1, m3
-    pmullw          m5, m7
-    pmullw          m4, m7
-    pmaddwd         m5, m0
-    pmaddwd         m4, m2
-    paddd           m5, m12
-    paddd           m4, m12
-    psrad           m5, 12
-    psrad           m4, 12
-    movu            [bq+xq*2], m1
-    movu            [aq+xq*4+ 0], xm5
-    movu            [aq+xq*4+16], xm4
-    vextracti128    [aq+xq*4+32], m5, 1
-    vextracti128    [aq+xq*4+48], m4, 1
+    pmovzxwd        m0, [bq+xq*2+ 0]
+    pmovzxwd        m1, [bq+xq*2+16]
+    movu            m2, [aq+xq*4+ 0]
+    movu            m3, [aq+xq*4+32]
+    pslld           m4, m2, 3       ; aa * 8
+    pslld           m5, m3, 3
+    paddd           m2, m4          ; aa * 9
+    paddd           m3, m5
+    paddd           m4, m4          ; aa * 16
+    paddd           m5, m5
+    paddd           m2, m4          ; aa * 25
+    paddd           m3, m5
+    pmaddwd         m4, m0, m0
+    pmaddwd         m5, m1, m1
+    psubd           m2, m4          ; p = aa * 25 - bb * bb
+    psubd           m3, m5
+    pmulld          m2, m6
+    pmulld          m3, m6
+    paddusw         m2, m8
+    paddusw         m3, m8
+    psrld           m2, 20          ; z
+    psrld           m3, 20
+    mova            m5, m7
+    vpgatherdd      m4, [r5+m2], m5 ; xx
+    mova            m5, m7
+    vpgatherdd      m2, [r5+m3], m5
+    psrld           m4, 24
+    psrld           m2, 24
+    packssdw        m3, m4, m2
+    pmullw          m4, m8
+    pmullw          m2, m8
+    psubw           m3, m9, m3
+    vpermq          m3, m3, q3120
+    pmaddwd         m0, m4
+    pmaddwd         m1, m2
+    paddd           m0, m10
+    paddd           m1, m10
+    psrld           m0, 10
+    psrld           m1, 10
+    movu            [bq+xq*2], m3
+    movu            [aq+xq*4+ 0], m0
+    movu            [aq+xq*4+32], m1
     add             xd, 16
     cmp             xd, wd
     jl .loop_x
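sgr_calc_ab2 gets the same treatment via pd_0xf0080029: its low word 0x29 = 41 stands in for the 5x5-box one_by_x of 164 at a quarter of the magnitude, which is why the final shift drops from 12 to 10 (psrld m0, 10 above) relative to the C expression. A quick standalone check that the rescaling is exact, assuming sgr_one_by_x = 164 for n = 25 (consistent with 4 * 0x29, but not spelled out in the diff):

#include <assert.h>
#include <stdint.h>

int main(void) {
    /* bb is the 5x5 box sum (<= 25 * 255), x the table value (<= 255). */
    for (uint32_t bb = 0; bb <= 25 * 255; bb++)
        for (uint32_t x = 0; x <= 255; x += 5) {
            const uint32_t ref  = (x * bb * 164 + (1 << 11)) >> 12; /* C path   */
            const uint32_t fast = (x * bb * 41  + (1 << 9))  >> 10; /* asm path */
            assert(ref == fast);
        }
    return 0;
}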