VideoLAN / dav1d / Commits / 109ee513

Commit 109ee513 authored Feb 09, 2019 by Henrik Gramner, committed by Henrik Gramner on Feb 09, 2019
x86: Fix 32-bit looprestoration SSSE3 asm on MSVC
Also make some minor optimizations.
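The recurring change in the diff below is to the x86inc cglobal prologues of the 32-bit code paths. A note on how I read the declaration (this annotation is mine, not part of the commit): the numeric parameters are the argument count, GPR count, XMM-register count and stack size, and a negative stack size asks x86inc to keep the caller's saved stack pointer in a stack slot while it manually realigns esp, instead of tying up one of the few x86-32 GPRs; that manual realignment is what MSVC builds need, since the 32-bit MSVC ABI only guarantees 4-byte stack alignment. The new wiener_filter_h prologue from this diff, with explanatory comments added, illustrates the pattern:

; 5 args loaded into registers, 7 GPRs, 8 XMM registers, 100 bytes of
; (re)aligned stack; the minus sign keeps the saved esp in memory.
cglobal wiener_filter_h, 5, 7, 8, -100, dst, left, src, stride, fh, w, h, edge
    mov        wd, edgem       ; arguments past the fifth are fetched by hand
    mov        [esp+12], wd    ; and spilled into the aligned stack frame
    mov        wd, wm
    mov        hd, hm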
parent 2bc9ba82
Pipeline #4485 passed with stages in 6 minutes and 44 seconds
Showing 1 changed file with 187 additions and 166 deletions
src/x86/looprestoration_ssse3.asm
@@ -103,7 +103,10 @@ SECTION .text
 INIT_XMM ssse3
 %if ARCH_X86_64
-cglobal wiener_filter_h, 8, 15, 16, dst, left, src, stride, fh, w, h, edge
+cglobal wiener_filter_h, 5, 15, 16, dst, left, src, stride, fh, w, h, edge
+    mov        edged, edgem
+    movifnidn  wd, wm
+    mov        hd, hm
     movq       m15, [fhq]
     pshufb     m12, m15, [pb_6_7]
     pshufb     m13, m15, [pb_4]
@@ -115,7 +118,11 @@ cglobal wiener_filter_h, 8, 15, 16, dst, left, src, stride, fh, w, h, edge
 DEFINE_ARGS dst, left, src, stride, x, w, h, edge, srcptr, dstptr, xlim
 %else
-cglobal wiener_filter_h, 8, 8, 8, 92, dst, left, src, stride, fh, w, h, edge
+cglobal wiener_filter_h, 5, 7, 8, -100, dst, left, src, stride, fh, w, h, edge
+    mov        wd, edgem
+    mov        [esp+12], wd
+    mov        wd, wm
+    mov        hd, hm
     SETUP_PIC  hd
     movq       m0, [fhq]
     pshufb     m3, m0, [PIC_sym(pb_6_7)]
@@ -124,20 +131,19 @@ cglobal wiener_filter_h, 8, 8, 8, 92, dst, left, src, stride, fh, w, h, edge
     pshufb     m0, m0, [PIC_sym(pb_0)]
 DEFINE_ARGS dst, left, src, stride, x, w, h, edge
- %define xlimm   r0m
- %define xlimmp  r0mp
 %define srcptrq  srcq
 %define dstptrq  dstq
 %define hd       dword [esp]
- %define edged   edgemp
+ %define edged   dword [esp+12]
+ %define xlimd   dword [esp+16]
 %define m10 [PIC_sym(pw_16380)]
 %define m11 [PIC_sym(pw_2048)]
- %define m12 [esp+0Ch]
- %define m13 [esp+1Ch]
- %define m14 [esp+2Ch]
- %define m15 [esp+3Ch]
+ %define m12 [esp+0x14]
+ %define m13 [esp+0x24]
+ %define m14 [esp+0x34]
+ %define m15 [esp+0x44]
     mova       m15, m0
     mova       m14, m1
@@ -149,11 +155,7 @@ cglobal wiener_filter_h, 8, 8, 8, 92, dst, left, src, stride, fh, w, h, edge
 ; else w -= 3, and use that as limit in x loop
     test       edged, 2             ; has_right
     jnz        .align
-%if ARCH_X86_64
-    mov        xlimq, -3
-%else
-    mov        xlimmp, -3
-%endif
+    mov        xlimd, -3
     jmp        .loop
 .align:
     add        wd, 15
@@ -161,7 +163,7 @@ cglobal wiener_filter_h, 8, 8, 8, 92, dst, left, src, stride, fh, w, h, edge
 %if ARCH_X86_64
     xor        xlimd, xlimd
 %else
-    mov        xlimmp, 0
+    mov        xlimd, 0
 %endif
 ; main y loop for vertical filter
@@ -169,12 +171,12 @@ cglobal wiener_filter_h, 8, 8, 8, 92, dst, left, src, stride, fh, w, h, edge
 %if ARCH_X86_64
     mov        srcptrq, srcq
     mov        dstptrq, dstq
-    lea        xd, [wd+xlimd]
+    lea        xd, [wq+xlimq]
 %else
     mov        [esp+8], srcq
     mov        [esp+4], dstq
-    mov        xd, wd
-    add        xd, xlimm
+    mov        xd, xlimd
+    add        xd, wd
 %endif
 ; load left edge pixels
@@ -252,7 +254,7 @@ cglobal wiener_filter_h, 8, 8, 8, 92, dst, left, src, stride, fh, w, h, edge
     palignr    m7, m1, m0, 15
 %if ARCH_X86_32
-    mova       [esp+4Ch], m1
+    mova       [esp+0x54], m1
 %define m8 m1
 %endif
     punpcklbw  m0, m2, m1
@@ -294,7 +296,7 @@ cglobal wiener_filter_h, 8, 8, 8, 92, dst, left, src, stride, fh, w, h, edge
 %if ARCH_X86_64
     mova       m0, m1
 %else
-    mova       m0, [esp+4Ch]
+    mova       m0, [esp+0x54]
 %endif
     add        srcptrq, 16
     add        dstptrq, 32
@@ -303,11 +305,7 @@ cglobal wiener_filter_h, 8, 8, 8, 92, dst, left, src, stride, fh, w, h, edge
     jg         .main_load
     test       xd, xd
     jg         .load_and_splat
-%if ARCH_X86_64
     cmp        xd, xlimd
-%else
-    cmp        xd, xlimm
-%endif
     jg         .splat_right
 %if ARCH_X86_32
@@ -321,7 +319,10 @@ cglobal wiener_filter_h, 8, 8, 8, 92, dst, left, src, stride, fh, w, h, edge
     RET
 %if ARCH_X86_64
-cglobal wiener_filter_v, 7, 10, 16, dst, stride, mid, w, h, fv, edge
+cglobal wiener_filter_v, 4, 10, 16, dst, stride, mid, w, h, fv, edge
+    mov        edged, edgem
+    movifnidn  fvq, fvmp
+    movifnidn  hd, hm
     movq       m15, [fvq]
     pshufb     m14, m15, [pb_4_5_6_7]
     pshufb     m15, m15, [pb_0_1_2_3]
@@ -336,14 +337,16 @@ cglobal wiener_filter_v, 7, 10, 16, dst, stride, mid, w, h, fv, edge
     shr        ylimd, 2
     sub        ylimd, 3
 %else
-cglobal wiener_filter_v, 7, 7, 8, 92, dst, stride, mid, w, h, fv, edge
- %define ylimm  r0m
- %define ylimmp r0mp
+cglobal wiener_filter_v, 5, 7, 8, -96, dst, stride, mid, w, h, fv, edge
+ %define ylimd [esp+12]

-    mov        ylimm, edged
-    and        ylimmp, 8            ; have_bottom
-    shr        ylimmp, 2
-    sub        ylimmp, 3
+    mov        r5d, edgem
+    and        r5d, 8
+    shr        r5d, 2
+    sub        r5d, 3
+    mov        ylimd, r5d
+    mov        fvq, fvmp
+    mov        edged, edgem
     SETUP_PIC  edged
@@ -351,8 +354,8 @@ cglobal wiener_filter_v, 7, 7, 8, 92, dst, stride, mid, w, h, fv, edge
     pshufb     m1, m0, [PIC_sym(pb_4_5_6_7)]
     pshufb     m0, m0, [PIC_sym(pb_0_1_2_3)]
     paddw      m1, [PIC_sym(pw_0_128)]
-    mova       [esp+4Ch], m0
-    mova       [esp+3Ch], m1
+    mova       [esp+0x50], m0
+    mova       [esp+0x40], m1
 DEFINE_ARGS dst, stride, mid, w, h, y, edge
 %define mptrq midq
@@ -386,7 +389,7 @@ cglobal wiener_filter_v, 7, 7, 8, 92, dst, stride, mid, w, h, fv, edge
 %else
     mov        [esp+8], midq
     mov        [esp+4], dstq
-    add        yd, ylimm
+    add        yd, ylimd
 %endif
     jg         .load_threelines
@@ -438,19 +441,19 @@ cglobal wiener_filter_v, 7, 7, 8, 92, dst, stride, mid, w, h, fv, edge
     pmaddwd    m7, m15
     pmaddwd    m11, m14
     pmaddwd    m9, m14
-    paddd      m10, m11
-    paddd      m7, m9
     paddd      m10, m12
     paddd      m7, m12
+    paddd      m10, m11
+    paddd      m7, m9
     psrad      m10, 11
     psrad      m7, 11
     packssdw   m10, m7
     packuswb   m10, m10
     movq       [dstptrq], m10
 %else
-    mova       [esp+2Ch], m1
-    mova       [esp+1Ch], m2
-    mova       [esp+0Ch], m3
+    mova       [esp+0x30], m1
+    mova       [esp+0x20], m2
+    mova       [esp+0x10], m3
     paddw      m0, m6
     paddw      m1, m5
     paddw      m2, m4
@@ -458,22 +461,24 @@ cglobal wiener_filter_v, 7, 7, 8, 92, dst, stride, mid, w, h, fv, edge
     punpckhwd  m2, m3
     punpcklwd  m3, m0, m1
     punpckhwd  m0, m1
-    pmaddwd    m3, [esp+4Ch]
-    pmaddwd    m0, [esp+4Ch]
-    pmaddwd    m7, [esp+3Ch]
-    pmaddwd    m2, [esp+3Ch]
-    paddd      m3, m7
-    paddd      m0, m2
+    mova       m1, [esp+0x50]
+    pmaddwd    m3, m1
+    pmaddwd    m0, m1
+    mova       m1, [esp+0x40]
+    pmaddwd    m7, m1
+    pmaddwd    m2, m1
     paddd      m3, [PIC_sym(pd_1024)]
     paddd      m0, [PIC_sym(pd_1024)]
+    paddd      m3, m7
+    paddd      m0, m2
     psrad      m3, 11
     psrad      m0, 11
     packssdw   m3, m0
     packuswb   m3, m3
     movq       [dstq], m3
-    mova       m1, [esp+2Ch]
-    mova       m2, [esp+1Ch]
-    mova       m3, [esp+0Ch]
+    mova       m1, [esp+0x30]
+    mova       m2, [esp+0x20]
+    mova       m3, [esp+0x10]
 %endif
 ; shift pixels one position
     mova       m0, m1
@@ -487,11 +492,7 @@ cglobal wiener_filter_v, 7, 7, 8, 92, dst, stride, mid, w, h, fv, edge
     dec        yd
     jg         .loop_load
 ; for the bottom pixels, continue using m6 (as extended edge)
-%if ARCH_X86_64
     cmp        yd, ylimd
-%else
-    cmp        yd, ylimm
-%endif
     jg         .loop
 %if ARCH_X86_32
@@ -545,28 +546,27 @@ cglobal wiener_filter_v, 7, 7, 8, 92, dst, stride, mid, w, h, fv, edge
 %endmacro
 %if ARCH_X86_64
-cglobal sgr_box3_h, 8, 11, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim
-    mov        xlimd, edged
+cglobal sgr_box3_h, 5, 11, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim
+    mov        xlimd, edgem
+    movifnidn  xd, xm
+    mov        hd, hm
+    mov        edged, xlimd
     and        xlimd, 2             ; have_right
     add        xd, xlimd
     xor        xlimd, 2             ; 2*!have_right
 %else
-cglobal sgr_box3_h, 8, 8, 8, 4, sumsq, sum, left, src, stride, x, h, edge, w, xlim
- %define wm     r0m
- %define xlimm  r1m
-    SETUP_PIC  hd
-    PUSH       r0
-    mov        r0, edgem
-    and        r0, 2                ; have_right
-    add        xd, r0
-    xor        r0, 2                ; 2*!have_right
-    mov        xlimm, r0
-    POP        r0
- %define hd     dword [esp]
+cglobal sgr_box3_h, 6, 7, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim
+ %define wq     r0m
+ %define xlimd  r1m
+ %define hd     hmp
+ %define edged  edgemp
+ %define xlimq  xlimm
+    mov        r6, edgem
+    and        r6, 2                ; have_right
+    add        xd, r6
+    xor        r6, 2                ; 2*!have_right
+    mov        xlimd, r6
+    SETUP_PIC  r6, 0
 %endif
     jnz        .no_right
@@ -578,16 +578,12 @@ cglobal sgr_box3_h, 8, 8, 8, 4, sumsq, sum, left, src, stride, x, h, edge, w, xl
     lea        sumq, [sumq+xq*2-2]
     lea        sumsqq, [sumsqq+xq*4-4]
     neg        xq
-%if ARCH_X86_64
     mov        wq, xq
+%if ARCH_X86_64
     lea        r10, [pb_right_ext_mask+16]
+%endif
 .loop_y:
     mov        xq, wq
-%else
-    mov        wm, xd
-.loop_y:
-    mov        xd, wm
-%endif
 ; load left
     test       edged, 1             ; have_left
@@ -661,11 +657,11 @@ cglobal sgr_box3_h, 8, 8, 8, 4, sumsq, sum, left, src, stride, x, h, edge, w, xl
 ; else if x < xlimd we extend from previous load (this implies have_right=0)
 ; else we are done
-    cmp        xq, -8
+    cmp        xd, -8
     jle        .loop_x
-    test       xq, xq
+    test       xd, xd
     jl         .partial_load_and_extend
-    cmp        xq, xlimq
+    cmp        xd, xlimd
     jl         .right_extend
     add        sumsqq, (384+16)*4
@@ -676,15 +672,14 @@ cglobal sgr_box3_h, 8, 8, 8, 4, sumsq, sum, left, src, stride, x, h, edge, w, xl
     RET
 %if ARCH_X86_64
-cglobal sgr_box3_v, 5, 10, 9, sumsq, sum, w, h, edge, x, y, sumsq_base, sum_base, ylim
+cglobal sgr_box3_v, 4, 10, 9, sumsq, sum, w, h, edge, x, y, sumsq_base, sum_base, ylim
+    movifnidn  edged, edgem
 %else
-cglobal sgr_box3_v, 5, 7, 8, 16, sumsq, sum, w, h, edge, x, y
- %define sumsq_basem r0m
- %define sum_basem   r1m
- %define ylimm       r4m
- %define ylimmp      r4mp
- %define m8 [esp]
+cglobal sgr_box3_v, 5, 7, 8, -28, sumsq, sum, w, h, edge, x, y
+ %define sumsq_baseq dword [esp+0]
+ %define sum_baseq   dword [esp+4]
+ %define ylimd       dword [esp+8]
+ %define m8 [esp+12]
 %endif
     mov        xq, -2
 %if ARCH_X86_64
@@ -699,14 +694,18 @@ cglobal sgr_box3_v, 5, 7, 8, 16, sumsq, sum, w, h, edge, x, y
     mov        sumq, sum_baseq
     lea        yd, [hd+ylimd+2]
 %else
-    and        ylimmp, 8            ; have_bottom
-    shr        ylimmp, 2
-    sub        ylimmp, 2            ; -2 if have_bottom=0, else 0
+    mov        yd, edged
+    and        yd, 8                ; have_bottom
+    shr        yd, 2
+    sub        yd, 2                ; -2 if have_bottom=0, else 0
+    mov        sumsq_baseq, sumsqq
+    mov        sum_baseq, sumq
+    mov        ylimd, yd
 .loop_x:
-    mov        sumsqd, sumsq_basem
-    mov        sumd, sum_basem
+    mov        sumsqd, sumsq_baseq
+    mov        sumd, sum_baseq
     lea        yd, [hd+2]
-    add        yd, ylimm
+    add        yd, ylimd
 %endif
     lea        sumsqq, [sumsqq+xq*4+4-(384+16)*4]
     lea        sumq, [sumq+xq*2+2-(384+16)*2]
@@ -734,7 +733,7 @@ cglobal sgr_box3_v, 5, 7, 8, 16, sumsq, sum, w, h, edge, x, y
     movu       m8, [sumq+(384+16)*2*1]         ; l0
 %else
     movu       m4, [sumq+(384+16)*2*1]         ; l0
-    mova       [esp], m4
+    mova       m8, m4
 %endif
     movu       m4, [sumsqq+(384+16)*4*1]       ; l0sq [left]
     movu       m5, [sumsqq+(384+16)*4*1+16]    ; l0sq [right]
@@ -760,18 +759,15 @@ cglobal sgr_box3_v, 5, 7, 8, 16, sumsq, sum, w, h, edge, x, y
     add        sumq, (384+16)*2
     dec        yd
     jg         .loop_y
-%if ARCH_X86_64
     cmp        yd, ylimd
-%else
-    cmp        yd, ylimm
-%endif
     jg         .loop_y_noload
     add        xd, 8
     cmp        xd, wd
     jl         .loop_x
     RET
-cglobal sgr_calc_ab1, 5, 7, 14, a, b, w, h, s
+cglobal sgr_calc_ab1, 4, 7, 14, a, b, w, h, s
+    movifnidn  sd, sm
     sub        aq, (384+16-1)*4
     sub        bq, (384+16-1)*2
     add        hd, 2
@@ -845,34 +841,42 @@ cglobal sgr_calc_ab1, 5, 7, 14, a, b, w, h, s
     RET
 %if ARCH_X86_64
-cglobal sgr_finish_filter1, 7, 13, 16, t, src, stride, a, b, w, h, \
+cglobal sgr_finish_filter1, 5, 13, 16, t, src, stride, a, b, w, h, \
                             tmp_base, src_base, a_base, b_base, x, y
-    mova       m15, [pw_16]
+    movifnidn  wd, wm
+    mov        hd, hm
+    mova       m15, [pw_16]
     mov        tmp_baseq, tq
     mov        src_baseq, srcq
     mov        a_baseq, aq
     mov        b_baseq, bq
+    xor        xd, xd
 %else
-cglobal sgr_finish_filter1, 7, 7, 8, 120, t, src, stride, a, b, x, y
- %define tmp_baseq  r0m
- %define src_baseq  r1m
- %define a_baseq    r3m
- %define b_baseq    r4m
- %define wd         r5m
- %define hd         r6m
+cglobal sgr_finish_filter1, 7, 7, 8, -144, t, src, stride, a, b, x, y
+ %define tmp_baseq  [esp+8]
+ %define src_baseq  [esp+12]
+ %define a_baseq    [esp+16]
+ %define b_baseq    [esp+20]
+ %define wd         [esp+24]
+ %define hd         [esp+28]
+    mov        tmp_baseq, tq
+    mov        src_baseq, srcq
+    mov        a_baseq, aq
+    mov        b_baseq, bq
+    mov        wd, xd
+    mov        hd, yd
+    xor        xd, xd
     SETUP_PIC  yd, 1, 1
+    jmp        .loop_start
 %endif
-    xor        xd, xd
 .loop_x:
     mov        tq, tmp_baseq
     mov        srcq, src_baseq
     mov        aq, a_baseq
     mov        bq, b_baseq
 %if ARCH_X86_32
+.loop_start:
     movu       m0, [bq+xq*2-(384+16)*2-2]
     movu       m2, [bq+xq*2-(384+16)*2+2]
     mova       m1, [bq+xq*2-(384+16)*2]        ; b:top
@@ -881,9 +885,9 @@ cglobal sgr_finish_filter1, 7, 7, 8, 120, t, src, stride, a, b, x, y
     movu       m3, [bq+xq*2+2]
     paddw      m1, [bq+xq*2]                   ; b:top+ctr
     paddw      m2, m3                          ; b:l+r
-    mova       [esp+68h], m0
-    mova       [esp+58h], m1
-    mova       [esp+48h], m2
+    mova       [esp+0x80], m0
+    mova       [esp+0x70], m1
+    mova       [esp+0x60], m2
 %endif
     movu       m0, [aq+xq*4-(384+16)*4-4]
     movu       m2, [aq+xq*4-(384+16)*4+4]
@@ -941,9 +945,9 @@ cglobal sgr_finish_filter1, 7, 7, 8, 120, t, src, stride, a, b, x, y
 %else
     paddd      m1, [aq]                        ; a:top+ctr+bottom [first half]
     paddd      m3, [aq+16]                     ; a:top+ctr+bottom [second half]
-    mova       [esp+38h], m1
-    mova       [esp+28h], m3
-    mova       [esp+18h], m4
+    mova       [esp+0x50], m1
+    mova       [esp+0x40], m3
+    mova       [esp+0x30], m4
     movu       m6, [aq-4]
     movu       m7, [aq+4]
     paddd      m1, m4                          ; a:top+ctr+bottom+l+r [first half]
@@ -984,13 +988,13 @@ cglobal sgr_finish_filter1, 7, 7, 8, 120, t, src, stride, a, b, x, y
     psubd      m1, [aq-(384+16)*4*2]           ; a:ctr+bottom [first half]
     psubd      m3, [aq-(384+16)*4*2+16]        ; a:ctr+bottom [second half]
 %else
-    mova       m4, [esp+68h]
-    mova       [esp+68h], m5
-    mova       m5, [esp+58h]
-    mova       [esp+58h], m6
-    mova       m6, [esp+48h]
-    mova       [esp+48h], m7
-    mova       [esp+08h], m1
+    mova       m4, [esp+0x80]
+    mova       [esp+0x80], m5
+    mova       m5, [esp+0x70]
+    mova       [esp+0x70], m6
+    mova       m6, [esp+0x60]
+    mova       [esp+0x60], m7
+    mova       [esp+0x20], m1
     movu       m7, [bq-2]
     movu       m1, [bq+2]
     paddw      m5, [bq]                        ; b:top+ctr+bottom
@@ -1021,7 +1025,7 @@ cglobal sgr_finish_filter1, 7, 7, 8, 120, t, src, stride, a, b, x, y
     packssdw   m6, m10
     mova       [tq], m6
 %else
-    paddd      m4, [esp+08h]
+    paddd      m4, [esp+0x20]
     paddd      m1, m3
     psrad      m4, 9
     psrad      m1, 9
@@ -1038,15 +1042,15 @@ cglobal sgr_finish_filter1, 7, 7, 8, 120, t, src, stride, a, b, x, y
     mova       m6, m8
     mova       m8, m9
 %else
-    mova       m1, [esp+38h]
-    mova       m3, [esp+28h]
-    mova       m0, [esp+18h]
-    mova       m2, [esp+68h]
-    mova       m4, [esp+58h]
-    mova       [esp+58h], m5
-    mova       m5, [esp+48h]
-    mova       [esp+68h], m6
-    mova       [esp+48h], m7
+    mova       m1, [esp+0x50]
+    mova       m3, [esp+0x40]
+    mova       m0, [esp+0x30]
+    mova       m2, [esp+0x80]
+    mova       m4, [esp+0x70]
+    mova       [esp+0x70], m5
+    mova       m5, [esp+0x60]
+    mova       [esp+0x80], m6
+    mova       [esp+0x60], m7
     psubd      m1, [aq-(384+16)*4*2]           ; a:ctr+bottom [first half]
     psubd      m3, [aq-(384+16)*4*2+16]        ; a:ctr+bottom [second half]
 %endif
@@ -1062,13 +1066,12 @@ cglobal sgr_finish_filter1, 7, 7, 8, 120, t, src, stride, a, b, x, y
     jl         .loop_x
     RET
-%if ARCH_X86_64
-cglobal sgr_weighted1, 6, 6, 8, dst, stride, t, w, h, wt
-%else
-cglobal sgr_weighted1, 6, 7, 8, dst, stride, t, w, h, wt
+cglobal sgr_weighted1, 4, 7, 8, dst, stride, t, w, h, wt
+    movifnidn  hd, hm
+%if ARCH_X86_32
     SETUP_PIC  r6, 0
 %endif
-    movd       m0, wtd
+    movd       m0, wtm
     pshufb     m0, [PIC_sym(pb_0_1)]
     psllw      m0, 4
     pxor       m7, m7
@@ -1101,17 +1104,20 @@ cglobal sgr_weighted1, 6, 7, 8, dst, stride, t, w, h, wt
     RET
 %if ARCH_X86_64
-cglobal sgr_box5_h, 8, 11, 12, sumsq, sum, left, src, stride, w, h, edge, x, xlim
+cglobal sgr_box5_h, 5, 11, 12, sumsq, sum, left, src, stride, w, h, edge, x, xlim
+    mov        edged, edgem
+    movifnidn  wd, wm
+    mov        hd, hm
     mova       m10, [pb_0]
     mova       m11, [pb_0_1]
 %else
-cglobal sgr_box5_h, 8, 8, 8, 8, sumsq, sum, left, src, xlim, x, h, edge
+cglobal sgr_box5_h, 7, 7, 8, sumsq, sum, left, src, xlim, x, h, edge
 %define edged   edgemp
 %define wd      xd
 %define wq      wd
 %define wm      r5m
 %define strideq r4m
+    SUB        esp, 8
     SETUP_PIC  sumsqd, 1, 1
 %define m10 [PIC_sym(pb_0)]
@@ -1275,17 +1281,23 @@ cglobal sgr_box5_h, 8, 8, 8, 8, sumsq, sum, left, src, xlim, x, h, edge
     add        srcq, strideq
     dec        hd
     jg         .loop_y
+%if ARCH_X86_32
+    ADD        esp, 8
+%endif
     RET
 %if ARCH_X86_64
-cglobal sgr_box5_v, 5, 10, 15, sumsq, sum, w, h, edge, x, y, sumsq_ptr, sum_ptr, ylim
+cglobal sgr_box5_v, 4, 10, 15, sumsq, sum, w, h, edge, x, y, sumsq_ptr, sum_ptr, ylim
+    movifnidn  edged, edgem
     mov        ylimd, edged
 %else
-cglobal sgr_box5_v, 5, 7, 8, 32, sumsq, sum, x, y, ylim, sumsq_ptr, sum_ptr
- %define wm     r2m
- %define hm     r3m
- %define edgem  r4m
- %define edgemp r4mp
+cglobal sgr_box5_v, 5, 7, 8, -44, sumsq, sum, x, y, ylim, sumsq_ptr, sum_ptr
+ %define wm    [esp+0]
+ %define hm    [esp+4]
+ %define edgem [esp+8]
+    mov        wm, xd
+    mov        hm, yd
+    mov        edgem, ylimd
 %endif
     and        ylimd, 8             ; have_bottom
@@ -1383,7 +1395,7 @@ cglobal sgr_box5_v, 5, 7, 8, 32, sumsq, sum, x, y, ylim, sumsq_ptr, sum_ptr
     lea        yd, [ylimd+2]
     add        yd, hm
     lea        sumsq_ptrq, [sumsqq+xq*4+4-(384+16)*4]
-    test       edgemp, 4            ; have_top
+    test       dword edgem, 4       ; have_top
     jnz        .sumsq_load_top
     movu       m0, [sumsq_ptrq+(384+16)*4*1]
     movu       m1, [sumsq_ptrq+(384+16)*4*1+16]
@@ -1391,16 +1403,16 @@
     mova       m5, m1
     mova       m6, m0
     mova       m7, m1
-    mova       [esp+10h], m0