François Cartegnie / dav1d / Commits / aec3d25c

Commit aec3d25c authored Oct 05, 2018 by Ronald S. Bultje

Add horizontal loopfilter AVX2 SIMD
parent 24effd47
Showing 3 changed files with 846 additions and 70 deletions:

  src/lf_apply.c             +3   -3
  src/x86/loopfilter.asm     +835 -63
  src/x86/loopfilter_init.c  +8   -4
src/lf_apply.c (view file @ aec3d25c)

@@ -49,7 +49,7 @@ static inline void filter_plane_cols_y(const Dav1dFrameContext *const f,
     for (int x = 0; x < w; x++) {
         if (!have_left && !x) continue;
         dsp->lf.loop_filter_sb[0][0](&dst[x * 4], ls,
-                                     starty4 ? (const uint32_t[3]) {
+                                     starty4 ? (const uint32_t[4]) {
                                          mask[x][0] >> starty4,
                                          mask[x][1] >> starty4,
                                          mask[x][2] >> starty4,
@@ -98,14 +98,14 @@ static inline void filter_plane_cols_uv(const Dav1dFrameContext *const f,
     for (int x = 0; x < w; x++) {
         if (!have_left && !x) continue;
         dsp->lf.loop_filter_sb[1][0](&u[x * 4], ls,
-                                     starty4 ? (const uint32_t[2]) {
+                                     starty4 ? (const uint32_t[3]) {
                                          mask[x][0] >> starty4,
                                          mask[x][1] >> starty4,
                                      } : mask[x],
                                      (const uint8_t(*)[4]) &lvl[x][2], b4_stride,
                                      &f->lf.lim_lut, endy4 - starty4);
         dsp->lf.loop_filter_sb[1][0](&v[x * 4], ls,
-                                     starty4 ? (const uint32_t[2]) {
+                                     starty4 ? (const uint32_t[3]) {
                                          mask[x][0] >> starty4,
                                          mask[x][1] >> starty4,
                                      } : mask[x],
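The lf_apply.c change above only widens each compound-literal mask array by one element ([3] to [4] for luma, [2] to [3] for chroma). A plausible reading, not spelled out in the commit itself, is that the extra element is zero padding for the new horizontal SIMD path: a C compound literal with fewer initializers than elements zero-fills the remainder, so a routine that always reads one mask word past the last meaningful entry still sees well-defined data. A minimal sketch of that behaviour, using a hypothetical combine4() stand-in rather than the real DSP function:

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Hypothetical consumer that, like a wide SIMD load, always touches four
     * 32-bit mask words even when only the first three are meaningful. */
    static uint32_t combine4(const uint32_t *vmask)
    {
        return vmask[0] | vmask[1] | vmask[2] | vmask[3];
    }

    int main(void)
    {
        const uint32_t m0 = 0x0f, m1 = 0xf0, m2 = 0x100;
        /* (const uint32_t[4]){...} with three initializers: element [3] is
         * guaranteed to be zero, so reading it stays well defined. */
        const uint32_t r = combine4((const uint32_t[4]) { m0, m1, m2 });
        printf("0x%" PRIx32 "\n", r); /* prints 0x1ff */
        return 0;
    }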
src/x86/loopfilter.asm (view file @ aec3d25c)

@@ -63,22 +63,496 @@ SECTION .text
    por             %1, %4
%endmacro

%macro FILTER 1 ; width

%macro TRANSPOSE_16x4_AND_WRITE_4x32 5
    ; transpose 16x4
    punpcklbw      m%5, m%1, m%2
    punpckhbw      m%1, m%2
    punpcklbw      m%2, m%3, m%4
    punpckhbw      m%3, m%4
    punpcklwd      m%4, m%5, m%2
    punpckhwd      m%5, m%2
    punpcklwd      m%2, m%1, m%3
    punpckhwd      m%1, m%3

    ; write out
    movd   [dstq+strideq*0-2], xm%4
    pextrd [dstq+strideq*1-2], xm%4, 1
    pextrd [dstq+strideq*2-2], xm%4, 2
    pextrd [dstq+stride3q -2], xm%4, 3
    lea           dstq, [dstq+strideq*4]
    movd   [dstq+strideq*0-2], xm%5
    pextrd [dstq+strideq*1-2], xm%5, 1
    pextrd [dstq+strideq*2-2], xm%5, 2
    pextrd [dstq+stride3q -2], xm%5, 3
    lea           dstq, [dstq+strideq*4]
    movd   [dstq+strideq*0-2], xm%2
    pextrd [dstq+strideq*1-2], xm%2, 1
    pextrd [dstq+strideq*2-2], xm%2, 2
    pextrd [dstq+stride3q -2], xm%2, 3
    lea           dstq, [dstq+strideq*4]
    movd   [dstq+strideq*0-2], xm%1
    pextrd [dstq+strideq*1-2], xm%1, 1
    pextrd [dstq+strideq*2-2], xm%1, 2
    pextrd [dstq+stride3q -2], xm%1, 3
    lea           dstq, [dstq+strideq*4]

    vextracti128  xm%4, m%4, 1
    vextracti128  xm%5, m%5, 1
    vextracti128  xm%2, m%2, 1
    vextracti128  xm%1, m%1, 1

    movd   [dstq+strideq*0-2], xm%4
    pextrd [dstq+strideq*1-2], xm%4, 1
    pextrd [dstq+strideq*2-2], xm%4, 2
    pextrd [dstq+stride3q -2], xm%4, 3
    lea           dstq, [dstq+strideq*4]
    movd   [dstq+strideq*0-2], xm%5
    pextrd [dstq+strideq*1-2], xm%5, 1
    pextrd [dstq+strideq*2-2], xm%5, 2
    pextrd [dstq+stride3q -2], xm%5, 3
    lea           dstq, [dstq+strideq*4]
    movd   [dstq+strideq*0-2], xm%2
    pextrd [dstq+strideq*1-2], xm%2, 1
    pextrd [dstq+strideq*2-2], xm%2, 2
    pextrd [dstq+stride3q -2], xm%2, 3
    lea           dstq, [dstq+strideq*4]
    movd   [dstq+strideq*0-2], xm%1
    pextrd [dstq+strideq*1-2], xm%1, 1
    pextrd [dstq+strideq*2-2], xm%1, 2
    pextrd [dstq+stride3q -2], xm%1, 3
    lea           dstq, [dstq+strideq*4]
%endmacro

%macro TRANSPOSE_16X16B 3 ; in_load_15_from_mem, out_store_0_in_mem, mem
%if %1 == 0
    mova            %3, m15
%endif

    ; input in m0-15
    punpcklbw      m15, m0, m1
    punpckhbw       m0, m1
    punpcklbw       m1, m2, m3
    punpckhbw       m2, m3
    punpcklbw       m3, m4, m5
    punpckhbw       m4, m5
    punpcklbw       m5, m6, m7
    punpckhbw       m6, m7
    punpcklbw       m7, m8, m9
    punpckhbw       m8, m9
    punpcklbw       m9, m10, m11
    punpckhbw      m10, m11
    punpcklbw      m11, m12, m13
    punpckhbw      m12, m13
    mova           m13, %3
    mova            %3, m12
    punpcklbw      m12, m14, m13
    punpckhbw      m13, m14, m13

    ; interleaved in m15,0,1,2,3,4,5,6,7,8,9,10,11,rsp%3,12,13
    punpcklwd      m14, m15, m1
    punpckhwd      m15, m1
    punpcklwd       m1, m0, m2
    punpckhwd       m0, m2
    punpcklwd       m2, m3, m5
    punpckhwd       m3, m5
    punpcklwd       m5, m4, m6
    punpckhwd       m4, m6
    punpcklwd       m6, m7, m9
    punpckhwd       m7, m9
    punpcklwd       m9, m8, m10
    punpckhwd       m8, m10
    punpcklwd      m10, m11, m12
    punpckhwd      m11, m12
    mova           m12, %3
    mova            %3, m11
    punpcklwd      m11, m12, m13
    punpckhwd      m12, m13

    ; interleaved in m14,15,1,0,2,3,5,4,6,7,9,8,10,rsp%3,11,12
    punpckldq      m13, m14, m2
    punpckhdq      m14, m2
    punpckldq       m2, m15, m3
    punpckhdq      m15, m3
    punpckldq       m3, m1, m5
    punpckhdq       m1, m5
    punpckldq       m5, m0, m4
    punpckhdq       m0, m4
    punpckldq       m4, m6, m10
    punpckhdq       m6, m10
    punpckldq      m10, m9, m11
    punpckhdq       m9, m11
    punpckldq      m11, m8, m12
    punpckhdq       m8, m12
    mova           m12, %3
    mova            %3, m8
    punpckldq       m8, m7, m12
    punpckhdq       m7, m12

    ; interleaved in m13,14,2,15,3,1,5,0,4,6,8,7,10,9,11,rsp%3
    punpcklqdq     m12, m13, m4
    punpckhqdq     m13, m4
    punpcklqdq      m4, m14, m6
    punpckhqdq     m14, m6
    punpcklqdq      m6, m2, m8
    punpckhqdq      m2, m8
    punpcklqdq      m8, m15, m7
    punpckhqdq     m15, m7
    punpcklqdq      m7, m3, m10
    punpckhqdq      m3, m10
    punpcklqdq     m10, m1, m9
    punpckhqdq      m1, m9
    punpcklqdq      m9, m5, m11
    punpckhqdq      m5, m11
    mova           m11, %3
    mova            %3, m12
    punpcklqdq     m12, m0, m11
    punpckhqdq      m0, m11
%if %2 == 0
    mova           m11, %3
%endif

    ; interleaved m11,13,4,14,6,2,8,15,7,3,10,1,9,5,12,0
    SWAP             0, 11, 1, 13, 5, 2, 4, 6, 8, 7, 15
    SWAP             3, 14, 12, 9
%endmacro

%macro FILTER 2 ; width [4/6/8/16], dir [h/v]
    ; load data
%ifidn %2, v
%if %1 == 4
    lea           tmpq, [dstq+mstrideq*2]
    mova            m3, [tmpq+strideq*0]          ; p1
    mova            m4, [tmpq+strideq*1]          ; p0
    mova            m5, [tmpq+strideq*2]          ; q0
    mova            m6, [tmpq+stride3q]           ; q1
%else
    ; load 6-8 pixels, remainder (for wd=16) will be read inline
    lea           tmpq, [dstq+mstrideq*4]
%if %1 != 6
    mova           m12, [tmpq+strideq*0]
%endif
    mova           m13, [tmpq+strideq*1]
    mova            m3, [tmpq+strideq*2]
    mova            m4, [tmpq+stride3q]
    mova            m5, [dstq+strideq*0]
    mova            m6, [dstq+strideq*1]
    mova           m14, [dstq+strideq*2]
%if %1 != 6
    mova           m15, [dstq+stride3q]
%endif
%endif
%else
    ; load lines
%if %1 == 4
    movd           xm3, [dstq+strideq*0-2]
    movd           xm4, [dstq+strideq*1-2]
    movd           xm5, [dstq+strideq*2-2]
    movd           xm6, [dstq+stride3q -2]
    lea           tmpq, [dstq+strideq*4]
    pinsrd         xm3, [tmpq+strideq*0-2], 2
    pinsrd         xm4, [tmpq+strideq*1-2], 2
    pinsrd         xm5, [tmpq+strideq*2-2], 2
    pinsrd         xm6, [tmpq+stride3q -2], 2
    lea           tmpq, [tmpq+strideq*4]
    pinsrd         xm3, [tmpq+strideq*0-2], 1
    pinsrd         xm4, [tmpq+strideq*1-2], 1
    pinsrd         xm5, [tmpq+strideq*2-2], 1
    pinsrd         xm6, [tmpq+stride3q -2], 1
    lea           tmpq, [tmpq+strideq*4]
    pinsrd         xm3, [tmpq+strideq*0-2], 3
    pinsrd         xm4, [tmpq+strideq*1-2], 3
    pinsrd         xm5, [tmpq+strideq*2-2], 3
    pinsrd         xm6, [tmpq+stride3q -2], 3
    lea           tmpq, [tmpq+strideq*4]
    movd          xm12, [tmpq+strideq*0-2]
    movd          xm13, [tmpq+strideq*1-2]
    movd          xm14, [tmpq+strideq*2-2]
    movd          xm15, [tmpq+stride3q -2]
    lea           tmpq, [tmpq+strideq*4]
    pinsrd        xm12, [tmpq+strideq*0-2], 2
    pinsrd        xm13, [tmpq+strideq*1-2], 2
    pinsrd        xm14, [tmpq+strideq*2-2], 2
    pinsrd        xm15, [tmpq+stride3q -2], 2
    lea           tmpq, [tmpq+strideq*4]
    pinsrd        xm12, [tmpq+strideq*0-2], 1
    pinsrd        xm13, [tmpq+strideq*1-2], 1
    pinsrd        xm14, [tmpq+strideq*2-2], 1
    pinsrd        xm15, [tmpq+stride3q -2], 1
    lea           tmpq, [tmpq+strideq*4]
    pinsrd        xm12, [tmpq+strideq*0-2], 3
    pinsrd        xm13, [tmpq+strideq*1-2], 3
    pinsrd        xm14, [tmpq+strideq*2-2], 3
    pinsrd        xm15, [tmpq+stride3q -2], 3
    lea           tmpq, [tmpq+strideq*4]
    vinserti128     m3, xm12, 1
    vinserti128     m4, xm13, 1
    vinserti128     m5, xm14, 1
    vinserti128     m6, xm15, 1

    ; transpose 4x16
    ; xm3: A-D0,A-D8,A-D4,A-D12
    ; xm4: A-D1,A-D9,A-D5,A-D13
    ; xm5: A-D2,A-D10,A-D6,A-D14
    ; xm6: A-D3,A-D11,A-D7,A-D15
    punpcklbw       m7, m3, m4
    punpckhbw       m3, m4
    punpcklbw       m4, m5, m6
    punpckhbw       m5, m6
    ; xm7: A0-1,B0-1,C0-1,D0-1,A8-9,B8-9,C8-9,D8-9
    ; xm3: A4-5,B4-5,C4-5,D4-5,A12-13,B12-13,C12-13,D12-13
    ; xm4: A2-3,B2-3,C2-3,D2-3,A10-11,B10-11,C10-11,D10-11
    ; xm5: A6-7,B6-7,C6-7,D6-7,A14-15,B14-15,C14-15,D14-15
    punpcklwd       m6, m7, m4
    punpckhwd       m7, m4
    punpcklwd       m4, m3, m5
    punpckhwd       m3, m5
    ; xm6: A0-3,B0-3,C0-3,D0-3
    ; xm7: A8-11,B8-11,C8-11,D8-11
    ; xm4: A4-7,B4-7,C4-7,D4-7
    ; xm3: A12-15,B12-15,C12-15,D12-15
    punpckldq       m5, m6, m4
    punpckhdq       m6, m4
    punpckldq       m4, m7, m3
    punpckhdq       m7, m3
    ; xm5: A0-7,B0-7
    ; xm6: C0-7,D0-7
    ; xm4: A8-15,B8-15
    ; xm7: C8-15,D8-15
    punpcklqdq      m3, m5, m4
    punpckhqdq      m4, m5, m4
    punpcklqdq      m5, m6, m7
    punpckhqdq      m6, m7
    ; xm3: A0-15
    ; xm5: B0-15
    ; xm4: C0-15
    ; xm6: D0-15
%elif %1 == 6 || %1 == 8
    movq           xm3, [dstq+strideq*0-%1/2]
    movq           xm4, [dstq+strideq*1-%1/2]
    movq           xm5, [dstq+strideq*2-%1/2]
    movq           xm6, [dstq+stride3q -%1/2]
    lea           tmpq, [dstq+strideq*8]
    movhps         xm3, [tmpq+strideq*0-%1/2]
    movhps         xm4, [tmpq+strideq*1-%1/2]
    movhps         xm5, [tmpq+strideq*2-%1/2]
    movhps         xm6, [tmpq+stride3q -%1/2]
    lea           tmpq, [tmpq+strideq*8]
    movq           xm7, [tmpq+strideq*0-%1/2]
    movq           xm8, [tmpq+strideq*1-%1/2]
    movq           xm9, [tmpq+strideq*2-%1/2]
    movq          xm11, [tmpq+stride3q -%1/2]
    lea           tmpq, [tmpq+strideq*8]
    movhps         xm7, [tmpq+strideq*0-%1/2]
    movhps         xm8, [tmpq+strideq*1-%1/2]
    movhps         xm9, [tmpq+strideq*2-%1/2]
    movhps        xm11, [tmpq+stride3q -%1/2]
    vinserti128     m3, xm7, 1
    vinserti128     m4, xm8, 1
    vinserti128     m5, xm9, 1
    vinserti128     m6, xm11, 1
    lea           tmpq, [dstq+strideq*4]
    movq          xm12, [tmpq+strideq*0-%1/2]
    movq          xm13, [tmpq+strideq*1-%1/2]
    movq          xm14, [tmpq+strideq*2-%1/2]
    movq          xm15, [tmpq+stride3q -%1/2]
    lea           tmpq, [tmpq+strideq*8]
    movhps        xm12, [tmpq+strideq*0-%1/2]
    movhps        xm13, [tmpq+strideq*1-%1/2]
    movhps        xm14, [tmpq+strideq*2-%1/2]
    movhps        xm15, [tmpq+stride3q -%1/2]
    lea           tmpq, [tmpq+strideq*8]
    movq           xm7, [tmpq+strideq*0-%1/2]
    movq           xm8, [tmpq+strideq*1-%1/2]
    movq           xm9, [tmpq+strideq*2-%1/2]
    movq          xm11, [tmpq+stride3q -%1/2]
    lea           tmpq, [tmpq+strideq*8]
    movhps         xm7, [tmpq+strideq*0-%1/2]
    movhps         xm8, [tmpq+strideq*1-%1/2]
    movhps         xm9, [tmpq+strideq*2-%1/2]
    movhps        xm11, [tmpq+stride3q -%1/2]
    vinserti128    m12, xm7, 1
    vinserti128    m13, xm8, 1
    vinserti128    m14, xm9, 1
    vinserti128    m15, xm11, 1

    ; transpose 8x16
    ; xm3: A-H0,A-H8
    ; xm4: A-H1,A-H9
    ; xm5: A-H2,A-H10
    ; xm6: A-H3,A-H11
    ; xm12: A-H4,A-H12
    ; xm13: A-H5,A-H13
    ; xm14: A-H6,A-H14
    ; xm15: A-H7,A-H15
    punpcklbw       m7, m3, m4
    punpckhbw       m3, m4
    punpcklbw       m4, m5, m6
    punpckhbw       m5, m6
    punpcklbw       m6, m12, m13
    punpckhbw      m12, m13
    punpcklbw      m13, m14, m15
    punpckhbw      m14, m15
    ; xm7: A0-1,B0-1,C0-1,D0-1,E0-1,F0-1,G0-1,H0-1
    ; xm3: A8-9,B8-9,C8-9,D8-9,E8-9,F8-9,G8-9,H8-9
    ; xm4: A2-3,B2-3,C2-3,D2-3,E2-3,F2-3,G2-3,H2-3
    ; xm5: A10-11,B10-11,C10-11,D10-11,E10-11,F10-11,G10-11,H10-11
    ; xm6: A4-5,B4-5,C4-5,D4-5,E4-5,F4-5,G4-5,H4-5
    ; xm12: A12-13,B12-13,C12-13,D12-13,E12-13,F12-13,G12-13,H12-13
    ; xm13: A6-7,B6-7,C6-7,D6-7,E6-7,F6-7,G6-7,H6-7
    ; xm14: A14-15,B14-15,C14-15,D14-15,E14-15,F14-15,G14-15,H14-15
    punpcklwd      m15, m7, m4
    punpckhwd       m7, m4
    punpcklwd       m4, m3, m5
    punpckhwd       m3, m5
    punpcklwd       m5, m6, m13
    punpckhwd       m6, m13
    punpcklwd      m13, m12, m14
    punpckhwd      m12, m14
    ; xm15: A0-3,B0-3,C0-3,D0-3
    ; xm7: E0-3,F0-3,G0-3,H0-3
    ; xm4: A8-11,B8-11,C8-11,D8-11
    ; xm3: E8-11,F8-11,G8-11,H8-11
    ; xm5: A4-7,B4-7,C4-7,D4-7
    ; xm6: E4-7,F4-7,G4-7,H4-7
    ; xm13: A12-15,B12-15,C12-15,D12-15
    ; xm12: E12-15,F12-15,G12-15,H12-15
    punpckldq      m14, m15, m5
    punpckhdq      m15, m5
    punpckldq       m5, m7, m6
%if %1 != 6
    punpckhdq       m7, m6
%endif
    punpckldq       m6, m4, m13
    punpckhdq       m4, m13
    punpckldq      m13, m3, m12
%if %1 != 6
    punpckhdq      m12, m3, m12
%endif
    ; xm14: A0-7,B0-7
    ; xm15: C0-7,D0-7
    ; xm5: E0-7,F0-7
    ; xm7: G0-7,H0-7
    ; xm6: A8-15,B8-15
    ; xm4: C8-15,D8-15
    ; xm13: E8-15,F8-15
    ; xm12: G8-15,H8-15
    punpcklqdq      m3, m14, m6
    punpckhqdq     m14, m6
    punpckhqdq      m6, m15, m4
    punpcklqdq     m15, m4
    punpcklqdq      m4, m5, m13
    punpckhqdq     m13, m5, m13
%if %1 == 8
    punpcklqdq      m5, m7, m12
    punpckhqdq     m12, m7, m12
    ; xm3: A0-15
    ; xm14: B0-15
    ; xm15: C0-15
    ; xm6: D0-15
    ; xm4: E0-15
    ; xm13: F0-15
    ; xm5: G0-15
    ; xm12: H0-15
    SWAP            12, 3, 15
    SWAP            13, 14, 5, 4, 6
    ; 3,14,15,6,4,13,5,12 -> 12,13,3,4,5,6,14,15
%else
    SWAP            13, 3, 14
    SWAP             6, 4, 15, 5
    ; 3,14,15,6,4,13 -> 13,3,4,5,6,14
%endif
%else
    ; load and 16x16 transpose. We only use 14 pixels but we'll need the
    ; remainder at the end for the second transpose
    movu           xm0, [dstq+strideq*0-8]
    movu           xm1, [dstq+strideq*1-8]
    movu           xm2, [dstq+strideq*2-8]
    movu           xm3, [dstq+stride3q -8]
    lea           tmpq, [dstq+strideq*4]
    movu           xm4, [tmpq+strideq*0-8]
    movu           xm5, [tmpq+strideq*1-8]
    movu           xm6, [tmpq+strideq*2-8]
    movu           xm7, [tmpq+stride3q -8]
    lea           tmpq, [tmpq+strideq*4]
    movu           xm8, [tmpq+strideq*0-8]
    movu           xm9, [tmpq+strideq*1-8]
    movu          xm10, [tmpq+strideq*2-8]
    movu          xm11, [tmpq+stride3q -8]
    lea           tmpq, [tmpq+strideq*4]
    movu          xm12, [tmpq+strideq*0-8]
    movu          xm13, [tmpq+strideq*1-8]
    movu          xm14, [tmpq+strideq*2-8]
    movu          xm15, [tmpq+stride3q -8]
    lea           tmpq, [tmpq+strideq*4]
    vinserti128     m0, [tmpq+strideq*0-8], 1
    vinserti128     m1, [tmpq+strideq*1-8], 1
    vinserti128     m2, [tmpq+strideq*2-8], 1
    vinserti128     m3, [tmpq+stride3q -8], 1
    lea           tmpq, [tmpq+strideq*4]
    vinserti128     m4, [tmpq+strideq*0-8], 1
    vinserti128     m5, [tmpq+strideq*1-8], 1
    vinserti128     m6, [tmpq+strideq*2-8], 1
    vinserti128     m7, [tmpq+stride3q -8], 1
    lea           tmpq, [tmpq+strideq*4]
    vinserti128     m8, [tmpq+strideq*0-8], 1
    vinserti128     m9, [tmpq+strideq*1-8], 1
    vinserti128    m10, [tmpq+strideq*2-8], 1
    vinserti128    m11, [tmpq+stride3q -8], 1
    lea           tmpq, [tmpq+strideq*4]
    vinserti128    m12, [tmpq+strideq*0-8], 1
    vinserti128    m13, [tmpq+strideq*1-8], 1
    vinserti128    m14, [tmpq+strideq*2-8], 1
    vinserti128    m15, [tmpq+stride3q -8], 1

    TRANSPOSE_16X16B 0, 1, [rsp+11*32]
    mova  [rsp+12*32], m1
    mova  [rsp+13*32], m2
    mova  [rsp+14*32], m3
    mova  [rsp+15*32], m12
    mova  [rsp+16*32], m13
    mova  [rsp+17*32], m14
    mova  [rsp+18*32], m15
    ; 4,5,6,7,8,9,10,11 -> 12,13,3,4,5,6,14,15
    SWAP            12, 4, 7
    SWAP            13, 5, 8
    SWAP             3, 6, 9
    SWAP            10, 14
    SWAP            11, 15
%endif
%endif

    ; load L/E/I/H
%ifidn %2, v
    movu            m1, [lq]
    movu            m0, [lq+l_strideq]
%else
    movq           xm1, [lq]
    movq           xm2, [lq+l_strideq*2]
    movhps         xm1, [lq+l_strideq]
    movhps         xm2, [lq+l_stride3q]
    lea             lq, [lq+l_strideq*4]
    movq          xm10, [lq]
    movq           xm0, [lq+l_strideq*2]
    movhps        xm10, [lq+l_strideq]
    movhps         xm0, [lq+l_stride3q]
    lea             lq, [lq+l_strideq*4]
    vinserti128     m1, xm10, 1
    vinserti128     m2, xm0, 1
    shufps          m0, m1, m2, q3131
    shufps          m1, m2, q2020
%endif
    pxor            m2, m2
    pcmpeqb         m3, m2, m0
    pand            m1, m3
    pcmpeqb        m10, m2, m0
    pand            m1, m10
    por             m0, m1                        ; l[x][] ? l[x][] : l[x-stride][]
    pshufb          m0, [pb_4x1_4x5_4x9_4x13]     ; l[x][1]
    pcmpeqb        m10, m2, m0                    ; !L
    pand            m1, m0, [pb_240]
    psrlq           m1, 4                         ; H
    psrlq           m2, m0, [lutq+128]
    pand            m2, [pb_63]
    vpbroadcastb    m4, [lutq+136]
    pminub          m2, m4
    vpbroadcastb    m1, [lutq+136]
    pminub          m2, m1
    pmaxub          m2, [pb_1]                    ; I
    pand            m1, m0, [pb_240]
    psrlq           m1, 4                         ; H
    paddb           m0, [pb_2]
    paddb           m0, m0
    paddb           m0, m2                        ; E
@@ -86,20 +560,6 @@ SECTION .text
    pxor            m2, [pb_128]
    pxor            m0, [pb_128]
%if %1 == 4
    lea           tmpq, [dstq+mstrideq*2]
    mova            m3, [tmpq+strideq*0]          ; p1
    mova            m4, [tmpq+strideq*1]          ; p0
    mova            m5, [tmpq+strideq*2]          ; q0
    mova            m6, [tmpq+stride3q]           ; q1
%else
    lea           tmpq, [dstq+mstrideq*4]
    mova            m3, [tmpq+strideq*2]
    mova            m4, [tmpq+stride3q]
    mova            m5, [dstq+strideq*0]
    mova            m6, [dstq+strideq*1]
%endif
    ABSSUB          m8, m3, m4, m9                ; abs(p1-p0)
    pmaxub          m8, m10
    ABSSUB          m9, m5, m6, m10               ; abs(q1-q0)
@@ -111,15 +571,6 @@ SECTION .text
    pxor            m7, m8, [pb_128]
    pcmpgtb         m7, m1                        ; hev
%if %1 != 6
    mova           m12, [tmpq+strideq*0]
%endif
    mova           m13, [tmpq+strideq*1]
    mova           m14, [dstq+strideq*2]
%if %1 != 6
    mova           m15, [dstq+stride3q]
%endif
%if %1 == 6
    ABSSUB          m9, m13, m4, m10              ; abs(p2-p0)
    pmaxub          m9, m8
@@ -179,23 +630,47 @@ SECTION .text
    por             m8, m10
%if %1 == 16
%ifidn %2, v
    lea           tmpq, [dstq+mstrideq*8]
    mova            m0, [tmpq+strideq*1]
%else
    mova            m0, [rsp+12*32]
%endif
    ABSSUB          m1, m0, m4, m2
%ifidn %2, v
    mova            m0, [tmpq+strideq*2]
%else
    mova            m0, [rsp+13*32]
%endif
    ABSSUB          m2, m0, m4, m10
    pmaxub          m1, m2
%ifidn %2, v
    mova            m0, [tmpq+stride3q]
%else
    mova            m0, [rsp+14*32]
%endif
    ABSSUB          m2, m0, m4, m10
    pmaxub          m1, m2
%ifidn %2, v
    lea           tmpq, [dstq+strideq*4]
    mova            m0, [tmpq+strideq*0]
%else
    mova            m0, [rsp+15*32]
%endif
    ABSSUB          m2, m0, m5, m10
    pmaxub          m1, m2
%ifidn %2, v
    mova            m0, [tmpq+strideq*1]
%else
    mova            m0, [rsp+16*32]
%endif
    ABSSUB          m2, m0, m5, m10
    pmaxub          m1, m2
%ifidn %2, v
    mova            m0, [tmpq+strideq*2]
%else
    mova            m0, [rsp+17*32]
%endif
    ABSSUB          m2, m0, m5, m10
    pmaxub          m1, m2
    pxor            m1, [pb_128]
@@ -279,10 +754,16 @@ SECTION .text
%if %1 == 16
    ; flat16 filter
%ifidn %2, v
    lea           tmpq, [dstq+mstrideq*8]
    mova            m0, [tmpq+strideq*1]          ; p6
    mova            m2, [tmpq+strideq*2]          ; p5
    mova            m7, [tmpq+stride3q]           ; p4
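Most of the new assembly above is data reordering: TRANSPOSE_16x4_AND_WRITE_4x32 and TRANSPOSE_16X16B exist because the horizontal filter works on vertical edges, so the samples it needs lie along rows in memory but must end up one column per SIMD lane. As a point of reference only (this helper is not part of the patch), the punpck ladder computes the same permutation as this scalar C transpose:

    #include <stdint.h>

    /* Scalar reference for the 16x16 byte transpose that the AVX2 code builds
     * from punpck{l,h}{bw,wd,dq,qdq} stages: source row r, column c ends up
     * at destination row c, column r. */
    static void transpose_16x16_u8(uint8_t dst[16][16], const uint8_t src[16][16])
    {
        for (int r = 0; r < 16; r++)
            for (int c = 0; c < 16; c++)
                dst[c][r] = src[r][c];
    }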