Commit 0282f6f3 authored by Martin Storsjö

arm64: loopfilter: Implement NEON loop filters

The exact relative speedup compared to the C code is hard to pin down
and depends on exactly how many filtered blocks are skipped, as the
NEON version always filters 16 pixels at a time, while the C code can
skip processing individual 4 pixel blocks.

Additionally, the checkasm benchmarking code runs the same function
repeatedly on the same buffer, which can make the filter take
different codepaths on each run, as the function updates the buffer
which will be used as input for the next run.

When the checkasm test data is tweaked to avoid skipped blocks, the
relative speedup compared to C is between 2x and 5x, while it is
around 1x to 4x with the current checkasm test as is.
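
To illustrate the buffer reuse effect, here is a minimal standalone
sketch (not the actual checkasm harness; filter_16px, bench and the
test data are made-up stand-ins). The difference comes down to
whether the input is restored from a pristine copy before each
timed call:

    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>
    #include <time.h>

    /* Stand-in for the real filter; it just smooths one 16-pixel column
     * in place so that the sketch is self-contained. */
    static void filter_16px(uint8_t *buf, ptrdiff_t stride) {
        for (int i = 1; i < 15; i++)
            buf[i * stride] = (uint8_t)((buf[(i - 1) * stride] +
                                         2 * buf[i * stride] +
                                         buf[(i + 1) * stride] + 2) >> 2);
    }

    static uint64_t now_ns(void) {
        struct timespec ts;
        clock_gettime(CLOCK_MONOTONIC, &ts);
        return (uint64_t)ts.tv_sec * 1000000000u + (uint64_t)ts.tv_nsec;
    }

    /* Copying from a pristine buffer before every call keeps each run on
     * the same codepath; timing repeated calls on the same buffer instead
     * lets the data converge towards already-filtered pixels, where more
     * blocks get skipped. */
    static void bench(uint8_t *buf, const uint8_t *pristine, size_t size,
                      ptrdiff_t stride, int runs) {
        uint64_t total = 0;
        for (int i = 0; i < runs; i++) {
            memcpy(buf, pristine, size);
            uint64_t t0 = now_ns();
            filter_16px(buf, stride);
            total += now_ns() - t0;
        }
        printf("avg %.1f ns/call\n", (double)total / runs);
    }

    int main(void) {
        enum { W = 16, H = 16 };
        uint8_t pristine[W * H], buf[W * H];
        for (int i = 0; i < W * H; i++)
            pristine[i] = (uint8_t)(i * 37); /* arbitrary non-flat data */
        bench(buf, pristine, sizeof(buf), W, 1000);
        return 0;
    }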

Benchmark numbers from a tweaked checkasm that avoids skipped
blocks:

                        Cortex A53     A72     A73
lpf_h_sb_uv_w4_8bpc_c:      2954.7  1399.3  1655.3
lpf_h_sb_uv_w4_8bpc_neon:    895.5   650.8   692.0
lpf_h_sb_uv_w6_8bpc_c:      3879.2  1917.2  2257.7
lpf_h_sb_uv_w6_8bpc_neon:   1125.6   759.5   838.4
lpf_h_sb_y_w4_8bpc_c:       6711.0  3275.5  3913.7
lpf_h_sb_y_w4_8bpc_neon:    1744.0  1342.1  1351.5
lpf_h_sb_y_w8_8bpc_c:      10695.7  6155.8  6638.9
lpf_h_sb_y_w8_8bpc_neon:    2146.5  1560.4  1609.1
lpf_h_sb_y_w16_8bpc_c:     11355.8  6292.0  6995.9
lpf_h_sb_y_w16_8bpc_neon:   2475.4  1949.6  1968.4
lpf_v_sb_uv_w4_8bpc_c:      2639.7  1204.8  1425.9
lpf_v_sb_uv_w4_8bpc_neon:    510.7   351.4   334.7
lpf_v_sb_uv_w6_8bpc_c:      3468.3  1757.1  2021.5
lpf_v_sb_uv_w6_8bpc_neon:    625.0   415.0   397.8
lpf_v_sb_y_w4_8bpc_c:       5428.7  2731.7  3068.5
lpf_v_sb_y_w4_8bpc_neon:    1172.6   792.1   768.0
lpf_v_sb_y_w8_8bpc_c:       8946.1  4412.8  5121.0
lpf_v_sb_y_w8_8bpc_neon:    1565.5  1063.6  1062.7
lpf_v_sb_y_w16_8bpc_c:      8978.9  4411.7  5112.0
lpf_v_sb_y_w16_8bpc_neon:   1775.0  1288.1  1236.7
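
(For scale, the first pair above works out to 2954.7 / 895.5 ≈ 3.3x
on the Cortex A53.)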
parent 204bf211
/*
* Copyright © 2018, VideoLAN and dav1d authors
* Copyright © 2019, Martin Storsjo
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "src/arm/asm.S"
#include "util.S"
.macro loop_filter wd
function lpf_16_wd\wd\()_neon
uabd v0.16b, v22.16b, v23.16b // abs(p1 - p0)
uabd v1.16b, v25.16b, v24.16b // abs(q1 - q0)
uabd v2.16b, v23.16b, v24.16b // abs(p0 - q0)
uabd v3.16b, v22.16b, v25.16b // abs(p1 - q1)
.if \wd >= 6
uabd v4.16b, v21.16b, v22.16b // abs(p2 - p1)
uabd v5.16b, v26.16b, v25.16b // abs(q2 - q1)
.if \wd >= 8
uabd v6.16b, v20.16b, v21.16b // abs(p3 - p2)
uabd v7.16b, v27.16b, v26.16b // abs(q3 - q2)
.endif
.endif
.if \wd >= 6
umax v4.16b, v4.16b, v5.16b
.endif
uqadd v2.16b, v2.16b, v2.16b // abs(p0 - q0) * 2
.if \wd >= 8
umax v6.16b, v6.16b, v7.16b
.endif
ushr v3.16b, v3.16b, #1
.if \wd >= 8
umax v4.16b, v4.16b, v6.16b
.endif
.if \wd >= 6
and v4.16b, v4.16b, v14.16b
.endif
umax v0.16b, v0.16b, v1.16b // max(abs(p1 - p0), abs(q1 - q0))
uqadd v2.16b, v2.16b, v3.16b // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1
.if \wd >= 6
umax v4.16b, v0.16b, v4.16b
cmhs v1.16b, v11.16b, v4.16b // max(abs(p1 - p0), abs(q1 - q0), abs(p2 - p1), abs(q2 - q1), ...) <= I
.else
cmhs v1.16b, v11.16b, v0.16b // max(abs(p1 - p0), abs(q1 - q0)) <= I
.endif
cmhs v2.16b, v10.16b, v2.16b // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1 <= E
and v1.16b, v1.16b, v2.16b // fm
and v1.16b, v1.16b, v13.16b // fm && wd >= 4
.if \wd >= 6
and v14.16b, v14.16b, v1.16b // fm && wd > 4
.endif
.if \wd == 16
and v15.16b, v15.16b, v1.16b // fm && wd == 16
.endif
mov x16, v1.d[0]
mov x17, v1.d[1]
adds x16, x16, x17
b.eq 9f // if (!fm || wd < 4) return;
.if \wd >= 6
movi v10.16b, #1
uabd v2.16b, v21.16b, v23.16b // abs(p2 - p0)
uabd v3.16b, v22.16b, v23.16b // abs(p1 - p0)
uabd v4.16b, v25.16b, v24.16b // abs(q1 - q0)
uabd v5.16b, v26.16b, v24.16b // abs(q2 - q0)
.if \wd >= 8
uabd v6.16b, v20.16b, v23.16b // abs(p3 - p0)
uabd v7.16b, v27.16b, v24.16b // abs(q3 - q0)
.endif
umax v2.16b, v2.16b, v3.16b
umax v4.16b, v4.16b, v5.16b
.if \wd >= 8
umax v6.16b, v6.16b, v7.16b
.endif
umax v2.16b, v2.16b, v4.16b
.if \wd >= 8
umax v2.16b, v2.16b, v6.16b
.endif
.if \wd == 16
uabd v3.16b, v17.16b, v23.16b // abs(p6 - p0)
uabd v4.16b, v18.16b, v23.16b // abs(p5 - p0)
uabd v5.16b, v19.16b, v23.16b // abs(p4 - p0)
.endif
cmhs v2.16b, v10.16b, v2.16b // flat8in
.if \wd == 16
uabd v6.16b, v28.16b, v24.16b // abs(q4 - q0)
uabd v7.16b, v29.16b, v24.16b // abs(q5 - q0)
uabd v8.16b, v30.16b, v24.16b // abs(q6 - q0)
.endif
and v14.16b, v2.16b, v14.16b // flat8in && fm && wd > 4
bic v1.16b, v1.16b, v14.16b // fm && wd >= 4 && !flat8in
.if \wd == 16
umax v3.16b, v3.16b, v4.16b
umax v5.16b, v5.16b, v6.16b
.endif
mov x16, v1.d[0]
mov x17, v1.d[1]
.if \wd == 16
umax v7.16b, v7.16b, v8.16b
umax v3.16b, v3.16b, v5.16b
umax v3.16b, v3.16b, v7.16b
cmhs v3.16b, v10.16b, v3.16b // flat8out
.endif
adds x16, x16, x17
.if \wd == 16
and v15.16b, v15.16b, v3.16b // flat8out && fm && wd == 16
and v15.16b, v15.16b, v14.16b // flat8out && flat8in && fm && wd == 16
bic v14.16b, v14.16b, v15.16b // flat8in && fm && wd >= 4 && !flat8out
.endif
b.eq 1f // skip wd == 4 case
.endif
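// Narrow filter, applied wherever fm is set (and flat8in is not):
// f  = iclip_diff(3 * (q0 - p0) + (hev ? iclip_diff(p1 - q1) : 0))
// f1 = imin(f + 4, 127) >> 3,  f2 = imin(f + 3, 127) >> 3
// p0 += f2, q0 -= f1, and where hev is not set additionally
// p1 += (f1 + 1) >> 1, q1 -= (f1 + 1) >> 1.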
usubl v2.8h, v22.8b, v25.8b // p1 - q1
usubl2 v3.8h, v22.16b, v25.16b
cmhi v0.16b, v0.16b, v12.16b // hev
sqxtn v2.8b, v2.8h // iclip_diff(p1 - q1)
sqxtn2 v2.16b, v3.8h
and v4.16b, v2.16b, v0.16b // if (hev) iclip_diff(p1 - q1)
bic v0.16b, v1.16b, v0.16b // (fm && wd >= 4 && !hev)
usubl v2.8h, v24.8b, v23.8b
movi v5.8h, #3
usubl2 v3.8h, v24.16b, v23.16b
mul v2.8h, v2.8h, v5.8h
mul v3.8h, v3.8h, v5.8h
movi v6.16b, #4
saddw v2.8h, v2.8h, v4.8b
saddw2 v3.8h, v3.8h, v4.16b
movi v7.16b, #3
sqxtn v2.8b, v2.8h // f
sqxtn2 v2.16b, v3.8h
sqadd v4.16b, v6.16b, v2.16b // imin(f + 4, 127)
sqadd v5.16b, v7.16b, v2.16b // imin(f + 3, 127)
sshr v4.16b, v4.16b, #3 // f1
sshr v5.16b, v5.16b, #3 // f2
uxtl v2.8h, v23.8b // p0
uxtl2 v3.8h, v23.16b
uxtl v6.8h, v24.8b // q0
uxtl2 v7.8h, v24.16b
saddw v2.8h, v2.8h, v5.8b
saddw2 v3.8h, v3.8h, v5.16b
ssubw v6.8h, v6.8h, v4.8b
ssubw2 v7.8h, v7.8h, v4.16b
srshr v4.16b, v4.16b, #1 // (f1 + 1) >> 1
sqxtun v2.8b, v2.8h // out p0
sqxtun2 v2.16b, v3.8h
sqxtun v6.8b, v6.8h // out q0
sqxtun2 v6.16b, v7.8h
bit v23.16b, v2.16b, v1.16b // if (fm && wd >= 4)
uxtl v2.8h, v22.8b // p1
uxtl2 v3.8h, v22.16b
bit v24.16b, v6.16b, v1.16b // if (fm && wd >= 4)
uxtl v6.8h, v25.8b // q1
uxtl2 v7.8h, v25.16b
saddw v2.8h, v2.8h, v4.8b
saddw2 v3.8h, v3.8h, v4.16b
ssubw v6.8h, v6.8h, v4.8b
ssubw2 v7.8h, v7.8h, v4.16b
sqxtun v2.8b, v2.8h // out p1
sqxtun2 v2.16b, v3.8h
sqxtun v6.8b, v6.8h // out q1
sqxtun2 v6.16b, v7.8h
bit v22.16b, v2.16b, v0.16b // if (fm && wd >= 4 && !hev)
bit v25.16b, v6.16b, v0.16b // if (fm && wd >= 4 && !hev)
1:
.if \wd == 6
mov x16, v14.d[0]
mov x17, v14.d[1]
adds x16, x16, x17
b.eq 2f // skip if there's no flat8in
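// flat8in path for wd == 6: weighted averages over the p2..q2 window, e.g.
// out p1 = (p2 * 3 + p1 * 2 + p0 * 2 + q0 + 4) >> 3, with each following
// output computed as a sliding sum (add the incoming pixel pair, subtract
// the outgoing one).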
uaddl v0.8h, v21.8b, v21.8b // p2 * 2
uaddl2 v1.8h, v21.16b, v21.16b
uaddl v2.8h, v21.8b, v22.8b // p2 + p1
uaddl2 v3.8h, v21.16b, v22.16b
uaddl v4.8h, v22.8b, v23.8b // p1 + p0
uaddl2 v5.8h, v22.16b, v23.16b
uaddl v6.8h, v23.8b, v24.8b // p0 + q0
uaddl2 v7.8h, v23.16b, v24.16b
add v8.8h, v0.8h, v2.8h
add v9.8h, v1.8h, v3.8h
add v10.8h, v4.8h, v6.8h
add v11.8h, v5.8h, v7.8h
uaddl v12.8h, v24.8b, v25.8b // q0 + q1
uaddl2 v13.8h, v24.16b, v25.16b
add v8.8h, v8.8h, v10.8h
add v9.8h, v9.8h, v11.8h
sub v12.8h, v12.8h, v0.8h
sub v13.8h, v13.8h, v1.8h
uaddl v10.8h, v25.8b, v26.8b // q1 + q2
uaddl2 v11.8h, v25.16b, v26.16b
rshrn v0.8b, v8.8h, #3 // out p1
rshrn2 v0.16b, v9.8h, #3
add v8.8h, v8.8h, v12.8h
add v9.8h, v9.8h, v13.8h
sub v10.8h, v10.8h, v2.8h
sub v11.8h, v11.8h, v3.8h
uaddl v12.8h, v26.8b, v26.8b // q2 + q2
uaddl2 v13.8h, v26.16b, v26.16b
rshrn v1.8b, v8.8h, #3 // out p0
rshrn2 v1.16b, v9.8h, #3
add v8.8h, v8.8h, v10.8h
add v9.8h, v9.8h, v11.8h
sub v12.8h, v12.8h, v4.8h
sub v13.8h, v13.8h, v5.8h
rshrn v2.8b, v8.8h, #3 // out q0
rshrn2 v2.16b, v9.8h, #3
bit v22.16b, v0.16b, v14.16b // p1 if (flat8in)
add v8.8h, v8.8h, v12.8h
add v9.8h, v9.8h, v13.8h
bit v23.16b, v1.16b, v14.16b // p0 if (flat8in)
rshrn v3.8b, v8.8h, #3 // out q1
rshrn2 v3.16b, v9.8h, #3
bit v24.16b, v2.16b, v14.16b // q0 if (flat8in)
bit v25.16b, v3.16b, v14.16b // q1 if (flat8in)
.elseif \wd >= 8
mov x16, v14.d[0]
mov x17, v14.d[1]
adds x16, x16, x17
.if \wd == 8
b.eq 8f // skip if there's no flat8in
.else
b.eq 2f // skip if there's no flat8in
.endif
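// flat8in path for wd >= 8: weighted averages over the p3..q3 window, e.g.
// out p2 = (p3 * 3 + p2 * 2 + p1 + p0 + q0 + 4) >> 3, again updated as a
// sliding sum from one output pixel to the next.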
uaddl v0.8h, v20.8b, v21.8b // p3 + p2
uaddl2 v1.8h, v20.16b, v21.16b
uaddl v2.8h, v22.8b, v25.8b // p1 + q1
uaddl2 v3.8h, v22.16b, v25.16b
uaddl v4.8h, v20.8b, v22.8b // p3 + p1
uaddl2 v5.8h, v20.16b, v22.16b
uaddl v6.8h, v23.8b, v26.8b // p0 + q2
uaddl2 v7.8h, v23.16b, v26.16b
add v8.8h, v0.8h, v0.8h // 2 * (p3 + p2)
add v9.8h, v1.8h, v1.8h
uaddw v8.8h, v8.8h, v23.8b // + p0
uaddw2 v9.8h, v9.8h, v23.16b
uaddw v8.8h, v8.8h, v24.8b // + q0
uaddw2 v9.8h, v9.8h, v24.16b
add v8.8h, v8.8h, v4.8h
add v9.8h, v9.8h, v5.8h // + p3 + p1
sub v2.8h, v2.8h, v0.8h // p1 + q1 - p3 - p2
sub v3.8h, v3.8h, v1.8h
sub v6.8h, v6.8h, v4.8h // p0 + q2 - p3 - p1
sub v7.8h, v7.8h, v5.8h
rshrn v10.8b, v8.8h, #3 // out p2
rshrn2 v10.16b, v9.8h, #3
add v8.8h, v8.8h, v2.8h
add v9.8h, v9.8h, v3.8h
uaddl v0.8h, v20.8b, v23.8b // p3 + p0
uaddl2 v1.8h, v20.16b, v23.16b
uaddl v2.8h, v24.8b, v27.8b // q0 + q3
uaddl2 v3.8h, v24.16b, v27.16b
rshrn v11.8b, v8.8h, #3 // out p1
rshrn2 v11.16b, v9.8h, #3
add v8.8h, v8.8h, v6.8h
add v9.8h, v9.8h, v7.8h
sub v2.8h, v2.8h, v0.8h // q0 + q3 - p3 - p0
sub v3.8h, v3.8h, v1.8h
uaddl v4.8h, v21.8b, v24.8b // p2 + q0
uaddl2 v5.8h, v21.16b, v24.16b
uaddl v6.8h, v25.8b, v27.8b // q1 + q3
uaddl2 v7.8h, v25.16b, v27.16b
rshrn v12.8b, v8.8h, #3 // out p0
rshrn2 v12.16b, v9.8h, #3
add v8.8h, v8.8h, v2.8h
add v9.8h, v9.8h, v3.8h
sub v6.8h, v6.8h, v4.8h // q1 + q3 - p2 - q0
sub v7.8h, v7.8h, v5.8h
uaddl v0.8h, v22.8b, v25.8b // p1 + q1
uaddl2 v1.8h, v22.16b, v25.16b
uaddl v2.8h, v26.8b, v27.8b // q2 + q3
uaddl2 v3.8h, v26.16b, v27.16b
rshrn v13.8b, v8.8h, #3 // out q0
rshrn2 v13.16b, v9.8h, #3
add v8.8h, v8.8h, v6.8h
add v9.8h, v9.8h, v7.8h
sub v2.8h, v2.8h, v0.8h // q2 + q3 - p1 - q1
sub v3.8h, v3.8h, v1.8h
rshrn v0.8b, v8.8h, #3 // out q1
rshrn2 v0.16b, v9.8h, #3
add v8.8h, v8.8h, v2.8h
add v9.8h, v9.8h, v3.8h
bit v21.16b, v10.16b, v14.16b
bit v22.16b, v11.16b, v14.16b
bit v23.16b, v12.16b, v14.16b
rshrn v1.8b, v8.8h, #3 // out q2
rshrn2 v1.16b, v9.8h, #3
bit v24.16b, v13.16b, v14.16b
bit v25.16b, v0.16b, v14.16b
bit v26.16b, v1.16b, v14.16b
.endif
2:
.if \wd == 16
mov x16, v15.d[0]
mov x17, v15.d[1]
adds x16, x16, x17
b.ne 1f // check if flat8out is needed
mov x16, v14.d[0]
mov x17, v14.d[1]
adds x16, x16, x17
b.eq 8f // if there was no flat8in, just write the inner 4 pixels
b 7f // if flat8in was used, write the inner 6 pixels
1:
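// flat8out path for wd == 16: weighted averages over the p6..q6 window,
// starting from
// out p5 = (p6 * 7 + p5 * 2 + p4 * 2 + p3 + p2 + p1 + p0 + q0 + 8) >> 4
// and maintained as a sliding sum (drop the outgoing pixel pair, add the
// incoming one) for the remaining outputs down to q5.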
uaddl v2.8h, v17.8b, v17.8b // p6 + p6
uaddl2 v3.8h, v17.16b, v17.16b
uaddl v4.8h, v17.8b, v18.8b // p6 + p5
uaddl2 v5.8h, v17.16b, v18.16b
uaddl v6.8h, v17.8b, v19.8b // p6 + p4
uaddl2 v7.8h, v17.16b, v19.16b
uaddl v8.8h, v17.8b, v20.8b // p6 + p3
uaddl2 v9.8h, v17.16b, v20.16b
add v12.8h, v2.8h, v4.8h
add v13.8h, v3.8h, v5.8h
add v10.8h, v6.8h, v8.8h
add v11.8h, v7.8h, v9.8h
uaddl v6.8h, v17.8b, v21.8b // p6 + p2
uaddl2 v7.8h, v17.16b, v21.16b
add v12.8h, v12.8h, v10.8h
add v13.8h, v13.8h, v11.8h
uaddl v8.8h, v17.8b, v22.8b // p6 + p1
uaddl2 v9.8h, v17.16b, v22.16b
uaddl v10.8h, v18.8b, v23.8b // p5 + p0
uaddl2 v11.8h, v18.16b, v23.16b
add v6.8h, v6.8h, v8.8h
add v7.8h, v7.8h, v9.8h
uaddl v8.8h, v19.8b, v24.8b // p4 + q0
uaddl2 v9.8h, v19.16b, v24.16b
add v12.8h, v12.8h, v6.8h
add v13.8h, v13.8h, v7.8h
add v10.8h, v10.8h, v8.8h
add v11.8h, v11.8h, v9.8h
uaddl v6.8h, v20.8b, v25.8b // p3 + q1
uaddl2 v7.8h, v20.16b, v25.16b
add v12.8h, v12.8h, v10.8h
add v13.8h, v13.8h, v11.8h
sub v6.8h, v6.8h, v2.8h
sub v7.8h, v7.8h, v3.8h
uaddl v2.8h, v21.8b, v26.8b // p2 + q2
uaddl2 v3.8h, v21.16b, v26.16b
rshrn v0.8b, v12.8h, #4 // out p5
rshrn2 v0.16b, v13.8h, #4
add v12.8h, v12.8h, v6.8h // - (p6 + p6) + (p3 + q1)
add v13.8h, v13.8h, v7.8h
sub v2.8h, v2.8h, v4.8h
sub v3.8h, v3.8h, v5.8h
uaddl v4.8h, v22.8b, v27.8b // p1 + q3
uaddl2 v5.8h, v22.16b, v27.16b
uaddl v6.8h, v17.8b, v19.8b // p6 + p4
uaddl2 v7.8h, v17.16b, v19.16b
rshrn v1.8b, v12.8h, #4 // out p4
rshrn2 v1.16b, v13.8h, #4
add v12.8h, v12.8h, v2.8h // - (p6 + p5) + (p2 + q2)
add v13.8h, v13.8h, v3.8h
sub v4.8h, v4.8h, v6.8h
sub v5.8h, v5.8h, v7.8h
uaddl v6.8h, v23.8b, v28.8b // p0 + q4
uaddl2 v7.8h, v23.16b, v28.16b
uaddl v8.8h, v17.8b, v20.8b // p6 + p3
uaddl2 v9.8h, v17.16b, v20.16b
rshrn v2.8b, v12.8h, #4 // out p3
rshrn2 v2.16b, v13.8h, #4
add v12.8h, v12.8h, v4.8h // - (p6 + p4) + (p1 + q3)
add v13.8h, v13.8h, v5.8h
sub v6.8h, v6.8h, v8.8h
sub v7.8h, v7.8h, v9.8h
uaddl v8.8h, v24.8b, v29.8b // q0 + q5
uaddl2 v9.8h, v24.16b, v29.16b
uaddl v4.8h, v17.8b, v21.8b // p6 + p2
uaddl2 v5.8h, v17.16b, v21.16b
rshrn v3.8b, v12.8h, #4 // out p2
rshrn2 v3.16b, v13.8h, #4
add v12.8h, v12.8h, v6.8h // - (p6 + p3) + (p0 + q4)
add v13.8h, v13.8h, v7.8h
sub v8.8h, v8.8h, v4.8h
sub v9.8h, v9.8h, v5.8h
uaddl v6.8h, v25.8b, v30.8b // q1 + q6
uaddl2 v7.8h, v25.16b, v30.16b
uaddl v10.8h, v17.8b, v22.8b // p6 + p1
uaddl2 v11.8h, v17.16b, v22.16b
rshrn v4.8b, v12.8h, #4 // out p1
rshrn2 v4.16b, v13.8h, #4
add v12.8h, v12.8h, v8.8h // - (p6 + p2) + (q0 + q5)
add v13.8h, v13.8h, v9.8h
sub v6.8h, v6.8h, v10.8h
sub v7.8h, v7.8h, v11.8h
uaddl v8.8h, v26.8b, v30.8b // q2 + q6
uaddl2 v9.8h, v26.16b, v30.16b
uaddl v10.8h, v18.8b, v23.8b // p5 + p0
uaddl2 v11.8h, v18.16b, v23.16b
rshrn v5.8b, v12.8h, #4 // out p0
rshrn2 v5.16b, v13.8h, #4
add v12.8h, v12.8h, v6.8h // - (p6 + p1) + (q1 + q6)
add v13.8h, v13.8h, v7.8h
sub v8.8h, v8.8h, v10.8h
sub v9.8h, v9.8h, v11.8h
uaddl v10.8h, v27.8b, v30.8b // q3 + q6
uaddl2 v11.8h, v27.16b, v30.16b
bif v0.16b, v18.16b, v15.16b // out p5
uaddl v14.8h, v19.8b, v24.8b // p4 + q0
uaddl2 v18.8h, v19.16b, v24.16b
rshrn v6.8b, v12.8h, #4 // out q0
rshrn2 v6.16b, v13.8h, #4
add v12.8h, v12.8h, v8.8h // - (p5 + p0) + (q2 + q6)
add v13.8h, v13.8h, v9.8h
sub v10.8h, v10.8h, v14.8h
sub v11.8h, v11.8h, v18.8h
uaddl v14.8h, v28.8b, v30.8b // q4 + q6
uaddl2 v18.8h, v28.16b, v30.16b
bif v1.16b, v19.16b, v15.16b // out p4
uaddl v8.8h, v20.8b, v25.8b // p3 + q1
uaddl2 v9.8h, v20.16b, v25.16b
rshrn v7.8b, v12.8h, #4 // out q1
rshrn2 v7.16b, v13.8h, #4
add v12.8h, v12.8h, v10.8h // - (p4 + q0) + (q3 + q6)
add v13.8h, v13.8h, v11.8h
sub v14.8h, v14.8h, v8.8h
sub v18.8h, v18.8h, v9.8h
uaddl v10.8h, v29.8b, v30.8b // q5 + q6
uaddl2 v11.8h, v29.16b, v30.16b
bif v2.16b, v20.16b, v15.16b // out p3
uaddl v19.8h, v21.8b, v26.8b // p2 + q2
uaddl2 v20.8h, v21.16b, v26.16b
rshrn v8.8b, v12.8h, #4 // out q2
rshrn2 v8.16b, v13.8h, #4
add v12.8h, v12.8h, v14.8h // - (p3 + q1) + (q4 + q6)
add v13.8h, v13.8h, v18.8h
sub v10.8h, v10.8h, v19.8h
sub v11.8h, v11.8h, v20.8h
uaddl v14.8h, v30.8b, v30.8b // q6 + q6
uaddl2 v18.8h, v30.16b, v30.16b
bif v3.16b, v21.16b, v15.16b // out p2
uaddl v19.8h, v22.8b, v27.8b // p1 + q3
uaddl2 v20.8h, v22.16b, v27.16b
rshrn v9.8b, v12.8h, #4 // out q3
rshrn2 v9.16b, v13.8h, #4
add v12.8h, v12.8h, v10.8h // - (p2 + q2) + (q5 + q6)
add v13.8h, v13.8h, v11.8h
sub v14.8h, v14.8h, v19.8h
sub v18.8h, v18.8h, v20.8h
bif v4.16b, v22.16b, v15.16b // out p1
rshrn v10.8b, v12.8h, #4 // out q4
rshrn2 v10.16b, v13.8h, #4
add v12.8h, v12.8h, v14.8h // - (p1 + q3) + (q6 + q6)
add v13.8h, v13.8h, v18.8h
rshrn v11.8b, v12.8h, #4 // out q5
rshrn2 v11.16b, v13.8h, #4
bif v5.16b, v23.16b, v15.16b // out p0
bif v6.16b, v24.16b, v15.16b // out q0
bif v7.16b, v25.16b, v15.16b // out q1
bif v8.16b, v26.16b, v15.16b // out q2
bif v9.16b, v27.16b, v15.16b // out q3
bif v10.16b, v28.16b, v15.16b // out q4
bif v11.16b, v29.16b, v15.16b // out q5
.endif
ret
.if \wd == 16
7:
// Return to a shorter epilogue, writing only the inner 6 pixels
br x13
.endif
.if \wd >= 8
8:
// Return to a shorter epilogue, writing only the inner 4 pixels
br x14
.endif
9:
// Return directly without writing back any pixels
br x15
endfunc
.endm
loop_filter 16
loop_filter 8
loop_filter 6
loop_filter 4
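// Helper macros for calling the shared filter: for the wider filters they
// first stash the addresses of the local 7:/8: labels in x13/x14, so that
// lpf_16_wdN_neon can return straight into a shorter store epilogue when
// only the inner 6 or 4 pixels were changed.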
.macro lpf_16_wd16
adr x13, 7f
adr x14, 8f
bl lpf_16_wd16_neon
.endm
.macro lpf_16_wd8
adr x14, 8f
bl lpf_16_wd8_neon
.endm
.macro lpf_16_wd6
bl lpf_16_wd6_neon
.endm
.macro lpf_16_wd4
bl lpf_16_wd4_neon
.endm
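// Per-edge wrappers: the lpf_v_* functions load and store whole 16-byte rows
// above and below a horizontal edge, while the lpf_h_* functions gather 4 or
// 8 pixels from each of 16 rows around a vertical edge, transpose them into
// rows, filter, and transpose back. x15 preserves the wrapper's return
// address across the bl into the filter core.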
function lpf_v_4_16_neon
mov x15, x30
sub x16, x0, x1, lsl #1
ld1 {v22.16b}, [x16], x1 // p1
ld1 {v24.16b}, [x0], x1 // q0
ld1 {v23.16b}, [x16], x1 // p0
ld1 {v25.16b}, [x0], x1 // q1
sub x0, x0, x1, lsl #1
lpf_16_wd4
sub x16, x0, x1, lsl #1
st1 {v22.16b}, [x16], x1 // p1
st1 {v24.16b}, [x0], x1 // q0
st1 {v23.16b}, [x16], x1 // p0
st1 {v25.16b}, [x0], x1 // q1
sub x0, x0, x1, lsl #1
br x15
endfunc
function lpf_h_4_16_neon
mov x15, x30
sub x16, x0, #2
add x0, x16, x1, lsl #3
ld1 {v22.s}[0], [x16], x1
ld1 {v22.s}[2], [x0], x1
ld1 {v23.s}[0], [x16], x1
ld1 {v23.s}[2], [x0], x1
ld1 {v24.s}[0], [x16], x1
ld1 {v24.s}[2], [x0], x1
ld1 {v25.s}[0], [x16], x1
ld1 {v25.s}[2], [x0], x1
ld1 {v22.s}[1], [x16], x1
ld1 {v22.s}[3], [x0], x1
ld1 {v23.s}[1], [x16], x1
ld1 {v23.s}[3], [x0], x1
ld1 {v24.s}[1], [x16], x1
ld1 {v24.s}[3], [x0], x1
ld1 {v25.s}[1], [x16], x1
ld1 {v25.s}[3], [x0], x1
add x0, x0, #2
transpose_4x16b v22, v23, v24, v25, v26, v27, v28, v29
lpf_16_wd4
sub x16, x0, x1, lsl #4
sub x16, x16, #2
transpose_4x16b v22, v23, v24, v25, v26, v27, v28, v29
add x0, x16, x1, lsl #3
st1 {v22.s}[0], [x16], x1
st1 {v22.s}[2], [x0], x1
st1 {v23.s}[0], [x16], x1
st1 {v23.s}[2], [x0], x1
st1 {v24.s}[0], [x16], x1
st1 {v24.s}[2], [x0], x1
st1 {v25.s}[0], [x16], x1
st1 {v25.s}[2], [x0], x1
st1 {v22.s}[1], [x16], x1
st1 {v22.s}[3], [x0], x1
st1 {v23.s}[1], [x16], x1
st1 {v23.s}[3], [x0], x1
st1 {v24.s}[1], [x16], x1
st1 {v24.s}[3], [x0], x1
st1 {v25.s}[1], [x16], x1
st1 {v25.s}[3], [x0], x1
add x0, x0, #2
br x15
endfunc
function lpf_v_6_16_neon
mov x15, x30
sub x16, x0, x1, lsl #1
sub x16, x16, x1
ld1 {v21.16b}, [x16], x1 // p2
ld1 {v24.16b}, [x0], x1 // q0
ld1 {v22.16b}, [x16], x1 // p1
ld1 {v25.16b}, [x0], x1 // q1
ld1 {v23.16b}, [x16], x1 // p0
ld1 {v26.16b}, [x0], x1 // q2
sub x0, x0, x1, lsl #1
sub x0, x0, x1
lpf_16_wd6
sub x16, x0, x1, lsl #1
st1 {v22.16b}, [x16], x1 // p1
st1 {v24.16b}, [x0], x1 // q0
st1 {v23.16b}, [x16], x1 // p0
st1 {v25.16b}, [x0], x1 // q1
sub x0, x0, x1, lsl #1
br x15
endfunc
function lpf_h_6_16_neon
mov x15, x30
sub x16, x0, #4
add x0, x16, x1, lsl #3
ld1 {v20.d}[0], [x16], x1
ld1 {v20.d}[1], [x0], x1
ld1 {v21.d}[0], [x16], x1
ld1 {v21.d}[1], [x0], x1
ld1 {v22.d}[0], [x16], x1
ld1 {v22.d}[1], [x0], x1
ld1 {v23.d}[0], [x16], x1
ld1 {v23.d}[1], [x0], x1
ld1 {v24.d}[0], [x16], x1
ld1 {v24.d}[1], [x0], x1
ld1 {v25.d}[0], [x16], x1
ld1 {v25.d}[1], [x0], x1
ld1 {v26.d}[0], [x16], x1
ld1 {v26.d}[1], [x0], x1
ld1 {v27.d}[0], [x16], x1
ld1 {v27.d}[1], [x0], x1
add x0, x0, #4
transpose_8x16b v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
lpf_16_wd6
sub x16, x0, x1, lsl #4
sub x16, x16, #2
transpose_4x16b v22, v23, v24, v25, v26, v27, v28, v29
add x0, x16, x1, lsl #3
st1 {v22.s}[0], [x16], x1
st1 {v22.s}[2], [x0], x1
st1 {v23.s}[0], [x16], x1
st1 {v23.s}[2], [x0], x1
st1 {v24.s}[0], [x16], x1
st1 {v24.s}[2], [x0], x1
st1 {v25.s}[0], [x16], x1
st1 {v25.s}[2], [x0], x1
st1 {v22.s}[1], [x16], x1
st1 {v22.s}[3], [x0], x1
st1 {v23.s}[1], [x16], x1
st1 {v23.s}[3], [x0], x1
st1 {v24.s}[1], [x16], x1
st1 {v24.s}[3], [x0], x1
st1 {v25.s}[1], [x16], x1
st1 {v25.s}[3], [x0], x1
add x0, x0, #2
br x15
endfunc
function lpf_v_8_16_neon
mov x15, x30
sub x16, x0, x1, lsl #2
ld1 {v20.16b}, [x16], x1 // p3
ld1 {v24.16b}, [x0], x1 // q0
ld1 {v21.16b}, [x16], x1 // p2
ld1 {v25.16b}, [x0], x1 // q1
ld1 {v22.16b}, [x16], x1 // p1
ld1 {v26.16b}, [x0], x1 // q2
ld1 {v23.16b}, [x16], x1 // p0
ld1 {v27.16b}, [x0], x1 // q3
sub x0, x0, x1, lsl #2
lpf_16_wd8
sub x16, x0, x1, lsl #1
sub x16, x16, x1
st1 {v21.16b}, [x16], x1 // p2
st1 {v24.16b}, [x0], x1 // q0
st1 {v22.16b}, [x16], x1 // p1
st1 {v25.16b}, [x0], x1 // q1
st1 {v23.16b}, [x16], x1 // p0
st1 {v26.16b}, [x0], x1 // q2
sub x0, x0, x1, lsl #1
sub x0, x0, x1
br x15
8:
sub x16, x0, x1, lsl #1
st1 {v22.16b}, [x16], x1 // p1
st1 {v24.16b}, [x0], x1 // q0
st1 {v23.16b}, [x16], x1 // p0
st1 {v25.16b}, [x0], x1 // q1
sub x0, x0, x1, lsl #1
br x15
endfunc
function lpf_h_8_16_neon
mov x15, x30
sub x16, x0, #4
add x0, x16, x1, lsl #3
ld1 {v20.d}[0], [x16], x1
ld1 {v20.d}[1], [x0], x1
ld1 {v21.d}[0], [x16], x1
ld1 {v21.d}[1], [x0], x1
ld1 {v22.d}[0], [x16], x1
ld1 {v22.d}[1], [x0], x1
ld1 {v23.d}[0], [x16], x1
ld1 {v23.d}[1], [x0], x1
ld1 {v24.d}[0], [x16], x1
ld1 {v24.d}[1], [x0], x1
ld1 {v25.d}[0], [x16], x1
ld1 {v25.d}[1], [x0], x1
ld1 {v26.d}[0], [x16], x1
ld1 {v26.d}[1], [x0], x1
ld1 {v27.d}[0], [x16], x1
ld1 {v27.d}[1], [x0], x1
add x0, x0, #4
transpose_8x16b v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
lpf_16_wd8
sub x16, x0, x1, lsl #4
sub x16, x16, #4
transpose_8x16b v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
add x0, x16, x1, lsl #3
st1 {v20.d}[0], [x16], x1
st1 {v20.d}[1], [x0], x1
st1 {v21.d}[0], [x16], x1
st1 {v21.d}[1], [x0], x1
st1 {v22.d}[0], [x16], x1
st1 {v22.d}[1], [x0], x1
st1 {v23.d}[0], [x16], x1
st1 {v23.d}[1], [x0], x1
st1 {v24.d}[0], [x16], x1
st1 {v24.d}[1], [x0], x1
st1 {v25.d}[0], [x16], x1
st1 {v25.d}[1], [x0], x1
st1 {v26.d}[0], [x16], x1
st1 {v26.d}[1], [x0], x1
st1 {v27.d}[0], [x16], x1
st1 {v27.d}[1], [x0], x1
add x0, x0, #4
br x15
8:
sub x16, x0, x1, lsl #4
sub x16, x16, #2
transpose_4x16b v22, v23, v24, v25, v26, v27, v28, v29
add x0, x16, x1, lsl #3
st1 {v22.s}[0], [x16], x1
st1 {v22.s}[2], [x0], x1
st1 {v23.s}[0], [x16], x1
st1 {v23.s}[2], [x0], x1
st1 {v24.s}[0], [x16], x1
st1 {v24.s}[2], [x0], x1
st1 {v25.s}[0], [x16], x1
st1 {v25.s}[2], [x0], x1