Commit 0282f6f3 authored by Martin Storsjö

arm64: loopfilter: Implement NEON loop filters

The exact relative speedup compared to the C code is somewhat vague and
hard to measure, as it depends on exactly how many filtered blocks are
skipped: the NEON version always filters 16 pixels at a time, while the
C code can skip processing individual 4 pixel blocks.

Additionally, the checkasm benchmarking code runs the same function
repeatedly on the same buffer, which can make the filter take different
codepaths on each run, since the function updates the buffer that is
then used as input for the next run.
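
As an illustration only (this is not checkasm's actual harness; the
buffer size, the function pointer type and the helper names below are
made up), the difference between timing such an in-place filter
back-to-back and restoring a pristine copy of the input before every
call looks roughly like this:

  #include <stddef.h>
  #include <stdint.h>
  #include <string.h>

  enum { W = 128, H = 128 };

  typedef void (*lpf_fn)(uint8_t *buf, ptrdiff_t stride);

  /* Each call modifies buf, so run N+1 operates on the output of run N
   * and may take a different (often cheaper) codepath. */
  static void bench_in_place(lpf_fn fn, uint8_t *buf) {
      for (int i = 0; i < 1000; i++)
          fn(buf, W);
  }

  /* Restoring the untouched input before each call keeps every
   * iteration on the same codepath, at the cost of timing the memcpy
   * as well. */
  static void bench_restored(lpf_fn fn, uint8_t *buf, const uint8_t *pristine) {
      for (int i = 0; i < 1000; i++) {
          memcpy(buf, pristine, (size_t)W * H);
          fn(buf, W);
      }
  }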

If the checkasm test data is tweaked to try to avoid skipped blocks,
the relative speedups compared to the C code are between 2x and 5x,
while they are around 1x to 4x with the current checkasm test as such.

Benchmark numbers from a tweaked checkasm that avoids skipped
blocks:

                        Cortex A53     A72     A73
lpf_h_sb_uv_w4_8bpc_c:      2954.7  1399.3  1655.3
lpf_h_sb_uv_w4_8bpc_neon:    895.5   650.8   692.0
lpf_h_sb_uv_w6_8bpc_c:      3879.2  1917.2  2257.7
lpf_h_sb_uv_w6_8bpc_neon:   1125.6   759.5   838.4
lpf_h_sb_y_w4_8bpc_c:       6711.0  3275.5  3913.7
lpf_h_sb_y_w4_8bpc_neon:    1744.0  1342.1  1351.5
lpf_h_sb_y_w8_8bpc_c:      10695.7  6155.8  6638.9
lpf_h_sb_y_w8_8bpc_neon:    2146.5  1560.4  1609.1
lpf_h_sb_y_w16_8bpc_c:     11355.8  6292.0  6995.9
lpf_h_sb_y_w16_8bpc_neon:   2475.4  1949.6  1968.4
lpf_v_sb_uv_w4_8bpc_c:      2639.7  1204.8  1425.9
lpf_v_sb_uv_w4_8bpc_neon:    510.7   351.4   334.7
lpf_v_sb_uv_w6_8bpc_c:      3468.3  1757.1  2021.5
lpf_v_sb_uv_w6_8bpc_neon:    625.0   415.0   397.8
lpf_v_sb_y_w4_8bpc_c:       5428.7  2731.7  3068.5
lpf_v_sb_y_w4_8bpc_neon:    1172.6   792.1   768.0
lpf_v_sb_y_w8_8bpc_c:       8946.1  4412.8  5121.0
lpf_v_sb_y_w8_8bpc_neon:    1565.5  1063.6  1062.7
lpf_v_sb_y_w16_8bpc_c:      8978.9  4411.7  5112.0
lpf_v_sb_y_w16_8bpc_neon:   1775.0  1288.1  1236.7
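
For example, on the Cortex A53 the horizontal w8 luma filter drops from
10695.7 to 2146.5 cycles, roughly a 5.0x speedup, while the same case on
the A72 goes from 6155.8 to 1560.4 cycles, about a 3.9x speedup.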
parent 204bf211
Pipeline #6334 passed with stages in 7 minutes and 25 seconds
@@ -88,4 +88,45 @@
        trn2            \r7\().2s, \r9\().2s, \r7\().2s
.endm
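
// Transpose an 8x16 byte matrix held in r0-r7, using r8 and r9 as
// scratch registers; afterwards register n holds column n of the input
// in its low 8 bytes and column n+8 in its high 8 bytes.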
.macro transpose_8x16b r0, r1, r2, r3, r4, r5, r6, r7, r8, r9
        trn1            \r8\().16b, \r0\().16b, \r1\().16b
        trn2            \r9\().16b, \r0\().16b, \r1\().16b
        trn1            \r1\().16b, \r2\().16b, \r3\().16b
        trn2            \r3\().16b, \r2\().16b, \r3\().16b
        trn1            \r0\().16b, \r4\().16b, \r5\().16b
        trn2            \r5\().16b, \r4\().16b, \r5\().16b
        trn1            \r2\().16b, \r6\().16b, \r7\().16b
        trn2            \r7\().16b, \r6\().16b, \r7\().16b
        trn1            \r4\().8h, \r0\().8h, \r2\().8h
        trn2            \r2\().8h, \r0\().8h, \r2\().8h
        trn1            \r6\().8h, \r5\().8h, \r7\().8h
        trn2            \r7\().8h, \r5\().8h, \r7\().8h
        trn1            \r5\().8h, \r9\().8h, \r3\().8h
        trn2            \r9\().8h, \r9\().8h, \r3\().8h
        trn1            \r3\().8h, \r8\().8h, \r1\().8h
        trn2            \r8\().8h, \r8\().8h, \r1\().8h
        trn1            \r0\().4s, \r3\().4s, \r4\().4s
        trn2            \r4\().4s, \r3\().4s, \r4\().4s
        trn1            \r1\().4s, \r5\().4s, \r6\().4s
        trn2            \r5\().4s, \r5\().4s, \r6\().4s
        trn2            \r6\().4s, \r8\().4s, \r2\().4s
        trn1            \r2\().4s, \r8\().4s, \r2\().4s
        trn1            \r3\().4s, \r9\().4s, \r7\().4s
        trn2            \r7\().4s, \r9\().4s, \r7\().4s
.endm
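
// Transpose a 4x16 byte matrix held in r0-r3, using t4-t7 as scratch;
// afterwards register n (n = 0..3) holds columns n, n+4, n+8 and n+12
// of the input as its four 32-bit words.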
.macro transpose_4x16b r0, r1, r2, r3, t4, t5, t6, t7
        trn1            \t4\().16b, \r0\().16b, \r1\().16b
        trn2            \t5\().16b, \r0\().16b, \r1\().16b
        trn1            \t6\().16b, \r2\().16b, \r3\().16b
        trn2            \t7\().16b, \r2\().16b, \r3\().16b
        trn1            \r0\().8h, \t4\().8h, \t6\().8h
        trn2            \r2\().8h, \t4\().8h, \t6\().8h
        trn1            \r1\().8h, \t5\().8h, \t7\().8h
        trn2            \r3\().8h, \t5\().8h, \t7\().8h
.endm
#endif /* DAV1D_SRC_ARM_64_UTIL_S */
/*
* Copyright © 2018, VideoLAN and dav1d authors
* Copyright © 2018, Two Orioles, LLC
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "src/cpu.h"
#include "src/loopfilter.h"
decl_loopfilter_sb_fn(dav1d_lpf_h_sb_y_neon);
decl_loopfilter_sb_fn(dav1d_lpf_v_sb_y_neon);
decl_loopfilter_sb_fn(dav1d_lpf_h_sb_uv_neon);
decl_loopfilter_sb_fn(dav1d_lpf_v_sb_uv_neon);

void bitfn(dav1d_loop_filter_dsp_init_arm)(Dav1dLoopFilterDSPContext *const c) {
    const unsigned flags = dav1d_get_cpu_flags();

    if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;

#if BITDEPTH == 8 && ARCH_AARCH64
    c->loop_filter_sb[0][0] = dav1d_lpf_h_sb_y_neon;
    c->loop_filter_sb[0][1] = dav1d_lpf_v_sb_y_neon;
    c->loop_filter_sb[1][0] = dav1d_lpf_h_sb_uv_neon;
    c->loop_filter_sb[1][1] = dav1d_lpf_v_sb_uv_neon;
#endif
}
@@ -53,6 +53,7 @@ typedef struct Dav1dLoopFilterDSPContext {
} Dav1dLoopFilterDSPContext;
bitfn_decls(void dav1d_loop_filter_dsp_init, Dav1dLoopFilterDSPContext *c);
bitfn_decls(void dav1d_loop_filter_dsp_init_arm, Dav1dLoopFilterDSPContext *c);
bitfn_decls(void dav1d_loop_filter_dsp_init_x86, Dav1dLoopFilterDSPContext *c);
#endif /* DAV1D_SRC_LOOPFILTER_H */
@@ -250,7 +250,11 @@ void bitfn(dav1d_loop_filter_dsp_init)(Dav1dLoopFilterDSPContext *const c) {
    c->loop_filter_sb[1][0] = loop_filter_h_sb128uv_c;
    c->loop_filter_sb[1][1] = loop_filter_v_sb128uv_c;
#if HAVE_ASM && ARCH_X86
#if HAVE_ASM
#if ARCH_AARCH64 || ARCH_ARM
    bitfn(dav1d_loop_filter_dsp_init_arm)(c);
#elif ARCH_X86
    bitfn(dav1d_loop_filter_dsp_init_x86)(c);
#endif
#endif
}
@@ -86,12 +86,14 @@ if is_asm_enabled
        )
        libdav1d_tmpl_sources += files(
            'arm/cdef_init_tmpl.c',
            'arm/loopfilter_init_tmpl.c',
            'arm/looprestoration_init_tmpl.c',
            'arm/mc_init_tmpl.c',
        )
        if host_machine.cpu_family() == 'aarch64'
            libdav1d_sources += files(
                'arm/64/cdef.S',
                'arm/64/loopfilter.S',
                'arm/64/looprestoration.S',
                'arm/64/mc.S',
            )