Commit 4aa0363a authored by Martin Storsjö, committed by Janne Grunau

arm64: mc: Implement 8tap and bilin functions

These functions have been tuned against Cortex A53 and Snapdragon
835. The bilin functions have mainly been written with code size
in mind, as they aren't used much in practice.

Relative speedups for the actual filtering functions (that don't
just do a plain copy) are around 4-15x, some over 20x. This is
in comparison with GCC 5.4 with autovectorization disabled; the
actual real-world speedup against autovectorized C code is around
4-10x.

Relative speedups measured with checkasm:
                                Cortex A53   Snapdragon 835
mc_8tap_regular_w2_0_8bpc_neon:       6.96   5.28
mc_8tap_regular_w2_h_8bpc_neon:       5.16   4.35
mc_8tap_regular_w2_hv_8bpc_neon:      5.37   4.98
mc_8tap_regular_w2_v_8bpc_neon:       6.35   4.85
mc_8tap_regular_w4_0_8bpc_neon:       6.78   5.73
mc_8tap_regular_w4_h_8bpc_neon:       8.40   6.60
mc_8tap_regular_w4_hv_8bpc_neon:      7.23   7.10
mc_8tap_regular_w4_v_8bpc_neon:       9.06   7.76
mc_8tap_regular_w8_0_8bpc_neon:       6.96   5.55
mc_8tap_regular_w8_h_8bpc_neon:      10.36   6.88
mc_8tap_regular_w8_hv_8bpc_neon:      9.49   6.86
mc_8tap_regular_w8_v_8bpc_neon:      12.06   9.61
mc_8tap_regular_w16_0_8bpc_neon:      6.68   4.51
mc_8tap_regular_w16_h_8bpc_neon:     12.30   7.77
mc_8tap_regular_w16_hv_8bpc_neon:     9.50   6.68
mc_8tap_regular_w16_v_8bpc_neon:     12.93   9.68
mc_8tap_regular_w32_0_8bpc_neon:      3.91   2.93
mc_8tap_regular_w32_h_8bpc_neon:     13.06   7.89
mc_8tap_regular_w32_hv_8bpc_neon:     9.37   6.70
mc_8tap_regular_w32_v_8bpc_neon:     12.88   9.49
mc_8tap_regular_w64_0_8bpc_neon:      2.89   1.68
mc_8tap_regular_w64_h_8bpc_neon:     13.48   8.00
mc_8tap_regular_w64_hv_8bpc_neon:     9.23   6.53
mc_8tap_regular_w64_v_8bpc_neon:     13.11   9.68
mc_8tap_regular_w128_0_8bpc_neon:     1.89   1.24
mc_8tap_regular_w128_h_8bpc_neon:    13.58   7.98
mc_8tap_regular_w128_hv_8bpc_neon:    8.86   6.53
mc_8tap_regular_w128_v_8bpc_neon:    12.46   9.63
mc_bilinear_w2_0_8bpc_neon:           7.02   5.40
mc_bilinear_w2_h_8bpc_neon:           3.65   3.14
mc_bilinear_w2_hv_8bpc_neon:          4.36   4.84
mc_bilinear_w2_v_8bpc_neon:           5.22   4.28
mc_bilinear_w4_0_8bpc_neon:           6.87   5.99
mc_bilinear_w4_h_8bpc_neon:           6.50   8.61
mc_bilinear_w4_hv_8bpc_neon:          7.70   7.99
mc_bilinear_w4_v_8bpc_neon:           7.04   9.10
mc_bilinear_w8_0_8bpc_neon:           7.03   5.70
mc_bilinear_w8_h_8bpc_neon:          11.30  15.14
mc_bilinear_w8_hv_8bpc_neon:         15.74  13.50
mc_bilinear_w8_v_8bpc_neon:          13.40  17.54
mc_bilinear_w16_0_8bpc_neon:          6.75   4.48
mc_bilinear_w16_h_8bpc_neon:         17.02  13.95
mc_bilinear_w16_hv_8bpc_neon:        17.37  13.78
mc_bilinear_w16_v_8bpc_neon:         23.69  22.98
mc_bilinear_w32_0_8bpc_neon:          3.88   3.18
mc_bilinear_w32_h_8bpc_neon:         18.80  14.97
mc_bilinear_w32_hv_8bpc_neon:        17.74  14.02
mc_bilinear_w32_v_8bpc_neon:         24.46  23.04
mc_bilinear_w64_0_8bpc_neon:          2.87   1.66
mc_bilinear_w64_h_8bpc_neon:         19.54  16.02
mc_bilinear_w64_hv_8bpc_neon:        17.80  14.32
mc_bilinear_w64_v_8bpc_neon:         24.79  23.63
mc_bilinear_w128_0_8bpc_neon:         2.13   1.23
mc_bilinear_w128_h_8bpc_neon:        19.89  16.24
mc_bilinear_w128_hv_8bpc_neon:       17.55  14.15
mc_bilinear_w128_v_8bpc_neon:        24.45  23.54
mct_8tap_regular_w4_0_8bpc_neon:      5.56   5.51
mct_8tap_regular_w4_h_8bpc_neon:      7.48   5.80
mct_8tap_regular_w4_hv_8bpc_neon:     7.27   7.09
mct_8tap_regular_w4_v_8bpc_neon:      7.80   6.84
mct_8tap_regular_w8_0_8bpc_neon:      9.54   9.25
mct_8tap_regular_w8_h_8bpc_neon:      9.08   6.55
mct_8tap_regular_w8_hv_8bpc_neon:     9.16   6.30
mct_8tap_regular_w8_v_8bpc_neon:     10.79   8.66
mct_8tap_regular_w16_0_8bpc_neon:    15.35  10.50
mct_8tap_regular_w16_h_8bpc_neon:    10.18   6.76
mct_8tap_regular_w16_hv_8bpc_neon:    9.17   6.11
mct_8tap_regular_w16_v_8bpc_neon:    11.52   8.72
mct_8tap_regular_w32_0_8bpc_neon:    15.82  10.09
mct_8tap_regular_w32_h_8bpc_neon:    10.75   6.85
mct_8tap_regular_w32_hv_8bpc_neon:    9.00   6.22
mct_8tap_regular_w32_v_8bpc_neon:    11.58   8.67
mct_8tap_regular_w64_0_8bpc_neon:    15.28   9.68
mct_8tap_regular_w64_h_8bpc_neon:    10.93   6.96
mct_8tap_regular_w64_hv_8bpc_neon:    8.81   6.53
mct_8tap_regular_w64_v_8bpc_neon:    11.42   8.73
mct_8tap_regular_w128_0_8bpc_neon:   14.41   7.67
mct_8tap_regular_w128_h_8bpc_neon:   10.92   6.96
mct_8tap_regular_w128_hv_8bpc_neon:   8.56   6.51
mct_8tap_regular_w128_v_8bpc_neon:   11.16   8.70
mct_bilinear_w4_0_8bpc_neon:          5.66   5.77
mct_bilinear_w4_h_8bpc_neon:          5.16   6.40
mct_bilinear_w4_hv_8bpc_neon:         6.86   6.82
mct_bilinear_w4_v_8bpc_neon:          4.75   6.09
mct_bilinear_w8_0_8bpc_neon:          9.78  10.00
mct_bilinear_w8_h_8bpc_neon:          8.98  11.37
mct_bilinear_w8_hv_8bpc_neon:        14.42  10.83
mct_bilinear_w8_v_8bpc_neon:          9.12  11.62
mct_bilinear_w16_0_8bpc_neon:        15.59  10.76
mct_bilinear_w16_h_8bpc_neon:        11.98   8.77
mct_bilinear_w16_hv_8bpc_neon:       15.83  10.73
mct_bilinear_w16_v_8bpc_neon:        14.70  14.60
mct_bilinear_w32_0_8bpc_neon:        15.89  10.32
mct_bilinear_w32_h_8bpc_neon:        13.47   9.07
mct_bilinear_w32_hv_8bpc_neon:       16.01  10.95
mct_bilinear_w32_v_8bpc_neon:        14.85  14.16
mct_bilinear_w64_0_8bpc_neon:        15.36  10.51
mct_bilinear_w64_h_8bpc_neon:        14.00   9.61
mct_bilinear_w64_hv_8bpc_neon:       15.82  11.27
mct_bilinear_w64_v_8bpc_neon:        14.61  14.76
mct_bilinear_w128_0_8bpc_neon:       14.41   7.92
mct_bilinear_w128_h_8bpc_neon:       13.31   9.58
mct_bilinear_w128_hv_8bpc_neon:      14.07  11.18
mct_bilinear_w128_v_8bpc_neon:       11.57  14.42
parent 842b2074
Pipeline #2726 passed with stages
in 6 minutes and 39 seconds
This diff is collapsed.
......@@ -129,4 +129,6 @@ EXTERN\name:
#define L(x) .L ## x
#endif
#define X(x) CONCAT(EXTERN, x)
#endif /* __DAV1D_SRC_ARM_ASM_S__ */
......@@ -30,16 +30,66 @@
#include "src/mc.h"
#include "src/cpu.h"
/*
 * "put" entry points (8 bpc, NEON): nine 8tap variants, one per
 * regular/smooth/sharp pairing, plus bilin. The init function in this file
 * stores them in c->mc[], indexed by the matching FILTER_2D_* constant.
 */
decl_mc_fn(dav1d_put_8tap_regular_8bpc_neon);
decl_mc_fn(dav1d_put_8tap_regular_smooth_8bpc_neon);
decl_mc_fn(dav1d_put_8tap_regular_sharp_8bpc_neon);
decl_mc_fn(dav1d_put_8tap_smooth_8bpc_neon);
decl_mc_fn(dav1d_put_8tap_smooth_regular_8bpc_neon);
decl_mc_fn(dav1d_put_8tap_smooth_sharp_8bpc_neon);
decl_mc_fn(dav1d_put_8tap_sharp_8bpc_neon);
decl_mc_fn(dav1d_put_8tap_sharp_regular_8bpc_neon);
decl_mc_fn(dav1d_put_8tap_sharp_smooth_8bpc_neon);
decl_mc_fn(dav1d_put_bilin_8bpc_neon);
/*
 * "prep" entry points (8 bpc, NEON): same set of filter variants as the
 * "put" group. The init function stores them in c->mct[].
 */
decl_mct_fn(dav1d_prep_8tap_regular_8bpc_neon);
decl_mct_fn(dav1d_prep_8tap_regular_smooth_8bpc_neon);
decl_mct_fn(dav1d_prep_8tap_regular_sharp_8bpc_neon);
decl_mct_fn(dav1d_prep_8tap_smooth_8bpc_neon);
decl_mct_fn(dav1d_prep_8tap_smooth_regular_8bpc_neon);
decl_mct_fn(dav1d_prep_8tap_smooth_sharp_8bpc_neon);
decl_mct_fn(dav1d_prep_8tap_sharp_8bpc_neon);
decl_mct_fn(dav1d_prep_8tap_sharp_regular_8bpc_neon);
decl_mct_fn(dav1d_prep_8tap_sharp_smooth_8bpc_neon);
decl_mct_fn(dav1d_prep_bilin_8bpc_neon);
/*
 * avg / w_avg / mask entry points; the init function assigns them to
 * c->avg, c->w_avg and c->mask. Unlike the put/prep tables, those
 * assignments sit outside the ARCH_AARCH64 guard.
 */
decl_avg_fn(dav1d_avg_8bpc_neon);
decl_w_avg_fn(dav1d_w_avg_8bpc_neon);
decl_mask_fn(dav1d_mask_8bpc_neon);
void bitfn(dav1d_mc_dsp_init_arm)(Dav1dMCDSPContext *const c) {
#define init_mc_fn(type, name, suffix) \
c->mc[type] = dav1d_put_##name##_8bpc_##suffix
#define init_mct_fn(type, name, suffix) \
c->mct[type] = dav1d_prep_##name##_8bpc_##suffix
const unsigned flags = dav1d_get_cpu_flags();
if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
#if BITDEPTH == 8
#if ARCH_AARCH64
init_mc_fn (FILTER_2D_8TAP_REGULAR, 8tap_regular, neon);
init_mc_fn (FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, neon);
init_mc_fn (FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, neon);
init_mc_fn (FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, neon);
init_mc_fn (FILTER_2D_8TAP_SMOOTH, 8tap_smooth, neon);
init_mc_fn (FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, neon);
init_mc_fn (FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, neon);
init_mc_fn (FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, neon);
init_mc_fn (FILTER_2D_8TAP_SHARP, 8tap_sharp, neon);
init_mc_fn (FILTER_2D_BILINEAR, bilin, neon);
init_mct_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, neon);
init_mct_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, neon);
init_mct_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, neon);
init_mct_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, neon);
init_mct_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, neon);
init_mct_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, neon);
init_mct_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, neon);
init_mct_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, neon);
init_mct_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, neon);
init_mct_fn(FILTER_2D_BILINEAR, bilin, neon);
#endif
c->avg = dav1d_avg_8bpc_neon;
c->w_avg = dav1d_w_avg_8bpc_neon;
c->mask = dav1d_mask_8bpc_neon;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment