Commit b4b225d8 authored by Martin Storsjö
Browse files

arm32: itx: Add a NEON implementation of itx for 10 bpc

Relative speedup vs C for a few functions:

                                      Cortex A7     A8     A9    A53    A72    A73
inv_txfm_add_4x4_dct_dct_0_10bpc_neon:     2.79   5.08   2.99   2.83   3.49   4.44
inv_txfm_add_4x4_dct_dct_1_10bpc_neon:     5.74   9.43   5.72   7.19   6.73   6.92
inv_txfm_add_8x8_dct_dct_0_10bpc_neon:     3.13   3.68   2.79   3.25   3.21   3.33
inv_txfm_add_8x8_dct_dct_1_10bpc_neon:     7.09  10.41   7.00  10.55   8.06   9.02
inv_txfm_add_16x16_dct_dct_0_10bpc_neon:   5.01   6.76   4.56   5.58   5.52   2.97
inv_txfm_add_16x16_dct_dct_1_10bpc_neon:   8.62  12.48  13.71  11.75  15.94  16.86
inv_txfm_add_16x16_dct_dct_2_10bpc_neon:   6.05   8.81   6.13   8.18   7.90  12.27
inv_txfm_add_32x32_dct_dct_0_10bpc_neon:   2.90   3.90   2.16   2.63   3.56   2.74
inv_txfm_add_32x32_dct_dct_1_10bpc_neon:  13.57  17.00  13.30  13.76  14.54  17.08
inv_txfm_add_32x32_dct_dct_2_10bpc_neon:   8.29  10.54   8.05  10.68  12.75  14.36
inv_txfm_add_32x32_dct_dct_3_10bpc_neon:   6.78   8.40   7.60  10.12   8.97  12.96
inv_txfm_add_32x32_dct_dct_4_10bpc_neon:   6.48   6.74   6.00   7.38   7.67   9.70
inv_txfm_add_64x64_dct_dct_0_10bpc_neon:   3.02   4.59   2.21   2.65   3.36   2.47
inv_txfm_add_64x64_dct_dct_1_10bpc_neon:   9.86  11.30   9.14  13.80  12.46  14.83
inv_txfm_add_64x64_dct_dct_2_10bpc_neon:   8.65   9.76   7.60  12.05  10.55  12.62
inv_txfm_add_64x64_dct_dct_3_10bpc_neon:   7.78   8.65   6.98  10.63   9.15  11.73
inv_txfm_add_64x64_dct_dct_4_10bpc_neon:   6.61   7.01   5.52   8.41   8.33   9.69
parent 7f5b334b
Pipeline #71229 passed with stages
in 5 minutes and 7 seconds
This diff is collapsed.
...@@ -158,6 +158,14 @@ ...@@ -158,6 +158,14 @@
vtrn.8 \r2, \r3 vtrn.8 \r2, \r3
.endm .endm
// transpose_4x4s: transpose a 4x4 block of 32-bit elements held in the four
// quad registers q0-q3 (one row per register), in place.
//
// Parameters:
//   q0-q3  the four quad registers to transpose.
//   r0-r7  presumably the double-register halves of q0-q3, in order
//          (r0/r1 = low/high half of q0, r2/r3 of q1, r4/r5 of q2,
//          r6/r7 of q3) - inferred from the vtrn.64 equivalences noted
//          inline; confirm against callers. Only r1, r3, r4 and r6 are
//          referenced by the body; the rest are accepted but unused.
//
// The first step exchanges 64-bit halves between q0/q2 and q1/q3. It is
// written as vswp on the halves because NEON vtrn only takes 8/16/32-bit
// element sizes. The second step interleaves the 32-bit elements of each
// adjacent register pair with vtrn.32.
.macro transpose_4x4s q0, q1, q2, q3, r0, r1, r2, r3, r4, r5, r6, r7
vswp \r1, \r4 // vtrn.64 \q0, \q2
vswp \r3, \r6 // vtrn.64 \q1, \q3
vtrn.32 \q0, \q1
vtrn.32 \q2, \q3
.endm
.macro transpose_4x4h q0, q1, r0, r1, r2, r3 .macro transpose_4x4h q0, q1, r0, r1, r2, r3
vtrn.32 \q0, \q1 vtrn.32 \q0, \q1
......
...@@ -119,7 +119,6 @@ COLD void bitfn(dav1d_itx_dsp_init_arm)(Dav1dInvTxfmDSPContext *const c, int bpc ...@@ -119,7 +119,6 @@ COLD void bitfn(dav1d_itx_dsp_init_arm)(Dav1dInvTxfmDSPContext *const c, int bpc
if (bpc > 10) return; if (bpc > 10) return;
#if ARCH_AARCH64 || BITDEPTH == 8
assign_itx17_fn( , 4, 4, neon); assign_itx17_fn( , 4, 4, neon);
assign_itx16_fn(R, 4, 8, neon); assign_itx16_fn(R, 4, 8, neon);
assign_itx16_fn(R, 4, 16, neon); assign_itx16_fn(R, 4, 16, neon);
...@@ -139,5 +138,4 @@ COLD void bitfn(dav1d_itx_dsp_init_arm)(Dav1dInvTxfmDSPContext *const c, int bpc ...@@ -139,5 +138,4 @@ COLD void bitfn(dav1d_itx_dsp_init_arm)(Dav1dInvTxfmDSPContext *const c, int bpc
assign_itx1_fn (R, 64, 16, neon); assign_itx1_fn (R, 64, 16, neon);
assign_itx1_fn (R, 64, 32, neon); assign_itx1_fn (R, 64, 32, neon);
assign_itx1_fn ( , 64, 64, neon); assign_itx1_fn ( , 64, 64, neon);
#endif
} }
...@@ -132,6 +132,8 @@ if is_asm_enabled ...@@ -132,6 +132,8 @@ if is_asm_enabled
endif endif
elif host_machine.cpu_family().startswith('arm') elif host_machine.cpu_family().startswith('arm')
libdav1d_sources_asm = files( libdav1d_sources_asm = files(
# itx.S is used for both 8 and 16 bpc.
'arm/32/itx.S',
'arm/32/looprestoration_common.S', 'arm/32/looprestoration_common.S',
'arm/32/msac.S', 'arm/32/msac.S',
) )
...@@ -140,7 +142,6 @@ if is_asm_enabled ...@@ -140,7 +142,6 @@ if is_asm_enabled
libdav1d_sources_asm += files( libdav1d_sources_asm += files(
'arm/32/cdef.S', 'arm/32/cdef.S',
'arm/32/ipred.S', 'arm/32/ipred.S',
'arm/32/itx.S',
'arm/32/loopfilter.S', 'arm/32/loopfilter.S',
'arm/32/looprestoration.S', 'arm/32/looprestoration.S',
'arm/32/mc.S', 'arm/32/mc.S',
...@@ -151,6 +152,7 @@ if is_asm_enabled ...@@ -151,6 +152,7 @@ if is_asm_enabled
libdav1d_sources_asm += files( libdav1d_sources_asm += files(
'arm/32/cdef16.S', 'arm/32/cdef16.S',
'arm/32/ipred16.S', 'arm/32/ipred16.S',
'arm/32/itx16.S',
'arm/32/loopfilter16.S', 'arm/32/loopfilter16.S',
'arm/32/looprestoration16.S', 'arm/32/looprestoration16.S',
'arm/32/mc16.S', 'arm/32/mc16.S',
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment