Commit 515e2667 authored by Martin Storsjö

arm/mc: Add 8 bit neon asm for avg, w_avg and mask

checkasm --bench numbers from a Snapdragon 835:
nop: 23.0
avg_w4_8bpc_c: 385.0
avg_w4_8bpc_neon: 34.0
avg_w8_8bpc_c: 590.5
avg_w8_8bpc_neon: 65.5
avg_w16_8bpc_c: 1304.4
avg_w16_8bpc_neon: 161.3
avg_w32_8bpc_c: 4098.4
avg_w32_8bpc_neon: 589.2
avg_w64_8bpc_c: 8405.0
avg_w64_8bpc_neon: 1367.1
avg_w128_8bpc_c: 19667.9
avg_w128_8bpc_neon: 3409.0
w_avg_w4_8bpc_c: 453.8
w_avg_w4_8bpc_neon: 50.0
w_avg_w8_8bpc_c: 749.0
w_avg_w8_8bpc_neon: 105.7
w_avg_w16_8bpc_c: 1851.2
w_avg_w16_8bpc_neon: 283.7
w_avg_w32_8bpc_c: 5991.5
w_avg_w32_8bpc_neon: 1080.9
w_avg_w64_8bpc_c: 12763.5
w_avg_w64_8bpc_neon: 2544.4
w_avg_w128_8bpc_c: 30311.3
w_avg_w128_8bpc_neon: 6350.5
mask_w4_8bpc_c: 492.9
mask_w4_8bpc_neon: 57.7
mask_w8_8bpc_c: 1108.5
mask_w8_8bpc_neon: 123.0
mask_w16_8bpc_c: 2880.3
mask_w16_8bpc_neon: 349.2
mask_w32_8bpc_c: 8996.4
mask_w32_8bpc_neon: 1368.1
mask_w64_8bpc_c: 19570.3
mask_w64_8bpc_neon: 3263.5
mask_w128_8bpc_c: 46757.4
mask_w128_8bpc_neon: 8743.1
parent 128715b5
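Across these sizes the NEON versions are roughly 5-11x faster than the C reference. Per output pixel, the three routines blend the two 16 bit intermediate predictions as in the following scalar C sketch (simplified, with the 8 bpc constants; the helper and parameter names are illustrative, not the exact dav1d declarations — the intermediates are assumed packed with stride w, as the assembly reads them):

#include <stddef.h>
#include <stdint.h>

/* Illustrative helper; clamps to the 8 bpc pixel range. */
static inline uint8_t clip_pixel(const int v) {
    return v < 0 ? 0 : v > 255 ? 255 : (uint8_t) v;
}

/* avg: plain average of the two predictions. */
void avg_ref(uint8_t *dst, const ptrdiff_t stride,
             const int16_t *tmp1, const int16_t *tmp2, const int w, int h)
{
    do {
        for (int x = 0; x < w; x++)
            dst[x] = clip_pixel((tmp1[x] + tmp2[x] + 16) >> 5);
        tmp1 += w; tmp2 += w; dst += stride;
    } while (--h);
}

/* w_avg: blend with a single weight in 0..16 applied to tmp1. */
void w_avg_ref(uint8_t *dst, const ptrdiff_t stride,
               const int16_t *tmp1, const int16_t *tmp2, const int w, int h,
               const int weight)
{
    do {
        for (int x = 0; x < w; x++)
            dst[x] = clip_pixel((tmp1[x] * weight +
                                 tmp2[x] * (16 - weight) + 128) >> 8);
        tmp1 += w; tmp2 += w; dst += stride;
    } while (--h);
}

/* mask: per-pixel blend with 6 bit mask values in 0..64. */
void mask_ref(uint8_t *dst, const ptrdiff_t stride,
              const int16_t *tmp1, const int16_t *tmp2, const int w, int h,
              const uint8_t *msk)
{
    do {
        for (int x = 0; x < w; x++)
            dst[x] = clip_pixel((tmp1[x] * msk[x] +
                                 tmp2[x] * (64 - msk[x]) + 512) >> 10);
        tmp1 += w; tmp2 += w; msk += w; dst += stride;
    } while (--h);
}

The assembly below folds the two per-pixel multiplies of w_avg and mask into a single vqdmulh per vector by rewriting the blend as tmp2 - ((tmp2 - tmp1) * weight) / 16 (respectively / 64 for mask), against a precomputed -weight << 11 (respectively -512 * mask) factor.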
/*
* Copyright © 2018, VideoLAN and dav1d authors
* Copyright © 2018, Janne Grunau
* Copyright © 2018, Martin Storsjo
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "src/arm/asm.S"
#if BITDEPTH == 8

// avg: (tmp1 + tmp2 + 16) >> 5, saturated to 8 bit.
// r2/r3 point at the two 16 bit intermediate buffers.
.macro avg dst0, dst1, t0, t1, t2, t3
        vld1.16         {\t0,\t1}, [r2, :128]!
        vld1.16         {\t2,\t3}, [r3, :128]!
        vadd.i16        \t0, \t0, \t2
        vadd.i16        \t1, \t1, \t3
        vqrshrun.s16    \dst0, \t0, #5
        vqrshrun.s16    \dst1, \t1, #5
.endm

// w_avg: blend the two intermediates with weights weight/16 and (16-weight)/16,
// computed as tmp2 - ((tmp2 - tmp1)*weight)/16 via vqdmulh with
// q15 = -weight << 11 (set up in bidir_fn below), then a rounding
// narrowing shift by 4 with unsigned saturation.
.macro w_avg dst0, dst1, t0, t1, t2, t3
        vld1.16         {\t0,\t1}, [r2, :128]!
        vld1.16         {\t2,\t3}, [r3, :128]!
        vsub.i16        \t0, \t2, \t0
        vsub.i16        \t1, \t3, \t1
        vqdmulh.s16     \t0, \t0, q15
        vqdmulh.s16     \t1, \t1, q15
        vadd.i16        \t0, \t2, \t0
        vadd.i16        \t1, \t3, \t1
        vqrshrun.s16    \dst0, \t0, #4
        vqrshrun.s16    \dst1, \t1, #4
.endm

// mask: per-pixel blend with 6 bit mask values m in 0..64 (pointed to by lr),
// computed as tmp2 - ((tmp2 - tmp1)*m)/64, then a rounding narrowing shift
// by 4. q15 holds #256-2 == -2, so the vmul.i8 gives -2*m per byte and the
// vshll by #8 widens that to -512*m for the vqdmulh.
.macro mask dst0, dst1, t0, t1, t2, t3
        vld1.8          {q14}, [lr, :128]!
        vld1.16         {\t0,\t1}, [r2, :128]!
        vmul.i8         q14, q14, q15
        vld1.16         {\t2,\t3}, [r3, :128]!
        vshll.i8        q13, d28, #8
        vshll.i8        q14, d29, #8
        vsub.i16        \t0, \t2, \t0
        vsub.i16        \t1, \t3, \t1
        vqdmulh.s16     \t0, \t0, q13
        vqdmulh.s16     \t1, \t1, q14
        vadd.i16        \t0, \t2, \t0
        vadd.i16        \t1, \t3, \t1
        vqrshrun.s16    \dst0, \t0, #4
        vqrshrun.s16    \dst1, \t1, #4
.endm

// Arguments: r0 = dst, r1 = dst stride, r2 = tmp1, r3 = tmp2,
// plus w, h and (for w_avg/mask) the weight/mask pointer on the stack.
.macro bidir_fn type
function \type\()_8bpc_neon, export=1
        push            {r4-r6,lr}
        ldr             r4, [sp, #16]           // w
        ldr             r5, [sp, #20]           // h
.ifnc \type, avg
        ldr             lr, [sp, #24]           // weight (w_avg) / mask pointer (mask)
.endif
.ifc \type, w_avg
        vdup.s16        q15, lr
        vneg.s16        q15, q15
        vshl.i16        q15, q15, #11           // q15 = -weight << 11
.endif
.ifc \type, mask
        vmov.i8         q15, #256-2
.endif
        rbit            r4, r4
        adr             r12, L(\type\()_tbl)
        clz             r4, r4                  // r4 = ctz(w) = log2(w)
        ldr             r4, [r12, r4, lsl #2]
        \type           d16, d17, q0, q1, q2, q3
        add             r12, r12, r4
        bx              r12

        .align 2
L(\type\()_tbl):
        .word 0, 0                              // unused entries for w == 1, 2
        .word   4f - L(\type\()_tbl) + CONFIG_THUMB
        .word  80f - L(\type\()_tbl) + CONFIG_THUMB
        .word 160f - L(\type\()_tbl) + CONFIG_THUMB
        .word 320f - L(\type\()_tbl) + CONFIG_THUMB
        .word 640f - L(\type\()_tbl) + CONFIG_THUMB
        .word 1280f - L(\type\()_tbl) + CONFIG_THUMB
4:
        add             r6, r0, r1
        lsl             r1, r1, #1
        cmp             r5, #4
        vst1.32         {d16[0]}, [r0, :32], r1
        vst1.32         {d16[1]}, [r6, :32], r1
        vst1.32         {d17[0]}, [r0, :32], r1
        vst1.32         {d17[1]}, [r6, :32], r1
        beq             0f
        \type           d18, d19, q0, q1, q2, q3
        cmp             r5, #8
        vst1.32         {d18[0]}, [r0, :32], r1
        vst1.32         {d18[1]}, [r6, :32], r1
        vst1.32         {d19[0]}, [r0, :32], r1
        vst1.32         {d19[1]}, [r6, :32], r1
        beq             0f
        \type           d16, d17, q0, q1, q2, q3
        vst1.32         {d16[0]}, [r0, :32], r1
        vst1.32         {d16[1]}, [r6, :32], r1
        \type           d18, d19, q0, q1, q2, q3
        vst1.32         {d17[0]}, [r0, :32], r1
        vst1.32         {d17[1]}, [r6, :32], r1
        vst1.32         {d18[0]}, [r0, :32], r1
        vst1.32         {d18[1]}, [r6, :32], r1
        vst1.32         {d19[0]}, [r0, :32], r1
        vst1.32         {d19[1]}, [r6, :32], r1
        pop             {r4-r6,pc}
80:
        add             r6, r0, r1
        lsl             r1, r1, #1
8:
        vst1.8          {d16}, [r0, :64], r1
        \type           d18, d19, q0, q1, q2, q3
        vst1.8          {d17}, [r6, :64], r1
        vst1.8          {d18}, [r0, :64], r1
        subs            r5, r5, #4
        vst1.8          {d19}, [r6, :64], r1
        ble             0f
        \type           d16, d17, q0, q1, q2, q3
        b               8b
160:
        add             r6, r0, r1
        lsl             r1, r1, #1
16:
        \type           d18, d19, q0, q1, q2, q3
        vst1.8          {q8}, [r0, :128], r1
        \type           d20, d21, q0, q1, q2, q3
        vst1.8          {q9}, [r6, :128], r1
        \type           d22, d23, q0, q1, q2, q3
        vst1.8          {q10}, [r0, :128], r1
        subs            r5, r5, #4
        vst1.8          {q11}, [r6, :128], r1
        ble             0f
        \type           d16, d17, q0, q1, q2, q3
        b               16b
320:
        add             r6, r0, r1
        lsl             r1, r1, #1
32:
        \type           d18, d19, q0, q1, q2, q3
        \type           d20, d21, q0, q1, q2, q3
        vst1.8          {q8, q9}, [r0, :128], r1
        \type           d22, d23, q0, q1, q2, q3
        subs            r5, r5, #2
        vst1.8          {q10, q11}, [r6, :128], r1
        ble             0f
        \type           d16, d17, q0, q1, q2, q3
        b               32b
640:
        add             r6, r0, #32
64:
        \type           d18, d19, q0, q1, q2, q3
        \type           d20, d21, q0, q1, q2, q3
        \type           d22, d23, q0, q1, q2, q3
        vst1.8          {q8, q9}, [r0, :128], r1
        \type           d16, d17, q0, q1, q2, q3
        vst1.8          {q10, q11}, [r6, :128], r1
        \type           d18, d19, q0, q1, q2, q3
        \type           d20, d21, q0, q1, q2, q3
        vst1.8          {q8, q9}, [r0, :128], r1
        \type           d22, d23, q0, q1, q2, q3
        subs            r5, r5, #2
        vst1.8          {q10, q11}, [r6, :128], r1
        ble             0f
        \type           d16, d17, q0, q1, q2, q3
        b               64b
1280:
        sub             r1, r1, #32
        add             r6, r0, #64
128:
        \type           d18, d19, q0, q1, q2, q3
        \type           d20, d21, q0, q1, q2, q3
        \type           d22, d23, q0, q1, q2, q3
        vst1.8          {q8, q9}, [r0, :128]!
        \type           d16, d17, q0, q1, q2, q3
        vst1.8          {q10, q11}, [r0, :128], r1
        \type           d18, d19, q0, q1, q2, q3
        \type           d20, d21, q0, q1, q2, q3
        vst1.8          {q8, q9}, [r6, :128]!
        \type           d22, d23, q0, q1, q2, q3
        subs            r5, r5, #1
        vst1.8          {q10, q11}, [r6, :128], r1
        ble             0f
        \type           d16, d17, q0, q1, q2, q3
        b               128b
0:
        pop             {r4-r6,pc}
endfunc
.endm

bidir_fn avg
bidir_fn w_avg
bidir_fn mask

#endif /* BITDEPTH == 8 */
@@ -30,6 +30,30 @@
#include "config.h"

#if ARCH_ARM
        .syntax unified
#ifdef __ELF__
        .fpu neon
        .eabi_attribute 10, 0 // suppress Tag_FP_arch
        .eabi_attribute 12, 0 // suppress Tag_Advanced_SIMD_arch
#endif

#ifdef _WIN32
#define CONFIG_THUMB 1
#else
#define CONFIG_THUMB 0
#endif

#if CONFIG_THUMB
        .thumb
#define A @
#define T
#else
#define A
#define T @
#endif
#endif

#ifndef PRIVATE_PREFIX
#define PRIVATE_PREFIX dav1d_
#endif
@@ -39,7 +39,7 @@ void bitfn(dav1d_mc_dsp_init_arm)(Dav1dMCDSPContext *const c) {
    if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;

-#if BITDEPTH == 8 && ARCH_AARCH64
+#if BITDEPTH == 8
    c->avg = dav1d_avg_8bpc_neon;
    c->w_avg = dav1d_w_avg_8bpc_neon;
    c->mask = dav1d_mask_8bpc_neon;
@@ -91,6 +91,7 @@ if is_asm_enabled
        )
    elif host_machine.cpu_family().startswith('arm')
        libdav1d_tmpl_sources += files(
            'arm/32/mc.S',
        )
    endif
elif host_machine.cpu_family().startswith('x86')