Commit 80e47425 authored by Janne Grunau

arm64/mc: add 8-bit neon asm for avg, w_avg and mask

checkasm --bench on a Qualcomm Kryo (Snapdragon 820):
nop: 33.0
avg_w4_8bpc_c: 450.5
avg_w4_8bpc_neon: 20.1
avg_w8_8bpc_c: 438.6
avg_w8_8bpc_neon: 45.2
avg_w16_8bpc_c: 1003.7
avg_w16_8bpc_neon: 112.8
avg_w32_8bpc_c: 3249.6
avg_w32_8bpc_neon: 429.9
avg_w64_8bpc_c: 7213.3
avg_w64_8bpc_neon: 1299.4
avg_w128_8bpc_c: 16791.3
avg_w128_8bpc_neon: 2978.4
w_avg_w4_8bpc_c: 605.7
w_avg_w4_8bpc_neon: 30.9
w_avg_w8_8bpc_c: 545.8
w_avg_w8_8bpc_neon: 72.9
w_avg_w16_8bpc_c: 1430.1
w_avg_w16_8bpc_neon: 193.5
w_avg_w32_8bpc_c: 4876.3
w_avg_w32_8bpc_neon: 715.3
w_avg_w64_8bpc_c: 11338.0
w_avg_w64_8bpc_neon: 2147.0
w_avg_w128_8bpc_c: 26822.0
w_avg_w128_8bpc_neon: 4596.3
mask_w4_8bpc_c: 604.6
mask_w4_8bpc_neon: 37.2
mask_w8_8bpc_c: 654.8
mask_w8_8bpc_neon: 96.0
mask_w16_8bpc_c: 1663.0
mask_w16_8bpc_neon: 272.4
mask_w32_8bpc_c: 5707.6
mask_w32_8bpc_neon: 1028.9
mask_w64_8bpc_c: 12735.3
mask_w64_8bpc_neon: 2533.2
mask_w128_8bpc_c: 31027.6
mask_w128_8bpc_neon: 6247.2
parent 1400b028

src/arm/64/mc.S

/*
* Copyright © 2018, VideoLAN and dav1d authors
* Copyright © 2018, Janne Grunau
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "src/arm/asm.S"
#if BITDEPTH == 8
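
// Argument registers follow dav1d's mc prototypes (dst, dst_stride,
// tmp1, tmp2, w, h[, weight|mask]): x0 = dst, x1 = dst_stride, x2 = tmp1,
// x3 = tmp2, w4 = width, w5 = height, and w6 = weight (w_avg) or
// x6 = mask pointer (mask). tmp1/tmp2 are int16_t intermediates carrying
// 4 extra fractional bits at 8 bpc.

// avg computes dst = clip_u8((tmp1 + tmp2 + 16) >> 5); sqrshrun does the
// rounding, shift and unsigned saturating narrow in one instruction.
// The *16 variants below do the same on 16 pixels per invocation.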
.macro avg dst, t0, t1
        ld1             {\t0\().8h}, [x2], 16
        ld1             {\t1\().8h}, [x3], 16
        add             \t0\().8h, \t0\().8h, \t1\().8h
        sqrshrun        \dst\().8b, \t0\().8h, #5
.endm

.macro avg16 dst, t0, t1, t2, t3
        ld1             {\t0\().8h,\t1\().8h}, [x2], 32
        ld1             {\t2\().8h,\t3\().8h}, [x3], 32
        add             \t0\().8h, \t0\().8h, \t2\().8h
        add             \t1\().8h, \t1\().8h, \t3\().8h
        sqrshrun        \dst\().8b, \t0\().8h, #5
        sqrshrun2       \dst\().16b, \t1\().8h, #5
.endm
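
// w_avg blends with a constant weight w6 in [0, 16] applied to tmp1:
// dst = clip_u8((tmp1*w + tmp2*(16-w) + 128) >> 8). v30 holds -(w << 11),
// so sqdmulh(d, v30) = (2*d*-(w << 11)) >> 16 = -(d*w) >> 4 for
// d = tmp2 - tmp1; adding tmp2 gives (tmp1*w + tmp2*(16-w)) >> 4, and
// sqrshrun #4 finishes the round/shift/saturate.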
.macro w_avg dst, t0, t1
        ld1             {\t0\().8h}, [x2], 16
        ld1             {\t1\().8h}, [x3], 16
        sub             \t0\().8h, \t1\().8h, \t0\().8h
        sqdmulh         \t0\().8h, \t0\().8h, v30.8h
        add             \t0\().8h, \t1\().8h, \t0\().8h
        sqrshrun        \dst\().8b, \t0\().8h, #4
.endm

.macro w_avg16 dst, t0, t1, t2, t3
        ld1             {\t0\().8h,\t1\().8h}, [x2], 32
        ld1             {\t2\().8h,\t3\().8h}, [x3], 32
        sub             \t0\().8h, \t2\().8h, \t0\().8h
        sub             \t1\().8h, \t3\().8h, \t1\().8h
        sqdmulh         \t0\().8h, \t0\().8h, v30.8h
        sqdmulh         \t1\().8h, \t1\().8h, v30.8h
        add             \t0\().8h, \t2\().8h, \t0\().8h
        add             \t1\().8h, \t3\().8h, \t1\().8h
        sqrshrun        \dst\().8b, \t0\().8h, #4
        sqrshrun2       \dst\().16b, \t1\().8h, #4
.endm
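
// mask is w_avg with a per-pixel weight m in [0, 64] loaded from x6:
// dst = clip_u8((tmp1*m + tmp2*(64-m) + 512) >> 10). v31 = #256-2, so the
// 8-bit mul leaves -2*m (mod 256) in v30, and shll #8 widens that to
// -(m << 9) as a signed 16-bit value; sqdmulh then yields -(d*m) >> 6,
// i.e. the m/64-scaled blend, finished by the same sqrshrun #4.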
.macro mask dst, t0, t1
        ld1             {v30.8b}, [x6], 8
        ld1             {\t0\().8h}, [x2], 16
        mul             v30.8b, v30.8b, v31.8b
        ld1             {\t1\().8h}, [x3], 16
        shll            v30.8h, v30.8b, #8
        sub             \t0\().8h, \t1\().8h, \t0\().8h
        sqdmulh         \t0\().8h, \t0\().8h, v30.8h
        add             \t0\().8h, \t1\().8h, \t0\().8h
        sqrshrun        \dst\().8b, \t0\().8h, #4
.endm

.macro mask16 dst, t0, t1, t2, t3
        ld1             {v30.16b}, [x6], 16
        ld1             {\t0\().8h,\t1\().8h}, [x2], 32
        mul             v30.16b, v30.16b, v31.16b
        ld1             {\t2\().8h,\t3\().8h}, [x3], 32
        shll            v28.8h, v30.8b, #8
        shll2           v29.8h, v30.16b, #8
        sub             \t0\().8h, \t2\().8h, \t0\().8h
        sub             \t1\().8h, \t3\().8h, \t1\().8h
        sqdmulh         \t0\().8h, \t0\().8h, v28.8h
        sqdmulh         \t1\().8h, \t1\().8h, v29.8h
        add             \t0\().8h, \t2\().8h, \t0\().8h
        add             \t1\().8h, \t3\().8h, \t1\().8h
        sqrshrun        \dst\().8b, \t0\().8h, #4
        sqrshrun2       \dst\().16b, \t1\().8h, #4
.endm
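
// bidir_fn stamps out one function per operation. rbit+clz computes
// ctz(width) (2..7 for w = 4..128), which indexes the .hword table of
// "tbl - label" offsets after the function body; sub+br then enters the
// width-specialized loop. 0: is the shared return.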
.macro bidir_fn type
function \type\()_8bpc_neon, export=1
.ifc \type, w_avg
        dup             v30.8h, w6
        neg             v30.8h, v30.8h
        shl             v30.8h, v30.8h, #11
.endif
.ifc \type, mask
        movi            v31.16b, #256-2
.endif
        rbit            w4, w4
        adr             x7, \type\()_tbl
        clz             w4, w4
        \type           v4, v0, v1
        ldrh            w4, [x7, x4, lsl #1]
        \type           v5, v2, v3
        sub             x7, x7, w4, uxth
        br              x7
4:
        cmp             w5, #4
        st1             {v4.s}[0], [x0], x1
        st1             {v4.s}[1], [x0], x1
        st1             {v5.s}[0], [x0], x1
        st1             {v5.s}[1], [x0], x1
        b.eq            0f
        \type           v6, v0, v1
        \type           v7, v2, v3
        cmp             w5, #8
        st1             {v6.s}[0], [x0], x1
        st1             {v6.s}[1], [x0], x1
        st1             {v7.s}[0], [x0], x1
        st1             {v7.s}[1], [x0], x1
        b.eq            0f
        \type           v4, v0, v1
        \type           v5, v2, v3
        st1             {v4.s}[0], [x0], x1
        st1             {v4.s}[1], [x0], x1
        \type           v6, v0, v1
        st1             {v5.s}[0], [x0], x1
        st1             {v5.s}[1], [x0], x1
        \type           v7, v2, v3
        st1             {v6.s}[0], [x0], x1
        st1             {v6.s}[1], [x0], x1
        st1             {v7.s}[0], [x0], x1
        st1             {v7.s}[1], [x0], x1
        ret
8:
        st1             {v4.8b}, [x0], x1
        \type           v6, v0, v1
        st1             {v5.8b}, [x0], x1
        \type           v7, v0, v1
        st1             {v6.8b}, [x0], x1
        subs            w5, w5, #4
        st1             {v7.8b}, [x0], x1
        b.le            0f
        \type           v4, v0, v1
        \type           v5, v2, v3
        b               8b
160:
        trn1            v4.2d, v4.2d, v5.2d
16:
        \type\()16      v5, v0, v1, v2, v3
        st1             {v4.16b}, [x0], x1
        \type\()16      v6, v0, v1, v2, v3
        st1             {v5.16b}, [x0], x1
        \type\()16      v7, v0, v1, v2, v3
        st1             {v6.16b}, [x0], x1
        subs            w5, w5, #4
        st1             {v7.16b}, [x0], x1
        b.le            0f
        \type\()16      v4, v0, v1, v2, v3
        b               16b
320:
        trn1            v4.2d, v4.2d, v5.2d
        add             x7, x0, x1
        lsl             x1, x1, #1
32:
        \type\()16      v5, v0, v1, v2, v3
        \type\()16      v6, v0, v1, v2, v3
        st1             {v4.16b,v5.16b}, [x0], x1
        \type\()16      v7, v0, v1, v2, v3
        subs            w5, w5, #2
        st1             {v6.16b,v7.16b}, [x7], x1
        b.le            0f
        \type\()16      v4, v0, v1, v2, v3
        b               32b
640:
        trn1            v4.2d, v4.2d, v5.2d
        add             x7, x0, x1
        lsl             x1, x1, #1
64:
        \type\()16      v5, v0, v1, v2, v3
        \type\()16      v6, v0, v1, v2, v3
        \type\()16      v7, v0, v1, v2, v3
        \type\()16      v16, v0, v1, v2, v3
        \type\()16      v17, v0, v1, v2, v3
        st1             {v4.16b,v5.16b,v6.16b,v7.16b}, [x0], x1
        \type\()16      v18, v0, v1, v2, v3
        \type\()16      v19, v0, v1, v2, v3
        subs            w5, w5, #2
        st1             {v16.16b,v17.16b,v18.16b,v19.16b}, [x7], x1
        b.le            0f
        \type\()16      v4, v0, v1, v2, v3
        b               64b
1280:
        trn1            v4.2d, v4.2d, v5.2d
        add             x7, x0, #64
128:
        \type\()16      v5, v0, v1, v2, v3
        \type\()16      v6, v0, v1, v2, v3
        \type\()16      v7, v0, v1, v2, v3
        \type\()16      v16, v0, v1, v2, v3
        \type\()16      v17, v0, v1, v2, v3
        st1             {v4.16b,v5.16b,v6.16b,v7.16b}, [x0], x1
        \type\()16      v18, v0, v1, v2, v3
        \type\()16      v19, v0, v1, v2, v3
        subs            w5, w5, #1
        st1             {v16.16b,v17.16b,v18.16b,v19.16b}, [x7], x1
        b.le            0f
        \type\()16      v4, v0, v1, v2, v3
        b               128b
0:
        ret
\type\()_tbl:
        .hword 0, 0
        .hword \type\()_tbl -    4b
        .hword \type\()_tbl -    8b
        .hword \type\()_tbl -  160b
        .hword \type\()_tbl -  320b
        .hword \type\()_tbl -  640b
        .hword \type\()_tbl - 1280b
endfunc
.endm

bidir_fn avg
bidir_fn w_avg
bidir_fn mask
#endif /* BITDEPTH == 8 */
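
For reference, the per-pixel operations the macros above implement can be
modelled in scalar C roughly as follows. This is an illustrative sketch, not
dav1d code; the helper name clip_u8 and the spelled-out rounding constants
are ours, assuming 8 bpc intermediates with 4 extra fractional bits:

#include <stdint.h>

static inline uint8_t clip_u8(const int v) {
    return v < 0 ? 0 : v > 255 ? 255 : (uint8_t) v;
}

/* avg: matches the sqrshrun #5 rounding/shift/saturate */
static uint8_t avg_px(const int16_t a, const int16_t b) {
    return clip_u8((a + b + 16) >> 5);
}

/* w_avg: weight w in [0, 16] applied to the first intermediate */
static uint8_t w_avg_px(const int16_t a, const int16_t b, const int w) {
    return clip_u8((a * w + b * (16 - w) + 128) >> 8);
}

/* mask: per-pixel weight m in [0, 64] applied to the first intermediate */
static uint8_t mask_px(const int16_t a, const int16_t b, const int m) {
    return clip_u8((a * m + b * (64 - m) + 512) >> 10);
}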
src/arm/mc_init.c

/*
* Copyright © 2018, VideoLAN and dav1d authors
* Copyright © 2018, Two Orioles, LLC
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "config.h"
#include "src/mc.h"
#include "src/cpu.h"
decl_avg_fn(dav1d_avg_8bpc_neon);
decl_w_avg_fn(dav1d_w_avg_8bpc_neon);
decl_mask_fn(dav1d_mask_8bpc_neon);

void bitfn(dav1d_mc_dsp_init_arm)(Dav1dMCDSPContext *const c) {
    const unsigned flags = dav1d_get_cpu_flags();

    if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;

#if BITDEPTH == 8 && ARCH_AARCH64
    c->avg = dav1d_avg_8bpc_neon;
    c->w_avg = dav1d_w_avg_8bpc_neon;
    c->mask = dav1d_mask_8bpc_neon;
#endif
}
src/mc.c

@@ -532,7 +532,11 @@ void bitfn(dav1d_mc_dsp_init)(Dav1dMCDSPContext *const c) {
     c->warp8x8 = warp_affine_8x8_c;
     c->warp8x8t = warp_affine_8x8t_c;

-#if HAVE_ASM && ARCH_X86
+#if HAVE_ASM
+#if ARCH_AARCH64 || ARCH_ARM
+    bitfn(dav1d_mc_dsp_init_arm)(c);
+#elif ARCH_X86
     bitfn(dav1d_mc_dsp_init_x86)(c);
+#endif
 #endif
 }
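
For context, bitfn is dav1d's bitdepth-templating macro: each template file is
compiled once per bitdepth, and the macro pastes the matching suffix onto the
name. A minimal sketch of the mechanism (the real definition lives in dav1d's
bitdepth header; this spelling is illustrative):

#if BITDEPTH == 8
#define bitfn(x) x##_8bpc
#else
#define bitfn(x) x##_10bpc
#endif

So the call above resolves to dav1d_mc_dsp_init_arm_8bpc() or
dav1d_mc_dsp_init_arm_10bpc(), matching the declarations added to src/mc.h
below.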
src/mc.h

@@ -101,6 +101,9 @@ typedef struct Dav1dMCDSPContext {
 void dav1d_mc_dsp_init_8bpc(Dav1dMCDSPContext *c);
 void dav1d_mc_dsp_init_10bpc(Dav1dMCDSPContext *c);
+void dav1d_mc_dsp_init_arm_8bpc(Dav1dMCDSPContext *c);
+void dav1d_mc_dsp_init_arm_10bpc(Dav1dMCDSPContext *c);
 void dav1d_mc_dsp_init_x86_8bpc(Dav1dMCDSPContext *c);
 void dav1d_mc_dsp_init_x86_10bpc(Dav1dMCDSPContext *c);
...
src/meson.build

@@ -83,9 +83,11 @@ if is_asm_enabled
        'arm/cpu.c',
    )
    libdav1d_tmpl_sources += files(
        'arm/mc_init.c',
    )
    if host_machine.cpu_family() == 'aarch64'
        libdav1d_tmpl_sources += files(
            'arm/64/mc.S',
        )
    elif host_machine.cpu_family().startswith('arm')
...