Commit ea54dbe2 authored by Martin Storsjö
Browse files

arm64: mc: NEON implementation of emu_edge for 8bpc

Relative speedups over C code:
                     Cortex A53    A72    A73
emu_edge_w4_8bpc_neon:     3.82   2.93   2.41
emu_edge_w8_8bpc_neon:     3.28   2.86   2.51
emu_edge_w16_8bpc_neon:    3.58   3.27   2.63
emu_edge_w32_8bpc_neon:    3.04   1.68   2.12
emu_edge_w64_8bpc_neon:    2.58   1.45   1.48
emu_edge_w128_8bpc_neon:   1.79   1.02   1.57

The benchmark numbers for the larger sizes on A72 fluctuate a
whole lot and thus seem very unreliable.
parent ad392d71
Pipeline #15777 passed with stages
in 5 minutes and 2 seconds
......@@ -3089,3 +3089,161 @@ endfunc
warp , 11
warp t, 7
// void dav1d_emu_edge_8bpc_neon(
// const intptr_t bw, const intptr_t bh,
// const intptr_t iw, const intptr_t ih,
// const intptr_t x, const intptr_t y,
// pixel *dst, const ptrdiff_t dst_stride,
// const pixel *ref, const ptrdiff_t ref_stride)
//
// Fills a bw x bh block at dst. The source position (x, y) is clamped into
// [0, iw - 1] x [0, ih - 1] before ref is addressed. A center_w x center_h
// region is copied from ref, the first/last byte of every copied row is
// replicated into left_ext/right_ext columns, and finally the first/last
// produced row is replicated into top_ext/bottom_ext rows.
//
// Register use on entry (arguments 9 and 10 are read from the stack):
//   x0 = bw    x1 = bh    x2 = iw    x3 = ih
//   x4 = x     x5 = y     x6 = dst   x7 = dst_stride
//   [sp] = ref -> x8      [sp, #8] = ref_stride -> x9
// Register use after the setup code:
//   x1  = center_h (row counter)   x2  = center_w
//   x4  = left_ext                 x11 = right_ext
//   x5  = top_ext                  x10 = bottom_ext
//   x6  = dst + top_ext * dst_stride (current output row)
//   x14 = copy of that x6 (first center row), used by the top fill
//   x3, x12, x13 = scratch
// Written registers: x0-x6, x8, x10-x14, v0, v1 and the flags. No stack
// memory is written and no other function is called.
//
// NOTE(review): every inner loop decrements and tests at the bottom
// (subs ... b.gt), so each body runs at least once and all copies/fills are
// done in whole 16- or 32-byte units. Row copies can therefore read ref and
// write dst beyond the exact byte counts; presumably the callers provide
// buffers that tolerate this - confirm against the caller.
function emu_edge_8bpc_neon, export=1
// Fetch the two stack-passed arguments: x8 = ref, x9 = ref_stride.
ldp x8, x9, [sp]
// ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride)
// ref += iclip(x, 0, iw - 1)
// The y clamp (x12) and the x clamp (x13) are interleaved below. Each csel
// consumes the flags of the cmp immediately preceding it in its own chain;
// sub and bic do not modify the flags.
sub x12, x3, #1 // ih - 1
cmp x5, x3
sub x13, x2, #1 // iw - 1
csel x12, x12, x5, ge // min(y, ih - 1)
cmp x4, x2
// "bic v, v, v, asr #63" clears v when it is negative (v asr 63 is then all
// ones) and leaves it unchanged otherwise, i.e. it computes max(v, 0).
bic x12, x12, x12, asr #63 // max(min(y, ih - 1), 0)
csel x13, x13, x4, ge // min(x, iw - 1)
bic x13, x13, x13, asr #63 // max(min(x, iw - 1), 0)
madd x8, x12, x9, x8 // ref += iclip() * stride
add x8, x8, x13 // ref += iclip()
// bottom_ext = iclip(y + bh - ih, 0, bh - 1)
// top_ext = iclip(-y, 0, bh - 1)
// From here on x5 holds top_ext instead of y, and x10 holds bottom_ext.
add x10, x5, x1 // y + bh
neg x5, x5 // -y
sub x10, x10, x3 // y + bh - ih
sub x12, x1, #1 // bh - 1
cmp x10, x1
bic x5, x5, x5, asr #63 // max(-y, 0)
csel x10, x10, x12, lt // min(y + bh - ih, bh-1)
cmp x5, x1
bic x10, x10, x10, asr #63 // max(min(y + bh - ih, bh-1), 0)
csel x5, x5, x12, lt // min(max(-y, 0), bh-1)
// right_ext = iclip(x + bw - iw, 0, bw - 1)
// left_ext = iclip(-x, 0, bw - 1)
// From here on x4 holds left_ext instead of x, and x11 holds right_ext.
add x11, x4, x0 // x + bw
neg x4, x4 // -x
sub x11, x11, x2 // x + bw - iw
sub x13, x0, #1 // bw - 1
cmp x11, x0
bic x4, x4, x4, asr #63 // max(-x, 0)
csel x11, x11, x13, lt // min(x + bw - iw, bw-1)
cmp x4, x0
bic x11, x11, x11, asr #63 // max(min(x + bw - iw, bw-1), 0)
csel x4, x4, x13, lt // min(max(-x, 0), bw - 1)
// center_h = bh - top_ext - bottom_ext
// dst += top_ext * PXSTRIDE(dst_stride)
// center_w = bw - left_ext - right_ext
// x1 (bh) and x2 (iw) are overwritten with center_h and center_w; x0 keeps bw.
sub x1, x1, x5 // bh - top_ext
madd x6, x5, x7, x6 // dst += top_ext * dst_stride
sub x2, x0, x4 // bw - left_ext
sub x1, x1, x10 // center_h = bh - top_ext - bottom_ext
sub x2, x2, x11 // center_w = bw - left_ext - right_ext
mov x14, x6 // backup of dst
// v_loop need_left, need_right
// Emits the loop over the center rows. Per row it optionally fills the
// left extension, copies the center from ref, and optionally fills the
// right extension; need_left/need_right are assembly-time 0/1 switches.
// In:  x1 = center_h, x2 = center_w, x4 = left_ext, x11 = right_ext,
//      x6 = output row, x7 = dst_stride, x8 = input row, x9 = ref_stride
// Out: x6 and x8 advanced by one stride per processed row; x1 counted down.
// Scratch: x3, x12, x13, v0, v1, flags.
// The numeric labels 0 and 1 are redefined in every expansion; "0b"/"1b"
// always bind to the closest preceding definition.
.macro v_loop need_left, need_right
0:
.if \need_left
// Broadcast the first input byte of this row into all 16 lanes of v0.
ld1r {v0.16b}, [x8]
mov x12, x6 // out = dst
mov x3, x4
1:
// Store 16 bytes per iteration until left_ext bytes are covered. Bytes
// stored past left_ext are rewritten by the center copy below, which starts
// at dst + left_ext and always stores at least 32 bytes.
subs x3, x3, #16
st1 {v0.16b}, [x12], #16
b.gt 1b
.endif
mov x13, x8
add x12, x6, x4 // out = dst + left_ext
mov x3, x2
1:
// Copy the center of the row, 32 bytes per iteration.
ld1 {v0.16b, v1.16b}, [x13], #32
subs x3, x3, #32
st1 {v0.16b, v1.16b}, [x12], #32
b.gt 1b
.if \need_right
// x3 is used as a pointer to the last center byte here, then as a counter.
add x3, x8, x2 // in + center_w
sub x3, x3, #1 // in + center_w - 1
add x12, x6, x4 // dst + left_ext
ld1r {v0.16b}, [x3]
add x12, x12, x2 // out = dst + left_ext + center_w
mov x3, x11
1:
// Store 16 bytes per iteration until right_ext bytes are covered; this
// also overwrites whatever the 32-byte center copy stored past center_w.
subs x3, x3, #16
st1 {v0.16b}, [x12], #16
b.gt 1b
.endif
subs x1, x1, #1 // center_h--
add x6, x6, x7
add x8, x8, x9
b.gt 0b
.endm
// Pick one of the four specialised row loops based on whether left_ext and
// right_ext are zero. Labels 2, 3 and 4 are the targets of the nearest
// following definitions ("2f", "3f", "4f"); every variant ends up at 5.
cbz x4, 2f
// need_left
cbz x11, 3f
// need_left + need_right
v_loop 1, 1
b 5f
2:
// !need_left
cbz x11, 4f
// !need_left + need_right
v_loop 0, 1
b 5f
3:
// need_left + !need_right
v_loop 1, 0
b 5f
4:
// !need_left + !need_right
v_loop 0, 0
5:
// x6 now points at the row following the last center row. This "3f" binds
// to the next "3:" below, i.e. the start of the top handling.
cbz x10, 3f
// need_bottom
// Replicate the last center row of dst downwards, working on 32-byte wide
// column strips. x4 (left_ext) is no longer needed and becomes the
// remaining-width counter.
sub x8, x6, x7 // ref = dst - stride
mov x4, x0
1:
ld1 {v0.16b, v1.16b}, [x8], #32
mov x3, x10
2:
// Store the same 32 bytes into bottom_ext consecutive rows.
subs x3, x3, #1
st1 {v0.16b, v1.16b}, [x6], x7
b.gt 2b
msub x6, x7, x10, x6 // dst -= bottom_ext * stride
subs x4, x4, #32 // bw -= 32
add x6, x6, #32 // dst += 32
b.gt 1b
3:
// This "3f" binds to the final "3:" right before ret.
cbz x5, 3f
// need_top
// Replicate the first center row (x14) upwards, again in 32-byte wide
// column strips. x0 (bw) is consumed as the remaining-width counter.
msub x6, x7, x5, x14 // dst = stored_dst - top_ext * stride
1:
ld1 {v0.16b, v1.16b}, [x14], #32
mov x3, x5
2:
// Store the same 32 bytes into top_ext consecutive rows.
subs x3, x3, #1
st1 {v0.16b, v1.16b}, [x6], x7
b.gt 2b
msub x6, x7, x5, x6 // dst -= top_ext * stride
subs x0, x0, #32 // bw -= 32
add x6, x6, #32 // dst += 32
b.gt 1b
3:
ret
endfunc
......@@ -66,6 +66,8 @@ decl_w_mask_fn(BF(dav1d_w_mask_420, neon));
decl_warp8x8_fn(BF(dav1d_warp_affine_8x8, neon));
decl_warp8x8t_fn(BF(dav1d_warp_affine_8x8t, neon));
decl_emu_edge_fn(BF(dav1d_emu_edge, neon));
void bitfn(dav1d_mc_dsp_init_arm)(Dav1dMCDSPContext *const c) {
#define init_mc_fn(type, name, suffix) \
c->mc[type] = BF(dav1d_put_##name, suffix)
......@@ -110,4 +112,7 @@ void bitfn(dav1d_mc_dsp_init_arm)(Dav1dMCDSPContext *const c) {
c->warp8x8 = BF(dav1d_warp_affine_8x8, neon);
c->warp8x8t = BF(dav1d_warp_affine_8x8t, neon);
#endif
#if BITDEPTH == 8 && ARCH_AARCH64
c->emu_edge = BF(dav1d_emu_edge, neon);
#endif
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment