Commit 1dc2dc7d authored by B Krishnan Iyer

arm64: mc: NEON implementation of the blend, blend_h and blend_v functions

                   	A73	A53
blend_h_w2_8bpc_c:	184.7	301.5
blend_h_w2_8bpc_neon:	58.8	104.1
blend_h_w4_8bpc_c:	291.4	507.3
blend_h_w4_8bpc_neon:	48.7	108.9
blend_h_w8_8bpc_c:	510.1	992.7
blend_h_w8_8bpc_neon:	66.5	99.3
blend_h_w16_8bpc_c:	972	1835.3
blend_h_w16_8bpc_neon:	82.7	145.2
blend_h_w32_8bpc_c:	776.7	912.9
blend_h_w32_8bpc_neon:	155.1	266.9
blend_h_w64_8bpc_c:	1424.3	1635.4
blend_h_w64_8bpc_neon:	273.4	480.9
blend_h_w128_8bpc_c:	3318.1	3774
blend_h_w128_8bpc_neon:	614.1	1097.9
blend_v_w2_8bpc_c:	278.8	427.5
blend_v_w2_8bpc_neon:	113.7	170.4
blend_v_w4_8bpc_c:	960.2	1597.7
blend_v_w4_8bpc_neon:	222.9	351.4
blend_v_w8_8bpc_c:	1694.2	3333.5
blend_v_w8_8bpc_neon:	200.9	333.6
blend_v_w16_8bpc_c:	3115.2	5971.6
blend_v_w16_8bpc_neon:	233.2	494.8
blend_v_w32_8bpc_c:	3949.7	6070.6
blend_v_w32_8bpc_neon:	460.4	841.6
blend_w4_8bpc_c:	244.2	388.3
blend_w4_8bpc_neon:	25.5	66.7
blend_w8_8bpc_c:	616.3	1120.8
blend_w8_8bpc_neon:	46	110.7
blend_w16_8bpc_c:	2193.1	4056.4
blend_w16_8bpc_neon:	140.7	299.3
blend_w32_8bpc_c:	2502.8	2998.5
blend_w32_8bpc_neon:	381.4	725.3
parent d20d70e8
Pipeline #8918 passed with stages
in 6 minutes and 2 seconds
......@@ -234,6 +234,413 @@ bidir_fn w_avg
bidir_fn mask
// blend_8bpc_neon
//
// Blends a block of temporary pixels into the destination using a per-pixel
// mask, two rows per loop iteration:
//
//     dst[x] = (tmp[x] * m[x] + dst[x] * (64 - m[x]) + 32) >> 6
//
// Register usage on entry, as read by the code below (presumably this mirrors
// the C blend prototype -- confirm against the C declaration):
//   x0 = destination pixels (read and written)
//   x1 = destination stride in bytes, used as a full 64-bit value
//   x2 = tmp pixels, read contiguously (w bytes per row)
//   w3 = width; clz(w) - 26 indexes the jump table (32, 16, 8, 4)
//   w4 = height; decremented by 2 per iteration, loop runs while > 0
//   x5 = mask, read contiguously, one byte per pixel
//
// Scratch: x6 (jump target), x8 (pointer to the second row of each pair),
// v0-v7 and v16-v31. v4 holds the constant 64 in every byte lane.
function blend_8bpc_neon, export=1
        adr             x6,  L(blend_tbl)
        clz             w3,  w3
        sub             w3,  w3,  #26                   // table index = clz(w) - 26
        ldrh            w3,  [x6,  x3,  lsl #1]
        sub             x6,  x6,  w3,  uxtw             // entries store (table - label)
        movi            v4.16b,  #64
        add             x8,  x0,  x1                    // x8 = second row of the pair
        // Double the stride in the 64-bit register. The W form would zero the
        // upper 32 bits of x1 and break negative strides; blend_h and blend_v
        // use the same X-register form.
        lsl             x1,  x1,  #1
        br              x6

// w == 4: 8 mask bytes and 8 tmp bytes cover two rows of 4 pixels.
4:
        ld1             {v2.d}[0],   [x5],  #8          // mask, rows 0 and 1
        ld1             {v1.d}[0],   [x2],  #8          // tmp,  rows 0 and 1
        ld1             {v0.s}[0],   [x0]               // dst row 0
        subs            w4,  w4,  #2
        ld1             {v0.s}[1],   [x8]               // dst row 1
        sub             v3.8b,   v4.8b,   v2.8b         // 64 - m
        umull           v5.8h,   v1.8b,   v2.8b         // tmp * m
        umlal           v5.8h,   v0.8b,   v3.8b         // + dst * (64 - m)
        rshrn           v6.8b,   v5.8h,   #6            // rounding >> 6
        st1             {v6.s}[0],   [x0],  x1
        st1             {v6.s}[1],   [x8],  x1
        b.gt            4b
        ret

// w == 8: one 16-byte vector holds two rows of 8 pixels.
8:
        ld1             {v2.2d},  [x5],  #16            // mask, rows 0 and 1
        ld1             {v1.2d},  [x2],  #16            // tmp,  rows 0 and 1
        ld1             {v0.d}[0],   [x0]               // dst row 0
        ld1             {v0.d}[1],   [x8]               // dst row 1
        sub             v3.16b,  v4.16b,  v2.16b        // 64 - m
        subs            w4,  w4,  #2
        umull           v5.8h,   v1.8b,   v2.8b
        umlal           v5.8h,   v0.8b,   v3.8b
        umull2          v6.8h,   v1.16b,  v2.16b
        umlal2          v6.8h,   v0.16b,  v3.16b
        rshrn           v7.8b,   v5.8h,   #6
        rshrn2          v7.16b,  v6.8h,   #6
        st1             {v7.d}[0],   [x0],  x1
        st1             {v7.d}[1],   [x8],  x1
        b.gt            8b
        ret

// w == 16: one vector per row; v1/v5/v0 are row 0, v2/v6/v3 are row 1.
16:
        ld1             {v1.2d,  v2.2d},  [x5],  #32    // mask rows 0, 1
        ld1             {v5.2d,  v6.2d},  [x2],  #32    // tmp  rows 0, 1
        ld1             {v0.2d},  [x0]                  // dst row 0
        subs            w4,  w4,  #2
        sub             v7.16b,  v4.16b,  v1.16b        // 64 - m, row 0
        sub             v20.16b, v4.16b,  v2.16b        // 64 - m, row 1
        ld1             {v3.2d},  [x8]                  // dst row 1
        umull           v16.8h,  v5.8b,   v1.8b
        umlal           v16.8h,  v0.8b,   v7.8b
        umull2          v17.8h,  v5.16b,  v1.16b
        umlal2          v17.8h,  v0.16b,  v7.16b
        umull           v21.8h,  v6.8b,   v2.8b
        umlal           v21.8h,  v3.8b,   v20.8b
        umull2          v22.8h,  v6.16b,  v2.16b
        umlal2          v22.8h,  v3.16b,  v20.16b
        rshrn           v18.8b,  v16.8h,  #6
        rshrn2          v18.16b, v17.8h,  #6
        rshrn           v19.8b,  v21.8h,  #6
        rshrn2          v19.16b, v22.8h,  #6
        st1             {v18.2d}, [x0],  x1
        st1             {v19.2d}, [x8],  x1
        b.gt            16b
        ret

// w == 32: two vectors per row. v1 and v21 are reused as accumulators once
// their mask/dst contents have been consumed.
32:
        ld1             {v0.2d,  v1.2d,  v2.2d,  v3.2d},  [x5],  #64   // mask rows 0, 1
        ld1             {v16.2d, v17.2d, v18.2d, v19.2d}, [x2],  #64   // tmp  rows 0, 1
        ld1             {v20.2d, v21.2d}, [x0]          // dst row 0
        subs            w4,  w4,  #2
        ld1             {v22.2d, v23.2d}, [x8]          // dst row 1
        sub             v5.16b,  v4.16b,  v0.16b        // 64 - m for each mask vector
        sub             v6.16b,  v4.16b,  v1.16b
        sub             v30.16b, v4.16b,  v2.16b
        sub             v31.16b, v4.16b,  v3.16b
        umull           v24.8h,  v16.8b,  v0.8b
        umlal           v24.8h,  v20.8b,  v5.8b
        umull2          v26.8h,  v16.16b, v0.16b
        umlal2          v26.8h,  v20.16b, v5.16b
        umull           v28.8h,  v17.8b,  v1.8b
        umlal           v28.8h,  v21.8b,  v6.8b
        umull2          v7.8h,   v17.16b, v1.16b
        umlal2          v7.8h,   v21.16b, v6.16b
        umull           v27.8h,  v18.8b,  v2.8b
        umlal           v27.8h,  v22.8b,  v30.8b
        umull2          v1.8h,   v18.16b, v2.16b
        umlal2          v1.8h,   v22.16b, v30.16b
        umull           v29.8h,  v19.8b,  v3.8b
        umlal           v29.8h,  v23.8b,  v31.8b
        umull2          v21.8h,  v19.16b, v3.16b
        umlal2          v21.8h,  v23.16b, v31.16b
        rshrn           v24.8b,  v24.8h,  #6
        rshrn2          v24.16b, v26.8h,  #6
        rshrn           v25.8b,  v28.8h,  #6
        rshrn2          v25.16b, v7.8h,   #6
        rshrn           v27.8b,  v27.8h,  #6
        rshrn2          v27.16b, v1.8h,   #6
        rshrn           v28.8b,  v29.8h,  #6
        rshrn2          v28.16b, v21.8h,  #6
        st1             {v24.2d, v25.2d}, [x0],  x1
        st1             {v27.2d, v28.2d}, [x8],  x1
        b.gt            32b
        ret

// Jump table indexed by clz(w) - 26; each entry is the backwards distance
// from the table to the corresponding label.
L(blend_tbl):
        .hword L(blend_tbl) - 32b
        .hword L(blend_tbl) - 16b
        .hword L(blend_tbl) -  8b
        .hword L(blend_tbl) -  4b
endfunc
// blend_h_8bpc_neon
//
// Blends tmp into dst with one mask byte per row, taken from the obmc_masks
// table starting at offset h. Two rows are handled per iteration:
//
//     dst[x] = (tmp[x] * m[row] + dst[x] * (64 - m[row]) + 32) >> 6
//
// Register usage on entry, as read by the code below (presumably this mirrors
// the C blend_h prototype -- confirm against the C declaration):
//   x0 = destination pixels (read and written)
//   x1 = destination stride in bytes
//   x2 = tmp pixels, read contiguously (w bytes per row)
//   w3 = width; clz(w) - 24 indexes the jump table (128, 64, 32, 16, 8, 4, 2)
//   w4 = height; only h - (h >> 2) rows are processed
//
// Scratch: x5 (mask pointer), x6, x7, x8, v0-v7 and v16-v31.
// v4 holds the constant 64 in every byte lane.
function blend_h_8bpc_neon, export=1
adr x6, L(blend_h_tbl)
movrel x5, X(obmc_masks)
// x5 = &obmc_masks[h]
add x5, x5, w4, uxtw
// Row count becomes h - (h >> 2).
sub w4, w4, w4, lsr #2
clz w7, w3
movi v4.16b, #64
// x8 = second row of each pair; x1 becomes two strides.
add x8, x0, x1
lsl x1, x1, #1
// Table index = clz(w) - 24; entries store (table - label).
sub w7, w7, #24
ldrh w7, [x6, x7, lsl #1]
sub x6, x6, w7, uxtw
br x6
// w == 2: load two mask bytes and duplicate each (zip1) so that bytes 0-1
// carry the row 0 mask and bytes 2-3 the row 1 mask.
2:
ld1 {v0.h}[0], [x5], #2
ld1 {v1.s}[0], [x2], #4
subs w4, w4, #2
ld1 {v2.h}[0], [x0]
zip1 v0.8b, v0.8b, v0.8b
sub v3.8b, v4.8b, v0.8b
ld1 {v2.h}[1], [x8]
umull v5.8h, v1.8b, v0.8b
umlal v5.8h, v2.8b, v3.8b
rshrn v5.8b, v5.8h, #6
st1 {v5.h}[0], [x0], x1
st1 {v5.h}[1], [x8], x1
b.gt 2b
ret
// w == 4: ld2r broadcasts the two row masks into v0 and v1; ext #4 then
// builds {m0 x4, m1 x4} to match the two 4-pixel rows packed in one vector.
4:
ld2r {v0.8b, v1.8b}, [x5], #2
ld1 {v2.2s}, [x2], #8
subs w4, w4, #2
ext v0.8b, v0.8b, v1.8b, #4
ld1 {v3.s}[0], [x0]
sub v5.8b, v4.8b, v0.8b
ld1 {v3.s}[1], [x8]
umull v6.8h, v2.8b, v0.8b
umlal v6.8h, v3.8b, v5.8b
rshrn v6.8b, v6.8h, #6
st1 {v6.s}[0], [x0], x1
st1 {v6.s}[1], [x8], x1
b.gt 4b
ret
// w == 8: same scheme with 16-byte vectors; ext #8 builds {m0 x8, m1 x8}.
8:
ld2r {v0.16b, v1.16b}, [x5], #2
ld1 {v2.16b}, [x2], #16
ld1 {v3.d}[0], [x0]
ext v0.16b, v0.16b, v1.16b, #8
sub v5.16b, v4.16b, v0.16b
ld1 {v3.d}[1], [x8]
subs w4, w4, #2
umull v6.8h, v0.8b, v2.8b
umlal v6.8h, v3.8b, v5.8b
umull2 v7.8h, v0.16b, v2.16b
umlal2 v7.8h, v3.16b, v5.16b
rshrn v16.8b, v6.8h, #6
rshrn2 v16.16b, v7.8h, #6
st1 {v16.d}[0], [x0], x1
st1 {v16.d}[1], [x8], x1
b.gt 8b
ret
// w == 16: v0 is the row 0 mask and v1 the row 1 mask, each broadcast to all
// lanes; v2/v5 are tmp/dst for row 0, v3/v6 for row 1.
16:
ld2r {v0.16b, v1.16b}, [x5], #2
ld1 {v2.16b, v3.16b}, [x2], #32
ld1 {v5.16b}, [x0]
sub v7.16b, v4.16b, v0.16b
sub v16.16b, v4.16b, v1.16b
ld1 {v6.16b}, [x8]
subs w4, w4, #2
umull v17.8h, v0.8b, v2.8b
umlal v17.8h, v5.8b, v7.8b
umull2 v18.8h, v0.16b, v2.16b
umlal2 v18.8h, v5.16b, v7.16b
umull v19.8h, v1.8b, v3.8b
umlal v19.8h, v6.8b, v16.8b
umull2 v20.8h, v1.16b, v3.16b
umlal2 v20.8h, v6.16b, v16.16b
rshrn v21.8b, v17.8h, #6
rshrn2 v21.16b, v18.8h, #6
rshrn v22.8b, v19.8h, #6
rshrn2 v22.16b, v20.8h, #6
st1 {v21.16b}, [x0], x1
st1 {v22.16b}, [x8], x1
b.gt 16b
ret
// w == 128, 64 and 32 share one path that walks each row pair in 32-pixel
// steps.
1280:
640:
320:
// The inner loop advances x0/x8 by w in total, so remove w from the
// two-row stride. x7 = tmp pointer for the second row of the pair.
sub x1, x1, w3, uxtw
add x7, x2, w3, uxtw
// Per row pair: load the two row masks and reset the column counter to w.
321:
ld2r {v0.16b, v1.16b}, [x5], #2
mov w6, w3
sub v20.16b, v4.16b, v0.16b
sub v21.16b, v4.16b, v1.16b
// Inner loop: 32 pixels of row 0 (x2 -> x0) and of row 1 (x7 -> x8).
32:
ld1 {v16.16b, v17.16b}, [x2], #32
ld1 {v2.16b, v3.16b}, [x0]
subs w6, w6, #32
umull v23.8h, v0.8b, v16.8b
umlal v23.8h, v2.8b, v20.8b
ld1 {v18.16b, v19.16b}, [x7], #32
umull2 v27.8h, v0.16b, v16.16b
umlal2 v27.8h, v2.16b, v20.16b
ld1 {v6.16b, v7.16b}, [x8]
umull v24.8h, v0.8b, v17.8b
umlal v24.8h, v3.8b, v20.8b
umull2 v28.8h, v0.16b, v17.16b
umlal2 v28.8h, v3.16b, v20.16b
umull v25.8h, v1.8b, v18.8b
umlal v25.8h, v6.8b, v21.8b
umull2 v5.8h, v1.16b, v18.16b
umlal2 v5.8h, v6.16b, v21.16b
rshrn v29.8b, v23.8h, #6
rshrn2 v29.16b, v27.8h, #6
umull v26.8h, v1.8b, v19.8b
umlal v26.8h, v7.8b, v21.8b
umull2 v31.8h, v1.16b, v19.16b
umlal2 v31.8h, v7.16b, v21.16b
rshrn v30.8b, v24.8h, #6
rshrn2 v30.16b, v28.8h, #6
// v23/v24 are reused for the narrowed row 1 result once their wide
// accumulators have been consumed above.
rshrn v23.8b, v25.8h, #6
rshrn2 v23.16b, v5.8h, #6
rshrn v24.8b, v26.8h, #6
st1 {v29.16b, v30.16b}, [x0], #32
rshrn2 v24.16b, v31.8h, #6
st1 {v23.16b, v24.16b}, [x8], #32
b.gt 32b
// End of the row pair: step dst down two rows (minus the w already
// advanced) and skip each tmp pointer over the row the other one consumed.
subs w4, w4, #2
add x0, x0, x1
add x8, x8, x1
add x2, x2, w3, uxtw
add x7, x7, w3, uxtw
b.gt 321b
ret
// Jump table indexed by clz(w) - 24; each entry is the backwards distance
// from the table to the corresponding label.
L(blend_h_tbl):
.hword L(blend_h_tbl) - 1280b
.hword L(blend_h_tbl) - 640b
.hword L(blend_h_tbl) - 320b
.hword L(blend_h_tbl) - 16b
.hword L(blend_h_tbl) - 8b
.hword L(blend_h_tbl) - 4b
.hword L(blend_h_tbl) - 2b
endfunc
// blend_v_8bpc_neon
//
// Blends tmp into dst with one mask byte per column, taken from the
// obmc_masks table starting at offset w. The mask is loaded once before the
// row loop and reused for every row; two rows are handled per iteration:
//
//     dst[x] = (tmp[x] * m[x] + dst[x] * (64 - m[x]) + 32) >> 6
//
// Only the leftmost part of each row is stored: 1, 3, 6, 12 or 24 bytes for
// w = 2, 4, 8, 16 or 32. tmp is still consumed at w bytes per row.
//
// Register usage on entry, as read by the code below (presumably this mirrors
// the C blend_v prototype -- confirm against the C declaration):
//   x0 = destination pixels (read and written)
//   x1 = destination stride in bytes
//   x2 = tmp pixels, read contiguously (w bytes per row)
//   w3 = width; clz(w) - 26 indexes the jump table (32, 16, 8, 4, 2)
//   w4 = height; decremented by 2 per iteration, loop runs while > 0
//
// Scratch: x5 (mask pointer), x6, x8, v0-v7 and v16-v31.
// v4 holds the constant 64 in every byte lane.
function blend_v_8bpc_neon, export=1
adr x6, L(blend_v_tbl)
movrel x5, X(obmc_masks)
// x5 = &obmc_masks[w]
add x5, x5, w3, uxtw
clz w3, w3
movi v4.16b, #64
// x8 = second row of each pair; x1 becomes two strides.
add x8, x0, x1
lsl x1, x1, #1
// Table index = clz(w) - 26; entries store (table - label).
sub w3, w3, #26
ldrh w3, [x6, x3, lsl #1]
sub x6, x6, w3, uxtw
br x6
// w == 2: a single mask byte is broadcast; one pixel per row is stored.
20:
ld1r {v0.8b}, [x5]
sub v1.8b, v4.8b, v0.8b
2:
// Byte 0 = row 0 pixel 0. The second byte of this halfword load is
// overwritten below with row 1 pixel 0.
ld1 {v2.h}[0], [x2], #2
ld1 {v3.b}[0], [x0]
subs w4, w4, #2
ld1 {v2.b}[1], [x2]
ld1 {v3.b}[1], [x8]
umull v5.8h, v2.8b, v0.8b
umlal v5.8h, v3.8b, v1.8b
rshrn v5.8b, v5.8h, #6
// Skip the rest of tmp row 1.
add x2, x2, #2
st1 {v5.b}[0], [x0], x1
st1 {v5.b}[1], [x8], x1
b.gt 2b
ret
// w == 4: the 4 mask bytes are replicated into both halves of v0 to match
// the two rows packed in one 8-byte vector; 3 pixels per row are stored.
40:
ld1r {v0.2s}, [x5]
sub v1.8b, v4.8b, v0.8b
// The stores below post-increment by 2 + 1 bytes; compensate in the stride.
sub x1, x1, #3
4:
ld1 {v2.8b}, [x2], #8
ld1 {v3.s}[0], [x0]
ld1 {v3.s}[1], [x8]
subs w4, w4, #2
umull v5.8h, v2.8b, v0.8b
umlal v5.8h, v3.8b, v1.8b
rshrn v5.8b, v5.8h, #6
// Row 0 lives in bytes 0-3, row 1 in bytes 4-7.
st1 {v5.h}[0], [x0], #2
st1 {v5.h}[2], [x8], #2
st1 {v5.b}[2], [x0], #1
st1 {v5.b}[6], [x8], #1
add x0, x0, x1
add x8, x8, x1
b.gt 4b
ret
// w == 8: the 8 mask bytes are replicated into both halves of v0; 6 pixels
// per row are stored.
80:
ld1r {v0.2d}, [x5]
sub v1.16b, v4.16b, v0.16b
// The stores below post-increment by 4 + 2 bytes; compensate in the stride.
sub x1, x1, #6
8:
ld1 {v2.16b}, [x2], #16
ld1 {v3.d}[0], [x0]
ld1 {v3.d}[1], [x8]
subs w4, w4, #2
umull v5.8h, v0.8b, v2.8b
umlal v5.8h, v3.8b, v1.8b
umull2 v6.8h, v0.16b, v2.16b
umlal2 v6.8h, v3.16b, v1.16b
rshrn v7.8b, v5.8h, #6
rshrn2 v7.16b, v6.8h, #6
// Row 0 lives in bytes 0-7, row 1 in bytes 8-15.
st1 {v7.s}[0], [x0], #4
st1 {v7.s}[2], [x8], #4
st1 {v7.h}[2], [x0], #2
st1 {v7.h}[6], [x8], #2
add x0, x0, x1
add x8, x8, x1
b.gt 8b
ret
// w == 16: 16 mask bytes in v0, shared by both rows; 12 pixels per row are
// stored.
160:
ld1 {v0.16b}, [x5]
sub v2.16b, v4.16b, v0.16b
// The stores below post-increment by 8 + 4 bytes; compensate in the stride.
sub x1, x1, #12
16:
// v5/v7 = tmp/dst row 0, v6/v16 = tmp/dst row 1.
ld1 {v5.16b, v6.16b}, [x2], #32
ld1 {v7.16b}, [x0]
subs w4, w4, #2
ld1 {v16.16b}, [x8]
umull v17.8h, v5.8b, v0.8b
umlal v17.8h, v7.8b, v2.8b
umull2 v18.8h, v5.16b, v0.16b
umlal2 v18.8h, v7.16b, v2.16b
umull v20.8h, v6.8b, v0.8b
umlal v20.8h, v16.8b, v2.8b
umull2 v21.8h, v6.16b, v0.16b
umlal2 v21.8h, v16.16b, v2.16b
rshrn v19.8b, v17.8h, #6
rshrn2 v19.16b, v18.8h, #6
rshrn v22.8b, v20.8h, #6
rshrn2 v22.16b, v21.8h, #6
st1 {v19.8b}, [x0], #8
st1 {v22.8b}, [x8], #8
st1 {v19.s}[2], [x0], #4
st1 {v22.s}[2], [x8], #4
add x0, x0, x1
add x8, x8, x1
b.gt 16b
ret
// w == 32: 32 mask bytes in v0/v1, shared by both rows; 24 pixels per row
// are stored.
320:
ld1 {v0.16b, v1.16b}, [x5]
sub v2.16b, v4.16b, v0.16b
sub v3.16b, v4.16b, v1.16b
// The stores below post-increment by 16 + 8 bytes; compensate in the stride.
sub x1, x1, #24
32:
// v16/v17 = tmp row 0, v18/v19 = tmp row 1; v5/v6 = dst row 0,
// v20/v21 = dst row 1.
ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x2], #64
ld1 {v5.16b, v6.16b}, [x0]
subs w4, w4, #2
ld1 {v20.16b, v21.16b}, [x8]
umull v22.8h, v16.8b, v0.8b
umlal v22.8h, v5.8b, v2.8b
umull2 v23.8h, v16.16b, v0.16b
umlal2 v23.8h, v5.16b, v2.16b
umull v28.8h, v17.8b, v1.8b
umlal v28.8h, v6.8b, v3.8b
umull2 v29.8h, v17.16b, v1.16b
umlal2 v29.8h, v6.16b, v3.16b
umull v30.8h, v18.8b, v0.8b
umlal v30.8h, v20.8b, v2.8b
umull2 v31.8h, v18.16b, v0.16b
umlal2 v31.8h, v20.16b, v2.16b
umull v25.8h, v19.8b, v1.8b
umlal v25.8h, v21.8b, v3.8b
umull2 v26.8h, v19.16b, v1.16b
umlal2 v26.8h, v21.16b, v3.16b
rshrn v24.8b, v22.8h, #6
rshrn2 v24.16b, v23.8h, #6
rshrn v28.8b, v28.8h, #6
rshrn2 v28.16b, v29.8h, #6
rshrn v30.8b, v30.8h, #6
rshrn2 v30.16b, v31.8h, #6
rshrn v27.8b, v25.8h, #6
rshrn2 v27.16b, v26.8h, #6
// Row 0: v24 (pixels 0-15) then the low half of v28 (pixels 16-23).
// Row 1: v30 (pixels 0-15) then the low half of v27 (pixels 16-23).
st1 {v24.16b}, [x0], #16
st1 {v30.16b}, [x8], #16
st1 {v28.8b}, [x0], #8
st1 {v27.8b}, [x8], #8
add x0, x0, x1
add x8, x8, x1
b.gt 32b
ret
// Jump table indexed by clz(w) - 26; each entry is the backwards distance
// from the table to the corresponding setup label.
L(blend_v_tbl):
.hword L(blend_v_tbl) - 320b
.hword L(blend_v_tbl) - 160b
.hword L(blend_v_tbl) - 80b
.hword L(blend_v_tbl) - 40b
.hword L(blend_v_tbl) - 20b
endfunc
// This has got the same signature as the put_8tap functions,
// and assumes that x8 is set to (clz(w)-24).
function put_neon
......
......@@ -101,13 +101,13 @@ void bitfn(dav1d_mc_dsp_init_arm)(Dav1dMCDSPContext *const c) {
c->avg = dav1d_avg_8bpc_neon;
c->w_avg = dav1d_w_avg_8bpc_neon;
c->mask = dav1d_mask_8bpc_neon;
c->blend = dav1d_blend_8bpc_neon;
c->blend_h = dav1d_blend_h_8bpc_neon;
c->blend_v = dav1d_blend_v_8bpc_neon;
#if ARCH_AARCH64
c->warp8x8 = dav1d_warp_affine_8x8_8bpc_neon;
c->warp8x8t = dav1d_warp_affine_8x8t_8bpc_neon;
#elif ARCH_ARM
c->blend = dav1d_blend_8bpc_neon;
c->blend_h = dav1d_blend_h_8bpc_neon;
c->blend_v = dav1d_blend_v_8bpc_neon;
c->w_mask[0] = dav1d_w_mask_444_8bpc_neon;
c->w_mask[1] = dav1d_w_mask_422_8bpc_neon;
c->w_mask[2] = dav1d_w_mask_420_8bpc_neon;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment