Commit 3d94fb9a authored by B Krishnan Iyer

arm64: mc: NEON implementation of w_mask_444/422/420 functions

	                        A73	        A53

w_mask_420_w4_8bpc_c:	        818	        1082.9
w_mask_420_w4_8bpc_neon:	79	        126.6
w_mask_420_w8_8bpc_c:	        2486	        3399.8
w_mask_420_w8_8bpc_neon:	200.2	        343.7
w_mask_420_w16_8bpc_c:	        8022.3	        10989.6
w_mask_420_w16_8bpc_neon:	528.1   	889
w_mask_420_w32_8bpc_c:	        31851.8	        42808.6
w_mask_420_w32_8bpc_neon:	2062.5	        3380.8
w_mask_420_w64_8bpc_c:	        79268.5	        102683.9
w_mask_420_w64_8bpc_neon:	5252.9	        8575.4
w_mask_420_w128_8bpc_c:	        193704.1	255586.5
w_mask_420_w128_8bpc_neon:	14602.3	        22167.7

w_mask_422_w4_8bpc_c:	        777.3	        1038.5
w_mask_422_w4_8bpc_neon:	72.1	        112.9
w_mask_422_w8_8bpc_c:	        2405.7	        3168
w_mask_422_w8_8bpc_neon:	191.9	        314.1
w_mask_422_w16_8bpc_c:	        7783.7	        10543.9
w_mask_422_w16_8bpc_neon:	559.8	        835.5
w_mask_422_w32_8bpc_c:	        30895.7	        41141.2
w_mask_422_w32_8bpc_neon:	2089.7	        3187.2
w_mask_422_w64_8bpc_c:	        75500.2	        98766.3
w_mask_422_w64_8bpc_neon:	5379	        8208.2
w_mask_422_w128_8bpc_c:	        186967.1	245809.1
w_mask_422_w128_8bpc_neon:	15159.9	        21474.5

w_mask_444_w4_8bpc_c:	        850.1	        1136.6
w_mask_444_w4_8bpc_neon:	66.5	        104.7
w_mask_444_w8_8bpc_c:	        2373.5	        3262.9
w_mask_444_w8_8bpc_neon:	180.5	        290.2
w_mask_444_w16_8bpc_c:	        7291.6	        10590.7
w_mask_444_w16_8bpc_neon:	550.9	        809.7
w_mask_444_w32_8bpc_c:	        8048.3	        10140.8
w_mask_444_w32_8bpc_neon:	2136.2	        3095
w_mask_444_w64_8bpc_c:	        18055.3	        23060
w_mask_444_w64_8bpc_neon:	5522.5	        8124.8
w_mask_444_w128_8bpc_c:	        42754.3	        56072
w_mask_444_w128_8bpc_neon:	15569.5	        21531.5
parent 1dc2dc7d
Pipeline #9127 passed with stages
in 8 minutes and 47 seconds
......@@ -234,6 +234,228 @@ bidir_fn w_avg
bidir_fn mask
// w_mask_fn type: emits the function w_mask_<type>_8bpc_neon for
// type = 444, 422 or 420.
//
// The function blends two 16-bit intermediate buffers into 8-bit pixels,
// weighting each pixel by a value derived from the absolute difference of
// the two inputs, and also writes that weight out as a mask: one byte per
// pixel for 444, one byte per horizontal pixel pair for 422, one byte per
// 2x2 pixel block for 420.
//
// Register use as seen in the body (argument names are inferred from usage;
// confirm against the C prototype):
//   x0  = dst (8-bit output); x12 = dst + stride (the other row of each pair)
//   x1  = dst stride in bytes; doubled on entry so x0/x12 step two rows
//   x2  = tmp1, x3 = tmp2 (16-bit inputs, read linearly)
//   w4  = width, selects the code path through the jump table
//   w5  = height, used as the row counter
//   x6  = mask output pointer
//   w7  = small bias subtracted before the subsampled mask is rounded
//         (presumably the sign argument; confirm against the caller)
// Scratch: x7-x11 (w7 is consumed before x7 is reused), v0-v7, v16-v27.
.macro w_mask_fn type
function w_mask_\type\()_8bpc_neon, export=1
// Jump table dispatch: index = clz(w) - 24, i.e. 0 for w = 128 up to 5 for
// w = 4. Each table entry is the distance from the table back to a label.
clz w8, w4
adr x9, L(w_mask_\type\()_tbl)
sub w8, w8, #24
ldrh w8, [x9, x8, lsl #1]
sub x9, x9, w8, uxtw
// v0 = 6903 in every halfword lane; the mask is derived from
// sat(6903 - abs(tmp1 - tmp2)) >> 8, called n in the comments below.
mov w10, #6903
dup v0.8h, w10
.if \type == 444
// 444 stores m = 64 - n for every pixel.
movi v1.16b, #64
.elseif \type == 422
// v3 = 129 - w7 per byte. With s = n0 + n1 for a horizontal pair, the stored
// value is (v3 - s) >> 1 = (m0 + m1 + 1 - w7) >> 1.
dup v2.8b, w7
movi v3.8b, #129
sub v3.8b, v3.8b, v2.8b
.elseif \type == 420
// v3 = 256 - w7 per halfword. With s = sum of n over a 2x2 block, the stored
// value is (v3 - s + 2) >> 2 = (m0 + m1 + m2 + m3 + 2 - w7) >> 2.
dup v2.8h, w7
movi v3.8h, #1, lsl #8
sub v3.8h, v3.8h, v2.8h
.endif
// x12 addresses the odd rows, x0 the even rows; both advance by 2 * stride.
add x12, x0, x1
lsl x1, x1, #1
br x9
// ---- w == 4: four rows (16 pixels) per iteration ----
4:
ld1 {v4.8h, v5.8h}, [x2], #32 // tmp1 (four rows at once)
ld1 {v6.8h, v7.8h}, [x3], #32 // tmp2 (four rows at once)
// Flags set here are consumed by the b.gt at the end of the loop; nothing in
// between writes the condition flags.
subs w5, w5, #4
// v16/v17 = tmp2 - tmp1 (signed difference used for the blend)
sub v16.8h, v6.8h, v4.8h
sub v17.8h, v7.8h, v5.8h
// v18/v19 = abs(tmp1 - tmp2)
sabd v18.8h, v4.8h, v6.8h
sabd v19.8h, v5.8h, v7.8h
// n = sat(6903 - abs diff) >> 8; the unsigned saturating subtract clamps at 0
uqsub v18.8h, v0.8h, v18.8h
uqsub v19.8h, v0.8h, v19.8h
ushr v18.8h, v18.8h, #8
ushr v19.8h, v19.8h, #8
// sqdmulh yields (2 * a * b) >> 16, so with a = n << 9 the product is
// (n * (tmp2 - tmp1)) >> 6.
shl v20.8h, v18.8h, #9
shl v21.8h, v19.8h, #9
sqdmulh v20.8h, v20.8h, v16.8h
sqdmulh v21.8h, v21.8h, v17.8h
// tmp1 + ((n * (tmp2 - tmp1)) >> 6)
add v20.8h, v20.8h, v4.8h
add v21.8h, v21.8h, v5.8h
// Rounding shift right by 4 with saturation to unsigned 8 bit.
// v22 = rows 0 and 1, v23 = rows 2 and 3 (4 pixels each).
sqrshrun v22.8b, v20.8h, #4
sqrshrun v23.8b, v21.8h, #4
.if \type == 444
// Narrow n to bytes and store m = 64 - n: 16 mask bytes for 4 rows.
xtn v18.8b, v18.8h
xtn2 v18.16b, v19.8h
sub v18.16b, v1.16b, v18.16b
st1 {v18.16b}, [x6], #16
.elseif \type == 422
// Sum horizontally adjacent pairs, then halve against v3: 2 mask bytes per
// row, 8 bytes for 4 rows.
addp v18.8h, v18.8h, v19.8h
xtn v18.8b, v18.8h
uhsub v18.8b, v3.8b, v18.8b
st1 {v18.8b}, [x6], #8
.elseif \type == 420
// v24 = rows 0 and 2, v25 = rows 1 and 3, so the add sums vertically
// adjacent rows; addp then sums horizontal pairs, giving four 2x2 sums in
// the low half. Only those 4 lanes are used: 4 mask bytes for 4 rows.
trn1 v24.2d, v18.2d, v19.2d
trn2 v25.2d, v18.2d, v19.2d
add v24.8h, v24.8h, v25.8h
addp v18.8h, v24.8h, v24.8h
sub v18.4h, v3.4h, v18.4h
rshrn v18.8b, v18.8h, #2
st1 {v18.s}[0], [x6], #4
.endif
// Rows 0 and 2 go through x0, rows 1 and 3 through x12.
st1 {v22.s}[0], [x0], x1
st1 {v22.s}[1], [x12], x1
st1 {v23.s}[0], [x0], x1
st1 {v23.s}[1], [x12], x1
b.gt 4b
ret
// ---- w == 8: two rows (16 pixels) per iteration ----
// Same arithmetic as the w == 4 loop; v4/v6 hold the first row of the pair
// and v5/v7 the second.
8:
ld1 {v4.8h, v5.8h}, [x2], #32
ld1 {v6.8h, v7.8h}, [x3], #32
subs w5, w5, #2
sub v16.8h, v6.8h, v4.8h
sub v17.8h, v7.8h, v5.8h
sabd v18.8h, v4.8h, v6.8h
sabd v19.8h, v5.8h, v7.8h
uqsub v18.8h, v0.8h, v18.8h
uqsub v19.8h, v0.8h, v19.8h
ushr v18.8h, v18.8h, #8
ushr v19.8h, v19.8h, #8
shl v20.8h, v18.8h, #9
shl v21.8h, v19.8h, #9
sqdmulh v20.8h, v20.8h, v16.8h
sqdmulh v21.8h, v21.8h, v17.8h
add v20.8h, v20.8h, v4.8h
add v21.8h, v21.8h, v5.8h
sqrshrun v22.8b, v20.8h, #4
sqrshrun v23.8b, v21.8h, #4
.if \type == 444
// 16 mask bytes for the two rows.
xtn v18.8b, v18.8h
xtn2 v18.16b, v19.8h
sub v18.16b, v1.16b, v18.16b
st1 {v18.16b}, [x6], #16
.elseif \type == 422
// 4 mask bytes per row, 8 bytes for the two rows.
addp v18.8h, v18.8h, v19.8h
xtn v18.8b, v18.8h
uhsub v18.8b, v3.8b, v18.8b
st1 {v18.8b}, [x6], #8
.elseif \type == 420
// Vertical sum of the two rows, then horizontal pair sums: four 2x2 sums,
// 4 mask bytes for the row pair.
add v18.8h, v18.8h, v19.8h
addp v18.8h, v18.8h, v18.8h
sub v18.4h, v3.4h, v18.4h
rshrn v18.8b, v18.8h, #2
st1 {v18.s}[0], [x6], #4
.endif
st1 {v22.8b}, [x0], x1
st1 {v23.8b}, [x12], x1
b.gt 8b
ret
// ---- w >= 16: shared path, two rows per outer iteration, 16 pixels of
// each row per inner iteration ----
1280:
640:
320:
160:
// x11 = w zero-extended (writing w11 clears the upper half); used for the
// w / 2 mask row length in the 422 variant.
mov w11, w4
// x1 = 2 * stride - w: after w bytes have been written to a row, adding x1
// moves the pointer to the start of the row two lines further down.
sub x1, x1, w4, uxtw
.if \type == 444
// x10 = mask pointer for the second row (w bytes per mask row)
add x10, x6, w4, uxtw
.elseif \type == 422
// x10 = mask pointer for the second row (w / 2 bytes per mask row)
add x10, x6, x11, lsr #1
.endif
// x9 / x7 = second row of tmp2 / tmp1 (2 * w bytes per input row)
add x9, x3, w4, uxtw #1
add x7, x2, w4, uxtw #1
161:
// w8 = pixels left in the current row pair
mov w8, w4
16:
// v4-v7: first row (tmp1, tmp2); v16-v19: second row (tmp1, tmp2)
ld1 {v4.8h, v5.8h}, [x2], #32
ld1 {v6.8h, v7.8h}, [x3], #32
ld1 {v16.8h, v17.8h}, [x7], #32
ld1 {v18.8h, v19.8h}, [x9], #32
// Flags set here are consumed by b.gt 16b below.
subs w8, w8, #16
// tmp2 - tmp1, computed in place over the tmp2 registers
sub v6.8h, v6.8h, v4.8h
sub v7.8h, v7.8h, v5.8h
sub v18.8h, v18.8h, v16.8h
sub v19.8h, v19.8h, v17.8h
// Absolute difference taken from the signed difference (the narrow loops
// use sabd on the inputs instead).
abs v20.8h, v6.8h
abs v21.8h, v7.8h
abs v22.8h, v18.8h
abs v23.8h, v19.8h
// n = sat(6903 - abs diff) >> 8
uqsub v20.8h, v0.8h, v20.8h
uqsub v21.8h, v0.8h, v21.8h
uqsub v22.8h, v0.8h, v22.8h
uqsub v23.8h, v0.8h, v23.8h
ushr v20.8h, v20.8h, #8
ushr v21.8h, v21.8h, #8
ushr v22.8h, v22.8h, #8
ushr v23.8h, v23.8h, #8
// (n << 9) through sqdmulh gives (n * (tmp2 - tmp1)) >> 6
shl v24.8h, v20.8h, #9
shl v25.8h, v21.8h, #9
shl v26.8h, v22.8h, #9
shl v27.8h, v23.8h, #9
sqdmulh v24.8h, v24.8h, v6.8h
sqdmulh v25.8h, v25.8h, v7.8h
sqdmulh v26.8h, v26.8h, v18.8h
sqdmulh v27.8h, v27.8h, v19.8h
// tmp1 + ((n * (tmp2 - tmp1)) >> 6)
add v24.8h, v24.8h, v4.8h
add v25.8h, v25.8h, v5.8h
add v26.8h, v26.8h, v16.8h
add v27.8h, v27.8h, v17.8h
// Rounding shift right by 4, saturate to unsigned 8 bit.
// v24/v25 = 16 pixels of the first row, v26/v27 = 16 pixels of the second.
sqrshrun v24.8b, v24.8h, #4
sqrshrun v25.8b, v25.8h, #4
sqrshrun v26.8b, v26.8h, #4
sqrshrun v27.8b, v27.8h, #4
.if \type == 444
// m = 64 - n, 16 mask bytes per row, written through x6 and x10.
xtn v20.8b, v20.8h
xtn2 v20.16b, v21.8h
xtn v21.8b, v22.8h
xtn2 v21.16b, v23.8h
sub v20.16b, v1.16b, v20.16b
sub v21.16b, v1.16b, v21.16b
st1 {v20.16b}, [x6], #16
st1 {v21.16b}, [x10], #16
.elseif \type == 422
// Horizontal pair sums per row, 8 mask bytes per row.
addp v20.8h, v20.8h, v21.8h
addp v21.8h, v22.8h, v23.8h
xtn v20.8b, v20.8h
xtn v21.8b, v21.8h
uhsub v20.8b, v3.8b, v20.8b
uhsub v21.8b, v3.8b, v21.8b
st1 {v20.8b}, [x6], #8
st1 {v21.8b}, [x10], #8
.elseif \type == 420
// Vertical sums of the two rows, then horizontal pair sums: eight 2x2 sums,
// 8 mask bytes for the row pair. Only x6 is used as mask pointer here.
add v20.8h, v20.8h, v22.8h
add v21.8h, v21.8h, v23.8h
addp v20.8h, v20.8h, v21.8h
sub v20.8h, v3.8h, v20.8h
rshrn v20.8b, v20.8h, #2
st1 {v20.8b}, [x6], #8
.endif
st1 {v24.8b, v25.8b}, [x0], #16
st1 {v26.8b, v27.8b}, [x12], #16
b.gt 16b
// Row pair done. The flags from this subs drive b.gt 161b; the plain add
// instructions in between do not write the condition flags.
subs w5, w5, #2
// Each input pointer skips the row that its partner pointer just consumed.
add x2, x2, w4, uxtw #1
add x3, x3, w4, uxtw #1
add x7, x7, w4, uxtw #1
add x9, x9, w4, uxtw #1
.if \type == 444
// Likewise for the two mask pointers (w bytes per mask row).
add x6, x6, w4, uxtw
add x10, x10, w4, uxtw
.elseif \type == 422
// Likewise for the two mask pointers (w / 2 bytes per mask row).
add x6, x6, x11, lsr #1
add x10, x10, x11, lsr #1
.endif
// x1 is 2 * stride - w here, so both dst pointers move down two rows.
add x0, x0, x1
add x12, x12, x1
b.gt 161b
ret
// Jump table: 16-bit distances from the table back to each entry label,
// ordered by clz(w) - 24 (w = 128, 64, 32, 16, 8, 4).
L(w_mask_\type\()_tbl):
.hword L(w_mask_\type\()_tbl) - 1280b
.hword L(w_mask_\type\()_tbl) - 640b
.hword L(w_mask_\type\()_tbl) - 320b
.hword L(w_mask_\type\()_tbl) - 160b
.hword L(w_mask_\type\()_tbl) - 8b
.hword L(w_mask_\type\()_tbl) - 4b
endfunc
.endm
// Expand the macro once per mask layout; the expansions define
// w_mask_444_8bpc_neon, w_mask_422_8bpc_neon and w_mask_420_8bpc_neon.
w_mask_fn 444
w_mask_fn 422
w_mask_fn 420
function blend_8bpc_neon, export=1
adr x6, L(blend_tbl)
clz w3, w3
......
......@@ -104,13 +104,12 @@ void bitfn(dav1d_mc_dsp_init_arm)(Dav1dMCDSPContext *const c) {
c->blend = dav1d_blend_8bpc_neon;
c->blend_h = dav1d_blend_h_8bpc_neon;
c->blend_v = dav1d_blend_v_8bpc_neon;
#if ARCH_AARCH64
c->warp8x8 = dav1d_warp_affine_8x8_8bpc_neon;
c->warp8x8t = dav1d_warp_affine_8x8t_8bpc_neon;
#elif ARCH_ARM
c->w_mask[0] = dav1d_w_mask_444_8bpc_neon;
c->w_mask[1] = dav1d_w_mask_422_8bpc_neon;
c->w_mask[2] = dav1d_w_mask_420_8bpc_neon;
#if ARCH_AARCH64
c->warp8x8 = dav1d_warp_affine_8x8_8bpc_neon;
c->warp8x8t = dav1d_warp_affine_8x8t_8bpc_neon;
#endif
#endif
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment