Commit a1e3f358 authored by B Krishnan Iyer, committed by B Krishnan Iyer

arm:mc: NEON implementation of the blend, blend_h and blend_v functions

	                A73	A53

blend_h_w2_8bpc_c:	149.3	246.8
blend_h_w2_8bpc_neon:	74.6	137
blend_h_w4_8bpc_c:	251.6	409.8
blend_h_w4_8bpc_neon:	66	146.6
blend_h_w8_8bpc_c:	446.6	844.1
blend_h_w8_8bpc_neon:	68.6	131.2
blend_h_w16_8bpc_c:	830	1513
blend_h_w16_8bpc_neon:	85.9	192
blend_h_w32_8bpc_c:	1605.2	2847.8
blend_h_w32_8bpc_neon:	149.8	357.6
blend_h_w64_8bpc_c:	3304.8	5515.5
blend_h_w64_8bpc_neon:	262.8	629.5
blend_h_w128_8bpc_c:	7895.1	13260.6
blend_h_w128_8bpc_neon:	577	1402
blend_v_w2_8bpc_c:	241.2	410.8
blend_v_w2_8bpc_neon:	122.1	196.8
blend_v_w4_8bpc_c:	874.4	1418.2
blend_v_w4_8bpc_neon:	248.5	375.9
blend_v_w8_8bpc_c:	1550.5	2514.7
blend_v_w8_8bpc_neon:	210.8	376
blend_v_w16_8bpc_c:	2925.3	5086
blend_v_w16_8bpc_neon:	253.4	608.3
blend_v_w32_8bpc_c:	5686.7	9470.5
blend_v_w32_8bpc_neon:	348.2	994.8
blend_w4_8bpc_c:	201.5	309.3
blend_w4_8bpc_neon:	38.6	99.2
blend_w8_8bpc_c:	531.3	944.8
blend_w8_8bpc_neon:	55.1	125.8
blend_w16_8bpc_c:	1992.8	3349.8
blend_w16_8bpc_neon:	150.1	344
blend_w32_8bpc_c:	4982	8165.9
blend_w32_8bpc_neon:	360.4	910.9
parent efd852af
Pipeline #7544 passed with stages
in 6 minutes and 32 seconds
......@@ -439,6 +439,421 @@ L(prep_tbl):
pop {r4-r11,pc}
endfunc
// void dav1d_blend_8bpc_neon(pixel *dst, ptrdiff_t dst_stride,
//                            const pixel *tmp, int w, int h,
//                            const uint8_t *mask)
// NOTE(review): signature inferred from the decl_blend_fn() prototype and
// the register usage below — confirm against the C headers.
// Per-pixel blend: dst = (tmp*mask + dst*(64 - mask) + 32) >> 6.
// r0 = dst, r1 = dst stride, r2 = tmp, r3 = w, r4 = h, r5 = mask.
// tmp and mask rows are packed, i.e. read with stride w (r3).
function blend_8bpc_neon, export=1
push {r4-r8,lr}
ldr r4, [sp, #24]  // r4 = h (first stack arg; 6 regs * 4 bytes pushed above)
ldr r5, [sp, #28]  // r5 = mask
clz r6, r3  // w is a power of two in {4,8,16,32}
adr r7, L(blend_tbl)
sub r6, r6, #26  // clz(32)=26 -> index 0, ..., clz(4)=29 -> index 3
ldr r6, [r7, r6, lsl #2]
add r7, r7, r6
bx r7  // dispatch to the width-specialized loop
.align 2
L(blend_tbl):
// Offsets relative to the table base; CONFIG_THUMB keeps bit 0 set in Thumb mode.
.word 320f - L(blend_tbl) + CONFIG_THUMB
.word 160f - L(blend_tbl) + CONFIG_THUMB
.word 80f - L(blend_tbl) + CONFIG_THUMB
.word 40f - L(blend_tbl) + CONFIG_THUMB
40:  // w == 4: two rows per iteration (4+4 px packed into one d register)
vmov.i8 d22, #64  // blend weight denominator
add r12, r0, r1  // r12 = dst pointer for the second row
lsl r1, r1, #1  // advance dst by two rows per iteration
4:
vld1.32 {d2[]}, [r5], r3  // mask row n (4 bytes)
vld1.32 {d1[]}, [r2], r3  // tmp row n
vld1.32 {d0[]}, [r0]  // dst row n
subs r4, r4, #2  // two rows consumed
vld1.32 {d2[1]}, [r5], r3  // mask row n+1 into the high half
vld1.32 {d1[1]}, [r2], r3  // tmp row n+1
vld1.32 {d0[1]}, [r12]  // dst row n+1
vsub.i8 d3, d22, d2  // d3 = 64 - mask
vmull.u8 q8, d1, d2  // q8 = tmp * mask (widened to u16)
vmlal.u8 q8, d0, d3  // q8 += dst * (64 - mask)
vrshrn.i16 d20, q8, #6  // rounding narrow: (x + 32) >> 6
vst1.32 {d20[0]}, [r0], r1
vst1.32 {d20[1]}, [r12], r1
bgt 4b
pop {r4-r8,pc}
80:  // w == 8: two rows per iteration
vmov.i8 d16, #64
add r12, r0, r1
lsl r1, r1, #1
8:
vld1.u8 {d2}, [r5], r3  // mask row n
vld1.u8 {d4}, [r2], r3  // tmp row n
vld1.u8 {d0}, [r0]  // dst row n
vsub.i8 d17, d16, d2  // 64 - mask (row n)
vld1.u8 {d3}, [r5], r3  // mask row n+1
vld1.u8 {d5}, [r2], r3  // tmp row n+1
vld1.u8 {d1}, [r12]  // dst row n+1
subs r4, r4, #2
vsub.i8 d18, d16, d3  // 64 - mask (row n+1)
vmull.u8 q3, d2, d4
vmlal.u8 q3, d0, d17
vmull.u8 q10, d3, d5
vmlal.u8 q10, d1, d18
vrshrn.i16 d22, q3, #6
vrshrn.i16 d23, q10, #6
vst1.u8 {d22}, [r0], r1
vst1.u8 {d23}, [r12], r1
bgt 8b
pop {r4-r8,pc}
160:  // w == 16: two rows per iteration, 16 px per row
vmov.i8 q12, #64
add r12, r0, r1
lsl r1, r1, #1
16:
vld1.u8 {q2}, [r5], r3  // mask row n
vld1.u8 {q1}, [r2], r3  // tmp row n
vld1.u8 {q0}, [r0]  // dst row n
subs r4, r4, #2
vsub.i8 q11, q12, q2  // 64 - mask (row n)
vld1.u8 {q15}, [r5], r3  // mask row n+1
vld1.u8 {q14}, [r2], r3  // tmp row n+1
vld1.u8 {q13}, [r12]  // dst row n+1
vmull.u8 q3, d2, d4  // row n, low 8 px
vmlal.u8 q3, d0, d22
vmull.u8 q8, d3, d5  // row n, high 8 px
vmlal.u8 q8, d1, d23
vsub.i8 q11, q12, q15  // q11 reused: 64 - mask (row n+1)
vrshrn.i16 d18, q3, #6
vrshrn.i16 d19, q8, #6
vmull.u8 q3, d28, d30  // row n+1, low 8 px
vmlal.u8 q3, d26, d22
vmull.u8 q8, d29, d31  // row n+1, high 8 px
vmlal.u8 q8, d27, d23
vrshrn.i16 d20, q3, #6
vrshrn.i16 d21, q8, #6
vst1.u8 {q9}, [r0], r1
vst1.u8 {q10}, [r12], r1
bgt 16b
pop {r4-r8,pc}
320:  // w == 32: one row per iteration, 32 px
vmov.i8 q10, #64
32:
vld1.u8 {q2, q3}, [r5], r3  // mask (32 bytes)
vld1.u8 {q8, q9}, [r2], r3  // tmp
vld1.u8 {q0, q1}, [r0]  // dst
subs r4, r4, #1
vsub.i8 q11, q10, q2  // 64 - mask, px 0..15
vmull.u8 q15, d16, d4
vmlal.u8 q15, d0, d22
vmull.u8 q14, d17, d5
vmlal.u8 q14, d1, d23
vsub.i8 q11, q10, q3  // q11 reused: 64 - mask, px 16..31
vrshrn.i16 d24, q15, #6
vrshrn.i16 d25, q14, #6
vmull.u8 q15, d18, d6
vmlal.u8 q15, d2, d22
vmull.u8 q14, d19, d7
vmlal.u8 q14, d3, d23
vrshrn.i16 d26, q15, #6
vrshrn.i16 d27, q14, #6
vst1.u8 {q12, q13}, [r0], r1
bgt 32b
pop {r4-r8,pc}
endfunc
// void dav1d_blend_h_8bpc_neon(pixel *dst, ptrdiff_t dst_stride,
//                              const pixel *tmp, int w, int h)
// NOTE(review): signature inferred from decl_blend_dir_fn() and register
// usage — confirm against the C headers.
// Horizontal-edge OBMC blend: one obmc_masks[] coefficient per ROW,
// shared by every pixel of that row.  Only the top h - h/4 rows are
// blended; the bottom quarter of the block is left untouched.
// dst = (tmp*m + dst*(64 - m) + 32) >> 6, with m = obmc_masks[h + row].
function blend_h_8bpc_neon, export=1
push {r4-r8,lr}
ldr r4, [sp, #24]  // r4 = h
movrel r5, X(obmc_masks)
add r5, r5, r4  // r5 = &obmc_masks[h]: per-row mask stream
sub r4, r4, r4, lsr #2  // row count = h - h/4
clz r6, r3
adr r7, L(blend_h_tbl)
sub r6, r6, #24  // clz(128)=24 -> index 0, ..., clz(2)=30 -> index 6
ldr r6, [r7, r6, lsl #2]
add r7, r7, r6
bx r7  // dispatch to the width-specialized loop
.align 2
L(blend_h_tbl):
.word 1280f - L(blend_h_tbl) + CONFIG_THUMB
.word 640f - L(blend_h_tbl) + CONFIG_THUMB
.word 320f - L(blend_h_tbl) + CONFIG_THUMB
.word 160f - L(blend_h_tbl) + CONFIG_THUMB
.word 80f - L(blend_h_tbl) + CONFIG_THUMB
.word 40f - L(blend_h_tbl) + CONFIG_THUMB
.word 20f - L(blend_h_tbl) + CONFIG_THUMB
20:  // w == 2: two rows per iteration
vmov.i8 d22, #64
add r12, r0, r1  // r12 = dst pointer for the second row
lsl r1, r1, #1  // advance by two rows per iteration
2:
vld1.16 {d2[], d3[]}, [r5]!  // two mask bytes: rows n and n+1
vld1.16 {d1[]}, [r2], r3  // tmp row n (2 px)
subs r4, r4, #2
vld1.16 {d0[]}, [r0]  // dst row n
vzip.8 d2, d3  // d2 = {m0,m0,m1,m1,...}: per-pixel masks for both rows
vld1.16 {d1[1]}, [r2], r3  // tmp row n+1
vsub.i8 d4, d22, d2  // 64 - mask
vld1.16 {d0[1]}, [r12]  // dst row n+1
vmull.u8 q8, d1, d2
vmlal.u8 q8, d0, d4
vrshrn.i16 d20, q8, #6  // (x + 32) >> 6
vst1.16 {d20[0]}, [r0], r1
vst1.16 {d20[1]}, [r12], r1
bgt 2b
pop {r4-r8,pc}
40:  // w == 4: two rows per iteration
vmov.i8 d22, #64
add r12, r0, r1
lsl r1, r1, #1
4:
vld1.u8 {d2[]}, [r5]!  // broadcast mask for row n
vld1.32 {d1[]}, [r2], r3  // tmp row n
subs r4, r4, #2
vld1.u8 {d6[]}, [r5]!  // broadcast mask for row n+1
vld1.32 {d1[1]}, [r2], r3  // tmp row n+1
vext.u8 d2, d2, d6, #4  // d2 = {m0 x4, m1 x4}
vld1.32 {d0[]}, [r0]  // dst row n
vsub.i8 d3, d22, d2  // 64 - mask
vld1.32 {d0[1]}, [r12]  // dst row n+1
vmull.u8 q8, d1, d2
vmlal.u8 q8, d0, d3
vrshrn.i16 d20, q8, #6
vst1.32 {d20[0]}, [r0], r1
vst1.32 {d20[1]}, [r12], r1
bgt 4b
pop {r4-r8,pc}
80:  // w == 8: two rows per iteration
vmov.i8 d16, #64
add r12, r0, r1
lsl r1, r1, #1
8:
vld1.u8 {d2[]}, [r5]!  // mask row n (broadcast to all 8 lanes)
vld1.u8 {d4}, [r2], r3  // tmp row n
vld1.u8 {d0}, [r0]  // dst row n
vsub.i8 d17, d16, d2  // 64 - mask (row n)
vld1.u8 {d3[]}, [r5]!  // mask row n+1
vld1.u8 {d5}, [r2], r3  // tmp row n+1
vld1.u8 {d1}, [r12]  // dst row n+1
subs r4, r4, #2
vsub.i8 d18, d16, d3  // 64 - mask (row n+1)
vmull.u8 q3, d2, d4
vmlal.u8 q3, d0, d17
vmull.u8 q10, d3, d5
vmlal.u8 q10, d1, d18
vrshrn.i16 d22, q3, #6
vrshrn.i16 d23, q10, #6
vst1.u8 {d22}, [r0], r1
vst1.u8 {d23}, [r12], r1
bgt 8b
pop {r4-r8,pc}
160:  // w == 16: two rows per iteration
vmov.i8 d24, #64
add r12, r0, r1
lsl r1, r1, #1
16:
vld1.u8 {d4[]}, [r5]!  // mask row n (broadcast)
vld1.u8 {q1}, [r2], r3  // tmp row n
vsub.i8 d5, d24, d4  // 64 - mask (row n)
vld1.u8 {q0}, [r0]  // dst row n
subs r4, r4, #2
vld1.u8 {d30[]}, [r5]!  // mask row n+1 (broadcast)
vld1.u8 {q14}, [r2], r3  // tmp row n+1
vsub.i8 d31, d24, d30  // 64 - mask (row n+1)
vld1.u8 {q13}, [r12]  // dst row n+1
vmull.u8 q3, d2, d4  // row n, low 8 px
vmlal.u8 q3, d0, d5
vmull.u8 q8, d3, d4  // row n, high 8 px
vmlal.u8 q8, d1, d5
vrshrn.i16 d18, q3, #6
vrshrn.i16 d19, q8, #6
vmull.u8 q3, d28, d30  // row n+1, low 8 px
vmlal.u8 q3, d26, d31
vmull.u8 q8, d29, d30  // row n+1, high 8 px
vmlal.u8 q8, d27, d31
vrshrn.i16 d20, q3, #6
vrshrn.i16 d21, q8, #6
vst1.u8 {q9}, [r0], r1
vst1.u8 {q10}, [r12], r1
bgt 16b
pop {r4-r8,pc}
// w in {32, 64, 128} share one path: each row is walked in 32-px chunks.
320:
640:
1280:
vmov.i8 d20, #64
sub r1, r1, r3  // stride minus width: r0 is post-incremented across the row
321:  // per-row loop: reload this row's single mask byte
vld1.u8 {d6[]}, [r5]!
vsub.i8 d7, d20, d6  // 64 - mask
mov r8, r3  // r8 = pixels remaining in this row
32:  // per-32-px chunk
vld1.u8 {q8, q9}, [r2]!
vld1.u8 {q0, q1}, [r0]
vmull.u8 q15, d16, d6
vmlal.u8 q15, d0, d7
vmull.u8 q14, d17, d6
vmlal.u8 q14, d1, d7
vrshrn.i16 d0, q15, #6
vrshrn.i16 d1, q14, #6
vmull.u8 q15, d18, d6
vmlal.u8 q15, d2, d7
vmull.u8 q14, d19, d6
vmlal.u8 q14, d3, d7
vrshrn.i16 d2, q15, #6
vrshrn.i16 d3, q14, #6
vst1.u8 {q0, q1}, [r0]!
subs r8, r8, #32
bgt 32b
add r0, r0, r1  // step to the start of the next row
subs r4, r4, #1
bgt 321b
pop {r4-r8,pc}
endfunc
// void dav1d_blend_v_8bpc_neon(pixel *dst, ptrdiff_t dst_stride,
//                              const pixel *tmp, int w, int h)
// NOTE(review): signature inferred from decl_blend_dir_fn() and register
// usage — confirm against the C headers.
// Vertical-edge OBMC blend: one obmc_masks[] coefficient per COLUMN,
// constant down the block, so the mask is loaded once before the loop.
// Only the leftmost pixels of each row are written (1 of 2, 3 of 4,
// 6 of 8, 12 of 16, 24 of 32); tmp rows are still read with stride w.
function blend_v_8bpc_neon, export=1
push {r4-r8,lr}
ldr r4, [sp, #24]  // r4 = h
movrel r5, X(obmc_masks)
add r5, r5, r3  // r5 = &obmc_masks[w]: per-column masks
clz r8, r3
adr r7, L(blend_v_tbl)
sub r8, r8, #26  // clz(32)=26 -> index 0, ..., clz(2)=30 -> index 4
ldr r8, [r7, r8, lsl #2]
add r7, r7, r8
bx r7  // dispatch to the width-specialized loop
.align 2
L(blend_v_tbl):
.word 320f - L(blend_v_tbl) + CONFIG_THUMB
.word 160f - L(blend_v_tbl) + CONFIG_THUMB
.word 80f - L(blend_v_tbl) + CONFIG_THUMB
.word 40f - L(blend_v_tbl) + CONFIG_THUMB
.word 20f - L(blend_v_tbl) + CONFIG_THUMB
20:  // w == 2: only column 0 is blended; 1 byte stored per row
vmov.i8 d22, #64
vld1.8 {d2[]}, [r5]  // column mask, reused for every row
add r12, r0, r1  // r12 = dst pointer for the second row
lsl r1, r1, #1
vsub.i8 d3, d22, d2  // 64 - mask
2:
vld1.8 {d1[]}, [r2], r3  // tmp row n
vld1.8 {d0[]}, [r0]  // dst row n
subs r4, r4, #2
vld1.8 {d1[1]}, [r2], r3  // tmp row n+1
vld1.8 {d0[1]}, [r12]  // dst row n+1
vmull.u8 q2, d1, d2
vmlal.u8 q2, d0, d3
vrshrn.i16 d6, q2, #6  // (x + 32) >> 6
vst1.8 {d6[0]}, [r0], r1
vst1.8 {d6[1]}, [r12], r1
bgt 2b
pop {r4-r8,pc}
40:  // w == 4: 3 of 4 bytes stored per row
vmov.i8 d22, #64
vld1.32 {d4[]}, [r5]  // 4 column masks
add r12, r0, r1
lsl r1, r1, #1
vsub.i8 d5, d22, d4
4:
vld1.32 {d2[]}, [r2], r3  // tmp rows n / n+1
vld1.32 {d0[]}, [r0]
vld1.32 {d2[1]}, [r2], r3
vld1.32 {d0[1]}, [r12]
subs r4, r4, #2
vmull.u8 q3, d2, d4
vmlal.u8 q3, d0, d5
vrshrn.i16 d20, q3, #6
// 3-byte store per row: one 16-bit lane then one 8-bit lane,
// then rewind the post-increments and step a full (doubled) stride.
vst1.16 {d20[0]}, [r0]!
vst1.16 {d20[2]}, [r12]!
vst1.8 {d20[2]}, [r0]!
vst1.8 {d20[6]}, [r12]!
sub r0, r0, #3
sub r12, r12, #3
add r0, r0, r1
add r12, r12, r1
bgt 4b
pop {r4-r8,pc}
80:  // w == 8: 6 of 8 bytes stored per row
vmov.i8 d16, #64
vld1.u8 {d2}, [r5]  // 8 column masks
add r12, r0, r1
lsl r1, r1, #1
vsub.i8 d17, d16, d2
8:
vld1.u8 {d4}, [r2], r3  // tmp row n
vld1.u8 {d0}, [r0]  // dst row n
vld1.u8 {d5}, [r2], r3  // tmp row n+1
vld1.u8 {d1}, [r12]  // dst row n+1
subs r4, r4, #2
vmull.u8 q3, d2, d4
vmlal.u8 q3, d0, d17
vmull.u8 q10, d2, d5
vmlal.u8 q10, d1, d17
vrshrn.i16 d22, q3, #6
vrshrn.i16 d23, q10, #6
// 6-byte store per row: 32-bit lane + 16-bit lane, then rewind.
vst1.32 {d22[0]}, [r0]!
vst1.32 {d23[0]}, [r12]!
vst1.16 {d22[2]}, [r0]!
vst1.16 {d23[2]}, [r12]!
sub r0, r0, #6
sub r12, r12, #6
add r0, r0, r1
add r12, r12, r1
bgt 8b
pop {r4-r8,pc}
160:  // w == 16: 12 of 16 bytes stored per row
vmov.i8 q12, #64
vld1.u8 {q2}, [r5]  // 16 column masks
add r12, r0, r1
lsl r1, r1, #1
vsub.i8 q11, q12, q2
16:
vld1.u8 {q1}, [r2], r3  // tmp row n
vld1.u8 {q0}, [r0]  // dst row n
subs r4, r4, #2
vld1.u8 {q14}, [r2], r3  // tmp row n+1
vld1.u8 {q13}, [r12]  // dst row n+1
vmull.u8 q3, d2, d4  // row n, low 8 px
vmlal.u8 q3, d0, d22
vmull.u8 q8, d3, d5  // row n, high 8 px
vmlal.u8 q8, d1, d23
vrshrn.i16 d18, q3, #6
vrshrn.i16 d19, q8, #6
vmull.u8 q3, d28, d4  // row n+1, low 8 px
vmlal.u8 q3, d26, d22
vmull.u8 q8, d29, d5  // row n+1, high 8 px
vmlal.u8 q8, d27, d23
vrshrn.i16 d20, q3, #6
vrshrn.i16 d21, q8, #6
// 12-byte store per row: full d register + one 32-bit lane, then rewind.
vst1.u8 {d18}, [r0]!
vst1.u8 {d20}, [r12]!
vst1.32 {d19[0]}, [r0]!
vst1.32 {d21[0]}, [r12]!
sub r0, r0, #12
sub r12, r12, #12
add r0, r0, r1
add r12, r12, r1
bgt 16b
pop {r4-r8,pc}
320:  // w == 32: one row per iteration, 24 of 32 bytes stored
vmov.i8 q10, #64
vld1.u8 {q2, q3}, [r5]  // 32 column masks
vsub.i8 q11, q10, q2  // 64 - mask, px 0..15
vsub.i8 q12, q10, q3  // 64 - mask, px 16..31
32:
vld1.u8 {q8, q9}, [r2], r3  // tmp (full 32-byte row)
vld1.u8 {q0, q1}, [r0]  // dst
subs r4, r4, #1
vmull.u8 q15, d16, d4
vmlal.u8 q15, d0, d22
vmull.u8 q14, d17, d5
vmlal.u8 q14, d1, d23
vrshrn.i16 d0, q15, #6
vrshrn.i16 d1, q14, #6
vmull.u8 q15, d18, d6  // px 16..23 only; px 24..31 are never written
vmlal.u8 q15, d2, d24
vrshrn.i16 d2, q15, #6
vst1.u8 {d0, d1, d2}, [r0], r1  // 24 output bytes
bgt 32b
pop {r4-r8,pc}
endfunc
.macro load_slice s0, s1, strd, wd, d0, d1, d2, d3, d4, d5, d6
vld1.\wd {\d0[]}, [\s0], \strd
......
......@@ -55,6 +55,9 @@ decl_mct_fn(dav1d_prep_bilin_8bpc_neon);
/* Prototypes for the ARM NEON (8 bpc) mc assembly implementations. */
decl_avg_fn(dav1d_avg_8bpc_neon);
decl_w_avg_fn(dav1d_w_avg_8bpc_neon);
decl_mask_fn(dav1d_mask_8bpc_neon);
decl_blend_fn(dav1d_blend_8bpc_neon);
decl_blend_dir_fn(dav1d_blend_h_8bpc_neon); /* horizontal-edge OBMC blend */
decl_blend_dir_fn(dav1d_blend_v_8bpc_neon); /* vertical-edge OBMC blend */
decl_warp8x8_fn(dav1d_warp_affine_8x8_8bpc_neon);
decl_warp8x8t_fn(dav1d_warp_affine_8x8t_8bpc_neon);
......@@ -97,6 +100,10 @@ void bitfn(dav1d_mc_dsp_init_arm)(Dav1dMCDSPContext *const c) {
#if ARCH_AARCH64
c->warp8x8 = dav1d_warp_affine_8x8_8bpc_neon;
c->warp8x8t = dav1d_warp_affine_8x8t_8bpc_neon;
#elif ARCH_ARM
c->blend = dav1d_blend_8bpc_neon;
c->blend_h = dav1d_blend_h_8bpc_neon;
c->blend_v = dav1d_blend_v_8bpc_neon;
#endif
#endif
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment