Commit b0f88e34 authored by B Krishnan Iyer

blend_h for w=32 with a single row doesn't help much

                                A73               A53
                          Current  Earlier  Current  Earlier
blend_h_w32_8bpc_neon:      169.3    144.3    298.2    262.2
blend_h_w64_8bpc_neon:      262.8    247.1    521.6    469.3
blend_h_w128_8bpc_neon:     571.6    544.1   1167.7   1055.1
parent 2125d9f1
Pipeline #8916 passed with stages
in 6 minutes and 7 seconds
......@@ -418,51 +418,32 @@ function blend_h_8bpc_neon, export=1
1280:
640:
320:
lsr x1, x1, #1
sub x1, x1, w3, uxtw
add x7, x2, w3, uxtw
321:
ld2r {v0.16b, v1.16b}, [x5], #2
ld1r {v0.16b}, [x5], #1
mov w6, w3
sub v20.16b, v4.16b, v0.16b
sub v21.16b, v4.16b, v1.16b
32:
ld1 {v16.16b, v17.16b}, [x2], #32
ld1 {v2.16b, v3.16b}, [x0]
subs w6, w6, #32
umull v23.8h, v0.8b, v16.8b
umlal v23.8h, v2.8b, v20.8b
ld1 {v18.16b, v19.16b}, [x7], #32
umull2 v27.8h, v0.16b, v16.16b
umlal2 v27.8h, v2.16b, v20.16b
ld1 {v6.16b, v7.16b}, [x8]
umull v24.8h, v0.8b, v17.8b
umlal v24.8h, v3.8b, v20.8b
umull2 v28.8h, v0.16b, v17.16b
umlal2 v28.8h, v3.16b, v20.16b
umull v25.8h, v1.8b, v18.8b
umlal v25.8h, v6.8b, v21.8b
umull2 v5.8h, v1.16b, v18.16b
umlal2 v5.8h, v6.16b, v21.16b
rshrn v29.8b, v23.8h, #6
rshrn2 v29.16b, v27.8h, #6
umull v26.8h, v1.8b, v19.8b
umlal v26.8h, v7.8b, v21.8b
umull2 v31.8h, v1.16b, v19.16b
umlal2 v31.8h, v7.16b, v21.16b
rshrn v30.8b, v24.8h, #6
rshrn2 v30.16b, v28.8h, #6
rshrn v23.8b, v25.8h, #6
rshrn2 v23.16b, v5.8h, #6
rshrn v24.8b, v26.8h, #6
st1 {v29.16b, v30.16b}, [x0], #32
rshrn2 v24.16b, v31.8h, #6
st1 {v23.16b, v24.16b}, [x8], #32
b.gt 32b
subs w4, w4, #2
subs w4, w4, #1
add x0, x0, x1
add x8, x8, x1
add x2, x2, w3, uxtw
add x7, x7, w3, uxtw
b.gt 321b
ret
L(blend_h_tbl):
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment