Commit 52e9b435 authored by Martin Storsjö

arm: mc: Optimize blend_v

Use a register post-increment on the last store of each row, avoiding
a separate pointer increment. Avoid processing the last 8 pixels in the
w32 case, where we only output 24 pixels.
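
The first optimization replaces the trailing immediate post-increment store
plus a separate pointer add with a single store using register post-increment,
folding the stride correction into the register. A minimal sketch of the
AArch64 pattern, taken from the w4 case below:

        // before: store, then advance the pointer separately
        st1 {v5.b}[2], [x0], #1
        add x0, x0, x1
        // after: the store's register writeback advances the pointer,
        // with x1 pre-adjusted to compensate
        st1 {v5.b}[2], [x0], x1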

Before:
ARM32                Cortex A7      A8      A9     A53     A72     A73
blend_v_w4_8bpc_neon:    450.4   574.7   538.7   374.6   199.3   260.5
blend_v_w8_8bpc_neon:    559.6   351.3   552.5   357.6   214.8   204.3
blend_v_w16_8bpc_neon:   926.3   511.6   787.9   593.0   271.0   246.8
blend_v_w32_8bpc_neon:  1482.5   917.0  1149.5   991.9   354.0   368.9
ARM64                                           A53     A72     A73
blend_v_w4_8bpc_neon:                            351.1   200.0   224.1
blend_v_w8_8bpc_neon:                            333.0   212.4   203.8
blend_v_w16_8bpc_neon:                           495.2   302.0   247.0
blend_v_w32_8bpc_neon:                           840.0   557.8   514.0

After:
ARM32                Cortex A7      A8      A9     A53     A72     A73
blend_v_w4_8bpc_neon:    435.5   575.8   537.6   356.2   198.3   259.5
blend_v_w8_8bpc_neon:    545.2   347.9   553.5   339.1   207.8   204.2
blend_v_w16_8bpc_neon:   913.7   511.0   788.1   573.7   275.4   243.3
blend_v_w32_8bpc_neon:  1445.3   951.2  1079.1   920.4   352.2   361.6
ARM64                                           A53     A72     A73
blend_v_w4_8bpc_neon:                            333.0   191.3   225.9
blend_v_w8_8bpc_neon:                            314.9   199.3   203.5
blend_v_w16_8bpc_neon:                           476.9   301.3   241.1
blend_v_w32_8bpc_neon:                           766.9   432.8   416.9
parent a7f6fe32
@@ -753,7 +753,7 @@ L(blend_v_tbl):
         add r12, r0, r1
         lsl r1, r1, #1
         vsub.i8 d5, d22, d4
-        sub r1, r1, #3
+        sub r1, r1, #2
 4:
         vld1.u8 {d2}, [r2, :64]!
         vld1.32 {d0[]}, [r0, :32]
@@ -764,10 +764,8 @@ L(blend_v_tbl):
         vrshrn.i16 d20, q3, #6
         vst1.16 {d20[0]}, [r0, :16]!
         vst1.16 {d20[2]}, [r12, :16]!
-        vst1.8 {d20[2]}, [r0]!
-        vst1.8 {d20[6]}, [r12]!
-        add r0, r0, r1
-        add r12, r12, r1
+        vst1.8 {d20[2]}, [r0], r1
+        vst1.8 {d20[6]}, [r12], r1
         bgt 4b
         pop {r4-r5,pc}
 80:
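
Since the final store now performs the row advance itself, the stride
correction in r1 only has to cover the bytes written by the earlier
post-increments. In the w4 case each of the two row pointers (r0, r12) must
advance by 2*stride per iteration; the vst1.16 still bumps the pointer by
2 bytes, so r1 becomes 2*stride - 2 rather than 2*stride - 3:

        2 (vst1.16!) + (2*stride - 2) = 2*stride

The w8 and w16 cases below follow the same scheme, shrinking the correction
from 6 to 4 and from 12 to 8 bytes respectively.
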
@@ -776,7 +774,7 @@ L(blend_v_tbl):
         add r12, r0, r1
         lsl r1, r1, #1
         vsub.i8 d17, d16, d2
-        sub r1, r1, #6
+        sub r1, r1, #4
 8:
         vld1.u8 {d4, d5}, [r2, :128]!
         vld1.u8 {d0}, [r0, :64]
@@ -790,10 +788,8 @@ L(blend_v_tbl):
         vrshrn.i16 d23, q10, #6
         vst1.32 {d22[0]}, [r0, :32]!
         vst1.32 {d23[0]}, [r12, :32]!
-        vst1.16 {d22[2]}, [r0, :16]!
-        vst1.16 {d23[2]}, [r12, :16]!
-        add r0, r0, r1
-        add r12, r12, r1
+        vst1.16 {d22[2]}, [r0, :16], r1
+        vst1.16 {d23[2]}, [r12, :16], r1
         bgt 8b
         pop {r4-r5,pc}
 160:
@@ -802,7 +798,7 @@ L(blend_v_tbl):
         add r12, r0, r1
         lsl r1, r1, #1
         vsub.i8 q11, q12, q14
-        sub r1, r1, #12
+        sub r1, r1, #8
 16:
         vld1.u8 {q1, q2}, [r2, :128]!
         vld1.u8 {q0}, [r0, :128]
@@ -822,20 +818,18 @@ L(blend_v_tbl):
         vrshrn.i16 d21, q8, #6
         vst1.u8 {d18}, [r0, :64]!
         vst1.u8 {d20}, [r12, :64]!
-        vst1.32 {d19[0]}, [r0, :32]!
-        vst1.32 {d21[0]}, [r12, :32]!
-        add r0, r0, r1
-        add r12, r12, r1
+        vst1.32 {d19[0]}, [r0, :32], r1
+        vst1.32 {d21[0]}, [r12, :32], r1
         bgt 16b
         pop {r4-r5,pc}
 320:
         vmov.i8 q10, #64
         vld1.u8 {q2, q3}, [r5, :128]
         vsub.i8 q11, q10, q2
-        vsub.i8 q12, q10, q3
+        vsub.i8 d24, d20, d6
 32:
         vld1.u8 {q8, q9}, [r2, :128]!
-        vld1.u8 {q0, q1}, [r0, :128]
+        vld1.u8 {d0, d1, d2}, [r0, :64]
         subs r4, r4, #1
         vmull.u8 q15, d16, d4
         vmlal.u8 q15, d0, d22
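
The second optimization shows up in the 320 case: blend_v with w=32 only
writes 24 pixels per row, so the loop now loads just those 24 dst bytes and
builds only the d-register inverse mask needed for the third 8-pixel group:

        vld1.u8 {q0, q1}, [r0, :128]    @ before: 32 bytes of dst
        vld1.u8 {d0, d1, d2}, [r0, :64] @ after: only the 24 bytes that are blended

The AArch64 version below receives the equivalent treatment.
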
@@ -709,8 +709,8 @@ function blend_v_8bpc_neon, export=1
         ret
 40:
         ld1r {v0.2s}, [x5]
+        sub x1, x1, #2
         sub v1.8b, v4.8b, v0.8b
-        sub x1, x1, #3
 4:
         ld1 {v2.8b}, [x2], #8
         ld1 {v3.s}[0], [x0]
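
The AArch64 version gets the same rewrite. Note that the adjusted stride
correction (sub x1, x1, #2) also moves up, ahead of the vector subtract,
presumably to interleave the scalar bookkeeping with the NEON setup.
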
@@ -721,16 +721,14 @@ function blend_v_8bpc_neon, export=1
         rshrn v5.8b, v5.8h, #6
         st1 {v5.h}[0], [x0], #2
         st1 {v5.h}[2], [x8], #2
-        st1 {v5.b}[2], [x0], #1
-        st1 {v5.b}[6], [x8], #1
-        add x0, x0, x1
-        add x8, x8, x1
+        st1 {v5.b}[2], [x0], x1
+        st1 {v5.b}[6], [x8], x1
         b.gt 4b
         ret
 80:
         ld1r {v0.2d}, [x5]
+        sub x1, x1, #4
         sub v1.16b, v4.16b, v0.16b
-        sub x1, x1, #6
 8:
         ld1 {v2.16b}, [x2], #16
         ld1 {v3.d}[0], [x0]
@@ -744,16 +742,14 @@ function blend_v_8bpc_neon, export=1
         rshrn2 v7.16b, v6.8h, #6
         st1 {v7.s}[0], [x0], #4
         st1 {v7.s}[2], [x8], #4
-        st1 {v7.h}[2], [x0], #2
-        st1 {v7.h}[6], [x8], #2
-        add x0, x0, x1
-        add x8, x8, x1
+        st1 {v7.h}[2], [x0], x1
+        st1 {v7.h}[6], [x8], x1
         b.gt 8b
         ret
 160:
         ld1 {v0.16b}, [x5]
+        sub x1, x1, #8
         sub v2.16b, v4.16b, v0.16b
-        sub x1, x1, #12
 16:
         ld1 {v5.16b, v6.16b}, [x2], #32
         ld1 {v7.16b}, [x0]
@@ -773,17 +769,15 @@ function blend_v_8bpc_neon, export=1
         rshrn2 v22.16b, v21.8h, #6
         st1 {v19.8b}, [x0], #8
         st1 {v22.8b}, [x8], #8
-        st1 {v19.s}[2], [x0], #4
-        st1 {v22.s}[2], [x8], #4
-        add x0, x0, x1
-        add x8, x8, x1
+        st1 {v19.s}[2], [x0], x1
+        st1 {v22.s}[2], [x8], x1
         b.gt 16b
         ret
 320:
         ld1 {v0.16b, v1.16b}, [x5]
+        sub x1, x1, #16
         sub v2.16b, v4.16b, v0.16b
-        sub v3.16b, v4.16b, v1.16b
-        sub x1, x1, #24
+        sub v3.8b, v4.8b, v1.8b
 32:
         ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x2], #64
         ld1 {v5.16b, v6.16b}, [x0]
@@ -795,30 +789,22 @@ function blend_v_8bpc_neon, export=1
         umlal2 v23.8h, v5.16b, v2.16b
         umull v28.8h, v17.8b, v1.8b
         umlal v28.8h, v6.8b, v3.8b
-        umull2 v29.8h, v17.16b, v1.16b
-        umlal2 v29.8h, v6.16b, v3.16b
         umull v30.8h, v18.8b, v0.8b
         umlal v30.8h, v20.8b, v2.8b
         umull2 v31.8h, v18.16b, v0.16b
         umlal2 v31.8h, v20.16b, v2.16b
         umull v25.8h, v19.8b, v1.8b
         umlal v25.8h, v21.8b, v3.8b
-        umull2 v26.8h, v19.16b, v1.16b
-        umlal2 v26.8h, v21.16b, v3.16b
         rshrn v24.8b, v22.8h, #6
         rshrn2 v24.16b, v23.8h, #6
         rshrn v28.8b, v28.8h, #6
-        rshrn2 v28.16b, v29.8h, #6
         rshrn v30.8b, v30.8h, #6
         rshrn2 v30.16b, v31.8h, #6
         rshrn v27.8b, v25.8h, #6
-        rshrn2 v27.16b, v26.8h, #6
         st1 {v24.16b}, [x0], #16
         st1 {v30.16b}, [x8], #16
-        st1 {v28.8b}, [x0], #8
-        st1 {v27.8b}, [x8], #8
-        add x0, x0, x1
-        add x8, x8, x1
+        st1 {v28.8b}, [x0], x1
+        st1 {v27.8b}, [x8], x1
         b.gt 32b
         ret
 L(blend_v_tbl):
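
In the AArch64 w32 loop the win comes from dropping the widening
multiply-accumulates and narrowing shifts for the unused top 8 pixels: the
umull2/umlal2 pairs feeding v29 and v26 disappear along with their rshrn2
counterparts, and the second inverse mask only needs its low half
(sub v3.8b, v4.8b, v1.8b). What survives for pixels 16-23 of each row, with
descriptive comments added (register roles inferred from the surrounding code):

        umull v28.8h, v17.8b, v1.8b   // tmp[16..23] * mask
        umlal v28.8h, v6.8b, v3.8b    // + dst[16..23] * (64 - mask)
        rshrn v28.8b, v28.8h, #6      // round and narrow; pixels 24..31 are never computed
        st1 {v28.8b}, [x0], x1        // store 8 bytes, advance to the next row pair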