Commit b1167ce1 authored by Martin Storsjö, committed by Janne Grunau
Browse files

arm64: mc: Use two regs for alternating output rows for w4/8 in avg/w_avg/mask

It was already done this way for w32/64. Not doing it for w16 as it
didn't help there (and instead gave a small slowdown due to the two
setup instructions).

This gives a small speedup on in-order cores like A53.

Before:         Cortex A53     A72     A73
avg_w4_8bpc_neon:     60.9    25.6    29.0
avg_w8_8bpc_neon:    143.6    52.8    64.0
After:
avg_w4_8bpc_neon:     56.7    26.7    28.5
avg_w8_8bpc_neon:    137.2    54.5    64.4
parent 0bad117e
......@@ -85,38 +85,44 @@ function \type\()_8bpc_neon, export=1
\type v4, v0, v1, v2, v3
sub x7, x7, w4, uxtw
br x7
40:
add x7, x0, x1
lsl x1, x1, #1
4:
cmp w5, #4
st1 {v4.s}[0], [x0], x1
st1 {v4.s}[1], [x0], x1
st1 {v4.s}[1], [x7], x1
st1 {v4.s}[2], [x0], x1
st1 {v4.s}[3], [x0], x1
st1 {v4.s}[3], [x7], x1
b.eq 0f
\type v5, v0, v1, v2, v3
cmp w5, #8
st1 {v5.s}[0], [x0], x1
st1 {v5.s}[1], [x0], x1
st1 {v5.s}[1], [x7], x1
st1 {v5.s}[2], [x0], x1
st1 {v5.s}[3], [x0], x1
st1 {v5.s}[3], [x7], x1
b.eq 0f
\type v4, v0, v1, v2, v3
st1 {v4.s}[0], [x0], x1
st1 {v4.s}[1], [x0], x1
st1 {v4.s}[1], [x7], x1
\type v5, v0, v1, v2, v3
st1 {v4.s}[2], [x0], x1
st1 {v4.s}[3], [x0], x1
st1 {v4.s}[3], [x7], x1
st1 {v5.s}[0], [x0], x1
st1 {v5.s}[1], [x0], x1
st1 {v5.s}[1], [x7], x1
st1 {v5.s}[2], [x0], x1
st1 {v5.s}[3], [x0], x1
st1 {v5.s}[3], [x7], x1
ret
80:
add x7, x0, x1
lsl x1, x1, #1
8:
st1 {v4.d}[0], [x0], x1
\type v5, v0, v1, v2, v3
st1 {v4.d}[1], [x0], x1
st1 {v4.d}[1], [x7], x1
st1 {v5.d}[0], [x0], x1
subs w5, w5, #4
st1 {v5.d}[1], [x0], x1
st1 {v5.d}[1], [x7], x1
b.le 0f
\type v4, v0, v1, v2, v3
b 8b
......@@ -185,8 +191,8 @@ L(\type\()_tbl):
.hword L(\type\()_tbl) - 640b
.hword L(\type\()_tbl) - 320b
.hword L(\type\()_tbl) - 16b
.hword L(\type\()_tbl) - 8b
.hword L(\type\()_tbl) - 4b
.hword L(\type\()_tbl) - 80b
.hword L(\type\()_tbl) - 40b
endfunc
.endm
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment