Commit 0bad117e authored by Martin Storsjö, committed by Janne Grunau

arm64: mc: Simplify avg/w_avg/mask by always using the w16 macro

This shortens the source by 40 lines, and gives a significant
speedup on A53, a small speedup on A72 and a very minor slowdown
for avg/w_avg on A73.

Before:           Cortex A53     A72     A73
avg_w4_8bpc_neon:       67.4    26.1    25.4
avg_w8_8bpc_neon:      158.7    56.3    59.1
avg_w16_8bpc_neon:     382.9   154.1   160.7
w_avg_w4_8bpc_neon:     99.9    43.6    39.4
w_avg_w8_8bpc_neon:    253.2    98.3    99.0
w_avg_w16_8bpc_neon:   543.1   285.0   301.8
mask_w4_8bpc_neon:     110.6    51.4    45.1
mask_w8_8bpc_neon:     295.0   129.9   114.0
mask_w16_8bpc_neon:    654.6   365.8   369.7
After:            Cortex A53     A72     A73
avg_w4_8bpc_neon:       60.8    26.3    29.0
avg_w8_8bpc_neon:      142.8    52.9    64.1
avg_w16_8bpc_neon:     378.2   153.4   160.8
w_avg_w4_8bpc_neon:     78.7    41.0    40.9
w_avg_w8_8bpc_neon:    190.6    90.1   105.1
w_avg_w16_8bpc_neon:   531.1   279.3   301.4
mask_w4_8bpc_neon:      86.6    47.2    44.9
mask_w8_8bpc_neon:     222.0   114.3   114.9
mask_w16_8bpc_neon:    639.5   356.0   369.8
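
For reference, all three operations blend two 16-bit intermediate buffers (the output of the prep functions, which carry 4 extra fractional bits at 8 bpc) down to 8-bit pixels. Below is a scalar C sketch of what the macros in this diff compute, with the shifts and rounding constants read off the assembly; the helper names are illustrative, not dav1d API:

#include <stdint.h>

static inline uint8_t clip_u8(const int v) {
    return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;
}

/* avg: plain average; 4 intermediate bits plus 1 for the sum
 * give the #5 in sqrshrun. */
static uint8_t avg_px(const int16_t a, const int16_t b) {
    return clip_u8((a + b + 16) >> 5);
}

/* w_avg: weighted average, weight in [0, 16]; 4 + 4 bits are
 * shifted out, with sqdmulh doing the first half in the asm. */
static uint8_t w_avg_px(const int16_t a, const int16_t b, const int weight) {
    return clip_u8((a * weight + b * (16 - weight) + 128) >> 8);
}

/* mask: per-pixel blend, mask value in [0, 64]; 4 + 6 bits. */
static uint8_t mask_px(const int16_t a, const int16_t b, const int m) {
    return clip_u8((a * m + b * (64 - m) + 512) >> 10);
}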
parent 2e68c1f3
@@ -29,14 +29,7 @@
 #include "src/arm/asm.S"
 #include "util.S"
 
-.macro avg dst, t0, t1
-        ld1 {\t0\().8h}, [x2], 16
-        ld1 {\t1\().8h}, [x3], 16
-        add \t0\().8h, \t0\().8h, \t1\().8h
-        sqrshrun \dst\().8b, \t0\().8h, #5
-.endm
-
-.macro avg16 dst, t0, t1, t2, t3
+.macro avg dst, t0, t1, t2, t3
         ld1 {\t0\().8h,\t1\().8h}, [x2], 32
         ld1 {\t2\().8h,\t3\().8h}, [x3], 32
         add \t0\().8h, \t0\().8h, \t2\().8h
@@ -45,16 +38,7 @@
         sqrshrun2 \dst\().16b, \t1\().8h, #5
 .endm
 
-.macro w_avg dst, t0, t1
-        ld1 {\t0\().8h}, [x2], 16
-        ld1 {\t1\().8h}, [x3], 16
-        sub \t0\().8h, \t1\().8h, \t0\().8h
-        sqdmulh \t0\().8h, \t0\().8h, v30.8h
-        add \t0\().8h, \t1\().8h, \t0\().8h
-        sqrshrun \dst\().8b, \t0\().8h, #4
-.endm
-
-.macro w_avg16 dst, t0, t1, t2, t3
+.macro w_avg dst, t0, t1, t2, t3
         ld1 {\t0\().8h,\t1\().8h}, [x2], 32
         ld1 {\t2\().8h,\t3\().8h}, [x3], 32
         sub \t0\().8h, \t2\().8h, \t0\().8h
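
Worth noting for this hunk: the function prologue (outside this excerpt) puts -(weight << 11) into v30, so the sqdmulh in w_avg turns the weighted blend into a single multiply on the difference of the two buffers. A sketch of the identity, assuming that setup:

#include <stdint.h>

/* sqdmulh(x, y) = saturate((2 * x * y) >> 16). With y = -(weight << 11)
 * and d = b - a, (2 * d * -(weight << 11)) >> 16 == -((d * weight) >> 4),
 * so b + sqdmulh(d, y) == (a * weight + b * (16 - weight)) >> 4,
 * which the trailing sqrshrun #4 then rounds and narrows to 8 bits. */
static int16_t sqdmulh16(const int16_t x, const int16_t y) {
    const int32_t v = (int32_t)(((int64_t)x * y * 2) >> 16);
    return v > INT16_MAX ? INT16_MAX : (int16_t)v; /* saturates only for x == y == INT16_MIN */
}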
@@ -67,19 +51,7 @@
         sqrshrun2 \dst\().16b, \t1\().8h, #4
 .endm
 
-.macro mask dst, t0, t1
-        ld1 {v30.8b}, [x6], 8
-        ld1 {\t0\().8h}, [x2], 16
-        mul v30.8b, v30.8b, v31.8b
-        ld1 {\t1\().8h}, [x3], 16
-        shll v30.8h, v30.8b, #8
-        sub \t0\().8h, \t1\().8h, \t0\().8h
-        sqdmulh \t0\().8h, \t0\().8h, v30.8h
-        add \t0\().8h, \t1\().8h, \t0\().8h
-        sqrshrun \dst\().8b, \t0\().8h, #4
-.endm
-
-.macro mask16 dst, t0, t1, t2, t3
+.macro mask dst, t0, t1, t2, t3
         ld1 {v30.16b}, [x6], 16
         ld1 {\t0\().8h,\t1\().8h}, [x2], 32
         mul v30.16b, v30.16b, v31.16b
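
mask reuses the same sqdmulh trick but derives the factor per pixel: assuming the prologue loads v31 with 254 (as dav1d's mask setup does), multiplying a mask value m in [0, 64] by it keeps -2*m modulo 256 in the low byte, and shll #8 widens that into -(m << 9), exactly the factor needed for a /64 blend. A sketch of the byte trick:

#include <assert.h>
#include <stdint.h>

/* mul v30.8b, v30.8b, v31.8b  followed by  shll v30.8h, v30.8b, #8:
 * for m in [0, 64], the low byte of m * 254 is (-2 * m) & 0xff, and
 * placing it in the high byte of a signed 16-bit lane yields -(m << 9). */
static int16_t mask_factor(const uint8_t m) {
    assert(m <= 64);
    const uint8_t lo = (uint8_t)(m * 254);
    return (int16_t)(uint16_t)(lo << 8);
}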
@@ -109,113 +81,102 @@ function \type\()_8bpc_neon, export=1
 .endif
         adr x7, L(\type\()_tbl)
         sub w4, w4, #24
-        \type v4, v0, v1
         ldrh w4, [x7, x4, lsl #1]
-        \type v5, v2, v3
+        \type v4, v0, v1, v2, v3
         sub x7, x7, w4, uxtw
         br x7
 4:
         cmp w5, #4
         st1 {v4.s}[0], [x0], x1
         st1 {v4.s}[1], [x0], x1
-        st1 {v5.s}[0], [x0], x1
-        st1 {v5.s}[1], [x0], x1
+        st1 {v4.s}[2], [x0], x1
+        st1 {v4.s}[3], [x0], x1
         b.eq 0f
-        \type v6, v0, v1
-        \type v7, v2, v3
+        \type v5, v0, v1, v2, v3
         cmp w5, #8
-        st1 {v6.s}[0], [x0], x1
-        st1 {v6.s}[1], [x0], x1
-        st1 {v7.s}[0], [x0], x1
-        st1 {v7.s}[1], [x0], x1
+        st1 {v5.s}[0], [x0], x1
+        st1 {v5.s}[1], [x0], x1
+        st1 {v5.s}[2], [x0], x1
+        st1 {v5.s}[3], [x0], x1
         b.eq 0f
-        \type v4, v0, v1
-        \type v5, v2, v3
+        \type v4, v0, v1, v2, v3
         st1 {v4.s}[0], [x0], x1
         st1 {v4.s}[1], [x0], x1
-        \type v6, v0, v1
+        \type v5, v0, v1, v2, v3
+        st1 {v4.s}[2], [x0], x1
+        st1 {v4.s}[3], [x0], x1
         st1 {v5.s}[0], [x0], x1
         st1 {v5.s}[1], [x0], x1
-        \type v7, v2, v3
-        st1 {v6.s}[0], [x0], x1
-        st1 {v6.s}[1], [x0], x1
-        st1 {v7.s}[0], [x0], x1
-        st1 {v7.s}[1], [x0], x1
+        st1 {v5.s}[2], [x0], x1
+        st1 {v5.s}[3], [x0], x1
         ret
 8:
-        st1 {v4.8b}, [x0], x1
-        \type v6, v0, v1
-        st1 {v5.8b}, [x0], x1
-        \type v7, v0, v1
-        st1 {v6.8b}, [x0], x1
+        st1 {v4.d}[0], [x0], x1
+        \type v5, v0, v1, v2, v3
+        st1 {v4.d}[1], [x0], x1
+        st1 {v5.d}[0], [x0], x1
         subs w5, w5, #4
-        st1 {v7.8b}, [x0], x1
+        st1 {v5.d}[1], [x0], x1
         b.le 0f
-        \type v4, v0, v1
-        \type v5, v2, v3
+        \type v4, v0, v1, v2, v3
         b 8b
-160:
-        trn1 v4.2d, v4.2d, v5.2d
 16:
-        \type\()16 v5, v0, v1, v2, v3
+        \type v5, v0, v1, v2, v3
         st1 {v4.16b}, [x0], x1
-        \type\()16 v6, v0, v1, v2, v3
+        \type v6, v0, v1, v2, v3
         st1 {v5.16b}, [x0], x1
-        \type\()16 v7, v0, v1, v2, v3
+        \type v7, v0, v1, v2, v3
         st1 {v6.16b}, [x0], x1
         subs w5, w5, #4
         st1 {v7.16b}, [x0], x1
         b.le 0f
-        \type\()16 v4, v0, v1, v2, v3
+        \type v4, v0, v1, v2, v3
         b 16b
 320:
-        trn1 v4.2d, v4.2d, v5.2d
         add x7, x0, x1
         lsl x1, x1, #1
 32:
-        \type\()16 v5, v0, v1, v2, v3
-        \type\()16 v6, v0, v1, v2, v3
+        \type v5, v0, v1, v2, v3
+        \type v6, v0, v1, v2, v3
         st1 {v4.16b,v5.16b}, [x0], x1
-        \type\()16 v7, v0, v1, v2, v3
+        \type v7, v0, v1, v2, v3
         subs w5, w5, #2
         st1 {v6.16b,v7.16b}, [x7], x1
         b.le 0f
-        \type\()16 v4, v0, v1, v2, v3
+        \type v4, v0, v1, v2, v3
         b 32b
 640:
-        trn1 v4.2d, v4.2d, v5.2d
         add x7, x0, x1
         lsl x1, x1, #1
 64:
-        \type\()16 v5, v0, v1, v2, v3
-        \type\()16 v6, v0, v1, v2, v3
-        \type\()16 v7, v0, v1, v2, v3
-        \type\()16 v16, v0, v1, v2, v3
-        \type\()16 v17, v0, v1, v2, v3
+        \type v5, v0, v1, v2, v3
+        \type v6, v0, v1, v2, v3
+        \type v7, v0, v1, v2, v3
+        \type v16, v0, v1, v2, v3
+        \type v17, v0, v1, v2, v3
         st1 {v4.16b,v5.16b,v6.16b,v7.16b}, [x0], x1
-        \type\()16 v18, v0, v1, v2, v3
-        \type\()16 v19, v0, v1, v2, v3
+        \type v18, v0, v1, v2, v3
+        \type v19, v0, v1, v2, v3
         subs w5, w5, #2
         st1 {v16.16b,v17.16b,v18.16b,v19.16b}, [x7], x1
         b.le 0f
-        \type\()16 v4, v0, v1, v2, v3
+        \type v4, v0, v1, v2, v3
         b 64b
 1280:
-        trn1 v4.2d, v4.2d, v5.2d
         add x7, x0, #64
 128:
-        \type\()16 v5, v0, v1, v2, v3
-        \type\()16 v6, v0, v1, v2, v3
-        \type\()16 v7, v0, v1, v2, v3
-        \type\()16 v16, v0, v1, v2, v3
-        \type\()16 v17, v0, v1, v2, v3
+        \type v5, v0, v1, v2, v3
+        \type v6, v0, v1, v2, v3
+        \type v7, v0, v1, v2, v3
+        \type v16, v0, v1, v2, v3
+        \type v17, v0, v1, v2, v3
         st1 {v4.16b,v5.16b,v6.16b,v7.16b}, [x0], x1
-        \type\()16 v18, v0, v1, v2, v3
-        \type\()16 v19, v0, v1, v2, v3
+        \type v18, v0, v1, v2, v3
+        \type v19, v0, v1, v2, v3
         subs w5, w5, #1
         st1 {v16.16b,v17.16b,v18.16b,v19.16b}, [x7], x1
         b.le 0f
-        \type\()16 v4, v0, v1, v2, v3
+        \type v4, v0, v1, v2, v3
         b 128b
 0:
         ret
@@ -223,7 +184,7 @@ L(\type\()_tbl):
         .hword L(\type\()_tbl) - 1280b
         .hword L(\type\()_tbl) - 640b
         .hword L(\type\()_tbl) - 320b
-        .hword L(\type\()_tbl) - 160b
+        .hword L(\type\()_tbl) - 16b
         .hword L(\type\()_tbl) - 8b
         .hword L(\type\()_tbl) - 4b
 endfunc
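
The shape of the change in the function body: every width now runs the 16-pixel macro once per 16 outputs, and the w4/w8 cases peel 4- and 8-byte rows out of the 16-byte result with lane stores (st1 {v4.s}[n] and st1 {v4.d}[n]) instead of invoking a narrower macro repeatedly. That is also why the 160: entry point and its trn1 disappear: v4 already holds a full 16-byte group when execution reaches 16:. A rough scalar model of the new store pattern, with hypothetical names:

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Stand-in for one avg/w_avg/mask macro invocation: always 16 pixels. */
typedef void (*bidir16_fn)(uint8_t out[16]);

/* w == 4: one 16-pixel block covers four rows; cf. st1 {v4.s}[0..3]. */
static void store_w4(uint8_t *dst, const ptrdiff_t stride, const bidir16_fn op) {
    uint8_t block[16];
    op(block);
    for (int row = 0; row < 4; row++)
        memcpy(dst + row * stride, block + 4 * row, 4);
}

/* w == 8: one 16-pixel block covers two rows; cf. st1 {v4.d}[0..1]. */
static void store_w8(uint8_t *dst, const ptrdiff_t stride, const bidir16_fn op) {
    uint8_t block[16];
    op(block);
    for (int row = 0; row < 2; row++)
        memcpy(dst + row * stride, block + 8 * row, 8);
}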