Commit 3ad1d894 authored by Martin Storsjö
Browse files

arm64: looprestoration: Do a tbl with 4 regs

Before:                 Cortex A53     A72      A73
selfguided_3x3_8bpc_neon:   3260.6  2175.4   2284.6
selfguided_5x5_8bpc_neon:   2553.2  1694.4   1809.2
selfguided_mix_8bpc_neon:   5720.0  3776.8   4000.5
After:
selfguided_3x3_8bpc_neon:   3315.6  2224.7   2310.5
selfguided_5x5_8bpc_neon:   2603.5  1729.7   1826.9
selfguided_mix_8bpc_neon:   5809.0  3866.3   4043.7
parent 371de01c
Pipeline #5379 passed with stages in 10 minutes and 16 seconds
@@ -1466,9 +1466,8 @@ endfunc
 function sgr_calc_ab_neon
 movrel x12, X(sgr_x_by_x)
-ld1 {v16.16b, v17.16b, v18.16b}, [x12]
-movi v19.16b, #5
-movi v20.8b, #55 // idx of last 5
+ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x12]
+movi v20.16b, #4
 movi v21.8b, #72 // idx of last 4
 movi v22.8b, #101 // idx of last 3
 movi v23.8b, #169 // idx of last 2
@@ -1483,9 +1482,10 @@ function sgr_calc_ab_neon
 sub x0, x0, #(4*(SUM_STRIDE))
 sub x1, x1, #(2*(SUM_STRIDE))
 mov x6, x2 // backup of w
-sub v16.16b, v16.16b, v19.16b
-sub v17.16b, v17.16b, v19.16b
-sub v18.16b, v18.16b, v19.16b
+sub v16.16b, v16.16b, v20.16b
+sub v17.16b, v17.16b, v20.16b
+sub v18.16b, v18.16b, v20.16b
+sub v19.16b, v19.16b, v20.16b
 1:
 subs x2, x2, #8
 ld1 {v0.4s, v1.4s}, [x0] // a
@@ -1502,18 +1502,16 @@ function sgr_calc_ab_neon
 uqshrn2 v0.8h, v1.4s, #16
 uqrshrn v0.8b, v0.8h, #4 // imin(z, 255)
-cmhi v25.8b, v0.8b, v20.8b // = -1 if sgr_x_by_x[v0] < 5
+tbl v1.8b, {v16.16b, v17.16b, v18.16b, v19.16b}, v0.8b
 cmhi v26.8b, v0.8b, v21.8b // = -1 if sgr_x_by_x[v0] < 4
-tbl v1.8b, {v16.16b, v17.16b, v18.16b}, v0.8b
 cmhi v27.8b, v0.8b, v22.8b // = -1 if sgr_x_by_x[v0] < 3
 cmhi v5.8b, v0.8b, v23.8b // = -1 if sgr_x_by_x[v0] < 2
-add v25.8b, v25.8b, v26.8b
+add v26.8b, v26.8b, v27.8b
 cmhi v6.8b, v0.8b, v24.8b // = -1 if sgr_x_by_x[v0] < 1
-add v27.8b, v27.8b, v5.8b
-add v6.8b, v6.8b, v19.8b
-add v25.8b, v25.8b, v27.8b
-add v1.8b, v1.8b, v6.8b
-add v1.8b, v1.8b, v25.8b
+add v5.8b, v5.8b, v6.8b
+add v26.8b, v26.8b, v5.8b
+add v1.8b, v1.8b, v20.8b
+add v1.8b, v1.8b, v26.8b
 uxtl v1.8h, v1.8b // x
 umull v3.4s, v1.4h, v2.4h // x * BB[i]
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment