Commit e7db58c9 authored by Martin Storsjö
Browse files

arm64: looprestoration: Use individual ldrb for loading from the table

Before:                 Cortex A53     A72      A73
selfguided_3x3_8bpc_neon:   3260.6  2175.4   2284.6
selfguided_5x5_8bpc_neon:   2553.2  1694.4   1809.2
selfguided_mix_8bpc_neon:   5720.0  3776.8   4000.5
After:
selfguided_3x3_8bpc_neon:   3514.1  2388.5   2335.9
selfguided_5x5_8bpc_neon:   2692.2  1789.5   1835.1
selfguided_mix_8bpc_neon:   6091.1  4089.1   4083.2
parent 371de01c
Pipeline #5378 passed with stages
in 10 minutes and 17 seconds
......@@ -1466,6 +1466,7 @@ endfunc
function sgr_calc_ab_neon
movrel x12, X(sgr_x_by_x)
/*
ld1 {v16.16b, v17.16b, v18.16b}, [x12]
movi v19.16b, #5
movi v20.8b, #55 // idx of last 5
......@@ -1473,6 +1474,7 @@ function sgr_calc_ab_neon
movi v22.8b, #101 // idx of last 3
movi v23.8b, #169 // idx of last 2
movi v24.8b, #254 // idx of last 1
*/
add x2, x2, #2 // w += 2
add x7, x2, #7
bic x7, x7, #7 // aligned w
......@@ -1483,9 +1485,11 @@ function sgr_calc_ab_neon
sub x0, x0, #(4*(SUM_STRIDE))
sub x1, x1, #(2*(SUM_STRIDE))
mov x6, x2 // backup of w
/*
sub v16.16b, v16.16b, v19.16b
sub v17.16b, v17.16b, v19.16b
sub v18.16b, v18.16b, v19.16b
*/
1:
subs x2, x2, #8
ld1 {v0.4s, v1.4s}, [x0] // a
......@@ -1501,7 +1505,7 @@ function sgr_calc_ab_neon
uqshrn v0.4h, v0.4s, #16
uqshrn2 v0.8h, v1.4s, #16
uqrshrn v0.8b, v0.8h, #4 // imin(z, 255)
/*
cmhi v25.8b, v0.8b, v20.8b // = -1 if sgr_x_by_x[v0] < 5
cmhi v26.8b, v0.8b, v21.8b // = -1 if sgr_x_by_x[v0] < 4
tbl v1.8b, {v16.16b, v17.16b, v18.16b}, v0.8b
......@@ -1515,6 +1519,32 @@ function sgr_calc_ab_neon
add v1.8b, v1.8b, v6.8b
add v1.8b, v1.8b, v25.8b
uxtl v1.8h, v1.8b // x
*/
// Scalar replacement for the commented-out tbl-based lookup:
// v0.8b holds eight byte indices (imin(z, 255)); each one is used as a
// byte offset into the sgr_x_by_x table whose address is in x12, and the
// loaded byte is placed, zero-extended, in the matching 16-bit lane of v1.
// NOTE(review): clobbers x5 and x8-x11 on every iteration - assumes none
// of them is live inside the loop; confirm against the full function.
mov x5, v0.d[0] // all eight 8-bit indices in one GPR
// Indices 0-3: bits [31:0] of x5.
ubfx x8, x5, #0, #8
ubfx x9, x5, #8, #8
ubfx x10, x5, #16, #8
ubfx x11, x5, #24, #8
// ldrb zero-extends into the w register, so no separate widening is needed.
ldrb w8, [x12, x8]
ldrb w9, [x12, x9]
ldrb w10, [x12, x10]
ldrb w11, [x12, x11]
ins v1.h[0], w8
ins v1.h[1], w9
ins v1.h[2], w10
ins v1.h[3], w11
// Indices 4-7: bits [63:32] of x5, same pattern, reusing x8-x11.
ubfx x8, x5, #32, #8
ubfx x9, x5, #40, #8
ubfx x10, x5, #48, #8
ubfx x11, x5, #56, #8
ldrb w8, [x12, x8]
ldrb w9, [x12, x9]
ldrb w10, [x12, x10]
ldrb w11, [x12, x11]
ins v1.h[4], w8
ins v1.h[5], w9
ins v1.h[6], w10
ins v1.h[7], w11 // v1.8h now holds all eight x values
umull v3.4s, v1.4h, v2.4h // x * BB[i]
umull2 v4.4s, v1.8h, v2.8h // x * BB[i]
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment