Commit acf3124c authored by Martin Storsjö
Browse files

arm64: looprestoration: Use 4 tbl/tbx for the whole table

Before:                 Cortex A53     A72      A73
selfguided_3x3_8bpc_neon:   3260.6  2175.4   2284.6
selfguided_5x5_8bpc_neon:   2553.2  1694.4   1809.2
selfguided_mix_8bpc_neon:   5720.0  3776.8   4000.5
After:
selfguided_3x3_8bpc_neon:   3567.8  2759.7   2811.5
selfguided_5x5_8bpc_neon:   2720.7  1979.3   2067.5
selfguided_mix_8bpc_neon:   6168.9  4652.6   4828.3
parent 371de01c
Pipeline #5377 passed with stages
in 9 minutes and 48 seconds
......@@ -1449,80 +1449,72 @@ endfunc
// void dav1d_sgr_calc_ab2_neon(int32_t *a, int16_t *b,
// const int w, const int h, const int strength);
// sgr_calc_ab1_neon: exported entry point that sets up the variant-specific
// constants (n = 9, one_by_x = 455, row stride = SUM_STRIDE) and then
// branches to the shared sgr_calc_ab_neon body.
// Register use as seen in the shared body: x0 = a, x1 = b, x2 = w, x3 = h,
// w4 = multiplier for the "p * s" step (presumably the `strength` argument
// of the (a, b, w, h, strength) prototype - confirm).
// NOTE(review): n is written to both v31 and v8 below. The shared body, as
// listed, later loads v28-v31 from the sgr_x_by_x table, so the v31 copy
// looks like a leftover from an older revision (possibly a diff rendered
// without +/- markers) - confirm against the real file.
function sgr_calc_ab1_neon, export=1
stp d8, d9, [sp, #-16]! // save d8/d9 (callee-saved low halves of v8/v9); reloaded by the ldp before ret in sgr_calc_ab_neon
add x3, x3, #2 // h += 2
movi v31.4s, #9 // n
movi v8.4s, #9 // n, in every 32-bit lane; used for the "a * n" multiplies
mov x5, #455 // later broadcast with dup as one_by_x
mov x8, #SUM_STRIDE // the shared body derives the increment between rows from x8
b sgr_calc_ab_neon // continue in the shared implementation
endfunc
// sgr_calc_ab2_neon: exported entry point that sets up the variant-specific
// constants (n = 25, one_by_x = 164, row stride = 2*SUM_STRIDE) for the
// shared sgr_calc_ab_neon body.
// Register use as seen in the shared body: x0 = a, x1 = b, x2 = w, x3 = h,
// w4 = multiplier for the "p * s" step (the `strength` argument in the
// prototype comment for this function).
// There is no branch before endfunc: execution continues into
// sgr_calc_ab_neon, which is the next function in this listing.
// NOTE(review): this presumably relies on endfunc/function emitting no code
// or padding in between - confirm against the macro definitions.
// NOTE(review): n is written to both v31 and v8 below. The shared body, as
// listed, later loads v28-v31 from the sgr_x_by_x table, so the v31 copy
// looks like a leftover from an older revision (possibly a diff rendered
// without +/- markers) - confirm against the real file.
function sgr_calc_ab2_neon, export=1
stp d8, d9, [sp, #-16]! // save d8/d9 (callee-saved low halves of v8/v9); reloaded by the ldp before ret in sgr_calc_ab_neon
add x3, x3, #3 // h += 3
asr x3, x3, #1 // h /= 2, i.e. h = (h + 3) >> 1 overall
movi v31.4s, #25 // n
movi v8.4s, #25 // n, in every 32-bit lane; used for the "a * n" multiplies
mov x5, #164 // later broadcast with dup as one_by_x
mov x8, #(2*SUM_STRIDE) // twice the row stride used by sgr_calc_ab1_neon
endfunc
function sgr_calc_ab_neon
movrel x12, X(sgr_x_by_x)
ld1 {v16.16b, v17.16b, v18.16b}, [x12]
movi v19.16b, #5
movi v20.8b, #55 // idx of last 5
movi v21.8b, #72 // idx of last 4
movi v22.8b, #101 // idx of last 3
movi v23.8b, #169 // idx of last 2
movi v24.8b, #254 // idx of last 1
ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x12], #64
add x2, x2, #2 // w += 2
add x7, x2, #7
bic x7, x7, #7 // aligned w
ld1 {v20.16b, v21.16b, v22.16b, v23.16b}, [x12], #64
sub x7, x8, x7 // increment between rows
movi v29.8h, #1, lsl #8
dup v28.4s, w4
dup v30.4s, w5 // one_by_x
movi v7.8h, #1, lsl #8
dup v5.4s, w4
dup v6.4s, w5 // one_by_x
ld1 {v24.16b, v25.16b, v26.16b, v27.16b}, [x12], #64
movi v9.8b, #64
sub x0, x0, #(4*(SUM_STRIDE))
sub x1, x1, #(2*(SUM_STRIDE))
mov x6, x2 // backup of w
sub v16.16b, v16.16b, v19.16b
sub v17.16b, v17.16b, v19.16b
sub v18.16b, v18.16b, v19.16b
ld1 {v28.16b, v29.16b, v30.16b, v31.16b}, [x12], #64
1:
subs x2, x2, #8
ld1 {v0.4s, v1.4s}, [x0] // a
ld1 {v2.8h}, [x1] // b
mul v0.4s, v0.4s, v31.4s // a * n
mul v1.4s, v1.4s, v31.4s // a * n
mul v0.4s, v0.4s, v8.4s // a * n
mul v1.4s, v1.4s, v8.4s // a * n
umull v3.4s, v2.4h, v2.4h // b * b
umull2 v4.4s, v2.8h, v2.8h // b * b
uqsub v0.4s, v0.4s, v3.4s // imax(a * n - b * b, 0)
uqsub v1.4s, v1.4s, v4.4s // imax(a * n - b * b, 0)
mul v0.4s, v0.4s, v28.4s // p * s
mul v1.4s, v1.4s, v28.4s // p * s
mul v0.4s, v0.4s, v5.4s // p * s
mul v1.4s, v1.4s, v5.4s // p * s
uqshrn v0.4h, v0.4s, #16
uqshrn2 v0.8h, v1.4s, #16
uqrshrn v0.8b, v0.8h, #4 // imin(z, 255)
cmhi v25.8b, v0.8b, v20.8b // = -1 if sgr_x_by_x[v0] < 5
cmhi v26.8b, v0.8b, v21.8b // = -1 if sgr_x_by_x[v0] < 4
tbl v1.8b, {v16.16b, v17.16b, v18.16b}, v0.8b
cmhi v27.8b, v0.8b, v22.8b // = -1 if sgr_x_by_x[v0] < 3
cmhi v5.8b, v0.8b, v23.8b // = -1 if sgr_x_by_x[v0] < 2
add v25.8b, v25.8b, v26.8b
cmhi v6.8b, v0.8b, v24.8b // = -1 if sgr_x_by_x[v0] < 1
add v27.8b, v27.8b, v5.8b
add v6.8b, v6.8b, v19.8b
add v25.8b, v25.8b, v27.8b
add v1.8b, v1.8b, v6.8b
add v1.8b, v1.8b, v25.8b
tbl v1.8b, {v16.16b, v17.16b, v18.16b, v19.16b}, v0.8b
sub v0.8b, v0.8b, v9.8b
tbx v1.8b, {v20.16b, v21.16b, v22.16b, v23.16b}, v0.8b
sub v0.8b, v0.8b, v9.8b
tbx v1.8b, {v24.16b, v25.16b, v26.16b, v27.16b}, v0.8b
sub v0.8b, v0.8b, v9.8b
tbx v1.8b, {v28.16b, v29.16b, v30.16b, v31.16b}, v0.8b
uxtl v1.8h, v1.8b // x
umull v3.4s, v1.4h, v2.4h // x * BB[i]
umull2 v4.4s, v1.8h, v2.8h // x * BB[i]
mul v3.4s, v3.4s, v30.4s // x * BB[i] * sgr_one_by_x
mul v4.4s, v4.4s, v30.4s // x * BB[i] * sgr_one_by_x
mul v3.4s, v3.4s, v6.4s // x * BB[i] * sgr_one_by_x
mul v4.4s, v4.4s, v6.4s // x * BB[i] * sgr_one_by_x
srshr v3.4s, v3.4s, #12 // AA[i]
srshr v4.4s, v4.4s, #12 // AA[i]
sub v2.8h, v29.8h, v1.8h // 256 - x
sub v2.8h, v7.8h, v1.8h // 256 - x
st1 {v3.4s, v4.4s}, [x0], #32
st1 {v2.8h}, [x1], #16
......@@ -1535,6 +1527,7 @@ function sgr_calc_ab_neon
mov x2, x6
b 1b
0:
ldp d8, d9, [sp], #16
ret
endfunc
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment