Commit a3b8157e authored by Kyle Siefring, committed by Martin Storsjö

arm64: warped motion: Various optimizations

- Reorder loads of filters to benefit in-order cores.
- Use full 128-bit vectors to transpose 8x8 bytes. zip1 is called in the
   first stage, which will hurt performance on some older big cores.
   (A C-intrinsics sketch of the new transpose follows the benchmark
   numbers below.)
- Rework horz stage for 8-bit mode (see the scalar C sketch after this list):
    * Use smull instead of mul
    * Replace existing narrow and long instructions
    * Replace the mov after each call with a right shift
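
A scalar C sketch of the horz rework (illustrative only, not dav1d's reference
code; the helper name filter8_biased is made up): XORing an unsigned pixel
with 0x80 is the same as subtracting 128 and reinterpreting it as signed
8-bit, so the pixels fit smull's s8 operands and the uxtl+mul pair can be
dropped. Assuming each 8-tap warp filter sums to 128, the bias this introduces
is a known constant that the final add of v23 restores.

    #include <stdint.h>

    static int16_t filter8_biased(const uint8_t *px, const int8_t *coef)
    {
        int32_t sum = 0;
        for (int k = 0; k < 8; k++) {
            const int8_t biased = (int8_t)(px[k] ^ 0x80); // px[k] - 128, now in s8 range
            sum += coef[k] * biased;                      // what smull + addp accumulate
        }
        // The true sum is sum + 128*(tap sum) = sum + 128*128, so the rounded
        // horizontal stage below comes out 16384 >> 3 = 2048 too low. After
        // the vertical pass (taps again summing to 128) and the final
        // narrowing shift, that deficit is 2048*128 >> 11 = 128 for warp_8x8
        // and 2048*128 >> 7 = 2048 for warp_8x8t, exactly the constants
        // loaded into v23 and added back at the end.
        return (int16_t)((sum + 4) >> 3);                 // matches srshr #3
    }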

Before:            Cortex A55    A53     A72     A73
warp_8x8_8bpc_neon:    1683.2  1860.6  1065.0  1102.6
warp_8x8t_8bpc_neon:   1673.2  1846.4  1057.0  1098.4
warp_8x8_16bpc_neon:   1870.7  2031.7  1147.3  1220.7
warp_8x8t_16bpc_neon:  1848.0  2006.2  1121.6  1188.0
After:
warp_8x8_8bpc_neon:    1267.2  1446.2   807.0   871.5
warp_8x8t_8bpc_neon:   1245.4  1422.0   810.2   868.4
warp_8x8_16bpc_neon:   1769.8  1929.3  1132.0  1238.2
warp_8x8t_16bpc_neon:  1747.3  1904.1  1101.5  1207.9
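
For reference, a C-intrinsics sketch of what the new transpose_8x8b_xtl macro
(at the end of the diff) computes: a zip1-first 8x8 byte transpose done on
full 128-bit vectors, with the sign-extension folded into the final stage.
Illustrative only, assuming the AArch64 ACLE intrinsics; the helper name
transpose_8x8b_xtl_c is made up.

    #include <arm_neon.h>

    static void transpose_8x8b_xtl_c(const int8x8_t r[8], int16x8_t out[8])
    {
        // Stage 1: interleave row pairs: a0 b0 a1 b1 ... a7 b7, etc.
        const int8x16_t ab = vzip1q_s8(vcombine_s8(r[0], r[0]), vcombine_s8(r[1], r[1]));
        const int8x16_t cd = vzip1q_s8(vcombine_s8(r[2], r[2]), vcombine_s8(r[3], r[3]));
        const int8x16_t ef = vzip1q_s8(vcombine_s8(r[4], r[4]), vcombine_s8(r[5], r[5]));
        const int8x16_t gh = vzip1q_s8(vcombine_s8(r[6], r[6]), vcombine_s8(r[7], r[7]));
        // Stage 2: transpose 16-bit pairs: a0 b0 c0 d0 a2 b2 c2 d2 ..., etc.
        const int16x8_t ab16 = vreinterpretq_s16_s8(ab), cd16 = vreinterpretq_s16_s8(cd);
        const int16x8_t ef16 = vreinterpretq_s16_s8(ef), gh16 = vreinterpretq_s16_s8(gh);
        const int16x8_t abcd_e = vtrn1q_s16(ab16, cd16), abcd_o = vtrn2q_s16(ab16, cd16);
        const int16x8_t efgh_e = vtrn1q_s16(ef16, gh16), efgh_o = vtrn2q_s16(ef16, gh16);
        // Stage 3: transpose 32-bit quads; each vector now holds two output
        // rows (input columns n and n+4) in its low and high halves.
        const int32x4_t ae = vreinterpretq_s32_s16(abcd_e), ee = vreinterpretq_s32_s16(efgh_e);
        const int32x4_t ao = vreinterpretq_s32_s16(abcd_o), eo = vreinterpretq_s32_s16(efgh_o);
        const int8x16_t c04 = vreinterpretq_s8_s32(vtrn1q_s32(ae, ee));
        const int8x16_t c26 = vreinterpretq_s8_s32(vtrn2q_s32(ae, ee));
        const int8x16_t c15 = vreinterpretq_s8_s32(vtrn1q_s32(ao, eo));
        const int8x16_t c37 = vreinterpretq_s8_s32(vtrn2q_s32(ao, eo));
        // Stage 4: widen to 16 bits (sxtl/sxtl2 in the assembly).
        out[0] = vmovl_s8(vget_low_s8(c04)); out[4] = vmovl_high_s8(c04);
        out[2] = vmovl_s8(vget_low_s8(c26)); out[6] = vmovl_high_s8(c26);
        out[1] = vmovl_s8(vget_low_s8(c15)); out[5] = vmovl_high_s8(c15);
        out[3] = vmovl_s8(vget_low_s8(c37)); out[7] = vmovl_high_s8(c37);
    }

Working on full 128-bit vectors lets each trn handle two output rows at once,
so the old 24-instruction trn ladder plus eight sxtl becomes four zip1, eight
trn and the eight sxtl/sxtl2 above.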

@@ -2916,8 +2916,8 @@ filter_fn prep, x0, x7, x1, x2, w3, w4, w5, x5, w6, x6, x8, x9, 6
.macro load_filter_row dst, src, inc
asr w13, \src, #10
ldr \dst, [x11, w13, sxtw #3]
add \src, \src, \inc
ldr \dst, [x11, w13, sxtw #3]
.endm
function warp_filter_horz_neon
@@ -2926,57 +2926,44 @@ function warp_filter_horz_neon
ld1 {v16.8b, v17.8b}, [x2], x3
load_filter_row d0, w12, w7
uxtl v16.8h, v16.8b
load_filter_row d1, w12, w7
uxtl v17.8h, v17.8b
load_filter_row d2, w12, w7
sxtl v0.8h, v0.8b
load_filter_row d3, w12, w7
sxtl v1.8h, v1.8b
load_filter_row d4, w12, w7
sxtl v2.8h, v2.8b
load_filter_row d5, w12, w7
sxtl v3.8h, v3.8b
load_filter_row d6, w12, w7
sxtl v4.8h, v4.8b
// subtract 128 to allow using smull
eor v16.8b, v16.8b, v22.8b
eor v17.8b, v17.8b, v22.8b
load_filter_row d7, w12, w7
sxtl v5.8h, v5.8b
ext v18.16b, v16.16b, v17.16b, #2*1
mul v23.8h, v16.8h, v0.8h
sxtl v6.8h, v6.8b
ext v19.16b, v16.16b, v17.16b, #2*2
mul v18.8h, v18.8h, v1.8h
sxtl v7.8h, v7.8b
ext v20.16b, v16.16b, v17.16b, #2*3
mul v19.8h, v19.8h, v2.8h
ext v21.16b, v16.16b, v17.16b, #2*4
saddlp v23.4s, v23.8h
mul v20.8h, v20.8h, v3.8h
ext v22.16b, v16.16b, v17.16b, #2*5
saddlp v18.4s, v18.8h
mul v21.8h, v21.8h, v4.8h
saddlp v19.4s, v19.8h
mul v22.8h, v22.8h, v5.8h
saddlp v20.4s, v20.8h
saddlp v21.4s, v21.8h
saddlp v22.4s, v22.8h
addp v18.4s, v23.4s, v18.4s
ext v23.16b, v16.16b, v17.16b, #2*6
addp v19.4s, v19.4s, v20.4s
mul v23.8h, v23.8h, v6.8h
ext v20.16b, v16.16b, v17.16b, #2*7
mul v20.8h, v20.8h, v7.8h
saddlp v23.4s, v23.8h
addp v21.4s, v21.4s, v22.4s
saddlp v20.4s, v20.8h
addp v20.4s, v23.4s, v20.4s
addp v18.4s, v18.4s, v19.4s
addp v20.4s, v21.4s, v20.4s
add w5, w5, w8
ext v18.8b, v16.8b, v17.8b, #1
ext v19.8b, v16.8b, v17.8b, #2
smull v0.8h, v0.8b, v16.8b
smull v1.8h, v1.8b, v18.8b
ext v18.8b, v16.8b, v17.8b, #3
ext v20.8b, v16.8b, v17.8b, #4
smull v2.8h, v2.8b, v19.8b
smull v3.8h, v3.8b, v18.8b
ext v18.8b, v16.8b, v17.8b, #5
ext v19.8b, v16.8b, v17.8b, #6
smull v4.8h, v4.8b, v20.8b
smull v5.8h, v5.8b, v18.8b
ext v18.8b, v16.8b, v17.8b, #7
smull v6.8h, v6.8b, v19.8b
smull v7.8h, v7.8b, v18.8b
addp v0.8h, v0.8h, v1.8h
addp v2.8h, v2.8h, v3.8h
addp v4.8h, v4.8h, v5.8h
addp v6.8h, v6.8h, v7.8h
addp v0.8h, v0.8h, v2.8h
addp v4.8h, v4.8h, v6.8h
addp v0.8h, v0.8h, v4.8h
rshrn v16.4h, v18.4s, #3
rshrn2 v16.8h, v20.4s, #3
add w5, w5, w8
ret
endfunc
@@ -3002,25 +2989,32 @@ function warp_affine_8x8\t\()_8bpc_neon, export=1
lsl x1, x1, #1
.endif
movi v22.8b, #128
.ifb \t
movi v23.8h, #128
.else
movi v23.8h, #8, lsl #8
.endif
bl warp_filter_horz_neon
mov v24.16b, v16.16b
srshr v24.8h, v0.8h, #3
bl warp_filter_horz_neon
mov v25.16b, v16.16b
srshr v25.8h, v0.8h, #3
bl warp_filter_horz_neon
mov v26.16b, v16.16b
srshr v26.8h, v0.8h, #3
bl warp_filter_horz_neon
mov v27.16b, v16.16b
srshr v27.8h, v0.8h, #3
bl warp_filter_horz_neon
mov v28.16b, v16.16b
srshr v28.8h, v0.8h, #3
bl warp_filter_horz_neon
mov v29.16b, v16.16b
srshr v29.8h, v0.8h, #3
bl warp_filter_horz_neon
mov v30.16b, v16.16b
srshr v30.8h, v0.8h, #3
1:
add w14, w6, #512
bl warp_filter_horz_neon
mov v31.16b, v16.16b
srshr v31.8h, v0.8h, #3
load_filter_row d0, w14, w9
load_filter_row d1, w14, w9
@@ -3030,15 +3024,7 @@ function warp_affine_8x8\t\()_8bpc_neon, export=1
load_filter_row d5, w14, w9
load_filter_row d6, w14, w9
load_filter_row d7, w14, w9
transpose_8x8b v0, v1, v2, v3, v4, v5, v6, v7, v16, v17
sxtl v0.8h, v0.8b
sxtl v1.8h, v1.8b
sxtl v2.8h, v2.8b
sxtl v3.8h, v3.8b
sxtl v4.8h, v4.8b
sxtl v5.8h, v5.8b
sxtl v6.8h, v6.8b
sxtl v7.8h, v7.8b
transpose_8x8b_xtl v0, v1, v2, v3, v4, v5, v6, v7, sxtl
// This ordering of smull/smlal/smull2/smlal2 is highly
// beneficial for Cortex A53 here.
@@ -3066,6 +3052,7 @@ function warp_affine_8x8\t\()_8bpc_neon, export=1
sqrshrn2 v16.8h, v17.4s, #\shift
mov v27.16b, v28.16b
mov v28.16b, v29.16b
add v16.8h, v16.8h, v23.8h
.ifb \t
sqxtun v16.8b, v16.8h
.endif
......
@@ -3188,8 +3188,8 @@ filter_fn prep, x0, x8, x1, x2, w3, w4, w5, x5, w6, x6, w7, x9, x10
.macro load_filter_row dst, src, inc
asr w13, \src, #10
ldr \dst, [x11, w13, sxtw #3]
add \src, \src, \inc
ldr \dst, [x11, w13, sxtw #3]
.endm
function warp_filter_horz_neon
@@ -3343,15 +3343,7 @@ function warp_affine_8x8\t\()_16bpc_neon, export=1
load_filter_row d5, w14, w9
load_filter_row d6, w14, w9
load_filter_row d7, w14, w9
transpose_8x8b v0, v1, v2, v3, v4, v5, v6, v7, v16, v17
sxtl v0.8h, v0.8b
sxtl v1.8h, v1.8b
sxtl v2.8h, v2.8b
sxtl v3.8h, v3.8b
sxtl v4.8h, v4.8b
sxtl v5.8h, v5.8b
sxtl v6.8h, v6.8b
sxtl v7.8h, v7.8b
transpose_8x8b_xtl v0, v1, v2, v3, v4, v5, v6, v7, sxtl
// This ordering of smull/smlal/smull2/smlal2 is highly
// beneficial for Cortex A53 here.
......
@@ -59,33 +59,42 @@
#endif
.endm
.macro transpose_8x8b r0, r1, r2, r3, r4, r5, r6, r7, t8, t9
trn1 \t8\().8b, \r0\().8b, \r1\().8b
trn2 \t9\().8b, \r0\().8b, \r1\().8b
trn1 \r1\().8b, \r2\().8b, \r3\().8b
trn2 \r3\().8b, \r2\().8b, \r3\().8b
trn1 \r0\().8b, \r4\().8b, \r5\().8b
trn2 \r5\().8b, \r4\().8b, \r5\().8b
trn1 \r2\().8b, \r6\().8b, \r7\().8b
trn2 \r7\().8b, \r6\().8b, \r7\().8b
trn1 \r4\().4h, \r0\().4h, \r2\().4h
trn2 \r2\().4h, \r0\().4h, \r2\().4h
trn1 \r6\().4h, \r5\().4h, \r7\().4h
trn2 \r7\().4h, \r5\().4h, \r7\().4h
trn1 \r5\().4h, \t9\().4h, \r3\().4h
trn2 \t9\().4h, \t9\().4h, \r3\().4h
trn1 \r3\().4h, \t8\().4h, \r1\().4h
trn2 \t8\().4h, \t8\().4h, \r1\().4h
trn1 \r0\().2s, \r3\().2s, \r4\().2s
trn2 \r4\().2s, \r3\().2s, \r4\().2s
trn1 \r1\().2s, \r5\().2s, \r6\().2s
trn2 \r5\().2s, \r5\().2s, \r6\().2s
trn2 \r6\().2s, \t8\().2s, \r2\().2s
trn1 \r2\().2s, \t8\().2s, \r2\().2s
trn1 \r3\().2s, \t9\().2s, \r7\().2s
trn2 \r7\().2s, \t9\().2s, \r7\().2s
.macro transpose_8x8b_xtl r0, r1, r2, r3, r4, r5, r6, r7, xtl
// a0 b0 a1 b1 a2 b2 a3 b3 a4 b4 a5 b5 a6 b6 a7 b7
zip1 \r0\().16b, \r0\().16b, \r1\().16b
// c0 d0 c1 d1 c2 d2 c3 d3 c4 d4 c5 d5 c6 d6 c7 d7
zip1 \r2\().16b, \r2\().16b, \r3\().16b
// e0 f0 e1 f1 e2 f2 e3 f3 e4 f4 e5 f5 e6 f6 e7 f7
zip1 \r4\().16b, \r4\().16b, \r5\().16b
// g0 h0 g1 h1 g2 h2 g3 h3 g4 h4 g5 h5 g6 h6 g7 h7
zip1 \r6\().16b, \r6\().16b, \r7\().16b
// a0 b0 c0 d0 a2 b2 c2 d2 a4 b4 c4 d4 a6 b6 c6 d6
trn1 \r1\().8h, \r0\().8h, \r2\().8h
// a1 b1 c1 d1 a3 b3 c3 d3 a5 b5 c5 d5 a7 b7 c7 d7
trn2 \r3\().8h, \r0\().8h, \r2\().8h
// e0 f0 g0 h0 e2 f2 g2 h2 e4 f4 g4 h4 e6 f6 g6 h6
trn1 \r5\().8h, \r4\().8h, \r6\().8h
// e1 f1 g1 h1 e3 f3 g3 h3 e5 f5 g5 h5 e7 f7 g7 h7
trn2 \r7\().8h, \r4\().8h, \r6\().8h
// a0 b0 c0 d0 e0 f0 g0 h0 a4 b4 c4 d4 e4 f4 g4 h4
trn1 \r0\().4s, \r1\().4s, \r5\().4s
// a2 b2 c2 d2 e2 f2 g2 h2 a6 b6 c6 d6 e6 f6 g6 h6
trn2 \r2\().4s, \r1\().4s, \r5\().4s
// a1 b1 c1 d1 e1 f1 g1 h1 a5 b5 c5 d5 e5 f5 g5 h5
trn1 \r1\().4s, \r3\().4s, \r7\().4s
// a3 b3 c3 d3 e3 f3 g3 h3 a7 b7 c7 d7 e7 f7 g7 h7
trn2 \r3\().4s, \r3\().4s, \r7\().4s
\xtl\()2 \r4\().8h, \r0\().16b
\xtl \r0\().8h, \r0\().8b
\xtl\()2 \r6\().8h, \r2\().16b
\xtl \r2\().8h, \r2\().8b
\xtl\()2 \r5\().8h, \r1\().16b
\xtl \r1\().8h, \r1\().8b
\xtl\()2 \r7\().8h, \r3\().16b
\xtl \r3\().8h, \r3\().8b
.endm
.macro transpose_8x8h r0, r1, r2, r3, r4, r5, r6, r7, t8, t9
......