Commit 8f8dc928 authored by Martin Storsjö

arm64: cdef: Use a smarter padding constant

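Pad with a value (0x8000) which works both as a large unsigned value and
a negative signed value. The min operation stays unsigned and the max
operation becomes a signed max, so the padding value can never win either
comparison. This avoids the conditional masking of padding pixels
altogether.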

Based on the same idea for x86 by Kyle Siefring.

Before:                  Cortex A53     A72     A73
cdef_filter_4x4_8bpc_neon:    645.5   401.9   422.5
cdef_filter_4x8_8bpc_neon:   1193.7   756.6   782.4
cdef_filter_8x8_8bpc_neon:   2162.4  1361.9  1375.6
After:
cdef_filter_4x4_8bpc_neon:    596.3   377.8   384.8
cdef_filter_4x8_8bpc_neon:   1097.4   705.5   707.1
cdef_filter_8x8_8bpc_neon:   1967.4  1232.3  1239.9
parent 4f5261a0
@@ -136,8 +136,7 @@
 .macro padding_func w, stride, rn, rw
 function cdef_padding\w\()_neon, export=1
-movi v30.16b, #255
-ushr v30.8h, v30.8h, #1 // INT16_MAX
+movi v30.8h, #0x80, lsl #8
 mov v31.16b, v30.16b
 sub x0, x0, #2*(2*\stride+2)
 tst w6, #4 // CDEF_HAVE_TOP
@@ -290,14 +289,10 @@ endconst
 .endif
 .endm
 .macro handle_pixel s1, s2, threshold, thresh_vec, shift, tap
-cmeq v16.8h, \s1\().8h, v31.8h
-cmeq v17.8h, \s2\().8h, v31.8h
-bic v16.16b, \s1\().16b, v16.16b
-bic v17.16b, \s2\().16b, v17.16b
 umin v2.8h, v2.8h, \s1\().8h
-umax v3.8h, v3.8h, v16.8h
+smax v3.8h, v3.8h, \s1\().8h
 umin v2.8h, v2.8h, \s2\().8h
-umax v3.8h, v3.8h, v17.8h
+smax v3.8h, v3.8h, \s2\().8h
 cbz \threshold, 3f
 uabd v16.8h, v0.8h, \s1\().8h // abs(diff)
@@ -308,8 +303,8 @@ endconst
 uqsub v21.8h, \thresh_vec, v21.8h // imax(0, threshold - (abs(diff) >> shift))
 cmhi v18.8h, v0.8h, \s1\().8h // px > p0
 cmhi v22.8h, v0.8h, \s2\().8h // px > p1
-smin v17.8h, v17.8h, v16.8h // imin(abs(diff), imax())
-smin v21.8h, v21.8h, v20.8h // imin(abs(diff), imax())
+umin v17.8h, v17.8h, v16.8h // imin(abs(diff), imax())
+umin v21.8h, v21.8h, v20.8h // imin(abs(diff), imax())
 dup v19.8h, \tap // taps[k]/taps[k]
 neg v16.8h, v17.8h // -imin()
 neg v20.8h, v21.8h // -imin()
@@ -330,10 +325,8 @@ function cdef_filter\w\()_neon, export=1
 add x8, x8, w9, uxtw #1
 movrel x9, directions\w
 add x5, x9, w5, uxtw #1
-movi v31.16b, #255
 movi v30.8h, #15
 dup v28.8h, w6 // damping
-ushr v31.8h, v31.8h, #1 // INT16_MAX
 dup v25.8h, w3 // threshold
 dup v27.8h, w4 // threshold
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment