Commit 4f5261a0 authored by Martin Storsjö

arm64: cdef: Do saturating subtractions to avoid max operations with 0

Before:                  Cortex A53     A72     A73
cdef_filter_4x4_8bpc_neon:    677.4   433.9   452.9
cdef_filter_4x8_8bpc_neon:   1255.0   815.2   841.8
cdef_filter_8x8_8bpc_neon:   2278.5  1440.0  1505.0
After:
cdef_filter_4x4_8bpc_neon:    645.5   401.9   422.5
cdef_filter_4x8_8bpc_neon:   1193.7   756.6   782.4
cdef_filter_8x8_8bpc_neon:   2162.4  1361.9  1375.6
parent dc2ae517
...@@ -304,10 +304,8 @@ endconst ...@@ -304,10 +304,8 @@ endconst
uabd v20.8h, v0.8h, \s2\().8h // abs(diff) uabd v20.8h, v0.8h, \s2\().8h // abs(diff)
ushl v17.8h, v16.8h, \shift // abs(diff) >> shift ushl v17.8h, v16.8h, \shift // abs(diff) >> shift
ushl v21.8h, v20.8h, \shift // abs(diff) >> shift ushl v21.8h, v20.8h, \shift // abs(diff) >> shift
sub v17.8h, \thresh_vec, v17.8h // threshold - (abs(diff) >> shift) uqsub v17.8h, \thresh_vec, v17.8h // imax(0, threshold - (abs(diff) >> shift))
sub v21.8h, \thresh_vec, v21.8h // threshold - (abs(diff) >> shift) uqsub v21.8h, \thresh_vec, v21.8h // imax(0, threshold - (abs(diff) >> shift))
smax v17.8h, v29.8h, v17.8h // imax(0, threshold - ())
smax v21.8h, v29.8h, v21.8h // imax(0, threshold - ())
cmhi v18.8h, v0.8h, \s1\().8h // px > p0 cmhi v18.8h, v0.8h, \s1\().8h // px > p0
cmhi v22.8h, v0.8h, \s2\().8h // px > p1 cmhi v22.8h, v0.8h, \s2\().8h // px > p1
smin v17.8h, v17.8h, v16.8h // imin(abs(diff), imax()) smin v17.8h, v17.8h, v16.8h // imin(abs(diff), imax())
...@@ -334,7 +332,6 @@ function cdef_filter\w\()_neon, export=1 ...@@ -334,7 +332,6 @@ function cdef_filter\w\()_neon, export=1
add x5, x9, w5, uxtw #1 add x5, x9, w5, uxtw #1
movi v31.16b, #255 movi v31.16b, #255
movi v30.8h, #15 movi v30.8h, #15
movi v29.8h, #0
dup v28.8h, w6 // damping dup v28.8h, w6 // damping
ushr v31.8h, v31.8h, #1 // INT16_MAX ushr v31.8h, v31.8h, #1 // INT16_MAX
...@@ -344,10 +341,8 @@ function cdef_filter\w\()_neon, export=1 ...@@ -344,10 +341,8 @@ function cdef_filter\w\()_neon, export=1
clz v26.8h, v27.8h // clz(threshold) clz v26.8h, v27.8h // clz(threshold)
sub v24.8h, v30.8h, v24.8h // ulog2(threshold) sub v24.8h, v30.8h, v24.8h // ulog2(threshold)
sub v26.8h, v30.8h, v26.8h // ulog2(threshold) sub v26.8h, v30.8h, v26.8h // ulog2(threshold)
sub v24.8h, v28.8h, v24.8h // damping - ulog2(threshold) uqsub v24.8h, v28.8h, v24.8h // shift = imax(0, damping - ulog2(threshold))
sub v26.8h, v28.8h, v26.8h // damping - ulog2(threshold) uqsub v26.8h, v28.8h, v26.8h // shift = imax(0, damping - ulog2(threshold))
smax v24.8h, v29.8h, v24.8h // shift = imax(0, damping - ulog2(threshold))
smax v26.8h, v29.8h, v26.8h // shift = imax(0, damping - ulog2(threshold))
neg v24.8h, v24.8h // -shift neg v24.8h, v24.8h // -shift
neg v26.8h, v26.8h // -shift neg v26.8h, v26.8h // -shift
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment