Commit 361a3c8e authored by Martin Storsjö

arm: cdef: Add special cased versions for pri_strength/sec_strength being zero

Before:
ARM32:                    Cortex A7      A8      A9     A53     A72     A73
cdef_filter_4x4_8bpc_neon:    964.6   599.5   707.9   601.2   465.1   405.2
cdef_filter_4x8_8bpc_neon:   1726.0  1066.2  1238.7  1041.7   798.6   725.3
cdef_filter_8x8_8bpc_neon:   2974.4  1671.8  1943.9  1806.1  1229.8  1242.1
ARM64:
cdef_filter_4x4_8bpc_neon:                            569.2   337.8   348.7
cdef_filter_4x8_8bpc_neon:                           1031.1   623.3   633.6
cdef_filter_8x8_8bpc_neon:                           1847.5  1097.7  1117.5

After:
ARM32:                    Cortex A7      A8      A9     A53     A72     A73
cdef_filter_4x4_8bpc_neon:    798.4   524.2   617.3   506.8   432.4   361.1
cdef_filter_4x8_8bpc_neon:   1394.7   910.4  1054.0   863.6   730.2   632.2
cdef_filter_8x8_8bpc_neon:   2364.6  1453.8  1675.1  1466.0  1086.4  1107.7
ARM64:
cdef_filter_4x4_8bpc_neon:                            461.7   303.1   308.6
cdef_filter_4x8_8bpc_neon:                            833.0   547.5   556.0
cdef_filter_8x8_8bpc_neon:                           1459.3   934.1   967.9
parent 6ad9bd5f
Pipeline #13245 passed with stages
in 9 minutes and 7 seconds
......@@ -311,14 +311,13 @@ endconst
vld1.16 {\d22}, [r9] // p1
.endif
.endm
.macro handle_pixel s1, s2, threshold, thresh_vec, shift, tap
cmp \threshold, #0
.macro handle_pixel s1, s2, threshold, thresh_vec, shift, tap, min
.if \min
vmin.u16 q2, q2, \s1
vmax.s16 q3, q3, \s1
vmin.u16 q2, q2, \s2
vmax.s16 q3, q3, \s2
beq 3f
.endif
vabd.u16 q8, q0, \s1 // abs(diff)
vabd.u16 q11, q0, \s2 // abs(diff)
vshl.u16 q9, q8, \shift // abs(diff) >> shift
......@@ -342,30 +341,36 @@ endconst
// void dav1d_cdef_filterX_neon(pixel *dst, ptrdiff_t dst_stride,
// const uint16_t *tmp, int pri_strength,
// int sec_strength, int dir, int damping, int h);
.macro filter w
function cdef_filter\w\()_neon, export=1
push {r4-r9,lr}
vpush {q4-q7}
ldrd r4, r5, [sp, #92]
ldrd r6, r7, [sp, #100]
.macro filter_func w, pri, sec, min, suffix
function cdef_filter\w\suffix\()_neon
.if \pri
movrel_local r8, pri_taps
and r9, r3, #1
add r8, r8, r9, lsl #1
.endif
movrel_local r9, directions\w
add r5, r9, r5, lsl #1
vmov.u16 d17, #15
vdup.16 d16, r6 // damping
.if \pri
vdup.16 q5, r3 // threshold
.endif
.if \sec
vdup.16 q7, r4 // threshold
.endif
vmov.16 d8[0], r3
vmov.16 d8[1], r4
vclz.i16 d8, d8 // clz(threshold)
vsub.i16 d8, d17, d8 // ulog2(threshold)
vqsub.u16 d8, d16, d8 // shift = imax(0, damping - ulog2(threshold))
vneg.s16 d8, d8 // -shift
.if \sec
vdup.16 q6, d8[1]
.endif
.if \pri
vdup.16 q4, d8[0]
.endif
1:
.if \w == 8
......@@ -377,47 +382,64 @@ function cdef_filter\w\()_neon, export=1
.endif
vmov.u16 q1, #0 // sum
.if \min
vmov.u16 q2, q0 // min
vmov.u16 q3, q0 // max
.endif
// Instead of loading sec_taps 2, 1 from memory, just set it
// to 2 initially and decrease for the second round.
// This is also used as loop counter.
mov lr, #2 // sec_taps[0]
2:
.if \pri
ldrsb r9, [r5] // off1
load_px d28, d29, d30, d31, \w
.endif
.if \sec
add r5, r5, #4 // +2*2
ldrsb r9, [r5] // off2
.endif
.if \pri
ldrb r12, [r8] // *pri_taps
handle_pixel q14, q15, r3, q5, q4, r12
handle_pixel q14, q15, r3, q5, q4, r12, \min
.endif
.if \sec
load_px d28, d29, d30, d31, \w
add r5, r5, #8 // +2*4
ldrsb r9, [r5] // off3
handle_pixel q14, q15, r4, q7, q6, lr
handle_pixel q14, q15, r4, q7, q6, lr, \min
load_px d28, d29, d30, d31, \w
handle_pixel q14, q15, r4, q7, q6, lr
handle_pixel q14, q15, r4, q7, q6, lr, \min
sub r5, r5, #11 // r5 -= 2*(2+4); r5 += 1;
.else
add r5, r5, #1 // r5 += 1
.endif
subs lr, lr, #1 // sec_tap-- (value)
.if \pri
add r8, r8, #1 // pri_taps++ (pointer)
.endif
bne 2b
vshr.s16 q14, q1, #15 // -(sum < 0)
vadd.i16 q1, q1, q14 // sum - (sum < 0)
vrshr.s16 q1, q1, #4 // (8 + sum - (sum < 0)) >> 4
vadd.i16 q0, q0, q1 // px + (8 + sum ...) >> 4
.if \min
vmin.s16 q0, q0, q3
vmax.s16 q0, q0, q2 // iclip(px + .., min, max)
.endif
vmovn.u16 d0, q0
.if \w == 8
add r2, r2, #2*16 // tmp += tmp_stride
......@@ -432,7 +454,9 @@ function cdef_filter\w\()_neon, export=1
// Reset pri_taps and directions back to the original point
sub r5, r5, #2
.if \pri
sub r8, r8, #2
.endif
bgt 1b
vpop {q4-q7}
......@@ -440,6 +464,28 @@ function cdef_filter\w\()_neon, export=1
endfunc
.endm
// filter w: emit the CDEF filter entry point for block width \w together with
// the three specialised bodies it dispatches to.
//   _pri     : primary taps only (pri=1, sec=0), no min/max tracking or clip
//   _sec     : secondary taps only (pri=0, sec=1), no min/max tracking or clip
//   _pri_sec : both tap sets, with min/max tracking and the final iclip (min=1)
// Only cdef_filter\w\()_neon is exported; the variants are reached solely via
// the tail branches below.
.macro filter w
filter_func \w, pri=1, sec=0, min=0, suffix=_pri
filter_func \w, pri=0, sec=1, min=0, suffix=_sec
filter_func \w, pri=1, sec=1, min=1, suffix=_pri_sec
function cdef_filter\w\()_neon, export=1
// Shared prologue: the frame is set up once here and is still in place when
// the selected variant runs.
push {r4-r9,lr}
vpush {q4-q7}
// Stack arguments start at sp + 92: 7 * 4 bytes for r4-r9/lr plus 4 * 16 bytes
// for q4-q7 were pushed above. Following the dav1d_cdef_filterX_neon
// prototype: r4 = sec_strength, r5 = dir, r6 = damping, r7 = h.
ldrd r4, r5, [sp, #92]
ldrd r6, r7, [sp, #100]
// Dispatch on which strengths are non-zero. Plain `b` (not `bl`) is used, so
// the variant does not return here.
// NOTE(review): the variants end with `vpop {q4-q7}`; confirm their epilogue
// also restores r4-r9 and returns through the lr pushed above.
cmp r3, #0 // pri_strength
bne 1f
b cdef_filter\w\()_sec_neon // only sec
1:
cmp r4, #0 // sec_strength
bne 1f
b cdef_filter\w\()_pri_neon // only pri
1:
b cdef_filter\w\()_pri_sec_neon // both pri and sec
endfunc
.endm
// Instantiate the entry points and their variants for w = 8 and w = 4.
filter 8
filter 4
......
......@@ -286,13 +286,13 @@ endconst
ld1 {\d2\().d}[1], [x9] // p1
.endif
.endm
.macro handle_pixel s1, s2, threshold, thresh_vec, shift, tap
.macro handle_pixel s1, s2, threshold, thresh_vec, shift, tap, min
.if \min
umin v2.8h, v2.8h, \s1\().8h
smax v3.8h, v3.8h, \s1\().8h
umin v2.8h, v2.8h, \s2\().8h
smax v3.8h, v3.8h, \s2\().8h
cbz \threshold, 3f
.endif
uabd v16.8h, v0.8h, \s1\().8h // abs(diff)
uabd v20.8h, v0.8h, \s2\().8h // abs(diff)
ushl v17.8h, v16.8h, \shift // abs(diff) >> shift
......@@ -316,25 +316,35 @@ endconst
// void dav1d_cdef_filterX_neon(pixel *dst, ptrdiff_t dst_stride,
// const uint16_t *tmp, int pri_strength,
// int sec_strength, int dir, int damping, int h);
.macro filter w
function cdef_filter\w\()_neon, export=1
.macro filter_func w, pri, sec, min, suffix
function cdef_filter\w\suffix\()_neon
.if \pri
movrel x8, pri_taps
and w9, w3, #1
add x8, x8, w9, uxtw #1
.endif
movrel x9, directions\w
add x5, x9, w5, uxtw #1
movi v30.4h, #15
dup v28.4h, w6 // damping
.if \pri
dup v25.8h, w3 // threshold
.endif
.if \sec
dup v27.8h, w4 // threshold
.endif
trn1 v24.4h, v25.4h, v27.4h
clz v24.4h, v24.4h // clz(threshold)
sub v24.4h, v30.4h, v24.4h // ulog2(threshold)
uqsub v24.4h, v28.4h, v24.4h // shift = imax(0, damping - ulog2(threshold))
neg v24.4h, v24.4h // -shift
.if \sec
dup v26.8h, v24.h[1]
.endif
.if \pri
dup v24.8h, v24.h[0]
.endif
1:
.if \w == 8
......@@ -346,45 +356,62 @@ function cdef_filter\w\()_neon, export=1
.endif
movi v1.8h, #0 // sum
.if \min
mov v2.16b, v0.16b // min
mov v3.16b, v0.16b // max
.endif
// Instead of loading sec_taps 2, 1 from memory, just set it
// to 2 initially and decrease for the second round.
// This is also used as loop counter.
mov w11, #2 // sec_taps[0]
2:
.if \pri
ldrb w9, [x5] // off1
load_px v4, v5, \w
.endif
.if \sec
add x5, x5, #4 // +2*2
ldrb w9, [x5] // off2
load_px v6, v7, \w
.endif
.if \pri
ldrb w10, [x8] // *pri_taps
handle_pixel v4, v5, w3, v25.8h, v24.8h, w10
handle_pixel v4, v5, w3, v25.8h, v24.8h, w10, \min
.endif
.if \sec
add x5, x5, #8 // +2*4
ldrb w9, [x5] // off3
load_px v4, v5, \w
handle_pixel v6, v7, w4, v27.8h, v26.8h, w11
handle_pixel v6, v7, w4, v27.8h, v26.8h, w11, \min
handle_pixel v4, v5, w4, v27.8h, v26.8h, w11
handle_pixel v4, v5, w4, v27.8h, v26.8h, w11, \min
sub x5, x5, #11 // x5 -= 2*(2+4); x5 += 1;
.else
add x5, x5, #1 // x5 += 1
.endif
subs w11, w11, #1 // sec_tap-- (value)
.if \pri
add x8, x8, #1 // pri_taps++ (pointer)
.endif
b.ne 2b
sshr v4.8h, v1.8h, #15 // -(sum < 0)
add v1.8h, v1.8h, v4.8h // sum - (sum < 0)
srshr v1.8h, v1.8h, #4 // (8 + sum - (sum < 0)) >> 4
add v0.8h, v0.8h, v1.8h // px + (8 + sum ...) >> 4
.if \min
smin v0.8h, v0.8h, v3.8h
smax v0.8h, v0.8h, v2.8h // iclip(px + .., min, max)
.endif
xtn v0.8b, v0.8h
.if \w == 8
add x2, x2, #2*16 // tmp += tmp_stride
......@@ -399,13 +426,31 @@ function cdef_filter\w\()_neon, export=1
// Reset pri_taps and directions back to the original point
sub x5, x5, #2
.if \pri
sub x8, x8, #2
.endif
b.gt 1b
ret
endfunc
.endm
// filter w: emit the CDEF filter entry point for block width \w together with
// the three specialised bodies it dispatches to.
//   _pri     : primary taps only (pri=1, sec=0), no min/max tracking or clip
//   _sec     : secondary taps only (pri=0, sec=1), no min/max tracking or clip
//   _pri_sec : both tap sets, with min/max tracking and the final iclip (min=1)
// Only cdef_filter\w\()_neon is exported; the variants are reached solely via
// the tail branches below.
.macro filter w
filter_func \w, pri=1, sec=0, min=0, suffix=_pri
filter_func \w, pri=0, sec=1, min=0, suffix=_sec
filter_func \w, pri=1, sec=1, min=1, suffix=_pri_sec
function cdef_filter\w\()_neon, export=1
// No frame is set up here: the dispatcher only tests w3/w4 and tail-branches
// with plain `b`, so the `ret` at the end of the chosen variant returns
// straight to this function's caller.
cbnz w3, 1f // pri_strength
b cdef_filter\w\()_sec_neon // only sec
1:
cbnz w4, 1f // sec_strength
b cdef_filter\w\()_pri_neon // only pri
1:
b cdef_filter\w\()_pri_sec_neon // both pri and sec
endfunc
.endm
// Instantiate the entry points and their variants for w = 8 and w = 4.
filter 8
filter 4
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment