Commit 96b24495 authored by Martin Storsjö

arm: looprestoration: NEON optimized wiener filter

The relative speedup compared to C code is around 4-8x:

                    Cortex A7     A8     A9    A53    A72    A73
wiener_luma_8bpc_neon:   4.00   7.54   4.74   6.84   4.91   8.01
parent 95cd440a
/*
* Copyright © 2018, VideoLAN and dav1d authors
* Copyright © 2019, Martin Storsjo
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "src/arm/asm.S"
// void dav1d_wiener_filter_h_neon(int16_t *dst, const pixel (*left)[4],
// const pixel *src, ptrdiff_t stride,
// const int16_t fh[7], const intptr_t w,
// int h, enum LrEdgeFlags edges);
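// A rough sketch (8 bpc) of what each int16_t output element below works
// out to, reading off the constants set up at the start of the function
// (the comment in the arm init file describes how this relates to the
// reference C version):
//   int sum = 0;
//   for (int k = 0; k < 7; k++)
//       sum += src[x + k] * fh[k];
//   sum += (src[x + 3] << 7) - ((1 << 14) - (1 << 2));
//   dst[x] = (sat_int16(sum) >> 3) + 2048;
// where sat_int16() stands for the saturation done by the vqadd below.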
function wiener_filter_h_neon, export=1
push {r4-r11,lr}
vpush {q4}
ldrd r4, r5, [sp, #52]
ldrd r6, r7, [sp, #60]
mov r8, r5
vld1.16 {q0}, [r4]
movw r9, #(1 << 14) - (1 << 2)
vdup.16 q14, r9
vmov.s16 q15, #2048
// Calculate mid_stride
add r10, r5, #7
bic r10, r10, #7
lsl r10, r10, #1
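// r10 = mid row stride in bytes, 2*((w + 7) & ~7); it is doubled and
// reduced by 2*w below so that it advances the two output row pointers
// from the end of one row pair to the start of the next.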
// Clear the last unused element of q0, to allow filtering a single
// pixel with one plain vmul + vpadd.
mov r12, #0
vmov.16 d1[3], r12
// Set up pointers for reading/writing alternate rows
add r12, r0, r10
lsl r10, r10, #1
add lr, r2, r3
lsl r3, r3, #1
// Subtract the width from mid_stride
sub r10, r10, r5, lsl #1
// For w >= 8, we read (w+5)&~7+8 pixels, for w < 8 we read 16 pixels.
cmp r5, #8
add r11, r5, #13
bic r11, r11, #7
bge 1f
mov r11, #16
1:
sub r3, r3, r11
// Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
tst r7, #1 // LR_HAVE_LEFT
beq 2f
// LR_HAVE_LEFT
cmp r1, #0
bne 0f
// left == NULL
sub r2, r2, #3
sub lr, lr, #3
b 1f
0: // LR_HAVE_LEFT, left != NULL
2: // !LR_HAVE_LEFT, increase the stride.
// For this case we don't read the left 3 pixels from the src pointer,
// but shift it as if we had done that.
add r3, r3, #3
1: // Loop vertically
vld1.8 {q2}, [r2]!
vld1.8 {q9}, [lr]!
tst r7, #1 // LR_HAVE_LEFT
beq 0f
cmp r1, #0
beq 2f
// LR_HAVE_LEFT, left != NULL
vld1.32 {d3[1]}, [r1]!
// Move r2/lr back to account for the last 3 bytes we loaded earlier,
// which we'll shift out.
sub r2, r2, #3
sub lr, lr, #3
vld1.32 {d17[1]}, [r1]!
vext.8 q2, q1, q2, #13
vext.8 q9, q8, q9, #13
b 2f
0:
// !LR_HAVE_LEFT, fill q1 with the leftmost byte
// and shift q2 to have 3x the first byte at the front.
vdup.8 q1, d4[0]
vdup.8 q8, d18[0]
// Move r2 back to account for the last 3 bytes we loaded before,
// which we shifted out.
sub r2, r2, #3
sub lr, lr, #3
vext.8 q2, q1, q2, #13
vext.8 q9, q8, q9, #13
2:
vmovl.u8 q1, d4
vmovl.u8 q2, d5
vmovl.u8 q8, d18
vmovl.u8 q9, d19
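// q1-q2 and q8-q9 now hold 16 source pixels per row, widened to u16,
// starting 3 pixels to the left of the first output column.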
tst r7, #2 // LR_HAVE_RIGHT
bne 4f
// If we'll need to pad the right edge, load that byte to pad with
// here since we can find it pretty easily from here.
sub r9, r5, #14
ldrb r11, [r2, r9]
ldrb r9, [lr, r9]
// Fill q12/q13 with the right padding pixel
vdup.8 d24, r11
vdup.8 d26, r9
vmovl.u8 q12, d24
vmovl.u8 q13, d26
3: // !LR_HAVE_RIGHT
// If we'll have to pad the right edge we need to quit early here.
cmp r5, #11
bge 4f // If w >= 11, all used input pixels are valid
cmp r5, #7
bge 5f // If w >= 7, we can filter 4 pixels
b 6f
4: // Loop horizontally
.macro filter_8
// This is tuned as some sort of compromise between Cortex A7, A8,
// A9 and A53.
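// q1-q2 (and q8-q9 for the second row) hold 16 consecutive u16 pixels;
// q1 itself is the window for tap 0 and each vext below forms the window
// shifted by 1-6 pixels (2-12 bytes), so the vmla chain accumulates all
// 7 taps for 8 output pixels per row.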
vmul.s16 q3, q1, d0[0]
vext.8 q10, q1, q2, #2
vext.8 q11, q1, q2, #4
vmla.s16 q3, q10, d0[1]
vmla.s16 q3, q11, d0[2]
vext.8 q10, q1, q2, #6
vext.8 q11, q1, q2, #8
vmla.s16 q3, q10, d0[3]
vmla.s16 q3, q11, d1[0]
vext.8 q10, q1, q2, #10
vext.8 q11, q1, q2, #12
vmla.s16 q3, q10, d1[1]
vmla.s16 q3, q11, d1[2]
vmul.s16 q10, q8, d0[0]
vext.8 q11, q8, q9, #2
vext.8 q4, q8, q9, #4
vmla.s16 q10, q11, d0[1]
vmla.s16 q10, q4, d0[2]
vext.8 q11, q8, q9, #6
vext.8 q4, q8, q9, #8
vmla.s16 q10, q11, d0[3]
vmla.s16 q10, q4, d1[0]
vext.8 q11, q8, q9, #10
vext.8 q4, q8, q9, #12
vmla.s16 q10, q11, d1[1]
vmla.s16 q10, q4, d1[2]
vext.8 q1, q1, q2, #6
vext.8 q8, q8, q9, #6
vshl.s16 q1, q1, #7
vshl.s16 q8, q8, #7
vsub.s16 q1, q1, q14
vsub.s16 q8, q8, q14
vqadd.s16 q3, q3, q1
vqadd.s16 q10, q10, q8
vshr.s16 q3, q3, #3
vshr.s16 q10, q10, #3
vadd.s16 q3, q3, q15
vadd.s16 q10, q10, q15
.endm
filter_8
vst1.16 {q3}, [r0, :128]!
vst1.16 {q10}, [r12, :128]!
subs r5, r5, #8
ble 9f
tst r7, #2 // LR_HAVE_RIGHT
vmov q1, q2
vmov q8, q9
vld1.8 {d4}, [r2]!
vld1.8 {d18}, [lr]!
vmovl.u8 q2, d4
vmovl.u8 q9, d18
bne 4b // If we don't need to pad, just keep filtering.
b 3b // If we need to pad, check how many pixels we have left.
5: // Filter 4 pixels, 7 <= w < 11
.macro filter_4
vmul.s16 d6, d2, d0[0]
vext.8 q10, q1, q2, #2
vext.8 q11, q1, q2, #4
vmla.s16 d6, d20, d0[1]
vmla.s16 d6, d22, d0[2]
vext.8 q10, q1, q2, #6
vext.8 q11, q1, q2, #8
vmla.s16 d6, d20, d0[3]
vmla.s16 d6, d22, d1[0]
vext.8 q10, q1, q2, #10
vext.8 q11, q1, q2, #12
vmla.s16 d6, d20, d1[1]
vmla.s16 d6, d22, d1[2]
vmul.s16 d20, d16, d0[0]
vext.8 q11, q8, q9, #2
vext.8 q4, q8, q9, #4
vmla.s16 d20, d22, d0[1]
vmla.s16 d20, d8, d0[2]
vext.8 q11, q8, q9, #6
vext.8 q4, q8, q9, #8
vmla.s16 d20, d22, d0[3]
vmla.s16 d20, d8, d1[0]
vext.8 q11, q8, q9, #10
vext.8 q4, q8, q9, #12
vmla.s16 d20, d22, d1[1]
vmla.s16 d20, d8, d1[2]
vext.8 q11, q1, q2, #6
vshl.s16 d22, d22, #7
vsub.s16 d22, d22, d28
vqadd.s16 d6, d6, d22
vext.8 q11, q8, q9, #6
vshl.s16 d22, d22, #7
vsub.s16 d22, d22, d28
vqadd.s16 d20, d20, d22
vshr.s16 d6, d6, #3
vshr.s16 d20, d20, #3
vadd.s16 d6, d6, d30
vadd.s16 d20, d20, d30
.endm
filter_4
vst1.16 {d6}, [r0, :64]!
vst1.16 {d20}, [r12, :64]!
subs r5, r5, #4 // 3 <= w < 7
vext.8 q1, q1, q2, #8
vext.8 q2, q2, q2, #8
vext.8 q8, q8, q9, #8
vext.8 q9, q9, q9, #8
6: // Pad the right edge and filter the last few pixels.
// w < 7, w+3 pixels valid in q1-q2
cmp r5, #5
blt 7f
bgt 8f
// w == 5, 8 pixels valid in q1, q2 invalid
vmov q2, q12
vmov q9, q13
b 88f
7: // 1 <= w < 5, 4-7 pixels valid in q1
sub r9, r5, #1
// r9 = (pixels valid - 4)
adr r11, L(variable_shift_tbl)
ldr r9, [r11, r9, lsl #2]
add r11, r11, r9
vmov q2, q12
vmov q9, q13
bx r11
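// The table below stores offsets relative to its own address;
// CONFIG_THUMB (1 when assembling as Thumb) keeps the low bit of the
// computed address set so that the bx above stays in Thumb state.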
.align 2
L(variable_shift_tbl):
.word 44f - L(variable_shift_tbl) + CONFIG_THUMB
.word 55f - L(variable_shift_tbl) + CONFIG_THUMB
.word 66f - L(variable_shift_tbl) + CONFIG_THUMB
.word 77f - L(variable_shift_tbl) + CONFIG_THUMB
// Shift q1 right, shifting out invalid pixels,
// shift q1 left to the original offset, shifting in padding pixels.
44: // 4 pixels valid
vext.8 q1, q1, q1, #8
vext.8 q1, q1, q2, #8
vext.8 q8, q8, q8, #8
vext.8 q8, q8, q9, #8
b 88f
55: // 5 pixels valid
vext.8 q1, q1, q1, #10
vext.8 q1, q1, q2, #6
vext.8 q8, q8, q8, #10
vext.8 q8, q8, q9, #6
b 88f
66: // 6 pixels valid
vext.8 q1, q1, q1, #12
vext.8 q1, q1, q2, #4
vext.8 q8, q8, q8, #12
vext.8 q8, q8, q9, #4
b 88f
77: // 7 pixels valid
vext.8 q1, q1, q1, #14
vext.8 q1, q1, q2, #2
vext.8 q8, q8, q8, #14
vext.8 q8, q8, q9, #2
b 88f
8: // w > 5, w == 6, 9 pixels valid in q1-q2, 1 pixel valid in q2
vext.8 q2, q2, q2, #2
vext.8 q2, q2, q12, #14
vext.8 q9, q9, q9, #2
vext.8 q9, q9, q13, #14
88:
// w < 7, q1-q2 padded properly
cmp r5, #4
blt 888f
// w >= 4, filter 4 pixels
filter_4
vst1.16 {d6}, [r0, :64]!
vst1.16 {d20}, [r12, :64]!
subs r5, r5, #4 // 0 <= w < 4
vext.8 q1, q1, q2, #8
vext.8 q8, q8, q9, #8
beq 9f
888: // 1 <= w < 4, filter 1 pixel at a time
vmul.s16 q3, q1, q0
vmul.s16 q10, q8, q0
vpadd.s16 d6, d6, d7
vpadd.s16 d7, d20, d21
vdup.16 d24, d2[3]
vpadd.s16 d6, d6, d7
vdup.16 d25, d16[3]
vpadd.s16 d6, d6, d6
vtrn.16 d24, d25
vshl.s16 d24, d24, #7
vsub.s16 d24, d24, d28
vqadd.s16 d6, d6, d24
vshr.s16 d6, d6, #3
vadd.s16 d6, d6, d30
vst1.s16 {d6[0]}, [r0, :16]!
vst1.s16 {d6[1]}, [r12, :16]!
subs r5, r5, #1
vext.8 q1, q1, q2, #2
vext.8 q8, q8, q9, #2
bgt 888b
9:
subs r6, r6, #2
ble 0f
// Jump to the next row and loop horizontally
add r0, r0, r10
add r12, r12, r10
add r2, r2, r3
add lr, lr, r3
mov r5, r8
b 1b
0:
vpop {q4}
pop {r4-r11,pc}
.purgem filter_8
.purgem filter_4
endfunc
// void dav1d_wiener_filter_v_neon(pixel *dst, ptrdiff_t stride,
// const int16_t *mid, int w, int h,
// const int16_t fv[7], enum LrEdgeFlags edges,
// ptrdiff_t mid_stride);
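// Roughly, per output pixel (8 bpc), where row[0..6] are 7 consecutive
// rows of the int16_t mid buffer (padded at the top/bottom according to
// edges) and fv[3] has had 128 added to it at function entry:
//   int32_t sum = 0;
//   for (int k = 0; k < 7; k++)
//       sum += row[k][x] * fv[k];
//   dst[x] = clip_u8((sum + (1 << 10)) >> 11); // vqrshrun #11 + vqmovun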
function wiener_filter_v_neon, export=1
push {r4-r7,lr}
ldrd r4, r5, [sp, #20]
ldrd r6, r7, [sp, #28]
mov lr, r4
vmov.s16 q1, #0
mov r12, #128
vld1.16 {q0}, [r5]
vmov.s16 d2[3], r12
vadd.s16 q0, q0, q1
// Calculate the number of rows to move back when looping vertically
mov r12, r4
tst r6, #4 // LR_HAVE_TOP
beq 0f
sub r2, r2, r7, lsl #1
add r12, r12, #2
0:
tst r6, #8 // LR_HAVE_BOTTOM
beq 1f
add r12, r12, #2
1: // Start of horizontal loop; start one vertical filter slice.
// Load rows into q8-q11 and pad properly.
tst r6, #4 // LR_HAVE_TOP
vld1.16 {q8}, [r2, :128], r7
beq 2f
// LR_HAVE_TOP
vld1.16 {q10}, [r2, :128], r7
vmov q9, q8
vld1.16 {q11}, [r2, :128], r7
b 3f
2: // !LR_HAVE_TOP
vmov q9, q8
vmov q10, q8
vmov q11, q8
3:
cmp r4, #4
blt 5f
// Start filtering normally; fill in q12-q14 with unique rows.
vld1.16 {q12}, [r2, :128], r7
vld1.16 {q13}, [r2, :128], r7
vld1.16 {q14}, [r2, :128], r7
4:
.macro filter compare
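// q8-q14 hold the 7 input rows contributing to the current output row;
// the products are accumulated as 32 bit in q2/q3 and the result is
// narrowed back to 8 bit pixels.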
subs r4, r4, #1
// Interleaving the mul/mla chains actually hurts performance
// significantly on Cortex A53, thus keeping mul/mla tightly
// chained like this.
vmull.s16 q2, d16, d0[0]
vmlal.s16 q2, d18, d0[1]
vmlal.s16 q2, d20, d0[2]
vmlal.s16 q2, d22, d0[3]
vmlal.s16 q2, d24, d1[0]
vmlal.s16 q2, d26, d1[1]
vmlal.s16 q2, d28, d1[2]
vmull.s16 q3, d17, d0[0]
vmlal.s16 q3, d19, d0[1]
vmlal.s16 q3, d21, d0[2]
vmlal.s16 q3, d23, d0[3]
vmlal.s16 q3, d25, d1[0]
vmlal.s16 q3, d27, d1[1]
vmlal.s16 q3, d29, d1[2]
vqrshrun.s32 d4, q2, #11
vqrshrun.s32 d5, q3, #11
vqmovun.s16 d4, q2
vst1.8 {d4}, [r0], r1
.if \compare
cmp r4, #4
.else
ble 9f
.endif
vmov q8, q9
vmov q9, q10
vmov q10, q11
vmov q11, q12
vmov q12, q13
vmov q13, q14
.endm
filter 1
blt 7f
vld1.16 {q14}, [r2, :128], r7
b 4b
5: // Less than 4 rows in total; not all of q12-q14 are filled yet.
tst r6, #8 // LR_HAVE_BOTTOM
beq 6f
// LR_HAVE_BOTTOM
cmp r4, #2
// We load at least 2 rows in all cases.
vld1.16 {q12}, [r2, :128], r7
vld1.16 {q13}, [r2, :128], r7
bgt 53f // 3 rows in total
beq 52f // 2 rows in total
51: // 1 row in total, q11 already loaded, load edge into q12-q14.
vmov q14, q13
b 8f
52: // 2 rows in total, q11 already loaded, load q12 with content data
// and 2 rows of edge.
vld1.16 {q14}, [r2, :128], r7
vmov q15, q14
b 8f
53:
// 3 rows in total, q11 already loaded, load q12 and q13 with content
// and 2 rows of edge.
vld1.16 {q14}, [r2, :128], r7
vld1.16 {q15}, [r2, :128], r7
vmov q1, q15
b 8f
6:
// !LR_HAVE_BOTTOM
cmp r4, #2
bgt 63f // 3 rows in total
beq 62f // 2 rows in total
61: // 1 row in total, q11 already loaded, pad that into q12-q14.
vmov q12, q11
vmov q13, q11
vmov q14, q11
b 8f
62: // 2 rows in total, q11 already loaded, load q12 and pad that into q12-q15.
vld1.16 {q12}, [r2, :128], r7
vmov q13, q12
vmov q14, q12
vmov q15, q12
b 8f
63:
// 3 rows in total, q11 already loaded, load q12 and q13 and pad q13 into q14-q15,q1.
vld1.16 {q12}, [r2, :128], r7
vld1.16 {q13}, [r2, :128], r7
vmov q14, q13
vmov q15, q13
vmov q1, q13
b 8f
7:
// All registers up to q13 are filled already, 3 valid rows left.
// < 4 valid rows left; fill in padding and filter the last
// few rows.
tst r6, #8 // LR_HAVE_BOTTOM
beq 71f
// LR_HAVE_BOTTOM; load 2 rows of edge.
vld1.16 {q14}, [r2, :128], r7
vld1.16 {q15}, [r2, :128], r7
vmov q1, q15
b 8f
71:
// !LR_HAVE_BOTTOM, pad 3 rows
vmov q14, q13
vmov q15, q13
vmov q1, q13
8: // At this point, all registers up to q14-15,q1 are loaded with
// edge/padding (depending on how many rows are left).
filter 0 // This branches to 9f when done
vmov q14, q15
vmov q15, q1
b 8b
9: // End of one vertical slice.
subs r3, r3, #8
ble 0f
// Move pointers back up to the top and loop horizontally.
mls r0, r1, lr, r0
mls r2, r7, r12, r2
add r0, r0, #8
add r2, r2, #16
mov r4, lr
b 1b
0:
pop {r4-r7,pc}
.purgem filter
endfunc
// void dav1d_copy_narrow_neon(pixel *dst, ptrdiff_t stride,
// const pixel *src, int w, int h);
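// Presumably used for the leftover 1-7 pixel wide column when the filtered
// width isn't a multiple of 8: dispatches on w through the relative-offset
// table below to a dedicated copy loop per width.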
function copy_narrow_neon, export=1
push {r4,lr}
ldr r4, [sp, #8]
adr r12, L(copy_narrow_tbl)
ldr r3, [r12, r3, lsl #2]
add r12, r12, r3
bx r12
.align 2
L(copy_narrow_tbl):
.word 0
.word 10f - L(copy_narrow_tbl) + CONFIG_THUMB
.word 20f - L(copy_narrow_tbl) + CONFIG_THUMB
.word 30f - L(copy_narrow_tbl) + CONFIG_THUMB
.word 40f - L(copy_narrow_tbl) + CONFIG_THUMB
.word 50f - L(copy_narrow_tbl) + CONFIG_THUMB
.word 60f - L(copy_narrow_tbl) + CONFIG_THUMB
.word 70f - L(copy_narrow_tbl) + CONFIG_THUMB
10:
add r3, r0, r1
lsl r1, r1, #1
18:
subs r4, r4, #8
blt 110f
vld1.8 {d0}, [r2, :64]!
vst1.8 {d0[0]}, [r0], r1
vst1.8 {d0[1]}, [r3], r1
vst1.8 {d0[2]}, [r0], r1
vst1.8 {d0[3]}, [r3], r1
vst1.8 {d0[4]}, [r0], r1
vst1.8 {d0[5]}, [r3], r1
vst1.8 {d0[6]}, [r0], r1
vst1.8 {d0[7]}, [r3], r1
ble 0f
b 18b
110:
add r4, r4, #8
asr r1, r1, #1
11:
subs r4, r4, #1
vld1.8 {d0[]}, [r2]!
vst1.8 {d0[0]}, [r0], r1
bgt 11b
0:
pop {r4,pc}
20:
add r3, r0, r1
lsl r1, r1, #1
24:
subs r4, r4, #4
blt 210f
vld1.16 {d0}, [r2, :64]!
vst1.16 {d0[0]}, [r0, :16], r1
vst1.16 {d0[1]}, [r3, :16], r1
vst1.16 {d0[2]}, [r0, :16], r1
vst1.16 {d0[3]}, [r3, :16], r1
ble 0f
b 24b
210:
add r4, r4, #4
asr r1, r1, #1
22:
subs r4, r4, #1
vld1.16 {d0[]}, [r2]!
vst1.16 {d0[0]}, [r0], r1
bgt 22b
0:
pop {r4,pc}
30:
ldrh r3, [r2]
ldrb r12, [r2, #2]
add r2, r2, #3
subs r4, r4, #1
strh r3, [r0]
strb r12, [r0, #2]
add r0, r0, r1
bgt 30b
pop {r4,pc}
40:
add r3, r0, r1
lsl r1, r1, #1
42:
subs r4, r4, #2
blt 41f
vld1.8 {d0}, [r2, :64]!
vst1.32 {d0[0]}, [r0, :32], r1
vst1.32 {d0[1]}, [r3, :32], r1
ble 0f
b 42b
41:
vld1.32 {d0[]}, [r2]
vst1.32 {d0[0]}, [r0]
0:
pop {r4,pc}
50:
ldr r3, [r2]
ldrb r12, [r2, #4]
add r2, r2, #5
subs r4, r4, #1
str r3, [r0]
strb r12, [r0, #4]
add r0, r0, r1
bgt 50b
pop {r4,pc}
60:
ldr r3, [r2]
ldrh r12, [r2, #4]
add r2, r2, #6
subs r4, r4, #1
str r3, [r0]
strh r12, [r0, #4]
add r0, r0, r1
bgt 60b
pop {r4,pc}
70:
ldr r3, [r2]
ldrh r12, [r2, #4]
ldrb lr, [r2, #6]
add r2, r2, #7
subs r4, r4, #1
str r3, [r0]
strh r12, [r0, #4]
strb lr, [r0, #6]
add r0, r0, r1
bgt 70b
pop {r4,pc}
endfunc
@@ -32,7 +32,7 @@
 #include "common/intops.h"
 #include "src/tables.h"
 
-#if BITDEPTH == 8 && ARCH_AARCH64
+#if BITDEPTH == 8
 // This calculates things slightly differently than the reference C version.
 // This version calculates roughly this:
 // int16_t sum = 0;
@@ -100,7 +100,7 @@ void bitfn(dav1d_loop_restoration_dsp_init_arm)(Dav1dLoopRestorationDSPContext *
     if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
 
-#if BITDEPTH == 8 && ARCH_AARCH64
+#if BITDEPTH == 8
     c->wiener = wiener_filter_neon;
 #endif
 }
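The init hunk above points c->wiener at a C wrapper (not part of this excerpt) that chains the three assembly routines. A rough sketch of how such a wrapper fits together is shown below; the prototypes follow the comments in the assembly (with the edge-flag enum reduced to an int), while the buffer sizes, the exact dsp signature and the handling of the extra edge rows for LR_HAVE_TOP/LR_HAVE_BOTTOM are assumptions rather than code from this commit:

#include <stddef.h>
#include <stdint.h>

typedef uint8_t pixel; // 8 bpc

void dav1d_wiener_filter_h_neon(int16_t *dst, const pixel (*left)[4],
                                const pixel *src, ptrdiff_t stride,
                                const int16_t fh[7], intptr_t w,
                                int h, int edges);
void dav1d_wiener_filter_v_neon(pixel *dst, ptrdiff_t stride,
                                const int16_t *mid, int w, int h,
                                const int16_t fv[7], int edges,
                                ptrdiff_t mid_stride);
void dav1d_copy_narrow_neon(pixel *dst, ptrdiff_t stride,
                            const pixel *src, int w, int h);

// Hypothetical wrapper: filter one restoration unit of size w x h (w, h <= 64
// here), reading the source pixels from dst and writing the result back.
static void wiener_filter_neon_sketch(pixel *dst, const ptrdiff_t dst_stride,
                                      const pixel (*left)[4],
                                      const int16_t fh[7], const int16_t fv[7],
                                      const int w, const int h, const int edges)
{
    // 16-bit intermediate, 2 padding rows above/below for the 7-tap vertical
    // filter; rows are 16-byte aligned as required by the :128 loads.
    _Alignas(16) int16_t mid[(64 + 4) * 384];
    const int mid_stride = (w + 7) & ~7;

    // Horizontal pass into the mid buffer. (The extra rows above/below the
    // unit would also be filtered here when LR_HAVE_TOP/BOTTOM are set;
    // omitted in this sketch.)
    dav1d_wiener_filter_h_neon(&mid[2 * mid_stride], left, dst, dst_stride,
                               fh, w, h, edges);

    // Vertical pass back into dst, 8 columns at a time.
    if (w >= 8)
        dav1d_wiener_filter_v_neon(dst, dst_stride, &mid[2 * mid_stride],
                                   w & ~7, h, fv, edges,
                                   mid_stride * sizeof(int16_t));

    // Any leftover 1-7 pixel wide column is filtered into a narrow temporary
    // strip and then copied out with copy_narrow.
    if (w & 7) {
        pixel tmp[64 * 8];
        dav1d_wiener_filter_v_neon(tmp, w & 7,
                                   &mid[2 * mid_stride + (w & ~7)],
                                   w & 7, h, fv, edges,
                                   mid_stride * sizeof(int16_t));
        dav1d_copy_narrow_neon(dst + (w & ~7), dst_stride, tmp, w & 7, h);
    }
}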
@@ -95,6 +95,7 @@ if is_asm_enabled
         )
     elif host_machine.cpu_family().startswith('arm')
         libdav1d_sources += files(
+            'arm/32/looprestoration.S',
             'arm/32/mc.S',
         )
     endif