Commits on Source (33)
Changes for 0.5.2 'Asiatic Cheetah':
------------------------------------
0.5.2 is a small release improving speed for ARM32 and adding minor features:
- ARM32 optimizations for loopfilter, ipred_dc|h|v
- Add section-5 raw OBU demuxer
- Improve speed by reducing L2 cache collisions
- Fix minor issues
Changes for 0.5.1 'Asiatic Cheetah':
------------------------------------
......
......@@ -31,15 +31,16 @@ The plan is the following:
2. Provide a usable API,
3. Port to most platforms,
4. Make it fast on desktop, by writing asm for AVX-2 chips.
5. Make it fast on mobile, by writing asm for ARMv8 chips,
6. Make it fast on older desktop, by writing asm for SSSE3+ chips.
### On-going
5. Make it fast on mobile, by writing asm for ARMv8 chips,
6. Make it fast on older desktop, by writing asm for SSE chips.
7. Make it fast on older mobiles, by writing asm for ARMv7 chips,
8. Improve C code base with [various tweaks](https://code.videolan.org/videolan/dav1d/wikis/task-list),
9. Accelerate for less common architectures, like PPC, SSE2 or AVX-512.
### After
7. Improve C code base with [various tweaks](https://code.videolan.org/videolan/dav1d/wikis/task-list),
8. Accelerate for less common architectures,
9. Use more GPU, when possible.
10. Use more GPU, when possible.
# Contribute
......
......@@ -41,6 +41,17 @@
#define DAV1D_REFS_PER_FRAME 7
#define DAV1D_TOTAL_REFS_PER_FRAME (DAV1D_REFS_PER_FRAME + 1)
enum Dav1dObuType {
DAV1D_OBU_SEQ_HDR = 1,
DAV1D_OBU_TD = 2,
DAV1D_OBU_FRAME_HDR = 3,
DAV1D_OBU_TILE_GRP = 4,
DAV1D_OBU_METADATA = 5,
DAV1D_OBU_FRAME = 6,
DAV1D_OBU_REDUNDANT_FRAME_HDR = 7,
DAV1D_OBU_PADDING = 15,
};
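The new values mirror the OBU types from the AV1 spec, and the section-5 raw OBU demuxer mentioned in the changelog walks a stream of such units. As a rough sketch (hypothetical helper, not dav1d's internal parser), the type and size fields of one OBU can be read like this:
#include <stddef.h>
#include <stdint.h>
/* Hypothetical helper: decode one OBU header from a section-5 stream.
 * Bit layout of the first byte (AV1 spec): forbidden(1) type(4)
 * extension_flag(1) has_size_field(1) reserved(1). */
static size_t parse_obu_header(const uint8_t *const p, enum Dav1dObuType *const type,
                               uint64_t *const payload_size)
{
    size_t pos = 1;
    *type = (enum Dav1dObuType)((p[0] >> 3) & 0xf);
    if ((p[0] >> 2) & 1) pos++;               /* skip the extension byte if present */
    *payload_size = 0;
    if ((p[0] >> 1) & 1) {                    /* obu_has_size_field */
        for (int i = 0; i < 8; i++) {         /* leb128-coded payload size */
            const uint8_t b = p[pos++];
            *payload_size |= (uint64_t)(b & 0x7f) << (i * 7);
            if (!(b & 0x80)) break;
        }
    }
    return pos;                               /* bytes consumed before the payload */
}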
enum Dav1dTxfmMode {
DAV1D_TX_4X4_ONLY,
DAV1D_TX_LARGEST,
......
......@@ -37,7 +37,7 @@
/* Number of bytes to align AND pad picture memory buffers by, so that SIMD
* implementations can over-read by a few bytes, and use aligned read/write
* instructions. */
#define DAV1D_PICTURE_ALIGNMENT 32
#define DAV1D_PICTURE_ALIGNMENT 64
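Per the comment above, picture buffers must be both aligned to and padded by this many bytes so SIMD loops can over-read row tails with aligned loads. A minimal sketch, assuming a hypothetical single-plane 8 bpc allocator (not dav1d's own Dav1dPicAllocator implementation), of what satisfying that constraint looks like:
#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>
/* Hypothetical sketch: base pointer, stride and trailing padding all honour
 * DAV1D_PICTURE_ALIGNMENT. */
static uint8_t *alloc_plane(const int w, const int h, ptrdiff_t *const stride) {
    const ptrdiff_t aligned_w =
        (w + DAV1D_PICTURE_ALIGNMENT - 1) & ~(ptrdiff_t)(DAV1D_PICTURE_ALIGNMENT - 1);
    *stride = aligned_w;
    /* one extra alignment's worth of bytes past the last row as padding */
    return aligned_alloc(DAV1D_PICTURE_ALIGNMENT,
                         (size_t)(aligned_w * h) + DAV1D_PICTURE_ALIGNMENT);
}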
typedef struct Dav1dPictureParameters {
int w; ///< width (in pixels)
......
......@@ -23,14 +23,14 @@
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
project('dav1d', ['c'],
version: '0.5.1',
version: '0.5.2',
default_options: ['c_std=c99',
'warning_level=2',
'buildtype=release',
'b_ndebug=if-release'],
meson_version: '>= 0.47.0')
dav1d_soname_version = '3.0.0'
dav1d_soname_version = '3.1.0'
dav1d_api_version_array = dav1d_soname_version.split('.')
dav1d_api_version_major = dav1d_api_version_array[0]
dav1d_api_version_minor = dav1d_api_version_array[1]
......@@ -98,6 +98,7 @@ if host_machine.system() == 'windows'
cdata.set('UNICODE', 1) # Define to 1 for Unicode (Wide Chars) APIs
cdata.set('_UNICODE', 1) # Define to 1 for Unicode (Wide Chars) APIs
cdata.set('__USE_MINGW_ANSI_STDIO', 1) # Define to force use of MinGW printf
cdata.set('_CRT_DECLARE_NONSTDC_NAMES', 1) # Define to get off_t from sys/types.h on MSVC
if cc.has_function('fseeko', prefix : '#include <stdio.h>', args : test_args)
cdata.set('_FILE_OFFSET_BITS', 64) # Not set by default by Meson on Windows
else
......@@ -112,11 +113,23 @@ if host_machine.system() == 'windows'
# On Windows, we use a compatibility layer to emulate pthread
thread_dependency = []
thread_compat_dep = declare_dependency(sources : files('src/win32/thread.c'))
rt_dependency = []
else
thread_dependency = dependency('threads')
thread_compat_dep = []
endif
rt_dependency = []
if cc.has_function('clock_gettime', prefix : '#include <time.h>', args : test_args)
cdata.set('HAVE_CLOCK_GETTIME', 1)
elif host_machine.system() != 'darwin'
rt_dependency = cc.find_library('rt', required: false)
if not cc.has_function('clock_gettime', prefix : '#include <time.h>', args : test_args, dependencies : rt_dependency)
error('clock_gettime not found')
endif
cdata.set('HAVE_CLOCK_GETTIME', 1)
endif
endif
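On the C side, the HAVE_CLOCK_GETTIME define set here gates monotonic-clock timing; a hedged usage sketch (hypothetical helper, not a function from the dav1d sources):
#include <stdint.h>
#include <time.h>
/* Hypothetical helper: monotonic timestamp in microseconds.  On old glibc,
 * clock_gettime() lives in librt, hence the find_library('rt') fallback above. */
static uint64_t monotonic_us(void) {
#ifdef HAVE_CLOCK_GETTIME
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    return (uint64_t)ts.tv_sec * 1000000 + (uint64_t)ts.tv_nsec / 1000;
#else
    return 0; /* a Windows build would use a platform timer instead */
#endif
}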
# Header checks
......
......@@ -6,7 +6,7 @@ summary: AV1 decoder from VideoLAN
description: |
A small and fast AV1 decoder from the people who brought you VLC.
grade: devel # must be 'stable' to release into candidate/stable channels
grade: stable
confinement: strict # use 'strict' once you have the right plugs and slots
apps:
......
/*
* Copyright © 2018, VideoLAN and dav1d authors
* Copyright © 2019, Martin Storsjo
* Copyright © 2019, B Krishnan Iyer
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "src/arm/asm.S"
#include "util.S"
// void ipred_dc_128_neon(pixel *dst, const ptrdiff_t stride,
// const pixel *const topleft,
// const int width, const int height, const int a,
// const int max_width, const int max_height);
function ipred_dc_128_neon, export=1
push {r4, lr}
ldr r4, [sp, #8]
clz r3, r3
adr r2, L(ipred_dc_128_tbl)
sub r3, r3, #25
ldr r3, [r2, r3, lsl #2]
mov lr, #128
vdup.8 q0, lr
add r2, r2, r3
add r12, r0, r1
lsl r1, r1, #1
bx r2
.align 2
L(ipred_dc_128_tbl):
.word 640f - L(ipred_dc_128_tbl) + CONFIG_THUMB
.word 320f - L(ipred_dc_128_tbl) + CONFIG_THUMB
.word 16f - L(ipred_dc_128_tbl) + CONFIG_THUMB
.word 8f - L(ipred_dc_128_tbl) + CONFIG_THUMB
.word 4f - L(ipred_dc_128_tbl) + CONFIG_THUMB
4:
vst1.32 {d0[0]}, [r0, :32], r1
vst1.32 {d0[0]}, [r12, :32], r1
subs r4, r4, #4
vst1.32 {d0[0]}, [r0, :32], r1
vst1.32 {d0[0]}, [r12, :32], r1
bgt 4b
pop {r4, pc}
8:
vst1.8 {d0}, [r0, :64], r1
vst1.8 {d0}, [r12, :64], r1
subs r4, r4, #4
vst1.8 {d0}, [r0, :64], r1
vst1.8 {d0}, [r12, :64], r1
bgt 8b
pop {r4, pc}
16:
vst1.8 {d0, d1}, [r0, :128], r1
vst1.8 {d0, d1}, [r12, :128], r1
subs r4, r4, #4
vst1.8 {d0, d1}, [r0, :128], r1
vst1.8 {d0, d1}, [r12, :128], r1
bgt 16b
pop {r4, pc}
320:
vdup.8 q1, lr
32:
vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
subs r4, r4, #4
vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
bgt 32b
pop {r4, pc}
640:
vdup.8 q1, lr
vdup.8 q2, lr
vdup.8 q3, lr
sub r1, r1, #32
64:
vst1.8 {d0, d1, d2, d3}, [r0, :128]!
vst1.8 {d0, d1, d2, d3}, [r12, :128]!
vst1.8 {d4, d5, d6, d7}, [r0, :128], r1
vst1.8 {d4, d5, d6, d7}, [r12, :128], r1
subs r4, r4, #4
vst1.8 {d0, d1, d2, d3}, [r0, :128]!
vst1.8 {d0, d1, d2, d3}, [r12, :128]!
vst1.8 {d4, d5, d6, d7}, [r0, :128], r1
vst1.8 {d4, d5, d6, d7}, [r12, :128], r1
bgt 64b
pop {r4, pc}
endfunc
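For readers less fluent in NEON, what the routine above computes is simple: DC_128 prediction ignores the neighbours and fills the block with the mid-grey value 1 << (bitdepth - 1), i.e. 128 at 8 bpc. A plain C equivalent (hypothetical name, simplified prototype):
#include <stddef.h>
#include <stdint.h>
#include <string.h>
static void ipred_dc_128_ref(uint8_t *dst, const ptrdiff_t stride,
                             const int width, const int height)
{
    for (int y = 0; y < height; y++, dst += stride)
        memset(dst, 128, width); /* mid-grey for 8 bpc */
}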
// void ipred_v_neon(pixel *dst, const ptrdiff_t stride,
// const pixel *const topleft,
// const int width, const int height, const int a,
// const int max_width, const int max_height);
function ipred_v_neon, export=1
push {r4, lr}
ldr lr, [sp, #8]
clz r3, r3
adr r4, L(ipred_v_tbl)
sub r3, r3, #25
ldr r3, [r4, r3, lsl #2]
add r2, r2, #1
add r4, r4, r3
add r12, r0, r1
lsl r1, r1, #1
bx r4
.align 2
L(ipred_v_tbl):
.word 640f - L(ipred_v_tbl) + CONFIG_THUMB
.word 320f - L(ipred_v_tbl) + CONFIG_THUMB
.word 160f - L(ipred_v_tbl) + CONFIG_THUMB
.word 80f - L(ipred_v_tbl) + CONFIG_THUMB
.word 40f - L(ipred_v_tbl) + CONFIG_THUMB
40:
vld1.32 {d0[0]}, [r2]
4:
vst1.32 {d0[0]}, [r0, :32], r1
vst1.32 {d0[0]}, [r12, :32], r1
subs lr, lr, #4
vst1.32 {d0[0]}, [r0, :32], r1
vst1.32 {d0[0]}, [r12, :32], r1
bgt 4b
pop {r4, pc}
80:
vld1.8 {d0}, [r2]
8:
vst1.8 {d0}, [r0, :64], r1
vst1.8 {d0}, [r12, :64], r1
subs lr, lr, #4
vst1.8 {d0}, [r0, :64], r1
vst1.8 {d0}, [r12, :64], r1
bgt 8b
pop {r4, pc}
160:
vld1.8 {q0}, [r2]
16:
vst1.8 {d0, d1}, [r0, :128], r1
vst1.8 {d0, d1}, [r12, :128], r1
subs lr, lr, #4
vst1.8 {d0, d1}, [r0, :128], r1
vst1.8 {d0, d1}, [r12, :128], r1
bgt 16b
pop {r4, pc}
320:
vld1.8 {q0, q1}, [r2]
32:
vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
subs lr, lr, #4
vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
bgt 32b
pop {r4, pc}
640:
vld1.8 {q0, q1}, [r2]!
sub r1, r1, #32
vld1.8 {q2, q3}, [r2]
64:
vst1.8 {d0, d1, d2, d3}, [r0, :128]!
vst1.8 {d0, d1, d2, d3}, [r12, :128]!
vst1.8 {d4, d5, d6, d7}, [r0, :128], r1
vst1.8 {d4, d5, d6, d7}, [r12, :128], r1
subs lr, lr, #4
vst1.8 {d0, d1, d2, d3}, [r0, :128]!
vst1.8 {d0, d1, d2, d3}, [r12, :128]!
vst1.8 {d4, d5, d6, d7}, [r0, :128], r1
vst1.8 {d4, d5, d6, d7}, [r12, :128], r1
bgt 64b
pop {r4, pc}
endfunc
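Vertical prediction replicates the row of reconstructed pixels directly above the block (topleft[1] onwards) into every output row; the width-specialised branches above only exist to pick the widest possible loads and stores. Simplified C equivalent (hypothetical name):
#include <stddef.h>
#include <stdint.h>
#include <string.h>
static void ipred_v_ref(uint8_t *dst, const ptrdiff_t stride,
                        const uint8_t *const topleft,
                        const int width, const int height)
{
    for (int y = 0; y < height; y++, dst += stride)
        memcpy(dst, &topleft[1], width); /* copy the top neighbours down */
}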
// void ipred_h_neon(pixel *dst, const ptrdiff_t stride,
// const pixel *const topleft,
// const int width, const int height, const int a,
// const int max_width, const int max_height);
function ipred_h_neon, export=1
push {r4-r5, lr}
ldr r4, [sp, #12]
clz r3, r3
adr r5, L(ipred_h_tbl)
sub r3, r3, #25
ldr r3, [r5, r3, lsl #2]
sub r2, r2, #4
mov lr, #-4
add r5, r5, r3
add r12, r0, r1
lsl r1, r1, #1
bx r5
.align 2
L(ipred_h_tbl):
.word 640f - L(ipred_h_tbl) + CONFIG_THUMB
.word 320f - L(ipred_h_tbl) + CONFIG_THUMB
.word 160f - L(ipred_h_tbl) + CONFIG_THUMB
.word 8f - L(ipred_h_tbl) + CONFIG_THUMB
.word 4f - L(ipred_h_tbl) + CONFIG_THUMB
4:
vld4.8 {d0[], d1[], d2[], d3[]}, [r2], lr
vst1.32 {d3[0]}, [r0, :32], r1
vst1.32 {d2[0]}, [r12, :32], r1
subs r4, r4, #4
vst1.32 {d1[0]}, [r0, :32], r1
vst1.32 {d0[0]}, [r12, :32], r1
bgt 4b
pop {r4-r5, pc}
8:
vld4.8 {d0[], d1[], d2[], d3[]}, [r2], lr
vst1.8 {d3}, [r0, :64], r1
vst1.8 {d2}, [r12, :64], r1
subs r4, r4, #4
vst1.8 {d1}, [r0, :64], r1
vst1.8 {d0}, [r12, :64], r1
bgt 8b
pop {r4-r5, pc}
160:
add r2, r2, #3
mov lr, #-1
16:
vld1.8 {d0[], d1[]}, [r2], lr
subs r4, r4, #4
vld1.8 {d2[], d3[]}, [r2], lr
vst1.8 {q0}, [r0, :128], r1
vld1.8 {d4[], d5[]}, [r2], lr
vst1.8 {q1}, [r12, :128], r1
vld1.8 {d6[], d7[]}, [r2], lr
vst1.8 {q2}, [r0, :128], r1
vst1.8 {q3}, [r12, :128], r1
bgt 16b
pop {r4-r5, pc}
320:
add r2, r2, #3
mov lr, #-1
sub r1, r1, #16
32:
vld1.8 {d0[], d1[]}, [r2], lr
subs r4, r4, #4
vld1.8 {d2[], d3[]}, [r2], lr
vst1.8 {q0}, [r0, :128]!
vld1.8 {d4[], d5[]}, [r2], lr
vst1.8 {q1}, [r12, :128]!
vld1.8 {d6[], d7[]}, [r2], lr
vst1.8 {q0}, [r0, :128], r1
vst1.8 {q1}, [r12, :128], r1
vst1.8 {q2}, [r0, :128]!
vst1.8 {q3}, [r12, :128]!
vst1.8 {q2}, [r0, :128], r1
vst1.8 {q3}, [r12, :128], r1
bgt 32b
pop {r4-r5, pc}
640:
add r2, r2, #3
mov lr, #-1
sub r1, r1, #48
64:
vld1.8 {d0[], d1[]}, [r2], lr
subs r4, r4, #4
vld1.8 {d2[], d3[]}, [r2], lr
vst1.8 {q0}, [r0, :128]!
vld1.8 {d4[], d5[]}, [r2], lr
vst1.8 {q1}, [r12, :128]!
vld1.8 {d6[], d7[]}, [r2], lr
vst1.8 {q0}, [r0, :128]!
vst1.8 {q1}, [r12, :128]!
vst1.8 {q0}, [r0, :128]!
vst1.8 {q1}, [r12, :128]!
vst1.8 {q0}, [r0, :128], r1
vst1.8 {q1}, [r12, :128], r1
vst1.8 {q2}, [r0, :128]!
vst1.8 {q3}, [r12, :128]!
vst1.8 {q2}, [r0, :128]!
vst1.8 {q3}, [r12, :128]!
vst1.8 {q2}, [r0, :128]!
vst1.8 {q3}, [r12, :128]!
vst1.8 {q2}, [r0, :128], r1
vst1.8 {q3}, [r12, :128], r1
bgt 64b
pop {r4-r5, pc}
endfunc
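Horizontal prediction fills each output row with that row's left neighbour, topleft[-(1 + y)]; the asm loads several left pixels at a time (hence the negative post-increment) and broadcasts them. Simplified C equivalent (hypothetical name):
#include <stddef.h>
#include <stdint.h>
#include <string.h>
static void ipred_h_ref(uint8_t *dst, const ptrdiff_t stride,
                        const uint8_t *const topleft,
                        const int width, const int height)
{
    for (int y = 0; y < height; y++, dst += stride)
        memset(dst, topleft[-(1 + y)], width); /* broadcast the left neighbour */
}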
// void ipred_dc_top_neon(pixel *dst, const ptrdiff_t stride,
// const pixel *const topleft,
// const int width, const int height, const int a,
// const int max_width, const int max_height);
function ipred_dc_top_neon, export=1
push {r4-r5, lr}
ldr r4, [sp, #12]
clz r3, r3
adr r5, L(ipred_dc_top_tbl)
sub r3, r3, #25
ldr r3, [r5, r3, lsl #2]
add r2, r2, #1
add r5, r5, r3
add r12, r0, r1
lsl r1, r1, #1
bx r5
.align 2
L(ipred_dc_top_tbl):
.word 640f - L(ipred_dc_top_tbl) + CONFIG_THUMB
.word 320f - L(ipred_dc_top_tbl) + CONFIG_THUMB
.word 160f - L(ipred_dc_top_tbl) + CONFIG_THUMB
.word 80f - L(ipred_dc_top_tbl) + CONFIG_THUMB
.word 40f - L(ipred_dc_top_tbl) + CONFIG_THUMB
40:
vld1.32 {d0[]}, [r2]
vpaddl.u8 d0, d0
vpadd.u16 d0, d0
vrshrn.u16 d0, q0, #2
vdup.8 d0, d0[0]
4:
vst1.32 {d0[0]}, [r0, :32], r1
vst1.32 {d0[0]}, [r12, :32], r1
subs r4, r4, #4
vst1.32 {d0[0]}, [r0, :32], r1
vst1.32 {d0[0]}, [r12, :32], r1
bgt 4b
pop {r4-r5, pc}
80:
vld1.8 {d0}, [r2]
vpaddl.u8 d0, d0
vpadd.u16 d0, d0
vpadd.u16 d0, d0
vrshrn.u16 d0, q0, #3
vdup.8 d0, d0[0]
8:
vst1.8 {d0}, [r0, :64], r1
vst1.8 {d0}, [r12, :64], r1
subs r4, r4, #4
vst1.8 {d0}, [r0, :64], r1
vst1.8 {d0}, [r12, :64], r1
bgt 8b
pop {r4-r5, pc}
160:
vld1.8 {d0, d1}, [r2]
vaddl.u8 q0, d0, d1
vadd.u16 d0, d0, d1
vpadd.u16 d0, d0
vpadd.u16 d0, d0
vrshrn.u16 d0, q0, #4
vdup.8 q0, d0[0]
16:
vst1.8 {d0, d1}, [r0, :128], r1
vst1.8 {d0, d1}, [r12, :128], r1
subs r4, r4, #4
vst1.8 {d0, d1}, [r0, :128], r1
vst1.8 {d0, d1}, [r12, :128], r1
bgt 16b
pop {r4-r5, pc}
320:
vld1.8 {d0, d1, d2, d3}, [r2]
vaddl.u8 q0, d0, d1
vaddl.u8 q1, d2, d3
vadd.u16 q0, q0, q1
vadd.u16 d0, d0, d1
vpadd.u16 d0, d0
vpadd.u16 d0, d0
vrshrn.u16 d4, q0, #5
vdup.8 q0, d4[0]
vdup.8 q1, d4[0]
32:
vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
subs r4, r4, #4
vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
bgt 32b
pop {r4-r5, pc}
640:
vld1.8 {d0, d1, d2, d3}, [r2]!
vaddl.u8 q0, d0, d1
vld1.8 {d4, d5, d6, d7}, [r2]
vaddl.u8 q1, d2, d3
vaddl.u8 q2, d4, d5
vaddl.u8 q3, d6, d7
vadd.u16 q0, q0, q1
vadd.u16 q1, q2, q3
vadd.u16 q0, q0, q1
vadd.u16 d0, d0, d1
vpadd.u16 d0, d0
vpadd.u16 d0, d0
vrshrn.u16 d18, q0, #6
vdup.8 q0, d18[0]
vdup.8 q1, d18[0]
vdup.8 q2, d18[0]
vdup.8 q3, d18[0]
sub r1, r1, #32
64:
vst1.8 {d0, d1, d2, d3}, [r0, :128]!
vst1.8 {d0, d1, d2, d3}, [r12, :128]!
vst1.8 {d4, d5, d6, d7}, [r0, :128], r1
vst1.8 {d4, d5, d6, d7}, [r12, :128], r1
subs r4, r4, #4
vst1.8 {d0, d1, d2, d3}, [r0, :128]!
vst1.8 {d0, d1, d2, d3}, [r12, :128]!
vst1.8 {d4, d5, d6, d7}, [r0, :128], r1
vst1.8 {d4, d5, d6, d7}, [r12, :128], r1
bgt 64b
pop {r4-r5, pc}
endfunc
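DC_TOP averages only the width pixels above the block, with rounding (the vrshrn shifts above), and broadcasts the result; ipred_dc_left_neon below is the mirror image over the height pixels to the left. Simplified C sketch of the top variant (hypothetical name):
#include <stddef.h>
#include <stdint.h>
#include <string.h>
static void ipred_dc_top_ref(uint8_t *dst, const ptrdiff_t stride,
                             const uint8_t *const topleft,
                             const int width, const int height)
{
    unsigned sum = width >> 1;              /* rounding offset */
    for (int x = 0; x < width; x++)
        sum += topleft[1 + x];
    const int dc = sum / width;             /* width is a power of two */
    for (int y = 0; y < height; y++, dst += stride)
        memset(dst, dc, width);
}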
// void ipred_dc_left_neon(pixel *dst, const ptrdiff_t stride,
// const pixel *const topleft,
// const int width, const int height, const int a,
// const int max_width, const int max_height);
function ipred_dc_left_neon, export=1
push {r4-r5, lr}
ldr r4, [sp, #12]
sub r2, r2, r4
clz r3, r3
clz lr, r4
sub lr, lr, #25
adr r5, L(ipred_dc_left_tbl)
sub r3, r3, #20
ldr r3, [r5, r3, lsl #2]
ldr lr, [r5, lr, lsl #2]
add r3, r5, r3
add r5, r5, lr
add r12, r0, r1
lsl r1, r1, #1
bx r5
.align 2
L(ipred_dc_left_tbl):
.word L(ipred_dc_left_h64) - L(ipred_dc_left_tbl) + CONFIG_THUMB
.word L(ipred_dc_left_h32) - L(ipred_dc_left_tbl) + CONFIG_THUMB
.word L(ipred_dc_left_h16) - L(ipred_dc_left_tbl) + CONFIG_THUMB
.word L(ipred_dc_left_h8) - L(ipred_dc_left_tbl) + CONFIG_THUMB
.word L(ipred_dc_left_h4) - L(ipred_dc_left_tbl) + CONFIG_THUMB
.word L(ipred_dc_left_w64) - L(ipred_dc_left_tbl) + CONFIG_THUMB
.word L(ipred_dc_left_w32) - L(ipred_dc_left_tbl) + CONFIG_THUMB
.word L(ipred_dc_left_w16) - L(ipred_dc_left_tbl) + CONFIG_THUMB
.word L(ipred_dc_left_w8) - L(ipred_dc_left_tbl) + CONFIG_THUMB
.word L(ipred_dc_left_w4) - L(ipred_dc_left_tbl) + CONFIG_THUMB
L(ipred_dc_left_h4):
vld1.32 {d0[]}, [r2]
vpaddl.u8 d0, d0
vpadd.u16 d0, d0
vrshrn.u16 d0, q0, #2
vdup.8 q0, d0[0]
bx r3
L(ipred_dc_left_w4):
vst1.32 {d0[0]}, [r0, :32], r1
vst1.32 {d0[0]}, [r12, :32], r1
subs r4, r4, #4
vst1.32 {d0[0]}, [r0, :32], r1
vst1.32 {d0[0]}, [r12, :32], r1
bgt L(ipred_dc_left_w4)
pop {r4-r5, pc}
L(ipred_dc_left_h8):
vld1.8 {d0}, [r2]
vpaddl.u8 d0, d0
vpadd.u16 d0, d0
vpadd.u16 d0, d0
vrshrn.u16 d0, q0, #3
vdup.8 q0, d0[0]
bx r3
L(ipred_dc_left_w8):
vst1.8 {d0}, [r0, :64], r1
vst1.8 {d0}, [r12, :64], r1
subs r4, r4, #4
vst1.8 {d0}, [r0, :64], r1
vst1.8 {d0}, [r12, :64], r1
bgt L(ipred_dc_left_w8)
pop {r4-r5, pc}
L(ipred_dc_left_h16):
vld1.8 {d0, d1}, [r2]
vaddl.u8 q0, d0, d1
vadd.u16 d0, d0, d1
vpadd.u16 d0, d0
vpadd.u16 d0, d0
vrshrn.u16 d0, q0, #4
vdup.8 q0, d0[0]
bx r3
L(ipred_dc_left_w16):
vst1.8 {d0, d1}, [r0, :128], r1
vst1.8 {d0, d1}, [r12, :128], r1
subs r4, r4, #4
vst1.8 {d0, d1}, [r0, :128], r1
vst1.8 {d0, d1}, [r12, :128], r1
bgt L(ipred_dc_left_w16)
pop {r4-r5, pc}
L(ipred_dc_left_h32):
vld1.8 {d0, d1, d2, d3}, [r2]
vaddl.u8 q0, d0, d1
vaddl.u8 q1, d2, d3
vadd.u16 q0, q0, q1
vadd.u16 d0, d0, d1
vpadd.u16 d0, d0
vpadd.u16 d0, d0
vrshrn.u16 d0, q0, #5
vdup.8 q0, d0[0]
bx r3
L(ipred_dc_left_w32):
vmov.8 q1, q0
1:
vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
subs r4, r4, #4
vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
bgt 1b
pop {r4-r5, pc}
L(ipred_dc_left_h64):
vld1.8 {d0, d1, d2, d3}, [r2]!
vld1.8 {d4, d5, d6, d7}, [r2]
vaddl.u8 q0, d0, d1
vaddl.u8 q1, d2, d3
vaddl.u8 q2, d4, d5
vaddl.u8 q3, d6, d7
vadd.u16 q0, q0, q1
vadd.u16 q1, q2, q3
vadd.u16 q0, q0, q1
vadd.u16 d0, d0, d1
vpadd.u16 d0, d0
vpadd.u16 d0, d0
vrshrn.u16 d0, q0, #6
vdup.8 q0, d0[0]
bx r3
L(ipred_dc_left_w64):
sub r1, r1, #32
vmov.8 q1, q0
vmov.8 q2, q0
vmov.8 q3, q0
1:
vst1.8 {d0, d1, d2, d3}, [r0, :128]!
vst1.8 {d0, d1, d2, d3}, [r12, :128]!
vst1.8 {d4, d5, d6, d7}, [r0, :128], r1
vst1.8 {d4, d5, d6, d7}, [r12, :128], r1
subs r4, r4, #4
vst1.8 {d0, d1, d2, d3}, [r0, :128]!
vst1.8 {d0, d1, d2, d3}, [r12, :128]!
vst1.8 {d4, d5, d6, d7}, [r0, :128], r1
vst1.8 {d4, d5, d6, d7}, [r12, :128], r1
bgt 1b
pop {r4-r5, pc}
endfunc
// void ipred_dc_neon(pixel *dst, const ptrdiff_t stride,
// const pixel *const topleft,
// const int width, const int height, const int a,
// const int max_width, const int max_height);
function ipred_dc_neon, export=1
push {r4-r6, lr}
ldr r4, [sp, #16]
sub r2, r2, r4
add lr, r3, r4 // width + height
clz r3, r3
clz r12, r4
vdup.16 q15, lr // width + height
mov r6, #0
adr r5, L(ipred_dc_tbl)
rbit lr, lr // rbit(width + height)
sub r3, r3, #20 // 25 leading bits, minus table offset 5
sub r12, r12, #25
clz lr, lr // ctz(width + height)
ldr r3, [r5, r3, lsl #2]
ldr r12, [r5, r12, lsl #2]
neg lr, lr // -ctz(width + height)
add r3, r5, r3
add r5, r5, r12
vshr.u16 q15, q15, #1 // (width + height) >> 1
vdup.16 q14, lr // -ctz(width + height)
add r12, r0, r1
lsl r1, r1, #1
bx r5
.align 2
L(ipred_dc_tbl):
.word L(ipred_dc_h64) - L(ipred_dc_tbl) + CONFIG_THUMB
.word L(ipred_dc_h32) - L(ipred_dc_tbl) + CONFIG_THUMB
.word L(ipred_dc_h16) - L(ipred_dc_tbl) + CONFIG_THUMB
.word L(ipred_dc_h8) - L(ipred_dc_tbl) + CONFIG_THUMB
.word L(ipred_dc_h4) - L(ipred_dc_tbl) + CONFIG_THUMB
.word L(ipred_dc_w64) - L(ipred_dc_tbl) + CONFIG_THUMB
.word L(ipred_dc_w32) - L(ipred_dc_tbl) + CONFIG_THUMB
.word L(ipred_dc_w16) - L(ipred_dc_tbl) + CONFIG_THUMB
.word L(ipred_dc_w8) - L(ipred_dc_tbl) + CONFIG_THUMB
.word L(ipred_dc_w4) - L(ipred_dc_tbl) + CONFIG_THUMB
L(ipred_dc_h4):
vld1.32 {d0[0]}, [r2]!
vpaddl.u8 d0, d0
vpadd.u16 d0, d0
bx r3
L(ipred_dc_w4):
add r2, r2, #1
vld1.32 {d1[0]}, [r2]
vmov.32 d1[1], r6
vadd.s16 d0, d0, d30
vpaddl.u8 d1, d1
vpadd.u16 d1, d1
vpadd.u16 d1, d1
cmp r4, #4
vadd.s16 d0, d0, d1
vshl.u16 d0, d0, d28
beq 1f // h = 8/16
movw lr, #(0x3334/2)
movw r5, #(0x5556/2)
cmp r4, #16
it ne
movne lr, r5
vdup.16 d30, lr
vqdmulh.s16 d0, d0, d30
1:
vdup.8 d0, d0[0]
2:
vst1.32 {d0[0]}, [r0, :32], r1
vst1.32 {d0[0]}, [r12, :32], r1
subs r4, r4, #4
vst1.32 {d0[0]}, [r0, :32], r1
vst1.32 {d0[0]}, [r12, :32], r1
bgt 2b
pop {r4-r6, pc}
L(ipred_dc_h8):
vld1.8 {d0}, [r2]!
vpaddl.u8 d0, d0
vpadd.u16 d0, d0
vpadd.u16 d0, d0
bx r3
L(ipred_dc_w8):
add r2, r2, #1
vld1.8 {d2}, [r2]
vadd.s16 d0, d0, d30
vpaddl.u8 d2, d2
vpadd.u16 d2, d2
vpadd.u16 d2, d2
cmp r4, #8
vadd.s16 d0, d0, d2
vshl.u16 d0, d0, d28
beq 1f // h = 4/16/32
cmp r4, #32
movw lr, #(0x3334/2)
movw r5, #(0x5556/2)
it ne
movne lr, r5
vdup.16 q12, lr
vqdmulh.s16 d0, d0, d24
1:
vdup.8 d0, d0[0]
2:
vst1.8 {d0}, [r0, :64], r1
vst1.8 {d0}, [r12, :64], r1
subs r4, r4, #4
vst1.8 {d0}, [r0, :64], r1
vst1.8 {d0}, [r12, :64], r1
bgt 2b
pop {r4-r6, pc}
L(ipred_dc_h16):
vld1.8 {d0, d1}, [r2]!
vaddl.u8 q0, d0, d1
vadd.u16 d0, d0, d1
vpadd.u16 d0, d0
vpadd.u16 d0, d0
bx r3
L(ipred_dc_w16):
add r2, r2, #1
vld1.8 {d2, d3}, [r2]
vadd.s16 d0, d0, d30
vaddl.u8 q1, d2, d3
vadd.u16 d2, d2, d3
vpadd.u16 d2, d2
vpadd.u16 d2, d2
cmp r4, #16
vadd.s16 d0, d0, d2
vshl.u16 d0, d0, d28
beq 1f // h = 4/8/32/64
tst r4, #(32+16+8) // 16 added to make a consecutive bitmask
movw lr, #(0x3334/2)
movw r5, #(0x5556/2)
it ne
movne lr, r5
vdup.16 q12, lr
vqdmulh.s16 d0, d0, d24
1:
vdup.8 q0, d0[0]
2:
vst1.8 {d0, d1}, [r0, :128], r1
vst1.8 {d0, d1}, [r12, :128], r1
subs r4, r4, #4
vst1.8 {d0, d1}, [r0, :128], r1
vst1.8 {d0, d1}, [r12, :128], r1
bgt 2b
pop {r4-r6, pc}
L(ipred_dc_h32):
vld1.8 {d0, d1, d2, d3}, [r2]!
vaddl.u8 q0, d0, d1
vaddl.u8 q1, d2, d3
vadd.u16 q0, q0, q1
vadd.u16 d0, d0, d1
vpadd.u16 d0, d0
vpadd.u16 d0, d0
bx r3
L(ipred_dc_w32):
add r2, r2, #1
vld1.8 {d2, d3, d4, d5}, [r2]
vadd.s16 d0, d0, d30
vaddl.u8 q2, d4, d5
vadd.u16 d4, d4, d5
vaddl.u8 q1, d2, d3
vadd.u16 d2, d2, d3
vpadd.u16 d4, d4
vpadd.u16 d2, d2
vpadd.u16 d4, d4
vpadd.u16 d2, d2
cmp r4, #32
vadd.s16 d0, d0, d4
vadd.s16 d0, d0, d2
vshl.u16 d4, d0, d28
beq 1f // h = 8/16/64
cmp r4, #8
movw lr, #(0x3334/2)
movw r5, #(0x5556/2)
it ne
movne lr, r5
vdup.16 q12, lr
vqdmulh.s16 d4, d4, d24
1:
vdup.8 q0, d4[0]
vdup.8 q1, d4[0]
2:
vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
subs r4, r4, #4
vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
bgt 2b
pop {r4-r6, pc}
L(ipred_dc_h64):
vld1.8 {d0, d1, d2, d3}, [r2]!
vaddl.u8 q0, d0, d1
vld1.8 {d4, d5, d6, d7}, [r2]!
vaddl.u8 q1, d2, d3
vaddl.u8 q2, d4, d5
vaddl.u8 q3, d6, d7
vadd.u16 q0, q0, q1
vadd.u16 q1, q2, q3
vadd.u16 q0, q0, q1
vadd.u16 d0, d0, d1
vpadd.u16 d0, d0
vpadd.u16 d0, d0
bx r3
L(ipred_dc_w64):
vmov.8 q1, q0
vmov.8 q2, q0
vmov.8 q3, q0
2:
add r2, r2, #1
vld1.8 {d2, d3, d4, d5}, [r2]!
vadd.s16 d0, d0, d30
vaddl.u8 q2, d4, d5
vaddl.u8 q1, d2, d3
vadd.u16 d4, d4, d5
vadd.u16 d2, d2, d3
vld1.8 {d16, d17, d18, d19}, [r2]
vpadd.u16 d4, d4
vpadd.u16 d2, d2
vpadd.u16 d4, d4
vpadd.u16 d2, d2
vaddl.u8 q8, d16, d17
vaddl.u8 q9, d18, d19
vadd.u16 d16, d16, d17
vadd.u16 d18, d18, d19
vpadd.u16 d16, d16
vpadd.u16 d18, d18
vpadd.u16 d16, d16
vpadd.u16 d18, d18
vadd.u16 d2, d2, d4
vadd.u16 d3, d16, d18
cmp r4, #64
vadd.s16 d0, d0, d2
vadd.s16 d0, d0, d3
vshl.u16 d18, d0, d28
beq 1f // h = 16/32
movw lr, #(0x5556/2)
movt lr, #(0x3334/2)
mov r5, r4
and r5, r5, #31
lsr lr, lr, r5
vdup.16 d30, lr
vqdmulh.s16 d18, d18, d30
1:
sub r1, r1, #32
vdup.8 q0, d18[0]
vdup.8 q1, d18[0]
vdup.8 q2, d18[0]
vdup.8 q3, d18[0]
2:
vst1.8 {d0, d1, d2, d3}, [r0, :128]!
vst1.8 {d0, d1, d2, d3}, [r12, :128]!
vst1.8 {d4, d5, d6, d7}, [r0, :128], r1
vst1.8 {d4, d5, d6, d7}, [r12, :128], r1
subs r4, r4, #4
vst1.8 {d0, d1, d2, d3}, [r0, :128]!
vst1.8 {d0, d1, d2, d3}, [r12, :128]!
vst1.8 {d4, d5, d6, d7}, [r0, :128], r1
vst1.8 {d4, d5, d6, d7}, [r12, :128], r1
bgt 2b
pop {r4-r6, pc}
endfunc
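The full DC predictor averages the width top neighbours plus the height left neighbours with rounding. When width + height is not a power of two the division cannot be a plain shift, which is what the 0x3334/0x5556 fixed-point multipliers above implement; the straightforward C form (hypothetical name) is:
#include <stddef.h>
#include <stdint.h>
#include <string.h>
static void ipred_dc_ref(uint8_t *dst, const ptrdiff_t stride,
                         const uint8_t *const topleft,
                         const int width, const int height)
{
    unsigned sum = (unsigned)(width + height) >> 1;   /* rounding offset */
    for (int x = 0; x < width; x++)
        sum += topleft[1 + x];                        /* top neighbours */
    for (int y = 0; y < height; y++)
        sum += topleft[-(1 + y)];                     /* left neighbours */
    const int dc = sum / (unsigned)(width + height);
    for (int y = 0; y < height; y++, dst += stride)
        memset(dst, dc, width);
}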
/*
* Copyright © 2018, VideoLAN and dav1d authors
* Copyright © 2019, Martin Storsjo
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "src/arm/asm.S"
#include "util.S"
.macro loop_filter wd
function lpf_8_wd\wd\()_neon
vabd.u8 d0, d22, d23 // abs(p1 - p0)
vabd.u8 d1, d25, d24 // abs(q1 - q0)
vabd.u8 d2, d23, d24 // abs(p0 - q0)
vabd.u8 d3, d22, d25 // abs(p1 - q1)
.if \wd >= 6
vabd.u8 d4, d21, d22 // abs(p2 - p1)
vabd.u8 d5, d26, d25 // abs(q2 - q1)
.endif
.if \wd >= 8
vabd.u8 d6, d20, d21 // abs(p3 - p2)
vabd.u8 d7, d27, d26 // abs(q3 - q2)
.endif
.if \wd >= 6
vmax.u8 d4, d4, d5
.endif
vqadd.u8 d2, d2, d2 // abs(p0 - q0) * 2
.if \wd >= 8
vmax.u8 d6, d6, d7
.endif
vshr.u8 d3, d3, #1
.if \wd >= 8
vmax.u8 d4, d4, d6
.endif
.if \wd >= 6
vand d4, d4, d14
.endif
vmax.u8 d0, d0, d1 // max(abs(p1 - p0), abs(q1 - q0))
vqadd.u8 d2, d2, d3 // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1
.if \wd >= 6
vmax.u8 d4, d0, d4
vcge.u8 d1, d11, d4 // max(abs(p1 - p0), abs(q1 - q0), abs(), abs(), ...) <= I
.else
vcge.u8 d1, d11, d0 // max(abs(p1 - p0), abs(q1 - q0)) <= I
.endif
vcge.u8 d2, d10, d2 // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1 <= E
vand d1, d1, d2 // fm
vand d1, d1, d13 // fm && wd >= 4
.if \wd >= 6
vand d14, d14, d1 // fm && wd > 4
.endif
.if \wd >= 16
vand d15, d15, d1 // fm && wd == 16
.endif
vmov r10, r11, d1
orrs r10, r10, r11
beq 9f // if (!fm || wd < 4) return;
.if \wd >= 6
vmov.i8 d10, #1
vabd.u8 d2, d21, d23 // abs(p2 - p0)
vabd.u8 d3, d22, d23 // abs(p1 - p0)
vabd.u8 d4, d25, d24 // abs(q1 - q0)
vabd.u8 d5, d26, d24 // abs(q2 - q0)
.if \wd >= 8
vabd.u8 d6, d20, d23 // abs(p3 - p0)
vabd.u8 d7, d27, d24 // abs(q3 - q0)
.endif
vmax.u8 d2, d2, d3
vmax.u8 d4, d4, d5
.if \wd >= 8
vmax.u8 d6, d6, d7
.endif
vmax.u8 d2, d2, d4
.if \wd >= 8
vmax.u8 d2, d2, d6
.endif
.if \wd == 16
vabd.u8 d3, d17, d23 // abs(p6 - p0)
vabd.u8 d4, d18, d23 // abs(p5 - p0)
vabd.u8 d5, d19, d23 // abs(p4 - p0)
.endif
vcge.u8 d2, d10, d2 // flat8in
.if \wd == 16
vabd.u8 d6, d28, d24 // abs(q4 - q0)
vabd.u8 d7, d29, d24 // abs(q5 - q0)
vabd.u8 d8, d30, d24 // abs(q6 - q0)
.endif
vand d14, d2, d14 // flat8in && fm && wd > 4
vbic d1, d1, d14 // fm && wd >= 4 && !flat8in
.if \wd == 16
vmax.u8 d3, d3, d4
vmax.u8 d5, d5, d6
.endif
vmov r10, r11, d1
.if \wd == 16
vmax.u8 d7, d7, d8
vmax.u8 d3, d3, d5
vmax.u8 d3, d3, d7
vcge.u8 d3, d10, d3 // flat8out
.endif
orrs r10, r10, r11
.if \wd == 16
vand d15, d15, d3 // flat8out && fm && wd == 16
vand d15, d15, d14 // flat8out && flat8in && fm && wd == 16
vbic d14, d14, d15 // flat8in && fm && wd >= 4 && !flat8out
.endif
beq 1f // skip wd == 4 case
.endif
vsubl.u8 q1, d22, d25 // p1 - q1
vcgt.u8 d0, d0, d12 // hev
vqmovn.s16 d2, q1
vand d4, d2, d0 // if (hev) iclip_diff(p1 - q1)
vbic d0, d1, d0 // (fm && wd >= 4 && !hev)
vsubl.u8 q1, d24, d23
vmov.i16 q3, #3
vmul.i16 q1, q1, q3
vmov.i8 d6, #4
vaddw.s8 q1, q1, d4
vmov.i8 d7, #3
vqmovn.s16 d2, q1 // f
vqadd.s8 d4, d6, d2 // imin(f + 4, 128)
vqadd.s8 d5, d7, d2 // imin(f + 3, 128)
vshr.s8 d4, d4, #3 // f1
vshr.s8 d5, d5, #3 // f2
vmovl.u8 q1, d23 // p0
vmovl.u8 q3, d24 // q0
vaddw.s8 q1, q1, d5
vsubw.s8 q3, q3, d4
vrshr.s8 d4, d4, #1 // (f1 + 1) >> 1
vqmovun.s16 d2, q1 // out p0
vqmovun.s16 d6, q3 // out q0
vbit d23, d2, d1 // if (fm && wd >= 4)
vmovl.u8 q1, d22 // p1
vbit d24, d6, d1 // if (fm && wd >= 4)
vmovl.u8 q3, d25 // q1
vaddw.s8 q1, q1, d4
vsubw.s8 q3, q3, d4
vqmovun.s16 d2, q1 // out p1
vqmovun.s16 d6, q3 // out q1
vbit d22, d2, d0 // if (fm && wd >= 4 && !hev)
vbit d25, d6, d0 // if (fm && wd >= 4 && !hev)
1:
.if \wd == 6
vmov r10, r11, d14
orrs r10, r10, r11
beq 2f // skip if there's no flat8in
vaddl.u8 q0, d21, d21 // p2 * 2
vaddl.u8 q1, d21, d22 // p2 + p1
vaddl.u8 q2, d22, d23 // p1 + p0
vaddl.u8 q3, d23, d24 // p0 + q0
vadd.i16 q4, q0, q1
vadd.i16 q5, q2, q3
vaddl.u8 q6, d24, d25 // q0 + q1
vadd.i16 q4, q4, q5
vsub.i16 q6, q6, q0
vaddl.u8 q5, d25, d26 // q1 + q2
vrshrn.i16 d0, q4, #3 // out p1
vadd.i16 q4, q4, q6
vsub.i16 q5, q5, q1
vaddl.u8 q6, d26, d26 // q2 + q2
vrshrn.i16 d1, q4, #3 // out p0
vadd.i16 q4, q4, q5
vsub.i16 q6, q6, q2
vrshrn.i16 d2, q4, #3 // out q0
vbit d22, d0, d14 // p1 if (flat8in)
vadd.i16 q4, q4, q6
vbit d23, d1, d14 // p0 if (flat8in)
vrshrn.i16 d3, q4, #3 // out q1
vbit d24, d2, d14 // q0 if (flat8in)
vbit d25, d3, d14 // q1 if (flat8in)
.elseif \wd >= 8
vmov r10, r11, d14
orrs r10, r10, r11
.if \wd == 8
beq 8f // skip if there's no flat8in
.else
beq 2f // skip if there's no flat8in
.endif
vaddl.u8 q0, d20, d21 // p3 + p2
vaddl.u8 q1, d22, d25 // p1 + q1
vaddl.u8 q2, d20, d22 // p3 + p1
vaddl.u8 q3, d23, d26 // p0 + q2
vadd.i16 q4, q0, q0 // 2 * (p3 + p2)
vaddw.u8 q4, q4, d23 // + p0
vaddw.u8 q4, q4, d24 // + q0
vadd.i16 q4, q4, q2 // + p3 + p1
vsub.i16 q1, q1, q0 // p1 + q1 - p3 - p2
vsub.i16 q3, q3, q2 // p0 + q2 - p3 - p1
vrshrn.i16 d10, q4, #3 // out p2
vadd.i16 q4, q4, q1
vaddl.u8 q0, d20, d23 // p3 + p0
vaddl.u8 q1, d24, d27 // q0 + q3
vrshrn.i16 d11, q4, #3 // out p1
vadd.i16 q4, q4, q3
vsub.i16 q1, q1, q0 // q0 + q3 - p3 - p0
vaddl.u8 q2, d21, d24 // p2 + q0
vaddl.u8 q3, d25, d27 // q1 + q3
vrshrn.i16 d12, q4, #3 // out p0
vadd.i16 q4, q4, q1
vsub.i16 q3, q3, q2 // q1 + q3 - p2 - q0
vaddl.u8 q0, d22, d25 // p1 + q1
vaddl.u8 q1, d26, d27 // q2 + q3
vrshrn.i16 d13, q4, #3 // out q0
vadd.i16 q4, q4, q3
vsub.i16 q1, q1, q0 // q2 + q3 - p1 - q1
vrshrn.i16 d0, q4, #3 // out q1
vadd.i16 q4, q4, q1
vbit d21, d10, d14
vbit d22, d11, d14
vbit d23, d12, d14
vrshrn.i16 d1, q4, #3 // out q2
vbit d24, d13, d14
vbit d25, d0, d14
vbit d26, d1, d14
.endif
2:
.if \wd == 16
vmov r10, r11, d15
orrs r10, r10, r11
bne 1f // check if flat8out is needed
vmov r10, r11, d14
orrs r10, r10, r11
beq 8f // if there was no flat8in, just write the inner 4 pixels
b 7f // if flat8in was used, write the inner 6 pixels
1:
vaddl.u8 q1, d17, d17 // p6 + p6
vaddl.u8 q2, d17, d18 // p6 + p5
vaddl.u8 q3, d17, d19 // p6 + p4
vaddl.u8 q4, d17, d20 // p6 + p3
vadd.i16 q6, q1, q2
vadd.i16 q5, q3, q4
vaddl.u8 q3, d17, d21 // p6 + p2
vadd.i16 q6, q6, q5
vaddl.u8 q4, d17, d22 // p6 + p1
vaddl.u8 q5, d18, d23 // p5 + p0
vadd.i16 q3, q3, q4
vaddl.u8 q4, d19, d24 // p4 + q0
vadd.i16 q6, q6, q3
vadd.i16 q5, q5, q4
vaddl.u8 q3, d20, d25 // p3 + q1
vadd.i16 q6, q6, q5
vsub.i16 q3, q3, q1
vaddl.u8 q1, d21, d26 // p2 + q2
vrshrn.i16 d0, q6, #4 // out p5
vadd.i16 q6, q6, q3 // - (p6 + p6) + (p3 + q1)
vsub.i16 q1, q1, q2
vaddl.u8 q2, d22, d27 // p1 + q3
vaddl.u8 q3, d17, d19 // p6 + p4
vrshrn.i16 d1, q6, #4 // out p4
vadd.i16 q6, q6, q1 // - (p6 + p5) + (p2 + q2)
vsub.i16 q2, q2, q3
vaddl.u8 q3, d23, d28 // p0 + q4
vaddl.u8 q4, d17, d20 // p6 + p3
vrshrn.i16 d2, q6, #4 // out p3
vadd.i16 q6, q6, q2 // - (p6 + p4) + (p1 + q3)
vsub.i16 q3, q3, q4
vaddl.u8 q4, d24, d29 // q0 + q5
vaddl.u8 q2, d17, d21 // p6 + p2
vrshrn.i16 d3, q6, #4 // out p2
vadd.i16 q6, q6, q3 // - (p6 + p3) + (p0 + q4)
vsub.i16 q4, q4, q2
vaddl.u8 q3, d25, d30 // q1 + q6
vaddl.u8 q5, d17, d22 // p6 + p1
vrshrn.i16 d4, q6, #4 // out p1
vadd.i16 q6, q6, q4 // - (p6 + p2) + (q0 + q5)
vsub.i16 q3, q3, q5
vaddl.u8 q4, d26, d30 // q2 + q6
vbif d0, d18, d15 // out p5
vaddl.u8 q5, d18, d23 // p5 + p0
vrshrn.i16 d5, q6, #4 // out p0
vadd.i16 q6, q6, q3 // - (p6 + p1) + (q1 + q6)
vsub.i16 q4, q4, q5
vaddl.u8 q5, d27, d30 // q3 + q6
vbif d1, d19, d15 // out p4
vaddl.u8 q9, d19, d24 // p4 + q0
vrshrn.i16 d6, q6, #4 // out q0
vadd.i16 q6, q6, q4 // - (p5 + p0) + (q2 + q6)
vsub.i16 q5, q5, q9
vaddl.u8 q4, d28, d30 // q4 + q6
vbif d2, d20, d15 // out p3
vaddl.u8 q9, d20, d25 // p3 + q1
vrshrn.i16 d7, q6, #4 // out q1
vadd.i16 q6, q6, q5 // - (p4 + q0) + (q3 + q6)
vsub.i16 q9, q4, q9
vaddl.u8 q5, d29, d30 // q5 + q6
vbif d3, d21, d15 // out p2
vaddl.u8 q10, d21, d26 // p2 + q2
vrshrn.i16 d8, q6, #4 // out q2
vadd.i16 q6, q6, q9 // - (p3 + q1) + (q4 + q6)
vsub.i16 q5, q5, q10
vaddl.u8 q9, d30, d30 // q6 + q6
vbif d4, d22, d15 // out p1
vaddl.u8 q10, d22, d27 // p1 + q3
vrshrn.i16 d9, q6, #4 // out q3
vadd.i16 q6, q6, q5 // - (p2 + q2) + (q5 + q6)
vsub.i16 q9, q9, q10
vbif d5, d23, d15 // out p0
vrshrn.i16 d10, q6, #4 // out q4
vadd.i16 q6, q6, q9 // - (p1 + q3) + (q6 + q6)
vrshrn.i16 d11, q6, #4 // out q5
vbif d6, d24, d15 // out q0
vbif d7, d25, d15 // out q1
vbif d8, d26, d15 // out q2
vbif d9, d27, d15 // out q3
vbif d10, d28, d15 // out q4
vbif d11, d29, d15 // out q5
.endif
bx lr
.if \wd == 16
7:
// Return to a shorter epilogue, writing only the inner 6 pixels
bx r8
.endif
.if \wd >= 8
8:
// Return to a shorter epilogue, writing only the inner 4 pixels
bx r9
.endif
9:
// Return directly without writing back any pixels
bx r12
endfunc
.endm
loop_filter 16
loop_filter 8
loop_filter 6
loop_filter 4
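Following the inline comments above, once the fm threshold checks (against E and I) pass, the narrowest (wd == 4) path applies the standard narrow deblocking update: a clipped filter value f built from 3*(q0 - p0) plus, under high edge variance (hev), p1 - q1; f1/f2 adjust q0/p0, and p1/q1 are only adjusted when hev is false. A hedged scalar C sketch of that arithmetic (hypothetical helpers, 8 bpc):
#include <stdlib.h>
static inline int clip_s8(const int v) { return v < -128 ? -128 : v > 127 ? 127 : v; }
static inline int clip_px(const int v) { return v <    0 ?    0 : v > 255 ? 255 : v; }
/* One filter step across the edge between p0 and q0. */
static void lpf_wd4_ref(int *const p1, int *const p0,
                        int *const q0, int *const q1, const int H)
{
    const int hev = abs(*p1 - *p0) > H || abs(*q1 - *q0) > H;
    int f = clip_s8(3 * (*q0 - *p0) + (hev ? clip_s8(*p1 - *q1) : 0));
    const int f1 = clip_s8(f + 4) >> 3;   /* adjusts q0 */
    const int f2 = clip_s8(f + 3) >> 3;   /* adjusts p0 */
    *p0 = clip_px(*p0 + f2);
    *q0 = clip_px(*q0 - f1);
    if (!hev) {                           /* only touch p1/q1 on smooth edges */
        f = (f1 + 1) >> 1;
        *p1 = clip_px(*p1 + f);
        *q1 = clip_px(*q1 - f);
    }
}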
.macro lpf_8_wd16
adr r8, 7f + CONFIG_THUMB
adr r9, 8f + CONFIG_THUMB
bl lpf_8_wd16_neon
.endm
.macro lpf_8_wd8
adr r9, 8f + CONFIG_THUMB
bl lpf_8_wd8_neon
.endm
.macro lpf_8_wd6
bl lpf_8_wd6_neon
.endm
.macro lpf_8_wd4
bl lpf_8_wd4_neon
.endm
function lpf_v_4_8_neon
mov r12, lr
sub r10, r0, r1, lsl #1
vld1.8 {d22}, [r10, :64], r1 // p1
vld1.8 {d24}, [r0, :64], r1 // q0
vld1.8 {d23}, [r10, :64], r1 // p0
vld1.8 {d25}, [r0, :64], r1 // q1
sub r0, r0, r1, lsl #1
lpf_8_wd4
sub r10, r0, r1, lsl #1
vst1.8 {d22}, [r10, :64], r1 // p1
vst1.8 {d24}, [r0, :64], r1 // q0
vst1.8 {d23}, [r10, :64], r1 // p0
vst1.8 {d25}, [r0, :64], r1 // q1
sub r0, r0, r1, lsl #1
bx r12
endfunc
function lpf_h_4_8_neon
mov r12, lr
sub r10, r0, #2
add r0, r10, r1, lsl #2
vld1.32 {d22[0]}, [r10], r1
vld1.32 {d22[1]}, [r0], r1
vld1.32 {d23[0]}, [r10], r1
vld1.32 {d23[1]}, [r0], r1
vld1.32 {d24[0]}, [r10], r1
vld1.32 {d24[1]}, [r0], r1
vld1.32 {d25[0]}, [r10], r1
vld1.32 {d25[1]}, [r0], r1
add r0, r0, #2
transpose_4x8b q11, q12, d22, d23, d24, d25
lpf_8_wd4
sub r10, r0, r1, lsl #3
sub r10, r10, #2
transpose_4x8b q11, q12, d22, d23, d24, d25
add r0, r10, r1, lsl #2
vst1.32 {d22[0]}, [r10], r1
vst1.32 {d22[1]}, [r0], r1
vst1.32 {d23[0]}, [r10], r1
vst1.32 {d23[1]}, [r0], r1
vst1.32 {d24[0]}, [r10], r1
vst1.32 {d24[1]}, [r0], r1
vst1.32 {d25[0]}, [r10], r1
vst1.32 {d25[1]}, [r0], r1
add r0, r0, #2
bx r12
endfunc
function lpf_v_6_8_neon
mov r12, lr
sub r10, r0, r1, lsl #1
sub r10, r10, r1
vld1.8 {d21}, [r10, :64], r1 // p2
vld1.8 {d24}, [r0, :64], r1 // q0
vld1.8 {d22}, [r10, :64], r1 // p1
vld1.8 {d25}, [r0, :64], r1 // q1
vld1.8 {d23}, [r10, :64], r1 // p0
vld1.8 {d26}, [r0, :64], r1 // q2
sub r0, r0, r1, lsl #1
sub r0, r0, r1
lpf_8_wd6
sub r10, r0, r1, lsl #1
vst1.8 {d22}, [r10, :64], r1 // p1
vst1.8 {d24}, [r0, :64], r1 // q0
vst1.8 {d23}, [r10, :64], r1 // p0
vst1.8 {d25}, [r0, :64], r1 // q1
sub r0, r0, r1, lsl #1
bx r12
endfunc
function lpf_h_6_8_neon
mov r12, lr
sub r10, r0, #4
add r0, r10, r1, lsl #2
vld1.8 {d20}, [r10], r1
vld1.8 {d24}, [r0], r1
vld1.8 {d21}, [r10], r1
vld1.8 {d25}, [r0], r1
vld1.8 {d22}, [r10], r1
vld1.8 {d26}, [r0], r1
vld1.8 {d23}, [r10], r1
vld1.8 {d27}, [r0], r1
add r0, r0, #4
transpose_8x8b q10, q11, q12, q13, d20, d21, d22, d23, d24, d25, d26, d27
lpf_8_wd6
sub r10, r0, r1, lsl #3
sub r10, r10, #2
transpose_4x8b q11, q12, d22, d23, d24, d25
add r0, r10, r1, lsl #2
vst1.32 {d22[0]}, [r10], r1
vst1.32 {d22[1]}, [r0], r1
vst1.32 {d23[0]}, [r10], r1
vst1.32 {d23[1]}, [r0], r1
vst1.32 {d24[0]}, [r10], r1
vst1.32 {d24[1]}, [r0], r1
vst1.32 {d25[0]}, [r10], r1
vst1.32 {d25[1]}, [r0], r1
add r0, r0, #2
bx r12
endfunc
function lpf_v_8_8_neon
mov r12, lr
sub r10, r0, r1, lsl #2
vld1.8 {d20}, [r10, :64], r1 // p3
vld1.8 {d24}, [r0, :64], r1 // q0
vld1.8 {d21}, [r10, :64], r1 // p2
vld1.8 {d25}, [r0, :64], r1 // q1
vld1.8 {d22}, [r10, :64], r1 // p1
vld1.8 {d26}, [r0, :64], r1 // q2
vld1.8 {d23}, [r10, :64], r1 // p0
vld1.8 {d27}, [r0, :64], r1 // q3
sub r0, r0, r1, lsl #2
lpf_8_wd8
sub r10, r0, r1, lsl #1
sub r10, r10, r1
vst1.8 {d21}, [r10, :64], r1 // p2
vst1.8 {d24}, [r0, :64], r1 // q0
vst1.8 {d22}, [r10, :64], r1 // p1
vst1.8 {d25}, [r0, :64], r1 // q1
vst1.8 {d23}, [r10, :64], r1 // p0
vst1.8 {d26}, [r0, :64], r1 // q2
sub r0, r0, r1, lsl #1
sub r0, r0, r1
bx r12
8:
sub r10, r0, r1, lsl #1
vst1.8 {d22}, [r10, :64], r1 // p1
vst1.8 {d24}, [r0, :64], r1 // q0
vst1.8 {d23}, [r10, :64], r1 // p0
vst1.8 {d25}, [r0, :64], r1 // q1
sub r0, r0, r1, lsl #1
bx r12
endfunc
function lpf_h_8_8_neon
mov r12, lr
sub r10, r0, #4
add r0, r10, r1, lsl #2
vld1.8 {d20}, [r10], r1
vld1.8 {d24}, [r0], r1
vld1.8 {d21}, [r10], r1
vld1.8 {d25}, [r0], r1
vld1.8 {d22}, [r10], r1
vld1.8 {d26}, [r0], r1
vld1.8 {d23}, [r10], r1
vld1.8 {d27}, [r0], r1
add r0, r0, #4
transpose_8x8b q10, q11, q12, q13, d20, d21, d22, d23, d24, d25, d26, d27
lpf_8_wd8
sub r10, r0, r1, lsl #3
sub r10, r10, #4
transpose_8x8b q10, q11, q12, q13, d20, d21, d22, d23, d24, d25, d26, d27
add r0, r10, r1, lsl #2
vst1.8 {d20}, [r10], r1
vst1.8 {d24}, [r0], r1
vst1.8 {d21}, [r10], r1
vst1.8 {d25}, [r0], r1
vst1.8 {d22}, [r10], r1
vst1.8 {d26}, [r0], r1
vst1.8 {d23}, [r10], r1
vst1.8 {d27}, [r0], r1
add r0, r0, #4
bx r12
8:
sub r10, r0, r1, lsl #3
sub r10, r10, #2
transpose_4x8b q11, q12, d22, d23, d24, d25
add r0, r10, r1, lsl #2
vst1.32 {d22[0]}, [r10], r1
vst1.32 {d22[1]}, [r0], r1
vst1.32 {d23[0]}, [r10], r1
vst1.32 {d23[1]}, [r0], r1
vst1.32 {d24[0]}, [r10], r1
vst1.32 {d24[1]}, [r0], r1
vst1.32 {d25[0]}, [r10], r1
vst1.32 {d25[1]}, [r0], r1
add r0, r0, #2
bx r12
endfunc
function lpf_v_16_8_neon
mov r12, lr
sub r10, r0, r1, lsl #3
add r10, r10, r1
vld1.8 {d17}, [r10, :64], r1 // p6
vld1.8 {d24}, [r0, :64], r1 // q0
vld1.8 {d18}, [r10, :64], r1 // p5
vld1.8 {d25}, [r0, :64], r1 // q1
vld1.8 {d19}, [r10, :64], r1 // p4
vld1.8 {d26}, [r0, :64], r1 // q2
vld1.8 {d20}, [r10, :64], r1 // p3
vld1.8 {d27}, [r0, :64], r1 // q3
vld1.8 {d21}, [r10, :64], r1 // p2
vld1.8 {d28}, [r0, :64], r1 // q4
vld1.8 {d22}, [r10, :64], r1 // p1
vld1.8 {d29}, [r0, :64], r1 // q5
vld1.8 {d23}, [r10, :64], r1 // p0
vld1.8 {d30}, [r0, :64], r1 // q6
sub r0, r0, r1, lsl #3
add r0, r0, r1
lpf_8_wd16
sub r10, r0, r1, lsl #2
sub r10, r10, r1, lsl #1
vst1.8 {d0}, [r10, :64], r1 // p5
vst1.8 {d6}, [r0, :64], r1 // q0
vst1.8 {d1}, [r10, :64], r1 // p4
vst1.8 {d7}, [r0, :64], r1 // q1
vst1.8 {d2}, [r10, :64], r1 // p3
vst1.8 {d8}, [r0, :64], r1 // q2
vst1.8 {d3}, [r10, :64], r1 // p2
vst1.8 {d9}, [r0, :64], r1 // q3
vst1.8 {d4}, [r10, :64], r1 // p1
vst1.8 {d10}, [r0, :64], r1 // q4
vst1.8 {d5}, [r10, :64], r1 // p0
vst1.8 {d11}, [r0, :64], r1 // q5
sub r0, r0, r1, lsl #2
sub r0, r0, r1, lsl #1
bx r12
7:
sub r10, r0, r1
sub r10, r10, r1, lsl #1
vst1.8 {d21}, [r10, :64], r1 // p2
vst1.8 {d24}, [r0, :64], r1 // q0
vst1.8 {d22}, [r10, :64], r1 // p1
vst1.8 {d25}, [r0, :64], r1 // q1
vst1.8 {d23}, [r10, :64], r1 // p0
vst1.8 {d26}, [r0, :64], r1 // q2
sub r0, r0, r1, lsl #1
sub r0, r0, r1
bx r12
8:
sub r10, r0, r1, lsl #1
vst1.8 {d22}, [r10, :64], r1 // p1
vst1.8 {d24}, [r0, :64], r1 // q0
vst1.8 {d23}, [r10, :64], r1 // p0
vst1.8 {d25}, [r0, :64], r1 // q1
sub r0, r0, r1, lsl #1
bx r12
endfunc
function lpf_h_16_8_neon
mov r12, lr
sub r10, r0, #8
vld1.8 {d16}, [r10, :64], r1
vld1.8 {d24}, [r0, :64], r1
vld1.8 {d17}, [r10, :64], r1
vld1.8 {d25}, [r0, :64], r1
vld1.8 {d18}, [r10, :64], r1
vld1.8 {d26}, [r0, :64], r1
vld1.8 {d19}, [r10, :64], r1
vld1.8 {d27}, [r0, :64], r1
vld1.8 {d20}, [r10, :64], r1
vld1.8 {d28}, [r0, :64], r1
vld1.8 {d21}, [r10, :64], r1
vld1.8 {d29}, [r0, :64], r1
vld1.8 {d22}, [r10, :64], r1
vld1.8 {d30}, [r0, :64], r1
vld1.8 {d23}, [r10, :64], r1
vld1.8 {d31}, [r0, :64], r1
transpose_8x8b q8, q9, q10, q11, d16, d17, d18, d19, d20, d21, d22, d23
transpose_8x8b q12, q13, q14, q15, d24, d25, d26, d27, d28, d29, d30, d31
lpf_8_wd16
sub r0, r0, r1, lsl #3
sub r10, r0, #8
transpose_8x8b q8, q0, q1, q2, d16, d17, d0, d1, d2, d3, d4, d5
transpose_8x8b q3, q4, q5, q15, d6, d7, d8, d9, d10, d11, d30, d31
vst1.8 {d16}, [r10, :64], r1
vst1.8 {d6}, [r0, :64], r1
vst1.8 {d17}, [r10, :64], r1
vst1.8 {d7}, [r0, :64], r1
vst1.8 {d0}, [r10, :64], r1
vst1.8 {d8}, [r0, :64], r1
vst1.8 {d1}, [r10, :64], r1
vst1.8 {d9}, [r0, :64], r1
vst1.8 {d2}, [r10, :64], r1
vst1.8 {d10}, [r0, :64], r1
vst1.8 {d3}, [r10, :64], r1
vst1.8 {d11}, [r0, :64], r1
vst1.8 {d4}, [r10, :64], r1
vst1.8 {d30}, [r0, :64], r1
vst1.8 {d5}, [r10, :64], r1
vst1.8 {d31}, [r0, :64], r1
bx r12
7:
sub r10, r0, r1, lsl #3
sub r10, r10, #4
transpose_8x8b q10, q11, q12, q13, d20, d21, d22, d23, d24, d25, d26, d27
add r0, r10, r1, lsl #2
vst1.8 {d20}, [r10], r1
vst1.8 {d24}, [r0], r1
vst1.8 {d21}, [r10], r1
vst1.8 {d25}, [r0], r1
vst1.8 {d22}, [r10], r1
vst1.8 {d26}, [r0], r1
vst1.8 {d23}, [r10], r1
vst1.8 {d27}, [r0], r1
add r0, r0, #4
bx r12
8:
sub r10, r0, r1, lsl #3
sub r10, r10, #2
transpose_4x8b q11, q12, d22, d23, d24, d25
add r0, r10, r1, lsl #2
vst1.32 {d22[0]}, [r10], r1
vst1.32 {d22[1]}, [r0], r1
vst1.32 {d23[0]}, [r10], r1
vst1.32 {d23[1]}, [r0], r1
vst1.32 {d24[0]}, [r10], r1
vst1.32 {d24[1]}, [r0], r1
vst1.32 {d25[0]}, [r10], r1
vst1.32 {d25[1]}, [r0], r1
add r0, r0, #2
bx r12
endfunc
// void dav1d_lpf_v_sb_y_neon(pixel *dst, const ptrdiff_t stride,
// const uint32_t *const vmask,
// const uint8_t (*l)[4], ptrdiff_t b4_stride,
// const Av1FilterLUT *lut, const int w)
.macro lpf_func dir, type
function lpf_\dir\()_sb_\type\()_neon, export=1
push {r4-r11,lr}
vpush {q4-q7}
ldrd r4, r5, [sp, #100]
ldrd r6, r7, [r2] // vmask[0], vmask[1]
.ifc \type, y
ldr r2, [r2, #8] // vmask[2]
.endif
add r5, r5, #128 // Move to sharp part of lut
.ifc \type, y
orr r7, r7, r2 // vmask[1] |= vmask[2]
.endif
.ifc \dir, v
sub r4, r3, r4, lsl #2
.else
sub r3, r3, #4
lsl r4, r4, #2
.endif
orr r6, r6, r7 // vmask[0] |= vmask[1]
1:
tst r6, #0x03
.ifc \dir, v
vld1.8 {d0}, [r4]!
vld1.8 {d1}, [r3]!
.else
vld2.32 {d0[0], d1[0]}, [r3], r4
vld2.32 {d0[1], d1[1]}, [r3], r4
.endif
beq 7f // if (!(vm & bits)) continue;
vld1.8 {d5[]}, [r5] // sharp[0]
add r5, r5, #8
vmov.i32 d2, #0xff
vdup.32 d13, r6 // vmask[0]
vand d0, d0, d2 // Keep only lowest byte in each 32 bit word
vand d1, d1, d2
vtst.8 d3, d1, d2 // Check for nonzero values in l[0][0]
vmov.i8 d4, #1
vld1.8 {d6[]}, [r5] // sharp[1]
sub r5, r5, #8
vbif d1, d0, d3 // if (!l[0][0]) L = l[offset][0]
vmul.i32 d1, d1, d4 // L
.ifc \type, y
vdup.32 d15, r2 // vmask[2]
.endif
vtst.32 d2, d1, d2 // L != 0
vdup.32 d14, r7 // vmask[1]
vmov r10, r11, d2
orrs r10, r10, r11
beq 7f // if (!L) continue;
vneg.s8 d5, d5 // -sharp[0]
movrel_local r10, word_12
vshr.u8 d12, d1, #4 // H
vld1.32 {d16}, [r10, :64]
vshl.s8 d3, d1, d5 // L >> sharp[0]
.ifc \type, y
vtst.32 d15, d15, d16 // if (vmask[2] & bits)
.endif
vmov.i8 d7, #2
vmin.u8 d3, d3, d6 // imin(L >> sharp[0], sharp[1])
vadd.i8 d0, d1, d7 // L + 2
vmax.u8 d11, d3, d4 // imax(imin(), 1) = limit = I
vadd.u8 d0, d0, d0 // 2*(L + 2)
vtst.32 d14, d14, d16 // if (vmask[1] & bits)
vadd.i8 d10, d0, d11 // 2*(L + 2) + limit = E
vtst.32 d13, d13, d16 // if (vmask[0] & bits)
vand d13, d13, d2 // vmask[0] &= L != 0
.ifc \type, y
tst r2, #0x03
beq 2f
// wd16
bl lpf_\dir\()_16_8_neon
b 8f
2:
.endif
tst r7, #0x03
beq 3f
.ifc \type, y
// wd8
bl lpf_\dir\()_8_8_neon
.else
// wd6
bl lpf_\dir\()_6_8_neon
.endif
b 8f
3:
// wd4
bl lpf_\dir\()_4_8_neon
.ifc \dir, h
b 8f
7:
// For dir h, the functions above increment r0.
// If the whole function is skipped, increment it here instead.
add r0, r0, r1, lsl #3
.else
7:
.endif
8:
lsrs r6, r6, #2 // vmask[0] >>= 2
lsr r7, r7, #2 // vmask[1] >>= 2
.ifc \type, y
lsr r2, r2, #2 // vmask[2] >>= 2
.endif
.ifc \dir, v
add r0, r0, #8
.else
// For dir h, r0 is returned incremented
.endif
bne 1b
vpop {q4-q7}
pop {r4-r11,pc}
endfunc
.endm
lpf_func v, y
lpf_func h, y
lpf_func v, uv
lpf_func h, uv
const word_12, align=4
.word 1, 2
endconst
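The superblock-wide wrappers above derive the per-edge thresholds from the 8-bit filter level L (taken from l[0][0], or from the previous block when that is 0) and the sharpness pair in the LUT, exactly as the commented NEON lines spell out. A scalar restatement of that derivation (hypothetical helper, 8 bpc):
#include <stdint.h>
/* E: edge threshold, I: inner threshold ("limit"), H: high-edge-variance threshold. */
static void derive_lpf_thresholds(const int L, const uint8_t sharp[2],
                                  int *const E, int *const I, int *const H)
{
    int limit = L >> sharp[0];                  /* L >> sharp[0]                  */
    if (limit > sharp[1]) limit = sharp[1];     /* imin(L >> sharp[0], sharp[1])  */
    if (limit < 1) limit = 1;                   /* imax(..., 1) = limit = I       */
    *I = limit;
    *E = 2 * (L + 2) + limit;                   /* 2*(L + 2) + limit = E          */
    *H = L >> 4;                                /* H                              */
}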
......@@ -34,11 +34,11 @@
.macro movrel_local rd, val, offset=0
#if defined(PIC)
ldr \rd, 1f
b 2f
1:
.word \val + \offset - (2f + 8 - 4 * CONFIG_THUMB)
2:
ldr \rd, 90001f
b 90002f
90001:
.word \val + \offset - (90002f + 8 - 4 * CONFIG_THUMB)
90002:
add \rd, \rd, pc
#else
movw \rd, #:lower16:\val+\offset
......@@ -84,4 +84,11 @@
vtrn.8 \r6, \r7
.endm
.macro transpose_4x8b q0, q1, r0, r1, r2, r3
vtrn.16 \q0, \q1
vtrn.8 \r0, \r1
vtrn.8 \r2, \r3
.endm
#endif /* DAV1D_SRC_ARM_32_UTIL_S */
......@@ -37,11 +37,11 @@ function lpf_16_wd\wd\()_neon
.if \wd >= 6
uabd v4.16b, v21.16b, v22.16b // abs(p2 - p1)
uabd v5.16b, v26.16b, v25.16b // abs(q2 - q1)
.endif
.if \wd >= 8
uabd v6.16b, v20.16b, v21.16b // abs(p3 - p2)
uabd v7.16b, v27.16b, v26.16b // abs(q3 - q2)
.endif
.endif
.if \wd >= 6
umax v4.16b, v4.16b, v5.16b
.endif
......@@ -70,7 +70,7 @@ function lpf_16_wd\wd\()_neon
.if \wd >= 6
and v14.16b, v14.16b, v1.16b // fm && wd > 4
.endif
.if \wd >= 6
.if \wd >= 16
and v15.16b, v15.16b, v1.16b // fm && wd == 16
.endif
......@@ -303,7 +303,6 @@ function lpf_16_wd\wd\()_neon
rshrn v13.8b, v8.8h, #3 // out q0
rshrn2 v13.16b, v9.8h, #3
add v8.8h, v8.8h, v6.8h
add v9.8h, v9.8h, v7.8h
sub v2.8h, v2.8h, v0.8h // q2 + q3 - p1 - q1
......@@ -420,6 +419,7 @@ function lpf_16_wd\wd\()_neon
sub v7.8h, v7.8h, v11.8h
uaddl v8.8h, v26.8b, v30.8b // q2 + q6
uaddl2 v9.8h, v26.16b, v30.16b
bif v0.16b, v18.16b, v15.16b // out p5
uaddl v10.8h, v18.8b, v23.8b // p5 + p0
uaddl2 v11.8h, v18.16b, v23.16b
rshrn v5.8b, v12.8h, #4 // out p0
......@@ -430,56 +430,55 @@ function lpf_16_wd\wd\()_neon
sub v9.8h, v9.8h, v11.8h
uaddl v10.8h, v27.8b, v30.8b // q3 + q6
uaddl2 v11.8h, v27.16b, v30.16b
bif v0.16b, v18.16b, v15.16b // out p5
uaddl v14.8h, v19.8b, v24.8b // p4 + q0
uaddl2 v18.8h, v19.16b, v24.16b
bif v1.16b, v19.16b, v15.16b // out p4
uaddl v18.8h, v19.8b, v24.8b // p4 + q0
uaddl2 v19.8h, v19.16b, v24.16b
rshrn v6.8b, v12.8h, #4 // out q0
rshrn2 v6.16b, v13.8h, #4
add v12.8h, v12.8h, v8.8h // - (p5 + p0) + (q2 + q6)
add v13.8h, v13.8h, v9.8h
sub v10.8h, v10.8h, v14.8h
sub v11.8h, v11.8h, v18.8h
uaddl v14.8h, v28.8b, v30.8b // q4 + q6
uaddl2 v18.8h, v28.16b, v30.16b
bif v1.16b, v19.16b, v15.16b // out p4
uaddl v8.8h, v20.8b, v25.8b // p3 + q1
uaddl2 v9.8h, v20.16b, v25.16b
sub v10.8h, v10.8h, v18.8h
sub v11.8h, v11.8h, v19.8h
uaddl v8.8h, v28.8b, v30.8b // q4 + q6
uaddl2 v9.8h, v28.16b, v30.16b
bif v2.16b, v20.16b, v15.16b // out p3
uaddl v18.8h, v20.8b, v25.8b // p3 + q1
uaddl2 v19.8h, v20.16b, v25.16b
rshrn v7.8b, v12.8h, #4 // out q1
rshrn2 v7.16b, v13.8h, #4
add v12.8h, v12.8h, v10.8h // - (p4 + q0) + (q3 + q6)
add v13.8h, v13.8h, v11.8h
sub v14.8h, v14.8h, v8.8h
sub v18.8h, v18.8h, v9.8h
sub v18.8h, v8.8h, v18.8h
sub v19.8h, v9.8h, v19.8h
uaddl v10.8h, v29.8b, v30.8b // q5 + q6
uaddl2 v11.8h, v29.16b, v30.16b
bif v2.16b, v20.16b, v15.16b // out p3
uaddl v19.8h, v21.8b, v26.8b // p2 + q2
uaddl2 v20.8h, v21.16b, v26.16b
bif v3.16b, v21.16b, v15.16b // out p2
uaddl v20.8h, v21.8b, v26.8b // p2 + q2
uaddl2 v21.8h, v21.16b, v26.16b
rshrn v8.8b, v12.8h, #4 // out q2
rshrn2 v8.16b, v13.8h, #4
add v12.8h, v12.8h, v14.8h // - (p3 + q1) + (q4 + q6)
add v13.8h, v13.8h, v18.8h
sub v10.8h, v10.8h, v19.8h
sub v11.8h, v11.8h, v20.8h
uaddl v14.8h, v30.8b, v30.8b // q6 + q6
uaddl2 v18.8h, v30.16b, v30.16b
bif v3.16b, v21.16b, v15.16b // out p2
uaddl v19.8h, v22.8b, v27.8b // p1 + q3
uaddl2 v20.8h, v22.16b, v27.16b
add v12.8h, v12.8h, v18.8h // - (p3 + q1) + (q4 + q6)
add v13.8h, v13.8h, v19.8h
sub v10.8h, v10.8h, v20.8h
sub v11.8h, v11.8h, v21.8h
uaddl v18.8h, v30.8b, v30.8b // q6 + q6
uaddl2 v19.8h, v30.16b, v30.16b
bif v4.16b, v22.16b, v15.16b // out p1
uaddl v20.8h, v22.8b, v27.8b // p1 + q3
uaddl2 v21.8h, v22.16b, v27.16b
rshrn v9.8b, v12.8h, #4 // out q3
rshrn2 v9.16b, v13.8h, #4
add v12.8h, v12.8h, v10.8h // - (p2 + q2) + (q5 + q6)
add v13.8h, v13.8h, v11.8h
sub v14.8h, v14.8h, v19.8h
sub v18.8h, v18.8h, v20.8h
bif v4.16b, v22.16b, v15.16b // out p1
sub v19.8h, v19.8h, v21.8h
bif v5.16b, v23.16b, v15.16b // out p0
rshrn v10.8b, v12.8h, #4 // out q4
rshrn2 v10.16b, v13.8h, #4
add v12.8h, v12.8h, v14.8h // - (p1 + q3) + (q6 + q6)
add v13.8h, v13.8h, v18.8h
add v12.8h, v12.8h, v18.8h // - (p1 + q3) + (q6 + q6)
add v13.8h, v13.8h, v19.8h
rshrn v11.8b, v12.8h, #4 // out q5
rshrn2 v11.16b, v13.8h, #4
bif v5.16b, v23.16b, v15.16b // out p0
bif v6.16b, v24.16b, v15.16b // out q0
bif v7.16b, v25.16b, v15.16b // out q1
bif v8.16b, v26.16b, v15.16b // out q2
......
......@@ -54,13 +54,14 @@ COLD void bitfn(dav1d_intra_pred_dsp_init_arm)(Dav1dIntraPredDSPContext *const c
if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
#if BITDEPTH == 8 && ARCH_AARCH64
#if BITDEPTH == 8
c->intra_pred[DC_PRED] = dav1d_ipred_dc_neon;
c->intra_pred[DC_128_PRED] = dav1d_ipred_dc_128_neon;
c->intra_pred[TOP_DC_PRED] = dav1d_ipred_dc_top_neon;
c->intra_pred[LEFT_DC_PRED] = dav1d_ipred_dc_left_neon;
c->intra_pred[HOR_PRED] = dav1d_ipred_h_neon;
c->intra_pred[VERT_PRED] = dav1d_ipred_v_neon;
#if ARCH_AARCH64
c->intra_pred[PAETH_PRED] = dav1d_ipred_paeth_neon;
c->intra_pred[SMOOTH_PRED] = dav1d_ipred_smooth_neon;
c->intra_pred[SMOOTH_V_PRED] = dav1d_ipred_smooth_v_neon;
......@@ -77,4 +78,5 @@ COLD void bitfn(dav1d_intra_pred_dsp_init_arm)(Dav1dIntraPredDSPContext *const c
c->pal_pred = dav1d_pal_pred_neon;
#endif
#endif
}
......@@ -38,7 +38,7 @@ COLD void bitfn(dav1d_loop_filter_dsp_init_arm)(Dav1dLoopFilterDSPContext *const
if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
#if BITDEPTH == 8 && ARCH_AARCH64
#if BITDEPTH == 8
c->loop_filter_sb[0][0] = dav1d_lpf_h_sb_y_neon;
c->loop_filter_sb[0][1] = dav1d_lpf_v_sb_y_neon;
c->loop_filter_sb[1][0] = dav1d_lpf_h_sb_uv_neon;
......
......@@ -524,6 +524,7 @@ static void order_palette(const uint8_t *pal_idx, const ptrdiff_t stride,
{
int have_top = i > first;
assert(pal_idx);
pal_idx += first + (i - first) * stride;
for (int j = first, n = 0; j >= last; have_top = 1, j--, n++, pal_idx += stride - 1) {
const int have_left = j > 0;
......@@ -586,6 +587,7 @@ static void read_pal_indices(Dav1dTileContext *const t,
{
Dav1dTileState *const ts = t->ts;
const ptrdiff_t stride = bw4 * 4;
assert(pal_idx);
pal_idx[0] = dav1d_msac_decode_uniform(&ts->msac, b->pal_sz[pl]);
uint16_t (*const color_map_cdf)[8] =
ts->cdf.m.color_map[pl][b->pal_sz[pl] - 2];
......@@ -1125,6 +1127,7 @@ static int decode_b(Dav1dTileContext *const t,
if (b->pal_sz[0]) {
uint8_t *pal_idx;
if (f->frame_thread.pass) {
assert(ts->frame_thread.pal_idx);
pal_idx = ts->frame_thread.pal_idx;
ts->frame_thread.pal_idx += bw4 * bh4 * 16;
} else
......@@ -1137,6 +1140,7 @@ static int decode_b(Dav1dTileContext *const t,
if (has_chroma && b->pal_sz[1]) {
uint8_t *pal_idx;
if (f->frame_thread.pass) {
assert(ts->frame_thread.pal_idx);
pal_idx = ts->frame_thread.pal_idx;
ts->frame_thread.pal_idx += cbw4 * cbh4 * 16;
} else
......@@ -1390,7 +1394,7 @@ static int decode_b(Dav1dTileContext *const t,
b->ref[1] = f->frame_hdr->skip_mode_refs[1];
b->comp_type = COMP_INTER_AVG;
b->inter_mode = NEARESTMV_NEARESTMV;
b->drl_idx = 0;
b->drl_idx = NEAREST_DRL;
has_subpel_filter = 0;
candidate_mv mvstack[8];
......@@ -1490,13 +1494,13 @@ static int decode_b(Dav1dTileContext *const t,
b->inter_mode, ctx, n_mvs, ts->msac.rng);
const uint8_t *const im = dav1d_comp_inter_pred_modes[b->inter_mode];
b->drl_idx = 0;
b->drl_idx = NEAREST_DRL;
if (b->inter_mode == NEWMV_NEWMV) {
if (n_mvs > 1) {
if (n_mvs > 1) { // NEARER, NEAR or NEARISH
const int drl_ctx_v1 = get_drl_context(mvstack, 0);
b->drl_idx += dav1d_msac_decode_bool_adapt(&ts->msac,
ts->cdf.m.drl_bit[drl_ctx_v1]);
if (b->drl_idx == 1 && n_mvs > 2) {
if (b->drl_idx == NEARER_DRL && n_mvs > 2) {
const int drl_ctx_v2 = get_drl_context(mvstack, 1);
b->drl_idx += dav1d_msac_decode_bool_adapt(&ts->msac,
ts->cdf.m.drl_bit[drl_ctx_v2]);
......@@ -1506,12 +1510,12 @@ static int decode_b(Dav1dTileContext *const t,
b->drl_idx, n_mvs, ts->msac.rng);
}
} else if (im[0] == NEARMV || im[1] == NEARMV) {
b->drl_idx = 1;
if (n_mvs > 2) {
b->drl_idx = NEARER_DRL;
if (n_mvs > 2) { // NEAR or NEARISH
const int drl_ctx_v2 = get_drl_context(mvstack, 1);
b->drl_idx += dav1d_msac_decode_bool_adapt(&ts->msac,
ts->cdf.m.drl_bit[drl_ctx_v2]);
if (b->drl_idx == 2 && n_mvs > 3) {
if (b->drl_idx == NEAR_DRL && n_mvs > 3) {
const int drl_ctx_v3 = get_drl_context(mvstack, 2);
b->drl_idx += dav1d_msac_decode_bool_adapt(&ts->msac,
ts->cdf.m.drl_bit[drl_ctx_v3]);
......@@ -1521,6 +1525,7 @@ static int decode_b(Dav1dTileContext *const t,
b->drl_idx, n_mvs, ts->msac.rng);
}
}
assert(b->drl_idx >= NEAREST_DRL && b->drl_idx <= NEARISH_DRL);
#define assign_comp_mv(idx, pfx) \
switch (im[idx]) { \
......@@ -1678,14 +1683,14 @@ static int decode_b(Dav1dTileContext *const t,
has_subpel_filter = 1;
if (dav1d_msac_decode_bool_adapt(&ts->msac,
ts->cdf.m.refmv_mode[(ctx >> 4) & 15]))
{
{ // NEAREST, NEARER, NEAR or NEARISH
b->inter_mode = NEARMV;
b->drl_idx = 1;
if (n_mvs > 2) {
b->drl_idx = NEARER_DRL;
if (n_mvs > 2) { // NEARER, NEAR or NEARISH
const int drl_ctx_v2 = get_drl_context(mvstack, 1);
b->drl_idx += dav1d_msac_decode_bool_adapt(&ts->msac,
ts->cdf.m.drl_bit[drl_ctx_v2]);
if (b->drl_idx == 2 && n_mvs > 3) {
if (b->drl_idx == NEAR_DRL && n_mvs > 3) { // NEAR or NEARISH
const int drl_ctx_v3 =
get_drl_context(mvstack, 2);
b->drl_idx += dav1d_msac_decode_bool_adapt(&ts->msac,
......@@ -1694,9 +1699,10 @@ static int decode_b(Dav1dTileContext *const t,
}
} else {
b->inter_mode = NEARESTMV;
b->drl_idx = 0;
b->drl_idx = NEAREST_DRL;
}
if (b->drl_idx >= 2) {
assert(b->drl_idx >= NEAREST_DRL && b->drl_idx <= NEARISH_DRL);
if (b->drl_idx >= NEAR_DRL) {
b->mv[0] = mvstack[b->drl_idx].this_mv;
} else {
b->mv[0] = mvlist[0][b->drl_idx];
......@@ -1711,20 +1717,22 @@ static int decode_b(Dav1dTileContext *const t,
} else {
has_subpel_filter = 1;
b->inter_mode = NEWMV;
b->drl_idx = 0;
if (n_mvs > 1) {
b->drl_idx = NEAREST_DRL;
if (n_mvs > 1) { // NEARER, NEAR or NEARISH
const int drl_ctx_v1 = get_drl_context(mvstack, 0);
b->drl_idx += dav1d_msac_decode_bool_adapt(&ts->msac,
ts->cdf.m.drl_bit[drl_ctx_v1]);
if (b->drl_idx == 1 && n_mvs > 2) {
if (b->drl_idx == NEARER_DRL && n_mvs > 2) { // NEAR or NEARISH
const int drl_ctx_v2 = get_drl_context(mvstack, 1);
b->drl_idx += dav1d_msac_decode_bool_adapt(&ts->msac,
ts->cdf.m.drl_bit[drl_ctx_v2]);
}
}
assert(b->drl_idx >= NEAREST_DRL && b->drl_idx <= NEARISH_DRL);
if (n_mvs > 1) {
b->mv[0] = mvstack[b->drl_idx].this_mv;
} else {
assert(!b->drl_idx);
b->mv[0] = mvlist[0][0];
fix_mv_precision(f->frame_hdr, &b->mv[0]);
}
......@@ -1972,7 +1980,7 @@ static int checked_decode_b(Dav1dTileContext *const t,
for (int p = 0; p < 1 + 2 * has_chroma; p++) {
const int ss_ver = p && f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
const int ss_hor = p && f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
const int stride = f->cur.stride[!!p];
const ptrdiff_t stride = f->cur.stride[!!p];
const int bx = t->bx & ~ss_hor;
const int by = t->by & ~ss_ver;
const int width = w4 << (2 - ss_hor + (bw4 == ss_hor));
......@@ -2318,10 +2326,15 @@ static void setup_tile(Dav1dTileState *const ts,
const int sb_shift = f->sb_shift;
const uint8_t *const size_mul = ss_size_mul[f->cur.p.layout];
ts->frame_thread.pal_idx =
&f->frame_thread.pal_idx[(size_t)tile_start_off * size_mul[1] / 4];
ts->frame_thread.cf = (uint8_t*)f->frame_thread.cf +
(((size_t)tile_start_off * size_mul[0]) >> !f->seq_hdr->hbd);
ts->frame_thread.pal_idx = f->frame_thread.pal_idx ?
&f->frame_thread.pal_idx[(size_t)tile_start_off * size_mul[1] / 4] :
NULL;
ts->frame_thread.cf = f->frame_thread.cf ?
(uint8_t*)f->frame_thread.cf +
(((size_t)tile_start_off * size_mul[0]) >> !f->seq_hdr->hbd) :
NULL;
dav1d_cdf_thread_copy(&ts->cdf, &f->in_cdf);
ts->last_qidx = f->frame_hdr->quant.yac;
memset(ts->last_delta_lf, 0, sizeof(ts->last_delta_lf));
......@@ -3106,12 +3119,18 @@ int dav1d_decode_frame(Dav1dFrameContext *const f) {
tile_idx++)
{
Dav1dTileState *const ts = &f->ts[tile_idx];
const int tile_start_off = f->frame_thread.tile_start_off[tile_idx];
ts->frame_thread.pal_idx = &f->frame_thread.pal_idx[tile_start_off * size_mul[1] / 4];
ts->frame_thread.cf = (uint8_t*)f->frame_thread.cf +
((tile_start_off * size_mul[0]) >> !f->seq_hdr->hbd);
const size_t tile_start_off =
(size_t) f->frame_thread.tile_start_off[tile_idx];
ts->frame_thread.pal_idx = f->frame_thread.pal_idx ?
&f->frame_thread.pal_idx[tile_start_off * size_mul[1] / 4] :
NULL;
ts->frame_thread.cf = f->frame_thread.cf ?
(uint8_t*)f->frame_thread.cf +
((tile_start_off * size_mul[0]) >> !f->seq_hdr->hbd) :
NULL;
if (f->n_tc > 0) {
unsigned row_sb_start = f->frame_hdr->tiling.row_start_sb[ts->tiling.row];
const unsigned row_sb_start =
f->frame_hdr->tiling.row_start_sb[ts->tiling.row];
atomic_init(&ts->progress, row_sb_start);
}
}
......
......@@ -431,19 +431,10 @@ static inline int av1_get_uni_p1_ctx(const BlockContext *const a,
static inline int get_drl_context(const candidate_mv *const ref_mv_stack,
const int ref_idx)
{
if (ref_mv_stack[ref_idx].weight >= 640)
    return ref_mv_stack[ref_idx + 1].weight < 640;

return ref_mv_stack[ref_idx + 1].weight < 640 ? 2 : 0;
}
static inline unsigned get_cur_frame_segid(const int by, const int bx,
......
......@@ -36,6 +36,6 @@
bitfn_decls(void dav1d_apply_grain, const Dav1dFilmGrainDSPContext *const dsp,
Dav1dPicture *const out,
const Dav1dPicture *const in);
Dav1dPicture *const in);
#endif /* DAV1D_SRC_FG_APPLY_H */
......@@ -91,7 +91,7 @@ static void generate_scaling(const int bitdepth,
#ifndef UNIT_TEST
void bitfn(dav1d_apply_grain)(const Dav1dFilmGrainDSPContext *const dsp,
Dav1dPicture *const out,
const Dav1dPicture *const in)
Dav1dPicture *const in)
{
const Dav1dFilmGrainData *const data = &out->frame_hdr->film_grain.data;
......@@ -143,7 +143,7 @@ void bitfn(dav1d_apply_grain)(const Dav1dFilmGrainDSPContext *const dsp,
const int cpw = (out->p.w + ss_x) >> ss_x;
const int is_id = out->seq_hdr->mtrx == DAV1D_MC_IDENTITY;
for (int row = 0; row < rows; row++) {
const pixel *const luma_src =
pixel *const luma_src =
((pixel *) in->data[0]) + row * BLOCK_SIZE * PXSTRIDE(in->stride[0]);
if (data->num_y_points) {
......@@ -153,7 +153,23 @@ void bitfn(dav1d_apply_grain)(const Dav1dFilmGrainDSPContext *const dsp,
out->p.w, scaling[0], grain_lut[0], bh, row HIGHBD_TAIL_SUFFIX);
}
if (!data->num_uv_points[0] && !data->num_uv_points[1] &&
!data->chroma_scaling_from_luma)
{
continue;
}
const int bh = (imin(out->p.h - row * BLOCK_SIZE, BLOCK_SIZE) + ss_y) >> ss_y;
// extend padding pixels
if (out->p.w & ss_x) {
pixel *ptr = luma_src;
for (int y = 0; y < bh; y++) {
ptr[out->p.w] = ptr[out->p.w - 1];
ptr += PXSTRIDE(in->stride[0]) << ss_y;
}
}
const ptrdiff_t uv_off = row * BLOCK_SIZE * PXSTRIDE(out->stride[1]) >> ss_y;
if (data->chroma_scaling_from_luma) {
for (int pl = 0; pl < 2; pl++)
......
......@@ -324,7 +324,9 @@ static void ipred_smooth_h_c(pixel *dst, const ptrdiff_t stride,
}
}
static int get_filter_strength(const int wh, const int angle, const int is_sm) {
static NOINLINE int get_filter_strength(const int wh, const int angle,
const int is_sm)
{
if (is_sm) {
if (wh <= 8) {
if (angle >= 64) return 2;
......@@ -357,10 +359,10 @@ static int get_filter_strength(const int wh, const int angle, const int is_sm) {
return 0;
}
static void filter_edge(pixel *const out, const int sz,
const int lim_from, const int lim_to,
const pixel *const in,
const int from, const int to, const unsigned strength)
static NOINLINE void filter_edge(pixel *const out, const int sz,
const int lim_from, const int lim_to,
const pixel *const in, const int from,
const int to, const int strength)
{
static const uint8_t kernel[3][5] = {
{ 0, 4, 8, 4, 0 },
......@@ -382,14 +384,13 @@ static void filter_edge(pixel *const out, const int sz,
out[i] = in[iclip(i, from, to - 1)];
}
static int get_upsample(const int blk_wh, const unsigned d, const int type) {
if (d >= 40) return 0;
return type ? (blk_wh <= 8) : (blk_wh <= 16);
static inline int get_upsample(const int wh, const int angle, const int is_sm) {
return angle < 40 && wh <= 16 >> is_sm;
}
static void upsample_edge(pixel *const out, const int hsz,
const pixel *const in, const int from, const int to
HIGHBD_DECL_SUFFIX)
static NOINLINE void upsample_edge(pixel *const out, const int hsz,
const pixel *const in, const int from,
const int to HIGHBD_DECL_SUFFIX)
{
static const int8_t kernel[4] = { -1, 9, 9, -1 };
int i;
......
......@@ -32,17 +32,6 @@
#include "dav1d/headers.h"
enum ObuType {
OBU_SEQ_HDR = 1,
OBU_TD = 2,
OBU_FRAME_HDR = 3,
OBU_TILE_GRP = 4,
OBU_METADATA = 5,
OBU_FRAME = 6,
OBU_REDUNDANT_FRAME_HDR = 7,
OBU_PADDING = 15,
};
enum ObuMetaType {
OBU_META_HDR_CLL = 1,
OBU_META_HDR_MDCV = 2,
......@@ -221,6 +210,13 @@ enum InterPredMode {
N_INTER_PRED_MODES,
};
enum DRL_PROXIMITY {
NEAREST_DRL,
NEARER_DRL,
NEAR_DRL,
NEARISH_DRL
};
enum CompInterPredMode {
NEARESTMV_NEARESTMV,
NEARMV_NEARMV,
......
......@@ -905,7 +905,6 @@ static void resize_c(pixel *dst, const ptrdiff_t dst_stride,
src_x += mx >> 14;
mx &= 0x3fff;
}
if (dst_w & 1) dst[dst_w] = dst[dst_w - 1];
dst += PXSTRIDE(dst_stride);
src += PXSTRIDE(src_stride);
......
......@@ -112,6 +112,8 @@ if is_asm_enabled
elif host_machine.cpu_family().startswith('arm')
libdav1d_sources += files(
'arm/32/cdef.S',
'arm/32/ipred.S',
'arm/32/loopfilter.S',
'arm/32/looprestoration.S',
'arm/32/mc.S',
)
......