Commits on Source (118)
......@@ -38,7 +38,7 @@ build-debian:
image: registry.videolan.org:5000/dav1d-debian-unstable:20190215130514
stage: build
tags:
- debian
- avx2
- amd64
script:
- meson build --buildtype release --werror
......@@ -173,7 +173,7 @@ build-win-arm64:
build-debian-aarch64:
stage: build
image: registry.videolan.org:5000/dav1d-debian-unstable-aarch64:20181122182457
image: registry.videolan.org/dav1d-debian-buster-aarch64:20200218203017
tags:
- aarch64
- debian
......@@ -184,7 +184,7 @@ build-debian-aarch64:
build-debian-aarch64-clang-5:
stage: build
image: registry.videolan.org:5000/dav1d-debian-unstable-aarch64:20181122182457
image: registry.videolan.org/dav1d-debian-buster-aarch64:20200218203017
tags:
- aarch64
- debian
......@@ -203,7 +203,7 @@ build-macos:
- cd build && meson test -v
build-debian-werror:
image: registry.videolan.org:5000/dav1d-debian-unstable-aarch64:20181122182457
image: registry.videolan.org/dav1d-debian-buster-aarch64:20200218203017
stage: build
tags:
- aarch64
......@@ -219,7 +219,7 @@ build-debian-armv7:
- armv7
- debian
script:
- meson build --buildtype debugoptimized --werror
- linux32 meson build --buildtype debugoptimized --werror
- ninja -C build
- cd build && meson test -v
......@@ -230,13 +230,13 @@ build-debian-armv7-clang-5:
- armv7
- debian
script:
- env CC=clang-5.0 CFLAGS='-integrated-as' meson build --buildtype release
- env CC=clang-5.0 CFLAGS='-integrated-as' linux32 meson build --buildtype release
- ninja -C build
- cd build && meson test -v
build-ubuntu-snap:
stage: build
image: registry.videolan.org:5000/dav1d-ubuntu-bionic:20190221154127
image: registry.videolan.org/dav1d-ubuntu-bionic:20200121182340
tags:
- debian
- amd64
......@@ -292,7 +292,7 @@ test-debian-unaligned-stack:
stage: test
needs: ["build-debian"]
tags:
- debian
- avx2
- amd64
cache:
key: testdata.git-20190215
......@@ -382,7 +382,7 @@ test-win64:
stage: test
needs: ["build-win64"]
tags:
- debian
- avx2
- amd64
cache:
key: testdata.git-20190215
......@@ -403,7 +403,7 @@ test-win64:
dependencies: []
test-debian-aarch64:
image: registry.videolan.org:5000/dav1d-debian-unstable-aarch64:20181122182457
image: registry.videolan.org/dav1d-debian-buster-aarch64:20200218203017
stage: test
needs: ["build-debian-aarch64"]
tags:
......@@ -464,7 +464,7 @@ test-debian-armv7-clang-5:
- test -d cache/dav1d-test-data.git && GIT_DIR=cache/dav1d-test-data.git git fetch --refmap=refs/heads/master:refs/heads/master origin master
- test -d cache/dav1d-test-data.git || git clone --bare https://code.videolan.org/videolan/dav1d-test-data.git cache/dav1d-test-data.git
- git clone cache/dav1d-test-data.git tests/dav1d-test-data
- env CC=clang-5.0 CFLAGS='-integrated-as' meson build --buildtype release
- env CC=clang-5.0 CFLAGS='-integrated-as' linux32 meson build --buildtype release
-Dtestdata_tests=true
-Dlogging=false
- ninja -C build
......
Changes for 0.6.0 'Gyrfalcon':
------------------------------
0.6.0 is a major release for dav1d:
- New ARM64 optimizations for 10/12 bit depth:
   - mc_avg, mc_w_avg, mc_mask
   - mc_put/mc_prep 8tap/bilin
   - mc_warp_8x8
   - mc_w_mask
   - mc_blend
   - wiener
   - SGR
   - loopfilter
   - cdef
- New AVX-512 optimizations for prep_bilin, prep_8tap, cdef_filter, mc_avg/w_avg/mask
- New SSSE3 optimizations for film grain
- New AVX2 optimizations for msac_adapt16
- Fix rare mismatches against the reference decoder, notably because of clipping
- Improvements to the ARM64 msac, cdef and looprestoration optimizations
- Improvements to the AVX2 optimizations for cdef_filter
- Improvements to the C versions of itxfm and cdef_filter
Changes for 0.5.2 'Asiatic Cheetah':
------------------------------------
......@@ -32,7 +55,7 @@ and improving speed significantly:
- NEON optimizations for CDEF and warp on ARM32
- SSE2 optimizations for MSAC hi_tok decoding
- SSSE3 optimizations for deblocking loopfilters and warp_affine
- AVX-2 optimizations for film grain and ipred_z2
- AVX2 optimizations for film grain and ipred_z2
- SSE4 optimizations for warp_affine
- VSX optimizations for wiener
- Fix inverse transform overflows in x86 and NEON asm
......@@ -81,7 +104,7 @@ Changes for 0.2.2 (0.3.0-rc) 'Antelope':
-----------------------------
- Large improvement on MSAC decoding with SSE, bringing 4-6% speed increase
The impact is important on SSSE3, SSE4 and AVX-2 cpus
The impact is significant on SSSE3, SSE4 and AVX2 CPUs
- SSSE3 optimizations for all blocks size in itx
- SSSE3 optimizations for ipred_paeth and ipred_cfl (420, 422 and 444)
- Speed improvements on CDEF for SSE4 CPUs
......@@ -93,7 +116,7 @@ Changes for 0.2.1 'Antelope':
----------------------------
- SSSE3 optimization for cdef_dir
- AVX-2 improvements of the existing CDEF optimizations
- AVX2 improvements of the existing CDEF optimizations
- NEON improvements of the existing CDEF and wiener optimizations
- Clarification about the numbering/versioning scheme
......@@ -103,7 +126,7 @@ Changes for 0.2.0 'Antelope':
- ARM64 and ARM optimizations using NEON instructions
- SSSE3 optimizations for both 32 and 64bits
- More AVX-2 assembly, reaching almost completion
- More AVX2 assembly, reaching almost completion
- Fix installation of includes
- Rewrite inverse transforms to avoid overflows
- Snap packaging for Linux
......@@ -118,6 +141,6 @@ Initial release of dav1d, the fast and small AV1 decoder.
- Support for all features of the AV1 bitstream
- Support for all bitdepths: 8, 10 and 12 bits
- Support for all chroma subsamplings: 4:2:0, 4:2:2, 4:4:4 *and* grayscale
- Full acceleration for AVX-2 64bits processors, making it the fastest decoder
- Full acceleration for AVX2 64bits processors, making it the fastest decoder
- Partial acceleration for SSSE3 processors
- Partial acceleration for NEON processors
......@@ -73,7 +73,7 @@ The [VideoLAN Code of Conduct](https://wiki.videolan.org/CoC) applies to this pr
# Compile
1. Install [Meson](https://mesonbuild.com/) (0.47 or higher), [Ninja](https://ninja-build.org/), and, for x86\* targets, [nasm](https://nasm.us/) (2.13.02 or higher)
1. Install [Meson](https://mesonbuild.com/) (0.47 or higher), [Ninja](https://ninja-build.org/), and, for x86\* targets, [nasm](https://nasm.us/) (2.14 or higher)
2. Run `mkdir build && cd build` to create a build directory and enter it
3. Run `meson ..` to configure meson; add `--default-library=static` if static linking is desired
4. Run `ninja` to compile
......
......@@ -43,15 +43,18 @@
#endif
#if ARCH_X86_64
/* x86-64 needs 32-byte alignment for AVX2. */
/* x86-64 needs 32- and 64-byte alignment for AVX2 and AVX-512. */
#define ALIGN_64_VAL 64
#define ALIGN_32_VAL 32
#define ALIGN_16_VAL 16
#elif ARCH_X86_32 || ARCH_ARM || ARCH_AARCH64 || ARCH_PPC64LE
/* ARM doesn't benefit from anything more than 16-byte alignment. */
#define ALIGN_64_VAL 16
#define ALIGN_32_VAL 16
#define ALIGN_16_VAL 16
#else
/* No need for extra alignment on platforms without assembly. */
#define ALIGN_64_VAL 8
#define ALIGN_32_VAL 8
#define ALIGN_16_VAL 8
#endif
......@@ -76,9 +79,10 @@
* becomes:
* ALIGN_STK_$align(uint8_t, var, 1, [2][3][4])
*/
#define ALIGN_STK_64(type, var, sz1d, sznd) \
ALIGN(type var[sz1d]sznd, ALIGN_64_VAL)
#define ALIGN_STK_32(type, var, sz1d, sznd) \
ALIGN(type var[sz1d]sznd, ALIGN_32_VAL)
// as long as stack is itself 16-byte aligned, this works (win64, gcc)
#define ALIGN_STK_16(type, var, sz1d, sznd) \
ALIGN(type var[sz1d]sznd, ALIGN_16_VAL)
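A minimal usage sketch of the new 64-byte stack-alignment macro (hypothetical declaration, assuming the GCC-style ALIGN() defined earlier in this header):

    /* Hypothetical example: a 64-byte aligned stack buffer for AVX-512 code.
     * ALIGN_STK_64(int16_t, tmp, 64, [64]) expands to
     * ALIGN(int16_t tmp[64][64], ALIGN_64_VAL), i.e. with GCC:
     * int16_t tmp[64][64] __attribute__((aligned(64)));
     * on targets where ALIGN_64_VAL is 16 or 8, the request shrinks accordingly. */
    ALIGN_STK_64(int16_t, tmp, 64, [64]);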
......@@ -92,6 +96,12 @@
#define NOINLINE __attribute__((noinline))
#endif /* !_MSC_VER */
#ifdef __clang__
#define NO_SANITIZE(x) __attribute__((no_sanitize(x)))
#else
#define NO_SANITIZE(x)
#endif
#if defined(NDEBUG) && (defined(__GNUC__) || defined(__clang__))
#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
#elif defined(NDEBUG) && defined(_MSC_VER)
......
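For reference, a hedged sketch of how the NO_SANITIZE() helper added above can be applied (hypothetical function, not taken from this diff); under clang it expands to the no_sanitize attribute, on other compilers to nothing:

    #include "common/attributes.h"

    /* Hypothetical example: let intentional unsigned wraparound pass UBSan. */
    static NO_SANITIZE("unsigned-integer-overflow")
    unsigned wrapping_add(const unsigned a, const unsigned b) {
        return a + b; /* may wrap on purpose */
    }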
......@@ -31,6 +31,8 @@
#include <stdint.h>
#include <string.h>
#include "common/attributes.h"
#if !defined(BITDEPTH)
typedef void pixel;
typedef void coef;
......@@ -47,12 +49,14 @@ typedef int16_t coef;
#define iclip_pixel iclip_u8
#define PIX_HEX_FMT "%02x"
#define bitfn(x) x##_8bpc
#define PXSTRIDE(x) x
#define BF(x, suffix) x##_8bpc_##suffix
#define PXSTRIDE(x) (x)
#define highbd_only(x)
#define HIGHBD_DECL_SUFFIX /* nothing */
#define HIGHBD_CALL_SUFFIX /* nothing */
#define HIGHBD_TAIL_SUFFIX /* nothing */
#define bitdepth_from_max(x) 8
#define BITDEPTH_MAX 0xff
#elif BITDEPTH == 16
typedef uint16_t pixel;
typedef int32_t coef;
......@@ -69,8 +73,13 @@ static inline void pixel_set(pixel *const dst, const int val, const int num) {
#define HIGHBD_CALL_SUFFIX , f->bitdepth_max
#define HIGHBD_TAIL_SUFFIX , bitdepth_max
#define bitdepth_from_max(bitdepth_max) (32 - clz(bitdepth_max))
#define BITDEPTH_MAX bitdepth_max
#define bitfn(x) x##_16bpc
#define PXSTRIDE(x) (x >> 1)
#define BF(x, suffix) x##_16bpc_##suffix
static inline ptrdiff_t PXSTRIDE(const ptrdiff_t x) {
assert(!(x & 1));
return x >> 1;
}
#define highbd_only(x) x
#else
#error invalid value for bitdepth
......
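To illustrate what the new BF() macro and the stricter PXSTRIDE() give a template consumer, here is a minimal sketch (hypothetical names, assuming the unit is compiled once per BITDEPTH as elsewhere in dav1d):

    #include <stddef.h>
    #define BITDEPTH 8            /* or 16; set by the build, one object per depth */
    #include "common/bitdepth.h"

    /* BF(my_filter, neon) expands to my_filter_8bpc_neon / my_filter_16bpc_neon. */
    void BF(my_filter, neon)(pixel *dst, ptrdiff_t dst_stride, int w, int h);

    static void clear_row(pixel *const dst, const ptrdiff_t byte_stride, const int w) {
        /* PXSTRIDE() converts a byte stride to a pixel stride: identity at 8 bpc,
         * byte_stride >> 1 at 16 bpc, which now asserts that the stride is even. */
        pixel_set(dst + PXSTRIDE(byte_stride), 0, w);
    }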
......@@ -318,8 +318,8 @@ typedef struct Dav1dFilmGrainData {
int scaling_shift;
int ar_coeff_lag;
int8_t ar_coeffs_y[24];
int8_t ar_coeffs_uv[2][25];
int ar_coeff_shift;
int8_t ar_coeffs_uv[2][25 + 3 /* padding for alignment purposes */];
uint64_t ar_coeff_shift;
int grain_scale_shift;
int uv_mult[2];
int uv_luma_mult[2];
......@@ -329,13 +329,13 @@ typedef struct Dav1dFilmGrainData {
} Dav1dFilmGrainData;
typedef struct Dav1dFrameHeader {
enum Dav1dFrameType frame_type; ///< type of the picture
int width[2 /* { coded_width, superresolution_upscaled_width } */], height;
int frame_offset; ///< frame number
struct {
int present, update;
Dav1dFilmGrainData data;
int present, update;
} film_grain; ///< film grain parameters
enum Dav1dFrameType frame_type; ///< type of the picture
int width[2 /* { coded_width, superresolution_upscaled_width } */], height;
int frame_offset; ///< frame number
int temporal_id, spatial_id; ///< spatial and temporal id of the frame for SVC
int show_existing_frame;
......
......@@ -23,14 +23,14 @@
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
project('dav1d', ['c'],
version: '0.5.2',
version: '0.6.0',
default_options: ['c_std=c99',
'warning_level=2',
'buildtype=release',
'b_ndebug=if-release'],
meson_version: '>= 0.47.0')
dav1d_soname_version = '3.1.0'
dav1d_soname_version = '4.0.0'
dav1d_api_version_array = dav1d_soname_version.split('.')
dav1d_api_version_major = dav1d_api_version_array[0]
dav1d_api_version_minor = dav1d_api_version_array[1]
......@@ -84,13 +84,15 @@ test_args = []
optional_arguments = []
# Define _POSIX_C_SOURCE to POSIX.1–2001 (IEEE Std 1003.1-2001)
test_args += '-D_POSIX_C_SOURCE=200112L'
add_project_arguments('-D_POSIX_C_SOURCE=200112L', language: 'c')
if host_machine.system() == 'darwin'
if host_machine.system() == 'linux'
test_args += '-D_GNU_SOURCE'
add_project_arguments('-D_GNU_SOURCE', language: 'c')
elif host_machine.system() == 'darwin'
test_args += '-D_DARWIN_C_SOURCE'
add_project_arguments('-D_DARWIN_C_SOURCE', language: 'c')
else
test_args += '-D_POSIX_C_SOURCE=200112L'
add_project_arguments('-D_POSIX_C_SOURCE=200112L', language: 'c')
endif
if host_machine.system() == 'windows'
......@@ -131,6 +133,15 @@ else
endif
endif
libdl_dependency = []
if host_machine.system() == 'linux'
libdl_dependency = cc.find_library('dl', required : false)
if cc.has_function('dlsym', prefix : '#include <dlfcn.h>', args : test_args, dependencies : libdl_dependency)
cdata.set('HAVE_DLSYM', 1)
endif
endif
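Since the check above only sets HAVE_DLSYM when dlsym() is actually usable, a consumer has to guard both the include and the call; a hedged sketch (hypothetical helper, assuming cdata ends up in the generated config.h as elsewhere in the build):

    #include <stddef.h>
    #include "config.h"
    #ifdef HAVE_DLSYM
    #include <dlfcn.h> /* RTLD_DEFAULT is exposed by _GNU_SOURCE on glibc, added above for Linux */
    #endif

    /* Hypothetical helper: resolve an optional symbol at runtime, NULL when unavailable. */
    static void *get_optional_sym(const char *const name) {
    #ifdef HAVE_DLSYM
        return dlsym(RTLD_DEFAULT, name);
    #else
        (void)name;
        return NULL;
    #endif
    }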
# Header checks
stdatomic_dependency = []
......@@ -257,12 +268,12 @@ if host_machine.cpu_family().startswith('x86')
if get_option('stack_alignment') > 0
stack_alignment = get_option('stack_alignment')
elif host_machine.cpu_family() == 'x86_64'
if cc.has_argument('-mpreferred-stack-boundary=5')
stackalign_flag = ['-mpreferred-stack-boundary=5']
if cc.has_argument('-mpreferred-stack-boundary=6')
stackalign_flag = ['-mpreferred-stack-boundary=6']
stackrealign_flag = ['-mincoming-stack-boundary=4']
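# Note: -mpreferred-stack-boundary=N keeps the stack aligned to 2^N bytes, so 6
# requests the 64-byte alignment wanted for AVX-512, while
# -mincoming-stack-boundary=4 only assumes 2^4 = 16 bytes from callers.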
stack_alignment = 32
elif cc.has_argument('-mstack-alignment=32')
stackalign_flag = ['-mstack-alignment=32']
elif cc.has_argument('-mstack-alignment=64')
stackalign_flag = ['-mstack-alignment=64']
stackrealign_flag = ['-mstackrealign']
stack_alignment = 32
else
......@@ -364,8 +375,8 @@ if is_asm_enabled and host_machine.cpu_family().startswith('x86')
out = nasm_r.stdout().strip().split()
if out[1].to_lower() == 'version'
if out[2].version_compare('<2.13.02')
error('nasm 2.13.02 or later is required, found nasm @0@'.format(out[2]))
if out[2].version_compare('<2.14')
error('nasm 2.14 or later is required, found nasm @0@'.format(out[2]))
endif
else
error('unexpected nasm version string: @0@'.format(nasm_r.stdout()))
......@@ -390,7 +401,7 @@ if is_asm_enabled and host_machine.cpu_family().startswith('x86')
depfile: '@BASENAME@.obj.ndep',
arguments: [
'-f', nasm_format,
'-I', '@SOURCE_DIR@/src/',
'-I', '@0@/src/'.format(meson.current_source_dir()),
'-I', '@0@/'.format(meson.current_build_dir()),
'-MQ', '@OUTPUT@', '-MF', '@DEPFILE@',
'@EXTRA_ARGS@',
......
......@@ -148,20 +148,22 @@
.endif
.endm
// void dav1d_cdef_paddingX_neon(uint16_t *tmp, const pixel *src,
// ptrdiff_t src_stride, const pixel (*left)[2],
// /*const*/ pixel *const top[2], int h,
// enum CdefEdgeFlags edges);
// void dav1d_cdef_paddingX_8bpc_neon(uint16_t *tmp, const pixel *src,
// ptrdiff_t src_stride, const pixel (*left)[2],
// const pixel *const top, int h,
// enum CdefEdgeFlags edges);
// n1 = s0/d0
// w1 = d0/q0
// n2 = s4/d2
// w2 = d2/q1
.macro padding_func w, stride, n1, w1, n2, w2, align
function cdef_padding\w\()_neon, export=1
function cdef_padding\w\()_8bpc_neon, export=1
push {r4-r7,lr}
ldrd r4, r5, [sp, #20]
ldr r6, [sp, #28]
cmp r6, #0xf // fully edged
beq cdef_padding\w\()_edged_8bpc_neon
vmov.i16 q3, #0x8000
tst r6, #4 // CDEF_HAVE_TOP
bne 1f
......@@ -175,10 +177,9 @@ function cdef_padding\w\()_neon, export=1
b 3f
1:
// CDEF_HAVE_TOP
ldr r7, [r4]
ldr lr, [r4, #4]
add r7, r4, r2
sub r0, r0, #2*(2*\stride)
pad_top_bottom r7, lr, \w, \stride, \n1, \w1, \n2, \w2, \align, 0
pad_top_bottom r4, r7, \w, \stride, \n1, \w1, \n2, \w2, \align, 0
// Middle section
3:
......@@ -267,6 +268,65 @@ endfunc
padding_func 8, 16, d0, q0, d2, q1, 128
padding_func 4, 8, s0, d0, s4, d2, 64
// void cdef_paddingX_edged_8bpc_neon(uint16_t *tmp, const pixel *src,
// ptrdiff_t src_stride, const pixel (*left)[2],
// const pixel *const top, int h,
// enum CdefEdgeFlags edges);
.macro padding_func_edged w, stride, reg, align
function cdef_padding\w\()_edged_8bpc_neon
sub r0, r0, #(2*\stride)
ldrh r12, [r4, #-2]
vldr \reg, [r4]
add r7, r4, r2
strh r12, [r0, #-2]
ldrh r12, [r4, #\w]
vstr \reg, [r0]
strh r12, [r0, #\w]
ldrh r12, [r7, #-2]
vldr \reg, [r7]
strh r12, [r0, #\stride-2]
ldrh r12, [r7, #\w]
vstr \reg, [r0, #\stride]
strh r12, [r0, #\stride+\w]
add r0, r0, #2*\stride
0:
ldrh r12, [r3], #2
vldr \reg, [r1]
str r12, [r0, #-2]
ldrh r12, [r1, #\w]
add r1, r1, r2
subs r5, r5, #1
vstr \reg, [r0]
str r12, [r0, #\w]
add r0, r0, #\stride
bgt 0b
ldrh r12, [r1, #-2]
vldr \reg, [r1]
add r7, r1, r2
strh r12, [r0, #-2]
ldrh r12, [r1, #\w]
vstr \reg, [r0]
strh r12, [r0, #\w]
ldrh r12, [r7, #-2]
vldr \reg, [r7]
strh r12, [r0, #\stride-2]
ldrh r12, [r7, #\w]
vstr \reg, [r0, #\stride]
strh r12, [r0, #\stride+\w]
pop {r4-r7,pc}
endfunc
.endm
padding_func_edged 8, 16, d0, 64
padding_func_edged 4, 8, s0, 32
.macro dir_table w, stride
const directions\w
.byte -1 * \stride + 1, -2 * \stride + 2
......@@ -311,14 +371,13 @@ endconst
vld1.16 {\d22}, [r9] // p1
.endif
.endm
.macro handle_pixel s1, s2, threshold, thresh_vec, shift, tap
cmp \threshold, #0
.macro handle_pixel s1, s2, thresh_vec, shift, tap, min
.if \min
vmin.u16 q2, q2, \s1
vmax.s16 q3, q3, \s1
vmin.u16 q2, q2, \s2
vmax.s16 q3, q3, \s2
beq 3f
.endif
vabd.u16 q8, q0, \s1 // abs(diff)
vabd.u16 q11, q0, \s2 // abs(diff)
vshl.u16 q9, q8, \shift // abs(diff) >> shift
......@@ -326,7 +385,7 @@ endconst
vqsub.u16 q9, \thresh_vec, q9 // clip = imax(0, threshold - (abs(diff) >> shift))
vqsub.u16 q12, \thresh_vec, q12// clip = imax(0, threshold - (abs(diff) >> shift))
vsub.i16 q10, \s1, q0 // diff = p0 - px
vsub.u16 q13, \s2, q0 // diff = p1 - px
vsub.i16 q13, \s2, q0 // diff = p1 - px
vneg.s16 q8, q9 // -clip
vneg.s16 q11, q12 // -clip
vmin.s16 q10, q10, q9 // imin(diff, clip)
......@@ -336,36 +395,44 @@ endconst
vmax.s16 q13, q13, q11 // constrain() = imax(imin(diff, clip), -clip)
vmla.i16 q1, q10, q9 // sum += taps[k] * constrain()
vmla.i16 q1, q13, q9 // sum += taps[k] * constrain()
3:
.endm
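For readers following the annotations, a rough C sketch of what one handle_pixel tap computes (imin/imax spelled out here; this mirrors the comments above rather than quoting dav1d's C code):

    #include <stdlib.h>

    static inline int imin(const int a, const int b) { return a < b ? a : b; }
    static inline int imax(const int a, const int b) { return a > b ? a : b; }

    /* clip = imax(0, threshold - (abs(diff) >> shift)); keep the sign of diff. */
    static inline int constrain(const int diff, const int threshold, const int shift) {
        const int clip = imax(0, threshold - (abs(diff) >> shift));
        return imax(imin(diff, clip), -clip);
    }

    /* Per tap the macro accumulates, for both p0 and p1:
     *   sum += taps[k] * constrain(p - px, threshold, shift);
     * and the filter below writes px + ((8 + sum - (sum < 0)) >> 4), clamped to
     * the running min/max when both pri and sec strengths are applied. */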
// void dav1d_cdef_filterX_neon(pixel *dst, ptrdiff_t dst_stride,
// const uint16_t *tmp, int pri_strength,
// int sec_strength, int dir, int damping, int h);
.macro filter w
function cdef_filter\w\()_neon, export=1
push {r4-r9,lr}
vpush {q4-q7}
ldrd r4, r5, [sp, #92]
ldrd r6, r7, [sp, #100]
// void dav1d_cdef_filterX_8bpc_neon(pixel *dst, ptrdiff_t dst_stride,
// const uint16_t *tmp, int pri_strength,
// int sec_strength, int dir, int damping,
// int h, size_t edges);
.macro filter_func w, pri, sec, min, suffix
function cdef_filter\w\suffix\()_neon
cmp r8, #0xf
beq cdef_filter\w\suffix\()_edged_neon
.if \pri
movrel_local r8, pri_taps
and r9, r3, #1
add r8, r8, r9, lsl #1
.endif
movrel_local r9, directions\w
add r5, r9, r5, lsl #1
vmov.u16 d17, #15
vdup.16 d16, r6 // damping
.if \pri
vdup.16 q5, r3 // threshold
.endif
.if \sec
vdup.16 q7, r4 // threshold
.endif
vmov.16 d8[0], r3
vmov.16 d8[1], r4
vclz.i16 d8, d8 // clz(threshold)
vsub.i16 d8, d17, d8 // ulog2(threshold)
vqsub.u16 d8, d16, d8 // shift = imax(0, damping - ulog2(threshold))
vneg.s16 d8, d8 // -shift
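// Worked example: pri_strength = 4 gives clz16(4) = 13, 15 - 13 = 2 = ulog2(4),
// so shift = imax(0, damping - 2); the negated value lets vshl act as a right shift.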
.if \sec
vdup.16 q6, d8[1]
.endif
.if \pri
vdup.16 q4, d8[0]
.endif
1:
.if \w == 8
......@@ -377,47 +444,64 @@ function cdef_filter\w\()_neon, export=1
.endif
vmov.u16 q1, #0 // sum
.if \min
vmov.u16 q2, q0 // min
vmov.u16 q3, q0 // max
.endif
// Instead of loading sec_taps 2, 1 from memory, just set it
// to 2 initially and decrease for the second round.
// This is also used as loop counter.
mov lr, #2 // sec_taps[0]
2:
.if \pri
ldrsb r9, [r5] // off1
load_px d28, d29, d30, d31, \w
.endif
.if \sec
add r5, r5, #4 // +2*2
ldrsb r9, [r5] // off2
.endif
.if \pri
ldrb r12, [r8] // *pri_taps
handle_pixel q14, q15, r3, q5, q4, r12
handle_pixel q14, q15, q5, q4, r12, \min
.endif
.if \sec
load_px d28, d29, d30, d31, \w
add r5, r5, #8 // +2*4
ldrsb r9, [r5] // off3
handle_pixel q14, q15, r4, q7, q6, lr
handle_pixel q14, q15, q7, q6, lr, \min
load_px d28, d29, d30, d31, \w
handle_pixel q14, q15, r4, q7, q6, lr
handle_pixel q14, q15, q7, q6, lr, \min
sub r5, r5, #11 // x8 -= 2*(2+4); x8 += 1;
sub r5, r5, #11 // r5 -= 2*(2+4); r5 += 1;
.else
add r5, r5, #1 // r5 += 1
.endif
subs lr, lr, #1 // sec_tap-- (value)
.if \pri
add r8, r8, #1 // pri_taps++ (pointer)
.endif
bne 2b
vshr.s16 q14, q1, #15 // -(sum < 0)
vadd.i16 q1, q1, q14 // sum - (sum < 0)
vrshr.s16 q1, q1, #4 // (8 + sum - (sum < 0)) >> 4
vadd.i16 q0, q0, q1 // px + (8 + sum ...) >> 4
.if \min
vmin.s16 q0, q0, q3
vmax.s16 q0, q0, q2 // iclip(px + .., min, max)
.endif
vmovn.u16 d0, q0
.if \w == 8
add r2, r2, #2*16 // tmp += tmp_stride
......@@ -430,9 +514,11 @@ function cdef_filter\w\()_neon, export=1
vst1.32 {d0[1]}, [r0, :32], r1
.endif
// Reset pri_taps/sec_taps back to the original point
// Reset pri_taps and directions back to the original point
sub r5, r5, #2
.if \pri
sub r8, r8, #2
.endif
bgt 1b
vpop {q4-q7}
......@@ -440,9 +526,237 @@ function cdef_filter\w\()_neon, export=1
endfunc
.endm
.macro filter w
filter_func \w, pri=1, sec=0, min=0, suffix=_pri
filter_func \w, pri=0, sec=1, min=0, suffix=_sec
filter_func \w, pri=1, sec=1, min=1, suffix=_pri_sec
function cdef_filter\w\()_8bpc_neon, export=1
push {r4-r9,lr}
vpush {q4-q7}
ldrd r4, r5, [sp, #92]
ldrd r6, r7, [sp, #100]
ldr r8, [sp, #108]
cmp r3, #0 // pri_strength
bne 1f
b cdef_filter\w\()_sec_neon // only sec
1:
cmp r4, #0 // sec_strength
bne 1f
b cdef_filter\w\()_pri_neon // only pri
1:
b cdef_filter\w\()_pri_sec_neon // both pri and sec
endfunc
.endm
filter 8
filter 4
.macro load_px_8 d11, d12, d21, d22, w
.if \w == 8
add r6, r2, r9 // x + off
sub r9, r2, r9 // x - off
vld1.8 {\d11}, [r6] // p0
add r6, r6, #16 // += stride
vld1.8 {\d21}, [r9] // p1
add r9, r9, #16 // += stride
vld1.8 {\d12}, [r6] // p0
vld1.8 {\d22}, [r9] // p1
.else
add r6, r2, r9 // x + off
sub r9, r2, r9 // x - off
vld1.32 {\d11[0]}, [r6] // p0
add r6, r6, #8 // += stride
vld1.32 {\d21[0]}, [r9] // p1
add r9, r9, #8 // += stride
vld1.32 {\d11[1]}, [r6] // p0
add r6, r6, #8 // += stride
vld1.32 {\d21[1]}, [r9] // p1
add r9, r9, #8 // += stride
vld1.32 {\d12[0]}, [r6] // p0
add r6, r6, #8 // += stride
vld1.32 {\d22[0]}, [r9] // p1
add r9, r9, #8 // += stride
vld1.32 {\d12[1]}, [r6] // p0
vld1.32 {\d22[1]}, [r9] // p1
.endif
.endm
.macro handle_pixel_8 s1, s2, thresh_vec, shift, tap, min
.if \min
vmin.u8 q3, q3, \s1
vmax.u8 q4, q4, \s1
vmin.u8 q3, q3, \s2
vmax.u8 q4, q4, \s2
.endif
vabd.u8 q8, q0, \s1 // abs(diff)
vabd.u8 q11, q0, \s2 // abs(diff)
vshl.u8 q9, q8, \shift // abs(diff) >> shift
vshl.u8 q12, q11, \shift // abs(diff) >> shift
vqsub.u8 q9, \thresh_vec, q9 // clip = imax(0, threshold - (abs(diff) >> shift))
vqsub.u8 q12, \thresh_vec, q12// clip = imax(0, threshold - (abs(diff) >> shift))
vcgt.u8 q10, q0, \s1 // px > p0
vcgt.u8 q13, q0, \s2 // px > p1
vmin.u8 q9, q9, q8 // imin(abs(diff), clip)
vmin.u8 q12, q12, q11 // imin(abs(diff), clip)
vneg.s8 q8, q9 // -imin()
vneg.s8 q11, q12 // -imin()
vbsl q10, q8, q9 // constrain() = imax(imin(diff, clip), -clip)
vdup.8 d18, \tap // taps[k]
vbsl q13, q11, q12 // constrain() = imax(imin(diff, clip), -clip)
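// Note: unlike the 16 bit path, constrain() is formed here as imin(abs(diff), clip)
// with the sign of diff reapplied via the px > p masks; the result is the same
// as imax(imin(diff, clip), -clip).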
vmlal.s8 q1, d20, d18 // sum += taps[k] * constrain()
vmlal.s8 q1, d26, d18 // sum += taps[k] * constrain()
vmlal.s8 q2, d21, d18 // sum += taps[k] * constrain()
vmlal.s8 q2, d27, d18 // sum += taps[k] * constrain()
.endm
// void cdef_filterX_edged_neon(pixel *dst, ptrdiff_t dst_stride,
// const uint16_t *tmp, int pri_strength,
// int sec_strength, int dir, int damping,
// int h, size_t edges);
.macro filter_func_8 w, pri, sec, min, suffix
function cdef_filter\w\suffix\()_edged_neon
.if \pri
movrel_local r8, pri_taps
and r9, r3, #1
add r8, r8, r9, lsl #1
.endif
movrel_local r9, directions\w
add r5, r9, r5, lsl #1
vmov.u8 d17, #7
vdup.8 d16, r6 // damping
vmov.8 d8[0], r3
vmov.8 d8[1], r4
vclz.i8 d8, d8 // clz(threshold)
vsub.i8 d8, d17, d8 // ulog2(threshold)
vqsub.u8 d8, d16, d8 // shift = imax(0, damping - ulog2(threshold))
vneg.s8 d8, d8 // -shift
.if \sec
vdup.8 q6, d8[1]
.endif
.if \pri
vdup.8 q5, d8[0]
.endif
1:
.if \w == 8
add r12, r2, #16
vld1.8 {d0}, [r2, :64] // px
vld1.8 {d1}, [r12, :64] // px
.else
add r12, r2, #8
vld1.32 {d0[0]}, [r2, :32] // px
add r9, r2, #2*8
vld1.32 {d0[1]}, [r12, :32] // px
add r12, r12, #2*8
vld1.32 {d1[0]}, [r9, :32] // px
vld1.32 {d1[1]}, [r12, :32] // px
.endif
vmov.u8 q1, #0 // sum
vmov.u8 q2, #0 // sum
.if \min
vmov.u16 q3, q0 // min
vmov.u16 q4, q0 // max
.endif
// Instead of loading sec_taps 2, 1 from memory, just set it
// to 2 initially and decrease for the second round.
// This is also used as loop counter.
mov lr, #2 // sec_taps[0]
2:
.if \pri
ldrsb r9, [r5] // off1
load_px_8 d28, d29, d30, d31, \w
.endif
.if \sec
add r5, r5, #4 // +2*2
ldrsb r9, [r5] // off2
.endif
.if \pri
ldrb r12, [r8] // *pri_taps
vdup.8 q7, r3 // threshold
handle_pixel_8 q14, q15, q7, q5, r12, \min
.endif
.if \sec
load_px_8 d28, d29, d30, d31, \w
add r5, r5, #8 // +2*4
ldrsb r9, [r5] // off3
vdup.8 q7, r4 // threshold
handle_pixel_8 q14, q15, q7, q6, lr, \min
load_px_8 d28, d29, d30, d31, \w
handle_pixel_8 q14, q15, q7, q6, lr, \min
sub r5, r5, #11 // r5 -= 2*(2+4); r5 += 1;
.else
add r5, r5, #1 // r5 += 1
.endif
subs lr, lr, #1 // sec_tap-- (value)
.if \pri
add r8, r8, #1 // pri_taps++ (pointer)
.endif
bne 2b
vshr.s16 q14, q1, #15 // -(sum < 0)
vshr.s16 q15, q2, #15 // -(sum < 0)
vadd.i16 q1, q1, q14 // sum - (sum < 0)
vadd.i16 q2, q2, q15 // sum - (sum < 0)
vrshr.s16 q1, q1, #4 // (8 + sum - (sum < 0)) >> 4
vrshr.s16 q2, q2, #4 // (8 + sum - (sum < 0)) >> 4
vaddw.u8 q1, q1, d0 // px + (8 + sum ...) >> 4
vaddw.u8 q2, q2, d1 // px + (8 + sum ...) >> 4
vqmovun.s16 d0, q1
vqmovun.s16 d1, q2
.if \min
vmin.u8 q0, q0, q4
vmax.u8 q0, q0, q3 // iclip(px + .., min, max)
.endif
.if \w == 8
vst1.8 {d0}, [r0, :64], r1
add r2, r2, #2*16 // tmp += 2*tmp_stride
subs r7, r7, #2 // h -= 2
vst1.8 {d1}, [r0, :64], r1
.else
vst1.32 {d0[0]}, [r0, :32], r1
add r2, r2, #4*8 // tmp += 4*tmp_stride
vst1.32 {d0[1]}, [r0, :32], r1
subs r7, r7, #4 // h -= 4
vst1.32 {d1[0]}, [r0, :32], r1
vst1.32 {d1[1]}, [r0, :32], r1
.endif
// Reset pri_taps and directions back to the original point
sub r5, r5, #2
.if \pri
sub r8, r8, #2
.endif
bgt 1b
vpop {q4-q7}
pop {r4-r9,pc}
endfunc
.endm
.macro filter_8 w
filter_func_8 \w, pri=1, sec=0, min=0, suffix=_pri
filter_func_8 \w, pri=0, sec=1, min=0, suffix=_sec
filter_func_8 \w, pri=1, sec=1, min=1, suffix=_pri_sec
.endm
filter_8 8
filter_8 4
const div_table, align=4
.short 840, 420, 280, 210, 168, 140, 120, 105
endconst
......@@ -451,9 +765,9 @@ const alt_fact, align=4
.short 420, 210, 140, 105, 105, 105, 105, 105, 140, 210, 420, 0
endconst
// int dav1d_cdef_find_dir_neon(const pixel *img, const ptrdiff_t stride,
// unsigned *const var)
function cdef_find_dir_neon, export=1
// int dav1d_cdef_find_dir_8bpc_neon(const pixel *img, const ptrdiff_t stride,
// unsigned *const var)
function cdef_find_dir_8bpc_neon, export=1
push {lr}
vpush {q4-q7}
sub sp, sp, #32 // cost
......
......@@ -143,8 +143,8 @@ function lpf_8_wd\wd\()_neon
vaddw.s8 q1, q1, d4
vmov.i8 d7, #3
vqmovn.s16 d2, q1 // f
vqadd.s8 d4, d6, d2 // imin(f + 4, 128)
vqadd.s8 d5, d7, d2 // imin(f + 3, 128)
vqadd.s8 d4, d6, d2 // imin(f + 4, 127)
vqadd.s8 d5, d7, d2 // imin(f + 3, 127)
vshr.s8 d4, d4, #3 // f1
vshr.s8 d5, d5, #3 // f2
vmovl.u8 q1, d23 // p0
......@@ -734,13 +734,13 @@ function lpf_h_16_8_neon
bx r12
endfunc
// void dav1d_lpf_v_sb_y_neon(pixel *dst, const ptrdiff_t stride,
// const uint32_t *const vmask,
// const uint8_t (*l)[4], ptrdiff_t b4_stride,
// const Av1FilterLUT *lut, const int w)
// void dav1d_lpf_v_sb_y_8bpc_neon(pixel *dst, const ptrdiff_t stride,
// const uint32_t *const vmask,
// const uint8_t (*l)[4], ptrdiff_t b4_stride,
// const Av1FilterLUT *lut, const int w)
.macro lpf_func dir, type
function lpf_\dir\()_sb_\type\()_neon, export=1
function lpf_\dir\()_sb_\type\()_8bpc_neon, export=1
push {r4-r11,lr}
vpush {q4-q7}
ldrd r4, r5, [sp, #100]
......
......@@ -28,11 +28,11 @@
#include "src/arm/asm.S"
#include "util.S"
// void dav1d_wiener_filter_h_neon(int16_t *dst, const pixel (*left)[4],
// const pixel *src, ptrdiff_t stride,
// const int16_t fh[7], const intptr_t w,
// int h, enum LrEdgeFlags edges);
function wiener_filter_h_neon, export=1
// void dav1d_wiener_filter_h_8bpc_neon(int16_t *dst, const pixel (*left)[4],
// const pixel *src, ptrdiff_t stride,
// const int16_t fh[7], const intptr_t w,
// int h, enum LrEdgeFlags edges);
function wiener_filter_h_8bpc_neon, export=1
push {r4-r11,lr}
vpush {q4}
ldrd r4, r5, [sp, #52]
......@@ -367,11 +367,11 @@ L(variable_shift_tbl):
.purgem filter_4
endfunc
// void dav1d_wiener_filter_v_neon(pixel *dst, ptrdiff_t stride,
// const int16_t *mid, int w, int h,
// const int16_t fv[7], enum LrEdgeFlags edges,
// ptrdiff_t mid_stride);
function wiener_filter_v_neon, export=1
// void dav1d_wiener_filter_v_8bpc_neon(pixel *dst, ptrdiff_t stride,
// const int16_t *mid, int w, int h,
// const int16_t fv[7], enum LrEdgeFlags edges,
// ptrdiff_t mid_stride);
function wiener_filter_v_8bpc_neon, export=1
push {r4-r7,lr}
ldrd r4, r5, [sp, #20]
ldrd r6, r7, [sp, #28]
......@@ -548,9 +548,9 @@ function wiener_filter_v_neon, export=1
.purgem filter
endfunc
// void dav1d_copy_narrow_neon(pixel *dst, ptrdiff_t stride,
// const pixel *src, int w, int h);
function copy_narrow_neon, export=1
// void dav1d_copy_narrow_8bpc_neon(pixel *dst, ptrdiff_t stride,
// const pixel *src, int w, int h);
function copy_narrow_8bpc_neon, export=1
push {r4,lr}
ldr r4, [sp, #8]
adr r12, L(copy_narrow_tbl)
......@@ -687,12 +687,12 @@ endfunc
#define SUM_STRIDE (384+16)
// void dav1d_sgr_box3_h_neon(int32_t *sumsq, int16_t *sum,
// const pixel (*left)[4],
// const pixel *src, const ptrdiff_t stride,
// const int w, const int h,
// const enum LrEdgeFlags edges);
function sgr_box3_h_neon, export=1
// void dav1d_sgr_box3_h_8bpc_neon(int32_t *sumsq, int16_t *sum,
// const pixel (*left)[4],
// const pixel *src, const ptrdiff_t stride,
// const int w, const int h,
// const enum LrEdgeFlags edges);
function sgr_box3_h_8bpc_neon, export=1
push {r4-r11,lr}
vpush {q4-q7}
ldrd r4, r5, [sp, #100]
......@@ -925,11 +925,11 @@ L(box3_variable_shift_tbl):
vmull.u8 q6, d9, d9
add3 4
subs r5, r5, #4
vst1.16 {d6}, [r1, :64]!
vst1.16 {d14}, [r11, :64]!
vst1.32 {q12}, [r0, :128]!
vst1.32 {q8}, [r10, :128]!
subs r5, r5, #4
ble 9f
vext.8 q0, q0, q0, #4
vext.8 q1, q1, q2, #8
......@@ -961,12 +961,12 @@ L(box3_variable_shift_tbl):
.purgem add3
endfunc
// void dav1d_sgr_box5_h_neon(int32_t *sumsq, int16_t *sum,
// const pixel (*left)[4],
// const pixel *src, const ptrdiff_t stride,
// const int w, const int h,
// const enum LrEdgeFlags edges);
function sgr_box5_h_neon, export=1
// void dav1d_sgr_box5_h_8bpc_neon(int32_t *sumsq, int16_t *sum,
// const pixel (*left)[4],
// const pixel *src, const ptrdiff_t stride,
// const int w, const int h,
// const enum LrEdgeFlags edges);
function sgr_box5_h_8bpc_neon, export=1
push {r4-r11,lr}
vpush {q4-q7}
ldrd r4, r5, [sp, #100]
......@@ -1038,7 +1038,7 @@ function sgr_box5_h_neon, export=1
b 2f
0:
// !LR_HAVE_LEFT, fill q1 with the leftmost byte
// and shift q0 to have 2x the first byte at the front.
// and shift q0 to have 3x the first byte at the front.
vdup.8 q1, d0[0]
vdup.8 q5, d8[0]
// Move r3 back to account for the last 3 bytes we loaded before,
......@@ -1215,11 +1215,11 @@ L(box5_variable_shift_tbl):
vmull.u8 q6, d9, d9
add5 4
subs r5, r5, #4
vst1.16 {d6}, [r1, :64]!
vst1.16 {d14}, [r11, :64]!
vst1.32 {q12}, [r0, :128]!
vst1.32 {q10}, [r10, :128]!
subs r5, r5, #4
ble 9f
vext.8 q0, q0, q0, #4
vext.8 q1, q1, q2, #8
......@@ -1661,11 +1661,11 @@ endfunc
#define FILTER_OUT_STRIDE 384
// void dav1d_sgr_finish_filter1_neon(coef *tmp,
// const pixel *src, const ptrdiff_t stride,
// const int32_t *a, const int16_t *b,
// const int w, const int h);
function sgr_finish_filter1_neon, export=1
// void dav1d_sgr_finish_filter1_8bpc_neon(int16_t *tmp,
// const pixel *src, const ptrdiff_t stride,
// const int32_t *a, const int16_t *b,
// const int w, const int h);
function sgr_finish_filter1_8bpc_neon, export=1
push {r4-r11,lr}
vpush {q4-q7}
ldrd r4, r5, [sp, #100]
......@@ -1765,11 +1765,11 @@ function sgr_finish_filter1_neon, export=1
pop {r4-r11,pc}
endfunc
// void dav1d_sgr_finish_filter2_neon(coef *tmp,
// const pixel *src, const ptrdiff_t stride,
// const int32_t *a, const int16_t *b,
// const int w, const int h);
function sgr_finish_filter2_neon, export=1
// void dav1d_sgr_finish_filter2_8bpc_neon(int16_t *tmp,
// const pixel *src, const ptrdiff_t stride,
// const int32_t *a, const int16_t *b,
// const int w, const int h);
function sgr_finish_filter2_8bpc_neon, export=1
push {r4-r11,lr}
vpush {q4-q7}
ldrd r4, r5, [sp, #100]
......@@ -1925,11 +1925,11 @@ function sgr_finish_filter2_neon, export=1
pop {r4-r11,pc}
endfunc
// void dav1d_sgr_weighted1_neon(pixel *dst, const ptrdiff_t dst_stride,
// const pixel *src, const ptrdiff_t src_stride,
// const coef *t1, const int w, const int h,
// const int wt);
function sgr_weighted1_neon, export=1
// void dav1d_sgr_weighted1_8bpc_neon(pixel *dst, const ptrdiff_t dst_stride,
// const pixel *src, const ptrdiff_t src_stride,
// const int16_t *t1, const int w, const int h,
// const int wt);
function sgr_weighted1_8bpc_neon, export=1
push {r4-r9,lr}
ldrd r4, r5, [sp, #28]
ldrd r6, r7, [sp, #36]
......@@ -2009,12 +2009,12 @@ function sgr_weighted1_neon, export=1
pop {r4-r9,pc}
endfunc
// void dav1d_sgr_weighted2_neon(pixel *dst, const ptrdiff_t stride,
// const pixel *src, const ptrdiff_t src_stride,
// const coef *t1, const coef *t2,
// const int w, const int h,
// const int16_t wt[2]);
function sgr_weighted2_neon, export=1
// void dav1d_sgr_weighted2_8bpc_neon(pixel *dst, const ptrdiff_t stride,
// const pixel *src, const ptrdiff_t src_stride,
// const int16_t *t1, const int16_t *t2,
// const int w, const int h,
// const int16_t wt[2]);
function sgr_weighted2_8bpc_neon, export=1
push {r4-r11,lr}
ldrd r4, r5, [sp, #36]
ldrd r6, r7, [sp, #44]
......
......@@ -753,7 +753,7 @@ L(blend_v_tbl):
add r12, r0, r1
lsl r1, r1, #1
vsub.i8 d5, d22, d4
sub r1, r1, #3
sub r1, r1, #2
4:
vld1.u8 {d2}, [r2, :64]!
vld1.32 {d0[]}, [r0, :32]
......@@ -764,10 +764,8 @@ L(blend_v_tbl):
vrshrn.i16 d20, q3, #6
vst1.16 {d20[0]}, [r0, :16]!
vst1.16 {d20[2]}, [r12, :16]!
vst1.8 {d20[2]}, [r0]!
vst1.8 {d20[6]}, [r12]!
add r0, r0, r1
add r12, r12, r1
vst1.8 {d20[2]}, [r0], r1
vst1.8 {d20[6]}, [r12], r1
bgt 4b
pop {r4-r5,pc}
80:
......@@ -776,7 +774,7 @@ L(blend_v_tbl):
add r12, r0, r1
lsl r1, r1, #1
vsub.i8 d17, d16, d2
sub r1, r1, #6
sub r1, r1, #4
8:
vld1.u8 {d4, d5}, [r2, :128]!
vld1.u8 {d0}, [r0, :64]
......@@ -790,10 +788,8 @@ L(blend_v_tbl):
vrshrn.i16 d23, q10, #6
vst1.32 {d22[0]}, [r0, :32]!
vst1.32 {d23[0]}, [r12, :32]!
vst1.16 {d22[2]}, [r0, :16]!
vst1.16 {d23[2]}, [r12, :16]!
add r0, r0, r1
add r12, r12, r1
vst1.16 {d22[2]}, [r0, :16], r1
vst1.16 {d23[2]}, [r12, :16], r1
bgt 8b
pop {r4-r5,pc}
160:
......@@ -802,7 +798,7 @@ L(blend_v_tbl):
add r12, r0, r1
lsl r1, r1, #1
vsub.i8 q11, q12, q14
sub r1, r1, #12
sub r1, r1, #8
16:
vld1.u8 {q1, q2}, [r2, :128]!
vld1.u8 {q0}, [r0, :128]
......@@ -822,20 +818,18 @@ L(blend_v_tbl):
vrshrn.i16 d21, q8, #6
vst1.u8 {d18}, [r0, :64]!
vst1.u8 {d20}, [r12, :64]!
vst1.32 {d19[0]}, [r0, :32]!
vst1.32 {d21[0]}, [r12, :32]!
add r0, r0, r1
add r12, r12, r1
vst1.32 {d19[0]}, [r0, :32], r1
vst1.32 {d21[0]}, [r12, :32], r1
bgt 16b
pop {r4-r5,pc}
320:
vmov.i8 q10, #64
vld1.u8 {q2, q3}, [r5, :128]
vsub.i8 q11, q10, q2
vsub.i8 q12, q10, q3
vsub.i8 d24, d20, d6
32:
vld1.u8 {q8, q9}, [r2, :128]!
vld1.u8 {q0, q1}, [r0, :128]
vld1.u8 {d0, d1, d2}, [r0, :64]
subs r4, r4, #1
vmull.u8 q15, d16, d4
vmlal.u8 q15, d0, d22
......
......@@ -27,6 +27,7 @@
#include "src/arm/asm.S"
#include "util.S"
#include "cdef_tmpl.S"
.macro pad_top_bottom s1, s2, w, stride, rn, rw, ret
tst w6, #1 // CDEF_HAVE_LEFT
......@@ -137,13 +138,15 @@
.endif
.endm
// void dav1d_cdef_paddingX_neon(uint16_t *tmp, const pixel *src,
// ptrdiff_t src_stride, const pixel (*left)[2],
// /*const*/ pixel *const top[2], int h,
// enum CdefEdgeFlags edges);
// void dav1d_cdef_paddingX_8bpc_neon(uint16_t *tmp, const pixel *src,
// ptrdiff_t src_stride, const pixel (*left)[2],
// const pixel *const top, int h,
// enum CdefEdgeFlags edges);
.macro padding_func w, stride, rn, rw
function cdef_padding\w\()_neon, export=1
function cdef_padding\w\()_8bpc_neon, export=1
cmp w6, #0xf // fully edged
b.eq cdef_padding\w\()_edged_8bpc_neon
movi v30.8h, #0x80, lsl #8
mov v31.16b, v30.16b
sub x0, x0, #2*(2*\stride+2)
......@@ -157,9 +160,8 @@ function cdef_padding\w\()_neon, export=1
b 3f
1:
// CDEF_HAVE_TOP
ldr x8, [x4]
ldr x9, [x4, #8]
pad_top_bottom x8, x9, \w, \stride, \rn, \rw, 0
add x9, x4, x2
pad_top_bottom x4, x9, \w, \stride, \rn, \rw, 0
// Middle section
3:
......@@ -242,358 +244,274 @@ endfunc
padding_func 8, 16, d, q
padding_func 4, 8, s, d
.macro dir_table w, stride
const directions\w
.byte -1 * \stride + 1, -2 * \stride + 2
.byte 0 * \stride + 1, -1 * \stride + 2
.byte 0 * \stride + 1, 0 * \stride + 2
.byte 0 * \stride + 1, 1 * \stride + 2
.byte 1 * \stride + 1, 2 * \stride + 2
.byte 1 * \stride + 0, 2 * \stride + 1
.byte 1 * \stride + 0, 2 * \stride + 0
.byte 1 * \stride + 0, 2 * \stride - 1
// Repeated, to avoid & 7
.byte -1 * \stride + 1, -2 * \stride + 2
.byte 0 * \stride + 1, -1 * \stride + 2
.byte 0 * \stride + 1, 0 * \stride + 2
.byte 0 * \stride + 1, 1 * \stride + 2
.byte 1 * \stride + 1, 2 * \stride + 2
.byte 1 * \stride + 0, 2 * \stride + 1
endconst
// void cdef_paddingX_edged_8bpc_neon(uint8_t *tmp, const pixel *src,
// ptrdiff_t src_stride, const pixel (*left)[2],
// const pixel *const top, int h,
// enum CdefEdgeFlags edges);
.macro padding_func_edged w, stride, reg
function cdef_padding\w\()_edged_8bpc_neon, export=1
sub x4, x4, #2
sub x0, x0, #(2*\stride+2)
.if \w == 4
ldr d0, [x4]
ldr d1, [x4, x2]
st1 {v0.8b, v1.8b}, [x0], #16
.else
add x9, x4, x2
ldr d0, [x4]
ldr s1, [x4, #8]
ldr d2, [x9]
ldr s3, [x9, #8]
str d0, [x0]
str s1, [x0, #8]
str d2, [x0, #\stride]
str s3, [x0, #\stride+8]
add x0, x0, #2*\stride
.endif
0:
ld1 {v0.h}[0], [x3], #2
ldr h2, [x1, #\w]
load_n_incr v1, x1, x2, \w
subs w5, w5, #1
str h0, [x0]
stur \reg\()1, [x0, #2]
str h2, [x0, #2+\w]
add x0, x0, #\stride
b.gt 0b
sub x1, x1, #2
.if \w == 4
ldr d0, [x1]
ldr d1, [x1, x2]
st1 {v0.8b, v1.8b}, [x0], #16
.else
add x9, x1, x2
ldr d0, [x1]
ldr s1, [x1, #8]
ldr d2, [x9]
ldr s3, [x9, #8]
str d0, [x0]
str s1, [x0, #8]
str d2, [x0, #\stride]
str s3, [x0, #\stride+8]
.endif
ret
endfunc
.endm
dir_table 8, 16
dir_table 4, 8
padding_func_edged 8, 16, d
padding_func_edged 4, 8, s
const pri_taps
.byte 4, 2, 3, 3
endconst
tables
.macro load_px d1, d2, w
filter 8, 8
filter 4, 8
find_dir 8
.macro load_px_8 d1, d2, w
.if \w == 8
add x6, x2, w9, sxtb #1 // x + off
sub x9, x2, w9, sxtb #1 // x - off
ld1 {\d1\().8h}, [x6] // p0
ld1 {\d2\().8h}, [x9] // p1
.else
add x6, x2, w9, sxtb #1 // x + off
sub x9, x2, w9, sxtb #1 // x - off
ld1 {\d1\().4h}, [x6] // p0
add x6, x6, #2*8 // += stride
ld1 {\d2\().4h}, [x9] // p1
add x9, x9, #2*8 // += stride
add x6, x2, w9, sxtb // x + off
sub x9, x2, w9, sxtb // x - off
ld1 {\d1\().d}[0], [x6] // p0
add x6, x6, #16 // += stride
ld1 {\d2\().d}[0], [x9] // p1
add x9, x9, #16 // += stride
ld1 {\d1\().d}[1], [x6] // p0
ld1 {\d2\().d}[1], [x9] // p1
ld1 {\d2\().d}[1], [x9] // p0
.else
add x6, x2, w9, sxtb // x + off
sub x9, x2, w9, sxtb // x - off
ld1 {\d1\().s}[0], [x6] // p0
add x6, x6, #8 // += stride
ld1 {\d2\().s}[0], [x9] // p1
add x9, x9, #8 // += stride
ld1 {\d1\().s}[1], [x6] // p0
add x6, x6, #8 // += stride
ld1 {\d2\().s}[1], [x9] // p1
add x9, x9, #8 // += stride
ld1 {\d1\().s}[2], [x6] // p0
add x6, x6, #8 // += stride
ld1 {\d2\().s}[2], [x9] // p1
add x9, x9, #8 // += stride
ld1 {\d1\().s}[3], [x6] // p0
ld1 {\d2\().s}[3], [x9] // p1
.endif
.endm
.macro handle_pixel s1, s2, threshold, thresh_vec, shift, tap
umin v2.8h, v2.8h, \s1\().8h
smax v3.8h, v3.8h, \s1\().8h
umin v2.8h, v2.8h, \s2\().8h
smax v3.8h, v3.8h, \s2\().8h
cbz \threshold, 3f
uabd v16.8h, v0.8h, \s1\().8h // abs(diff)
uabd v20.8h, v0.8h, \s2\().8h // abs(diff)
ushl v17.8h, v16.8h, \shift // abs(diff) >> shift
ushl v21.8h, v20.8h, \shift // abs(diff) >> shift
uqsub v17.8h, \thresh_vec, v17.8h // clip = imax(0, threshold - (abs(diff) >> shift))
uqsub v21.8h, \thresh_vec, v21.8h // clip = imax(0, threshold - (abs(diff) >> shift))
sub v18.8h, \s1\().8h, v0.8h // diff = p0 - px
sub v22.8h, \s2\().8h, v0.8h // diff = p1 - px
neg v16.8h, v17.8h // -clip
neg v20.8h, v21.8h // -clip
smin v18.8h, v18.8h, v17.8h // imin(diff, clip)
smin v22.8h, v22.8h, v21.8h // imin(diff, clip)
dup v19.8h, \tap // taps[k]
smax v18.8h, v18.8h, v16.8h // constrain() = imax(imin(diff, clip), -clip)
smax v22.8h, v22.8h, v20.8h // constrain() = imax(imin(diff, clip), -clip)
mla v1.8h, v18.8h, v19.8h // sum += taps[k] * constrain()
mla v1.8h, v22.8h, v19.8h // sum += taps[k] * constrain()
3:
.macro handle_pixel_8 s1, s2, thresh_vec, shift, tap, min
.if \min
umin v3.16b, v3.16b, \s1\().16b
umax v4.16b, v4.16b, \s1\().16b
umin v3.16b, v3.16b, \s2\().16b
umax v4.16b, v4.16b, \s2\().16b
.endif
uabd v16.16b, v0.16b, \s1\().16b // abs(diff)
uabd v20.16b, v0.16b, \s2\().16b // abs(diff)
ushl v17.16b, v16.16b, \shift // abs(diff) >> shift
ushl v21.16b, v20.16b, \shift // abs(diff) >> shift
uqsub v17.16b, \thresh_vec, v17.16b // clip = imax(0, threshold - (abs(diff) >> shift))
uqsub v21.16b, \thresh_vec, v21.16b // clip = imax(0, threshold - (abs(diff) >> shift))
cmhi v18.16b, v0.16b, \s1\().16b // px > p0
cmhi v22.16b, v0.16b, \s2\().16b // px > p1
umin v17.16b, v17.16b, v16.16b // imin(abs(diff), clip)
umin v21.16b, v21.16b, v20.16b // imin(abs(diff), clip)
dup v19.16b, \tap // taps[k]
neg v16.16b, v17.16b // -imin()
neg v20.16b, v21.16b // -imin()
bsl v18.16b, v16.16b, v17.16b // constrain() = apply_sign()
bsl v22.16b, v20.16b, v21.16b // constrain() = apply_sign()
smlal v1.8h, v18.8b, v19.8b // sum += taps[k] * constrain()
smlal v1.8h, v22.8b, v19.8b // sum += taps[k] * constrain()
smlal2 v2.8h, v18.16b, v19.16b // sum += taps[k] * constrain()
smlal2 v2.8h, v22.16b, v19.16b // sum += taps[k] * constrain()
.endm
// void dav1d_cdef_filterX_neon(pixel *dst, ptrdiff_t dst_stride,
// const uint16_t *tmp, int pri_strength,
// int sec_strength, int dir, int damping, int h);
.macro filter w
function cdef_filter\w\()_neon, export=1
// void cdef_filterX_edged_8bpc_neon(pixel *dst, ptrdiff_t dst_stride,
// const uint8_t *tmp, int pri_strength,
// int sec_strength, int dir, int damping,
// int h);
.macro filter_func_8 w, pri, sec, min, suffix
function cdef_filter\w\suffix\()_edged_8bpc_neon
.if \pri
movrel x8, pri_taps
and w9, w3, #1
add x8, x8, w9, uxtw #1
.endif
movrel x9, directions\w
add x5, x9, w5, uxtw #1
movi v30.4h, #15
dup v28.4h, w6 // damping
dup v25.8h, w3 // threshold
dup v27.8h, w4 // threshold
trn1 v24.4h, v25.4h, v27.4h
clz v24.4h, v24.4h // clz(threshold)
sub v24.4h, v30.4h, v24.4h // ulog2(threshold)
uqsub v24.4h, v28.4h, v24.4h // shift = imax(0, damping - ulog2(threshold))
neg v24.4h, v24.4h // -shift
dup v26.8h, v24.h[1]
dup v24.8h, v24.h[0]
movi v30.8b, #7
dup v28.8b, w6 // damping
.if \pri
dup v25.16b, w3 // threshold
.endif
.if \sec
dup v27.16b, w4 // threshold
.endif
trn1 v24.8b, v25.8b, v27.8b
clz v24.8b, v24.8b // clz(threshold)
sub v24.8b, v30.8b, v24.8b // ulog2(threshold)
uqsub v24.8b, v28.8b, v24.8b // shift = imax(0, damping - ulog2(threshold))
neg v24.8b, v24.8b // -shift
.if \sec
dup v26.16b, v24.b[1]
.endif
.if \pri
dup v24.16b, v24.b[0]
.endif
1:
.if \w == 8
ld1 {v0.8h}, [x2] // px
.else
add x12, x2, #2*8
ld1 {v0.4h}, [x2] // px
add x12, x2, #16
ld1 {v0.d}[0], [x2] // px
ld1 {v0.d}[1], [x12] // px
.else
add x12, x2, #1*8
add x13, x2, #2*8
add x14, x2, #3*8
ld1 {v0.s}[0], [x2] // px
ld1 {v0.s}[1], [x12] // px
ld1 {v0.s}[2], [x13] // px
ld1 {v0.s}[3], [x14] // px
.endif
movi v1.8h, #0 // sum
mov v2.16b, v0.16b // min
mov v3.16b, v0.16b // max
movi v2.8h, #0 // sum
.if \min
mov v3.16b, v0.16b // min
mov v4.16b, v0.16b // max
.endif
// Instead of loading sec_taps 2, 1 from memory, just set it
// to 2 initially and decrease for the second round.
// This is also used as loop counter.
mov w11, #2 // sec_taps[0]
2:
.if \pri
ldrb w9, [x5] // off1
load_px v4, v5, \w
load_px_8 v5, v6, \w
.endif
.if \sec
add x5, x5, #4 // +2*2
ldrb w9, [x5] // off2
load_px v6, v7, \w
load_px_8 v28, v29, \w
.endif
.if \pri
ldrb w10, [x8] // *pri_taps
handle_pixel v4, v5, w3, v25.8h, v24.8h, w10
handle_pixel_8 v5, v6, v25.16b, v24.16b, w10, \min
.endif
.if \sec
add x5, x5, #8 // +2*4
ldrb w9, [x5] // off3
load_px v4, v5, \w
load_px_8 v5, v6, \w
handle_pixel v6, v7, w4, v27.8h, v26.8h, w11
handle_pixel_8 v28, v29, v27.16b, v26.16b, w11, \min
handle_pixel v4, v5, w4, v27.8h, v26.8h, w11
handle_pixel_8 v5, v6, v27.16b, v26.16b, w11, \min
sub x5, x5, #11 // x8 -= 2*(2+4); x8 += 1;
sub x5, x5, #11 // x5 -= 2*(2+4); x5 += 1;
.else
add x5, x5, #1 // x5 += 1
.endif
subs w11, w11, #1 // sec_tap-- (value)
.if \pri
add x8, x8, #1 // pri_taps++ (pointer)
.endif
b.ne 2b
sshr v4.8h, v1.8h, #15 // -(sum < 0)
add v1.8h, v1.8h, v4.8h // sum - (sum < 0)
srshr v1.8h, v1.8h, #4 // (8 + sum - (sum < 0)) >> 4
add v0.8h, v0.8h, v1.8h // px + (8 + sum ...) >> 4
smin v0.8h, v0.8h, v3.8h
smax v0.8h, v0.8h, v2.8h // iclip(px + .., min, max)
xtn v0.8b, v0.8h
sshr v5.8h, v1.8h, #15 // -(sum < 0)
sshr v6.8h, v2.8h, #15 // -(sum < 0)
add v1.8h, v1.8h, v5.8h // sum - (sum < 0)
add v2.8h, v2.8h, v6.8h // sum - (sum < 0)
srshr v1.8h, v1.8h, #4 // (8 + sum - (sum < 0)) >> 4
srshr v2.8h, v2.8h, #4 // (8 + sum - (sum < 0)) >> 4
uaddw v1.8h, v1.8h, v0.8b // px + (8 + sum ...) >> 4
uaddw2 v2.8h, v2.8h, v0.16b // px + (8 + sum ...) >> 4
sqxtun v0.8b, v1.8h
sqxtun2 v0.16b, v2.8h
.if \min
umin v0.16b, v0.16b, v4.16b
umax v0.16b, v0.16b, v3.16b // iclip(px + .., min, max)
.endif
.if \w == 8
add x2, x2, #2*16 // tmp += tmp_stride
subs w7, w7, #1 // h--
st1 {v0.8b}, [x0], x1
.else
st1 {v0.s}[0], [x0], x1
st1 {v0.d}[0], [x0], x1
add x2, x2, #2*16 // tmp += 2*tmp_stride
subs w7, w7, #2 // h -= 2
st1 {v0.d}[1], [x0], x1
.else
st1 {v0.s}[0], [x0], x1
add x2, x2, #4*8 // tmp += 4*tmp_stride
st1 {v0.s}[1], [x0], x1
subs w7, w7, #4 // h -= 4
st1 {v0.s}[2], [x0], x1
st1 {v0.s}[3], [x0], x1
.endif
// Reset pri_taps/sec_taps back to the original point
// Reset pri_taps and directions back to the original point
sub x5, x5, #2
.if \pri
sub x8, x8, #2
.endif
b.gt 1b
ret
endfunc
.endm
filter 8
filter 4
const div_table
.short 840, 420, 280, 210, 168, 140, 120, 105
endconst
const alt_fact
.short 420, 210, 140, 105, 105, 105, 105, 105, 140, 210, 420, 0
endconst
// int dav1d_cdef_find_dir_neon(const pixel *img, const ptrdiff_t stride,
// unsigned *const var)
function cdef_find_dir_neon, export=1
sub sp, sp, #32 // cost
mov w3, #8
movi v31.16b, #128
movi v30.16b, #0
movi v1.8h, #0 // v0-v1 sum_diag[0]
movi v3.8h, #0 // v2-v3 sum_diag[1]
movi v5.8h, #0 // v4-v5 sum_hv[0-1]
movi v7.8h, #0 // v6-v7 sum_alt[0]
movi v17.8h, #0 // v16-v17 sum_alt[1]
movi v18.8h, #0 // v18-v19 sum_alt[2]
movi v19.8h, #0
movi v21.8h, #0 // v20-v21 sum_alt[3]
.irpc i, 01234567
ld1 {v26.8b}, [x0], x1
usubl v26.8h, v26.8b, v31.8b
addv h25, v26.8h // [y]
rev64 v27.8h, v26.8h
addp v28.8h, v26.8h, v30.8h // [(x >> 1)]
add v5.8h, v5.8h, v26.8h // sum_hv[1]
ext v27.16b, v27.16b, v27.16b, #8 // [-x]
rev64 v29.4h, v28.4h // [-(x >> 1)]
ins v4.h[\i], v25.h[0] // sum_hv[0]
.if \i == 0
mov v0.16b, v26.16b // sum_diag[0]
mov v2.16b, v27.16b // sum_diag[1]
mov v6.16b, v28.16b // sum_alt[0]
mov v16.16b, v29.16b // sum_alt[1]
.else
ext v22.16b, v30.16b, v26.16b, #(16-2*\i)
ext v23.16b, v26.16b, v30.16b, #(16-2*\i)
ext v24.16b, v30.16b, v27.16b, #(16-2*\i)
ext v25.16b, v27.16b, v30.16b, #(16-2*\i)
add v0.8h, v0.8h, v22.8h // sum_diag[0]
add v1.8h, v1.8h, v23.8h // sum_diag[0]
add v2.8h, v2.8h, v24.8h // sum_diag[1]
add v3.8h, v3.8h, v25.8h // sum_diag[1]
ext v22.16b, v30.16b, v28.16b, #(16-2*\i)
ext v23.16b, v28.16b, v30.16b, #(16-2*\i)
ext v24.16b, v30.16b, v29.16b, #(16-2*\i)
ext v25.16b, v29.16b, v30.16b, #(16-2*\i)
add v6.8h, v6.8h, v22.8h // sum_alt[0]
add v7.4h, v7.4h, v23.4h // sum_alt[0]
add v16.8h, v16.8h, v24.8h // sum_alt[1]
add v17.4h, v17.4h, v25.4h // sum_alt[1]
.endif
.if \i < 6
ext v22.16b, v30.16b, v26.16b, #(16-2*(3-(\i/2)))
ext v23.16b, v26.16b, v30.16b, #(16-2*(3-(\i/2)))
add v18.8h, v18.8h, v22.8h // sum_alt[2]
add v19.4h, v19.4h, v23.4h // sum_alt[2]
.else
add v18.8h, v18.8h, v26.8h // sum_alt[2]
.endif
.if \i == 0
mov v20.16b, v26.16b // sum_alt[3]
.elseif \i == 1
add v20.8h, v20.8h, v26.8h // sum_alt[3]
.else
ext v24.16b, v30.16b, v26.16b, #(16-2*(\i/2))
ext v25.16b, v26.16b, v30.16b, #(16-2*(\i/2))
add v20.8h, v20.8h, v24.8h // sum_alt[3]
add v21.4h, v21.4h, v25.4h // sum_alt[3]
.endif
.endr
movi v31.4s, #105
smull v26.4s, v4.4h, v4.4h // sum_hv[0]*sum_hv[0]
smlal2 v26.4s, v4.8h, v4.8h
smull v27.4s, v5.4h, v5.4h // sum_hv[1]*sum_hv[1]
smlal2 v27.4s, v5.8h, v5.8h
mul v26.4s, v26.4s, v31.4s // cost[2] *= 105
mul v27.4s, v27.4s, v31.4s // cost[6] *= 105
addv s4, v26.4s // cost[2]
addv s5, v27.4s // cost[6]
rev64 v1.8h, v1.8h
rev64 v3.8h, v3.8h
ext v1.16b, v1.16b, v1.16b, #10 // sum_diag[0][14-n]
ext v3.16b, v3.16b, v3.16b, #10 // sum_diag[1][14-n]
str s4, [sp, #2*4] // cost[2]
str s5, [sp, #6*4] // cost[6]
movrel x4, div_table
ld1 {v31.8h}, [x4]
smull v22.4s, v0.4h, v0.4h // sum_diag[0]*sum_diag[0]
smull2 v23.4s, v0.8h, v0.8h
smlal v22.4s, v1.4h, v1.4h
smlal2 v23.4s, v1.8h, v1.8h
smull v24.4s, v2.4h, v2.4h // sum_diag[1]*sum_diag[1]
smull2 v25.4s, v2.8h, v2.8h
smlal v24.4s, v3.4h, v3.4h
smlal2 v25.4s, v3.8h, v3.8h
uxtl v30.4s, v31.4h // div_table
uxtl2 v31.4s, v31.8h
mul v22.4s, v22.4s, v30.4s // cost[0]
mla v22.4s, v23.4s, v31.4s // cost[0]
mul v24.4s, v24.4s, v30.4s // cost[4]
mla v24.4s, v25.4s, v31.4s // cost[4]
addv s0, v22.4s // cost[0]
addv s2, v24.4s // cost[4]
movrel x5, alt_fact
ld1 {v29.4h, v30.4h, v31.4h}, [x5]// div_table[2*m+1] + 105
str s0, [sp, #0*4] // cost[0]
str s2, [sp, #4*4] // cost[4]
uxtl v29.4s, v29.4h // div_table[2*m+1] + 105
uxtl v30.4s, v30.4h
uxtl v31.4s, v31.4h
.macro cost_alt d1, d2, s1, s2, s3, s4
smull v22.4s, \s1\().4h, \s1\().4h // sum_alt[n]*sum_alt[n]
smull2 v23.4s, \s1\().8h, \s1\().8h
smull v24.4s, \s2\().4h, \s2\().4h
smull v25.4s, \s3\().4h, \s3\().4h // sum_alt[n]*sum_alt[n]
smull2 v26.4s, \s3\().8h, \s3\().8h
smull v27.4s, \s4\().4h, \s4\().4h
mul v22.4s, v22.4s, v29.4s // sum_alt[n]^2*fact
mla v22.4s, v23.4s, v30.4s
mla v22.4s, v24.4s, v31.4s
mul v25.4s, v25.4s, v29.4s // sum_alt[n]^2*fact
mla v25.4s, v26.4s, v30.4s
mla v25.4s, v27.4s, v31.4s
addv \d1, v22.4s // *cost_ptr
addv \d2, v25.4s // *cost_ptr
.macro filter_8 w
filter_func_8 \w, pri=1, sec=0, min=0, suffix=_pri
filter_func_8 \w, pri=0, sec=1, min=0, suffix=_sec
filter_func_8 \w, pri=1, sec=1, min=1, suffix=_pri_sec
.endm
cost_alt s6, s16, v6, v7, v16, v17 // cost[1], cost[3]
cost_alt s18, s20, v18, v19, v20, v21 // cost[5], cost[7]
str s6, [sp, #1*4] // cost[1]
str s16, [sp, #3*4] // cost[3]
mov w0, #0 // best_dir
mov w1, v0.s[0] // best_cost
mov w3, #1 // n
str s18, [sp, #5*4] // cost[5]
str s20, [sp, #7*4] // cost[7]
mov w4, v6.s[0]
.macro find_best s1, s2, s3
.ifnb \s2
mov w5, \s2\().s[0]
.endif
cmp w4, w1 // cost[n] > best_cost
csel w0, w3, w0, gt // best_dir = n
csel w1, w4, w1, gt // best_cost = cost[n]
.ifnb \s2
add w3, w3, #1 // n++
cmp w5, w1 // cost[n] > best_cost
mov w4, \s3\().s[0]
csel w0, w3, w0, gt // best_dir = n
csel w1, w5, w1, gt // best_cost = cost[n]
add w3, w3, #1 // n++
.endif
.endm
find_best v6, v4, v16
find_best v16, v2, v18
find_best v18, v5, v20
find_best v20
eor w3, w0, #4 // best_dir ^4
ldr w4, [sp, w3, uxtw #2]
sub w1, w1, w4 // best_cost - cost[best_dir ^ 4]
lsr w1, w1, #10
str w1, [x2] // *var
add sp, sp, #32
ret
endfunc
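As a plain-C restatement of the annotated tail above (hypothetical helper, following the comments rather than quoting dav1d's C code), the direction search reduces to:

    static int find_dir_tail(const unsigned cost[8], unsigned *const var) {
        int best_dir = 0;
        unsigned best_cost = cost[0];
        for (int n = 1; n < 8; n++) {      /* the cmp/csel pairs in find_best */
            if (cost[n] > best_cost) {
                best_cost = cost[n];
                best_dir  = n;
            }
        }
        /* variance = (best_cost - cost of the orthogonal direction) >> 10 */
        *var = (best_cost - cost[best_dir ^ 4]) >> 10;
        return best_dir;
    }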
filter_8 8
filter_8 4
/*
* Copyright © 2018, VideoLAN and dav1d authors
* Copyright © 2020, Martin Storsjo
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "src/arm/asm.S"
#include "util.S"
#include "cdef_tmpl.S"
.macro pad_top_bot_16 s1, s2, w, stride, reg, ret
tst w6, #1 // CDEF_HAVE_LEFT
b.eq 2f
// CDEF_HAVE_LEFT
sub \s1, \s1, #4
sub \s2, \s2, #4
tst w6, #2 // CDEF_HAVE_RIGHT
b.eq 1f
// CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
ldr \reg\()0, [\s1]
ldr d1, [\s1, #2*\w]
ldr \reg\()2, [\s2]
ldr d3, [\s2, #2*\w]
str \reg\()0, [x0]
str d1, [x0, #2*\w]
add x0, x0, #2*\stride
str \reg\()2, [x0]
str d3, [x0, #2*\w]
.if \ret
ret
.else
add x0, x0, #2*\stride
b 3f
.endif
1:
// CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
ldr \reg\()0, [\s1]
ldr s1, [\s1, #2*\w]
ldr \reg\()2, [\s2]
ldr s3, [\s2, #2*\w]
str \reg\()0, [x0]
str s1, [x0, #2*\w]
str s31, [x0, #2*\w+4]
add x0, x0, #2*\stride
str \reg\()2, [x0]
str s3, [x0, #2*\w]
str s31, [x0, #2*\w+4]
.if \ret
ret
.else
add x0, x0, #2*\stride
b 3f
.endif
2:
// !CDEF_HAVE_LEFT
tst w6, #2 // CDEF_HAVE_RIGHT
b.eq 1f
// !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
ldr \reg\()0, [\s1]
ldr s1, [\s1, #2*\w]
ldr \reg\()2, [\s2]
ldr s3, [\s2, #2*\w]
str s31, [x0]
stur \reg\()0, [x0, #4]
str s1, [x0, #4+2*\w]
add x0, x0, #2*\stride
str s31, [x0]
stur \reg\()2, [x0, #4]
str s3, [x0, #4+2*\w]
.if \ret
ret
.else
add x0, x0, #2*\stride
b 3f
.endif
1:
// !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
ldr \reg\()0, [\s1]
ldr \reg\()1, [\s2]
str s31, [x0]
stur \reg\()0, [x0, #4]
str s31, [x0, #4+2*\w]
add x0, x0, #2*\stride
str s31, [x0]
stur \reg\()1, [x0, #4]
str s31, [x0, #4+2*\w]
.if \ret
ret
.else
add x0, x0, #2*\stride
.endif
3:
.endm
.macro load_n_incr_16 dst, src, incr, w
.if \w == 4
ld1 {\dst\().4h}, [\src], \incr
.else
ld1 {\dst\().8h}, [\src], \incr
.endif
.endm
// void dav1d_cdef_paddingX_16bpc_neon(uint16_t *tmp, const pixel *src,
// ptrdiff_t src_stride, const pixel (*left)[2],
// const pixel *const top, int h,
// enum CdefEdgeFlags edges);
.macro padding_func_16 w, stride, reg
function cdef_padding\w\()_16bpc_neon, export=1
movi v30.8h, #0x80, lsl #8
mov v31.16b, v30.16b
sub x0, x0, #2*(2*\stride+2)
tst w6, #4 // CDEF_HAVE_TOP
b.ne 1f
// !CDEF_HAVE_TOP
st1 {v30.8h, v31.8h}, [x0], #32
.if \w == 8
st1 {v30.8h, v31.8h}, [x0], #32
.endif
b 3f
1:
// CDEF_HAVE_TOP
add x9, x4, x2
pad_top_bot_16 x4, x9, \w, \stride, \reg, 0
// Middle section
3:
tst w6, #1 // CDEF_HAVE_LEFT
b.eq 2f
// CDEF_HAVE_LEFT
tst w6, #2 // CDEF_HAVE_RIGHT
b.eq 1f
// CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
0:
ld1 {v0.s}[0], [x3], #4
ldr s2, [x1, #2*\w]
load_n_incr_16 v1, x1, x2, \w
subs w5, w5, #1
str s0, [x0]
stur \reg\()1, [x0, #4]
str s2, [x0, #4+2*\w]
add x0, x0, #2*\stride
b.gt 0b
b 3f
1:
// CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
ld1 {v0.s}[0], [x3], #4
load_n_incr_16 v1, x1, x2, \w
subs w5, w5, #1
str s0, [x0]
stur \reg\()1, [x0, #4]
str s31, [x0, #4+2*\w]
add x0, x0, #2*\stride
b.gt 1b
b 3f
2:
tst w6, #2 // CDEF_HAVE_RIGHT
b.eq 1f
// !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
0:
ldr s1, [x1, #2*\w]
load_n_incr_16 v0, x1, x2, \w
subs w5, w5, #1
str s31, [x0]
stur \reg\()0, [x0, #4]
str s1, [x0, #4+2*\w]
add x0, x0, #2*\stride
b.gt 0b
b 3f
1:
// !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
load_n_incr_16 v0, x1, x2, \w
subs w5, w5, #1
str s31, [x0]
stur \reg\()0, [x0, #4]
str s31, [x0, #4+2*\w]
add x0, x0, #2*\stride
b.gt 1b
3:
tst w6, #8 // CDEF_HAVE_BOTTOM
b.ne 1f
// !CDEF_HAVE_BOTTOM
st1 {v30.8h, v31.8h}, [x0], #32
.if \w == 8
st1 {v30.8h, v31.8h}, [x0], #32
.endif
ret
1:
// CDEF_HAVE_BOTTOM
add x9, x1, x2
pad_top_bot_16 x1, x9, \w, \stride, \reg, 1
endfunc
.endm
padding_func_16 8, 16, q
padding_func_16 4, 8, d
tables
filter 8, 16
filter 4, 16
find_dir 16
/*
* Copyright © 2018, VideoLAN and dav1d authors
* Copyright © 2020, Martin Storsjo
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "src/arm/asm.S"
#include "util.S"
.macro dir_table w, stride
const directions\w
.byte -1 * \stride + 1, -2 * \stride + 2
.byte 0 * \stride + 1, -1 * \stride + 2
.byte 0 * \stride + 1, 0 * \stride + 2
.byte 0 * \stride + 1, 1 * \stride + 2
.byte 1 * \stride + 1, 2 * \stride + 2
.byte 1 * \stride + 0, 2 * \stride + 1
.byte 1 * \stride + 0, 2 * \stride + 0
.byte 1 * \stride + 0, 2 * \stride - 1
// Repeated, to avoid & 7
.byte -1 * \stride + 1, -2 * \stride + 2
.byte 0 * \stride + 1, -1 * \stride + 2
.byte 0 * \stride + 1, 0 * \stride + 2
.byte 0 * \stride + 1, 1 * \stride + 2
.byte 1 * \stride + 1, 2 * \stride + 2
.byte 1 * \stride + 0, 2 * \stride + 1
endconst
.endm
.macro tables
dir_table 8, 16
dir_table 4, 8
const pri_taps
.byte 4, 2, 3, 3
endconst
.endm
.macro load_px d1, d2, w
.if \w == 8
add x6, x2, w9, sxtb #1 // x + off
sub x9, x2, w9, sxtb #1 // x - off
ld1 {\d1\().8h}, [x6] // p0
ld1 {\d2\().8h}, [x9] // p1
.else
add x6, x2, w9, sxtb #1 // x + off
sub x9, x2, w9, sxtb #1 // x - off
ld1 {\d1\().4h}, [x6] // p0
add x6, x6, #2*8 // += stride
ld1 {\d2\().4h}, [x9] // p1
add x9, x9, #2*8 // += stride
ld1 {\d1\().d}[1], [x6] // p0
ld1 {\d2\().d}[1], [x9] // p1
.endif
.endm
.macro handle_pixel s1, s2, thresh_vec, shift, tap, min
.if \min
umin v2.8h, v2.8h, \s1\().8h
smax v3.8h, v3.8h, \s1\().8h
umin v2.8h, v2.8h, \s2\().8h
smax v3.8h, v3.8h, \s2\().8h
.endif
uabd v16.8h, v0.8h, \s1\().8h // abs(diff)
uabd v20.8h, v0.8h, \s2\().8h // abs(diff)
ushl v17.8h, v16.8h, \shift // abs(diff) >> shift
ushl v21.8h, v20.8h, \shift // abs(diff) >> shift
uqsub v17.8h, \thresh_vec, v17.8h // clip = imax(0, threshold - (abs(diff) >> shift))
uqsub v21.8h, \thresh_vec, v21.8h // clip = imax(0, threshold - (abs(diff) >> shift))
sub v18.8h, \s1\().8h, v0.8h // diff = p0 - px
sub v22.8h, \s2\().8h, v0.8h // diff = p1 - px
neg v16.8h, v17.8h // -clip
neg v20.8h, v21.8h // -clip
smin v18.8h, v18.8h, v17.8h // imin(diff, clip)
smin v22.8h, v22.8h, v21.8h // imin(diff, clip)
dup v19.8h, \tap // taps[k]
smax v18.8h, v18.8h, v16.8h // constrain() = imax(imin(diff, clip), -clip)
smax v22.8h, v22.8h, v20.8h // constrain() = imax(imin(diff, clip), -clip)
mla v1.8h, v18.8h, v19.8h // sum += taps[k] * constrain()
mla v1.8h, v22.8h, v19.8h // sum += taps[k] * constrain()
.endm
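// The handle_pixel macro above is the CDEF constrain() step for one pair of
// taps; roughly, per tap (illustrative sketch, not from the source):
//   int clip = imax(0, threshold - (abs(p - px) >> shift));
//   sum += taps[k] * iclip(p - px, -clip, clip);
// The shift register actually holds -shift, so the ushl acts as a right
// shift, and uqsub provides the saturating imax(0, ...).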
// void dav1d_cdef_filterX_Ybpc_neon(pixel *dst, ptrdiff_t dst_stride,
// const uint16_t *tmp, int pri_strength,
// int sec_strength, int dir, int damping,
// int h, size_t edges);
.macro filter_func w, bpc, pri, sec, min, suffix
function cdef_filter\w\suffix\()_\bpc\()bpc_neon
.if \bpc == 8
ldr w8, [sp] // edges
cmp w8, #0xf
b.eq cdef_filter\w\suffix\()_edged_8bpc_neon
.endif
.if \pri
.if \bpc == 16
ldr w9, [sp, #8] // bitdepth_max
clz w9, w9
sub w9, w9, #24 // -bitdepth_min_8
neg w9, w9 // bitdepth_min_8
.endif
movrel x8, pri_taps
.if \bpc == 16
lsr w9, w3, w9 // pri_strength >> bitdepth_min_8
and w9, w9, #1 // (pri_strength >> bitdepth_min_8) & 1
.else
and w9, w3, #1
.endif
add x8, x8, w9, uxtw #1
.endif
movrel x9, directions\w
add x5, x9, w5, uxtw #1
movi v30.4h, #15
dup v28.4h, w6 // damping
.if \pri
dup v25.8h, w3 // threshold
.endif
.if \sec
dup v27.8h, w4 // threshold
.endif
trn1 v24.4h, v25.4h, v27.4h
clz v24.4h, v24.4h // clz(threshold)
sub v24.4h, v30.4h, v24.4h // ulog2(threshold)
uqsub v24.4h, v28.4h, v24.4h // shift = imax(0, damping - ulog2(threshold))
neg v24.4h, v24.4h // -shift
.if \sec
dup v26.8h, v24.h[1]
.endif
.if \pri
dup v24.8h, v24.h[0]
.endif
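// v24/v26 now hold -shift for the primary/secondary strength (when enabled),
// where shift = imax(0, damping - ulog2(strength)): ulog2() is 15 - clz() on
// the 16-bit lanes, uqsub gives the saturating imax(0, ...), and the negated
// value lets ushl in handle_pixel perform the variable right shift.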
1:
.if \w == 8
ld1 {v0.8h}, [x2] // px
.else
add x12, x2, #2*8
ld1 {v0.4h}, [x2] // px
ld1 {v0.d}[1], [x12] // px
.endif
movi v1.8h, #0 // sum
.if \min
mov v2.16b, v0.16b // min
mov v3.16b, v0.16b // max
.endif
// Instead of loading sec_taps 2, 1 from memory, just set it
// to 2 initially and decrease for the second round.
// This is also used as loop counter.
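// (In the reference C code, sec_taps[] is the constant { 2, 1 }; each of the
// two iterations handles one primary offset and two secondary offsets.)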
mov w11, #2 // sec_taps[0]
2:
.if \pri
ldrb w9, [x5] // off1
load_px v4, v5, \w
.endif
.if \sec
add x5, x5, #4 // +2*2
ldrb w9, [x5] // off2
load_px v6, v7, \w
.endif
.if \pri
ldrb w10, [x8] // *pri_taps
handle_pixel v4, v5, v25.8h, v24.8h, w10, \min
.endif
.if \sec
add x5, x5, #8 // +2*4
ldrb w9, [x5] // off3
load_px v4, v5, \w
handle_pixel v6, v7, v27.8h, v26.8h, w11, \min
handle_pixel v4, v5, v27.8h, v26.8h, w11, \min
sub x5, x5, #11 // x5 -= 2*(2+4); x5 += 1;
.else
add x5, x5, #1 // x5 += 1
.endif
subs w11, w11, #1 // sec_tap-- (value)
.if \pri
add x8, x8, #1 // pri_taps++ (pointer)
.endif
b.ne 2b
sshr v4.8h, v1.8h, #15 // -(sum < 0)
add v1.8h, v1.8h, v4.8h // sum - (sum < 0)
srshr v1.8h, v1.8h, #4 // (8 + sum - (sum < 0)) >> 4
add v0.8h, v0.8h, v1.8h // px + (8 + sum ...) >> 4
.if \min
smin v0.8h, v0.8h, v3.8h
smax v0.8h, v0.8h, v2.8h // iclip(px + .., min, max)
.endif
.if \bpc == 8
xtn v0.8b, v0.8h
.endif
.if \w == 8
add x2, x2, #2*16 // tmp += tmp_stride
subs w7, w7, #1 // h--
.if \bpc == 8
st1 {v0.8b}, [x0], x1
.else
st1 {v0.8h}, [x0], x1
.endif
.else
.if \bpc == 8
st1 {v0.s}[0], [x0], x1
.else
st1 {v0.d}[0], [x0], x1
.endif
add x2, x2, #2*16 // tmp += 2*tmp_stride
subs w7, w7, #2 // h -= 2
.if \bpc == 8
st1 {v0.s}[1], [x0], x1
.else
st1 {v0.d}[1], [x0], x1
.endif
.endif
// Reset pri_taps and directions back to the original point
sub x5, x5, #2
.if \pri
sub x8, x8, #2
.endif
b.gt 1b
ret
endfunc
.endm
.macro filter w, bpc
filter_func \w, \bpc, pri=1, sec=0, min=0, suffix=_pri
filter_func \w, \bpc, pri=0, sec=1, min=0, suffix=_sec
filter_func \w, \bpc, pri=1, sec=1, min=1, suffix=_pri_sec
function cdef_filter\w\()_\bpc\()bpc_neon, export=1
cbnz w3, 1f // pri_strength
b cdef_filter\w\()_sec_\bpc\()bpc_neon // only sec
1:
cbnz w4, 1f // sec_strength
b cdef_filter\w\()_pri_\bpc\()bpc_neon // only pri
1:
b cdef_filter\w\()_pri_sec_\bpc\()bpc_neon // both pri and sec
endfunc
.endm
const div_table
.short 840, 420, 280, 210, 168, 140, 120, 105
endconst
const alt_fact
.short 420, 210, 140, 105, 105, 105, 105, 105, 140, 210, 420, 0
endconst
.macro cost_alt d1, d2, s1, s2, s3, s4
smull v22.4s, \s1\().4h, \s1\().4h // sum_alt[n]*sum_alt[n]
smull2 v23.4s, \s1\().8h, \s1\().8h
smull v24.4s, \s2\().4h, \s2\().4h
smull v25.4s, \s3\().4h, \s3\().4h // sum_alt[n]*sum_alt[n]
smull2 v26.4s, \s3\().8h, \s3\().8h
smull v27.4s, \s4\().4h, \s4\().4h
mul v22.4s, v22.4s, v29.4s // sum_alt[n]^2*fact
mla v22.4s, v23.4s, v30.4s
mla v22.4s, v24.4s, v31.4s
mul v25.4s, v25.4s, v29.4s // sum_alt[n]^2*fact
mla v25.4s, v26.4s, v30.4s
mla v25.4s, v27.4s, v31.4s
addv \d1, v22.4s // *cost_ptr
addv \d2, v25.4s // *cost_ptr
.endm
.macro find_best s1, s2, s3
.ifnb \s2
mov w5, \s2\().s[0]
.endif
cmp w4, w1 // cost[n] > best_cost
csel w0, w3, w0, gt // best_dir = n
csel w1, w4, w1, gt // best_cost = cost[n]
.ifnb \s2
add w3, w3, #1 // n++
cmp w5, w1 // cost[n] > best_cost
mov w4, \s3\().s[0]
csel w0, w3, w0, gt // best_dir = n
csel w1, w5, w1, gt // best_cost = cost[n]
add w3, w3, #1 // n++
.endif
.endm
// int dav1d_cdef_find_dir_Xbpc_neon(const pixel *img, const ptrdiff_t stride,
// unsigned *const var)
.macro find_dir bpc
function cdef_find_dir_\bpc\()bpc_neon, export=1
.if \bpc == 16
str d8, [sp, #-0x10]!
clz w3, w3 // clz(bitdepth_max)
sub w3, w3, #24 // -bitdepth_min_8
dup v8.8h, w3
.endif
sub sp, sp, #32 // cost
mov w3, #8
.if \bpc == 8
movi v31.16b, #128
.else
movi v31.8h, #128
.endif
movi v30.16b, #0
movi v1.8h, #0 // v0-v1 sum_diag[0]
movi v3.8h, #0 // v2-v3 sum_diag[1]
movi v5.8h, #0 // v4-v5 sum_hv[0-1]
movi v7.8h, #0 // v6-v7 sum_alt[0]
movi v17.8h, #0 // v16-v17 sum_alt[1]
movi v18.8h, #0 // v18-v19 sum_alt[2]
movi v19.8h, #0
movi v21.8h, #0 // v20-v21 sum_alt[3]
.irpc i, 01234567
.if \bpc == 8
ld1 {v26.8b}, [x0], x1
usubl v26.8h, v26.8b, v31.8b
.else
ld1 {v26.8h}, [x0], x1
ushl v26.8h, v26.8h, v8.8h
sub v26.8h, v26.8h, v31.8h
.endif
addv h25, v26.8h // [y]
rev64 v27.8h, v26.8h
addp v28.8h, v26.8h, v30.8h // [(x >> 1)]
add v5.8h, v5.8h, v26.8h // sum_hv[1]
ext v27.16b, v27.16b, v27.16b, #8 // [-x]
rev64 v29.4h, v28.4h // [-(x >> 1)]
ins v4.h[\i], v25.h[0] // sum_hv[0]
.if \i == 0
mov v0.16b, v26.16b // sum_diag[0]
mov v2.16b, v27.16b // sum_diag[1]
mov v6.16b, v28.16b // sum_alt[0]
mov v16.16b, v29.16b // sum_alt[1]
.else
ext v22.16b, v30.16b, v26.16b, #(16-2*\i)
ext v23.16b, v26.16b, v30.16b, #(16-2*\i)
ext v24.16b, v30.16b, v27.16b, #(16-2*\i)
ext v25.16b, v27.16b, v30.16b, #(16-2*\i)
add v0.8h, v0.8h, v22.8h // sum_diag[0]
add v1.8h, v1.8h, v23.8h // sum_diag[0]
add v2.8h, v2.8h, v24.8h // sum_diag[1]
add v3.8h, v3.8h, v25.8h // sum_diag[1]
ext v22.16b, v30.16b, v28.16b, #(16-2*\i)
ext v23.16b, v28.16b, v30.16b, #(16-2*\i)
ext v24.16b, v30.16b, v29.16b, #(16-2*\i)
ext v25.16b, v29.16b, v30.16b, #(16-2*\i)
add v6.8h, v6.8h, v22.8h // sum_alt[0]
add v7.4h, v7.4h, v23.4h // sum_alt[0]
add v16.8h, v16.8h, v24.8h // sum_alt[1]
add v17.4h, v17.4h, v25.4h // sum_alt[1]
.endif
.if \i < 6
ext v22.16b, v30.16b, v26.16b, #(16-2*(3-(\i/2)))
ext v23.16b, v26.16b, v30.16b, #(16-2*(3-(\i/2)))
add v18.8h, v18.8h, v22.8h // sum_alt[2]
add v19.4h, v19.4h, v23.4h // sum_alt[2]
.else
add v18.8h, v18.8h, v26.8h // sum_alt[2]
.endif
.if \i == 0
mov v20.16b, v26.16b // sum_alt[3]
.elseif \i == 1
add v20.8h, v20.8h, v26.8h // sum_alt[3]
.else
ext v24.16b, v30.16b, v26.16b, #(16-2*(\i/2))
ext v25.16b, v26.16b, v30.16b, #(16-2*(\i/2))
add v20.8h, v20.8h, v24.8h // sum_alt[3]
add v21.4h, v21.4h, v25.4h // sum_alt[3]
.endif
.endr
movi v31.4s, #105
smull v26.4s, v4.4h, v4.4h // sum_hv[0]*sum_hv[0]
smlal2 v26.4s, v4.8h, v4.8h
smull v27.4s, v5.4h, v5.4h // sum_hv[1]*sum_hv[1]
smlal2 v27.4s, v5.8h, v5.8h
mul v26.4s, v26.4s, v31.4s // cost[2] *= 105
mul v27.4s, v27.4s, v31.4s // cost[6] *= 105
addv s4, v26.4s // cost[2]
addv s5, v27.4s // cost[6]
rev64 v1.8h, v1.8h
rev64 v3.8h, v3.8h
ext v1.16b, v1.16b, v1.16b, #10 // sum_diag[0][14-n]
ext v3.16b, v3.16b, v3.16b, #10 // sum_diag[1][14-n]
str s4, [sp, #2*4] // cost[2]
str s5, [sp, #6*4] // cost[6]
movrel x4, div_table
ld1 {v31.8h}, [x4]
smull v22.4s, v0.4h, v0.4h // sum_diag[0]*sum_diag[0]
smull2 v23.4s, v0.8h, v0.8h
smlal v22.4s, v1.4h, v1.4h
smlal2 v23.4s, v1.8h, v1.8h
smull v24.4s, v2.4h, v2.4h // sum_diag[1]*sum_diag[1]
smull2 v25.4s, v2.8h, v2.8h
smlal v24.4s, v3.4h, v3.4h
smlal2 v25.4s, v3.8h, v3.8h
uxtl v30.4s, v31.4h // div_table
uxtl2 v31.4s, v31.8h
mul v22.4s, v22.4s, v30.4s // cost[0]
mla v22.4s, v23.4s, v31.4s // cost[0]
mul v24.4s, v24.4s, v30.4s // cost[4]
mla v24.4s, v25.4s, v31.4s // cost[4]
addv s0, v22.4s // cost[0]
addv s2, v24.4s // cost[4]
movrel x5, alt_fact
ld1 {v29.4h, v30.4h, v31.4h}, [x5] // div_table[2*m+1] + 105
str s0, [sp, #0*4] // cost[0]
str s2, [sp, #4*4] // cost[4]
uxtl v29.4s, v29.4h // div_table[2*m+1] + 105
uxtl v30.4s, v30.4h
uxtl v31.4s, v31.4h
cost_alt s6, s16, v6, v7, v16, v17 // cost[1], cost[3]
cost_alt s18, s20, v18, v19, v20, v21 // cost[5], cost[7]
str s6, [sp, #1*4] // cost[1]
str s16, [sp, #3*4] // cost[3]
mov w0, #0 // best_dir
mov w1, v0.s[0] // best_cost
mov w3, #1 // n
str s18, [sp, #5*4] // cost[5]
str s20, [sp, #7*4] // cost[7]
mov w4, v6.s[0]
find_best v6, v4, v16
find_best v16, v2, v18
find_best v18, v5, v20
find_best v20
eor w3, w0, #4 // best_dir ^4
ldr w4, [sp, w3, uxtw #2]
sub w1, w1, w4 // best_cost - cost[best_dir ^ 4]
lsr w1, w1, #10
str w1, [x2] // *var
add sp, sp, #32
.if \bpc == 16
ldr d8, [sp], 0x10
.endif
ret
endfunc
.endm
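// Summary of the direction search above (descriptive, not from the source):
// eight directional costs are accumulated into cost[0..7] on the stack, the
// direction with the largest cost becomes best_dir, and the variance output
// is *var = (best_cost - cost[best_dir ^ 4]) >> 10, i.e. the gap between the
// best direction and the one orthogonal to it.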
......@@ -161,31 +161,6 @@ endconst
.endif
.endm
.macro scale_wide sz, c, r0, r1, r2 r3, r4, r5, r6, r7
smull_sz v2, v3, \r0, \c, \sz
smull_sz v4, v5, \r1, \c, \sz
smull_sz v6, v7, \r2, \c, \sz
rshrn_sz \r0, v2, v3, #12, \sz
smull_sz v2, v3, \r3, \c, \sz
rshrn_sz \r1, v4, v5, #12, \sz
.ifnb \r4
smull_sz v4, v5, \r4, \c, \sz
.endif
rshrn_sz \r2, v6, v7, #12, \sz
.ifnb \r4
smull_sz v6, v7, \r5, \c, \sz
.endif
rshrn_sz \r3, v2, v3, #12, \sz
.ifnb \r4
smull_sz v2, v3, \r6, \c, \sz
rshrn_sz \r4, v4, v5, #12, \sz
smull_sz v4, v5, \r7, \c, \sz
rshrn_sz \r5, v6, v7, #12, \sz
rshrn_sz \r6, v2, v3, #12, \sz
rshrn_sz \r7, v4, v5, #12, \sz
.endif
.endm
.macro load_add_store load, shift, addsrc, adddst, narrowsrc, narrowdst, store, dst, src, shiftbits=4
.ifnb \load
ld1 {\load}, [\src], x1
......@@ -599,41 +574,40 @@ function inv_flipadst_8x4_neon
endfunc
function inv_identity_4x4_neon
mov w16, #5793
mov w16, #(5793-4096)*8
dup v0.4h, w16
smull v4.4s, v16.4h, v0.h[0]
smull v5.4s, v17.4h, v0.h[0]
smull v6.4s, v18.4h, v0.h[0]
smull v7.4s, v19.4h, v0.h[0]
rshrn v16.4h, v4.4s, #12
rshrn v17.4h, v5.4s, #12
rshrn v18.4h, v6.4s, #12
rshrn v19.4h, v7.4s, #12
sqrdmulh v4.4h, v16.4h, v0.h[0]
sqrdmulh v5.4h, v17.4h, v0.h[0]
sqrdmulh v6.4h, v18.4h, v0.h[0]
sqrdmulh v7.4h, v19.4h, v0.h[0]
sqadd v16.4h, v16.4h, v4.4h
sqadd v17.4h, v17.4h, v5.4h
sqadd v18.4h, v18.4h, v6.4h
sqadd v19.4h, v19.4h, v7.4h
ret
endfunc
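// The constant (5793-4096)*8 encodes the identity-transform scale 5793/4096
// (~sqrt(2)) for sqrdmulh: sqrdmulh(x, (5793-4096)*8) == (x*1697 + 2048) >> 12,
// so x + sqrdmulh(...) reproduces the old widening smull/rshrn result
// (x*5793 + 2048) >> 12 without 32-bit intermediates, with sqadd providing
// saturation (illustrative derivation, not from the source).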
function inv_identity_8x4_neon
mov w16, #5793
mov w16, #(5793-4096)*8
dup v0.4h, w16
smull v2.4s, v16.4h, v0.h[0]
smull2 v3.4s, v16.8h, v0.h[0]
smull v4.4s, v17.4h, v0.h[0]
smull2 v5.4s, v17.8h, v0.h[0]
rshrn v16.4h, v2.4s, #12
rshrn2 v16.8h, v3.4s, #12
smull v6.4s, v18.4h, v0.h[0]
smull2 v7.4s, v18.8h, v0.h[0]
rshrn v17.4h, v4.4s, #12
rshrn2 v17.8h, v5.4s, #12
smull v2.4s, v19.4h, v0.h[0]
smull2 v3.4s, v19.8h, v0.h[0]
rshrn v18.4h, v6.4s, #12
rshrn2 v18.8h, v7.4s, #12
rshrn v19.4h, v2.4s, #12
rshrn2 v19.8h, v3.4s, #12
sqrdmulh v4.8h, v16.8h, v0.h[0]
sqrdmulh v5.8h, v17.8h, v0.h[0]
sqrdmulh v6.8h, v18.8h, v0.h[0]
sqrdmulh v7.8h, v19.8h, v0.h[0]
sqadd v16.8h, v16.8h, v4.8h
sqadd v17.8h, v17.8h, v5.8h
sqadd v18.8h, v18.8h, v6.8h
sqadd v19.8h, v19.8h, v7.8h
ret
endfunc
.macro identity_8x4_shift1 r0, r1, r2, r3, c
.irp i, \r0\().8h, \r1\().8h, \r2\().8h, \r3\().8h
sqrdmulh v2.8h, \i, \c
srhadd \i, \i, v2.8h
.endr
.endm
function inv_txfm_add_wht_wht_4x4_neon, export=1
mov x15, x30
movi v31.8h, #0
......@@ -877,30 +851,31 @@ function inv_flipadst_4x8_neon
endfunc
function inv_identity_8x8_neon
shl v16.8h, v16.8h, #1
shl v17.8h, v17.8h, #1
shl v18.8h, v18.8h, #1
shl v19.8h, v19.8h, #1
shl v20.8h, v20.8h, #1
shl v21.8h, v21.8h, #1
shl v22.8h, v22.8h, #1
shl v23.8h, v23.8h, #1
sqshl v16.8h, v16.8h, #1
sqshl v17.8h, v17.8h, #1
sqshl v18.8h, v18.8h, #1
sqshl v19.8h, v19.8h, #1
sqshl v20.8h, v20.8h, #1
sqshl v21.8h, v21.8h, #1
sqshl v22.8h, v22.8h, #1
sqshl v23.8h, v23.8h, #1
ret
endfunc
function inv_identity_4x8_neon
shl v16.4h, v16.4h, #1
shl v17.4h, v17.4h, #1
shl v18.4h, v18.4h, #1
shl v19.4h, v19.4h, #1
shl v20.4h, v20.4h, #1
shl v21.4h, v21.4h, #1
shl v22.4h, v22.4h, #1
shl v23.4h, v23.4h, #1
sqshl v16.4h, v16.4h, #1
sqshl v17.4h, v17.4h, #1
sqshl v18.4h, v18.4h, #1
sqshl v19.4h, v19.4h, #1
sqshl v20.4h, v20.4h, #1
sqshl v21.4h, v21.4h, #1
sqshl v22.4h, v22.4h, #1
sqshl v23.4h, v23.4h, #1
ret
endfunc
function inv_txfm_add_8x8_neon
.macro def_fn_8x8_base variant
function inv_txfm_\variant\()add_8x8_neon
movi v28.8h, #0
movi v29.8h, #0
movi v30.8h, #0
......@@ -910,6 +885,9 @@ function inv_txfm_add_8x8_neon
ld1 {v20.8h,v21.8h,v22.8h,v23.8h}, [x2]
st1 {v28.8h,v29.8h,v30.8h,v31.8h}, [x2]
.ifc \variant, identity_
// The identity shl #1 and downshift srshr #1 cancel out
.else
blr x4
srshr v16.8h, v16.8h, #1
......@@ -920,6 +898,7 @@ function inv_txfm_add_8x8_neon
srshr v21.8h, v21.8h, #1
srshr v22.8h, v22.8h, #1
srshr v23.8h, v23.8h, #1
.endif
transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v24, v25
......@@ -928,6 +907,10 @@ function inv_txfm_add_8x8_neon
load_add_store_8x8 x0, x7
br x15
endfunc
.endm
def_fn_8x8_base
def_fn_8x8_base identity_
.macro def_fn_8x8 txfm1, txfm2
function inv_txfm_add_\txfm1\()_\txfm2\()_8x8_neon, export=1
......@@ -936,9 +919,13 @@ function inv_txfm_add_\txfm1\()_\txfm2\()_8x8_neon, export=1
.ifc \txfm1\()_\txfm2, dct_dct
idct_dc 8, 8, 1
.endif
adr x4, inv_\txfm1\()_8x8_neon
adr x5, inv_\txfm2\()_8x8_neon
.ifc \txfm1, identity
b inv_txfm_identity_add_8x8_neon
.else
adr x4, inv_\txfm1\()_8x8_neon
b inv_txfm_add_8x8_neon
.endif
endfunc
.endm
......@@ -1083,9 +1070,12 @@ def_fns_48 8, 4
rshrn_sz v27, v6, v7, #12, \sz // t14a
smull_smlsl v4, v5, v29, v23, v0.h[2], v0.h[3], \sz // -> t13a
neg v29\sz, v29\sz
smull_smlsl v6, v7, v29, v23, v0.h[3], v0.h[2], \sz // -> t10a
smull_smlal v6, v7, v29, v23, v0.h[3], v0.h[2], \sz // -> t10a
rshrn_sz v29, v4, v5, #12, \sz // t13a
neg v6.4s, v6.4s
.ifc \sz, .8h
neg v7.4s, v7.4s
.endif
rshrn_sz v23, v6, v7, #12, \sz // t10a
sqsub v2\sz, v17\sz, v19\sz // t11a
......@@ -1333,27 +1323,59 @@ function inv_flipadst_4x16_neon
endfunc
function inv_identity_8x16_neon
mov w16, #2*5793
mov w16, #2*(5793-4096)*8
dup v0.4h, w16
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
smull v2.4s, v\i\().4h, v0.h[0]
smull2 v3.4s, v\i\().8h, v0.h[0]
rshrn v\i\().4h, v2.4s, #12
rshrn2 v\i\().8h, v3.4s, #12
sqrdmulh v2.8h, v\i\().8h, v0.h[0]
sqadd v\i\().8h, v\i\().8h, v\i\().8h
sqadd v\i\().8h, v\i\().8h, v2.8h
.endr
ret
endfunc
function inv_identity_4x16_neon
mov w16, #2*5793
mov w16, #2*(5793-4096)*8
dup v0.4h, w16
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
smull v2.4s, v\i\().4h, v0.h[0]
rshrn v\i\().4h, v2.4s, #12
sqrdmulh v2.4h, v\i\().4h, v0.h[0]
sqadd v\i\().4h, v\i\().4h, v\i\().4h
sqadd v\i\().4h, v\i\().4h, v2.4h
.endr
ret
endfunc
.macro identity_8x16_shift2 c
.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h
sqrdmulh v2.8h, \i, \c
sshr v2.8h, v2.8h, #1
srhadd \i, \i, v2.8h
.endr
.endm
.macro identity_8x16_shift1 c
.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h
sqrdmulh v2.8h, \i, \c
srshr v2.8h, v2.8h, #1
sqadd \i, \i, v2.8h
.endr
.endm
.macro identity_8x8_shift1 c
.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
sqrdmulh v2.8h, \i, \c
srshr v2.8h, v2.8h, #1
sqadd \i, \i, v2.8h
.endr
.endm
.macro identity_8x8 c
.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
sqrdmulh v2.8h, \i, \c
sqadd \i, \i, \i
sqadd \i, \i, v2.8h
.endr
.endm
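// These identity_* helpers combine the same sqrdmulh-based 5793/4096 (or
// 2*5793/4096) scaling with the transform's final rounding shift (>>2, >>1
// or none), so no widening multiplies or separate shift passes are needed
// (descriptive note, not from the source).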
function inv_txfm_horz_16x8_neon
mov x14, x30
movi v7.8h, #0
......@@ -1375,6 +1397,26 @@ function inv_txfm_horz_16x8_neon
br x14
endfunc
function inv_txfm_horz_identity_16x8_neon
mov x14, x30
movi v7.8h, #0
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
ld1 {v\i\().8h}, [x7]
st1 {v7.8h}, [x7], x8
.endr
mov w16, #2*(5793-4096)*8
dup v0.4h, w16
identity_8x16_shift2 v0.h[0]
transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v4, v5
transpose_8x8h v24, v25, v26, v27, v28, v29, v30, v31, v4, v5
.irp i, 16, 24, 17, 25, 18, 26, 19, 27, 20, 28, 21, 29, 22, 30, 23, 31
st1 {v\i\().8h}, [x6], #16
.endr
br x14
endfunc
function inv_txfm_horz_scale_16x8_neon
mov x14, x30
movi v7.8h, #0
......@@ -1421,7 +1463,7 @@ function inv_txfm_add_16x16_neon
.endif
add x7, x2, #(\i*2)
mov x8, #16*2
bl inv_txfm_horz_16x8_neon
blr x9
.endr
b 2f
1:
......@@ -1449,7 +1491,12 @@ function inv_txfm_add_\txfm1\()_\txfm2\()_16x16_neon, export=1
.ifc \txfm1\()_\txfm2, dct_dct
idct_dc 16, 16, 2
.endif
.ifc \txfm1, identity
adr x9, inv_txfm_horz_identity_16x8_neon
.else
adr x9, inv_txfm_horz_16x8_neon
adr x4, inv_\txfm1\()_8x16_neon
.endif
adr x5, inv_\txfm2\()_8x16_neon
mov x13, #\eob_half
b inv_txfm_add_16x16_neon
......@@ -1469,12 +1516,35 @@ def_fn_16x16 flipadst, adst, 36
def_fn_16x16 flipadst, flipadst, 36
def_fn_16x16 identity, dct, 8
function inv_txfm_add_16x4_neon
.macro def_fn_416_base variant
function inv_txfm_\variant\()add_16x4_neon
mov x15, x30
movi v4.8h, #0
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
ld1 {v\i\().4h}, [x2]
.ifc \variant, identity_
.irp i, v16.4h, v17.4h, v18.4h, v19.4h
ld1 {\i}, [x2]
st1 {v4.4h}, [x2], #8
.endr
.irp i, v16.d, v17.d, v18.d, v19.d
ld1 {\i}[1], [x2]
st1 {v4.4h}, [x2], #8
.endr
mov w16, #2*(5793-4096)*8
dup v0.4h, w16
.irp i, v20.4h, v21.4h, v22.4h, v23.4h
ld1 {\i}, [x2]
st1 {v4.4h}, [x2], #8
.endr
.irp i, v20.d, v21.d, v22.d, v23.d
ld1 {\i}[1], [x2]
st1 {v4.4h}, [x2], #8
.endr
identity_8x16_shift1 v0.h[0]
.else
.irp i, v16.4h, v17.4h, v18.4h, v19.4h, v20.4h, v21.4h, v22.4h, v23.4h, v24.4h, v25.4h, v26.4h, v27.4h, v28.4h, v29.4h, v30.4h, v31.4h
ld1 {\i}, [x2]
st1 {v4.4h}, [x2], #8
.endr
......@@ -1484,14 +1554,21 @@ function inv_txfm_add_16x4_neon
ins v17.d[1], v21.d[0]
ins v18.d[1], v22.d[0]
ins v19.d[1], v23.d[0]
.irp i, 16, 17, 18, 19
srshr v\i\().8h, v\i\().8h, #1
.irp i, v16.8h, v17.8h, v18.8h, v19.8h
srshr \i, \i, #1
.endr
.endif
transpose_4x8h v16, v17, v18, v19, v2, v3, v4, v5
blr x5
mov x6, x0
load_add_store_8x4 x6, x7
.ifc \variant, identity_
mov v16.16b, v20.16b
mov v17.16b, v21.16b
mov v18.16b, v22.16b
mov v19.16b, v23.16b
.else
ins v24.d[1], v28.d[0]
ins v25.d[1], v29.d[0]
ins v26.d[1], v30.d[0]
......@@ -1500,6 +1577,7 @@ function inv_txfm_add_16x4_neon
srshr v17.8h, v25.8h, #1
srshr v18.8h, v26.8h, #1
srshr v19.8h, v27.8h, #1
.endif
transpose_4x8h v16, v17, v18, v19, v2, v3, v4, v5
blr x5
add x6, x0, #8
......@@ -1508,7 +1586,7 @@ function inv_txfm_add_16x4_neon
br x15
endfunc
function inv_txfm_add_4x16_neon
function inv_txfm_\variant\()add_4x16_neon
mov x15, x30
movi v2.8h, #0
......@@ -1517,8 +1595,17 @@ function inv_txfm_add_4x16_neon
b.lt 1f
add x6, x2, #16
.irp i, 16, 17, 18, 19
ld1 {v\i\().8h}, [x6]
.ifc \variant, identity_
.irp i, v24.8h, v25.8h, v26.8h, v27.8h
ld1 {\i}, [x6]
st1 {v2.8h}, [x6], x11
.endr
mov w16, #(5793-4096)*8
dup v0.4h, w16
identity_8x4_shift1 v24, v25, v26, v27, v0.h[0]
.else
.irp i, v16.8h, v17.8h, v18.8h, v19.8h
ld1 {\i}, [x6]
st1 {v2.8h}, [x6], x11
.endr
blr x4
......@@ -1526,6 +1613,7 @@ function inv_txfm_add_4x16_neon
srshr v25.8h, v17.8h, #1
srshr v26.8h, v18.8h, #1
srshr v27.8h, v19.8h, #1
.endif
transpose_4x8h v24, v25, v26, v27, v4, v5, v6, v7
ins v28.d[0], v24.d[1]
ins v29.d[0], v25.d[1]
......@@ -1534,19 +1622,25 @@ function inv_txfm_add_4x16_neon
b 2f
1:
.irp i, 24, 25, 26, 27, 28, 29, 30, 31
movi v\i\().4h, #0
.irp i, v24.4h, v25.4h, v26.4h, v27.4h, v28.4h, v29.4h, v30.4h, v31.4h
movi \i, #0
.endr
2:
movi v2.8h, #0
.irp i, 16, 17, 18, 19
ld1 {v\i\().8h}, [x2]
.irp i, v16.8h, v17.8h, v18.8h, v19.8h
ld1 {\i}, [x2]
st1 {v2.8h}, [x2], x11
.endr
.ifc \variant, identity_
mov w16, #(5793-4096)*8
dup v0.4h, w16
identity_8x4_shift1 v16, v17, v18, v19, v0.h[0]
.else
blr x4
.irp i, 16, 17, 18, 19
srshr v\i\().8h, v\i\().8h, #1
.irp i, v16.8h, v17.8h, v18.8h, v19.8h
srshr \i, \i, #1
.endr
.endif
transpose_4x8h v16, v17, v18, v19, v4, v5, v6, v7
ins v20.d[0], v16.d[1]
ins v21.d[0], v17.d[1]
......@@ -1559,6 +1653,10 @@ function inv_txfm_add_4x16_neon
br x15
endfunc
.endm
def_fn_416_base
def_fn_416_base identity_
.macro def_fn_416 w, h, txfm1, txfm2, eob_half
function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_neon, export=1
......@@ -1573,7 +1671,11 @@ function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_neon, export=1
adr x4, inv_\txfm1\()_4x\w\()_neon
adr x5, inv_\txfm2\()_8x\h\()_neon
.endif
.ifc \txfm1, identity
b inv_txfm_identity_add_\w\()x\h\()_neon
.else
b inv_txfm_add_\w\()x\h\()_neon
.endif
endfunc
.endm
......@@ -1600,24 +1702,31 @@ def_fns_416 4, 16
def_fns_416 16, 4
function inv_txfm_add_16x8_neon
.macro def_fn_816_base variant
function inv_txfm_\variant\()add_16x8_neon
mov x15, x30
movi v4.8h, #0
mov w16, #2896*8
dup v0.4h, w16
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
ld1 {v\i\().8h}, [x2]
.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h
ld1 {\i}, [x2]
st1 {v4.8h}, [x2], #16
.endr
scale_input .8h, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23
scale_input .8h, v0.h[0], v24, v25, v26, v27, v28, v29, v30, v31
.ifc \variant, identity_
mov w16, #2*(5793-4096)*8
dup v0.4h, w16
identity_8x16_shift1 v0.h[0]
.else
blr x4
.irp i, 16, 17, 18, 19, 20, 21, 22, 23
srshr v\i\().8h, v\i\().8h, #1
.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
srshr \i, \i, #1
.endr
.endif
transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
blr x5
......@@ -1625,6 +1734,16 @@ function inv_txfm_add_16x8_neon
mov x6, x0
load_add_store_8x8 x6, x7
.ifc \variant, identity_
mov v16.16b, v24.16b
mov v17.16b, v25.16b
mov v18.16b, v26.16b
mov v19.16b, v27.16b
mov v20.16b, v28.16b
mov v21.16b, v29.16b
mov v22.16b, v30.16b
mov v23.16b, v31.16b
.else
srshr v16.8h, v24.8h, #1
srshr v17.8h, v25.8h, #1
srshr v18.8h, v26.8h, #1
......@@ -1633,6 +1752,7 @@ function inv_txfm_add_16x8_neon
srshr v21.8h, v29.8h, #1
srshr v22.8h, v30.8h, #1
srshr v23.8h, v31.8h, #1
.endif
transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
......@@ -1644,7 +1764,7 @@ function inv_txfm_add_16x8_neon
br x15
endfunc
function inv_txfm_add_8x16_neon
function inv_txfm_\variant\()add_8x16_neon
mov x15, x30
movi v4.8h, #0
mov w16, #2896*8
......@@ -1655,8 +1775,16 @@ function inv_txfm_add_8x16_neon
b.lt 1f
add x6, x2, #16
.irp i, 16, 17, 18, 19, 20, 21, 22, 23
ld1 {v\i\().8h}, [x6]
.ifc \variant, identity_
.irp i, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h
ld1 {\i}, [x6]
st1 {v4.8h}, [x6], x11
.endr
scale_input .8h, v0.h[0], v24, v25, v26, v27, v28, v29, v30, v31
// The identity shl #1 and downshift srshr #1 cancel out
.else
.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
ld1 {\i}, [x6]
st1 {v4.8h}, [x6], x11
.endr
scale_input .8h, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23
......@@ -1670,13 +1798,14 @@ function inv_txfm_add_8x16_neon
srshr v29.8h, v21.8h, #1
srshr v30.8h, v22.8h, #1
srshr v31.8h, v23.8h, #1
.endif
transpose_8x8h v24, v25, v26, v27, v28, v29, v30, v31, v2, v3
b 2f
1:
.irp i, 24, 25, 26, 27, 28, 29, 30, 31
movi v\i\().8h, #0
.irp i, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h
movi \i, #0
.endr
2:
......@@ -1684,16 +1813,20 @@ function inv_txfm_add_8x16_neon
mov w16, #2896*8
dup v0.4h, w16
.irp i, 16, 17, 18, 19, 20, 21, 22, 23
ld1 {v\i\().8h}, [x2]
.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
ld1 {\i}, [x2]
st1 {v4.8h}, [x2], x11
.endr
scale_input .8h, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23
.ifc \variant, identity_
// The identity shl #1 and downshift srshr #1 cancel out
.else
blr x4
.irp i, 16, 17, 18, 19, 20, 21, 22, 23
srshr v\i\().8h, v\i\().8h, #1
.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
srshr \i, \i, #1
.endr
.endif
transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
......@@ -1703,6 +1836,10 @@ function inv_txfm_add_8x16_neon
br x15
endfunc
.endm
def_fn_816_base
def_fn_816_base identity_
.macro def_fn_816 w, h, txfm1, txfm2, eob_half
function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_neon, export=1
......@@ -1714,7 +1851,11 @@ function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_neon, export=1
.if \w == 8
mov x13, #\eob_half
.endif
.ifc \txfm1, identity
b inv_txfm_identity_add_\w\()x\h\()_neon
.else
b inv_txfm_add_\w\()x\h\()_neon
.endif
endfunc
.endm
......@@ -2120,7 +2261,7 @@ endfunc
.macro def_identity_1632 w, h, wshort, hshort
function inv_txfm_add_identity_identity_\w\()x\h\()_neon, export=1
mov w16, #2896*8
mov w17, #2*5793
mov w17, #2*(5793-4096)*8
dup v1.4h, w16
movi v0.8h, #0
mov v1.h[1], w17
......@@ -2140,12 +2281,11 @@ function inv_txfm_add_identity_identity_\w\()x\h\()_neon, export=1
.if \w == 16
// 16x32
scale_wide .8h, v1.h[1], v16, v17, v18, v19, v20, v21, v22, v23
shift_8_regs srshr, 1
identity_8x8_shift1 v1.h[1]
.else
// 32x16
shift_8_regs shl, 1
scale_wide .8h, v1.h[1], v16, v17, v18, v19, v20, v21, v22, v23
shift_8_regs sqshl, 1
identity_8x8 v1.h[1]
.endif
transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v4, v5
......
......@@ -151,8 +151,8 @@ function lpf_16_wd\wd\()_neon
movi v7.16b, #3
sqxtn v2.8b, v2.8h // f
sqxtn2 v2.16b, v3.8h
sqadd v4.16b, v6.16b, v2.16b // imin(f + 4, 128)
sqadd v5.16b, v7.16b, v2.16b // imin(f + 3, 128)
sqadd v4.16b, v6.16b, v2.16b // imin(f + 4, 127)
sqadd v5.16b, v7.16b, v2.16b // imin(f + 3, 127)
sshr v4.16b, v4.16b, #3 // f1
sshr v5.16b, v5.16b, #3 // f2
uxtl v2.8h, v23.8b // p0
......@@ -981,13 +981,13 @@ function lpf_h_16_16_neon
br x15
endfunc
// void dav1d_lpf_v_sb_y_neon(pixel *dst, const ptrdiff_t stride,
// const uint32_t *const vmask,
// const uint8_t (*l)[4], ptrdiff_t b4_stride,
// const Av1FilterLUT *lut, const int w)
// void dav1d_lpf_v_sb_y_8bpc_neon(pixel *dst, const ptrdiff_t stride,
// const uint32_t *const vmask,
// const uint8_t (*l)[4], ptrdiff_t b4_stride,
// const Av1FilterLUT *lut, const int w)
.macro lpf_func dir, type
function lpf_\dir\()_sb_\type\()_neon, export=1
function lpf_\dir\()_sb_\type\()_8bpc_neon, export=1
mov x11, x30
stp d8, d9, [sp, #-0x40]!
stp d10, d11, [sp, #0x10]
......
/*
* Copyright © 2018, VideoLAN and dav1d authors
* Copyright © 2020, Martin Storsjo
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "src/arm/asm.S"
#include "util.S"
.macro loop_filter wd
function lpf_8_wd\wd\()_neon
uabd v0.8h, v22.8h, v23.8h // abs(p1 - p0)
uabd v1.8h, v25.8h, v24.8h // abs(q1 - q0)
uabd v2.8h, v23.8h, v24.8h // abs(p0 - q0)
uabd v3.8h, v22.8h, v25.8h // abs(p1 - q1)
.if \wd >= 6
uabd v4.8h, v21.8h, v22.8h // abs(p2 - p1)
uabd v5.8h, v26.8h, v25.8h // abs(q2 - q1)
.endif
.if \wd >= 8
uabd v6.8h, v20.8h, v21.8h // abs(p3 - p2)
uabd v7.8h, v27.8h, v26.8h // abs(q3 - q2)
.endif
.if \wd >= 6
umax v4.8h, v4.8h, v5.8h
.endif
uqadd v2.8h, v2.8h, v2.8h // abs(p0 - q0) * 2
.if \wd >= 8
umax v6.8h, v6.8h, v7.8h
.endif
ushr v3.8h, v3.8h, #1
.if \wd >= 8
umax v4.8h, v4.8h, v6.8h
.endif
.if \wd >= 6
and v4.16b, v4.16b, v14.16b
.endif
umax v0.8h, v0.8h, v1.8h // max(abs(p1 - p0), abs(q1 - q0))
uqadd v2.8h, v2.8h, v3.8h // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1
.if \wd >= 6
umax v4.8h, v0.8h, v4.8h
cmhs v1.8h, v11.8h, v4.8h // max(abs(p1 - p0), abs(q1 - q0), abs(), abs(), ...) <= I
.else
cmhs v1.8h, v11.8h, v0.8h // max(abs(p1 - p0), abs(q1 - q0)) <= I
.endif
cmhs v2.8h, v10.8h, v2.8h // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1 <= E
and v1.16b, v1.16b, v2.16b // fm
and v1.16b, v1.16b, v13.16b // fm && wd >= 4
.if \wd >= 6
and v14.16b, v14.16b, v1.16b // fm && wd > 4
.endif
.if \wd >= 16
and v15.16b, v15.16b, v1.16b // fm && wd == 16
.endif
mov x16, v1.d[0]
mov x17, v1.d[1]
adds x16, x16, x17
b.eq 9f // if (!fm || wd < 4) return;
.if \wd >= 6
movi v10.8h, #1
uabd v2.8h, v21.8h, v23.8h // abs(p2 - p0)
uabd v3.8h, v22.8h, v23.8h // abs(p1 - p0)
uabd v4.8h, v25.8h, v24.8h // abs(q1 - q0)
uabd v5.8h, v26.8h, v24.8h // abs(q2 - q0)
dup v9.8h, w9 // bitdepth_min_8
.if \wd >= 8
uabd v6.8h, v20.8h, v23.8h // abs(p3 - p0)
uabd v7.8h, v27.8h, v24.8h // abs(q3 - q0)
.endif
umax v2.8h, v2.8h, v3.8h
umax v4.8h, v4.8h, v5.8h
.if \wd >= 8
umax v6.8h, v6.8h, v7.8h
.endif
umax v2.8h, v2.8h, v4.8h
ushl v10.8h, v10.8h, v9.8h // F = 1 << bitdepth_min_8
.if \wd >= 8
umax v2.8h, v2.8h, v6.8h
.endif
.if \wd == 16
uabd v3.8h, v17.8h, v23.8h // abs(p6 - p0)
uabd v4.8h, v18.8h, v23.8h // abs(p5 - p0)
uabd v5.8h, v19.8h, v23.8h // abs(p4 - p0)
.endif
cmhs v2.8h, v10.8h, v2.8h // flat8in
.if \wd == 16
uabd v6.8h, v28.8h, v24.8h // abs(q4 - q0)
uabd v7.8h, v29.8h, v24.8h // abs(q5 - q0)
uabd v8.8h, v30.8h, v24.8h // abs(q6 - q0)
.endif
and v14.16b, v2.16b, v14.16b // flat8in && fm && wd > 4
bic v1.16b, v1.16b, v14.16b // fm && wd >= 4 && !flat8in
.if \wd == 16
umax v3.8h, v3.8h, v4.8h
umax v5.8h, v5.8h, v6.8h
.endif
mov x16, v1.d[0]
mov x17, v1.d[1]
.if \wd == 16
umax v7.8h, v7.8h, v8.8h
umax v3.8h, v3.8h, v5.8h
umax v3.8h, v3.8h, v7.8h
cmhs v3.8h, v10.8h, v3.8h // flat8out
.endif
adds x16, x16, x17
.if \wd == 16
and v15.16b, v15.16b, v3.16b // flat8out && fm && wd == 16
and v15.16b, v15.16b, v14.16b // flat8out && flat8in && fm && wd == 16
bic v14.16b, v14.16b, v15.16b // flat8in && fm && wd >= 4 && !flat8out
.endif
b.eq 1f // skip wd == 4 case
.endif
dup v3.8h, w8 // bitdepth_max
sub v2.8h, v22.8h, v25.8h // p1 - q1
ushr v3.8h, v3.8h, #1 // 128 << bitdepth_min_8 - 1
cmhi v0.8h, v0.8h, v12.8h // hev
not v9.16b, v3.16b // - 128 * (1 << bitdepth_min_8)
smin v2.8h, v2.8h, v3.8h // iclip_diff(p1 - q1)
smax v2.8h, v2.8h, v9.8h // iclip_diff(p1 - q1)
and v4.16b, v2.16b, v0.16b // if (hev) iclip_diff(p1 - q1)
sub v2.8h, v24.8h, v23.8h
movi v5.8h, #3
bic v0.16b, v1.16b, v0.16b // (fm && wd >= 4 && !hev)
mul v2.8h, v2.8h, v5.8h
movi v6.8h, #4
add v2.8h, v2.8h, v4.8h
smin v2.8h, v2.8h, v3.8h // f = iclip_diff()
movi v7.8h, #3
smax v2.8h, v2.8h, v9.8h // f = iclip_diff()
sqadd v4.8h, v6.8h, v2.8h // f + 4
sqadd v5.8h, v7.8h, v2.8h // f + 3
smin v4.8h, v4.8h, v3.8h // imin(f + 4, 128 << bitdepth_min_8 - 1)
smin v5.8h, v5.8h, v3.8h // imin(f + 3, 128 << bitdepth_min_8 - 1)
sshr v4.8h, v4.8h, #3 // f1
sshr v5.8h, v5.8h, #3 // f2
movi v9.8h, #0
dup v3.8h, w8 // bitdepth_max
sqadd v2.8h, v23.8h, v5.8h // p0 + f2
sqsub v6.8h, v24.8h, v4.8h // q0 - f1
srshr v4.8h, v4.8h, #1 // (f1 + 1) >> 1
smin v2.8h, v2.8h, v3.8h // out p0 = iclip_pixel()
smin v6.8h, v6.8h, v3.8h // out q0 = iclip_pixel()
smax v2.8h, v2.8h, v9.8h // out p0 = iclip_pixel()
smax v6.8h, v6.8h, v9.8h // out q0 = iclip_pixel()
bit v23.16b, v2.16b, v1.16b // if (fm && wd >= 4)
bit v24.16b, v6.16b, v1.16b // if (fm && wd >= 4)
sqadd v2.8h, v22.8h, v4.8h // p1 + f
sqsub v6.8h, v25.8h, v4.8h // q1 - f
smin v2.8h, v2.8h, v3.8h // out p1 = iclip_pixel()
smin v6.8h, v6.8h, v3.8h // out q1 = iclip_pixel()
smax v2.8h, v2.8h, v9.8h // out p1 = iclip_pixel()
smax v6.8h, v6.8h, v9.8h // out q1 = iclip_pixel()
bit v22.16b, v2.16b, v0.16b // if (fm && wd >= 4 && !hev)
bit v25.16b, v6.16b, v0.16b // if (fm && wd >= 4 && !hev)
1:
.if \wd == 6
mov x16, v14.d[0]
mov x17, v14.d[1]
adds x16, x16, x17
b.eq 2f // skip if there's no flat8in
add v0.8h, v21.8h, v21.8h // p2 * 2
add v2.8h, v21.8h, v22.8h // p2 + p1
add v4.8h, v22.8h, v23.8h // p1 + p0
add v6.8h, v23.8h, v24.8h // p0 + q0
add v8.8h, v0.8h, v2.8h
add v10.8h, v4.8h, v6.8h
add v12.8h, v24.8h, v25.8h // q0 + q1
add v8.8h, v8.8h, v10.8h
sub v12.8h, v12.8h, v0.8h
add v10.8h, v25.8h, v26.8h // q1 + q2
urshr v0.8h, v8.8h, #3 // out p1
add v8.8h, v8.8h, v12.8h
sub v10.8h, v10.8h, v2.8h
add v12.8h, v26.8h, v26.8h // q2 + q2
urshr v1.8h, v8.8h, #3 // out p0
add v8.8h, v8.8h, v10.8h
sub v12.8h, v12.8h, v4.8h
urshr v2.8h, v8.8h, #3 // out q0
bit v22.16b, v0.16b, v14.16b // p1 if (flat8in)
add v8.8h, v8.8h, v12.8h
bit v23.16b, v1.16b, v14.16b // p0 if (flat8in)
urshr v3.8h, v8.8h, #3 // out q1
bit v24.16b, v2.16b, v14.16b // q0 if (flat8in)
bit v25.16b, v3.16b, v14.16b // q1 if (flat8in)
.elseif \wd >= 8
mov x16, v14.d[0]
mov x17, v14.d[1]
adds x16, x16, x17
.if \wd == 8
b.eq 8f // skip if there's no flat8in
.else
b.eq 2f // skip if there's no flat8in
.endif
add v0.8h, v20.8h, v21.8h // p3 + p2
add v2.8h, v22.8h, v25.8h // p1 + q1
add v4.8h, v20.8h, v22.8h // p3 + p1
add v6.8h, v23.8h, v26.8h // p0 + q2
add v8.8h, v0.8h, v0.8h // 2 * (p3 + p2)
add v9.8h, v23.8h, v24.8h // p0 + q0
add v8.8h, v8.8h, v4.8h // + p3 + p1
sub v2.8h, v2.8h, v0.8h // p1 + q1 - p3 - p2
add v8.8h, v8.8h, v9.8h // + p0 + q0
sub v6.8h, v6.8h, v4.8h // p0 + q2 - p3 - p1
urshr v10.8h, v8.8h, #3 // out p2
add v8.8h, v8.8h, v2.8h
add v0.8h, v20.8h, v23.8h // p3 + p0
add v2.8h, v24.8h, v27.8h // q0 + q3
urshr v11.8h, v8.8h, #3 // out p1
add v8.8h, v8.8h, v6.8h
sub v2.8h, v2.8h, v0.8h // q0 + q3 - p3 - p0
add v4.8h, v21.8h, v24.8h // p2 + q0
add v6.8h, v25.8h, v27.8h // q1 + q3
urshr v12.8h, v8.8h, #3 // out p0
add v8.8h, v8.8h, v2.8h
sub v6.8h, v6.8h, v4.8h // q1 + q3 - p2 - q0
add v0.8h, v22.8h, v25.8h // p1 + q1
add v2.8h, v26.8h, v27.8h // q2 + q3
urshr v13.8h, v8.8h, #3 // out q0
add v8.8h, v8.8h, v6.8h
sub v2.8h, v2.8h, v0.8h // q2 + q3 - p1 - q1
urshr v0.8h, v8.8h, #3 // out q1
add v8.8h, v8.8h, v2.8h
bit v21.16b, v10.16b, v14.16b
bit v22.16b, v11.16b, v14.16b
bit v23.16b, v12.16b, v14.16b
urshr v1.8h, v8.8h, #3 // out q2
bit v24.16b, v13.16b, v14.16b
bit v25.16b, v0.16b, v14.16b
bit v26.16b, v1.16b, v14.16b
.endif
2:
.if \wd == 16
mov x16, v15.d[0]
mov x17, v15.d[1]
adds x16, x16, x17
b.ne 1f // check if flat8out is needed
mov x16, v14.d[0]
mov x17, v14.d[1]
adds x16, x16, x17
b.eq 8f // if there was no flat8in, just write the inner 4 pixels
b 7f // if flat8in was used, write the inner 6 pixels
1:
add v2.8h, v17.8h, v17.8h // p6 + p6
add v4.8h, v17.8h, v18.8h // p6 + p5
add v6.8h, v17.8h, v19.8h // p6 + p4
add v8.8h, v17.8h, v20.8h // p6 + p3
add v12.8h, v2.8h, v4.8h
add v10.8h, v6.8h, v8.8h
add v6.8h, v17.8h, v21.8h // p6 + p2
add v12.8h, v12.8h, v10.8h
add v8.8h, v17.8h, v22.8h // p6 + p1
add v10.8h, v18.8h, v23.8h // p5 + p0
add v6.8h, v6.8h, v8.8h
add v8.8h, v19.8h, v24.8h // p4 + q0
add v12.8h, v12.8h, v6.8h
add v10.8h, v10.8h, v8.8h
add v6.8h, v20.8h, v25.8h // p3 + q1
add v12.8h, v12.8h, v10.8h
sub v6.8h, v6.8h, v2.8h
add v2.8h, v21.8h, v26.8h // p2 + q2
urshr v0.8h, v12.8h, #4 // out p5
add v12.8h, v12.8h, v6.8h // - (p6 + p6) + (p3 + q1)
sub v2.8h, v2.8h, v4.8h
add v4.8h, v22.8h, v27.8h // p1 + q3
add v6.8h, v17.8h, v19.8h // p6 + p4
urshr v1.8h, v12.8h, #4 // out p4
add v12.8h, v12.8h, v2.8h // - (p6 + p5) + (p2 + q2)
sub v4.8h, v4.8h, v6.8h
add v6.8h, v23.8h, v28.8h // p0 + q4
add v8.8h, v17.8h, v20.8h // p6 + p3
urshr v2.8h, v12.8h, #4 // out p3
add v12.8h, v12.8h, v4.8h // - (p6 + p4) + (p1 + q3)
sub v6.8h, v6.8h, v8.8h
add v8.8h, v24.8h, v29.8h // q0 + q5
add v4.8h, v17.8h, v21.8h // p6 + p2
urshr v3.8h, v12.8h, #4 // out p2
add v12.8h, v12.8h, v6.8h // - (p6 + p3) + (p0 + q4)
sub v8.8h, v8.8h, v4.8h
add v6.8h, v25.8h, v30.8h // q1 + q6
add v10.8h, v17.8h, v22.8h // p6 + p1
urshr v4.8h, v12.8h, #4 // out p1
add v12.8h, v12.8h, v8.8h // - (p6 + p2) + (q0 + q5)
sub v6.8h, v6.8h, v10.8h
add v8.8h, v26.8h, v30.8h // q2 + q6
bif v0.16b, v18.16b, v15.16b // out p5
add v10.8h, v18.8h, v23.8h // p5 + p0
urshr v5.8h, v12.8h, #4 // out p0
add v12.8h, v12.8h, v6.8h // - (p6 + p1) + (q1 + q6)
sub v8.8h, v8.8h, v10.8h
add v10.8h, v27.8h, v30.8h // q3 + q6
bif v1.16b, v19.16b, v15.16b // out p4
add v18.8h, v19.8h, v24.8h // p4 + q0
urshr v6.8h, v12.8h, #4 // out q0
add v12.8h, v12.8h, v8.8h // - (p5 + p0) + (q2 + q6)
sub v10.8h, v10.8h, v18.8h
add v8.8h, v28.8h, v30.8h // q4 + q6
bif v2.16b, v20.16b, v15.16b // out p3
add v18.8h, v20.8h, v25.8h // p3 + q1
urshr v7.8h, v12.8h, #4 // out q1
add v12.8h, v12.8h, v10.8h // - (p4 + q0) + (q3 + q6)
sub v18.8h, v8.8h, v18.8h
add v10.8h, v29.8h, v30.8h // q5 + q6
bif v3.16b, v21.16b, v15.16b // out p2
add v20.8h, v21.8h, v26.8h // p2 + q2
urshr v8.8h, v12.8h, #4 // out q2
add v12.8h, v12.8h, v18.8h // - (p3 + q1) + (q4 + q6)
sub v10.8h, v10.8h, v20.8h
add v18.8h, v30.8h, v30.8h // q6 + q6
bif v4.16b, v22.16b, v15.16b // out p1
add v20.8h, v22.8h, v27.8h // p1 + q3
urshr v9.8h, v12.8h, #4 // out q3
add v12.8h, v12.8h, v10.8h // - (p2 + q2) + (q5 + q6)
sub v18.8h, v18.8h, v20.8h
bif v5.16b, v23.16b, v15.16b // out p0
urshr v10.8h, v12.8h, #4 // out q4
add v12.8h, v12.8h, v18.8h // - (p1 + q3) + (q6 + q6)
urshr v11.8h, v12.8h, #4 // out q5
bif v6.16b, v24.16b, v15.16b // out q0
bif v7.16b, v25.16b, v15.16b // out q1
bif v8.16b, v26.16b, v15.16b // out q2
bif v9.16b, v27.16b, v15.16b // out q3
bif v10.16b, v28.16b, v15.16b // out q4
bif v11.16b, v29.16b, v15.16b // out q5
.endif
ret
.if \wd == 16
7:
// Return to a shorter epilogue, writing only the inner 6 pixels
br x13
.endif
.if \wd >= 8
8:
// Return to a shorter epilogue, writing only the inner 4 pixels
br x14
.endif
9:
// Return directly without writing back any pixels
br x15
endfunc
.endm
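// Roughly the wd == 4 path of the loop_filter macro above, in C (illustrative
// sketch, not the actual source):
//   int f  = iclip_diff(3 * (q0 - p0) + (hev ? iclip_diff(p1 - q1) : 0));
//   int f1 = imin(f + 4, (128 << bitdepth_min_8) - 1) >> 3;
//   int f2 = imin(f + 3, (128 << bitdepth_min_8) - 1) >> 3;
//   p0 = iclip_pixel(p0 + f2);  q0 = iclip_pixel(q0 - f1);
//   if (!hev) { p1 = iclip_pixel(p1 + ((f1 + 1) >> 1));
//               q1 = iclip_pixel(q1 - ((f1 + 1) >> 1)); }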
loop_filter 16
loop_filter 8
loop_filter 6
loop_filter 4
.macro lpf_8_wd16
adr x13, 7f
adr x14, 8f
bl lpf_8_wd16_neon
.endm
.macro lpf_8_wd8
adr x14, 8f
bl lpf_8_wd8_neon
.endm
.macro lpf_8_wd6
bl lpf_8_wd6_neon
.endm
.macro lpf_8_wd4
bl lpf_8_wd4_neon
.endm
function lpf_v_4_8_neon
mov x15, x30
sub x16, x0, x1, lsl #1
ld1 {v22.8h}, [x16], x1 // p1
ld1 {v24.8h}, [x0], x1 // q0
ld1 {v23.8h}, [x16], x1 // p0
ld1 {v25.8h}, [x0], x1 // q1
sub x0, x0, x1, lsl #1
lpf_8_wd4
sub x16, x0, x1, lsl #1
st1 {v22.8h}, [x16], x1 // p1
st1 {v24.8h}, [x0], x1 // q0
st1 {v23.8h}, [x16], x1 // p0
st1 {v25.8h}, [x0], x1 // q1
sub x0, x0, x1, lsl #1
br x15
endfunc
function lpf_h_4_8_neon
mov x15, x30
sub x16, x0, #4
add x0, x16, x1, lsl #2
ld1 {v22.d}[0], [x16], x1
ld1 {v22.d}[1], [x0], x1
ld1 {v23.d}[0], [x16], x1
ld1 {v23.d}[1], [x0], x1
ld1 {v24.d}[0], [x16], x1
ld1 {v24.d}[1], [x0], x1
ld1 {v25.d}[0], [x16], x1
ld1 {v25.d}[1], [x0], x1
add x0, x0, #4
transpose_4x8h v22, v23, v24, v25, v26, v27, v28, v29
lpf_8_wd4
sub x16, x0, x1, lsl #3
sub x16, x16, #4
transpose_4x8h v22, v23, v24, v25, v26, v27, v28, v29
add x0, x16, x1, lsl #2
st1 {v22.d}[0], [x16], x1
st1 {v22.d}[1], [x0], x1
st1 {v23.d}[0], [x16], x1
st1 {v23.d}[1], [x0], x1
st1 {v24.d}[0], [x16], x1
st1 {v24.d}[1], [x0], x1
st1 {v25.d}[0], [x16], x1
st1 {v25.d}[1], [x0], x1
add x0, x0, #4
br x15
endfunc
function lpf_v_6_8_neon
mov x15, x30
sub x16, x0, x1, lsl #1
sub x16, x16, x1
ld1 {v21.8h}, [x16], x1 // p2
ld1 {v24.8h}, [x0], x1 // q0
ld1 {v22.8h}, [x16], x1 // p1
ld1 {v25.8h}, [x0], x1 // q1
ld1 {v23.8h}, [x16], x1 // p0
ld1 {v26.8h}, [x0], x1 // q2
sub x0, x0, x1, lsl #1
sub x0, x0, x1
lpf_8_wd6
sub x16, x0, x1, lsl #1
st1 {v22.8h}, [x16], x1 // p1
st1 {v24.8h}, [x0], x1 // q0
st1 {v23.8h}, [x16], x1 // p0
st1 {v25.8h}, [x0], x1 // q1
sub x0, x0, x1, lsl #1
br x15
endfunc
function lpf_h_6_8_neon
mov x15, x30
sub x16, x0, #8
add x0, x16, x1, lsl #2
ld1 {v20.8h}, [x16], x1
ld1 {v24.8h}, [x0], x1
ld1 {v21.8h}, [x16], x1
ld1 {v25.8h}, [x0], x1
ld1 {v22.8h}, [x16], x1
ld1 {v26.8h}, [x0], x1
ld1 {v23.8h}, [x16], x1
ld1 {v27.8h}, [x0], x1
add x0, x0, #8
transpose_8x8h v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
lpf_8_wd6
sub x16, x0, x1, lsl #3
sub x16, x16, #4
transpose_4x8h v22, v23, v24, v25, v26, v27, v28, v29
add x0, x16, x1, lsl #2
st1 {v22.d}[0], [x16], x1
st1 {v22.d}[1], [x0], x1
st1 {v23.d}[0], [x16], x1
st1 {v23.d}[1], [x0], x1
st1 {v24.d}[0], [x16], x1
st1 {v24.d}[1], [x0], x1
st1 {v25.d}[0], [x16], x1
st1 {v25.d}[1], [x0], x1
add x0, x0, #4
br x15
endfunc
function lpf_v_8_8_neon
mov x15, x30
sub x16, x0, x1, lsl #2
ld1 {v20.8h}, [x16], x1 // p3
ld1 {v24.8h}, [x0], x1 // q0
ld1 {v21.8h}, [x16], x1 // p2
ld1 {v25.8h}, [x0], x1 // q1
ld1 {v22.8h}, [x16], x1 // p1
ld1 {v26.8h}, [x0], x1 // q2
ld1 {v23.8h}, [x16], x1 // p0
ld1 {v27.8h}, [x0], x1 // q3
sub x0, x0, x1, lsl #2
lpf_8_wd8
sub x16, x0, x1, lsl #1
sub x16, x16, x1
st1 {v21.8h}, [x16], x1 // p2
st1 {v24.8h}, [x0], x1 // q0
st1 {v22.8h}, [x16], x1 // p1
st1 {v25.8h}, [x0], x1 // q1
st1 {v23.8h}, [x16], x1 // p0
st1 {v26.8h}, [x0], x1 // q2
sub x0, x0, x1, lsl #1
sub x0, x0, x1
br x15
8:
sub x16, x0, x1, lsl #1
st1 {v22.8h}, [x16], x1 // p1
st1 {v24.8h}, [x0], x1 // q0
st1 {v23.8h}, [x16], x1 // p0
st1 {v25.8h}, [x0], x1 // q1
sub x0, x0, x1, lsl #1
br x15
endfunc
function lpf_h_8_8_neon
mov x15, x30
sub x16, x0, #8
add x0, x16, x1, lsl #2
ld1 {v20.8h}, [x16], x1
ld1 {v24.8h}, [x0], x1
ld1 {v21.8h}, [x16], x1
ld1 {v25.8h}, [x0], x1
ld1 {v22.8h}, [x16], x1
ld1 {v26.8h}, [x0], x1
ld1 {v23.8h}, [x16], x1
ld1 {v27.8h}, [x0], x1
add x0, x0, #8
transpose_8x8h v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
lpf_8_wd8
sub x16, x0, x1, lsl #3
sub x16, x16, #8
transpose_8x8h v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
add x0, x16, x1, lsl #2
st1 {v20.8h}, [x16], x1
st1 {v24.8h}, [x0], x1
st1 {v21.8h}, [x16], x1
st1 {v25.8h}, [x0], x1
st1 {v22.8h}, [x16], x1
st1 {v26.8h}, [x0], x1
st1 {v23.8h}, [x16], x1
st1 {v27.8h}, [x0], x1
add x0, x0, #8
br x15
8:
sub x16, x0, x1, lsl #3
sub x16, x16, #4
transpose_4x8h v22, v23, v24, v25, v26, v27, v28, v29
add x0, x16, x1, lsl #2
st1 {v22.d}[0], [x16], x1
st1 {v22.d}[1], [x0], x1
st1 {v23.d}[0], [x16], x1
st1 {v23.d}[1], [x0], x1
st1 {v24.d}[0], [x16], x1
st1 {v24.d}[1], [x0], x1
st1 {v25.d}[0], [x16], x1
st1 {v25.d}[1], [x0], x1
add x0, x0, #4
br x15
endfunc
function lpf_v_16_8_neon
mov x15, x30
sub x16, x0, x1, lsl #3
add x16, x16, x1
ld1 {v17.8h}, [x16], x1 // p6
ld1 {v24.8h}, [x0], x1 // q0
ld1 {v18.8h}, [x16], x1 // p5
ld1 {v25.8h}, [x0], x1 // q1
ld1 {v19.8h}, [x16], x1 // p4
ld1 {v26.8h}, [x0], x1 // q2
ld1 {v20.8h}, [x16], x1 // p3
ld1 {v27.8h}, [x0], x1 // q3
ld1 {v21.8h}, [x16], x1 // p2
ld1 {v28.8h}, [x0], x1 // q4
ld1 {v22.8h}, [x16], x1 // p1
ld1 {v29.8h}, [x0], x1 // q5
ld1 {v23.8h}, [x16], x1 // p0
ld1 {v30.8h}, [x0], x1 // q6
sub x0, x0, x1, lsl #3
add x0, x0, x1
lpf_8_wd16
sub x16, x0, x1, lsl #2
sub x16, x16, x1, lsl #1
st1 {v0.8h}, [x16], x1 // p5
st1 {v6.8h}, [x0], x1 // q0
st1 {v1.8h}, [x16], x1 // p4
st1 {v7.8h}, [x0], x1 // q1
st1 {v2.8h}, [x16], x1 // p3
st1 {v8.8h}, [x0], x1 // q2
st1 {v3.8h}, [x16], x1 // p2
st1 {v9.8h}, [x0], x1 // q3
st1 {v4.8h}, [x16], x1 // p1
st1 {v10.8h}, [x0], x1 // q4
st1 {v5.8h}, [x16], x1 // p0
st1 {v11.8h}, [x0], x1 // q5
sub x0, x0, x1, lsl #2
sub x0, x0, x1, lsl #1
br x15
7:
sub x16, x0, x1
sub x16, x16, x1, lsl #1
st1 {v21.8h}, [x16], x1 // p2
st1 {v24.8h}, [x0], x1 // q0
st1 {v22.8h}, [x16], x1 // p1
st1 {v25.8h}, [x0], x1 // q1
st1 {v23.8h}, [x16], x1 // p0
st1 {v26.8h}, [x0], x1 // q2
sub x0, x0, x1, lsl #1
sub x0, x0, x1
br x15
8:
sub x16, x0, x1, lsl #1
st1 {v22.8h}, [x16], x1 // p1
st1 {v24.8h}, [x0], x1 // q0
st1 {v23.8h}, [x16], x1 // p0
st1 {v25.8h}, [x0], x1 // q1
sub x0, x0, x1, lsl #1
br x15
endfunc
function lpf_h_16_8_neon
mov x15, x30
sub x16, x0, #16
ld1 {v16.8h}, [x16], x1
ld1 {v24.8h}, [x0], x1
ld1 {v17.8h}, [x16], x1
ld1 {v25.8h}, [x0], x1
ld1 {v18.8h}, [x16], x1
ld1 {v26.8h}, [x0], x1
ld1 {v19.8h}, [x16], x1
ld1 {v27.8h}, [x0], x1
ld1 {v20.8h}, [x16], x1
ld1 {v28.8h}, [x0], x1
ld1 {v21.8h}, [x16], x1
ld1 {v29.8h}, [x0], x1
ld1 {v22.8h}, [x16], x1
ld1 {v30.8h}, [x0], x1
ld1 {v23.8h}, [x16], x1
ld1 {v31.8h}, [x0], x1
transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v0, v1
transpose_8x8h v24, v25, v26, v27, v28, v29, v30, v31, v0, v1
lpf_8_wd16
sub x0, x0, x1, lsl #3
sub x16, x0, #16
transpose_8x8h v16, v17, v0, v1, v2, v3, v4, v5, v18, v19
transpose_8x8h v6, v7, v8, v9, v10, v11, v30, v31, v18, v19
st1 {v16.8h}, [x16], x1
st1 {v6.8h}, [x0], x1
st1 {v17.8h}, [x16], x1
st1 {v7.8h}, [x0], x1
st1 {v0.8h}, [x16], x1
st1 {v8.8h}, [x0], x1
st1 {v1.8h}, [x16], x1
st1 {v9.8h}, [x0], x1
st1 {v2.8h}, [x16], x1
st1 {v10.8h}, [x0], x1
st1 {v3.8h}, [x16], x1
st1 {v11.8h}, [x0], x1
st1 {v4.8h}, [x16], x1
st1 {v30.8h}, [x0], x1
st1 {v5.8h}, [x16], x1
st1 {v31.8h}, [x0], x1
br x15
7:
sub x16, x0, x1, lsl #3
sub x16, x16, #8
transpose_8x8h v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
add x0, x16, x1, lsl #2
st1 {v20.8h}, [x16], x1
st1 {v24.8h}, [x0], x1
st1 {v21.8h}, [x16], x1
st1 {v25.8h}, [x0], x1
st1 {v22.8h}, [x16], x1
st1 {v26.8h}, [x0], x1
st1 {v23.8h}, [x16], x1
st1 {v27.8h}, [x0], x1
add x0, x0, #8
br x15
8:
sub x16, x0, x1, lsl #3
sub x16, x16, #4
transpose_4x8h v22, v23, v24, v25, v26, v27, v28, v29
add x0, x16, x1, lsl #2
st1 {v22.d}[0], [x16], x1
st1 {v22.d}[1], [x0], x1
st1 {v23.d}[0], [x16], x1
st1 {v23.d}[1], [x0], x1
st1 {v24.d}[0], [x16], x1
st1 {v24.d}[1], [x0], x1
st1 {v25.d}[0], [x16], x1
st1 {v25.d}[1], [x0], x1
add x0, x0, #4
br x15
endfunc
// void dav1d_lpf_v_sb_y_16bpc_neon(pixel *dst, const ptrdiff_t stride,
// const uint32_t *const vmask,
// const uint8_t (*l)[4], ptrdiff_t b4_stride,
// const Av1FilterLUT *lut, const int w,
// const int bitdepth_max)
.macro lpf_func dir, type
function lpf_\dir\()_sb_\type\()_16bpc_neon, export=1
mov x11, x30
mov w8, w7 // bitdepth_max
clz w9, w8
mov w10, #24
sub w9, w10, w9 // bitdepth_min_8
stp d8, d9, [sp, #-0x40]!
stp d10, d11, [sp, #0x10]
stp d12, d13, [sp, #0x20]
stp d14, d15, [sp, #0x30]
ldp w6, w7, [x2] // vmask[0], vmask[1]
.ifc \type, y
ldr w2, [x2, #8] // vmask[2]
.endif
add x5, x5, #128 // Move to sharp part of lut
.ifc \type, y
orr w7, w7, w2 // vmask[1] |= vmask[2]
.endif
.ifc \dir, v
sub x4, x3, x4, lsl #2
.else
sub x3, x3, #4
lsl x4, x4, #2
.endif
orr w6, w6, w7 // vmask[0] |= vmask[1]
1:
tst w6, #0x0f
.ifc \dir, v
ld1 {v0.8b}, [x4], #8
ld1 {v1.8b}, [x3], #8
.else
ld2 {v0.s,v1.s}[0], [x3], x4
ld2 {v0.s,v1.s}[1], [x3], x4
.endif
b.eq 7f // if (!(vm & bits)) continue;
ld1r {v5.8b}, [x5] // sharp[0]
add x5, x5, #8
movi v2.2s, #0xff
dup v13.2s, w6 // vmask[0]
dup v31.8h, w9 // bitdepth_min_8
and v0.8b, v0.8b, v2.8b // Keep only lowest byte in each 32 bit word
and v1.8b, v1.8b, v2.8b
cmtst v3.8b, v1.8b, v2.8b // Check for nonzero values in l[0][0]
movi v4.8b, #1
ld1r {v6.8b}, [x5] // sharp[1]
sub x5, x5, #8
bif v1.8b, v0.8b, v3.8b // if (!l[0][0]) L = l[offset][0]
mul v1.2s, v1.2s, v4.2s // L
.ifc \type, y
dup v15.2s, w2 // vmask[2]
.endif
cmtst v2.2s, v1.2s, v2.2s // L != 0
dup v14.2s, w7 // vmask[1]
mov x16, v2.d[0]
cmp x16, #0
b.eq 7f // if (!L) continue;
neg v5.8b, v5.8b // -sharp[0]
movrel x16, word_12
ushr v12.8b, v1.8b, #4 // H
ld1 {v16.2s}, [x16]
sshl v3.8b, v1.8b, v5.8b // L >> sharp[0]
.ifc \type, y
cmtst v15.2s, v15.2s, v16.2s // if (vmask[2] & bits)
.endif
movi v7.8b, #2
umin v3.8b, v3.8b, v6.8b // imin(L >> sharp[0], sharp[1])
add v0.8b, v1.8b, v7.8b // L + 2
umax v11.8b, v3.8b, v4.8b // imax(imin(), 1) = limit = I
add v0.8b, v0.8b, v0.8b // 2*(L + 2)
cmtst v14.2s, v14.2s, v16.2s // if (vmask[1] & bits)
uxtl v12.8h, v12.8b
add v10.8b, v0.8b, v11.8b // 2*(L + 2) + limit = E
cmtst v13.2s, v13.2s, v16.2s // if (vmask[0] & bits)
uxtl v11.8h, v11.8b
uxtl v10.8h, v10.8b
and v13.8b, v13.8b, v2.8b // vmask[0] &= L != 0
sxtl v14.8h, v14.8b
sxtl v13.8h, v13.8b
.ifc \type, y
sxtl v15.8h, v15.8b
.endif
ushl v12.8h, v12.8h, v31.8h
ushl v11.8h, v11.8h, v31.8h
ushl v10.8h, v10.8h, v31.8h
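// At this point v11, v10 and v12 hold the per-block thresholds I, E and H
// (limit, edge limit, hev threshold), derived from L as in the C code:
//   I = imax(imin(L >> sharp[0], sharp[1]), 1), E = 2*(L + 2) + I, H = L >> 4,
// each widened to 16 bits and scaled up by bitdepth_min_8.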
.ifc \type, y
tst w2, #0x0f
b.eq 2f
// wd16
bl lpf_\dir\()_16_8_neon
b 8f
2:
.endif
tst w7, #0x0f
b.eq 3f
.ifc \type, y
// wd8
bl lpf_\dir\()_8_8_neon
.else
// wd6
bl lpf_\dir\()_6_8_neon
.endif
b 8f
3:
// wd4
bl lpf_\dir\()_4_8_neon
.ifc \dir, h
b 8f
7:
// For dir h, the functions above increment x0.
// If the whole function is skipped, increment it here instead.
add x0, x0, x1, lsl #3
.else
7:
.endif
8:
lsr w6, w6, #2 // vmask[0] >>= 2
lsr w7, w7, #2 // vmask[1] >>= 2
.ifc \type, y
lsr w2, w2, #2 // vmask[2] >>= 2
.endif
.ifc \dir, v
add x0, x0, #16
.else
// For dir h, x0 is returned incremented
.endif
cbnz w6, 1b
ldp d14, d15, [sp, #0x30]
ldp d12, d13, [sp, #0x20]
ldp d10, d11, [sp, #0x10]
ldp d8, d9, [sp], 0x40
br x11
endfunc
.endm
lpf_func v, y
lpf_func h, y
lpf_func v, uv
lpf_func h, uv
const word_12
.word 1, 2
endconst
......@@ -28,11 +28,11 @@
#include "src/arm/asm.S"
#include "util.S"
// void dav1d_wiener_filter_h_neon(int16_t *dst, const pixel (*left)[4],
// const pixel *src, ptrdiff_t stride,
// const int16_t fh[7], const intptr_t w,
// int h, enum LrEdgeFlags edges);
function wiener_filter_h_neon, export=1
// void dav1d_wiener_filter_h_8bpc_neon(int16_t *dst, const pixel (*left)[4],
// const pixel *src, ptrdiff_t stride,
// const int16_t fh[7], const intptr_t w,
// int h, enum LrEdgeFlags edges);
function wiener_filter_h_8bpc_neon, export=1
mov w8, w5
ld1 {v0.8h}, [x4]
mov w9, #(1 << 14) - (1 << 2)
......@@ -306,11 +306,11 @@ L(variable_shift_tbl):
.purgem filter
endfunc
// void dav1d_wiener_filter_v_neon(pixel *dst, ptrdiff_t stride,
// const int16_t *mid, int w, int h,
// const int16_t fv[7], enum LrEdgeFlags edges,
// ptrdiff_t mid_stride);
function wiener_filter_v_neon, export=1
// void dav1d_wiener_filter_v_8bpc_neon(pixel *dst, ptrdiff_t stride,
// const int16_t *mid, int w, int h,
// const int16_t fv[7], enum LrEdgeFlags edges,
// ptrdiff_t mid_stride);
function wiener_filter_v_8bpc_neon, export=1
mov w8, w4
ld1 {v0.8h}, [x5]
movi v1.8h, #128
......@@ -482,9 +482,9 @@ function wiener_filter_v_neon, export=1
.purgem filter
endfunc
// void dav1d_copy_narrow_neon(pixel *dst, ptrdiff_t stride,
// const pixel *src, int w, int h);
function copy_narrow_neon, export=1
// void dav1d_copy_narrow_8bpc_neon(pixel *dst, ptrdiff_t stride,
// const pixel *src, int w, int h);
function copy_narrow_8bpc_neon, export=1
adr x5, L(copy_narrow_tbl)
ldrh w6, [x5, w3, uxtw #1]
sub x5, x5, w6, uxth
......@@ -617,12 +617,14 @@ endfunc
#define SUM_STRIDE (384+16)
// void dav1d_sgr_box3_h_neon(int32_t *sumsq, int16_t *sum,
// const pixel (*left)[4],
// const pixel *src, const ptrdiff_t stride,
// const int w, const int h,
// const enum LrEdgeFlags edges);
function sgr_box3_h_neon, export=1
#include "looprestoration_tmpl.S"
// void dav1d_sgr_box3_h_8bpc_neon(int32_t *sumsq, int16_t *sum,
// const pixel (*left)[4],
// const pixel *src, const ptrdiff_t stride,
// const int w, const int h,
// const enum LrEdgeFlags edges);
function sgr_box3_h_8bpc_neon, export=1
add w5, w5, #2 // w += 2
// Set up pointers for reading/writing alternate rows
......@@ -844,11 +846,11 @@ L(box3_variable_shift_tbl):
umull2 v6.8h, v4.16b, v4.16b
add3 4
subs w5, w5, #4
st1 {v3.4h}, [x1], #8
st1 {v7.4h}, [x11], #8
st1 {v26.4s}, [x0], #16
st1 {v28.4s}, [x10], #16
subs w5, w5, #4
b.le 9f
ext v0.16b, v0.16b, v0.16b, #4
ext v4.16b, v4.16b, v4.16b, #4
......@@ -879,12 +881,12 @@ L(box3_variable_shift_tbl):
.purgem add3
endfunc
// void dav1d_sgr_box5_h_neon(int32_t *sumsq, int16_t *sum,
// const pixel (*left)[4],
// const pixel *src, const ptrdiff_t stride,
// const int w, const int h,
// const enum LrEdgeFlags edges);
function sgr_box5_h_neon, export=1
// void dav1d_sgr_box5_h_8bpc_neon(int32_t *sumsq, int16_t *sum,
// const pixel (*left)[4],
// const pixel *src, const ptrdiff_t stride,
// const int w, const int h,
// const enum LrEdgeFlags edges);
function sgr_box5_h_8bpc_neon, export=1
add w5, w5, #2 // w += 2
// Set up pointers for reading/writing alternate rows
......@@ -950,7 +952,7 @@ function sgr_box5_h_neon, export=1
b 2f
0:
// !LR_HAVE_LEFT, fill v1 with the leftmost byte
// and shift v0 to have 2x the first byte at the front.
// and shift v0 to have 3x the first byte at the front.
dup v1.16b, v0.b[0]
dup v5.16b, v4.b[0]
// Move x3 back to account for the last 3 bytes we loaded before,
......@@ -1114,11 +1116,11 @@ L(box5_variable_shift_tbl):
umull2 v6.8h, v4.16b, v4.16b
add5 4
subs w5, w5, #4
st1 {v3.4h}, [x1], #8
st1 {v7.4h}, [x11], #8
st1 {v26.4s}, [x0], #16
st1 {v28.4s}, [x10], #16
subs w5, w5, #4
b.le 9f
ext v0.16b, v0.16b, v0.16b, #4
ext v1.16b, v1.16b, v2.16b, #8
......@@ -1147,839 +1149,4 @@ L(box5_variable_shift_tbl):
.purgem add5
endfunc
// void dav1d_sgr_box3_v_neon(int32_t *sumsq, int16_t *sum,
// const int w, const int h,
// const enum LrEdgeFlags edges);
function sgr_box3_v_neon, export=1
add w10, w3, #2 // Number of output rows to move back
mov w11, w3 // Number of input rows to move back
add w2, w2, #2 // Actual summed width
mov x7, #(4*SUM_STRIDE) // sumsq stride
mov x8, #(2*SUM_STRIDE) // sum stride
sub x0, x0, #(4*SUM_STRIDE) // sumsq -= stride
sub x1, x1, #(2*SUM_STRIDE) // sum -= stride
tst w4, #4 // LR_HAVE_TOP
b.eq 0f
// If have top, read from row -2.
sub x5, x0, #(4*SUM_STRIDE)
sub x6, x1, #(2*SUM_STRIDE)
add w11, w11, #2
b 1f
0:
// !LR_HAVE_TOP
// If we don't have top, read from row 0 even if
// we start writing to row -1.
add x5, x0, #(4*SUM_STRIDE)
add x6, x1, #(2*SUM_STRIDE)
1:
tst w4, #8 // LR_HAVE_BOTTOM
b.eq 1f
// LR_HAVE_BOTTOM
add w3, w3, #2 // Sum all h+2 lines with the main loop
add w11, w11, #2
1:
mov w9, w3 // Backup of h for next loops
1:
// Start of horizontal loop; start one vertical filter slice.
// Start loading rows into v16-v21 and v24-v26 taking top
// padding into consideration.
tst w4, #4 // LR_HAVE_TOP
ld1 {v16.4s, v17.4s}, [x5], x7
ld1 {v24.8h}, [x6], x8
b.eq 2f
// LR_HAVE_TOP
ld1 {v18.4s, v19.4s}, [x5], x7
ld1 {v25.8h}, [x6], x8
ld1 {v20.4s, v21.4s}, [x5], x7
ld1 {v26.8h}, [x6], x8
b 3f
2: // !LR_HAVE_TOP
mov v18.16b, v16.16b
mov v19.16b, v17.16b
mov v25.16b, v24.16b
mov v20.16b, v16.16b
mov v21.16b, v17.16b
mov v26.16b, v24.16b
3:
subs w3, w3, #1
.macro add3
add v16.4s, v16.4s, v18.4s
add v17.4s, v17.4s, v19.4s
add v24.8h, v24.8h, v25.8h
add v16.4s, v16.4s, v20.4s
add v17.4s, v17.4s, v21.4s
add v24.8h, v24.8h, v26.8h
st1 {v16.4s, v17.4s}, [x0], x7
st1 {v24.8h}, [x1], x8
.endm
add3
mov v16.16b, v18.16b
mov v17.16b, v19.16b
mov v24.16b, v25.16b
mov v18.16b, v20.16b
mov v19.16b, v21.16b
mov v25.16b, v26.16b
b.le 4f
ld1 {v20.4s, v21.4s}, [x5], x7
ld1 {v26.8h}, [x6], x8
b 3b
4:
tst w4, #8 // LR_HAVE_BOTTOM
b.ne 5f
// !LR_HAVE_BOTTOM
// Produce two more rows, extending the already loaded rows.
add3
mov v16.16b, v18.16b
mov v17.16b, v19.16b
mov v24.16b, v25.16b
add3
5: // End of one vertical slice.
subs w2, w2, #8
b.le 0f
// Move pointers back up to the top and loop horizontally.
// Input pointers
msub x5, x7, x11, x5
msub x6, x8, x11, x6
// Output pointers
msub x0, x7, x10, x0
msub x1, x8, x10, x1
add x0, x0, #32
add x1, x1, #16
add x5, x5, #32
add x6, x6, #16
mov w3, w9
b 1b
0:
ret
.purgem add3
endfunc
// void dav1d_sgr_box5_v_neon(int32_t *sumsq, int16_t *sum,
// const int w, const int h,
// const enum LrEdgeFlags edges);
function sgr_box5_v_neon, export=1
add w10, w3, #2 // Number of output rows to move back
mov w11, w3 // Number of input rows to move back
add w2, w2, #8 // Actual summed width
mov x7, #(4*SUM_STRIDE) // sumsq stride
mov x8, #(2*SUM_STRIDE) // sum stride
sub x0, x0, #(4*SUM_STRIDE) // sumsq -= stride
sub x1, x1, #(2*SUM_STRIDE) // sum -= stride
tst w4, #4 // LR_HAVE_TOP
b.eq 0f
// If have top, read from row -2.
sub x5, x0, #(4*SUM_STRIDE)
sub x6, x1, #(2*SUM_STRIDE)
add w11, w11, #2
b 1f
0:
// !LR_HAVE_TOP
// If we don't have top, read from row 0 even if
// we start writing to row -1.
add x5, x0, #(4*SUM_STRIDE)
add x6, x1, #(2*SUM_STRIDE)
1:
tst w4, #8 // LR_HAVE_BOTTOM
b.eq 0f
// LR_HAVE_BOTTOM
add w3, w3, #2 // Handle h+2 lines with the main loop
add w11, w11, #2
b 1f
0:
// !LR_HAVE_BOTTOM
sub w3, w3, #1 // Handle h-1 lines with the main loop
1:
mov w9, w3 // Backup of h for next loops
1:
// Start of horizontal loop; start one vertical filter slice.
// Start loading rows into v16-v25 and v26-v30 taking top
// padding into consideration.
tst w4, #4 // LR_HAVE_TOP
ld1 {v16.4s, v17.4s}, [x5], x7
ld1 {v26.8h}, [x6], x8
b.eq 2f
// LR_HAVE_TOP
ld1 {v20.4s, v21.4s}, [x5], x7
ld1 {v28.8h}, [x6], x8
mov v18.16b, v16.16b
mov v19.16b, v17.16b
mov v27.16b, v26.16b
ld1 {v22.4s, v23.4s}, [x5], x7
ld1 {v29.8h}, [x6], x8
b 3f
2: // !LR_HAVE_TOP
mov v18.16b, v16.16b
mov v19.16b, v17.16b
mov v27.16b, v26.16b
mov v20.16b, v16.16b
mov v21.16b, v17.16b
mov v28.16b, v26.16b
mov v22.16b, v16.16b
mov v23.16b, v17.16b
mov v29.16b, v26.16b
3:
cbz w3, 4f
ld1 {v24.4s, v25.4s}, [x5], x7
ld1 {v30.8h}, [x6], x8
3:
// Start of vertical loop
subs w3, w3, #2
.macro add5
add v16.4s, v16.4s, v18.4s
add v17.4s, v17.4s, v19.4s
add v26.8h, v26.8h, v27.8h
add v0.4s, v20.4s, v22.4s
add v1.4s, v21.4s, v23.4s
add v2.8h, v28.8h, v29.8h
add v16.4s, v16.4s, v24.4s
add v17.4s, v17.4s, v25.4s
add v26.8h, v26.8h, v30.8h
add v16.4s, v16.4s, v0.4s
add v17.4s, v17.4s, v1.4s
add v26.8h, v26.8h, v2.8h
st1 {v16.4s, v17.4s}, [x0], x7
st1 {v26.8h}, [x1], x8
.endm
add5
.macro shift2
mov v16.16b, v20.16b
mov v17.16b, v21.16b
mov v26.16b, v28.16b
mov v18.16b, v22.16b
mov v19.16b, v23.16b
mov v27.16b, v29.16b
mov v20.16b, v24.16b
mov v21.16b, v25.16b
mov v28.16b, v30.16b
.endm
shift2
add x0, x0, x7
add x1, x1, x8
b.le 5f
ld1 {v22.4s, v23.4s}, [x5], x7
ld1 {v29.8h}, [x6], x8
ld1 {v24.4s, v25.4s}, [x5], x7
ld1 {v30.8h}, [x6], x8
b 3b
4:
// h == 1, !LR_HAVE_BOTTOM.
// Pad the last row with the only content row, and add.
mov v24.16b, v22.16b
mov v25.16b, v23.16b
mov v30.16b, v29.16b
add5
shift2
add x0, x0, x7
add x1, x1, x8
add5
b 6f
5:
tst w4, #8 // LR_HAVE_BOTTOM
b.ne 6f
// !LR_HAVE_BOTTOM
cbnz w3, 5f
// The intended three edge rows are left; output the one at h-2 and
// the past-edge one at h.
ld1 {v22.4s, v23.4s}, [x5], x7
ld1 {v29.8h}, [x6], x8
// Pad the past-edge row from the last content row.
mov v24.16b, v22.16b
mov v25.16b, v23.16b
mov v30.16b, v29.16b
add5
shift2
add x0, x0, x7
add x1, x1, x8
// The last two rows are already padded properly here.
add5
b 6f
5:
// w3 == -1, two rows left, output one.
// Pad the last two rows from the mid one.
mov v22.16b, v20.16b
mov v23.16b, v21.16b
mov v29.16b, v28.16b
mov v24.16b, v20.16b
mov v25.16b, v21.16b
mov v30.16b, v28.16b
add5
add x0, x0, x7
add x1, x1, x8
b 6f
6: // End of one vertical slice.
subs w2, w2, #8
b.le 0f
// Move pointers back up to the top and loop horizontally.
// Input pointers
msub x5, x7, x11, x5
msub x6, x8, x11, x6
// Output pointers
msub x0, x7, x10, x0
msub x1, x8, x10, x1
add x0, x0, #32
add x1, x1, #16
add x5, x5, #32
add x6, x6, #16
mov w3, w9
b 1b
0:
ret
.purgem add5
endfunc
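The vertical passes then add rows of those horizontal sums together: three rows for box3_v, five for box5_v, replicating edge rows when LR_HAVE_TOP/LR_HAVE_BOTTOM are absent. A scalar sketch of the box3 case, assuming separate input and output buffers (the NEON code works in place, keeping its read pointer ahead of its write pointer) and a generic stride in place of SUM_STRIDE; the name and signature are illustrative.

#include <stdint.h>

/* Scalar sketch (assumption): 3-row vertical accumulation without the
 * top/bottom edge padding. box5_v is the same idea over rows y-2..y+2. */
static void sgr_box3_v_sketch(int32_t *dst_sumsq, int16_t *dst_sum,
                              const int32_t *sumsq, const int16_t *sum,
                              int w, int h, int stride)
{
    for (int y = 0; y < h; y++)
        for (int x = 0; x < w; x++) {
            const int i = y * stride + x;
            dst_sumsq[i] = sumsq[i - stride] + sumsq[i] + sumsq[i + stride];
            dst_sum[i]   = (int16_t)(sum[i - stride] + sum[i] + sum[i + stride]);
        }
}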
// void dav1d_sgr_calc_ab1_neon(int32_t *a, int16_t *b,
// const int w, const int h, const int strength);
// void dav1d_sgr_calc_ab2_neon(int32_t *a, int16_t *b,
// const int w, const int h, const int strength);
function sgr_calc_ab1_neon, export=1
add x3, x3, #2 // h += 2
movi v31.4s, #9 // n
mov x5, #455
mov x8, #SUM_STRIDE
b sgr_calc_ab_neon
endfunc
function sgr_calc_ab2_neon, export=1
add x3, x3, #3 // h += 3
asr x3, x3, #1 // h /= 2
movi v31.4s, #25 // n
mov x5, #164
mov x8, #(2*SUM_STRIDE)
endfunc
function sgr_calc_ab_neon
movrel x12, X(sgr_x_by_x)
ld1 {v16.16b, v17.16b, v18.16b}, [x12]
movi v19.16b, #5
movi v20.8b, #55 // idx of last 5
movi v21.8b, #72 // idx of last 4
movi v22.8b, #101 // idx of last 3
movi v23.8b, #169 // idx of last 2
movi v24.8b, #254 // idx of last 1
add x2, x2, #2 // w += 2
add x7, x2, #7
bic x7, x7, #7 // aligned w
sub x7, x8, x7 // increment between rows
movi v29.8h, #1, lsl #8
dup v28.4s, w4
dup v30.4s, w5 // one_by_x
sub x0, x0, #(4*(SUM_STRIDE))
sub x1, x1, #(2*(SUM_STRIDE))
mov x6, x2 // backup of w
sub v16.16b, v16.16b, v19.16b
sub v17.16b, v17.16b, v19.16b
sub v18.16b, v18.16b, v19.16b
1:
subs x2, x2, #8
ld1 {v0.4s, v1.4s}, [x0] // a
ld1 {v2.8h}, [x1] // b
mul v0.4s, v0.4s, v31.4s // a * n
mul v1.4s, v1.4s, v31.4s // a * n
umull v3.4s, v2.4h, v2.4h // b * b
umull2 v4.4s, v2.8h, v2.8h // b * b
uqsub v0.4s, v0.4s, v3.4s // imax(a * n - b * b, 0)
uqsub v1.4s, v1.4s, v4.4s // imax(a * n - b * b, 0)
mul v0.4s, v0.4s, v28.4s // p * s
mul v1.4s, v1.4s, v28.4s // p * s
uqshrn v0.4h, v0.4s, #16
uqshrn2 v0.8h, v1.4s, #16
uqrshrn v0.8b, v0.8h, #4 // imin(z, 255)
cmhi v25.8b, v0.8b, v20.8b // = -1 if sgr_x_by_x[v0] < 5
cmhi v26.8b, v0.8b, v21.8b // = -1 if sgr_x_by_x[v0] < 4
tbl v1.8b, {v16.16b, v17.16b, v18.16b}, v0.8b
cmhi v27.8b, v0.8b, v22.8b // = -1 if sgr_x_by_x[v0] < 3
cmhi v5.8b, v0.8b, v23.8b // = -1 if sgr_x_by_x[v0] < 2
add v25.8b, v25.8b, v26.8b
cmhi v6.8b, v0.8b, v24.8b // = -1 if sgr_x_by_x[v0] < 1
add v27.8b, v27.8b, v5.8b
add v6.8b, v6.8b, v19.8b
add v25.8b, v25.8b, v27.8b
add v1.8b, v1.8b, v6.8b
add v1.8b, v1.8b, v25.8b
uxtl v1.8h, v1.8b // x
umull v3.4s, v1.4h, v2.4h // x * BB[i]
umull2 v4.4s, v1.8h, v2.8h // x * BB[i]
mul v3.4s, v3.4s, v30.4s // x * BB[i] * sgr_one_by_x
mul v4.4s, v4.4s, v30.4s // x * BB[i] * sgr_one_by_x
srshr v3.4s, v3.4s, #12 // AA[i]
srshr v4.4s, v4.4s, #12 // AA[i]
sub v2.8h, v29.8h, v1.8h // 256 - x
st1 {v3.4s, v4.4s}, [x0], #32
st1 {v2.8h}, [x1], #16
b.gt 1b
subs x3, x3, #1
b.le 0f
add x0, x0, x7, lsl #2
add x1, x1, x7, lsl #1
mov x2, x6
b 1b
0:
ret
endfunc
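Reading the inline comments together, each calc_ab pass converts the box sums into per-pixel coefficients: p = max(a*n - b*b, 0), z derived from p*s, a table lookup for x, then the a/b planes are rewritten as x*b*one_by_x (rounded by 12 bits) and 256 - x. The scalar sketch below approximates the vector code rather than reproducing dav1d's C reference: the signature is simplified, the lookup table is passed in as a parameter standing in for the shared sgr_x_by_x table loaded via movrel above, and the saturating two-step narrowing of the NEON code is modelled as a single rounded shift.

#include <stdint.h>

/* Scalar sketch (assumption). n is 9 for calc_ab1 and 25 for calc_ab2;
 * one_by_x is 455 resp. 164, matching the constants set up above. */
static void sgr_calc_ab_sketch(int32_t *a, int16_t *b, int w,
                               int strength, int n, int one_by_x,
                               const uint8_t x_by_x[256])
{
    for (int i = 0; i < w; i++) {
        const int aa = a[i], bb = b[i];
        const unsigned p = aa * n > bb * bb ? aa * n - bb * bb : 0; // imax(a*n - b*b, 0)
        unsigned z = (p * (unsigned)strength + (1 << 19)) >> 20;    // ~ uqshrn #16 + uqrshrn #4
        if (z > 255) z = 255;
        const int x = x_by_x[z];
        /* a and b swap roles here, as in the vector code. */
        a[i] = (x * bb * one_by_x + (1 << 11)) >> 12;               // srshr #12
        b[i] = (int16_t)(256 - x);
    }
}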
#define FILTER_OUT_STRIDE 384
// void dav1d_sgr_finish_filter1_neon(coef *tmp,
// const pixel *src, const ptrdiff_t stride,
// const int32_t *a, const int16_t *b,
// const int w, const int h);
function sgr_finish_filter1_neon, export=1
sub x7, x3, #(4*SUM_STRIDE)
add x8, x3, #(4*SUM_STRIDE)
sub x9, x4, #(2*SUM_STRIDE)
add x10, x4, #(2*SUM_STRIDE)
mov x11, #SUM_STRIDE
mov x12, #FILTER_OUT_STRIDE
add x13, x5, #7
bic x13, x13, #7 // Aligned width
sub x2, x2, x13
sub x12, x12, x13
sub x11, x11, x13
sub x11, x11, #4 // We read 4 extra elements from a
sub x14, x11, #4 // We read 8 extra elements from b
mov x13, x5
movi v6.8h, #3
movi v7.4s, #3
1:
ld1 {v0.8h, v1.8h}, [x9], #32
ld1 {v2.8h, v3.8h}, [x4], #32
ld1 {v4.8h, v5.8h}, [x10], #32
ld1 {v16.4s, v17.4s, v18.4s}, [x7], #48
ld1 {v19.4s, v20.4s, v21.4s}, [x3], #48
ld1 {v22.4s, v23.4s, v24.4s}, [x8], #48
2:
subs x5, x5, #8
ext v25.16b, v0.16b, v1.16b, #2 // -stride
ext v26.16b, v2.16b, v3.16b, #2 // 0
ext v27.16b, v4.16b, v5.16b, #2 // +stride
ext v28.16b, v0.16b, v1.16b, #4 // +1-stride
ext v29.16b, v2.16b, v3.16b, #4 // +1
ext v30.16b, v4.16b, v5.16b, #4 // +1+stride
add v2.8h, v2.8h, v25.8h // -1, -stride
add v26.8h, v26.8h, v27.8h // 0, +stride
add v0.8h, v0.8h, v28.8h // -1-stride, +1-stride
add v2.8h, v2.8h, v26.8h
add v4.8h, v4.8h, v30.8h // -1+stride, +1+stride
add v2.8h, v2.8h, v29.8h // +1
add v0.8h, v0.8h, v4.8h
ext v25.16b, v16.16b, v17.16b, #4 // -stride
ext v26.16b, v17.16b, v18.16b, #4
shl v2.8h, v2.8h, #2
ext v27.16b, v16.16b, v17.16b, #8 // +1-stride
ext v28.16b, v17.16b, v18.16b, #8
ext v29.16b, v19.16b, v20.16b, #4 // 0
ext v30.16b, v20.16b, v21.16b, #4
mla v2.8h, v0.8h, v6.8h // * 3 -> a
add v25.4s, v25.4s, v19.4s // -stride, -1
add v26.4s, v26.4s, v20.4s
add v16.4s, v16.4s, v27.4s // -1-stride, +1-stride
add v17.4s, v17.4s, v28.4s
ext v27.16b, v19.16b, v20.16b, #8 // +1
ext v28.16b, v20.16b, v21.16b, #8
add v16.4s, v16.4s, v22.4s // -1+stride
add v17.4s, v17.4s, v23.4s
add v29.4s, v29.4s, v27.4s // 0, +1
add v30.4s, v30.4s, v28.4s
add v25.4s, v25.4s, v29.4s
add v26.4s, v26.4s, v30.4s
ext v27.16b, v22.16b, v23.16b, #4 // +stride
ext v28.16b, v23.16b, v24.16b, #4
ext v29.16b, v22.16b, v23.16b, #8 // +1+stride
ext v30.16b, v23.16b, v24.16b, #8
ld1 {v19.8b}, [x1], #8 // src
add v25.4s, v25.4s, v27.4s // +stride
add v26.4s, v26.4s, v28.4s
add v16.4s, v16.4s, v29.4s // +1+stride
add v17.4s, v17.4s, v30.4s
shl v25.4s, v25.4s, #2
shl v26.4s, v26.4s, #2
mla v25.4s, v16.4s, v7.4s // * 3 -> b
mla v26.4s, v17.4s, v7.4s
uxtl v19.8h, v19.8b // src
mov v0.16b, v1.16b
umlal v25.4s, v2.4h, v19.4h // b + a * src
umlal2 v26.4s, v2.8h, v19.8h
mov v2.16b, v3.16b
rshrn v25.4h, v25.4s, #9
rshrn2 v25.8h, v26.4s, #9
mov v4.16b, v5.16b
st1 {v25.8h}, [x0], #16
b.le 3f
mov v16.16b, v18.16b
mov v19.16b, v21.16b
mov v22.16b, v24.16b
ld1 {v1.8h}, [x9], #16
ld1 {v3.8h}, [x4], #16
ld1 {v5.8h}, [x10], #16
ld1 {v17.4s, v18.4s}, [x7], #32
ld1 {v20.4s, v21.4s}, [x3], #32
ld1 {v23.4s, v24.4s}, [x8], #32
b 2b
3:
subs x6, x6, #1
b.le 0f
mov x5, x13
add x0, x0, x12, lsl #1
add x1, x1, x2
add x3, x3, x11, lsl #2
add x7, x7, x11, lsl #2
add x8, x8, x11, lsl #2
add x4, x4, x14, lsl #1
add x9, x9, x14, lsl #1
add x10, x10, x14, lsl #1
b 1b
0:
ret
endfunc
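finish_filter1 combines, for every pixel, a 3x3 neighbourhood of the a and b planes with the source pixel: the centre and the four edge-adjacent neighbours are weighted 4, the diagonals 3, and the result is b + a*src rounded by 9 bits (the shl #2 and the mla with the constant-3 vectors above). A scalar sketch with an assumed name, a simplified signature and generic strides in place of SUM_STRIDE/FILTER_OUT_STRIDE:

#include <stdint.h>
#include <stddef.h>

/* Scalar sketch (assumption): a[] holds the 32-bit plane and b[] the
 * 16-bit plane produced by calc_ab; border rows/columns are assumed
 * valid, as arranged by the padding in the surrounding code. */
static void sgr_finish_filter1_sketch(int16_t *tmp, const uint8_t *src,
                                      ptrdiff_t src_stride,
                                      const int32_t *a, const int16_t *b,
                                      int stride, int tmp_stride,
                                      int w, int h)
{
    for (int y = 0; y < h; y++) {
        for (int x = 0; x < w; x++) {
            const int wb = 4 * (b[x] + b[x - 1] + b[x + 1] +
                                b[x - stride] + b[x + stride]) +
                           3 * (b[x - 1 - stride] + b[x + 1 - stride] +
                                b[x - 1 + stride] + b[x + 1 + stride]);
            const int wa = 4 * (a[x] + a[x - 1] + a[x + 1] +
                                a[x - stride] + a[x + stride]) +
                           3 * (a[x - 1 - stride] + a[x + 1 - stride] +
                                a[x - 1 + stride] + a[x + 1 + stride]);
            tmp[x] = (int16_t)((wa + wb * src[x] + (1 << 8)) >> 9); // b + a * src, rshrn #9
        }
        tmp += tmp_stride;
        src += src_stride;
        a   += stride;
        b   += stride;
    }
}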
// void dav1d_sgr_finish_filter2_neon(coef *tmp,
// const pixel *src, const ptrdiff_t stride,
// const int32_t *a, const int16_t *b,
// const int w, const int h);
function sgr_finish_filter2_neon, export=1
add x7, x3, #(4*(SUM_STRIDE))
sub x3, x3, #(4*(SUM_STRIDE))
add x8, x4, #(2*(SUM_STRIDE))
sub x4, x4, #(2*(SUM_STRIDE))
mov x9, #(2*SUM_STRIDE)
mov x10, #FILTER_OUT_STRIDE
add x11, x5, #7
bic x11, x11, #7 // Aligned width
sub x2, x2, x11
sub x10, x10, x11
sub x9, x9, x11
sub x9, x9, #4 // We read 4 extra elements from a
sub x12, x9, #4 // We read 8 extra elements from b
mov x11, x5
movi v4.8h, #5
movi v5.4s, #5
movi v6.8h, #6
movi v7.4s, #6
1:
ld1 {v0.8h, v1.8h}, [x4], #32
ld1 {v2.8h, v3.8h}, [x8], #32
ld1 {v16.4s, v17.4s, v18.4s}, [x3], #48
ld1 {v19.4s, v20.4s, v21.4s}, [x7], #48
2:
subs x5, x5, #8
ext v24.16b, v0.16b, v1.16b, #4 // +1-stride
ext v25.16b, v2.16b, v3.16b, #4 // +1+stride
ext v22.16b, v0.16b, v1.16b, #2 // -stride
ext v23.16b, v2.16b, v3.16b, #2 // +stride
add v0.8h, v0.8h, v24.8h // -1-stride, +1-stride
add v25.8h, v2.8h, v25.8h // -1+stride, +1+stride
add v2.8h, v22.8h, v23.8h // -stride, +stride
add v0.8h, v0.8h, v25.8h
ext v22.16b, v16.16b, v17.16b, #4 // -stride
ext v23.16b, v17.16b, v18.16b, #4
ext v24.16b, v19.16b, v20.16b, #4 // +stride
ext v25.16b, v20.16b, v21.16b, #4
ext v26.16b, v16.16b, v17.16b, #8 // +1-stride
ext v27.16b, v17.16b, v18.16b, #8
ext v28.16b, v19.16b, v20.16b, #8 // +1+stride
ext v29.16b, v20.16b, v21.16b, #8
mul v0.8h, v0.8h, v4.8h // * 5
mla v0.8h, v2.8h, v6.8h // * 6
ld1 {v31.8b}, [x1], #8
add v16.4s, v16.4s, v26.4s // -1-stride, +1-stride
add v17.4s, v17.4s, v27.4s
add v19.4s, v19.4s, v28.4s // -1+stride, +1+stride
add v20.4s, v20.4s, v29.4s
add v16.4s, v16.4s, v19.4s
add v17.4s, v17.4s, v20.4s
add v22.4s, v22.4s, v24.4s // -stride, +stride
add v23.4s, v23.4s, v25.4s
// This is, surprisingly, faster than other variants where the
// mul+mla pairs are further apart, on Cortex A53.
mul v16.4s, v16.4s, v5.4s // * 5
mla v16.4s, v22.4s, v7.4s // * 6
mul v17.4s, v17.4s, v5.4s // * 5
mla v17.4s, v23.4s, v7.4s // * 6
uxtl v31.8h, v31.8b
umlal v16.4s, v0.4h, v31.4h // b + a * src
umlal2 v17.4s, v0.8h, v31.8h
mov v0.16b, v1.16b
rshrn v16.4h, v16.4s, #9
rshrn2 v16.8h, v17.4s, #9
mov v2.16b, v3.16b
st1 {v16.8h}, [x0], #16
b.le 3f
mov v16.16b, v18.16b
mov v19.16b, v21.16b
ld1 {v1.8h}, [x4], #16
ld1 {v3.8h}, [x8], #16
ld1 {v17.4s, v18.4s}, [x3], #32
ld1 {v20.4s, v21.4s}, [x7], #32
b 2b
3:
subs x6, x6, #1
b.le 0f
mov x5, x11
add x0, x0, x10, lsl #1
add x1, x1, x2
add x3, x3, x9, lsl #2
add x7, x7, x9, lsl #2
add x4, x4, x12, lsl #1
add x8, x8, x12, lsl #1
mov x13, x3
mov x14, x4
ld1 {v0.8h, v1.8h}, [x4], #32
ld1 {v16.4s, v17.4s, v18.4s}, [x3], #48
4:
subs x5, x5, #8
ext v23.16b, v0.16b, v1.16b, #4 // +1
ext v22.16b, v0.16b, v1.16b, #2 // 0
add v0.8h, v0.8h, v23.8h // -1, +1
ext v24.16b, v16.16b, v17.16b, #4 // 0
ext v25.16b, v17.16b, v18.16b, #4
ext v26.16b, v16.16b, v17.16b, #8 // +1
ext v27.16b, v17.16b, v18.16b, #8
mul v2.8h, v22.8h, v6.8h // * 6
mla v2.8h, v0.8h, v4.8h // * 5 -> a
ld1 {v31.8b}, [x1], #8
add v16.4s, v16.4s, v26.4s // -1, +1
add v17.4s, v17.4s, v27.4s
uxtl v31.8h, v31.8b
// This is, surprisingly, faster than other variants where the
// mul+mla pairs are further apart, on Cortex A53.
mul v24.4s, v24.4s, v7.4s // * 6
mla v24.4s, v16.4s, v5.4s // * 5 -> b
mul v25.4s, v25.4s, v7.4s // * 6
mla v25.4s, v17.4s, v5.4s // * 5 -> b
umlal v24.4s, v2.4h, v31.4h // b + a * src
umlal2 v25.4s, v2.8h, v31.8h
mov v0.16b, v1.16b
rshrn v24.4h, v24.4s, #8
rshrn2 v24.8h, v25.4s, #8
mov v16.16b, v18.16b
st1 {v24.8h}, [x0], #16
b.le 5f
ld1 {v1.8h}, [x4], #16
ld1 {v17.4s, v18.4s}, [x3], #32
b 4b
5:
subs x6, x6, #1
b.le 0f
mov x5, x11
add x0, x0, x10, lsl #1
add x1, x1, x2
mov x3, x13 // Rewind x3/x4 to where they started
mov x4, x14
b 1b
0:
ret
endfunc
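finish_filter2 does the same kind of weighting for the 5x5 filter, but since its box sums only exist on every second row, output rows alternate between two patterns: rows between two sum rows use the rows above and below with weights 6 (vertical) and 5 (diagonal) and a 9-bit rounding shift, while rows coinciding with a sum row use only that row with weights 6 (centre) and 5 (left/right) and an 8-bit shift. A sketch of the two per-pixel cases, with assumed names and arguments:

#include <stdint.h>

/* Scalar sketches (assumption): one output pixel for each of the two
 * alternating row types handled by the loops above. */
static int filter2_between_sum_rows(const int32_t *a_up, const int32_t *a_dn,
                                    const int16_t *b_up, const int16_t *b_dn,
                                    int x, int src_px)
{
    const int wb = 6 * (b_up[x] + b_dn[x]) +
                   5 * (b_up[x - 1] + b_up[x + 1] + b_dn[x - 1] + b_dn[x + 1]);
    const int wa = 6 * (a_up[x] + a_dn[x]) +
                   5 * (a_up[x - 1] + a_up[x + 1] + a_dn[x - 1] + a_dn[x + 1]);
    return (wa + wb * src_px + (1 << 8)) >> 9;   // b + a * src, rshrn #9
}

static int filter2_on_sum_row(const int32_t *a, const int16_t *b,
                              int x, int src_px)
{
    const int wb = 6 * b[x] + 5 * (b[x - 1] + b[x + 1]);
    const int wa = 6 * a[x] + 5 * (a[x - 1] + a[x + 1]);
    return (wa + wb * src_px + (1 << 7)) >> 8;   // b + a * src, rshrn #8
}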
// void dav1d_sgr_weighted1_neon(pixel *dst, const ptrdiff_t dst_stride,
// const pixel *src, const ptrdiff_t src_stride,
// const coef *t1, const int w, const int h,
// const int wt);
function sgr_weighted1_neon, export=1
dup v31.8h, w7
cmp x6, #2
add x9, x0, x1
add x10, x2, x3
add x11, x4, #2*FILTER_OUT_STRIDE
mov x7, #(4*FILTER_OUT_STRIDE)
lsl x1, x1, #1
lsl x3, x3, #1
add x8, x5, #7
bic x8, x8, #7 // Aligned width
sub x1, x1, x8
sub x3, x3, x8
sub x7, x7, x8, lsl #1
mov x8, x5
b.lt 2f
1:
ld1 {v0.8b}, [x2], #8
ld1 {v4.8b}, [x10], #8
ld1 {v1.8h}, [x4], #16
ld1 {v5.8h}, [x11], #16
subs x5, x5, #8
ushll v0.8h, v0.8b, #4 // u
ushll v4.8h, v4.8b, #4 // u
sub v1.8h, v1.8h, v0.8h // t1 - u
sub v5.8h, v5.8h, v4.8h // t1 - u
ushll v2.4s, v0.4h, #7 // u << 7
ushll2 v3.4s, v0.8h, #7 // u << 7
ushll v6.4s, v4.4h, #7 // u << 7
ushll2 v7.4s, v4.8h, #7 // u << 7
smlal v2.4s, v1.4h, v31.4h // v
smlal2 v3.4s, v1.8h, v31.8h // v
smlal v6.4s, v5.4h, v31.4h // v
smlal2 v7.4s, v5.8h, v31.8h // v
rshrn v2.4h, v2.4s, #11
rshrn2 v2.8h, v3.4s, #11
rshrn v6.4h, v6.4s, #11
rshrn2 v6.8h, v7.4s, #11
sqxtun v2.8b, v2.8h
sqxtun v6.8b, v6.8h
st1 {v2.8b}, [x0], #8
st1 {v6.8b}, [x9], #8
b.gt 1b
sub x6, x6, #2
cmp x6, #1
b.lt 0f
mov x5, x8
add x0, x0, x1
add x9, x9, x1
add x2, x2, x3
add x10, x10, x3
add x4, x4, x7
add x11, x11, x7
b.eq 2f
b 1b
2:
ld1 {v0.8b}, [x2], #8
ld1 {v1.8h}, [x4], #16
subs x5, x5, #8
ushll v0.8h, v0.8b, #4 // u
sub v1.8h, v1.8h, v0.8h // t1 - u
ushll v2.4s, v0.4h, #7 // u << 7
ushll2 v3.4s, v0.8h, #7 // u << 7
smlal v2.4s, v1.4h, v31.4h // v
smlal2 v3.4s, v1.8h, v31.8h // v
rshrn v2.4h, v2.4s, #11
rshrn2 v2.8h, v3.4s, #11
sqxtun v2.8b, v2.8h
st1 {v2.8b}, [x0], #8
b.gt 2b
0:
ret
endfunc
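sgr_weighted1 produces the final pixels for the one-filter case: the source is promoted to u = src << 4, the filtered value t1 is blended in with the fixed-point weight wt, and the result is rounded by 11 bits and clipped to 8 bits (the rshrn #11 / sqxtun pair). A scalar sketch of one row, with an assumed name and signature and the per-row stride handling of the two-row NEON loop left out:

#include <stdint.h>

static inline uint8_t clip_u8(int v) {
    return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;
}

/* Scalar sketch (assumption): one row of the final blend. */
static void sgr_weighted1_sketch(uint8_t *dst, const uint8_t *src,
                                 const int16_t *t1, int w, int wt)
{
    for (int x = 0; x < w; x++) {
        const int u = src[x] << 4;                   // u
        const int v = (u << 7) + wt * (t1[x] - u);   // (u << 7) + wt * (t1 - u)
        dst[x] = clip_u8((v + (1 << 10)) >> 11);     // rshrn #11 + sqxtun
    }
}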
// void dav1d_sgr_weighted2_neon(pixel *dst, const ptrdiff_t stride,
// const pixel *src, const ptrdiff_t src_stride,
// const coef *t1, const coef *t2,
// const int w, const int h,
// const int16_t wt[2]);
function sgr_weighted2_neon, export=1
ldr x8, [sp]
cmp x7, #2
add x10, x0, x1
add x11, x2, x3
add x12, x4, #2*FILTER_OUT_STRIDE
add x13, x5, #2*FILTER_OUT_STRIDE
ld2r {v30.8h, v31.8h}, [x8] // wt[0], wt[1]
mov x8, #4*FILTER_OUT_STRIDE
lsl x1, x1, #1
lsl x3, x3, #1
add x9, x6, #7
bic x9, x9, #7 // Aligned width
sub x1, x1, x9
sub x3, x3, x9
sub x8, x8, x9, lsl #1
mov x9, x6
b.lt 2f
1:
ld1 {v0.8b}, [x2], #8
ld1 {v16.8b}, [x11], #8
ld1 {v1.8h}, [x4], #16
ld1 {v17.8h}, [x12], #16
ld1 {v2.8h}, [x5], #16
ld1 {v18.8h}, [x13], #16
subs x6, x6, #8
ushll v0.8h, v0.8b, #4 // u
ushll v16.8h, v16.8b, #4 // u
sub v1.8h, v1.8h, v0.8h // t1 - u
sub v2.8h, v2.8h, v0.8h // t2 - u
sub v17.8h, v17.8h, v16.8h // t1 - u
sub v18.8h, v18.8h, v16.8h // t2 - u
ushll v3.4s, v0.4h, #7 // u << 7
ushll2 v4.4s, v0.8h, #7 // u << 7
ushll v19.4s, v16.4h, #7 // u << 7
ushll2 v20.4s, v16.8h, #7 // u << 7
smlal v3.4s, v1.4h, v30.4h // wt[0] * (t1 - u)
smlal v3.4s, v2.4h, v31.4h // wt[1] * (t2 - u)
smlal2 v4.4s, v1.8h, v30.8h // wt[0] * (t1 - u)
smlal2 v4.4s, v2.8h, v31.8h // wt[1] * (t2 - u)
smlal v19.4s, v17.4h, v30.4h // wt[0] * (t1 - u)
smlal v19.4s, v18.4h, v31.4h // wt[1] * (t2 - u)
smlal2 v20.4s, v17.8h, v30.8h // wt[0] * (t1 - u)
smlal2 v20.4s, v18.8h, v31.8h // wt[1] * (t2 - u)
rshrn v3.4h, v3.4s, #11
rshrn2 v3.8h, v4.4s, #11
rshrn v19.4h, v19.4s, #11
rshrn2 v19.8h, v20.4s, #11
sqxtun v3.8b, v3.8h
sqxtun v19.8b, v19.8h
st1 {v3.8b}, [x0], #8
st1 {v19.8b}, [x10], #8
b.gt 1b
subs x7, x7, #2
cmp x7, #1
b.lt 0f
mov x6, x9
add x0, x0, x1
add x10, x10, x1
add x2, x2, x3
add x11, x11, x3
add x4, x4, x8
add x12, x12, x8
add x5, x5, x8
add x13, x13, x8
b.eq 2f
b 1b
2:
ld1 {v0.8b}, [x2], #8
ld1 {v1.8h}, [x4], #16
ld1 {v2.8h}, [x5], #16
subs x6, x6, #8
ushll v0.8h, v0.8b, #4 // u
sub v1.8h, v1.8h, v0.8h // t1 - u
sub v2.8h, v2.8h, v0.8h // t2 - u
ushll v3.4s, v0.4h, #7 // u << 7
ushll2 v4.4s, v0.8h, #7 // u << 7
smlal v3.4s, v1.4h, v30.4h // wt[0] * (t1 - u)
smlal v3.4s, v2.4h, v31.4h // wt[1] * (t2 - u)
smlal2 v4.4s, v1.8h, v30.8h // wt[0] * (t1 - u)
smlal2 v4.4s, v2.8h, v31.8h // wt[1] * (t2 - u)
rshrn v3.4h, v3.4s, #11
rshrn2 v3.8h, v4.4s, #11
sqxtun v3.8b, v3.8h
st1 {v3.8b}, [x0], #8
b.gt 2b
0:
ret
endfunc
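sgr_weighted2 is the two-filter counterpart: both filtered planes t1 and t2 are blended into the upscaled source with their respective weights wt[0] and wt[1] before the same 11-bit rounding and clip. A matching scalar sketch of one row, again with an assumed name/signature and the row stepping omitted:

#include <stdint.h>

static inline uint8_t clip_u8(int v) {
    return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;
}

/* Scalar sketch (assumption): one row of the two-filter blend. */
static void sgr_weighted2_sketch(uint8_t *dst, const uint8_t *src,
                                 const int16_t *t1, const int16_t *t2,
                                 int w, const int16_t wt[2])
{
    for (int x = 0; x < w; x++) {
        const int u = src[x] << 4;                   // u
        const int v = (u << 7) + wt[0] * (t1[x] - u)
                               + wt[1] * (t2[x] - u);
        dst[x] = clip_u8((v + (1 << 10)) >> 11);     // rshrn #11 + sqxtun
    }
}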
sgr_funcs 8