......@@ -38,7 +38,7 @@ build-debian:
image: registry.videolan.org:5000/dav1d-debian-unstable:20190215130514
stage: build
tags:
- debian
- avx2
- amd64
script:
- meson build --buildtype release --werror
......@@ -173,7 +173,7 @@ build-win-arm64:
build-debian-aarch64:
stage: build
image: registry.videolan.org:5000/dav1d-debian-unstable-aarch64:20181122182457
image: registry.videolan.org/dav1d-debian-buster-aarch64:20200218203017
tags:
- aarch64
- debian
......@@ -184,7 +184,7 @@ build-debian-aarch64:
build-debian-aarch64-clang-5:
stage: build
image: registry.videolan.org:5000/dav1d-debian-unstable-aarch64:20181122182457
image: registry.videolan.org/dav1d-debian-buster-aarch64:20200218203017
tags:
- aarch64
- debian
......@@ -203,7 +203,7 @@ build-macos:
- cd build && meson test -v
build-debian-werror:
image: registry.videolan.org:5000/dav1d-debian-unstable-aarch64:20181122182457
image: registry.videolan.org/dav1d-debian-buster-aarch64:20200218203017
stage: build
tags:
- aarch64
......@@ -219,7 +219,7 @@ build-debian-armv7:
- armv7
- debian
script:
- meson build --buildtype debugoptimized --werror
- linux32 meson build --buildtype debugoptimized --werror
- ninja -C build
- cd build && meson test -v
......@@ -230,13 +230,13 @@ build-debian-armv7-clang-5:
- armv7
- debian
script:
- env CC=clang-5.0 CFLAGS='-integrated-as' meson build --buildtype release
- env CC=clang-5.0 CFLAGS='-integrated-as' linux32 meson build --buildtype release
- ninja -C build
- cd build && meson test -v
build-ubuntu-snap:
stage: build
image: registry.videolan.org:5000/dav1d-ubuntu-bionic:20190221154127
image: registry.videolan.org/dav1d-ubuntu-bionic:20200121182340
tags:
- debian
- amd64
......@@ -292,7 +292,7 @@ test-debian-unaligned-stack:
stage: test
needs: ["build-debian"]
tags:
- debian
- avx2
- amd64
cache:
key: testdata.git-20190215
......@@ -382,7 +382,7 @@ test-win64:
stage: test
needs: ["build-win64"]
tags:
- debian
- avx2
- amd64
cache:
key: testdata.git-20190215
......@@ -403,7 +403,7 @@ test-win64:
dependencies: []
test-debian-aarch64:
image: registry.videolan.org:5000/dav1d-debian-unstable-aarch64:20181122182457
image: registry.videolan.org/dav1d-debian-buster-aarch64:20200218203017
stage: test
needs: ["build-debian-aarch64"]
tags:
......@@ -464,7 +464,7 @@ test-debian-armv7-clang-5:
- test -d cache/dav1d-test-data.git && GIT_DIR=cache/dav1d-test-data.git git fetch --refmap=refs/heads/master:refs/heads/master origin master
- test -d cache/dav1d-test-data.git || git clone --bare https://code.videolan.org/videolan/dav1d-test-data.git cache/dav1d-test-data.git
- git clone cache/dav1d-test-data.git tests/dav1d-test-data
- env CC=clang-5.0 CFLAGS='-integrated-as' meson build --buildtype release
- env CC=clang-5.0 CFLAGS='-integrated-as' linux32 meson build --buildtype release
-Dtestdata_tests=true
-Dlogging=false
- ninja -C build
......
Changes for 0.6.0 'Gyrfalcon':
------------------------------
0.6.0 is a major release for dav1d:
- New ARM64 optimizations for the 10/12bit depth:
- mc_avg, mc_w_avg, mc_mask
- mc_put/mc_prep 8tap/bilin
- mc_warp_8x8
- mc_w_mask
- mc_blend
- wiener
- SGR
- loopfilter
- cdef
- New AVX-512 optimizations for prep_bilin, prep_8tap, cdef_filter, mc_avg/w_avg/mask
- New SSSE3 optimizations for film grain
- New AVX2 optimizations for msac_adapt16
- Fix rare mismatches against the reference decoder, notably because of clipping
- Improvements on ARM64 for msac, cdef and looprestoration optimizations
- Improvements on AVX2 optimizations for cdef_filter
- Improvements in the C version for itxfm, cdef_filter
Changes for 0.5.2 'Asiatic Cheetah':
------------------------------------
......@@ -32,7 +55,7 @@ and improving speed significantly:
- NEON optimizations for CDEF and warp on ARM32
- SSE2 optimizations for MSAC hi_tok decoding
- SSSE3 optimizations for deblocking loopfilters and warp_affine
- AVX-2 optimizations for film grain and ipred_z2
- AVX2 optimizations for film grain and ipred_z2
- SSE4 optimizations for warp_affine
- VSX optimizations for wiener
- Fix inverse transform overflows in x86 and NEON asm
......@@ -81,7 +104,7 @@ Changes for 0.2.2 (0.3.0-rc) 'Antelope':
-----------------------------
- Large improvement on MSAC decoding with SSE, bringing 4-6% speed increase
The impact is important on SSSE3, SSE4 and AVX-2 cpus
The impact is important on SSSE3, SSE4 and AVX2 cpus
- SSSE3 optimizations for all blocks size in itx
- SSSE3 optimizations for ipred_paeth and ipred_cfl (420, 422 and 444)
- Speed improvements on CDEF for SSE4 CPUs
......@@ -93,7 +116,7 @@ Changes for 0.2.1 'Antelope':
----------------------------
- SSSE3 optimization for cdef_dir
- AVX-2 improvements of the existing CDEF optimizations
- AVX2 improvements of the existing CDEF optimizations
- NEON improvements of the existing CDEF and wiener optimizations
- Clarification about the numbering/versioning scheme
......@@ -103,7 +126,7 @@ Changes for 0.2.0 'Antelope':
- ARM64 and ARM optimizations using NEON instructions
- SSSE3 optimizations for both 32 and 64bits
- More AVX-2 assembly, reaching almost completion
- More AVX2 assembly, reaching almost completion
- Fix installation of includes
- Rewrite inverse transforms to avoid overflows
- Snap packaging for Linux
......@@ -118,6 +141,6 @@ Initial release of dav1d, the fast and small AV1 decoder.
- Support for all features of the AV1 bitstream
- Support for all bitdepth, 8, 10 and 12bits
- Support for all chroma subsamplings 4:2:0, 4:2:2, 4:4:4 *and* grayscale
- Full acceleration for AVX-2 64bits processors, making it the fastest decoder
- Full acceleration for AVX2 64bits processors, making it the fastest decoder
- Partial acceleration for SSSE3 processors
- Partial acceleration for NEON processors
......@@ -73,7 +73,7 @@ The [VideoLAN Code of Conduct](https://wiki.videolan.org/CoC) applies to this pr
# Compile
1. Install [Meson](https://mesonbuild.com/) (0.47 or higher), [Ninja](https://ninja-build.org/), and, for x86\* targets, [nasm](https://nasm.us/) (2.13.02 or higher)
1. Install [Meson](https://mesonbuild.com/) (0.47 or higher), [Ninja](https://ninja-build.org/), and, for x86\* targets, [nasm](https://nasm.us/) (2.14 or higher)
2. Run `mkdir build && cd build` to create a build directory and enter it
3. Run `meson ..` to configure meson, add `--default-library=static` if static linking is desired
4. Run `ninja` to compile
......
......@@ -43,15 +43,18 @@
#endif
#if ARCH_X86_64
/* x86-64 needs 32-byte alignment for AVX2. */
/* x86-64 needs 32- and 64-byte alignment for AVX2 and AVX-512. */
#define ALIGN_64_VAL 64
#define ALIGN_32_VAL 32
#define ALIGN_16_VAL 16
#elif ARCH_X86_32 || ARCH_ARM || ARCH_AARCH64 || ARCH_PPC64LE
/* ARM doesn't benefit from anything more than 16-byte alignment. */
#define ALIGN_64_VAL 16
#define ALIGN_32_VAL 16
#define ALIGN_16_VAL 16
#else
/* No need for extra alignment on platforms without assembly. */
#define ALIGN_64_VAL 8
#define ALIGN_32_VAL 8
#define ALIGN_16_VAL 8
#endif
......@@ -76,9 +79,10 @@
* becomes:
* ALIGN_STK_$align(uint8_t, var, 1, [2][3][4])
*/
#define ALIGN_STK_64(type, var, sz1d, sznd) \
ALIGN(type var[sz1d]sznd, ALIGN_64_VAL)
#define ALIGN_STK_32(type, var, sz1d, sznd) \
ALIGN(type var[sz1d]sznd, ALIGN_32_VAL)
// as long as stack is itself 16-byte aligned, this works (win64, gcc)
#define ALIGN_STK_16(type, var, sz1d, sznd) \
ALIGN(type var[sz1d]sznd, ALIGN_16_VAL)
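For orientation, the new ALIGN_STK_64 variant lets x86-64 AVX-512 paths place 64-byte-aligned buffers on the stack while other targets silently fall back to a smaller alignment. A minimal, self-contained sketch of the intended usage, with the macro reduced to its GCC/Clang form and a made-up buffer for illustration:

    #include <stdint.h>

    /* simplified stand-ins for the real definitions in common/attributes.h */
    #define ALIGN(ll, a) ll __attribute__((aligned(a)))
    #define ALIGN_STK_64(type, var, sz1d, sznd) ALIGN(type var[sz1d]sznd, 64)

    void zero_tmp(void) {
        /* 64-byte aligned 2-D stack array, usable with aligned AVX-512 loads/stores */
        ALIGN_STK_64(int16_t, tmp, 8, [8]);
        for (int y = 0; y < 8; y++)
            for (int x = 0; x < 8; x++)
                tmp[y][x] = 0;
    }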
......@@ -92,6 +96,12 @@
#define NOINLINE __attribute__((noinline))
#endif /* !_MSC_VER */
#ifdef __clang__
#define NO_SANITIZE(x) __attribute__((no_sanitize(x)))
#else
#define NO_SANITIZE(x)
#endif
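The NO_SANITIZE wrapper added here only expands to an attribute under clang. A hedged sketch of how a function could use it to opt out of a single UBSan check (wrap_add is a made-up example, not a dav1d function):

    #ifdef __clang__
    #define NO_SANITIZE(x) __attribute__((no_sanitize(x)))
    #else
    #define NO_SANITIZE(x)
    #endif

    /* modular arithmetic is intended here, so ask clang's sanitizer to skip it */
    NO_SANITIZE("unsigned-integer-overflow")
    static unsigned wrap_add(const unsigned a, const unsigned b) {
        return a + b;
    }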
#if defined(NDEBUG) && (defined(__GNUC__) || defined(__clang__))
#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
#elif defined(NDEBUG) && defined(_MSC_VER)
......
......@@ -31,6 +31,8 @@
#include <stdint.h>
#include <string.h>
#include "common/attributes.h"
#if !defined(BITDEPTH)
typedef void pixel;
typedef void coef;
......@@ -47,12 +49,14 @@ typedef int16_t coef;
#define iclip_pixel iclip_u8
#define PIX_HEX_FMT "%02x"
#define bitfn(x) x##_8bpc
#define PXSTRIDE(x) x
#define BF(x, suffix) x##_8bpc_##suffix
#define PXSTRIDE(x) (x)
#define highbd_only(x)
#define HIGHBD_DECL_SUFFIX /* nothing */
#define HIGHBD_CALL_SUFFIX /* nothing */
#define HIGHBD_TAIL_SUFFIX /* nothing */
#define bitdepth_from_max(x) 8
#define BITDEPTH_MAX 0xff
#elif BITDEPTH == 16
typedef uint16_t pixel;
typedef int32_t coef;
......@@ -69,8 +73,13 @@ static inline void pixel_set(pixel *const dst, const int val, const int num) {
#define HIGHBD_CALL_SUFFIX , f->bitdepth_max
#define HIGHBD_TAIL_SUFFIX , bitdepth_max
#define bitdepth_from_max(bitdepth_max) (32 - clz(bitdepth_max))
#define BITDEPTH_MAX bitdepth_max
#define bitfn(x) x##_16bpc
#define PXSTRIDE(x) (x >> 1)
#define BF(x, suffix) x##_16bpc_##suffix
static inline ptrdiff_t PXSTRIDE(const ptrdiff_t x) {
assert(!(x & 1));
return x >> 1;
}
#define highbd_only(x) x
#else
#error invalid value for bitdepth
......
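The PXSTRIDE change above replaces a bare macro with an inline function so 16 bpc builds can assert that the byte stride is even before halving it into a pixel stride. A small self-contained sketch of the same idea (the function name below is illustrative, not the dav1d symbol):

    #include <assert.h>
    #include <stddef.h>

    /* For 16 bpc, strides are passed in bytes but each pixel is 2 bytes wide,
     * so an odd byte stride can never describe a whole row of pixels. */
    static inline ptrdiff_t px_stride_16bpc(const ptrdiff_t byte_stride) {
        assert(!(byte_stride & 1));
        return byte_stride >> 1; /* e.g. 3840 bytes -> 1920 pixels per row */
    }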
......@@ -318,8 +318,8 @@ typedef struct Dav1dFilmGrainData {
int scaling_shift;
int ar_coeff_lag;
int8_t ar_coeffs_y[24];
int8_t ar_coeffs_uv[2][25];
int ar_coeff_shift;
int8_t ar_coeffs_uv[2][25 + 3 /* padding for alignment purposes */];
uint64_t ar_coeff_shift;
int grain_scale_shift;
int uv_mult[2];
int uv_luma_mult[2];
......@@ -329,13 +329,13 @@ typedef struct Dav1dFilmGrainData {
} Dav1dFilmGrainData;
typedef struct Dav1dFrameHeader {
enum Dav1dFrameType frame_type; ///< type of the picture
int width[2 /* { coded_width, superresolution_upscaled_width } */], height;
int frame_offset; ///< frame number
struct {
int present, update;
Dav1dFilmGrainData data;
int present, update;
} film_grain; ///< film grain parameters
enum Dav1dFrameType frame_type; ///< type of the picture
int width[2 /* { coded_width, superresolution_upscaled_width } */], height;
int frame_offset; ///< frame number
int temporal_id, spatial_id; ///< spatial and temporal id of the frame for SVC
int show_existing_frame;
......
......@@ -23,14 +23,14 @@
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
project('dav1d', ['c'],
version: '0.5.2',
version: '0.6.0',
default_options: ['c_std=c99',
'warning_level=2',
'buildtype=release',
'b_ndebug=if-release'],
meson_version: '>= 0.47.0')
dav1d_soname_version = '3.1.0'
dav1d_soname_version = '4.0.0'
dav1d_api_version_array = dav1d_soname_version.split('.')
dav1d_api_version_major = dav1d_api_version_array[0]
dav1d_api_version_minor = dav1d_api_version_array[1]
......@@ -84,13 +84,15 @@ test_args = []
optional_arguments = []
# Define _POSIX_C_SOURCE to POSIX.1–2001 (IEEE Std 1003.1-2001)
test_args += '-D_POSIX_C_SOURCE=200112L'
add_project_arguments('-D_POSIX_C_SOURCE=200112L', language: 'c')
if host_machine.system() == 'darwin'
if host_machine.system() == 'linux'
test_args += '-D_GNU_SOURCE'
add_project_arguments('-D_GNU_SOURCE', language: 'c')
elif host_machine.system() == 'darwin'
test_args += '-D_DARWIN_C_SOURCE'
add_project_arguments('-D_DARWIN_C_SOURCE', language: 'c')
else
test_args += '-D_POSIX_C_SOURCE=200112L'
add_project_arguments('-D_POSIX_C_SOURCE=200112L', language: 'c')
endif
if host_machine.system() == 'windows'
......@@ -131,6 +133,15 @@ else
endif
endif
libdl_dependency = []
if host_machine.system() == 'linux'
libdl_dependency = cc.find_library('dl', required : false)
if cc.has_function('dlsym', prefix : '#include <dlfcn.h>', args : test_args, dependencies : libdl_dependency)
cdata.set('HAVE_DLSYM', 1)
endif
endif
# Header checks
stdatomic_dependency = []
......@@ -257,12 +268,12 @@ if host_machine.cpu_family().startswith('x86')
if get_option('stack_alignment') > 0
stack_alignment = get_option('stack_alignment')
elif host_machine.cpu_family() == 'x86_64'
if cc.has_argument('-mpreferred-stack-boundary=5')
stackalign_flag = ['-mpreferred-stack-boundary=5']
if cc.has_argument('-mpreferred-stack-boundary=6')
stackalign_flag = ['-mpreferred-stack-boundary=6']
stackrealign_flag = ['-mincoming-stack-boundary=4']
stack_alignment = 32
elif cc.has_argument('-mstack-alignment=32')
stackalign_flag = ['-mstack-alignment=32']
elif cc.has_argument('-mstack-alignment=64')
stackalign_flag = ['-mstack-alignment=64']
stackrealign_flag = ['-mstackrealign']
stack_alignment = 32
else
......@@ -364,8 +375,8 @@ if is_asm_enabled and host_machine.cpu_family().startswith('x86')
out = nasm_r.stdout().strip().split()
if out[1].to_lower() == 'version'
if out[2].version_compare('<2.13.02')
error('nasm 2.13.02 or later is required, found nasm @0@'.format(out[2]))
if out[2].version_compare('<2.14')
error('nasm 2.14 or later is required, found nasm @0@'.format(out[2]))
endif
else
error('unexpected nasm version string: @0@'.format(nasm_r.stdout()))
......@@ -390,7 +401,7 @@ if is_asm_enabled and host_machine.cpu_family().startswith('x86')
depfile: '@BASENAME@.obj.ndep',
arguments: [
'-f', nasm_format,
'-I', '@SOURCE_DIR@/src/',
'-I', '@0@/src/'.format(meson.current_source_dir()),
'-I', '@0@/'.format(meson.current_build_dir()),
'-MQ', '@OUTPUT@', '-MF', '@DEPFILE@',
'@EXTRA_ARGS@',
......
......@@ -148,20 +148,22 @@
.endif
.endm
// void dav1d_cdef_paddingX_neon(uint16_t *tmp, const pixel *src,
// ptrdiff_t src_stride, const pixel (*left)[2],
// /*const*/ pixel *const top[2], int h,
// enum CdefEdgeFlags edges);
// void dav1d_cdef_paddingX_8bpc_neon(uint16_t *tmp, const pixel *src,
// ptrdiff_t src_stride, const pixel (*left)[2],
// const pixel *const top, int h,
// enum CdefEdgeFlags edges);
// n1 = s0/d0
// w1 = d0/q0
// n2 = s4/d2
// w2 = d2/q1
.macro padding_func w, stride, n1, w1, n2, w2, align
function cdef_padding\w\()_neon, export=1
function cdef_padding\w\()_8bpc_neon, export=1
push {r4-r7,lr}
ldrd r4, r5, [sp, #20]
ldr r6, [sp, #28]
cmp r6, #0xf // fully edged
beq cdef_padding\w\()_edged_8bpc_neon
vmov.i16 q3, #0x8000
tst r6, #4 // CDEF_HAVE_TOP
bne 1f
......@@ -175,10 +177,9 @@ function cdef_padding\w\()_neon, export=1
b 3f
1:
// CDEF_HAVE_TOP
ldr r7, [r4]
ldr lr, [r4, #4]
add r7, r4, r2
sub r0, r0, #2*(2*\stride)
pad_top_bottom r7, lr, \w, \stride, \n1, \w1, \n2, \w2, \align, 0
pad_top_bottom r4, r7, \w, \stride, \n1, \w1, \n2, \w2, \align, 0
// Middle section
3:
......@@ -267,6 +268,65 @@ endfunc
padding_func 8, 16, d0, q0, d2, q1, 128
padding_func 4, 8, s0, d0, s4, d2, 64
// void cdef_paddingX_edged_8bpc_neon(uint16_t *tmp, const pixel *src,
// ptrdiff_t src_stride, const pixel (*left)[2],
// const pixel *const top, int h,
// enum CdefEdgeFlags edges);
.macro padding_func_edged w, stride, reg, align
function cdef_padding\w\()_edged_8bpc_neon
sub r0, r0, #(2*\stride)
ldrh r12, [r4, #-2]
vldr \reg, [r4]
add r7, r4, r2
strh r12, [r0, #-2]
ldrh r12, [r4, #\w]
vstr \reg, [r0]
strh r12, [r0, #\w]
ldrh r12, [r7, #-2]
vldr \reg, [r7]
strh r12, [r0, #\stride-2]
ldrh r12, [r7, #\w]
vstr \reg, [r0, #\stride]
strh r12, [r0, #\stride+\w]
add r0, r0, #2*\stride
0:
ldrh r12, [r3], #2
vldr \reg, [r1]
str r12, [r0, #-2]
ldrh r12, [r1, #\w]
add r1, r1, r2
subs r5, r5, #1
vstr \reg, [r0]
str r12, [r0, #\w]
add r0, r0, #\stride
bgt 0b
ldrh r12, [r1, #-2]
vldr \reg, [r1]
add r7, r1, r2
strh r12, [r0, #-2]
ldrh r12, [r1, #\w]
vstr \reg, [r0]
strh r12, [r0, #\w]
ldrh r12, [r7, #-2]
vldr \reg, [r7]
strh r12, [r0, #\stride-2]
ldrh r12, [r7, #\w]
vstr \reg, [r0, #\stride]
strh r12, [r0, #\stride+\w]
pop {r4-r7,pc}
endfunc
.endm
padding_func_edged 8, 16, d0, 64
padding_func_edged 4, 8, s0, 32
.macro dir_table w, stride
const directions\w
.byte -1 * \stride + 1, -2 * \stride + 2
......@@ -311,14 +371,13 @@ endconst
vld1.16 {\d22}, [r9] // p1
.endif
.endm
.macro handle_pixel s1, s2, threshold, thresh_vec, shift, tap
cmp \threshold, #0
.macro handle_pixel s1, s2, thresh_vec, shift, tap, min
.if \min
vmin.u16 q2, q2, \s1
vmax.s16 q3, q3, \s1
vmin.u16 q2, q2, \s2
vmax.s16 q3, q3, \s2
beq 3f
.endif
vabd.u16 q8, q0, \s1 // abs(diff)
vabd.u16 q11, q0, \s2 // abs(diff)
vshl.u16 q9, q8, \shift // abs(diff) >> shift
......@@ -326,7 +385,7 @@ endconst
vqsub.u16 q9, \thresh_vec, q9 // clip = imax(0, threshold - (abs(diff) >> shift))
vqsub.u16 q12, \thresh_vec, q12// clip = imax(0, threshold - (abs(diff) >> shift))
vsub.i16 q10, \s1, q0 // diff = p0 - px
vsub.u16 q13, \s2, q0 // diff = p1 - px
vsub.i16 q13, \s2, q0 // diff = p1 - px
vneg.s16 q8, q9 // -clip
vneg.s16 q11, q12 // -clip
vmin.s16 q10, q10, q9 // imin(diff, clip)
......@@ -336,36 +395,44 @@ endconst
vmax.s16 q13, q13, q11 // constrain() = imax(imin(diff, clip), -clip)
vmla.i16 q1, q10, q9 // sum += taps[k] * constrain()
vmla.i16 q1, q13, q9 // sum += taps[k] * constrain()
3:
.endm
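For context, the reworked handle_pixel macro above is the NEON version of CDEF's per-tap constrain() step; a rough, self-contained C model of what it computes for each neighbouring pixel (helper names are illustrative, not dav1d's internal API):

    #include <stdlib.h>

    static inline int imin(const int a, const int b) { return a < b ? a : b; }
    static inline int imax(const int a, const int b) { return a > b ? a : b; }

    /* Clamp the pixel difference into [-clip, clip], where clip shrinks as the
     * difference grows relative to the strength threshold; the caller then does
     * sum += taps[k] * constrain(...). */
    static inline int constrain(const int diff, const int threshold, const int shift) {
        const int clip = imax(0, threshold - (abs(diff) >> shift));
        return imax(imin(diff, clip), -clip);
    }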
// void dav1d_cdef_filterX_neon(pixel *dst, ptrdiff_t dst_stride,
// const uint16_t *tmp, int pri_strength,
// int sec_strength, int dir, int damping, int h);
.macro filter w
function cdef_filter\w\()_neon, export=1
push {r4-r9,lr}
vpush {q4-q7}
ldrd r4, r5, [sp, #92]
ldrd r6, r7, [sp, #100]
// void dav1d_cdef_filterX_8bpc_neon(pixel *dst, ptrdiff_t dst_stride,
// const uint16_t *tmp, int pri_strength,
// int sec_strength, int dir, int damping,
// int h, size_t edges);
.macro filter_func w, pri, sec, min, suffix
function cdef_filter\w\suffix\()_neon
cmp r8, #0xf
beq cdef_filter\w\suffix\()_edged_neon
.if \pri
movrel_local r8, pri_taps
and r9, r3, #1
add r8, r8, r9, lsl #1
.endif
movrel_local r9, directions\w
add r5, r9, r5, lsl #1
vmov.u16 d17, #15
vdup.16 d16, r6 // damping
.if \pri
vdup.16 q5, r3 // threshold
.endif
.if \sec
vdup.16 q7, r4 // threshold
.endif
vmov.16 d8[0], r3
vmov.16 d8[1], r4
vclz.i16 d8, d8 // clz(threshold)
vsub.i16 d8, d17, d8 // ulog2(threshold)
vqsub.u16 d8, d16, d8 // shift = imax(0, damping - ulog2(threshold))
vneg.s16 d8, d8 // -shift
.if \sec
vdup.16 q6, d8[1]
.endif
.if \pri
vdup.16 q4, d8[0]
.endif
1:
.if \w == 8
......@@ -377,47 +444,64 @@ function cdef_filter\w\()_neon, export=1
.endif
vmov.u16 q1, #0 // sum
.if \min
vmov.u16 q2, q0 // min
vmov.u16 q3, q0 // max
.endif
// Instead of loading sec_taps 2, 1 from memory, just set it
// to 2 initially and decrease for the second round.
// This is also used as loop counter.
mov lr, #2 // sec_taps[0]
2:
.if \pri
ldrsb r9, [r5] // off1
load_px d28, d29, d30, d31, \w
.endif
.if \sec
add r5, r5, #4 // +2*2
ldrsb r9, [r5] // off2
.endif
.if \pri
ldrb r12, [r8] // *pri_taps
handle_pixel q14, q15, r3, q5, q4, r12
handle_pixel q14, q15, q5, q4, r12, \min
.endif
.if \sec
load_px d28, d29, d30, d31, \w
add r5, r5, #8 // +2*4
ldrsb r9, [r5] // off3
handle_pixel q14, q15, r4, q7, q6, lr
handle_pixel q14, q15, q7, q6, lr, \min
load_px d28, d29, d30, d31, \w
handle_pixel q14, q15, r4, q7, q6, lr
handle_pixel q14, q15, q7, q6, lr, \min
sub r5, r5, #11 // x8 -= 2*(2+4); x8 += 1;
sub r5, r5, #11 // r5 -= 2*(2+4); r5 += 1;
.else
add r5, r5, #1 // r5 += 1
.endif
subs lr, lr, #1 // sec_tap-- (value)
.if \pri
add r8, r8, #1 // pri_taps++ (pointer)
.endif
bne 2b
vshr.s16 q14, q1, #15 // -(sum < 0)
vadd.i16 q1, q1, q14 // sum - (sum < 0)
vrshr.s16 q1, q1, #4 // (8 + sum - (sum < 0)) >> 4
vadd.i16 q0, q0, q1 // px + (8 + sum ...) >> 4
.if \min
vmin.s16 q0, q0, q3
vmax.s16 q0, q0, q2 // iclip(px + .., min, max)
.endif
vmovn.u16 d0, q0
.if \w == 8
add r2, r2, #2*16 // tmp += tmp_stride
......@@ -430,9 +514,11 @@ function cdef_filter\w\()_neon, export=1
vst1.32 {d0[1]}, [r0, :32], r1
.endif
// Reset pri_taps/sec_taps back to the original point
// Reset pri_taps and directions back to the original point
sub r5, r5, #2
.if \pri
sub r8, r8, #2
.endif
bgt 1b
vpop {q4-q7}
......@@ -440,9 +526,237 @@ function cdef_filter\w\()_neon, export=1
endfunc
.endm
.macro filter w
filter_func \w, pri=1, sec=0, min=0, suffix=_pri
filter_func \w, pri=0, sec=1, min=0, suffix=_sec
filter_func \w, pri=1, sec=1, min=1, suffix=_pri_sec
function cdef_filter\w\()_8bpc_neon, export=1
push {r4-r9,lr}
vpush {q4-q7}
ldrd r4, r5, [sp, #92]
ldrd r6, r7, [sp, #100]
ldr r8, [sp, #108]
cmp r3, #0 // pri_strength
bne 1f
b cdef_filter\w\()_sec_neon // only sec
1:
cmp r4, #0 // sec_strength
bne 1f
b cdef_filter\w\()_pri_neon // only pri
1:
b cdef_filter\w\()_pri_sec_neon // both pri and sec
endfunc
.endm
filter 8
filter 4
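In C terms, the 8bpc entry point generated by the filter macro above simply dispatches to one of three specialised variants depending on which strengths are non-zero; a self-contained model of that choice (the enum and function are illustrative only):

    /* Illustrative model of the dispatch in cdef_filterX_8bpc_neon: _sec when
     * pri_strength is 0, _pri when sec_strength is 0, _pri_sec otherwise. */
    enum cdef_variant { CDEF_PRI, CDEF_SEC, CDEF_PRI_SEC };

    static enum cdef_variant pick_variant(const int pri_strength, const int sec_strength) {
        if (!pri_strength) return CDEF_SEC;  /* only secondary taps contribute */
        if (!sec_strength) return CDEF_PRI;  /* only primary taps contribute */
        return CDEF_PRI_SEC;                 /* both; result needs min/max clipping */
    }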
.macro load_px_8 d11, d12, d21, d22, w
.if \w == 8
add r6, r2, r9 // x + off
sub r9, r2, r9 // x - off
vld1.8 {\d11}, [r6] // p0
add r6, r6, #16 // += stride
vld1.8 {\d21}, [r9] // p1
add r9, r9, #16 // += stride
vld1.8 {\d12}, [r6] // p0
vld1.8 {\d22}, [r9] // p1
.else
add r6, r2, r9 // x + off
sub r9, r2, r9 // x - off
vld1.32 {\d11[0]}, [r6] // p0
add r6, r6, #8 // += stride
vld1.32 {\d21[0]}, [r9] // p1
add r9, r9, #8 // += stride
vld1.32 {\d11[1]}, [r6] // p0
add r6, r6, #8 // += stride
vld1.32 {\d21[1]}, [r9] // p1
add r9, r9, #8 // += stride
vld1.32 {\d12[0]}, [r6] // p0
add r6, r6, #8 // += stride
vld1.32 {\d22[0]}, [r9] // p1
add r9, r9, #8 // += stride
vld1.32 {\d12[1]}, [r6] // p0
vld1.32 {\d22[1]}, [r9] // p1
.endif
.endm
.macro handle_pixel_8 s1, s2, thresh_vec, shift, tap, min
.if \min
vmin.u8 q3, q3, \s1
vmax.u8 q4, q4, \s1
vmin.u8 q3, q3, \s2
vmax.u8 q4, q4, \s2
.endif
vabd.u8 q8, q0, \s1 // abs(diff)
vabd.u8 q11, q0, \s2 // abs(diff)
vshl.u8 q9, q8, \shift // abs(diff) >> shift
vshl.u8 q12, q11, \shift // abs(diff) >> shift
vqsub.u8 q9, \thresh_vec, q9 // clip = imax(0, threshold - (abs(diff) >> shift))
vqsub.u8 q12, \thresh_vec, q12// clip = imax(0, threshold - (abs(diff) >> shift))
vcgt.u8 q10, q0, \s1 // px > p0
vcgt.u8 q13, q0, \s2 // px > p1
vmin.u8 q9, q9, q8 // imin(abs(diff), clip)
vmin.u8 q12, q12, q11 // imin(abs(diff), clip)
vneg.s8 q8, q9 // -imin()
vneg.s8 q11, q12 // -imin()
vbsl q10, q8, q9 // constrain() = imax(imin(diff, clip), -clip)
vdup.8 d18, \tap // taps[k]
vbsl q13, q11, q12 // constrain() = imax(imin(diff, clip), -clip)
vmlal.s8 q1, d20, d18 // sum += taps[k] * constrain()
vmlal.s8 q1, d26, d18 // sum += taps[k] * constrain()
vmlal.s8 q2, d21, d18 // sum += taps[k] * constrain()
vmlal.s8 q2, d27, d18 // sum += taps[k] * constrain()
.endm
// void cdef_filterX_edged_neon(pixel *dst, ptrdiff_t dst_stride,
// const uint16_t *tmp, int pri_strength,
// int sec_strength, int dir, int damping,
// int h, size_t edges);
.macro filter_func_8 w, pri, sec, min, suffix
function cdef_filter\w\suffix\()_edged_neon
.if \pri
movrel_local r8, pri_taps
and r9, r3, #1
add r8, r8, r9, lsl #1
.endif
movrel_local r9, directions\w
add r5, r9, r5, lsl #1
vmov.u8 d17, #7
vdup.8 d16, r6 // damping
vmov.8 d8[0], r3
vmov.8 d8[1], r4
vclz.i8 d8, d8 // clz(threshold)
vsub.i8 d8, d17, d8 // ulog2(threshold)
vqsub.u8 d8, d16, d8 // shift = imax(0, damping - ulog2(threshold))
vneg.s8 d8, d8 // -shift
.if \sec
vdup.8 q6, d8[1]
.endif
.if \pri
vdup.8 q5, d8[0]
.endif
1:
.if \w == 8
add r12, r2, #16
vld1.8 {d0}, [r2, :64] // px
vld1.8 {d1}, [r12, :64] // px
.else
add r12, r2, #8
vld1.32 {d0[0]}, [r2, :32] // px
add r9, r2, #2*8
vld1.32 {d0[1]}, [r12, :32] // px
add r12, r12, #2*8
vld1.32 {d1[0]}, [r9, :32] // px
vld1.32 {d1[1]}, [r12, :32] // px
.endif
vmov.u8 q1, #0 // sum
vmov.u8 q2, #0 // sum
.if \min
vmov.u16 q3, q0 // min
vmov.u16 q4, q0 // max
.endif
// Instead of loading sec_taps 2, 1 from memory, just set it
// to 2 initially and decrease for the second round.
// This is also used as loop counter.
mov lr, #2 // sec_taps[0]
2:
.if \pri
ldrsb r9, [r5] // off1
load_px_8 d28, d29, d30, d31, \w
.endif
.if \sec
add r5, r5, #4 // +2*2
ldrsb r9, [r5] // off2
.endif
.if \pri
ldrb r12, [r8] // *pri_taps
vdup.8 q7, r3 // threshold
handle_pixel_8 q14, q15, q7, q5, r12, \min
.endif
.if \sec
load_px_8 d28, d29, d30, d31, \w
add r5, r5, #8 // +2*4
ldrsb r9, [r5] // off3
vdup.8 q7, r4 // threshold
handle_pixel_8 q14, q15, q7, q6, lr, \min
load_px_8 d28, d29, d30, d31, \w
handle_pixel_8 q14, q15, q7, q6, lr, \min
sub r5, r5, #11 // r5 -= 2*(2+4); r5 += 1;
.else
add r5, r5, #1 // r5 += 1
.endif
subs lr, lr, #1 // sec_tap-- (value)
.if \pri
add r8, r8, #1 // pri_taps++ (pointer)
.endif
bne 2b
vshr.s16 q14, q1, #15 // -(sum < 0)
vshr.s16 q15, q2, #15 // -(sum < 0)
vadd.i16 q1, q1, q14 // sum - (sum < 0)
vadd.i16 q2, q2, q15 // sum - (sum < 0)
vrshr.s16 q1, q1, #4 // (8 + sum - (sum < 0)) >> 4
vrshr.s16 q2, q2, #4 // (8 + sum - (sum < 0)) >> 4
vaddw.u8 q1, q1, d0 // px + (8 + sum ...) >> 4
vaddw.u8 q2, q2, d1 // px + (8 + sum ...) >> 4
vqmovun.s16 d0, q1
vqmovun.s16 d1, q2
.if \min
vmin.u8 q0, q0, q4
vmax.u8 q0, q0, q3 // iclip(px + .., min, max)
.endif
.if \w == 8
vst1.8 {d0}, [r0, :64], r1
add r2, r2, #2*16 // tmp += 2*tmp_stride
subs r7, r7, #2 // h -= 2
vst1.8 {d1}, [r0, :64], r1
.else
vst1.32 {d0[0]}, [r0, :32], r1
add r2, r2, #4*8 // tmp += 4*tmp_stride
vst1.32 {d0[1]}, [r0, :32], r1
subs r7, r7, #4 // h -= 4
vst1.32 {d1[0]}, [r0, :32], r1
vst1.32 {d1[1]}, [r0, :32], r1
.endif
// Reset pri_taps and directions back to the original point
sub r5, r5, #2
.if \pri
sub r8, r8, #2
.endif
bgt 1b
vpop {q4-q7}
pop {r4-r9,pc}
endfunc
.endm
.macro filter_8 w
filter_func_8 \w, pri=1, sec=0, min=0, suffix=_pri
filter_func_8 \w, pri=0, sec=1, min=0, suffix=_sec
filter_func_8 \w, pri=1, sec=1, min=1, suffix=_pri_sec
.endm
filter_8 8
filter_8 4
const div_table, align=4
.short 840, 420, 280, 210, 168, 140, 120, 105
endconst
......@@ -451,9 +765,9 @@ const alt_fact, align=4
.short 420, 210, 140, 105, 105, 105, 105, 105, 140, 210, 420, 0
endconst
// int dav1d_cdef_find_dir_neon(const pixel *img, const ptrdiff_t stride,
// unsigned *const var)
function cdef_find_dir_neon, export=1
// int dav1d_cdef_find_dir_8bpc_neon(const pixel *img, const ptrdiff_t stride,
// unsigned *const var)
function cdef_find_dir_8bpc_neon, export=1
push {lr}
vpush {q4-q7}
sub sp, sp, #32 // cost
......
......@@ -143,8 +143,8 @@ function lpf_8_wd\wd\()_neon
vaddw.s8 q1, q1, d4
vmov.i8 d7, #3
vqmovn.s16 d2, q1 // f
vqadd.s8 d4, d6, d2 // imin(f + 4, 128)
vqadd.s8 d5, d7, d2 // imin(f + 3, 128)
vqadd.s8 d4, d6, d2 // imin(f + 4, 127)
vqadd.s8 d5, d7, d2 // imin(f + 3, 127)
vshr.s8 d4, d4, #3 // f1
vshr.s8 d5, d5, #3 // f2
vmovl.u8 q1, d23 // p0
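The comment fix above (128 becomes 127) reflects that vqadd.s8 saturates signed bytes at +127; a sketch of the equivalent C arithmetic (not dav1d source):

    #include <stdint.h>

    /* signed 8-bit saturating add, as performed by vqadd.s8 */
    static inline int8_t qadd_s8(const int a, const int b) {
        const int v = a + b;
        return (int8_t)(v > 127 ? 127 : v < -128 ? -128 : v);
    }

    /* the two loop-filter taps: effectively imin(f + 4, 127) >> 3 and
     * imin(f + 3, 127) >> 3 for the values of f that can occur here */
    static inline void lpf_taps(const int8_t f, int8_t *const f1, int8_t *const f2) {
        *f1 = qadd_s8(f, 4) >> 3;
        *f2 = qadd_s8(f, 3) >> 3;
    }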
......@@ -734,13 +734,13 @@ function lpf_h_16_8_neon
bx r12
endfunc
// void dav1d_lpf_v_sb_y_neon(pixel *dst, const ptrdiff_t stride,
// const uint32_t *const vmask,
// const uint8_t (*l)[4], ptrdiff_t b4_stride,
// const Av1FilterLUT *lut, const int w)
// void dav1d_lpf_v_sb_y_8bpc_neon(pixel *dst, const ptrdiff_t stride,
// const uint32_t *const vmask,
// const uint8_t (*l)[4], ptrdiff_t b4_stride,
// const Av1FilterLUT *lut, const int w)
.macro lpf_func dir, type
function lpf_\dir\()_sb_\type\()_neon, export=1
function lpf_\dir\()_sb_\type\()_8bpc_neon, export=1
push {r4-r11,lr}
vpush {q4-q7}
ldrd r4, r5, [sp, #100]
......
......@@ -28,11 +28,11 @@
#include "src/arm/asm.S"
#include "util.S"
// void dav1d_wiener_filter_h_neon(int16_t *dst, const pixel (*left)[4],
// const pixel *src, ptrdiff_t stride,
// const int16_t fh[7], const intptr_t w,
// int h, enum LrEdgeFlags edges);
function wiener_filter_h_neon, export=1
// void dav1d_wiener_filter_h_8bpc_neon(int16_t *dst, const pixel (*left)[4],
// const pixel *src, ptrdiff_t stride,
// const int16_t fh[7], const intptr_t w,
// int h, enum LrEdgeFlags edges);
function wiener_filter_h_8bpc_neon, export=1
push {r4-r11,lr}
vpush {q4}
ldrd r4, r5, [sp, #52]
......@@ -367,11 +367,11 @@ L(variable_shift_tbl):
.purgem filter_4
endfunc
// void dav1d_wiener_filter_v_neon(pixel *dst, ptrdiff_t stride,
// const int16_t *mid, int w, int h,
// const int16_t fv[7], enum LrEdgeFlags edges,
// ptrdiff_t mid_stride);
function wiener_filter_v_neon, export=1
// void dav1d_wiener_filter_v_8bpc_neon(pixel *dst, ptrdiff_t stride,
// const int16_t *mid, int w, int h,
// const int16_t fv[7], enum LrEdgeFlags edges,
// ptrdiff_t mid_stride);
function wiener_filter_v_8bpc_neon, export=1
push {r4-r7,lr}
ldrd r4, r5, [sp, #20]
ldrd r6, r7, [sp, #28]
......@@ -548,9 +548,9 @@ function wiener_filter_v_neon, export=1
.purgem filter
endfunc
// void dav1d_copy_narrow_neon(pixel *dst, ptrdiff_t stride,
// const pixel *src, int w, int h);
function copy_narrow_neon, export=1
// void dav1d_copy_narrow_8bpc_neon(pixel *dst, ptrdiff_t stride,
// const pixel *src, int w, int h);
function copy_narrow_8bpc_neon, export=1
push {r4,lr}
ldr r4, [sp, #8]
adr r12, L(copy_narrow_tbl)
......@@ -687,12 +687,12 @@ endfunc
#define SUM_STRIDE (384+16)
// void dav1d_sgr_box3_h_neon(int32_t *sumsq, int16_t *sum,
// const pixel (*left)[4],
// const pixel *src, const ptrdiff_t stride,
// const int w, const int h,
// const enum LrEdgeFlags edges);
function sgr_box3_h_neon, export=1
// void dav1d_sgr_box3_h_8bpc_neon(int32_t *sumsq, int16_t *sum,
// const pixel (*left)[4],
// const pixel *src, const ptrdiff_t stride,
// const int w, const int h,
// const enum LrEdgeFlags edges);
function sgr_box3_h_8bpc_neon, export=1
push {r4-r11,lr}
vpush {q4-q7}
ldrd r4, r5, [sp, #100]
......@@ -925,11 +925,11 @@ L(box3_variable_shift_tbl):
vmull.u8 q6, d9, d9
add3 4
subs r5, r5, #4
vst1.16 {d6}, [r1, :64]!
vst1.16 {d14}, [r11, :64]!
vst1.32 {q12}, [r0, :128]!
vst1.32 {q8}, [r10, :128]!
subs r5, r5, #4
ble 9f
vext.8 q0, q0, q0, #4
vext.8 q1, q1, q2, #8
......@@ -961,12 +961,12 @@ L(box3_variable_shift_tbl):
.purgem add3
endfunc
// void dav1d_sgr_box5_h_neon(int32_t *sumsq, int16_t *sum,
// const pixel (*left)[4],
// const pixel *src, const ptrdiff_t stride,
// const int w, const int h,
// const enum LrEdgeFlags edges);
function sgr_box5_h_neon, export=1
// void dav1d_sgr_box5_h_8bpc_neon(int32_t *sumsq, int16_t *sum,
// const pixel (*left)[4],
// const pixel *src, const ptrdiff_t stride,
// const int w, const int h,
// const enum LrEdgeFlags edges);
function sgr_box5_h_8bpc_neon, export=1
push {r4-r11,lr}
vpush {q4-q7}
ldrd r4, r5, [sp, #100]
......@@ -1038,7 +1038,7 @@ function sgr_box5_h_neon, export=1
b 2f
0:
// !LR_HAVE_LEFT, fill q1 with the leftmost byte
// and shift q0 to have 2x the first byte at the front.
// and shift q0 to have 3x the first byte at the front.
vdup.8 q1, d0[0]
vdup.8 q5, d8[0]
// Move r3 back to account for the last 3 bytes we loaded before,
......@@ -1215,11 +1215,11 @@ L(box5_variable_shift_tbl):
vmull.u8 q6, d9, d9
add5 4
subs r5, r5, #4
vst1.16 {d6}, [r1, :64]!
vst1.16 {d14}, [r11, :64]!
vst1.32 {q12}, [r0, :128]!
vst1.32 {q10}, [r10, :128]!
subs r5, r5, #4
ble 9f
vext.8 q0, q0, q0, #4
vext.8 q1, q1, q2, #8
......@@ -1661,11 +1661,11 @@ endfunc
#define FILTER_OUT_STRIDE 384
// void dav1d_sgr_finish_filter1_neon(coef *tmp,
// const pixel *src, const ptrdiff_t stride,
// const int32_t *a, const int16_t *b,
// const int w, const int h);
function sgr_finish_filter1_neon, export=1
// void dav1d_sgr_finish_filter1_8bpc_neon(int16_t *tmp,
// const pixel *src, const ptrdiff_t stride,
// const int32_t *a, const int16_t *b,
// const int w, const int h);
function sgr_finish_filter1_8bpc_neon, export=1
push {r4-r11,lr}
vpush {q4-q7}
ldrd r4, r5, [sp, #100]
......@@ -1765,11 +1765,11 @@ function sgr_finish_filter1_neon, export=1
pop {r4-r11,pc}
endfunc
// void dav1d_sgr_finish_filter2_neon(coef *tmp,
// const pixel *src, const ptrdiff_t stride,
// const int32_t *a, const int16_t *b,
// const int w, const int h);
function sgr_finish_filter2_neon, export=1
// void dav1d_sgr_finish_filter2_8bpc_neon(int16_t *tmp,
// const pixel *src, const ptrdiff_t stride,
// const int32_t *a, const int16_t *b,
// const int w, const int h);
function sgr_finish_filter2_8bpc_neon, export=1
push {r4-r11,lr}
vpush {q4-q7}
ldrd r4, r5, [sp, #100]
......@@ -1925,11 +1925,11 @@ function sgr_finish_filter2_neon, export=1
pop {r4-r11,pc}
endfunc
// void dav1d_sgr_weighted1_neon(pixel *dst, const ptrdiff_t dst_stride,
// const pixel *src, const ptrdiff_t src_stride,
// const coef *t1, const int w, const int h,
// const int wt);
function sgr_weighted1_neon, export=1
// void dav1d_sgr_weighted1_8bpc_neon(pixel *dst, const ptrdiff_t dst_stride,
// const pixel *src, const ptrdiff_t src_stride,
// const int16_t *t1, const int w, const int h,
// const int wt);
function sgr_weighted1_8bpc_neon, export=1
push {r4-r9,lr}
ldrd r4, r5, [sp, #28]
ldrd r6, r7, [sp, #36]
......@@ -2009,12 +2009,12 @@ function sgr_weighted1_neon, export=1
pop {r4-r9,pc}
endfunc
// void dav1d_sgr_weighted2_neon(pixel *dst, const ptrdiff_t stride,
// const pixel *src, const ptrdiff_t src_stride,
// const coef *t1, const coef *t2,
// const int w, const int h,
// const int16_t wt[2]);
function sgr_weighted2_neon, export=1
// void dav1d_sgr_weighted2_8bpc_neon(pixel *dst, const ptrdiff_t stride,
// const pixel *src, const ptrdiff_t src_stride,
// const int16_t *t1, const int16_t *t2,
// const int w, const int h,
// const int16_t wt[2]);
function sgr_weighted2_8bpc_neon, export=1
push {r4-r11,lr}
ldrd r4, r5, [sp, #36]
ldrd r6, r7, [sp, #44]
......
......@@ -753,7 +753,7 @@ L(blend_v_tbl):
add r12, r0, r1
lsl r1, r1, #1
vsub.i8 d5, d22, d4
sub r1, r1, #3
sub r1, r1, #2
4:
vld1.u8 {d2}, [r2, :64]!
vld1.32 {d0[]}, [r0, :32]
......@@ -764,10 +764,8 @@ L(blend_v_tbl):
vrshrn.i16 d20, q3, #6
vst1.16 {d20[0]}, [r0, :16]!
vst1.16 {d20[2]}, [r12, :16]!
vst1.8 {d20[2]}, [r0]!
vst1.8 {d20[6]}, [r12]!
add r0, r0, r1
add r12, r12, r1
vst1.8 {d20[2]}, [r0], r1
vst1.8 {d20[6]}, [r12], r1
bgt 4b
pop {r4-r5,pc}
80:
......@@ -776,7 +774,7 @@ L(blend_v_tbl):
add r12, r0, r1
lsl r1, r1, #1
vsub.i8 d17, d16, d2
sub r1, r1, #6
sub r1, r1, #4
8:
vld1.u8 {d4, d5}, [r2, :128]!
vld1.u8 {d0}, [r0, :64]
......@@ -790,10 +788,8 @@ L(blend_v_tbl):
vrshrn.i16 d23, q10, #6
vst1.32 {d22[0]}, [r0, :32]!
vst1.32 {d23[0]}, [r12, :32]!
vst1.16 {d22[2]}, [r0, :16]!
vst1.16 {d23[2]}, [r12, :16]!
add r0, r0, r1
add r12, r12, r1
vst1.16 {d22[2]}, [r0, :16], r1
vst1.16 {d23[2]}, [r12, :16], r1
bgt 8b
pop {r4-r5,pc}
160:
......@@ -802,7 +798,7 @@ L(blend_v_tbl):
add r12, r0, r1
lsl r1, r1, #1
vsub.i8 q11, q12, q14
sub r1, r1, #12
sub r1, r1, #8
16:
vld1.u8 {q1, q2}, [r2, :128]!
vld1.u8 {q0}, [r0, :128]
......@@ -822,20 +818,18 @@ L(blend_v_tbl):
vrshrn.i16 d21, q8, #6
vst1.u8 {d18}, [r0, :64]!
vst1.u8 {d20}, [r12, :64]!
vst1.32 {d19[0]}, [r0, :32]!
vst1.32 {d21[0]}, [r12, :32]!
add r0, r0, r1
add r12, r12, r1
vst1.32 {d19[0]}, [r0, :32], r1
vst1.32 {d21[0]}, [r12, :32], r1
bgt 16b
pop {r4-r5,pc}
320:
vmov.i8 q10, #64
vld1.u8 {q2, q3}, [r5, :128]
vsub.i8 q11, q10, q2
vsub.i8 q12, q10, q3
vsub.i8 d24, d20, d6
32:
vld1.u8 {q8, q9}, [r2, :128]!
vld1.u8 {q0, q1}, [r0, :128]
vld1.u8 {d0, d1, d2}, [r0, :64]
subs r4, r4, #1
vmull.u8 q15, d16, d4
vmlal.u8 q15, d0, d22
......
......@@ -27,6 +27,7 @@
#include "src/arm/asm.S"
#include "util.S"
#include "cdef_tmpl.S"
.macro pad_top_bottom s1, s2, w, stride, rn, rw, ret
tst w6, #1 // CDEF_HAVE_LEFT
......@@ -137,13 +138,15 @@
.endif
.endm
// void dav1d_cdef_paddingX_neon(uint16_t *tmp, const pixel *src,
// ptrdiff_t src_stride, const pixel (*left)[2],
// /*const*/ pixel *const top[2], int h,
// enum CdefEdgeFlags edges);
// void dav1d_cdef_paddingX_8bpc_neon(uint16_t *tmp, const pixel *src,
// ptrdiff_t src_stride, const pixel (*left)[2],
// const pixel *const top, int h,
// enum CdefEdgeFlags edges);
.macro padding_func w, stride, rn, rw
function cdef_padding\w\()_neon, export=1
function cdef_padding\w\()_8bpc_neon, export=1
cmp w6, #0xf // fully edged
b.eq cdef_padding\w\()_edged_8bpc_neon
movi v30.8h, #0x80, lsl #8
mov v31.16b, v30.16b
sub x0, x0, #2*(2*\stride+2)
......@@ -157,9 +160,8 @@ function cdef_padding\w\()_neon, export=1
b 3f
1:
// CDEF_HAVE_TOP
ldr x8, [x4]
ldr x9, [x4, #8]
pad_top_bottom x8, x9, \w, \stride, \rn, \rw, 0
add x9, x4, x2
pad_top_bottom x4, x9, \w, \stride, \rn, \rw, 0
// Middle section
3:
......@@ -242,358 +244,274 @@ endfunc
padding_func 8, 16, d, q
padding_func 4, 8, s, d
.macro dir_table w, stride
const directions\w
.byte -1 * \stride + 1, -2 * \stride + 2
.byte 0 * \stride + 1, -1 * \stride + 2
.byte 0 * \stride + 1, 0 * \stride + 2
.byte 0 * \stride + 1, 1 * \stride + 2
.byte 1 * \stride + 1, 2 * \stride + 2
.byte 1 * \stride + 0, 2 * \stride + 1
.byte 1 * \stride + 0, 2 * \stride + 0
.byte 1 * \stride + 0, 2 * \stride - 1
// Repeated, to avoid & 7
.byte -1 * \stride + 1, -2 * \stride + 2
.byte 0 * \stride + 1, -1 * \stride + 2
.byte 0 * \stride + 1, 0 * \stride + 2
.byte 0 * \stride + 1, 1 * \stride + 2
.byte 1 * \stride + 1, 2 * \stride + 2
.byte 1 * \stride + 0, 2 * \stride + 1
endconst
// void cdef_paddingX_edged_8bpc_neon(uint8_t *tmp, const pixel *src,
// ptrdiff_t src_stride, const pixel (*left)[2],
// const pixel *const top, int h,
// enum CdefEdgeFlags edges);
.macro padding_func_edged w, stride, reg
function cdef_padding\w\()_edged_8bpc_neon, export=1
sub x4, x4, #2
sub x0, x0, #(2*\stride+2)
.if \w == 4
ldr d0, [x4]
ldr d1, [x4, x2]
st1 {v0.8b, v1.8b}, [x0], #16
.else
add x9, x4, x2
ldr d0, [x4]
ldr s1, [x4, #8]
ldr d2, [x9]
ldr s3, [x9, #8]
str d0, [x0]
str s1, [x0, #8]
str d2, [x0, #\stride]
str s3, [x0, #\stride+8]
add x0, x0, #2*\stride
.endif
0:
ld1 {v0.h}[0], [x3], #2
ldr h2, [x1, #\w]
load_n_incr v1, x1, x2, \w
subs w5, w5, #1
str h0, [x0]
stur \reg\()1, [x0, #2]
str h2, [x0, #2+\w]
add x0, x0, #\stride
b.gt 0b
sub x1, x1, #2
.if \w == 4
ldr d0, [x1]
ldr d1, [x1, x2]
st1 {v0.8b, v1.8b}, [x0], #16
.else
add x9, x1, x2