Compare revisions

Arpad Panyik · Martin Storsjö · Arpad Panyik · Martin Storsjö · Kyle Siefring · Martin Storsjö
--- a/NEWS
+++ b/NEWS
+Changes for 1.5.0 'Road Runner':
+--------------------------------
+
+1.5.0 is a major release of dav1d, that:
+ - WARNING: we removed some of the SSE2 optimizations, so if you care about
+            systems without SSSE3, you should be careful when updating!
+ - Add Arm OpenBSD run-time CPU feature detection
+ - Optimize index offset calculations for decode_coefs
+ - picture: copy HDR10+ and T35 metadata only to visible frames
+ - SSSE3 new optimizations for 6-tap (8bit and hbd)
+ - AArch64/SVE: Add HBD subpel filters using 128-bit SVE2
+ - AArch64: Add USMMLA implementation for 6-tap H/HV
+ - AArch64: Optimize Armv8.0 NEON for HBD horizontal filters and 6-tap filters
+ - Allow playing videos in full-screen mode in dav1dplay
+
+
+Changes for 1.4.3 'Road Runner':
+--------------------------------
+
+1.4.3 is a small release focused on security issues
+ - AArch64: Fix potential out of bounds access in DotProd H/HV filters
+ - cli: Prevent buffer over-read
+
+
 Changes for 1.4.2 'Road Runner':
 --------------------------------


--- a/examples/dp_renderer.h
+++ b/examples/dp_renderer.h
@@ -30,22 +30,32 @@
 #include "dav1d/dav1d.h"

 #include <SDL.h>
-#ifdef HAVE_PLACEBO
+#if HAVE_PLACEBO
 # include <libplacebo/config.h>
 #endif

 // Check libplacebo Vulkan rendering
-#if defined(HAVE_VULKAN) && defined(SDL_VIDEO_VULKAN)
+#if HAVE_VULKAN && defined(SDL_VIDEO_VULKAN)
 # if defined(PL_HAVE_VULKAN) && PL_HAVE_VULKAN
-#  define HAVE_RENDERER_PLACEBO
-#  define HAVE_PLACEBO_VULKAN
+#  define HAVE_RENDERER_PLACEBO 1
+#  define HAVE_PLACEBO_VULKAN 1
 # endif
 #endif

 // Check libplacebo OpenGL rendering
 #if defined(PL_HAVE_OPENGL) && PL_HAVE_OPENGL
-# define HAVE_RENDERER_PLACEBO
-# define HAVE_PLACEBO_OPENGL
+# define HAVE_RENDERER_PLACEBO 1
+# define HAVE_PLACEBO_OPENGL 1
+#endif
+
+#ifndef HAVE_RENDERER_PLACEBO
+#define HAVE_RENDERER_PLACEBO 0
+#endif
+#ifndef HAVE_PLACEBO_VULKAN
+#define HAVE_PLACEBO_VULKAN 0
+#endif
+#ifndef HAVE_PLACEBO_OPENGL
+#define HAVE_PLACEBO_OPENGL 0
 #endif

 /**

--- a/examples/dp_renderer_placebo.c
+++ b/examples/dp_renderer_placebo.c
@@ -26,17 +26,17 @@

 #include "dp_renderer.h"

-#ifdef HAVE_RENDERER_PLACEBO
+#if HAVE_RENDERER_PLACEBO
 #include <assert.h>

 #include <libplacebo/renderer.h>
 #include <libplacebo/utils/dav1d.h>

-#ifdef HAVE_PLACEBO_VULKAN
+#if HAVE_PLACEBO_VULKAN
 # include <libplacebo/vulkan.h>
 # include <SDL_vulkan.h>
 #endif
-#ifdef HAVE_PLACEBO_OPENGL
+#if HAVE_PLACEBO_OPENGL
 # include <libplacebo/opengl.h>
 # include <SDL_opengl.h>
 #endif
@@ -53,7 +53,7 @@ typedef struct renderer_priv_ctx
    pl_log log;
    // Placebo renderer
    pl_renderer renderer;
-#ifdef HAVE_PLACEBO_VULKAN
+#if HAVE_PLACEBO_VULKAN
    // Placebo Vulkan handle
    pl_vulkan vk;
    // Placebo Vulkan instance
@@ -61,7 +61,7 @@ typedef struct renderer_priv_ctx
    // Vulkan surface
    VkSurfaceKHR surf;
 #endif
-#ifdef HAVE_PLACEBO_OPENGL
+#if HAVE_PLACEBO_OPENGL
    // Placebo OpenGL handle
    pl_opengl gl;
    // SDL OpenGL context
@@ -125,7 +125,7 @@ static Dav1dPlayRendererPrivateContext*
    return rd_priv_ctx;
 }

-#ifdef HAVE_PLACEBO_OPENGL
+#if HAVE_PLACEBO_OPENGL
 static void *placebo_renderer_create_gl(const Dav1dPlaySettings *settings)
 {
    SDL_Window *sdlwin = NULL;
@@ -181,7 +181,7 @@ static void *placebo_renderer_create_gl(const Dav1dPlaySettings *settings)
 }
 #endif

-#ifdef HAVE_PLACEBO_VULKAN
+#if HAVE_PLACEBO_VULKAN
 static void *placebo_renderer_create_vk(const Dav1dPlaySettings *settings)
 {
    SDL_Window *sdlwin = NULL;
@@ -278,14 +278,14 @@ static void placebo_renderer_destroy(void *cookie)
    for (int i = 0; i < 3; i++)
        pl_tex_destroy(rd_priv_ctx->gpu, &(rd_priv_ctx->plane_tex[i]));

-#ifdef HAVE_PLACEBO_VULKAN
+#if HAVE_PLACEBO_VULKAN
    if (rd_priv_ctx->vk) {
        pl_vulkan_destroy(&(rd_priv_ctx->vk));
        vkDestroySurfaceKHR(rd_priv_ctx->vk_inst->instance, rd_priv_ctx->surf, NULL);
        pl_vk_inst_destroy(&(rd_priv_ctx->vk_inst));
    }
 #endif
-#ifdef HAVE_PLACEBO_OPENGL
+#if HAVE_PLACEBO_OPENGL
    if (rd_priv_ctx->gl)
        pl_opengl_destroy(&(rd_priv_ctx->gl));
    if (rd_priv_ctx->gl_context)
@@ -392,7 +392,7 @@ static void placebo_release_pic(Dav1dPicture *pic, void *cookie)
    SDL_UnlockMutex(rd_priv_ctx->lock);
 }

-#ifdef HAVE_PLACEBO_VULKAN
+#if HAVE_PLACEBO_VULKAN
 const Dav1dPlayRenderInfo rdr_placebo_vk = {
    .name = "placebo-vk",
    .create_renderer = placebo_renderer_create_vk,
@@ -407,7 +407,7 @@ const Dav1dPlayRenderInfo rdr_placebo_vk = {
 const Dav1dPlayRenderInfo rdr_placebo_vk = { NULL };
 #endif

-#ifdef HAVE_PLACEBO_OPENGL
+#if HAVE_PLACEBO_OPENGL
 const Dav1dPlayRenderInfo rdr_placebo_gl = {
    .name = "placebo-gl",
    .create_renderer = placebo_renderer_create_gl,

--- a/examples/meson.build
+++ b/examples/meson.build
@@ -48,19 +48,23 @@ if sdl2_dependency.found()

    placebo_dependency = dependency('libplacebo', version: '>= 4.160.0', required: false)

-    if placebo_dependency.found()
+    have_vulkan = false
+    have_placebo = placebo_dependency.found()
+    if have_placebo
        dav1dplay_deps += placebo_dependency
-        dav1dplay_cflags += '-DHAVE_PLACEBO'

        # If libplacebo is found, we might be able to use Vulkan
        # with it, in which case we need the Vulkan library too.
        vulkan_dependency = dependency('vulkan', required: false)
        if vulkan_dependency.found()
            dav1dplay_deps += vulkan_dependency
-            dav1dplay_cflags += '-DHAVE_VULKAN'
+            have_vulkan = true
        endif
    endif

+    dav1dplay_cflags += '-DHAVE_PLACEBO=' + (have_placebo ? '1' : '0')
+    dav1dplay_cflags += '-DHAVE_VULKAN=' + (have_vulkan ? '1' : '0')
+
    dav1dplay = executable('dav1dplay',
        dav1dplay_sources,
        rev_target,

--- a/include/common/intops.h
+++ b/include/common/intops.h
@@ -65,11 +65,11 @@ static inline int apply_sign64(const int v, const int64_t s) {
 }

 static inline int ulog2(const unsigned v) {
-    return 31 - clz(v);
+    return 31 ^ clz(v);
 }

 static inline int u64log2(const uint64_t v) {
-    return 63 - clzll(v);
+    return 63 ^ clzll(v);
 }

 static inline unsigned inv_recenter(const unsigned r, const unsigned v) {

--- a/include/dav1d/dav1d.h
+++ b/include/dav1d/dav1d.h
@@ -31,10 +31,10 @@
 #include <errno.h>
 #include <stdarg.h>

-#include "dav1d/common.h"
-#include "dav1d/picture.h"
-#include "dav1d/data.h"
-#include "dav1d/version.h"
+#include "common.h"
+#include "picture.h"
+#include "data.h"
+#include "version.h"

 #ifdef __cplusplus
 extern "C" {

--- a/include/dav1d/meson.build
+++ b/include/dav1d/meson.build
@@ -22,24 +22,15 @@
 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

-# installed version.h header generation
-version_h_data = configuration_data()
-version_h_data.set('DAV1D_API_VERSION_MAJOR', dav1d_api_version_major)
-version_h_data.set('DAV1D_API_VERSION_MINOR', dav1d_api_version_minor)
-version_h_data.set('DAV1D_API_VERSION_PATCH', dav1d_api_version_revision)
-version_h_target = configure_file(input: 'version.h.in',
-                                  output: 'version.h',
-                                  configuration: version_h_data)
-
 dav1d_api_headers = [
    'common.h',
    'data.h',
    'dav1d.h',
    'headers.h',
    'picture.h',
+    'version.h',
 ]

 # install headers
 install_headers(dav1d_api_headers,
-                version_h_target,
                subdir : 'dav1d')
--- a/include/dav1d/version.h.in
+++ b/include/dav1d/version.h.in
 /*
- * Copyright © 2019, VideoLAN and dav1d authors
+ * Copyright © 2019-2024, VideoLAN and dav1d authors
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
@@ -31,9 +31,9 @@
 extern "C" {
 #endif

-#define DAV1D_API_VERSION_MAJOR @DAV1D_API_VERSION_MAJOR@
-#define DAV1D_API_VERSION_MINOR @DAV1D_API_VERSION_MINOR@
-#define DAV1D_API_VERSION_PATCH @DAV1D_API_VERSION_PATCH@
+#define DAV1D_API_VERSION_MAJOR 7
+#define DAV1D_API_VERSION_MINOR 0
+#define DAV1D_API_VERSION_PATCH 0

 /**
 * Extract version components from the value returned by

--- a/meson.build
+++ b/meson.build
-# Copyright © 2018-2022, VideoLAN and dav1d authors
+# Copyright © 2018-2024, VideoLAN and dav1d authors
 # All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
@@ -23,19 +23,13 @@
 # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 project('dav1d', ['c'],
-    version: '1.4.2',
+    version: '1.5.0',
    default_options: ['c_std=c99',
                      'warning_level=2',
                      'buildtype=release',
                      'b_ndebug=if-release'],
    meson_version: '>= 0.49.0')

-dav1d_soname_version       = '7.0.0'
-dav1d_api_version_array    = dav1d_soname_version.split('.')
-dav1d_api_version_major    = dav1d_api_version_array[0]
-dav1d_api_version_minor    = dav1d_api_version_array[1]
-dav1d_api_version_revision = dav1d_api_version_array[2]
-
 dav1d_src_root = meson.current_source_dir()
 cc = meson.get_compiler('c')

@@ -48,7 +42,18 @@ cdata_asm = configuration_data()
 # Include directories
 dav1d_inc_dirs = include_directories(['.', 'include/dav1d', 'include'])

-
+dav1d_api_version_major    = cc.get_define('DAV1D_API_VERSION_MAJOR',
+                                           prefix: '#include "dav1d/version.h"',
+                                           include_directories: dav1d_inc_dirs).strip()
+dav1d_api_version_minor    = cc.get_define('DAV1D_API_VERSION_MINOR',
+                                           prefix: '#include "dav1d/version.h"',
+                                           include_directories: dav1d_inc_dirs).strip()
+dav1d_api_version_revision = cc.get_define('DAV1D_API_VERSION_PATCH',
+                                           prefix: '#include "dav1d/version.h"',
+                                           include_directories: dav1d_inc_dirs).strip()
+dav1d_soname_version       = '@0@.@1@.@2@'.format(dav1d_api_version_major,
+                                                  dav1d_api_version_minor,
+                                                  dav1d_api_version_revision)

 #
 # Option handling
@@ -98,6 +103,10 @@ if host_machine.system() in ['linux', 'gnu', 'emscripten']
    add_project_arguments('-D_GNU_SOURCE', language: 'c')
 endif

+have_clock_gettime = false
+have_posix_memalign = false
+have_memalign = false
+have_aligned_alloc = false
 if host_machine.system() == 'windows'
    cdata.set('_WIN32_WINNT',           '0x0601')
    cdata.set('UNICODE',                1) # Define to 1 for Unicode (Wide Chars) APIs
@@ -145,26 +154,25 @@ else

    rt_dependency = []
    if cc.has_function('clock_gettime', prefix : '#include <time.h>', args : test_args)
-        cdata.set('HAVE_CLOCK_GETTIME', 1)
+        have_clock_gettime = true
    elif host_machine.system() not in ['darwin', 'ios', 'tvos']
        rt_dependency = cc.find_library('rt', required: false)
        if not cc.has_function('clock_gettime', prefix : '#include <time.h>', args : test_args, dependencies : rt_dependency)
            error('clock_gettime not found')
        endif
-        cdata.set('HAVE_CLOCK_GETTIME', 1)
+        have_clock_gettime = true
    endif

-    if cc.has_function('posix_memalign', prefix : '#include <stdlib.h>', args : test_args)
-        cdata.set('HAVE_POSIX_MEMALIGN', 1)
-    endif
-    if cc.has_function('memalign', prefix : '#include <malloc.h>', args : test_args)
-        cdata.set('HAVE_MEMALIGN', 1)
-    endif
-    if cc.has_function('aligned_alloc', prefix : '#include <stdlib.h>', args : test_args)
-        cdata.set('HAVE_ALIGNED_ALLOC', 1)
-    endif
+    have_posix_memalign = cc.has_function('posix_memalign', prefix : '#include <stdlib.h>', args : test_args)
+    have_memalign = cc.has_function('memalign', prefix : '#include <malloc.h>', args : test_args)
+    have_aligned_alloc = cc.has_function('aligned_alloc', prefix : '#include <stdlib.h>', args : test_args)
 endif

+cdata.set10('HAVE_CLOCK_GETTIME', have_clock_gettime)
+cdata.set10('HAVE_POSIX_MEMALIGN', have_posix_memalign)
+cdata.set10('HAVE_MEMALIGN', have_memalign)
+cdata.set10('HAVE_ALIGNED_ALLOC', have_aligned_alloc)
+
 # check for fseeko on android. It is not always available if _FILE_OFFSET_BITS is defined to 64
 have_fseeko = true
 if host_machine.system() == 'android'
@@ -181,12 +189,12 @@ if host_machine.system() == 'android'
 endif

 libdl_dependency = []
+have_dlsym = false
 if host_machine.system() == 'linux'
    libdl_dependency = cc.find_library('dl', required : false)
-    if cc.has_function('dlsym', prefix : '#include <dlfcn.h>', args : test_args, dependencies : libdl_dependency)
-        cdata.set('HAVE_DLSYM', 1)
-    endif
+    have_dlsym = cc.has_function('dlsym', prefix : '#include <dlfcn.h>', args : test_args, dependencies : libdl_dependency)
 endif
+cdata.set10('HAVE_DLSYM', have_dlsym)

 libm_dependency = cc.find_library('m', required: false)

@@ -215,23 +223,13 @@ if host_machine.cpu_family().startswith('wasm')
    stdatomic_dependencies += thread_dependency.partial_dependency(compile_args: true)
 endif

-if cc.check_header('sys/types.h')
-    cdata.set('HAVE_SYS_TYPES_H', 1)
-endif
-
-if cc.check_header('unistd.h')
-    cdata.set('HAVE_UNISTD_H', 1)
-endif
-
-if cc.check_header('io.h')
-    cdata.set('HAVE_IO_H', 1)
-endif
-
-if cc.check_header('pthread_np.h')
-    cdata.set('HAVE_PTHREAD_NP_H', 1)
-    test_args += '-DHAVE_PTHREAD_NP_H'
-endif
+cdata.set10('HAVE_SYS_TYPES_H', cc.check_header('sys/types.h'))
+cdata.set10('HAVE_UNISTD_H', cc.check_header('unistd.h'))
+cdata.set10('HAVE_IO_H', cc.check_header('io.h'))

+have_pthread_np = cc.check_header('pthread_np.h')
+cdata.set10('HAVE_PTHREAD_NP_H', have_pthread_np)
+test_args += '-DHAVE_PTHREAD_NP_H=' + (have_pthread_np ? '1' : '0')

 # Function checks

@@ -244,41 +242,32 @@ else
    getopt_dependency = []
 endif

+have_getauxval = false
+have_elf_aux_info = false
 if (host_machine.cpu_family() == 'aarch64' or
    host_machine.cpu_family().startswith('arm') or
    host_machine.cpu_family().startswith('loongarch') or
    host_machine.cpu() == 'ppc64le' or
    host_machine.cpu_family().startswith('riscv'))
-    if cc.has_function('getauxval', prefix : '#include <sys/auxv.h>', args : test_args)
-        cdata.set('HAVE_GETAUXVAL', 1)
-    endif
-    if cc.has_function('elf_aux_info', prefix : '#include <sys/auxv.h>', args : test_args)
-        cdata.set('HAVE_ELF_AUX_INFO', 1)
-    endif
+    have_getauxval = cc.has_function('getauxval', prefix : '#include <sys/auxv.h>', args : test_args)
+    have_elf_aux_info = cc.has_function('elf_aux_info', prefix : '#include <sys/auxv.h>', args : test_args)
 endif

+cdata.set10('HAVE_GETAUXVAL', have_getauxval)
+cdata.set10('HAVE_ELF_AUX_INFO', have_elf_aux_info)
+
 pthread_np_prefix = '''
 #include <pthread.h>
-#ifdef HAVE_PTHREAD_NP_H
+#if HAVE_PTHREAD_NP_H
 #include <pthread_np.h>
 #endif
 '''
-if cc.has_function('pthread_getaffinity_np', prefix : pthread_np_prefix, args : test_args, dependencies : thread_dependency)
-    cdata.set('HAVE_PTHREAD_GETAFFINITY_NP', 1)
-endif
-if cc.has_function('pthread_setaffinity_np', prefix : pthread_np_prefix, args : test_args, dependencies : thread_dependency)
-    cdata.set('HAVE_PTHREAD_SETAFFINITY_NP', 1)
-endif
-if cc.has_function('pthread_setname_np', prefix : pthread_np_prefix, args : test_args, dependencies : thread_dependency)
-    cdata.set('HAVE_PTHREAD_SETNAME_NP', 1)
-endif
-if cc.has_function('pthread_set_name_np', prefix : pthread_np_prefix, args : test_args, dependencies : thread_dependency)
-    cdata.set('HAVE_PTHREAD_SET_NAME_NP', 1)
-endif
+cdata.set10('HAVE_PTHREAD_GETAFFINITY_NP', cc.has_function('pthread_getaffinity_np', prefix : pthread_np_prefix, args : test_args, dependencies : thread_dependency))
+cdata.set10('HAVE_PTHREAD_SETAFFINITY_NP', cc.has_function('pthread_setaffinity_np', prefix : pthread_np_prefix, args : test_args, dependencies : thread_dependency))
+cdata.set10('HAVE_PTHREAD_SETNAME_NP', cc.has_function('pthread_setname_np', prefix : pthread_np_prefix, args : test_args, dependencies : thread_dependency))
+cdata.set10('HAVE_PTHREAD_SET_NAME_NP', cc.has_function('pthread_set_name_np', prefix : pthread_np_prefix, args : test_args, dependencies : thread_dependency))

-if cc.compiles('int x = _Generic(0, default: 0);', name: '_Generic', args: test_args)
-    cdata.set('HAVE_C11_GENERIC', 1)
-endif
+cdata.set10('HAVE_C11_GENERIC', cc.compiles('int x = _Generic(0, default: 0);', name: '_Generic', args: test_args))

 # Compiler flag tests

@@ -359,6 +348,17 @@ endif

 cdata.set10('ARCH_AARCH64', host_machine.cpu_family() == 'aarch64' or host_machine.cpu() == 'arm64')
 cdata.set10('ARCH_ARM',     host_machine.cpu_family().startswith('arm') and host_machine.cpu() != 'arm64')
+
+have_as_func = false
+have_as_arch = false
+aarch64_extensions = {
+    'dotprod': 'udot v0.4s, v0.16b, v0.16b',
+    'i8mm':    'usdot v0.4s, v0.16b, v0.16b',
+    'sve':     'whilelt p0.s, x0, x1',
+    'sve2':    'sqrdmulh z0.s, z0.s, z0.s',
+}
+supported_aarch64_archexts = []
+supported_aarch64_instructions = []
 if (is_asm_enabled and
    (host_machine.cpu_family() == 'aarch64' or
     host_machine.cpu_family().startswith('arm')))
@@ -369,7 +369,6 @@ if (is_asm_enabled and
 );
 '''
    have_as_func = cc.compiles(as_func_code)
-    cdata.set10('HAVE_AS_FUNC', have_as_func)

    # fedora package build infrastructure uses a gcc specs file to enable
    # '-fPIE' by default. The chosen way only adds '-fPIE' to the C compiler
@@ -390,7 +389,6 @@ if (is_asm_enabled and

    if host_machine.cpu_family() == 'aarch64'
        have_as_arch = cc.compiles('''__asm__ (".arch armv8-a");''')
-        cdata.set10('HAVE_AS_ARCH_DIRECTIVE', have_as_arch)
        as_arch_str = ''
        if have_as_arch
            as_arch_level = 'armv8-a'
@@ -419,13 +417,7 @@ if (is_asm_enabled and
            cdata.set('AS_ARCH_LEVEL', as_arch_level)
            as_arch_str = '".arch ' + as_arch_level + '\\n"'
        endif
-        extensions = {
-            'dotprod': 'udot v0.4s, v0.16b, v0.16b',
-            'i8mm':    'usdot v0.4s, v0.16b, v0.16b',
-            'sve':     'whilelt p0.s, x0, x1',
-            'sve2':    'sqrdmulh z0.s, z0.s, z0.s',
-        }
-        foreach name, instr : extensions
+        foreach name, instr : aarch64_extensions
            # Test for support for the various extensions. First test if
            # the assembler supports the .arch_extension directive for
            # enabling/disabling the extension, then separately check whether
@@ -436,19 +428,27 @@ if (is_asm_enabled and
            code += '".arch_extension ' + name + '\\n"'
            code += ');'
            supports_archext = cc.compiles(code)
-            cdata.set10('HAVE_AS_ARCHEXT_' + name.to_upper() + '_DIRECTIVE', supports_archext)
            code = '__asm__ (' + as_arch_str
            if supports_archext
+                supported_aarch64_archexts += name
                code += '".arch_extension ' + name + '\\n"'
            endif
            code += '"' + instr + '\\n"'
            code += ');'
-            supports_instr = cc.compiles(code, name: name.to_upper())
-            cdata.set10('HAVE_' + name.to_upper(), supports_instr)
+            if cc.compiles(code, name: name.to_upper())
+                supported_aarch64_instructions += name
+            endif
        endforeach
    endif
 endif

+cdata.set10('HAVE_AS_FUNC', have_as_func)
+cdata.set10('HAVE_AS_ARCH_DIRECTIVE', have_as_arch)
+foreach name, _ : aarch64_extensions
+    cdata.set10('HAVE_AS_ARCHEXT_' + name.to_upper() + '_DIRECTIVE', name in supported_aarch64_archexts)
+    cdata.set10('HAVE_' + name.to_upper(), name in supported_aarch64_instructions)
+endforeach
+
 cdata.set10('ARCH_X86', host_machine.cpu_family().startswith('x86'))
 cdata.set10('ARCH_X86_64', host_machine.cpu_family() == 'x86_64')
 cdata.set10('ARCH_X86_32', host_machine.cpu_family() == 'x86')

--- a/package/crossfiles/arm64-iPhoneOS.meson
+++ b/package/crossfiles/arm64-iPhoneOS.meson
+[binaries]
+c = ['clang', '-arch', 'arm64', '-isysroot', '/Applications/Xcode.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk']
+cpp = ['clang++', '-arch', 'arm64', '-isysroot', '/Applications/Xcode.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk']
+objc = ['clang', '-arch', 'arm64', '-isysroot', '/Applications/Xcode.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk']
+objcpp = ['clang++', '-arch', 'arm64', '-isysroot', '/Applications/Xcode.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk']
+ar = 'ar'
+strip = 'strip'
+
+[built-in options]
+c_args = ['-miphoneos-version-min=11.0']
+cpp_args = ['-miphoneos-version-min=11.0']
+c_link_args = ['-miphoneos-version-min=11.0']
+cpp_link_args = ['-miphoneos-version-min=11.0']
+objc_args = ['-miphoneos-version-min=11.0']
+objcpp_args = ['-miphoneos-version-min=11.0']
+
+[properties]
+root = '/Applications/Xcode.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer'
+needs_exe_wrapper = true
+
+[host_machine]
+system = 'darwin'
+subsystem = 'ios'
+kernel = 'xnu'
+cpu_family = 'aarch64'
+cpu = 'aarch64'
+endian = 'little'
--- a/package/crossfiles/x86_64-iPhoneSimulator.meson
+++ b/package/crossfiles/x86_64-iPhoneSimulator.meson
+[binaries]
+c = ['clang', '-arch', 'x86_64', '-isysroot', '/Applications/Xcode.app/Contents/Developer/Platforms/iPhoneSimulator.platform/Developer/SDKs/iPhoneSimulator.sdk']
+cpp = ['clang++', '-arch', 'x86_64', '-isysroot', '/Applications/Xcode.app/Contents/Developer/Platforms/iPhoneSimulator.platform/Developer/SDKs/iPhoneSimulator.sdk']
+objc = ['clang', '-arch', 'x86_64', '-isysroot', '/Applications/Xcode.app/Contents/Developer/Platforms/iPhoneSimulator.platform/Developer/SDKs/iPhoneSimulator.sdk']
+objcpp = ['clang++', '-arch', 'x86_64', '-isysroot', '/Applications/Xcode.app/Contents/Developer/Platforms/iPhoneSimulator.platform/Developer/SDKs/iPhoneSimulator.sdk']
+ar = 'ar'
+strip = 'strip'
+
+[built-in options]
+c_args = ['-miphoneos-version-min=11.0']
+cpp_args = ['-miphoneos-version-min=11.0']
+c_link_args = ['-miphoneos-version-min=11.0']
+cpp_link_args = ['-miphoneos-version-min=11.0']
+objc_args = ['-miphoneos-version-min=11.0']
+objcpp_args = ['-miphoneos-version-min=11.0']
+
+[properties]
+root = '/Applications/Xcode.app/Contents/Developer/Platforms/iPhoneSimulator.platform/Developer'
+needs_exe_wrapper = true
+
+[host_machine]
+system = 'darwin'
+subsystem = 'ios-simulator'
+kernel = 'xnu'
+cpu_family = 'x86_64'
+cpu = 'x86_64'
+endian = 'little'
--- a/src/arm/32/util.S
+++ b/src/arm/32/util.S
@@ -31,18 +31,36 @@

 #include "config.h"
 #include "src/arm/asm.S"
+#include "src/arm/arm-arch.h"
+
+.macro v4bx rd
+#if __ARM_ARCH >= 5 || defined(__ARM_ARCH_4T__)
+        bx              \rd
+#else
+        mov             pc, \rd
+#endif
+.endm
+
+.macro v4blx rd
+#if __ARM_ARCH >= 5
+        blx             \rd
+#else
+        mov             lr,  pc
+        v4bx            \rd
+#endif
+.endm

 .macro movrel_local rd, val, offset=0
-#if defined(PIC)
+#if (__ARM_ARCH >= 7 || defined(__ARM_ARCH_6T2__)) && !defined(PIC)
+        movw            \rd, #:lower16:\val+\offset
+        movt            \rd, #:upper16:\val+\offset
+#else
        ldr             \rd,  90001f
        b               90002f
 90001:
        .word           \val + \offset - (90002f + 8 - 4 * CONFIG_THUMB)
 90002:
        add             \rd,  \rd,  pc
-#else
-        movw            \rd, #:lower16:\val+\offset
-        movt            \rd, #:upper16:\val+\offset
 #endif
 .endm


--- a/src/arm/64/looprestoration_common.S
+++ b/src/arm/64/looprestoration_common.S
@@ -28,14 +28,77 @@
 #include "src/arm/asm.S"
 #include "util.S"

+// Series of LUTs for efficiently computing sgr's 1 - x/(x+1) table.
+// In the comments, let RefTable denote the original, reference table.
+const x_by_x_tables
+// RangeMins
+//
+// Min(RefTable[i*8:i*8+8])
+// First two values are zeroed.
+//
+// Lookup using RangeMins[(x >> 3)]
+        .byte 0,  0, 11,  8,  6,  5,  5,  4,  4,  3,  3,  3,  2,  2,  2,  2
+        .byte 2,  2,  2,  2,  2,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  0
+
+// DiffMasks
+//
+// This contains a bit pattern, indicating at which index positions the value of RefTable changes. For each range
+// in the RangeMins table (covering 8 RefTable entries), we have one byte; each bit indicates whether the value of
+// RefTable changes at that particular index.
+// Using popcount, we can integrate the diff bit field. By shifting away bits in a byte, we can refine the range of
+// the integral. Finally, adding the integral to RangeMins[(x>>3)] reconstructs RefTable (for x > 15).
+//
+// Lookup using DiffMasks[(x >> 3)]
+        .byte 0x00, 0x00, 0xD4, 0x44
+        .byte 0x42, 0x04, 0x00, 0x00
+        .byte 0x00, 0x80, 0x00, 0x00
+        .byte 0x04, 0x00, 0x00, 0x00
+        .byte 0x00, 0x00, 0x00, 0x00
+        .byte 0x00, 0x40, 0x00, 0x00
+        .byte 0x00, 0x00, 0x00, 0x00
+        .byte 0x00, 0x00, 0x00, 0x02
+// Binary form:
+// 0b00000000, 0b00000000, 0b11010100, 0b01000100
+// 0b01000010, 0b00000100, 0b00000000, 0b00000000
+// 0b00000000, 0b10000000, 0b00000000, 0b00000000
+// 0b00000100, 0b00000000, 0b00000000, 0b00000000
+// 0b00000000, 0b00000000, 0b00000000, 0b00000000
+// 0b00000000, 0b01000000, 0b00000000, 0b00000000
+// 0b00000000, 0b00000000, 0b00000000, 0b00000000
+// 0b00000000, 0b00000000, 0b00000000, 0b00000010
+
+// RefLo
+//
+// RefTable[0:16]
+//      i.e. First 16 elements of the original table.
+// Added to the sum produced by the rest of the LUT logic so that the first 16 entries of RefTable are covered.
+//
+// Lookup using RefLo[x] (tbl will replace x > 15 with 0)
+        .byte 255, 128,  85,  64,  51,  43,  37,  32, 28,  26,  23,  21,  20,  18,  17,  16
+
+// Pseudo assembly
+//
+// hi_bits = x >> 3
+// tbl             ref,    {RefLo}, x
+// tbl             diffs,  {DiffMasks[0:16], DiffMasks[16:32]}, hi_bits
+// tbl             min,    {RangeMins[0:16], RangeMins[16:32]}, hi_bits
+// lo_bits = x & 0x7
+// diffs = diffs << lo_bits
+// ref = ref + min
+// integral = popcnt(diffs)
+// ref = ref + integral
+// return ref
+endconst
+
 // void dav1d_sgr_box3_vert_neon(int32_t **sumsq, int16_t **sum,
 //                               int32_t *AA, int16_t *BB,
 //                               const int w, const int s,
 //                               const int bitdepth_max);
 function sgr_box3_vert_neon, export=1
-        stp             d8,  d9,  [sp, #-0x30]!
+        stp             d8,  d9,  [sp, #-0x40]!
        stp             d10, d11, [sp, #0x10]
        stp             d12, d13, [sp, #0x20]
+        stp             d14, d15, [sp, #0x30]

        add             w4,  w4,  #2
        clz             w9,  w6        // bitdepth_max
@@ -49,93 +112,112 @@ function sgr_box3_vert_neon, export=1
        movi            v31.4s,   #9   // n

        sub             w9,  w9,  #24  // -bitdepth_min_8
-        movrel          x12, X(sgr_x_by_x)
+        movrel          x12, x_by_x_tables
        mov             w13, #455      // one_by_x
-        ld1             {v16.16b, v17.16b, v18.16b}, [x12]
+        ld1             {v24.16b, v25.16b, v26.16b, v27.16b}, [x12] // RangeMins, DiffMasks
+        movi            v22.16b, #0x7
+        ldr             q23, [x12, #64] //RefLo
        dup             v6.8h,    w9   // -bitdepth_min_8
-        movi            v19.16b,  #5
-        movi            v20.8b,   #55  // idx of last 5
-        movi            v21.8b,   #72  // idx of last 4
-        movi            v22.8b,   #101 // idx of last 3
-        movi            v23.8b,   #169 // idx of last 2
-        movi            v24.8b,   #254 // idx of last 1
        saddl           v7.4s,    v6.4h,   v6.4h  // -2*bitdepth_min_8
        movi            v29.8h,   #1, lsl #8
        dup             v30.4s,   w13  // one_by_x

-        sub             v16.16b, v16.16b, v19.16b
-        sub             v17.16b, v17.16b, v19.16b
-        sub             v18.16b, v18.16b, v19.16b
-
-        ld1             {v8.4s,  v9.4s},  [x5], #32
-        ld1             {v10.4s, v11.4s}, [x6], #32
-        ld1             {v12.8h},         [x7], #16
-        ld1             {v13.8h},         [x8], #16
-        ld1             {v0.4s, v1.4s},   [x0], #32
-        ld1             {v2.8h},          [x1], #16
+        ld1             {v8.4s,  v9.4s,  v10.4s, v11.4s}, [x5], #64
+        ld1             {v12.4s, v13.4s, v14.4s, v15.4s}, [x6], #64
+        ld1             {v16.4s, v17.4s, v18.4s, v19.4s}, [x0], #64
+        ld1             {v20.8h, v21.8h}, [x8], #32
+        ld1             {v0.8h,  v1.8h},  [x7], #32
 1:
-
-        add             v8.4s,   v8.4s,   v10.4s
-        add             v9.4s,   v9.4s,   v11.4s
-
-        add             v12.8h,  v12.8h,  v13.8h
-
-        subs            w4,  w4,  #8
-        add             v0.4s,   v0.4s,   v8.4s
-        add             v1.4s,   v1.4s,   v9.4s
-        add             v2.8h,   v2.8h,   v12.8h
-
-        srshl           v0.4s,   v0.4s,   v7.4s
-        srshl           v1.4s,   v1.4s,   v7.4s
-        srshl           v4.8h,   v2.8h,   v6.8h
-        mul             v0.4s,   v0.4s,   v31.4s // a * n
-        mul             v1.4s,   v1.4s,   v31.4s // a * n
-        umull           v3.4s,   v4.4h,   v4.4h  // b * b
-        umull2          v4.4s,   v4.8h,   v4.8h  // b * b
-        uqsub           v0.4s,   v0.4s,   v3.4s  // imax(a * n - b * b, 0)
-        uqsub           v1.4s,   v1.4s,   v4.4s  // imax(a * n - b * b, 0)
-        mul             v0.4s,   v0.4s,   v28.4s // p * s
-        mul             v1.4s,   v1.4s,   v28.4s // p * s
-        ld1             {v8.4s,  v9.4s},  [x5], #32
-        uqshrn          v0.4h,   v0.4s,   #16
-        uqshrn2         v0.8h,   v1.4s,   #16
-        ld1             {v10.4s, v11.4s}, [x6], #32
-        uqrshrn         v0.8b,   v0.8h,   #4     // imin(z, 255)
-
-        ld1             {v12.8h},         [x7], #16
-
-        cmhi            v25.8b,  v0.8b,   v20.8b // = -1 if sgr_x_by_x[v0] < 5
-        cmhi            v26.8b,  v0.8b,   v21.8b // = -1 if sgr_x_by_x[v0] < 4
-        tbl             v1.8b, {v16.16b,  v17.16b, v18.16b}, v0.8b
-        cmhi            v27.8b,  v0.8b,   v22.8b // = -1 if sgr_x_by_x[v0] < 3
-        cmhi            v4.8b,   v0.8b,   v23.8b // = -1 if sgr_x_by_x[v0] < 2
-        add             v25.8b,  v25.8b,  v26.8b
-        cmhi            v5.8b,   v0.8b,   v24.8b // = -1 if sgr_x_by_x[v0] < 1
-        add             v27.8b,  v27.8b,  v4.8b
-        add             v5.8b,   v5.8b,   v19.8b
-        add             v25.8b,  v25.8b,  v27.8b
-        add             v5.8b,   v1.8b,   v5.8b
-        ld1             {v13.8h},         [x8], #16
-        add             v5.8b,   v5.8b,   v25.8b
-        ld1             {v0.4s, v1.4s},   [x0], #32
-        uxtl            v5.8h,   v5.8b           // x
-
-        umull           v3.4s,   v5.4h,   v2.4h  // x * BB[i]
-        umull2          v4.4s,   v5.8h,   v2.8h  // x * BB[i]
-        mul             v3.4s,   v3.4s,   v30.4s // x * BB[i] * sgr_one_by_x
-        mul             v4.4s,   v4.4s,   v30.4s // x * BB[i] * sgr_one_by_x
-        srshr           v3.4s,   v3.4s,   #12    // AA[i]
-        srshr           v4.4s,   v4.4s,   #12    // AA[i]
-        sub             v5.8h,   v29.8h,  v5.8h  // 256 - x
-        ld1             {v2.8h},          [x1], #16
-
-        st1             {v3.4s, v4.4s}, [x2], #32
-        st1             {v5.8h}, [x3], #16
+        ld1             {v2.8h,  v3.8h},   [x1], #32
+        add             v8.4s,   v8.4s,   v12.4s
+        add             v9.4s,   v9.4s,   v13.4s
+        add             v10.4s,  v10.4s,  v14.4s
+        add             v11.4s,  v11.4s,  v15.4s
+        add             v0.8h,   v0.8h,   v20.8h
+        add             v1.8h,   v1.8h,   v21.8h
+
+        add             v16.4s,  v16.4s,  v8.4s
+        add             v17.4s,  v17.4s,  v9.4s
+        add             v18.4s,  v18.4s,  v10.4s
+        add             v19.4s,  v19.4s,  v11.4s
+        add             v4.8h,   v2.8h,   v0.8h
+        add             v5.8h,   v3.8h,   v1.8h
+
+        srshl           v16.4s,  v16.4s,  v7.4s
+        srshl           v17.4s,  v17.4s,  v7.4s
+        srshl           v18.4s,  v18.4s,  v7.4s
+        srshl           v19.4s,  v19.4s,  v7.4s
+        srshl           v9.8h,   v4.8h,   v6.8h
+        srshl           v13.8h,  v5.8h,   v6.8h
+        mul             v16.4s,  v16.4s,  v31.4s // a * n
+        mul             v17.4s,  v17.4s,  v31.4s // a * n
+        mul             v18.4s,  v18.4s,  v31.4s // a * n
+        mul             v19.4s,  v19.4s,  v31.4s // a * n
+        umull           v8.4s,   v9.4h,   v9.4h  // b * b
+        umull2          v9.4s,   v9.8h,   v9.8h  // b * b
+        umull           v12.4s,  v13.4h,  v13.4h // b * b
+        umull2          v13.4s,  v13.8h,  v13.8h // b * b
+        uqsub           v16.4s,  v16.4s,  v8.4s  // imax(a * n - b * b, 0)
+        uqsub           v17.4s,  v17.4s,  v9.4s  // imax(a * n - b * b, 0)
+        uqsub           v18.4s,  v18.4s,  v12.4s // imax(a * n - b * b, 0)
+        uqsub           v19.4s,  v19.4s,  v13.4s // imax(a * n - b * b, 0)
+        mul             v16.4s,  v16.4s,  v28.4s // p * s
+        mul             v17.4s,  v17.4s,  v28.4s // p * s
+        mul             v18.4s,  v18.4s,  v28.4s // p * s
+        mul             v19.4s,  v19.4s,  v28.4s // p * s
+        uqshrn          v16.4h,  v16.4s,  #16
+        uqshrn2         v16.8h,  v17.4s,  #16
+        uqshrn          v18.4h,  v18.4s,  #16
+        uqshrn2         v18.8h,  v19.4s,  #16
+        uqrshrn         v1.8b,   v16.8h,  #4     // imin(z, 255)
+        uqrshrn2        v1.16b,  v18.8h,  #4     // imin(z, 255)
+
+        ld1             {v16.4s, v17.4s}, [x0], #32
+        subs            w4,  w4,  #16
+
+        ushr            v0.16b,  v1.16b,  #3
+        ld1             {v8.4s,  v9.4s}, [x5], #32
+        tbl             v2.16b,  {v26.16b, v27.16b}, v0.16b // RangeMins
+        tbl             v0.16b,  {v24.16b, v25.16b}, v0.16b // DiffMasks
+        tbl             v3.16b,  {v23.16b}, v1.16b          // RefLo
+        and             v1.16b,  v1.16b,   v22.16b
+        ld1             {v12.4s, v13.4s}, [x6], #32
+        ushl            v1.16b,  v2.16b,  v1.16b
+        ld1             {v20.8h, v21.8h}, [x8], #32
+        add             v3.16b,  v3.16b,  v0.16b
+        cnt             v1.16b,  v1.16b
+        ld1             {v18.4s, v19.4s}, [x0], #32
+        add             v3.16b,  v3.16b,  v1.16b
+        ld1             {v10.4s, v11.4s}, [x5], #32
+        uxtl            v0.8h,   v3.8b           // x
+        uxtl2           v1.8h,   v3.16b          // x
+
+        ld1             {v14.4s, v15.4s}, [x6], #32
+
+        umull           v2.4s,   v0.4h,   v4.4h // x * BB[i]
+        umull2          v3.4s,   v0.8h,   v4.8h // x * BB[i]
+        umull           v4.4s,   v1.4h,   v5.4h // x * BB[i]
+        umull2          v5.4s,   v1.8h,   v5.8h // x * BB[i]
+        sub             v0.8h,   v29.8h,  v0.8h // 256 - x
+        sub             v1.8h,   v29.8h,  v1.8h // 256 - x
+        mul             v2.4s,   v2.4s,  v30.4s // x * BB[i] * sgr_one_by_x
+        mul             v3.4s,   v3.4s,  v30.4s // x * BB[i] * sgr_one_by_x
+        mul             v4.4s,   v4.4s,  v30.4s // x * BB[i] * sgr_one_by_x
+        mul             v5.4s,   v5.4s,  v30.4s // x * BB[i] * sgr_one_by_x
+        st1             {v0.8h, v1.8h}, [x3], #32
+        ld1             {v0.8h, v1.8h}, [x7], #32
+        srshr           v2.4s,   v2.4s,  #12    // AA[i]
+        srshr           v3.4s,   v3.4s,  #12    // AA[i]
+        srshr           v4.4s,   v4.4s,  #12    // AA[i]
+        srshr           v5.4s,   v5.4s,  #12    // AA[i]
+
+        st1             {v2.4s, v3.4s, v4.4s, v5.4s}, [x2], #64
        b.gt            1b

+        ldp             d14, d15, [sp, #0x30]
        ldp             d12, d13, [sp, #0x20]
        ldp             d10, d11, [sp, #0x10]
-        ldp             d8,  d9,  [sp], 0x30
+        ldp             d8,  d9,  [sp], 0x40
        ret
 endfunc

@@ -144,10 +226,9 @@ endfunc
 //                               const int w, const int s,
 //                               const int bitdepth_max);
 function sgr_box5_vert_neon, export=1
-        stp             d8,  d9,  [sp, #-0x40]!
+        stp             d8,  d9,  [sp, #-0x30]!
        stp             d10, d11, [sp, #0x10]
        stp             d12, d13, [sp, #0x20]
-        stp             d14, d15, [sp, #0x30]

        add             w4,  w4,  #2
        clz             w15, w6        // bitdepth_max
@@ -163,24 +244,19 @@ function sgr_box5_vert_neon, export=1
        movi            v31.4s,   #25   // n

        sub             w15, w15, #24  // -bitdepth_min_8
-        movrel          x13, X(sgr_x_by_x)
-        mov             w14, #164      // one_by_x
-        ld1             {v16.16b, v17.16b, v18.16b}, [x13]
+        movrel          x13, x_by_x_tables
+        movi            v30.4s,  #164
+        ld1             {v24.16b, v25.16b, v26.16b, v27.16b}, [x13] // RangeMins, DiffMasks
        dup             v6.8h,   w15  // -bitdepth_min_8
-        movi            v19.16b, #5
-        movi            v24.8b,  #254 // idx of last 1
+        movi            v19.8b,  #0x7
+        ldr             q18, [x13, #64] // RefLo
        saddl           v7.4s,   v6.4h,   v6.4h  // -2*bitdepth_min_8
        movi            v29.8h,  #1, lsl #8
-        dup             v30.4s,  w14  // one_by_x
-
-        sub             v16.16b, v16.16b, v19.16b
-        sub             v17.16b, v17.16b, v19.16b
-        sub             v18.16b, v18.16b, v19.16b

        ld1             {v8.4s,  v9.4s},  [x5], #32
        ld1             {v10.4s, v11.4s}, [x6], #32
        ld1             {v12.4s, v13.4s}, [x7], #32
-        ld1             {v14.4s, v15.4s}, [x8], #32
+        ld1             {v16.4s, v17.4s}, [x8], #32
        ld1             {v20.8h},         [x9], #16
        ld1             {v21.8h},         [x10], #16
        ld1             {v22.8h},         [x11], #16
@@ -191,8 +267,8 @@ function sgr_box5_vert_neon, export=1
 1:
        add             v8.4s,   v8.4s,   v10.4s
        add             v9.4s,   v9.4s,   v11.4s
-        add             v12.4s,  v12.4s,  v14.4s
-        add             v13.4s,  v13.4s,  v15.4s
+        add             v12.4s,  v12.4s,  v16.4s
+        add             v13.4s,  v13.4s,  v17.4s

        add             v20.8h,  v20.8h,  v21.8h
        add             v22.8h,  v22.8h,  v23.8h
@@ -207,11 +283,6 @@ function sgr_box5_vert_neon, export=1

        subs            w4,  w4,  #8

-        movi            v20.8b,  #55  // idx of last 5
-        movi            v21.8b,  #72  // idx of last 4
-        movi            v22.8b,  #101 // idx of last 3
-        movi            v23.8b,  #169 // idx of last 2
-
        srshl           v0.4s,   v0.4s,   v7.4s
        srshl           v1.4s,   v1.4s,   v7.4s
        srshl           v4.8h,   v2.8h,   v6.8h
@@ -231,22 +302,19 @@ function sgr_box5_vert_neon, export=1

        ld1             {v12.4s, v13.4s}, [x7], #32

-        cmhi            v25.8b,  v0.8b,   v20.8b // = -1 if sgr_x_by_x[v0] < 5
-        cmhi            v26.8b,  v0.8b,   v21.8b // = -1 if sgr_x_by_x[v0] < 4
-        tbl             v1.8b, {v16.16b,  v17.16b, v18.16b}, v0.8b
-        cmhi            v27.8b,  v0.8b,   v22.8b // = -1 if sgr_x_by_x[v0] < 3
-        cmhi            v4.8b,   v0.8b,   v23.8b // = -1 if sgr_x_by_x[v0] < 2
-        ld1             {v14.4s, v15.4s}, [x8], #32
-        add             v25.8b,  v25.8b,  v26.8b
-        cmhi            v5.8b,   v0.8b,   v24.8b // = -1 if sgr_x_by_x[v0] < 1
-        add             v27.8b,  v27.8b,  v4.8b
+        ushr            v1.8b,   v0.8b,  #3
+        ld1             {v16.4s, v17.4s}, [x8], #32
+        tbl             v5.8b,   {v26.16b, v27.16b}, v1.8b // RangeMins
+        tbl             v1.8b,   {v24.16b, v25.16b}, v1.8b // DiffMasks
+        tbl             v4.8b,   {v18.16b}, v0.8b          // RefLo
+        and             v0.8b,   v0.8b,  v19.8b
        ld1             {v20.8h},         [x9], #16
-        add             v5.8b,   v5.8b,   v19.8b
-        add             v25.8b,  v25.8b,  v27.8b
+        ushl            v5.8b,   v5.8b,  v0.8b
+        add             v4.8b,   v4.8b,  v1.8b
        ld1             {v21.8h},         [x10], #16
-        add             v5.8b,   v1.8b,   v5.8b
+        cnt             v5.8b,   v5.8b
        ld1             {v22.8h},         [x11], #16
-        add             v5.8b,   v5.8b,   v25.8b
+        add             v5.8b,   v4.8b,  v5.8b
        ld1             {v23.8h},         [x12], #16
        uxtl            v5.8h,   v5.8b           // x

@@ -264,9 +332,8 @@ function sgr_box5_vert_neon, export=1
        st1             {v5.8h}, [x3], #16
        b.gt            1b

-        ldp             d14, d15, [sp, #0x30]
        ldp             d12, d13, [sp, #0x20]
        ldp             d10, d11, [sp, #0x10]
-        ldp             d8,  d9,  [sp], 0x40
+        ldp             d8,  d9,  [sp], 0x30
        ret
 endfunc
--- a/src/arm/64/mc.S
+++ b/src/arm/64/mc.S
@@ -119,10 +119,10 @@ function \type\()_8bpc_neon, export=1
        add             x7,  x0,  x1
        lsl             x1,  x1,  #1
 8:
-        st1             {v4.d}[0],  [x0], x1
+        st1             {v4.8b},    [x0], x1
        \type           v5,  v0,  v1,  v2,  v3
        st1             {v4.d}[1],  [x7], x1
-        st1             {v5.d}[0],  [x0], x1
+        st1             {v5.8b},    [x0], x1
        subs            w5,  w5,  #4
        st1             {v5.d}[1],  [x7], x1
        b.le            0f
@@ -271,7 +271,7 @@ function w_mask_\type\()_8bpc_neon, export=1
        addp            v18.8h,   v24.8h,  v24.8h
        sub             v18.4h,   v3.4h,   v18.4h
        rshrn           v18.8b,   v18.8h,  #2
-        st1             {v18.s}[0],  [x6],  #4
+        str             s18,         [x6],  #4
 .endif
        st1             {v22.s}[0],  [x0],  x1
        st1             {v22.s}[1],  [x12], x1
@@ -315,7 +315,7 @@ function w_mask_\type\()_8bpc_neon, export=1
        addp            v18.8h,  v18.8h,  v18.8h
        sub             v18.4h,  v3.4h,   v18.4h
        rshrn           v18.8b,  v18.8h,  #2
-        st1             {v18.s}[0],  [x6],  #4
+        str             s18,       [x6],  #4
 .endif
        st1             {v22.8b},  [x0],  x1
        st1             {v23.8b},  [x12], x1
@@ -448,9 +448,9 @@ function blend_8bpc_neon, export=1
 40:
        AARCH64_VALID_JUMP_TARGET
 4:
-        ld1             {v2.8b},     [x5],  #8
-        ld1             {v1.d}[0],   [x2],  #8
-        ld1             {v0.s}[0],   [x0]
+        ld1             {v2.8b},  [x5],  #8
+        ldr             d1,       [x2],  #8
+        ldr             s0,       [x0]
        subs            w4,  w4,  #2
        ld1             {v0.s}[1],   [x8]
        sub             v3.8b,   v4.8b,   v2.8b
@@ -466,8 +466,8 @@ function blend_8bpc_neon, export=1
 8:
        ld1             {v2.16b},  [x5],  #16
        ld1             {v1.16b},  [x2],  #16
-        ld1             {v0.d}[0],   [x0]
-        ld1             {v0.d}[1],   [x8]
+        ldr             d0,        [x0]
+        ld1             {v0.d}[1], [x8]
        sub             v3.16b,  v4.16b,  v2.16b
        subs            w4,  w4,  #2
        umull           v5.8h,   v1.8b,   v2.8b
@@ -475,9 +475,9 @@ function blend_8bpc_neon, export=1
        umull2          v6.8h,   v1.16b,  v2.16b
        umlal2          v6.8h,   v0.16b,  v3.16b
        rshrn           v7.8b,   v5.8h,   #6
-        rshrn2          v7.16b,  v6.8h,   #6
-        st1             {v7.d}[0],   [x0],  x1
-        st1             {v7.d}[1],   [x8],  x1
+        rshrn           v16.8b,  v6.8h,   #6
+        st1             {v7.8b},   [x0],  x1
+        st1             {v16.8b},  [x8],  x1
        b.gt            8b
        ret
 160:
@@ -571,10 +571,10 @@ function blend_h_8bpc_neon, export=1
 20:
        AARCH64_VALID_JUMP_TARGET
 2:
-        ld1             {v0.h}[0],   [x5],  #2
-        ld1             {v1.s}[0],   [x2],  #4
+        ldr             h0,  [x5],  #2
+        ldr             s1,  [x2],  #4
        subs            w4,  w4,  #2
-        ld1             {v2.h}[0],   [x0]
+        ldr             h2,  [x0]
        zip1            v0.8b,   v0.8b,   v0.8b
        sub             v3.8b,   v4.8b,   v0.8b
        ld1             {v2.h}[1],   [x8]
@@ -592,7 +592,7 @@ function blend_h_8bpc_neon, export=1
        ld1             {v2.8b},   [x2],  #8
        subs            w4,  w4,  #2
        ext             v0.8b,   v0.8b,   v1.8b,   #4
-        ld1             {v3.s}[0],   [x0]
+        ldr             s3,          [x0]
        sub             v5.8b,   v4.8b,   v0.8b
        ld1             {v3.s}[1],   [x8]
        umull           v6.8h,   v2.8b,   v0.8b
@@ -607,19 +607,19 @@ function blend_h_8bpc_neon, export=1
 8:
        ld2r            {v0.16b,  v1.16b},  [x5],  #2
        ld1             {v2.16b},  [x2],  #16
-        ld1             {v3.d}[0],   [x0]
+        ldr             d3,        [x0]
        ext             v0.16b,  v0.16b,  v1.16b,  #8
        sub             v5.16b,  v4.16b,  v0.16b
-        ld1             {v3.d}[1],   [x8]
+        ld1             {v3.d}[1], [x8]
        subs            w4,  w4,  #2
        umull           v6.8h,   v0.8b,   v2.8b
        umlal           v6.8h,   v3.8b,   v5.8b
        umull2          v7.8h,   v0.16b,  v2.16b
        umlal2          v7.8h,   v3.16b,  v5.16b
        rshrn           v16.8b,  v6.8h,   #6
-        rshrn2          v16.16b, v7.8h,   #6
-        st1             {v16.d}[0],  [x0],  x1
-        st1             {v16.d}[1],  [x8],  x1
+        rshrn           v17.8b,  v7.8h,   #6
+        st1             {v16.8b},  [x0],  x1
+        st1             {v17.8b},  [x8],  x1
        b.gt            8b
        ret
 160:
@@ -728,8 +728,8 @@ function blend_v_8bpc_neon, export=1
        ld1r            {v0.8b},   [x5]
        sub             v1.8b,   v4.8b,   v0.8b
 2:
-        ld1             {v2.h}[0],   [x2],  #2
-        ld1             {v3.b}[0],   [x0]
+        ldr             h2,          [x2],  #2
+        ldr             b3,          [x0]
        subs            w4,  w4,  #2
        ld1             {v2.b}[1],   [x2]
        ld1             {v3.b}[1],   [x8]
@@ -748,13 +748,13 @@ function blend_v_8bpc_neon, export=1
        sub             v1.8b,   v4.8b,   v0.8b
 4:
        ld1             {v2.8b},   [x2],  #8
-        ld1             {v3.s}[0],   [x0]
+        ldr             s3,          [x0]
        ld1             {v3.s}[1],   [x8]
        subs            w4,  w4,  #2
        umull           v5.8h,   v2.8b,   v0.8b
        umlal           v5.8h,   v3.8b,   v1.8b
        rshrn           v5.8b,   v5.8h,   #6
-        st1             {v5.h}[0],   [x0],  #2
+        str             h5,          [x0],  #2
        st1             {v5.h}[2],   [x8],  #2
        st1             {v5.b}[2],   [x0],  x1
        st1             {v5.b}[6],   [x8],  x1
@@ -765,21 +765,22 @@ function blend_v_8bpc_neon, export=1
        ld1r            {v0.2d},   [x5]
        sub             x1,  x1,  #4
        sub             v1.16b,  v4.16b,  v0.16b
+        zip2            v16.2d,  v1.2d,   v1.2d
 8:
        ld1             {v2.16b},  [x2],  #16
-        ld1             {v3.d}[0],   [x0]
-        ld1             {v3.d}[1],   [x8]
+        ldr             d3,          [x0]
+        ldr             d4,          [x8]
        subs            w4,  w4,  #2
        umull           v5.8h,  v0.8b,  v2.8b
        umlal           v5.8h,  v3.8b,  v1.8b
        umull2          v6.8h,  v0.16b, v2.16b
-        umlal2          v6.8h,  v3.16b, v1.16b
+        umlal           v6.8h,  v4.8b,  v16.8b
        rshrn           v7.8b,  v5.8h,  #6
-        rshrn2          v7.16b, v6.8h,  #6
-        st1             {v7.s}[0],   [x0],  #4
-        st1             {v7.s}[2],   [x8],  #4
+        rshrn           v17.8b, v6.8h,  #6
+        str             s7,          [x0],  #4
+        str             s17,         [x8],  #4
        st1             {v7.h}[2],   [x0],  x1
-        st1             {v7.h}[6],   [x8],  x1
+        st1             {v17.h}[2],  [x8],  x1
        b.gt            8b
        ret
 160:
@@ -979,10 +980,14 @@ function prep_neon, export=1
 40:
        AARCH64_VALID_JUMP_TARGET
 4:
-        ld1             {v0.s}[0], [x1], x2
-        ld1             {v0.s}[1], [x1], x2
-        ld1             {v1.s}[0], [x1], x2
-        ld1             {v1.s}[1], [x1], x2
+        ldr             s0, [x1]
+        ldr             s2, [x1, x2]
+        add             x1, x1, x2, lsl #1
+        ldr             s1, [x1]
+        ldr             s3, [x1, x2]
+        add             x1, x1, x2, lsl #1
+        mov             v0.s[1], v2.s[0]
+        mov             v1.s[1], v3.s[0]
        ushll           v0.8h, v0.8b, #4
        ushll           v1.8h, v1.8b, #4
        subs            w4, w4, #4
@@ -1358,10 +1363,10 @@ endjumptable
 .endif
 .endm
 .macro st_d strd, r0, r1
-        st1             {\r0\().d}[0], [x0], \strd
+        st1             {\r0\().8b},   [x0], \strd
        st1             {\r0\().d}[1], [x8], \strd
 .ifnb \r1
-        st1             {\r1\().d}[0], [x0], \strd
+        st1             {\r1\().8b},   [x0], \strd
        st1             {\r1\().d}[1], [x8], \strd
 .endif
 .endm
@@ -1470,8 +1475,7 @@ L(\type\()_\taps\()_h):
 20:     // 2xN h
        AARCH64_VALID_JUMP_TARGET
 .ifc \type, put
-        add             \xmx,  \xmx,  #2
-        ld1             {v0.s}[0], [\xmx]
+        ldur            s0,  [\xmx, #2]
        sub             \src,  \src,  #1
        add             \ds2,  \dst,  \d_strd
        add             \sr2,  \src,  \s_strd
@@ -1504,8 +1508,7 @@ L(\type\()_\taps\()_h):

 40:     // 4xN h
        AARCH64_VALID_JUMP_TARGET
-        add             \xmx,  \xmx,  #2
-        ld1             {v0.s}[0], [\xmx]
+        ldur            s0,  [\xmx, #2]
        sub             \src,  \src,  #1
        add             \ds2,  \dst,  \d_strd
        add             \sr2,  \src,  \s_strd
@@ -1537,8 +1540,10 @@ L(\type\()_\taps\()_h):
 .ifc \type, put
        sqrshrun        v16.8b,  v16.8h,  #4
        sqrshrun        v20.8b,  v20.8h,  #4
-        st1             {v16.s}[0], [\dst], \d_strd
-        st1             {v20.s}[0], [\ds2], \d_strd
+        str             s16,  [\dst]
+        str             s20,  [\ds2]
+        add             \dst, \dst, \d_strd
+        add             \ds2, \ds2, \d_strd
 .else
        st1             {v16.4h}, [\dst], \d_strd
        st1             {v20.4h}, [\ds2], \d_strd
@@ -1549,7 +1554,11 @@ L(\type\()_\taps\()_h):
 80:     // 8xN h
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8b}, [\xmx]
+.ifc \taps, 6tap
+        sub             \src,  \src,  #2
+.else
        sub             \src,  \src,  #3
+.endif
        add             \ds2,  \dst,  \d_strd
        add             \sr2,  \src,  \s_strd
        lsl             \d_strd,  \d_strd,  #1
@@ -1564,25 +1573,23 @@ L(\type\()_\taps\()_h):
        uxtl            v21.8h,  v21.8b

 .ifc \taps, 6tap
-        ext             v19.16b, v16.16b, v17.16b, #2
-        ext             v23.16b, v20.16b, v21.16b, #2
-        mul             v18.8h,  v19.8h,  v0.h[1]
-        mul             v22.8h,  v23.8h,  v0.h[1]
-.irpc i, 23456
-        ext             v19.16b, v16.16b, v17.16b, #(2*\i)
-        ext             v23.16b, v20.16b, v21.16b, #(2*\i)
+        mul             v18.8h,  v16.8h,  v0.h[1]
+        mul             v22.8h,  v20.8h,  v0.h[1]
+    .irpc i, 23456
+        ext             v19.16b, v16.16b, v17.16b, #(2*\i-2)
+        ext             v23.16b, v20.16b, v21.16b, #(2*\i-2)
        mla             v18.8h,  v19.8h,  v0.h[\i]
        mla             v22.8h,  v23.8h,  v0.h[\i]
-.endr
+    .endr
 .else   // 8tap
        mul             v18.8h,  v16.8h,  v0.h[0]
        mul             v22.8h,  v20.8h,  v0.h[0]
-.irpc i, 1234567
+    .irpc i, 1234567
        ext             v19.16b, v16.16b, v17.16b, #(2*\i)
        ext             v23.16b, v20.16b, v21.16b, #(2*\i)
        mla             v18.8h,  v19.8h,  v0.h[\i]
        mla             v22.8h,  v23.8h,  v0.h[\i]
-.endr
+    .endr
 .endif
        subs            \h,  \h,  #2
        srshr           v18.8h,  v18.8h, #2
@@ -1604,7 +1611,11 @@ L(\type\()_\taps\()_h):
 1280:   // 16xN, 32xN, ... h
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8b}, [\xmx]
+.ifc \taps, 6tap
+        sub             \src,  \src,  #2
+.else
        sub             \src,  \src,  #3
+.endif
        add             \ds2,  \dst,  \d_strd
        add             \sr2,  \src,  \s_strd
        lsl             \s_strd,  \s_strd,  #1
@@ -1629,30 +1640,26 @@ L(\type\()_\taps\()_h):

 16:
 .ifc \taps, 6tap
-        ext             v28.16b, v16.16b, v17.16b, #2
-        ext             v29.16b, v17.16b, v18.16b, #2
-        ext             v30.16b, v20.16b, v21.16b, #2
-        ext             v31.16b, v21.16b, v22.16b, #2
-        mul             v24.8h,  v28.8h,  v0.h[1]
-        mul             v25.8h,  v29.8h,  v0.h[1]
-        mul             v26.8h,  v30.8h,  v0.h[1]
-        mul             v27.8h,  v31.8h,  v0.h[1]
-.irpc i, 23456
-        ext             v28.16b, v16.16b, v17.16b, #(2*\i)
-        ext             v29.16b, v17.16b, v18.16b, #(2*\i)
-        ext             v30.16b, v20.16b, v21.16b, #(2*\i)
-        ext             v31.16b, v21.16b, v22.16b, #(2*\i)
+        mul             v24.8h,  v16.8h,  v0.h[1]
+        mul             v25.8h,  v17.8h,  v0.h[1]
+        mul             v26.8h,  v20.8h,  v0.h[1]
+        mul             v27.8h,  v21.8h,  v0.h[1]
+    .irpc i, 23456
+        ext             v28.16b, v16.16b, v17.16b, #(2*\i-2)
+        ext             v29.16b, v17.16b, v18.16b, #(2*\i-2)
+        ext             v30.16b, v20.16b, v21.16b, #(2*\i-2)
+        ext             v31.16b, v21.16b, v22.16b, #(2*\i-2)
        mla             v24.8h,  v28.8h,  v0.h[\i]
        mla             v25.8h,  v29.8h,  v0.h[\i]
        mla             v26.8h,  v30.8h,  v0.h[\i]
        mla             v27.8h,  v31.8h,  v0.h[\i]
-.endr
+    .endr
 .else   // 8tap
        mul             v24.8h,  v16.8h,  v0.h[0]
        mul             v25.8h,  v17.8h,  v0.h[0]
        mul             v26.8h,  v20.8h,  v0.h[0]
        mul             v27.8h,  v21.8h,  v0.h[0]
-.irpc i, 1234567
+    .irpc i, 1234567
        ext             v28.16b, v16.16b, v17.16b, #(2*\i)
        ext             v29.16b, v17.16b, v18.16b, #(2*\i)
        ext             v30.16b, v20.16b, v21.16b, #(2*\i)
@@ -1661,7 +1668,7 @@ L(\type\()_\taps\()_h):
        mla             v25.8h,  v29.8h,  v0.h[\i]
        mla             v26.8h,  v30.8h,  v0.h[\i]
        mla             v27.8h,  v31.8h,  v0.h[\i]
-.endr
+    .endr
 .endif
        srshr           v24.8h,  v24.8h, #2
        srshr           v25.8h,  v25.8h, #2
@@ -1732,8 +1739,7 @@ function L(\type\()_\taps\()_v)
        b.gt            28f

        cmp             \h,  #2
-        add             \xmy, \xmy, #2
-        ld1             {v0.s}[0], [\xmy]
+        ldur            s0,  [\xmy, #2]
        sub             \src,  \src,  \s_strd
        add             \ds2,  \dst,  \d_strd
        add             \sr2,  \src,  \s_strd
@@ -1812,8 +1818,7 @@ function L(\type\()_\taps\()_v)

        // 4x2, 4x4 v
        cmp             \h,  #2
-        add             \xmy, \xmy, #2
-        ld1             {v0.s}[0], [\xmy]
+        ldur            s0,  [\xmy, #2]
        sub             \src, \src, \s_strd
        add             \ds2, \dst, \d_strd
        add             \sr2, \src, \s_strd
@@ -1888,8 +1893,7 @@ function L(\type\()_\taps\()_v)

        // 8x2, 8x4 v
        cmp             \h,  #2
-        add             \xmy, \xmy, #2
-        ld1             {v0.s}[0], [\xmy]
+        ldur            s0,  [\xmy, #2]
        sub             \src, \src, \s_strd
        add             \ds2, \dst, \d_strd
        add             \sr2, \src, \s_strd
@@ -1987,8 +1991,7 @@ function L(\type\()_\taps\()_v)
        b.gt            1680b

        // 16x2, 16x4 v
-        add             \xmy, \xmy, #2
-        ld1             {v0.s}[0], [\xmy]
+        ldur            s0,  [\xmy, #2]
        sub             \src, \src, \s_strd
        add             \ds2, \dst, \d_strd
        add             \sr2, \src, \s_strd
@@ -2055,11 +2058,9 @@ function L(\type\()_\taps\()_hv)
 20:
        AARCH64_VALID_JUMP_TARGET
 .ifc \type, put
-        add             \xmx,  \xmx,  #2
-        ld1             {v0.s}[0],  [\xmx]
+        ldur            s0,  [\xmx, #2]
        b.gt            280f
-        add             \xmy,  \xmy,  #2
-        ld1             {v1.s}[0],  [\xmy]
+        ldur            s1,  [\xmy, #2]

        // 2x2, 2x4 hv
        sub             \sr2, \src, #1
@@ -2193,11 +2194,9 @@ L(\type\()_\taps\()_filter_2):

 40:
        AARCH64_VALID_JUMP_TARGET
-        add             \xmx, \xmx, #2
-        ld1             {v0.s}[0],  [\xmx]
+        ldur            s0,  [\xmx, #2]
        b.gt            480f
-        add             \xmy, \xmy,  #2
-        ld1             {v1.s}[0],  [\xmy]
+        ldur            s1,  [\xmy, #2]
        sub             \sr2, \src, #1
        sub             \src, \sr2, \s_strd
        add             \ds2, \dst, \d_strd
@@ -2242,8 +2241,10 @@ L(\type\()_\taps\()_filter_2):
 .ifc \type, put
        sqxtun          v2.8b,  v2.8h
        sqxtun          v3.8b,  v3.8h
-        st1             {v2.s}[0], [\dst], \d_strd
-        st1             {v3.s}[0], [\ds2], \d_strd
+        str             s2,  [\dst]
+        str             s3,  [\ds2]
+        add             \dst, \dst, \d_strd
+        add             \ds2, \ds2, \d_strd
 .else
        st1             {v2.4h}, [\dst], \d_strd
        st1             {v3.4h}, [\ds2], \d_strd
@@ -2335,8 +2336,10 @@ L(\type\()_\taps\()_filter_2):
 .ifc \type, put
        sqxtun          v2.8b,  v2.8h
        sqxtun          v3.8b,  v3.8h
-        st1             {v2.s}[0], [\dst], \d_strd
-        st1             {v3.s}[0], [\ds2], \d_strd
+        str             s2,  [\dst]
+        str             s3,  [\ds2]
+        add             \dst, \dst, \d_strd
+        add             \ds2, \ds2, \d_strd
 .else
        st1             {v2.4h}, [\dst], \d_strd
        st1             {v3.4h}, [\ds2], \d_strd
@@ -2383,10 +2386,13 @@ L(\type\()_\taps\()_filter_4):
 320:
        AARCH64_VALID_JUMP_TARGET
        b.gt            880f
-        add             \xmy,  \xmy,  #2
        ld1             {v0.8b},  [\xmx]
-        ld1             {v1.s}[0],  [\xmy]
+        ldur            s1,  [\xmy, #2]
+.ifc \taps, 6tap
+        sub             \src,  \src,  #2
+.else
        sub             \src,  \src,  #3
+.endif
        sub             \src,  \src,  \s_strd
        sxtl            v0.8h,  v0.8b
        sxtl            v1.8h,  v1.8b
@@ -2464,8 +2470,10 @@ L(\type\()_\taps\()_filter_4):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8b},  [\xmx]
        ld1             {v1.8b},  [\xmy]
+.ifc \taps, 6tap
+        sub             \src,  \src,  #2
+.else
        sub             \src,  \src,  #3
-.ifc \taps, 8tap
        sub             \src,  \src,  \s_strd
 .endif
        sub             \src,  \src,  \s_strd, lsl #1
@@ -2609,17 +2617,15 @@ L(\type\()_\taps\()_filter_8_first):
        uxtl            v28.8h,  v28.8b
        uxtl            v29.8h,  v29.8b
 .ifc \taps, 6tap
-        ext             v24.16b, v28.16b, v29.16b, #(2*1)
-        ext             v25.16b, v28.16b, v29.16b, #(2*2)
-        ext             v26.16b, v28.16b, v29.16b, #(2*3)
-        ext             v27.16b, v28.16b, v29.16b, #(2*4)
-        mul             v16.8h,  v24.8h,  v0.h[1]
+        mul             v16.8h,  v28.8h,  v0.h[1]
+        ext             v25.16b, v28.16b, v29.16b, #(2*1)
+        ext             v26.16b, v28.16b, v29.16b, #(2*2)
+        ext             v27.16b, v28.16b, v29.16b, #(2*3)
        mla             v16.8h,  v25.8h,  v0.h[2]
        mla             v16.8h,  v26.8h,  v0.h[3]
        mla             v16.8h,  v27.8h,  v0.h[4]
-        ext             v24.16b, v28.16b, v29.16b, #(2*5)
-        ext             v25.16b, v28.16b, v29.16b, #(2*6)
-        ext             v26.16b, v28.16b, v29.16b, #(2*7)
+        ext             v24.16b, v28.16b, v29.16b, #(2*4)
+        ext             v25.16b, v28.16b, v29.16b, #(2*5)
        mla             v16.8h,  v24.8h,  v0.h[5]
        mla             v16.8h,  v25.8h,  v0.h[6]
 .else   // 8tap
@@ -2650,25 +2656,23 @@ L(\type\()_\taps\()_filter_8):
        uxtl            v30.8h,  v30.8b
        uxtl            v31.8h,  v31.8b
 .ifc \taps, 6tap
-        ext             v26.16b, v28.16b, v29.16b, #2
-        ext             v27.16b, v30.16b, v31.16b, #2
-        mul             v24.8h,  v26.8h,  v0.h[1]
-        mul             v25.8h,  v27.8h,  v0.h[1]
-.irpc i, 23456
-        ext             v26.16b, v28.16b, v29.16b, #(2*\i)
-        ext             v27.16b, v30.16b, v31.16b, #(2*\i)
+        mul             v24.8h,  v28.8h,  v0.h[1]
+        mul             v25.8h,  v30.8h,  v0.h[1]
+    .irpc i, 23456
+        ext             v26.16b, v28.16b, v29.16b, #(2*\i-2)
+        ext             v27.16b, v30.16b, v31.16b, #(2*\i-2)
        mla             v24.8h,  v26.8h,  v0.h[\i]
        mla             v25.8h,  v27.8h,  v0.h[\i]
-.endr
+    .endr
 .else   // 8tap
        mul             v24.8h,  v28.8h,  v0.h[0]
        mul             v25.8h,  v30.8h,  v0.h[0]
-.irpc i, 1234567
+    .irpc i, 1234567
        ext             v26.16b, v28.16b, v29.16b, #(2*\i)
        ext             v27.16b, v30.16b, v31.16b, #(2*\i)
        mla             v24.8h,  v26.8h,  v0.h[\i]
        mla             v25.8h,  v27.8h,  v0.h[\i]
-.endr
+    .endr
 .endif
        srshr           v24.8h,  v24.8h, #2
        srshr           v25.8h,  v25.8h, #2
@@ -2723,8 +2727,8 @@ L(\type\()_bilin_h):
        lsl             \d_strd,  \d_strd,  #1
        lsl             \s_strd,  \s_strd,  #1
 2:
-        ld1             {v4.s}[0],  [\src], \s_strd
-        ld1             {v6.s}[0],  [\sr2], \s_strd
+        ld1r            {v4.4s},  [\src], \s_strd
+        ld1r            {v6.4s},  [\sr2], \s_strd
        ext             v5.8b,  v4.8b,  v4.8b, #1
        ext             v7.8b,  v6.8b,  v6.8b, #1
        trn1            v4.4h,  v4.4h,  v6.4h
@@ -2760,7 +2764,7 @@ L(\type\()_bilin_h):
        st1             {v4.s}[0], [\dst], \d_strd
        st1             {v4.s}[1], [\ds2], \d_strd
 .else
-        st1             {v4.d}[0], [\dst], \d_strd
+        st1             {v4.8b},   [\dst], \d_strd
        st1             {v4.d}[1], [\ds2], \d_strd
 .endif
        b.gt            4b
@@ -2885,24 +2889,24 @@ function L(\type\()_bilin_v)
        lsl             \d_strd,  \d_strd,  #1

        // 2x2 v
-        ld1             {v16.h}[0], [\src], \s_strd
+        ld1r            {v16.8h}, [\src], \s_strd
        b.gt            24f
 22:
-        ld1             {v17.h}[0], [\sr2], \s_strd
-        ld1             {v18.h}[0], [\src], \s_strd
+        ld1r            {v17.8h}, [\sr2], \s_strd
+        ld1r            {v18.8h}, [\src], \s_strd
        trn1            v16.4h, v16.4h, v17.4h
        trn1            v17.4h, v17.4h, v18.4h
        umull           v4.8h,  v16.8b,  v2.8b
        umlal           v4.8h,  v17.8b,  v3.8b
        uqrshrn         v4.8b,  v4.8h,  #4
-        st1             {v4.h}[0], [\dst]
+        str             h4,        [\dst]
        st1             {v4.h}[1], [\ds2]
        ret
 24:     // 2x4, 2x6, 2x8, ... v
-        ld1             {v17.h}[0], [\sr2], \s_strd
-        ld1             {v18.h}[0], [\src], \s_strd
-        ld1             {v19.h}[0], [\sr2], \s_strd
-        ld1             {v20.h}[0], [\src], \s_strd
+        ld1r            {v17.8h}, [\sr2], \s_strd
+        ld1r            {v18.8h}, [\src], \s_strd
+        ld1r            {v19.8h}, [\sr2], \s_strd
+        ld1r            {v20.8h}, [\src], \s_strd
        sub             \h,  \h,  #4
        trn1            v16.4h, v16.4h, v17.4h
        trn1            v17.4h, v17.4h, v18.4h
@@ -2932,10 +2936,10 @@ function L(\type\()_bilin_v)
        add             \sr2,  \src,  \s_strd
        lsl             \s_strd,  \s_strd,  #1
        lsl             \d_strd,  \d_strd,  #1
-        ld1             {v16.s}[0], [\src], \s_strd
+        ld1r            {v16.4s}, [\src], \s_strd
 4:
-        ld1             {v17.s}[0], [\sr2], \s_strd
-        ld1             {v18.s}[0], [\src], \s_strd
+        ld1r            {v17.4s}, [\sr2], \s_strd
+        ld1r            {v18.4s}, [\src], \s_strd
        trn1            v16.2s, v16.2s, v17.2s
        trn1            v17.2s, v17.2s, v18.2s
        umull           v4.8h,  v16.8b,  v2.8b
@@ -2946,7 +2950,7 @@ function L(\type\()_bilin_v)
        st1             {v4.s}[0], [\dst], \d_strd
        st1             {v4.s}[1], [\ds2], \d_strd
 .else
-        st1             {v4.d}[0], [\dst], \d_strd
+        st1             {v4.8b},   [\dst], \d_strd
        st1             {v4.d}[1], [\ds2], \d_strd
 .endif
        b.le            0f
@@ -3070,14 +3074,14 @@ function L(\type\()_bilin_hv)
        lsl             \s_strd, \s_strd, #1
        lsl             \d_strd, \d_strd, #1

-        ld1             {v28.s}[0],  [\src], \s_strd
+        ld1r            {v28.4s},  [\src], \s_strd
        ext             v29.8b, v28.8b, v28.8b, #1
        umull           v16.8h, v28.8b, v0.8b
        umlal           v16.8h, v29.8b, v1.8b

 2:
-        ld1             {v28.s}[0],  [\sr2], \s_strd
-        ld1             {v30.s}[0],  [\src], \s_strd
+        ld1r            {v28.4s},  [\sr2], \s_strd
+        ld1r            {v30.4s},  [\src], \s_strd
        ext             v29.8b, v28.8b, v28.8b, #1
        ext             v31.8b, v30.8b, v30.8b, #1
        trn1            v28.4h, v28.4h, v30.4h
@@ -3133,7 +3137,7 @@ function L(\type\()_bilin_hv)
        st1             {v4.s}[1], [\ds2], \d_strd
 .else
        urshr           v4.8h,  v4.8h,  #4
-        st1             {v4.d}[0], [\dst], \d_strd
+        st1             {v4.8b},   [\dst], \d_strd
        st1             {v4.d}[1], [\ds2], \d_strd
 .endif
        b.le            0f

--- a/src/arm/64/mc16.S
+++ b/src/arm/64/mc16.S
@@ -157,9 +157,9 @@ function \type\()_16bpc_neon, export=1
        lsl             x1,  x1,  #1
 4:
        subs            w5,  w5,  #4
-        st1             {v4.d}[0],  [x0], x1
+        st1             {v4.8b},    [x0], x1
        st1             {v4.d}[1],  [x7], x1
-        st1             {v5.d}[0],  [x0], x1
+        st1             {v5.8b},    [x0], x1
        st1             {v5.d}[1],  [x7], x1
        b.le            0f
        \type           v4,  v5,  v0,  v1,  v2,  v3
@@ -336,11 +336,11 @@ function w_mask_\type\()_16bpc_neon, export=1
        addp            v20.8h,  v24.8h,  v24.8h  // (128 - m) + (128 - n) (column wise addition)
        sub             v20.4h,  v3.4h,   v20.4h  // (256 - sign) - ((128 - m) + (128 - n))
        rshrn           v20.8b,  v20.8h,  #2      // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2
-        st1             {v20.s}[0], [x6], #4
+        str             s20,        [x6],  #4
 .endif
-        st1             {v4.d}[0],  [x0],  x1
+        st1             {v4.8b},    [x0],  x1
        st1             {v4.d}[1],  [x12], x1
-        st1             {v5.d}[0],  [x0],  x1
+        st1             {v5.8b},    [x0],  x1
        st1             {v5.d}[1],  [x12], x1
        b.gt            4b
        ret
@@ -400,7 +400,7 @@ function w_mask_\type\()_16bpc_neon, export=1
        addp            v20.8h,  v20.8h,  v20.8h  // (128 - m) + (128 - n) (column wise addition)
        sub             v20.4h,  v3.4h,   v20.4h  // (256 - sign) - ((128 - m) + (128 - n))
        rshrn           v20.8b,  v20.8h,  #2      // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2
-        st1             {v20.s}[0], [x6], #4
+        str             s20,     [x6],  #4
 .endif
        st1             {v4.8h}, [x0],  x1
        st1             {v5.8h}, [x12], x1
@@ -578,7 +578,7 @@ function blend_16bpc_neon, export=1
 4:
        ld1             {v2.8b},   [x5], #8
        ld1             {v1.8h},   [x2], #16
-        ld1             {v0.d}[0], [x0]
+        ldr             d0,        [x0]
        neg             v2.8b,   v2.8b            // -m
        subs            w4,  w4,  #2
        ld1             {v0.d}[1], [x8]
@@ -587,7 +587,7 @@ function blend_16bpc_neon, export=1
        sub             v1.8h,   v0.8h,   v1.8h   // a - b
        sqrdmulh        v1.8h,   v1.8h,   v2.8h   // ((a-b)*-m + 32) >> 6
        add             v0.8h,   v0.8h,   v1.8h
-        st1             {v0.d}[0], [x0], x1
+        st1             {v0.8b},   [x0], x1
        st1             {v0.d}[1], [x8], x1
        b.gt            4b
        ret
@@ -711,7 +711,7 @@ function blend_h_16bpc_neon, export=1
        ext             v2.8b,   v2.8b,   v3.8b,   #6
        subs            w4,  w4,  #2
        neg             v2.8b,   v2.8b            // -m
-        ld1             {v0.s}[0], [x0]
+        ldr             s0,        [x0]
        ld1             {v0.s}[1], [x8]
        sxtl            v2.8h,   v2.8b
        shl             v2.4h,   v2.4h,   #9      // -m << 9
@@ -730,14 +730,14 @@ function blend_h_16bpc_neon, export=1
        ext             v2.8b,   v2.8b,   v3.8b,   #4
        subs            w4,  w4,  #2
        neg             v2.8b,   v2.8b            // -m
-        ld1             {v0.d}[0],   [x0]
+        ldr             d0,          [x0]
        ld1             {v0.d}[1],   [x8]
        sxtl            v2.8h,   v2.8b
        shl             v2.8h,   v2.8h,   #9      // -m << 9
        sub             v1.8h,   v0.8h,   v1.8h   // a - b
        sqrdmulh        v1.8h,   v1.8h,   v2.8h   // ((a-b)*-m + 32) >> 6
        add             v0.8h,   v0.8h,   v1.8h
-        st1             {v0.d}[0], [x0], x1
+        st1             {v0.8b},   [x0], x1
        st1             {v0.d}[1], [x8], x1
        b.gt            4b
        ret
@@ -880,8 +880,8 @@ function blend_v_16bpc_neon, export=1
        sxtl            v2.8h,   v2.8b
        shl             v2.4h,   v2.4h,   #9      // -m << 9
 2:
-        ld1             {v1.s}[0], [x2], #4
-        ld1             {v0.h}[0], [x0]
+        ldr             s1,  [x2],  #4
+        ldr             h0,  [x0]
        subs            w4,  w4,  #2
        ld1             {v1.h}[1], [x2]
        ld1             {v0.h}[1], [x8]
@@ -902,13 +902,13 @@ function blend_v_16bpc_neon, export=1
        shl             v2.8h,   v2.8h,   #9      // -m << 9
 4:
        ld1             {v1.8h},   [x2], #16
-        ld1             {v0.d}[0], [x0]
+        ldr             d0,        [x0]
        ld1             {v0.d}[1], [x8]
        subs            w4,  w4,  #2
        sub             v1.8h,   v0.8h,   v1.8h   // a - b
        sqrdmulh        v1.8h,   v1.8h,   v2.8h   // ((a-b)*-m + 32) >> 6
        add             v0.8h,   v0.8h,   v1.8h
-        st1             {v0.s}[0], [x0], #4
+        str             s0,        [x0], #4
        st1             {v0.s}[2], [x8], #4
        st1             {v0.h}[2], [x0], x1
        st1             {v0.h}[6], [x8], x1
@@ -932,8 +932,8 @@ function blend_v_16bpc_neon, export=1
        sqrdmulh        v3.8h,   v3.8h,   v4.8h
        add             v0.8h,   v0.8h,   v2.8h
        add             v1.8h,   v1.8h,   v3.8h
-        st1             {v0.d}[0], [x0], #8
-        st1             {v1.d}[0], [x8], #8
+        str             d0,        [x0], #8
+        str             d1,        [x8], #8
        st1             {v0.s}[2], [x0], x1
        st1             {v1.s}[2], [x8], x1
        b.gt            8b
@@ -1031,8 +1031,8 @@ function put_16bpc_neon, export=1
 20:
        AARCH64_VALID_JUMP_TARGET
 2:
-        ld1             {v0.s}[0], [x2], x3
-        ld1             {v1.s}[0], [x2], x3
+        ld1r            {v0.4s},   [x2], x3
+        ld1r            {v1.4s},   [x2], x3
        subs            w5,  w5,  #2
        st1             {v0.s}[0], [x0], x1
        st1             {v1.s}[0], [x0], x1
@@ -1159,7 +1159,7 @@ function prep_16bpc_neon
        add             x9,  x1,  x2
        lsl             x2,  x2,  #1
 4:
-        ld1             {v0.d}[0], [x1], x2
+        ld1             {v0.8b},   [x1], x2
        ld1             {v0.d}[1], [x9], x2
        subs            w4,  w4,  #2
        sshl            v0.8h,   v0.8h,   v31.8h
@@ -1486,10 +1486,10 @@ endjumptable
 .endif
 .endm
 .macro st_d strd, r0, r1
-        st1             {\r0\().d}[0], [x0], \strd
+        st1             {\r0\().8b},   [x0], \strd // low 8 bytes of the first register go to [x0]; x0 then advances by strd
         st1             {\r0\().d}[1], [x9], \strd
 .ifnb \r1
-        st1             {\r1\().d}[0], [x0], \strd
+        st1             {\r1\().8b},   [x0], \strd // same for the optional second register (only assembled when it is given)
         st1             {\r1\().d}[1], [x9], \strd
 .endif
 .endm
@@ -1618,8 +1618,7 @@ L(\type\()_\taps\()_h):
 20:     // 2xN h
        AARCH64_VALID_JUMP_TARGET
 .ifc \type, put
-        add             \xmx,  \xmx,  #2
-        ld1             {v0.s}[0], [\xmx]
+        ldur            s0,  [\xmx, #2]
        sub             \src,  \src,  #2
        add             \ds2,  \dst,  \d_strd
        add             \sr2,  \src,  \s_strd
@@ -1651,8 +1650,7 @@ L(\type\()_\taps\()_h):

 40:     // 4xN h
        AARCH64_VALID_JUMP_TARGET
-        add             \xmx,  \xmx,  #2
-        ld1             {v0.s}[0], [\xmx]
+        ldur            s0,  [\xmx, #2]
        sub             \src,  \src,  #2
        add             \ds2,  \dst,  \d_strd
        add             \sr2,  \src,  \s_strd
@@ -1699,7 +1697,7 @@ L(\type\()_\taps\()_h):
        uzp1            v16.8h,  v16.8h,  v20.8h // Same as xtn, xtn2
        sub             v16.8h,  v16.8h,  v28.8h // PREP_BIAS
 .endif
-        st1             {v16.d}[0], [\dst], \d_strd
+        st1             {v16.8b},   [\dst], \d_strd
        st1             {v16.d}[1], [\ds2], \d_strd
        b.gt            4b
        ret
@@ -1859,8 +1857,7 @@ function L(\type\()_\taps\()_v)
        b.gt            28f

        cmp             \h,  #2
-        add             \xmy, \xmy, #2
-        ld1             {v0.s}[0], [\xmy]
+        ldur            s0,  [\xmy, #2]
        sub             \src,  \src,  \s_strd
        add             \ds2,  \dst,  \d_strd
        add             \sr2,  \src,  \s_strd
@@ -1937,8 +1934,7 @@ function L(\type\()_\taps\()_v)

        // 4x2, 4x4 v
        cmp             \h,  #2
-        add             \xmy, \xmy, #2
-        ld1             {v0.s}[0], [\xmy]
+        ldur            s0,  [\xmy, #2]
        sub             \src, \src, \s_strd
        add             \ds2, \dst, \d_strd
        add             \sr2, \src, \s_strd
@@ -2002,8 +1998,7 @@ function L(\type\()_\taps\()_v)

        // 8x2, 8x4 v
        cmp             \h,  #2
-        add             \xmy, \xmy, #2
-        ld1             {v0.s}[0], [\xmy]
+        ldur            s0,  [\xmy, #2]
        sub             \src, \src, \s_strd
        add             \ds2, \dst, \d_strd
        add             \sr2, \src, \s_strd
@@ -2091,8 +2086,7 @@ function L(\type\()_\taps\()_v)
        b.gt            1680b

        // 16x2, 16x4 v
-        add             \xmy, \xmy, #2
-        ld1             {v0.s}[0], [\xmy]
+        ldur            s0,  [\xmy, #2]
        sub             \src, \src, \s_strd
        sxtl            v0.8h,   v0.8b

@@ -2154,11 +2148,9 @@ function L(\type\()_\taps\()_hv)
 20:
        AARCH64_VALID_JUMP_TARGET
 .ifc \type, put
-        add             \xmx,  \xmx,  #2
-        ld1             {v0.s}[0],  [\xmx]
+        ldur            s0,  [\xmx, #2]
        b.gt            280f
-        add             \xmy,  \xmy,  #2
-        ld1             {v1.s}[0],  [\xmy]
+        ldur            s1,  [\xmy, #2]

        // 2x2, 2x4 hv
        sub             \sr2, \src, #2
@@ -2301,11 +2293,9 @@ L(\type\()_\taps\()_filter_2):

 40:
        AARCH64_VALID_JUMP_TARGET
-        add             \xmx, \xmx, #2
-        ld1             {v0.s}[0],  [\xmx]
+        ldur            s0,  [\xmx, #2]
        b.gt            480f
-        add             \xmy, \xmy,  #2
-        ld1             {v1.s}[0],  [\xmy]
+        ldur            s1,  [\xmy, #2]
        sub             \sr2, \src, #2
        sub             \src, \sr2, \s_strd
        add             \ds2, \dst, \d_strd
@@ -2358,7 +2348,7 @@ L(\type\()_\taps\()_filter_2):
 .endif
        subs            \h,  \h,  #2

-        st1             {v2.d}[0], [\dst], \d_strd
+        st1             {v2.8b},   [\dst], \d_strd
        st1             {v2.d}[1], [\ds2], \d_strd
        b.le            0f
        mov             v16.8b,  v18.8b
@@ -2457,7 +2447,7 @@ L(\type\()_\taps\()_filter_2):
        sub             v3.8h,   v3.8h,   v29.8h // PREP_BIAS
 .endif
        subs            \h,  \h,  #2
-        st1             {v3.d}[0], [\dst], \d_strd
+        st1             {v3.8b},   [\dst], \d_strd
        st1             {v3.d}[1], [\ds2], \d_strd
        b.le            0f
 .ifc \taps, 8tap
@@ -2501,9 +2491,8 @@ L(\type\()_\taps\()_filter_4):
 320:
        AARCH64_VALID_JUMP_TARGET
        b.gt            880f
-        add             \xmy,  \xmy,  #2
        ld1             {v0.8b},  [\xmx]
-        ld1             {v1.s}[0],  [\xmy]
+        ldur            s1,  [\xmy, #2]
 .ifc \taps, 6tap
        sub             \src,  \src,  #4
 .else
@@ -2932,7 +2921,7 @@ L(\type\()_bilin_h):
 .else
        sub             v4.8h,   v4.8h,   v29.8h
 .endif
-        st1             {v4.d}[0], [\dst], \d_strd
+        st1             {v4.8b},   [\dst], \d_strd
        st1             {v4.d}[1], [\ds2], \d_strd
        b.gt            4b
        ret
@@ -3073,24 +3062,24 @@ function L(\type\()_bilin_v)
        lsl             \d_strd,  \d_strd,  #1

        // 2x2 v
-        ld1             {v16.s}[0], [\src], \s_strd
+        ld1r            {v16.4s}, [\src], \s_strd
        b.gt            24f
 22:
-        ld1             {v17.s}[0], [\sr2], \s_strd
-        ld1             {v18.s}[0], [\src], \s_strd
+        ld1r            {v17.4s}, [\sr2], \s_strd
+        ld1r            {v18.4s}, [\src], \s_strd
        trn1            v16.2s,  v16.2s,  v17.2s
        trn1            v17.2s,  v17.2s,  v18.2s
        mul             v4.4h,   v16.4h,  v2.4h
        mla             v4.4h,   v17.4h,  v3.4h
        urshr           v4.8h,   v4.8h,   #4
-        st1             {v4.s}[0], [\dst]
+        str             s4,        [\dst]
        st1             {v4.s}[1], [\ds2]
        ret
 24:     // 2x4, 2x6, 2x8, ... v
-        ld1             {v17.s}[0], [\sr2], \s_strd
-        ld1             {v18.s}[0], [\src], \s_strd
-        ld1             {v19.s}[0], [\sr2], \s_strd
-        ld1             {v20.s}[0], [\src], \s_strd
+        ld1r            {v17.4s}, [\sr2], \s_strd
+        ld1r            {v18.4s}, [\src], \s_strd
+        ld1r            {v19.4s}, [\sr2], \s_strd
+        ld1r            {v20.4s}, [\src], \s_strd
        sub             \h,  \h,  #4
        trn1            v16.2s,  v16.2s,  v17.2s
        trn1            v17.2s,  v17.2s,  v18.2s
@@ -3135,7 +3124,7 @@ function L(\type\()_bilin_v)
        urshl           v4.8h,   v4.8h,   v31.8h
        sub             v4.8h,   v4.8h,   v29.8h
 .endif
-        st1             {v4.d}[0], [\dst], \d_strd
+        st1             {v4.8b},   [\dst], \d_strd
        st1             {v4.d}[1], [\ds2], \d_strd
        b.le            0f
        mov             v16.8b,  v18.8b
@@ -3344,7 +3333,7 @@ function L(\type\()_bilin_hv)
        sub             v4.8h,   v4.8h,   v29.8h
 .endif
        subs            \h,  \h,  #2
-        st1             {v4.d}[0], [\dst], \d_strd
+        st1             {v4.8b},   [\dst], \d_strd
        st1             {v4.d}[1], [\ds2], \d_strd
        b.le            0f
        trn2            v16.2d,  v17.2d,  v17.2d

--- a/src/arm/arm-arch.h
+++ b/src/arm/arm-arch.h
+/*
+ * Copyright © 2024, VideoLAN and dav1d authors
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef ARM_ARM_ARCH_H
+#define ARM_ARM_ARCH_H
+
+/* Compatibility header to define __ARM_ARCH with older compilers */
+#ifndef __ARM_ARCH /* everything down to the matching #endif is skipped when __ARM_ARCH already exists */
+
+#ifdef _M_ARM
+#define __ARM_ARCH _M_ARM /* reuse whatever _M_ARM expands to as the version */
+
+#elif defined(__ARM_ARCH_8A__) || defined(_M_ARM64)
+#define __ARM_ARCH 8 /* either macro above maps to version 8 */
+
+#elif defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || \
+      defined(__ARM_ARCH_7EM__) || defined(__ARM_ARCH_7R__) || \
+      defined(__ARM_ARCH_7M__) || defined(__ARM_ARCH_7S__)
+#define __ARM_ARCH 7
+
+#elif defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || \
+      defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6T2__) || \
+      defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__)
+#define __ARM_ARCH 6
+
+#elif defined(__ARM_ARCH_5__) || defined(__ARM_ARCH_5T__) || \
+      defined(__ARM_ARCH_5E__) || defined(__ARM_ARCH_5TE__)
+#define __ARM_ARCH 5
+
+#elif defined(__ARM_ARCH_4__) || defined(__ARM_ARCH_4T__)
+#define __ARM_ARCH 4
+
+#elif defined(__ARM_ARCH_3__) || defined(__ARM_ARCH_3M__)
+#define __ARM_ARCH 3
+
+#elif defined(__ARM_ARCH_2__)
+#define __ARM_ARCH 2
+
+#else /* none of the per-version macros above is defined */
+#error Unknown ARM architecture version
+#endif /* end of the per-version #ifdef/#elif chain */
+
+#endif /* !__ARM_ARCH */
+
+#endif /* ARM_ARM_ARCH_H */
--- a/src/arm/cpu.c
+++ b/src/arm/cpu.c
@@ -32,7 +32,7 @@
 #include "src/cpu.h"
 #include "src/arm/cpu.h"

-#if defined(HAVE_GETAUXVAL) || defined(HAVE_ELF_AUX_INFO)
+#if HAVE_GETAUXVAL || HAVE_ELF_AUX_INFO
 #include <sys/auxv.h>

 #if ARCH_AARCH64
@@ -43,7 +43,7 @@
 #define HWCAP2_AARCH64_I8MM   (1 << 13)

 COLD unsigned dav1d_get_cpu_flags_arm(void) {
-#ifdef HAVE_GETAUXVAL
+#if HAVE_GETAUXVAL
    unsigned long hw_cap = getauxval(AT_HWCAP);
    unsigned long hw_cap2 = getauxval(AT_HWCAP2);
 #else
@@ -69,7 +69,7 @@ COLD unsigned dav1d_get_cpu_flags_arm(void) {
 #define HWCAP_ARM_I8MM    (1 << 27)

 COLD unsigned dav1d_get_cpu_flags_arm(void) {
-#ifdef HAVE_GETAUXVAL
+#if HAVE_GETAUXVAL
    unsigned long hw_cap = getauxval(AT_HWCAP);
 #else
    unsigned long hw_cap = 0;

--- a/src/cpu.c
+++ b/src/cpu.c
@@ -38,13 +38,13 @@
 #include <sys/sysctl.h>
 #include <sys/types.h>
 #endif
-#ifdef HAVE_UNISTD_H
+#if HAVE_UNISTD_H
 #include <unistd.h>
 #endif

-#ifdef HAVE_PTHREAD_GETAFFINITY_NP
+#if HAVE_PTHREAD_GETAFFINITY_NP
 #include <pthread.h>
-#ifdef HAVE_PTHREAD_NP_H
+#if HAVE_PTHREAD_NP_H
 #include <pthread_np.h>
 #endif
 #if defined(__FreeBSD__)
@@ -91,7 +91,7 @@ COLD int dav1d_num_logical_processors(Dav1dContext *const c) {
    GetNativeSystemInfo(&system_info);
    return system_info.dwNumberOfProcessors;
 #endif
-#elif defined(HAVE_PTHREAD_GETAFFINITY_NP) && defined(CPU_COUNT)
+#elif HAVE_PTHREAD_GETAFFINITY_NP && defined(CPU_COUNT)
    cpu_set_t affinity;
    if (!pthread_getaffinity_np(pthread_self(), sizeof(affinity), &affinity))
        return CPU_COUNT(&affinity);

--- a/src/ctx.c
+++ b/src/ctx.c
+/*
+ * Copyright © 2024, VideoLAN and dav1d authors
+ * Copyright © 2024, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <string.h>
+
+#include "ctx.h"
+
+static void memset_w1(void *const ptr, const int value) { /* 1-byte setter: writes value to the byte at ptr via set_ctx1 (ctx.h) */
+    set_ctx1((uint8_t *) ptr, 0, value);
+}
+
+static void memset_w2(void *const ptr, const int value) { /* 2-byte setter: one 16-bit store of value * 0x0101 at ptr via set_ctx2 (ctx.h) */
+    set_ctx2((uint8_t *) ptr, 0, value);
+}
+
+static void memset_w4(void *const ptr, const int value) { /* 4-byte setter: one 32-bit store of value * 0x01010101U at ptr via set_ctx4 (ctx.h) */
+    set_ctx4((uint8_t *) ptr, 0, value);
+}
+
+static void memset_w8(void *const ptr, const int value) { /* 8-byte setter: one 64-bit store of value * 0x0101010101010101ULL at ptr via set_ctx8 (ctx.h) */
+    set_ctx8((uint8_t *) ptr, 0, value);
+}
+
+static void memset_w16(void *const ptr, const int value) { /* 16-byte setter: set_ctx16 (ctx.h) expands to memset(ptr, value, 16) */
+    set_ctx16((uint8_t *) ptr, 0, value);
+}
+
+static void memset_w32(void *const ptr, const int value) { /* 32-byte setter: set_ctx32 (ctx.h) expands to memset(ptr, value, 32) */
+    set_ctx32((uint8_t *) ptr, 0, value);
+}
+
+const dav1d_memset_pow2_fn dav1d_memset_pow2[6] = { /* entry i sets 1 << i bytes; dav1d_memset_likely_pow2() in ctx.h indexes it with ulog2(n) */
+    memset_w1, /* [0]  1 byte   */
+    memset_w2, /* [1]  2 bytes  */
+    memset_w4, /* [2]  4 bytes  */
+    memset_w8, /* [3]  8 bytes  */
+    memset_w16, /* [4] 16 bytes  */
+    memset_w32 /* [5] 32 bytes  */
+};
--- a/src/ctx.h
+++ b/src/ctx.h
@@ -31,61 +31,59 @@
 #include <stdint.h>

 #include "common/attributes.h"
+#include "common/intops.h"

 union alias64 { uint64_t u64; uint8_t u8[8]; } ATTR_ALIAS;
 union alias32 { uint32_t u32; uint8_t u8[4]; } ATTR_ALIAS;
 union alias16 { uint16_t u16; uint8_t u8[2]; } ATTR_ALIAS;
 union alias8 { uint8_t u8; } ATTR_ALIAS;

-#define set_ctx_rep4(type, var, off, val) do { \
-        const uint64_t const_val = val; \
-        ((union alias64 *) &var[off +  0])->u64 = const_val; \
-        ((union alias64 *) &var[off +  8])->u64 = const_val; \
-        ((union alias64 *) &var[off + 16])->u64 = const_val; \
-        ((union alias64 *) &var[off + 24])->u64 = const_val; \
+typedef void (*dav1d_memset_pow2_fn)(void *ptr, int value); /* setter for one fixed byte count; the count is implied by which function is called */
+EXTERN const dav1d_memset_pow2_fn dav1d_memset_pow2[6]; /* defined in ctx.c: entry i sets 1 << i bytes (1, 2, 4, 8, 16, 32) */
+
+static inline void dav1d_memset_likely_pow2(void *const ptr, const int value, const int n) { /* set n bytes at ptr to value; n must be in 1..32 (asserted below) */
+    assert(n >= 1 && n <= 32);
+    if ((n&(n-1)) == 0) { /* exactly one bit set, so n is 1, 2, 4, 8, 16 or 32 */
+        dav1d_memset_pow2[ulog2(n)](ptr, value); /* fixed-size setter from ctx.c; ulog2 presumably yields log2(n) here - confirm in common/intops.h. NOTE(review): the small setters multiply value, so this matches memset only if value fits in a byte - confirm with callers */
+    } else {
+        memset(ptr, value, n); /* every other length in range goes through plain memset */
+    }
+}
+
+// For smaller sizes use multiplication to broadcast bytes. memset misbehaves on the smaller sizes.
+// For the larger sizes, we want to use memset to get access to vector operations.
+#define set_ctx1(var, off, val) \
+    ((union alias8 *) &(var)[off])->u8 = (val) * 0x01 /* single byte store at var[off] */
+#define set_ctx2(var, off, val) \
+    ((union alias16 *) &(var)[off])->u16 = (val) * 0x0101 /* one 16-bit store; both bytes equal val when val fits in 8 bits */
+#define set_ctx4(var, off, val) \
+    ((union alias32 *) &(var)[off])->u32 = (val) * 0x01010101U /* one 32-bit store; val copied into each of the 4 bytes */
+#define set_ctx8(var, off, val) \
+    ((union alias64 *) &(var)[off])->u64 = (val) * 0x0101010101010101ULL /* one 64-bit store; val copied into each of the 8 bytes */
+#define set_ctx16(var, off, val) do { \
+        memset(&(var)[off], val, 16); /* 16 bytes starting at var[off] */ \
     } while (0)
-#define set_ctx_rep2(type, var, off, val) do { \
-        const uint64_t const_val = val; \
-        ((union alias64 *) &var[off + 0])->u64 = const_val; \
-        ((union alias64 *) &var[off + 8])->u64 = const_val; \
+#define set_ctx32(var, off, val) do { \
+        memset(&(var)[off], val, 32); /* 32 bytes starting at var[off] */ \
     } while (0)
-#define set_ctx_rep1(typesz, var, off, val) \
-    ((union alias##typesz *) &var[off])->u##typesz = val
-#define case_set(var, dir, diridx, off) \
-    switch (var) { \
-    case  1: set_ctx( 8, dir, diridx, off, 0x01, set_ctx_rep1); break; \
-    case  2: set_ctx(16, dir, diridx, off, 0x0101, set_ctx_rep1); break; \
-    case  4: set_ctx(32, dir, diridx, off, 0x01010101U, set_ctx_rep1); break; \
-    case  8: set_ctx(64, dir, diridx, off, 0x0101010101010101ULL, set_ctx_rep1); break; \
-    case 16: set_ctx(  , dir, diridx, off, 0x0101010101010101ULL, set_ctx_rep2); break; \
-    case 32: set_ctx(  , dir, diridx, off, 0x0101010101010101ULL, set_ctx_rep4); break; \
-    }
-#define case_set_upto16(var, dir, diridx, off) \
-    switch (var) { \
-    case  1: set_ctx( 8, dir, diridx, off, 0x01, set_ctx_rep1); break; \
-    case  2: set_ctx(16, dir, diridx, off, 0x0101, set_ctx_rep1); break; \
-    case  4: set_ctx(32, dir, diridx, off, 0x01010101U, set_ctx_rep1); break; \
-    case  8: set_ctx(64, dir, diridx, off, 0x0101010101010101ULL, set_ctx_rep1); break; \
-    case 16: set_ctx(  , dir, diridx, off, 0x0101010101010101ULL, set_ctx_rep2); break; \
-    }
-#define case_set_upto32_with_default(var, dir, diridx, off) \
+#define case_set(var) /* var is a size index 0..5: case k passes the 2^k-byte setter (set_ctx1..set_ctx32) to set_ctx(); set_ctx is not defined in this header - presumably supplied at the use site, confirm */ \
     switch (var) { \
-    case  1: set_ctx( 8, dir, diridx, off, 0x01, set_ctx_rep1); break; \
-    case  2: set_ctx(16, dir, diridx, off, 0x0101, set_ctx_rep1); break; \
-    case  4: set_ctx(32, dir, diridx, off, 0x01010101U, set_ctx_rep1); break; \
-    case  8: set_ctx(64, dir, diridx, off, 0x0101010101010101ULL, set_ctx_rep1); break; \
-    case 16: set_ctx(  , dir, diridx, off, 0x0101010101010101ULL, set_ctx_rep2); break; \
-    case 32: set_ctx(  , dir, diridx, off, 0x0101010101010101ULL, set_ctx_rep4); break; \
-    default: default_memset(dir, diridx, off, var); break; \
+    case 0: set_ctx(set_ctx1); break; \
+    case 1: set_ctx(set_ctx2); break; \
+    case 2: set_ctx(set_ctx4); break; \
+    case 3: set_ctx(set_ctx8); break; \
+    case 4: set_ctx(set_ctx16); break; \
+    case 5: set_ctx(set_ctx32); break; \
+    default: assert(0); /* any index outside 0..5 trips the assert; there is no memset fallback here */ \
     }
-#define case_set_upto16_with_default(var, dir, diridx, off) \
+#define case_set_upto16(var) /* same dispatch as case_set but limited to index 0..4, i.e. setters for 1..16 bytes */ \
     switch (var) { \
-    case  1: set_ctx( 8, dir, diridx, off, 0x01, set_ctx_rep1); break; \
-    case  2: set_ctx(16, dir, diridx, off, 0x0101, set_ctx_rep1); break; \
-    case  4: set_ctx(32, dir, diridx, off, 0x01010101U, set_ctx_rep1); break; \
-    case  8: set_ctx(64, dir, diridx, off, 0x0101010101010101ULL, set_ctx_rep1); break; \
-    case 16: set_ctx(  , dir, diridx, off, 0x0101010101010101ULL, set_ctx_rep2); break; \
-    default: default_memset(dir, diridx, off, var); break; \
+    case 0: set_ctx(set_ctx1); break; \
+    case 1: set_ctx(set_ctx2); break; \
+    case 2: set_ctx(set_ctx4); break; \
+    case 3: set_ctx(set_ctx8); break; \
+    case 4: set_ctx(set_ctx16); break; \
+    default: assert(0); /* index 5 (32 bytes) and anything else trips the assert */ \
     }

 #endif /* DAV1D_SRC_CTX_H */
No results found