Compare revisions

Marvin Scholz · Thomas Daede · Jean-Baptiste Kempf · Nathan E. Egge · Jean-Baptiste Kempf · Janne Grunau
--- a/.gitignore
+++ b/.gitignore
+/build*
+/Session.vim
+[._]*.swp
+*~
+tags
+.DS_Store
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
+stages:
+    - build
+
+build-debian:
+    image: registry.videolan.org:5000/dav1d-debian-unstable:20180927123816
+    stage: build
+    tags:
+        - debian
+        - amd64
+    script:
+        - meson build --buildtype release
+        - ninja -v -C build include/version.h
+        - ninja -v -C build
+
+build-win32:
+    image: registry.videolan.org:5000/dav1d-debian-unstable:20180927123816
+    stage: build
+    tags:
+        - win32
+    script:
+        - meson build --buildtype release --cross-file /opt/crossfiles/i686-w64-mingw32.meson
+        - ninja -v -C build include/version.h
+        - ninja -v -C build
+
+build-win64:
+    image: registry.videolan.org:5000/dav1d-debian-unstable:20180927123816
+    stage: build
+    tags:
+        - win64
+    script:
+        - meson build --buildtype release --cross-file /opt/crossfiles/x86_64-w64-mingw32.meson
+        - ninja -v -C build include/version.h
+        - ninja -v -C build
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
-# Dav1d contribution guide
+# dav1d contribution guide

 ## CoC
 The [VideoLAN Code of Conduct](https://wiki.videolan.org/CoC) applies to this project.

+## ToDo
+
+The todo list can be found [on the wiki](https://code.videolan.org/videolan/dav1d/wikis/task-list).
+
 ## Codebase language

 The codebase is developed with the following assumptions:

 For the library:
 - C language with C99 version, without the VLA or the Complex (*\_\_STDC_NO_COMPLEX__*) features, and without compiler extension,
- asm in .asm files, using the NASM syntax,
+- x86 asm in .asm files, using the NASM syntax,
+- arm/arm64 asm in .S files, using the GAS syntax, limited to the subset that LLVM 5.0's internal assembler supports,
 - no C++ is allowed, whatever the version.

 For the tools and utils:
@@ -44,4 +49,4 @@ Please read [How to Write a Git Commit Message](https://chris.beams.io/posts/git

 ## Patent license

-You need to read and understand the [AV1 patents license](doc/PATENTS), before committing.
+You need to read, understand, and agree to the [AV1 patents license](doc/PATENTS), before committing.
--- a/README.md
+++ b/README.md
@@ -58,8 +58,9 @@ The [VideoLAN Code of Conduct](https://wiki.videolan.org/CoC) applies to this pr

 # Compile

-1. Install [Meson](https://mesonbuild.com/)
-2. Run `meson build`
+1. Install [Meson](https://mesonbuild.com/) (0.47 or higher)
+2. Run `meson build --buildtype release`
+3. Build with `ninja -C build`

 # Support


--- a/ext/x86/x86inc.asm
+++ b/ext/x86/x86inc.asm
--- a/include/common/dump.h
+++ b/include/common/dump.h
@@ -72,4 +72,15 @@ static inline void coef_dump(const coef *buf, const int w, const int h,
    }
 }

+static inline void ac_dump(const int16_t *buf, int w, int h, const char *what)
+{   /* Debug helper: print the label `what`, then h rows of w values from buf. */
+    printf("%s\n", what);
+    while (h--) {
+        for (int x = 0; x < w; x++)
+            printf(" %03d", buf[x]); /* zero-padded to a minimum width of 3 */
+        buf += w; /* rows are read back to back: the row stride equals w */
+        printf("\n");
+    }
+}
+
 #endif /* __DAV1D_COMMON_DUMP_H__ */
--- a/include/common/validate.h
+++ b/include/common/validate.h
@@ -37,11 +37,11 @@
 #define debug_abort abort
 #endif

-#define validate_input_or_ret_with_msg(x, r, msg...) \
+#define validate_input_or_ret_with_msg(x, r, ...) \
    if (!(x)) { \
        fprintf(stderr, "Input validation check \'%s\' failed in %s!\n", \
-                #x, __PRETTY_FUNCTION__); \
-        fprintf(stderr, msg); \
+                #x, __func__); \
+        fprintf(stderr, __VA_ARGS__); \
        debug_abort(); \
        return r; \
    }
@@ -49,7 +49,7 @@
 #define validate_input_or_ret(x, r) \
    if (!(x)) { \
        fprintf(stderr, "Input validation check \'%s\' failed in %s!\n", \
-                #x, __PRETTY_FUNCTION__); \
+                #x, __func__); \
        debug_abort(); \
        return r; \
    }

--- a/include/dav1d/dav1d.h
+++ b/include/dav1d/dav1d.h
@@ -80,8 +80,8 @@ DAV1D_API int dav1d_open(Dav1dContext **c_out, const Dav1dSettings *s);
 DAV1D_API int dav1d_decode(Dav1dContext *c, Dav1dData *in, Dav1dPicture *out);

 /**
- * Close decoder instance, free all associated memory.
+ * Close decoder instance, free all associated memory, and set $c_out to NULL.
 */
-DAV1D_API void dav1d_close(Dav1dContext *c);
+DAV1D_API void dav1d_close(Dav1dContext **c_out);

 #endif /* __DAV1D_H__ */
--- a/meson.build
+++ b/meson.build
@@ -24,20 +24,19 @@

 project('dav1d', ['c'],
    version: '0.0.1',
-    default_options: ['c_std=c11'],
+    default_options: ['c_std=c99'],
    meson_version: '>= 0.47.0')

 dav1d_src_root = meson.current_source_dir()
 cdata = configuration_data()
+cdata_asm = configuration_data()
 cc = meson.get_compiler('c')

-if not meson.is_cross_build() 
+# On windows, we use a compatibility layer to emulate pthread
+if host_machine.system() != 'windows'
    thread_dependency = dependency('threads')
 else
-    thread_dependency = cc.find_library('pthread', required: false)
-endif
-if thread_dependency.found()
-    cdata.set('HAVE_PTHREAD_H', 1)
+    thread_dependency = declare_dependency(sources: ['src/win32/thread.c'])
 endif

 dav1d_inc_dirs = include_directories(['include', 'include/dav1d'])
@@ -55,12 +54,7 @@ endforeach
 #

 feature_defines = [
-    ['_REENTRANT',                  1], # Define so that reentrant versions of several functions get declared
-    ['_THREAD_SAFE',                1], # Same as _REENTANT for some other OSes
-    ['_GNU_SOURCE',                 1], # Enable GNU extensions on systems that have them
-    ['_POSIX_PTHREAD_SEMANTICS',    1], # Enable threading extensions on Solaris
-    ['__EXTENSIONS__',              1], # Enable general extensions on Solaris
-    ['_FILE_OFFSET_BITS',           64], # Define to 64 for large files support
+    ['_POSIX_C_SOURCE',             '200112L'], # POSIX.1–2001 (IEEE Std 1003.1-2001)
 ]

 if host_machine.system() == 'windows'
@@ -69,14 +63,6 @@ if host_machine.system() == 'windows'
            ['UNICODE',                     1], # Define to 1 for Unicode (Wide Chars) APIs
            ['_UNICODE',                    1], # Define to 1 for Unicode (Wide Chars) APIs
            ['__USE_MINGW_ANSI_STDIO',      1], # Define to force use of MinGW printf
-            ['_ISOC99_SOURCE',              1], # Extensions to ISO C89 from ISO C99
-            ['_ISOC11_SOURCE',              1], # Extensions to ISO C99 from ISO C11
-            ['_POSIX_SOURCE',               1], # IEEE Std 1003.1
-            ['_POSIX_C_SOURCE',             '200809L'], #IEEE Std 1003.1
-            ['_XOPEN_SOURCE',               700], # POSIX and XPG 7th edition
-            ['_XOPEN_SOURCE_EXTENDED',      1], # XPG things and X/Open Unix extensions
-            ['_BSD_SOURCE',                 1], # ISO C, POSIX, and 4.3BSD things
-            ['_SVID_SOURCE',                1], # ISO C, POSIX, and SVID things
    ]
 endif

@@ -84,21 +70,59 @@ if not cc.check_header('stdatomic.h')
    error('Atomics not supported')
 endif

-if cc.has_argument('-mpreferred-stack-boundary=5')
-    stackalign_flag = '-mpreferred-stack-boundary=5'
-    stackrealign_flag = '-mincoming-stack-boundary=4'
-# When cross compiling for win64 gcc refuses to use -mpreferred-stack-boundary
-# with a value which isn't 3 or 4. However, when cross compiling with clang, 5 is
-# accepted.
-elif (host_machine.system() == 'windows' and host_machine.cpu_family() == 'x86_64'
-and cc.has_argument('-mpreferred-stack-boundary=4'))
-    stackalign_flag = '-mpreferred-stack-boundary=4'
-    stackrealign_flag = '-mincoming-stack-boundary=4'
-elif cc.has_argument('-mstack-alignment=32')
-    stackalign_flag = '-mstack-alignment=32'
-    stackrealign_flag = '-mstackrealign'
+stackalign_flag = []
+stackrealign_flag = []
+
+if host_machine.cpu_family().startswith('x86')
+    if cc.has_argument('-mpreferred-stack-boundary=5')
+        stackalign_flag = ['-mpreferred-stack-boundary=5']
+        stackrealign_flag = ['-mincoming-stack-boundary=4']
+        cdata_asm.set('STACK_ALIGNMENT', 32)
+        cdata.set('STACK_ALIGNMENT', 32)
+    elif cc.has_argument('-mpreferred-stack-boundary=4')
+        stackalign_flag = ['-mpreferred-stack-boundary=4']
+        stackrealign_flag = ['-mincoming-stack-boundary=4']
+        cdata_asm.set('STACK_ALIGNMENT', 16)
+        cdata.set('STACK_ALIGNMENT', 16)
+    elif cc.has_argument('-mstack-alignment=32')
+        stackalign_flag = ['-mstack-alignment=32']
+        stackrealign_flag = ['-mstackrealign']
+        cdata_asm.set('STACK_ALIGNMENT', 32)
+        cdata.set('STACK_ALIGNMENT', 32)
+    else
+        if host_machine.cpu_family() == 'x86_64'
+            cdata_asm.set('STACK_ALIGNMENT', 16)
+            cdata.set('STACK_ALIGNMENT', 16)
+        else
+            cdata_asm.set('STACK_ALIGNMENT', 4)
+            cdata.set('STACK_ALIGNMENT', 4)
+        endif
+    endif
+endif
+
+if host_machine.cpu_family().startswith('x86')
+    cdata.set10('ARCH_X86', true)
+    if host_machine.cpu_family() == 'x86_64'
+        cdata_asm.set10('ARCH_X86_64', true)
+        cdata.set10('ARCH_X86_64', true)
+        cdata_asm.set10('ARCH_X86_32', false)
+        cdata.set10('ARCH_X86_32', false)
+
+        cdata_asm.set10('PIC', true)
+    else
+        cdata_asm.set10('ARCH_X86_64', false)
+        cdata.set10('ARCH_X86_64', false)
+        cdata_asm.set10('ARCH_X86_32', true)
+        cdata.set10('ARCH_X86_32', true)
+    endif
 else
-    error('Failed to specify stack alignment')
+    cdata.set10('ARCH_X86', false)
+    cdata.set10('ARCH_X86_64', false)
+    cdata.set10('ARCH_X86_32', false)
+endif
+
+if cc.symbols_have_underscore_prefix()
+    cdata_asm.set10('PREFIX', true)
 endif

 if cc.has_argument('-fvisibility=hidden')
@@ -107,29 +131,43 @@ else
    warning('Compiler does not support -fvisibility=hidden, all symbols will be public!')
 endif

-if cc.has_function('posix_memalign', prefix: '#include <stdlib.h>', args: ['-D_POSIX_C_SOURCE=200112'])
-    feature_defines += [['POSIX_C_SOURCE', 200112]]
+if cc.has_function('posix_memalign', prefix: '#include <stdlib.h>', args: ['-D_POSIX_C_SOURCE=200112L'])
    cdata.set('HAVE_POSIX_MEMALIGN', 1)
 elif cc.has_function('_aligned_malloc', prefix: '#include <malloc.h>')
    cdata.set('HAVE_ALIGNED_MALLOC', 1)
 endif

-add_project_arguments('-fomit-frame-pointer', '-ffast-math',
-    language: 'c')
+if (get_option('buildtype') != 'debug' and
+    get_option('buildtype') != 'plain')
+    add_project_arguments('-fomit-frame-pointer', '-ffast-math',
+        language: 'c')
+endif
+
+warning_flags = [
+  '-Wundef',
+  '-Wvla', # should be '-Werror=vla'
+]

-add_project_arguments('-Wall', '-Wundef',
-    language: 'c')
+add_project_arguments(cc.get_supported_arguments(warning_flags), language: 'c')

 foreach f : feature_defines
   cdata.set(f.get(0), f.get(1))
 endforeach

+is_asm_enabled = (get_option('build_asm') == true and
+    host_machine.cpu_family().startswith('x86'))
+cdata.set10('HAVE_ASM', is_asm_enabled)
+
 #
 # Generate config headers
 #

 config_h_target = configure_file(output: 'config.h', configuration: cdata)

+if is_asm_enabled
+    config_asm_target = configure_file(output: 'config.asm', output_format: 'nasm', configuration: cdata_asm)
+endif
+
 subdir('include')

 #
@@ -149,20 +187,6 @@ libdav1d_tmpl_sources = files(
    'src/recon.c'
 )

-# Build a helper library for each bitdepth
-bitdepth_objs = []
-foreach bitdepth : dav1d_bitdepths
-    bitdepth_lib = static_library(
-        'dav1d_bitdepth_@0@'.format(bitdepth),
-        libdav1d_tmpl_sources, config_h_target,
-        include_directories: dav1d_inc_dirs,
-        c_args: ['-DBITDEPTH=@0@'.format(bitdepth), stackalign_flag],
-        install: false,
-        build_by_default: false,
-    )
-    bitdepth_objs += bitdepth_lib.extract_all_objects()
-endforeach
-
 entrypoints_src = files(
    'src/lib.c',
    'src/thread_task.c'
@@ -171,8 +195,9 @@ entrypoints_lib = static_library(
    'libdav1dentrypoint',
    entrypoints_src,
    include_directories: dav1d_inc_dirs,
-    c_args: [stackrealign_flag],
+    c_args: stackrealign_flag,
    install: false,
+    build_by_default: false,
 )
 entrypoints_objs = entrypoints_lib.extract_all_objects()

@@ -196,12 +221,66 @@ libdav1d_sources = files(
    'src/qm.c',
 )

-if host_machine.system() == 'windows'
-    libdav1d_sources += files('src/win32/thread.c')
+if is_asm_enabled
+    libdav1d_sources += files(
+        'src/x86/cpu.c',
+    )
+    libdav1d_tmpl_sources += files(
+        'src/x86/mc_init.c',
+    )
+    libdav1d_sources_asm = files(
+        'src/x86/cpuid.asm',
+        'src/x86/mc.asm',
+    )
+
+    nasm = find_program('nasm')
+
+    if host_machine.system() == 'windows'
+        nasm_format = 'win'
+    elif host_machine.system() == 'darwin'
+        nasm_format = 'macho'
+    else
+        nasm_format = 'elf'
+    endif
+    if host_machine.cpu_family() == 'x86_64'
+        nasm_format += '64'
+    else
+        nasm_format += '32'
+    endif
+
+    nasm_gen = generator(nasm,
+        output: '@BASENAME@.obj',
+        depfile: '@BASENAME@.obj.ndep',
+        arguments: [
+            '-f', nasm_format,
+            '-I', '@CURRENT_SOURCE_DIR@/',
+            '-MQ', '@OUTPUT@', '-MF', '@DEPFILE@',
+            '@EXTRA_ARGS@',
+            '@INPUT@',
+            '-o', '@OUTPUT@'
+        ])
+
+    nasm_objs = nasm_gen.process(libdav1d_sources_asm)
+else
+    nasm_objs = []
 endif

+# Build a helper library for each bitdepth
+bitdepth_objs = []
+foreach bitdepth : dav1d_bitdepths
+    bitdepth_lib = static_library(
+        'dav1d_bitdepth_@0@'.format(bitdepth),
+        libdav1d_tmpl_sources, config_h_target,
+        include_directories: dav1d_inc_dirs,
+        c_args: ['-DBITDEPTH=@0@'.format(bitdepth)] + stackalign_flag,
+        install: false,
+        build_by_default: false,
+    )
+    bitdepth_objs += bitdepth_lib.extract_all_objects()
+endforeach
+
 libdav1d = library('dav1d',
-    libdav1d_sources, rev_target,
+    libdav1d_sources, rev_target, nasm_objs,
    version: '0.0.1',
    objects: [bitdepth_objs, entrypoints_objs],
    include_directories: dav1d_inc_dirs,
@@ -229,7 +308,8 @@ dav1d_sources = files(
 dav1d = executable('dav1d',
    dav1d_sources, rev_target,
    link_with: libdav1d,
-    include_directories: [dav1d_inc_dirs, include_directories('tools')]
+    include_directories: [dav1d_inc_dirs, include_directories('tools')],
+    install: true,
 )

 #

--- a/meson_options.txt
+++ b/meson_options.txt
@@ -4,3 +4,8 @@ option('bitdepths',
    type: 'array',
    choices: ['8', '10'],
    description: 'Enable only specified bitdepths')
+
+option('build_asm',
+    type: 'boolean',
+    value: true,
+    description: 'Build asm files, if available')
--- a/src/cdef.c
+++ b/src/cdef.c
@@ -25,23 +25,46 @@
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

+/*
+ * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
 #include "config.h"

+#include <assert.h>
 #include <stdlib.h>

 #include "common/intops.h"

 #include "src/cdef.h"

-static const int8_t cdef_directions[8 /* dir */][2 /* pass */][2 /* y, x */] = {
-    { { -1, 1 }, { -2,  2 } },
-    { {  0, 1 }, { -1,  2 } },
-    { {  0, 1 }, {  0,  2 } },
-    { {  0, 1 }, {  1,  2 } },
-    { {  1, 1 }, {  2,  2 } },
-    { {  1, 0 }, {  2,  1 } },
-    { {  1, 0 }, {  2,  0 } },
-    { {  1, 0 }, {  2, -1 } }
+static const int8_t cdef_directions4[8 /* dir */][2 /* pass */] = {
+    { -1 * 8 + 1, -2 * 8 + 2 },
+    {  0 * 8 + 1, -1 * 8 + 2 },
+    {  0 * 8 + 1,  0 * 8 + 2 },
+    {  0 * 8 + 1,  1 * 8 + 2 },
+    {  1 * 8 + 1,  2 * 8 + 2 },
+    {  1 * 8 + 0,  2 * 8 + 1 },
+    {  1 * 8 + 0,  2 * 8 + 0 },
+    {  1 * 8 + 0,  2 * 8 - 1 }
+}; /* each entry is a flattened (y, x) step, y * 8 + x; 8 is tmp_stride when w == 4 */
+
+static const int8_t cdef_directions8[8 /* dir */][2 /* pass */] = {
+    { -1 * 16 + 1, -2 * 16 + 2 },
+    {  0 * 16 + 1, -1 * 16 + 2 },
+    {  0 * 16 + 1,  0 * 16 + 2 },
+    {  0 * 16 + 1,  1 * 16 + 2 },
+    {  1 * 16 + 1,  2 * 16 + 2 },
+    {  1 * 16 + 0,  2 * 16 + 1 },
+    {  1 * 16 + 0,  2 * 16 + 0 },
+    {  1 * 16 + 0,  2 * 16 - 1 }
+}; /* each entry is a flattened (y, x) step, y * 16 + x; 16 is tmp_stride when w == 8 */
 static const uint8_t cdef_pri_taps[2][2] = { { 4, 2 }, { 3, 3 } };
 static const uint8_t cdef_sec_taps[2][2] = { { 2, 1 }, { 2, 1 } };
@@ -78,10 +101,16 @@ static void cdef_filter_block_c(pixel *const dst, const ptrdiff_t dst_stride,
                                const int sec_strength, const int dir,
                                const int damping, const enum CdefEdgeFlags edges)
 {
-    const ptrdiff_t tmp_stride = w + 4;
-    uint16_t tmp[tmp_stride * (h + 4)];
+    const ptrdiff_t tmp_stride = 16 >> (w == 4);
+    assert((w == 4 || w == 8) && (h == 4 || h == 8));
+    uint16_t tmp[192];  // 16*12 is the maximum value of tmp_stride * (h + 4)
+    uint16_t *tmp2 = tmp + 2 * tmp_stride + 2;
    const uint8_t *const pri_taps = cdef_pri_taps[(pri_strength >> (BITDEPTH - 8)) & 1];
    const uint8_t *const sec_taps = cdef_sec_taps[(pri_strength >> (BITDEPTH - 8)) & 1];
+    const int8_t (*cdef_directions)[2];
+
+    assert(w == 4 || w == 8);
+    cdef_directions = w == 4 ? cdef_directions4 : cdef_directions8;

    // fill extended input buffer
    int x_start = -2, x_end = w + 2, y_start = -2, y_end = h + 2;
@@ -104,10 +133,10 @@ static void cdef_filter_block_c(pixel *const dst, const ptrdiff_t dst_stride,
    }
    for (int y = y_start; y < 0; y++)
        for (int x = x_start; x < x_end; x++)
-            tmp[(y + 2) * tmp_stride + (x + 2)] = top[y & 1][x];
+            tmp2[y * tmp_stride + x] = top[y & 1][x];
    for (int y = 0; y < y_end; y++)
        for (int x = x_start; x < x_end; x++)
-            tmp[(y + 2) * tmp_stride + (x + 2)] = dst[y * PXSTRIDE(dst_stride) + x];
+            tmp2[y * tmp_stride + x] = dst[y * PXSTRIDE(dst_stride) + x];

    // run actual filter
    for (int y = 0; y < h; y++) {
@@ -116,23 +145,21 @@ static void cdef_filter_block_c(pixel *const dst, const ptrdiff_t dst_stride,
            const int px = dst[y * PXSTRIDE(dst_stride) + x];
            int max = px, min = px;
            for (int k = 0; k < 2; k++) {
-#define extpx(y, x) tmp[((y) + 2) * tmp_stride + ((x) + 2)]
-                const int8_t *const off1 = cdef_directions[dir][k];
-                const int p0 = extpx(y + off1[0], x + off1[1]);
-                const int p1 = extpx(y - off1[0], x - off1[1]);
+                const int8_t off1 = cdef_directions[dir][k];
+                const int p0 = tmp2[y * tmp_stride + x + off1];
+                const int p1 = tmp2[y * tmp_stride + x - off1];
                sum += pri_taps[k] * constrain(p0 - px, pri_strength, damping);
                sum += pri_taps[k] * constrain(p1 - px, pri_strength, damping);
                if (p0 != CDEF_VERY_LARGE) max = imax(p0, max);
                if (p1 != CDEF_VERY_LARGE) max = imax(p1, max);
                min = imin(p0, min);
                min = imin(p1, min);
-                const int8_t *const off2 = cdef_directions[(dir + 2) & 7][k];
-                const int s0 = extpx(y + off2[0], x + off2[1]);
-                const int s1 = extpx(y - off2[0], x - off2[1]);
-                const int8_t *const off3 = cdef_directions[(dir + 6) & 7][k];
-                const int s2 = extpx(y + off3[0], x + off3[1]);
-                const int s3 = extpx(y - off3[0], x - off3[1]);
-#undef extpx
+                const int8_t off2 = cdef_directions[(dir + 2) & 7][k];
+                const int s0 = tmp2[y * tmp_stride + x + off2];
+                const int s1 = tmp2[y * tmp_stride + x - off2];
+                const int8_t off3 = cdef_directions[(dir + 6) & 7][k];
+                const int s2 = tmp2[y * tmp_stride + x + off3];
+                const int s3 = tmp2[y * tmp_stride + x - off3];
                if (s0 != CDEF_VERY_LARGE) max = imax(s0, max);
                if (s1 != CDEF_VERY_LARGE) max = imax(s1, max);
                if (s2 != CDEF_VERY_LARGE) max = imax(s2, max);

--- a/src/decode.c
+++ b/src/decode.c
@@ -850,7 +850,7 @@ static void decode_b(Dav1dTileContext *const t,
        !(t->by & (31 >> !f->seq_hdr.sb128)))
    {
        const int prev_qidx = ts->last_qidx;
-        const int have_delta_q = f->frame_hdr.delta_q_present &&
+        const int have_delta_q = f->frame_hdr.delta.q.present &&
            (bs != (f->seq_hdr.sb128 ? BS_128x128 : BS_64x64) || !b->skip);
        if (have_delta_q) {
            int delta_q = msac_decode_symbol_adapt(&ts->msac, ts->cdf.m.delta_q, 4);
@@ -860,7 +860,7 @@ static void decode_b(Dav1dTileContext *const t,
            }
            if (delta_q) {
                if (msac_decode_bool(&ts->msac, 128 << 7)) delta_q = -delta_q;
-                delta_q *= 1 << f->frame_hdr.delta_q_res_log2;
+                delta_q *= 1 << f->frame_hdr.delta.q.res_log2;
            }
            ts->last_qidx = iclip(ts->last_qidx + delta_q, 1, 255);
            if (have_delta_q && DEBUG_BLOCK_INFO)
@@ -879,20 +879,20 @@ static void decode_b(Dav1dTileContext *const t,
        // delta_lf
        int8_t prev_delta_lf[4];
        memcpy(prev_delta_lf, ts->last_delta_lf, 4);
-        if (have_delta_q && f->frame_hdr.delta_lf_present) {
-            const int n_lfs = f->frame_hdr.delta_lf_multi ?
+        if (have_delta_q && f->frame_hdr.delta.lf.present) {
+            const int n_lfs = f->frame_hdr.delta.lf.multi ?
                f->seq_hdr.layout != DAV1D_PIXEL_LAYOUT_I400 ? 4 : 2 : 1;

            for (int i = 0; i < n_lfs; i++) {
                int delta_lf = msac_decode_symbol_adapt(&ts->msac,
-                                ts->cdf.m.delta_lf[i + f->frame_hdr.delta_lf_multi], 4);
+                                ts->cdf.m.delta_lf[i + f->frame_hdr.delta.lf.multi], 4);
                if (delta_lf == 3) {
                    const int n_bits = 1 + msac_decode_bools(&ts->msac, 3);
                    delta_lf = msac_decode_bools(&ts->msac, n_bits) + 1 + (1 << n_bits);
                }
                if (delta_lf) {
                    if (msac_decode_bool(&ts->msac, 128 << 7)) delta_lf = -delta_lf;
-                    delta_lf *= 1 << f->frame_hdr.delta_lf_res_log2;
+                    delta_lf *= 1 << f->frame_hdr.delta.lf.res_log2;
                }
                ts->last_delta_lf[i] = iclip(ts->last_delta_lf[i] + delta_lf, -63, 63);
                if (have_delta_q && DEBUG_BLOCK_INFO)
@@ -2786,8 +2786,8 @@ int submit_frame(Dav1dContext *const c) {
        dav1d_thread_picture_ref(out_delayed, &f->cur);
    }

-    f->bw = (f->frame_hdr.width + 3) >> 2;
-    f->bh = (f->frame_hdr.height + 3) >> 2;
+    f->bw = ((f->frame_hdr.width + 7) >> 3) << 1;
+    f->bh = ((f->frame_hdr.height + 7) >> 3) << 1;
    f->sb128w = (f->bw + 31) >> 5;
    f->sb128h = (f->bh + 31) >> 5;
    f->sb_shift = 4 + f->seq_hdr.sb128;

--- a/src/env.h
+++ b/src/env.h
@@ -307,7 +307,7 @@ static inline int get_jnt_comp_ctx(const int order_hint_n_bits,
                                   const int yb4, const int xb4)
 {
    const unsigned d0 = abs(get_poc_diff(order_hint_n_bits, ref0poc, poc));
-    const unsigned d1 = abs(get_poc_diff(order_hint_n_bits, ref1poc, poc));
+    const unsigned d1 = abs(get_poc_diff(order_hint_n_bits, poc, ref1poc));
    const int offset = d0 == d1;
    const int a_ctx = a->comp_type[xb4] >= COMP_INTER_AVG ||
                      a->ref[0][xb4] == 6;

--- a/src/ipred.c
+++ b/src/ipred.c
@@ -36,24 +36,24 @@
 #include "src/ipred.h"

 #define sz_grid(l_fn) \
-l_fn( 4,  4); \
-l_fn( 4,  8); \
-l_fn( 4, 16); \
-l_fn( 8,  4); \
-l_fn( 8,  8); \
-l_fn( 8, 16); \
-l_fn( 8, 32); \
-l_fn(16,  4); \
-l_fn(16,  8); \
-l_fn(16, 16); \
-l_fn(16, 32); \
-l_fn(16, 64); \
-l_fn(32,  8); \
-l_fn(32, 16); \
-l_fn(32, 32); \
-l_fn(32, 64); \
-l_fn(64, 16); \
-l_fn(64, 32); \
+l_fn( 4,  4) \
+l_fn( 4,  8) \
+l_fn( 4, 16) \
+l_fn( 8,  4) \
+l_fn( 8,  8) \
+l_fn( 8, 16) \
+l_fn( 8, 32) \
+l_fn(16,  4) \
+l_fn(16,  8) \
+l_fn(16, 16) \
+l_fn(16, 32) \
+l_fn(16, 64) \
+l_fn(32,  8) \
+l_fn(32, 16) \
+l_fn(32, 32) \
+l_fn(32, 64) \
+l_fn(64, 16) \
+l_fn(64, 32) \
 l_fn(64, 64)

 static __attribute__((noinline)) void
@@ -99,31 +99,31 @@ static void dc##dir##_##w##x##h##_c(pixel *dst, const ptrdiff_t stride, \
 dc_lfn(width, height, top, unsigned dc = width >> 1; \
                           for (int i = 0; i < width; i++) \
                               dc += topleft[1 + i]; \
-                           dc >>= sh1); \
+                           dc >>= sh1) \
 dc_lfn(width, height, left, unsigned dc = height >> 1; \
                            for (int i = 0; i < height; i++) \
                                dc += topleft[-(1 + i)]; \
                            dc >>= sh2)

-dc1d_lfns( 4,  4, 2, 2);
-dc1d_lfns( 4,  8, 2, 3);
-dc1d_lfns( 4, 16, 2, 4);
-dc1d_lfns( 8,  4, 3, 2);
-dc1d_lfns( 8,  8, 3, 3);
-dc1d_lfns( 8, 16, 3, 4);
-dc1d_lfns( 8, 32, 3, 5);
-dc1d_lfns(16,  4, 4, 2);
-dc1d_lfns(16,  8, 4, 3);
-dc1d_lfns(16, 16, 4, 4);
-dc1d_lfns(16, 32, 4, 5);
-dc1d_lfns(16, 64, 4, 6);
-dc1d_lfns(32,  8, 5, 3);
-dc1d_lfns(32, 16, 5, 4);
-dc1d_lfns(32, 32, 5, 5);
-dc1d_lfns(32, 64, 5, 6);
-dc1d_lfns(64, 16, 6, 4);
-dc1d_lfns(64, 32, 6, 5);
-dc1d_lfns(64, 64, 6, 6);
+dc1d_lfns( 4,  4, 2, 2)
+dc1d_lfns( 4,  8, 2, 3)
+dc1d_lfns( 4, 16, 2, 4)
+dc1d_lfns( 8,  4, 3, 2)
+dc1d_lfns( 8,  8, 3, 3)
+dc1d_lfns( 8, 16, 3, 4)
+dc1d_lfns( 8, 32, 3, 5)
+dc1d_lfns(16,  4, 4, 2)
+dc1d_lfns(16,  8, 4, 3)
+dc1d_lfns(16, 16, 4, 4)
+dc1d_lfns(16, 32, 4, 5)
+dc1d_lfns(16, 64, 4, 6)
+dc1d_lfns(32,  8, 5, 3)
+dc1d_lfns(32, 16, 5, 4)
+dc1d_lfns(32, 32, 5, 5)
+dc1d_lfns(32, 64, 5, 6)
+dc1d_lfns(64, 16, 6, 4)
+dc1d_lfns(64, 32, 6, 5)
+dc1d_lfns(64, 64, 6, 6)

 #define dc2d_lfn(width, height, dc_gen) \
 dc_lfn(width, height,, unsigned dc = (width + height) >> 1; \
@@ -133,30 +133,44 @@ dc_lfn(width, height,, unsigned dc = (width + height) >> 1; \
                           dc += topleft[-(i + 1)]; \
                       dc_gen)

-dc2d_lfn( 4,  4, dc >>= 3);
-dc2d_lfn( 4,  8, dc = iclip_pixel(0x5556 * dc >> 18));
-dc2d_lfn( 4, 16, dc = iclip_pixel(0x3334 * dc >> 18));
-dc2d_lfn( 8,  4, dc = iclip_pixel(0x5556 * dc >> 18));
-dc2d_lfn( 8,  8, dc >>= 4);
-dc2d_lfn( 8, 16, dc = iclip_pixel(0x5556 * dc >> 19));
-dc2d_lfn( 8, 32, dc = iclip_pixel(0x3334 * dc >> 19));
-dc2d_lfn(16,  4, dc = iclip_pixel(0x3334 * dc >> 18));
-dc2d_lfn(16,  8, dc = iclip_pixel(0x5556 * dc >> 19));
-dc2d_lfn(16, 16, dc >>= 5);
-dc2d_lfn(16, 32, dc = iclip_pixel(0x5556 * dc >> 20));
-dc2d_lfn(16, 64, dc = iclip_pixel(0x3334 * dc >> 20));
-dc2d_lfn(32,  8, dc = iclip_pixel(0x3334 * dc >> 19));
-dc2d_lfn(32, 16, dc = iclip_pixel(0x5556 * dc >> 20));
-dc2d_lfn(32, 32, dc >>= 6);
-dc2d_lfn(32, 64, dc = iclip_pixel(0x5556 * dc >> 21));
-dc2d_lfn(64, 16, dc = iclip_pixel(0x3334 * dc >> 20));
-dc2d_lfn(64, 32, dc = iclip_pixel(0x5556 * dc >> 21));
-dc2d_lfn(64, 64, dc >>= 7);
+#if BITDEPTH == 8
+#define MULTIPLIER_1x2 0x5556
+#define MULTIPLIER_1x4 0x3334
+#define BASE_SHIFT 16
+#else
+#define MULTIPLIER_1x2 0xAAAB
+#define MULTIPLIER_1x4 0x6667
+#define BASE_SHIFT 17
+#endif
+
+dc2d_lfn( 4,  4, dc >>= 3)
+dc2d_lfn( 4,  8, dc = iclip_pixel(MULTIPLIER_1x2 * dc >> (BASE_SHIFT + 2)))
+dc2d_lfn( 4, 16, dc = iclip_pixel(MULTIPLIER_1x4 * dc >> (BASE_SHIFT + 2)))
+dc2d_lfn( 8,  4, dc = iclip_pixel(MULTIPLIER_1x2 * dc >> (BASE_SHIFT + 2)))
+dc2d_lfn( 8,  8, dc >>= 4)
+dc2d_lfn( 8, 16, dc = iclip_pixel(MULTIPLIER_1x2 * dc >> (BASE_SHIFT + 3)))
+dc2d_lfn( 8, 32, dc = iclip_pixel(MULTIPLIER_1x4 * dc >> (BASE_SHIFT + 3)))
+dc2d_lfn(16,  4, dc = iclip_pixel(MULTIPLIER_1x4 * dc >> (BASE_SHIFT + 2)))
+dc2d_lfn(16,  8, dc = iclip_pixel(MULTIPLIER_1x2 * dc >> (BASE_SHIFT + 3)))
+dc2d_lfn(16, 16, dc >>= 5)
+dc2d_lfn(16, 32, dc = iclip_pixel(MULTIPLIER_1x2 * dc >> (BASE_SHIFT + 4)))
+dc2d_lfn(16, 64, dc = iclip_pixel(MULTIPLIER_1x4 * dc >> (BASE_SHIFT + 4)))
+dc2d_lfn(32,  8, dc = iclip_pixel(MULTIPLIER_1x4 * dc >> (BASE_SHIFT + 3)))
+dc2d_lfn(32, 16, dc = iclip_pixel(MULTIPLIER_1x2 * dc >> (BASE_SHIFT + 4)))
+dc2d_lfn(32, 32, dc >>= 6)
+dc2d_lfn(32, 64, dc = iclip_pixel(MULTIPLIER_1x2 * dc >> (BASE_SHIFT + 5)))
+dc2d_lfn(64, 16, dc = iclip_pixel(MULTIPLIER_1x4 * dc >> (BASE_SHIFT + 4)))
+dc2d_lfn(64, 32, dc = iclip_pixel(MULTIPLIER_1x2 * dc >> (BASE_SHIFT + 5)))
+dc2d_lfn(64, 64, dc >>= 7)
+
+#undef MULTIPLIER_1x2
+#undef MULTIPLIER_1x4
+#undef BASE_SHIFT

 #define dc128_lfn(width, height) \
 dc_lfn(width, height, 128, const unsigned dc = (1 << BITDEPTH) >> 1)

-sz_grid(dc128_lfn);
+sz_grid(dc128_lfn)

 static __attribute__((noinline)) void
 v_c(pixel *dst, const ptrdiff_t stride,
@@ -175,7 +189,7 @@ static void v_##width##x##height##_##c(pixel *dst, const ptrdiff_t stride, \
    v_c(dst, stride, topleft, width, height); \
 }

-sz_grid(v_lfn);
+sz_grid(v_lfn)

 static __attribute__((noinline)) void
 h_c(pixel *dst, const ptrdiff_t stride,
@@ -194,7 +208,7 @@ static void h_##width##x##height##_c(pixel *dst, const ptrdiff_t stride, \
    h_c(dst, stride, topleft, width, height); \
 }

-sz_grid(h_lfn);
+sz_grid(h_lfn)

 static __attribute__((noinline)) void
 paeth_c(pixel *dst, const ptrdiff_t stride, const pixel *const tl_ptr,
@@ -225,7 +239,7 @@ static void paeth_##width##x##height##_c(pixel *dst, const ptrdiff_t stride, \
    paeth_c(dst, stride, topleft, width, height); \
 }

-sz_grid(paeth_lfn);
+sz_grid(paeth_lfn)

 static const uint8_t sm_weight_arrays[] = {
    // Unused, because we always offset by bs, which is at least 2.
@@ -276,7 +290,7 @@ static void smooth_##width##x##height##_c(pixel *dst, const ptrdiff_t stride, \
    smooth_c(dst, stride, topleft, width, height); \
 }

-sz_grid(smooth_lfn);
+sz_grid(smooth_lfn)

 static __attribute__((noinline)) void
 smooth_v_c(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
@@ -303,7 +317,7 @@ static void smooth_v_##width##x##height##_c(pixel *dst, const ptrdiff_t stride,
    smooth_v_c(dst, stride, topleft, width, height); \
 }

-sz_grid(smooth_v_lfn);
+sz_grid(smooth_v_lfn)

 static __attribute__((noinline)) void
 smooth_h_c(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
@@ -330,7 +344,7 @@ static void smooth_h_##width##x##height##_c(pixel *dst, const ptrdiff_t stride,
    smooth_h_c(dst, stride, topleft, width, height); \
 }

-sz_grid(smooth_h_lfn);
+sz_grid(smooth_h_lfn)

 static const int16_t dr_intra_derivative[90] = {
  // More evenly spread out angles and limited to 10-bit
@@ -454,7 +468,7 @@ z1_c(pixel *dst, const ptrdiff_t stride, const pixel *const topleft_in,
    angle &= 511;
    assert(angle < 90);
    const int dx = dr_intra_derivative[angle];
-    pixel top_out[(width + height) * 2];
+    pixel top_out[(64 + 64) * 2];
    const pixel *top;
    int max_base_x;
    const int upsample_above = get_upsample(width + height, 90 - angle, is_sm);
@@ -506,7 +520,7 @@ static void z1_##width##x##height##_c(pixel *dst, const ptrdiff_t stride, \
    z1_c(dst, stride, topleft, angle, width, height); \
 }

-sz_grid(z1_lfn);
+sz_grid(z1_lfn)

 static __attribute__((noinline)) void
 z2_c(pixel *dst, const ptrdiff_t stride, const pixel *const topleft_in,
@@ -519,7 +533,7 @@ z2_c(pixel *dst, const ptrdiff_t stride, const pixel *const topleft_in,
    const int dx = dr_intra_derivative[180 - angle];
    const int upsample_left = get_upsample(width + height, 180 - angle, is_sm);
    const int upsample_above = get_upsample(width + height, angle - 90, is_sm);
-    pixel edge[width * 2 + height * 2 + 1];
+    pixel edge[64 * 2 + 64 * 2 + 1];
    pixel *const topleft = &edge[height * 2];

    if (upsample_above) {
@@ -587,7 +601,7 @@ static void z2_##width##x##height##_c(pixel *dst, const ptrdiff_t stride, \
    z2_c(dst, stride, topleft, angle, width, height); \
 }

-sz_grid(z2_lfn);
+sz_grid(z2_lfn)

 static __attribute__((noinline)) void
 z3_c(pixel *dst, const ptrdiff_t stride, const pixel *const topleft_in,
@@ -597,7 +611,7 @@ z3_c(pixel *dst, const ptrdiff_t stride, const pixel *const topleft_in,
    angle &= 511;
    assert(angle > 180);
    const int dy = dr_intra_derivative[270 - angle];
-    pixel left_out[(width + height) * 2];
+    pixel left_out[(64 + 64) * 2];
    const pixel *left;
    int max_base_y;
    const int upsample_left = get_upsample(width + height, angle - 180, is_sm);
@@ -652,7 +666,7 @@ static void z3_##width##x##height##_c(pixel *dst, const ptrdiff_t stride, \
    z3_c(dst, stride, topleft, angle, width, height); \
 }

-sz_grid(z3_lfn);
+sz_grid(z3_lfn)

 static const int8_t av1_filter_intra_taps[5][8][8] = {
    {
@@ -758,20 +772,20 @@ static void filter_##width##x##height##_c(pixel *const dst, \
    filter_intra_c(dst, stride, topleft, filt_idx, width, height); \
 }

-filter_lfn( 4,  4);
-filter_lfn( 8,  4);
-filter_lfn(16,  4);
-filter_lfn( 4,  8);
-filter_lfn( 8,  8);
-filter_lfn(16,  8);
-filter_lfn(32,  8);
-filter_lfn( 4, 16);
-filter_lfn( 8, 16);
-filter_lfn(16, 16);
-filter_lfn(32, 16);
-filter_lfn( 8, 32);
-filter_lfn(16, 32);
-filter_lfn(32, 32);
+filter_lfn( 4,  4)
+filter_lfn( 8,  4)
+filter_lfn(16,  4)
+filter_lfn( 4,  8)
+filter_lfn( 8,  8)
+filter_lfn(16,  8)
+filter_lfn(32,  8)
+filter_lfn( 4, 16)
+filter_lfn( 8, 16)
+filter_lfn(16, 16)
+filter_lfn(32, 16)
+filter_lfn( 8, 32)
+filter_lfn(16, 32)
+filter_lfn(32, 32)

 static __attribute__((noinline)) void
 cfl_ac_c(int16_t *ac, const pixel *ypx, const ptrdiff_t stride,
@@ -800,7 +814,7 @@ cfl_ac_c(int16_t *ac, const pixel *ypx, const ptrdiff_t stride,
        ypx += PXSTRIDE(stride) << ss_ver;
    }
    for (; y < height; y++) {
-        memcpy(ac, &ac[-32], width * sizeof(*ac));
+        memcpy(ac, &ac[-width], width * sizeof(*ac));
        ac += width;
    }

@@ -830,39 +844,39 @@ static void cfl_ac_##lw##x##lh##_to_##cw##x##ch##_c(int16_t *const ac, \
    cfl_ac_c(ac, ypx, stride, w_pad, h_pad, cw, ch, ss_hor, ss_ver, log2sz); \
 }

-cfl_ac_fn( 8,  8,  4,  4, 1, 1, 4);
-cfl_ac_fn( 8, 16,  4,  8, 1, 1, 5);
-cfl_ac_fn( 8, 32,  4, 16, 1, 1, 6);
-cfl_ac_fn(16,  8,  8,  4, 1, 1, 5);
-cfl_ac_fn(16, 16,  8,  8, 1, 1, 6);
-cfl_ac_fn(16, 32,  8, 16, 1, 1, 7);
-cfl_ac_fn(32,  8, 16,  4, 1, 1, 6);
-cfl_ac_fn(32, 16, 16,  8, 1, 1, 7);
-cfl_ac_fn(32, 32, 16, 16, 1, 1, 8);
-
-cfl_ac_fn( 8,  4,  4,  4, 1, 0, 4);
-cfl_ac_fn( 8,  8,  4,  8, 1, 0, 5);
-cfl_ac_fn(16,  4,  8,  4, 1, 0, 5);
-cfl_ac_fn(16,  8,  8,  8, 1, 0, 6);
-cfl_ac_fn(16, 16,  8, 16, 1, 0, 7);
-cfl_ac_fn(32,  8, 16,  8, 1, 0, 7);
-cfl_ac_fn(32, 16, 16, 16, 1, 0, 8);
-cfl_ac_fn(32, 32, 16, 32, 1, 0, 9);
-
-cfl_ac_fn( 4,  4,  4,  4, 0, 0, 4);
-cfl_ac_fn( 4,  8,  4,  8, 0, 0, 5);
-cfl_ac_fn( 4, 16,  4, 16, 0, 0, 6);
-cfl_ac_fn( 8,  4,  8,  4, 0, 0, 5);
-cfl_ac_fn( 8,  8,  8,  8, 0, 0, 6);
-cfl_ac_fn( 8, 16,  8, 16, 0, 0, 7);
-cfl_ac_fn( 8, 32,  8, 32, 0, 0, 8);
-cfl_ac_fn(16,  4, 16,  4, 0, 0, 6);
-cfl_ac_fn(16,  8, 16,  8, 0, 0, 7);
-cfl_ac_fn(16, 16, 16, 16, 0, 0, 8);
-cfl_ac_fn(16, 32, 16, 32, 0, 0, 9);
-cfl_ac_fn(32,  8, 32,  8, 0, 0, 8);
-cfl_ac_fn(32, 16, 32, 16, 0, 0, 9);
-cfl_ac_fn(32, 32, 32, 32, 0, 0, 10);
+cfl_ac_fn( 8,  8,  4,  4, 1, 1, 4)
+cfl_ac_fn( 8, 16,  4,  8, 1, 1, 5)
+cfl_ac_fn( 8, 32,  4, 16, 1, 1, 6)
+cfl_ac_fn(16,  8,  8,  4, 1, 1, 5)
+cfl_ac_fn(16, 16,  8,  8, 1, 1, 6)
+cfl_ac_fn(16, 32,  8, 16, 1, 1, 7)
+cfl_ac_fn(32,  8, 16,  4, 1, 1, 6)
+cfl_ac_fn(32, 16, 16,  8, 1, 1, 7)
+cfl_ac_fn(32, 32, 16, 16, 1, 1, 8)
+
+cfl_ac_fn( 8,  4,  4,  4, 1, 0, 4)
+cfl_ac_fn( 8,  8,  4,  8, 1, 0, 5)
+cfl_ac_fn(16,  4,  8,  4, 1, 0, 5)
+cfl_ac_fn(16,  8,  8,  8, 1, 0, 6)
+cfl_ac_fn(16, 16,  8, 16, 1, 0, 7)
+cfl_ac_fn(32,  8, 16,  8, 1, 0, 7)
+cfl_ac_fn(32, 16, 16, 16, 1, 0, 8)
+cfl_ac_fn(32, 32, 16, 32, 1, 0, 9)
+
+cfl_ac_fn( 4,  4,  4,  4, 0, 0, 4)
+cfl_ac_fn( 4,  8,  4,  8, 0, 0, 5)
+cfl_ac_fn( 4, 16,  4, 16, 0, 0, 6)
+cfl_ac_fn( 8,  4,  8,  4, 0, 0, 5)
+cfl_ac_fn( 8,  8,  8,  8, 0, 0, 6)
+cfl_ac_fn( 8, 16,  8, 16, 0, 0, 7)
+cfl_ac_fn( 8, 32,  8, 32, 0, 0, 8)
+cfl_ac_fn(16,  4, 16,  4, 0, 0, 6)
+cfl_ac_fn(16,  8, 16,  8, 0, 0, 7)
+cfl_ac_fn(16, 16, 16, 16, 0, 0, 8)
+cfl_ac_fn(16, 32, 16, 32, 0, 0, 9)
+cfl_ac_fn(32,  8, 32,  8, 0, 0, 8)
+cfl_ac_fn(32, 16, 32, 16, 0, 0, 9)
+cfl_ac_fn(32, 32, 32, 32, 0, 0, 10)

 static __attribute__((noinline)) void
 cfl_pred_c(pixel *dstU, pixel *dstV, const ptrdiff_t stride,
@@ -896,10 +910,10 @@ static void cfl_pred_##width##xN_c(pixel *const dstU, \
    cfl_pred_c(dstU, dstV, stride, ac, dc_pred, alphas, width, height); \
 }

-cfl_pred_fn( 4);
-cfl_pred_fn( 8);
-cfl_pred_fn(16);
-cfl_pred_fn(32);
+cfl_pred_fn( 4)
+cfl_pred_fn( 8)
+cfl_pred_fn(16)
+cfl_pred_fn(32)

 static void pal_pred_c(pixel *dst, const ptrdiff_t stride,
                       const uint16_t *const pal, const uint8_t *idx,
@@ -979,9 +993,9 @@ void bitfn(dav1d_intra_pred_dsp_init)(Dav1dIntraPredDSPContext *const c) {

    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I422 - 1][ TX_4X4  ] = cfl_ac_8x4_to_4x4_c;
    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I422 - 1][RTX_4X8  ] = cfl_ac_8x8_to_4x8_c;
-    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I422 - 1][RTX_8X4 ] = cfl_ac_16x4_to_8x4_c;
-    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I422 - 1][ TX_8X8 ] = cfl_ac_16x8_to_8x8_c;
-    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I422 - 1][RTX_8X16] = cfl_ac_16x16_to_8x16_c;
+    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I422 - 1][RTX_8X4  ] = cfl_ac_16x4_to_8x4_c;
+    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I422 - 1][ TX_8X8  ] = cfl_ac_16x8_to_8x8_c;
+    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I422 - 1][RTX_8X16 ] = cfl_ac_16x16_to_8x16_c;
    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I422 - 1][RTX_16X8 ] = cfl_ac_32x8_to_16x8_c;
    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I422 - 1][ TX_16X16] = cfl_ac_32x16_to_16x16_c;
    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I422 - 1][RTX_16X32] = cfl_ac_32x32_to_16x32_c;

--- a/src/ipred.h
+++ b/src/ipred.h
@@ -40,7 +40,7 @@
 *   see ipred_prepare.h for more detailed documentation.
 */
 #define decl_angular_ipred_fn(name) \
-void (name)(pixel *dst, ptrdiff_t stride, const pixel *topleft, int angle);
+void (name)(pixel *dst, ptrdiff_t stride, const pixel *topleft, int angle)
 typedef decl_angular_ipred_fn(*angular_ipred_fn);

 /*

--- a/src/itx.c
+++ b/src/itx.c
@@ -49,7 +49,9 @@ inv_txfm_add_c(pixel *dst, const ptrdiff_t stride,
 {
    int i, j;
    const ptrdiff_t sh = imin(h, 32), sw = imin(w, 32);
-    coef tmp[w * h], out[h], in_mem[w];
+    // h and w must each lie in [4, 64]; tmp/out/in_mem are sized for that 64x64 maximum.
+    assert((h >= 4 && h <= 64) && (w >= 4 && w <= 64));
+    coef tmp[4096 /* w * h */], out[64 /* h */], in_mem[64 /* w */];
    const int is_rect2 = w * 2 == h || h * 2 == w;

    if (w != sw) memset(&in_mem[sw], 0, (w - sw) * sizeof(*in_mem));
@@ -96,48 +98,48 @@ inv_txfm_add_##type1##_##type2##_##w##x##h##_c(pixel *dst, \
 inv_txfm_fn(dct, dct, w, h, shift1, shift2)

 #define inv_txfm_fn32(w, h, shift1, shift2) \
-inv_txfm_fn64(w, h, shift1, shift2); \
+inv_txfm_fn64(w, h, shift1, shift2) \
 inv_txfm_fn(identity, identity, w, h, shift1, shift2)

 #define inv_txfm_fn16(w, h, shift1, shift2) \
-inv_txfm_fn32(w, h, shift1, shift2); \
-inv_txfm_fn(adst,     dct,      w, h, shift1, shift2); \
-inv_txfm_fn(dct,      adst,     w, h, shift1, shift2); \
-inv_txfm_fn(adst,     adst,     w, h, shift1, shift2); \
-inv_txfm_fn(dct,      flipadst, w, h, shift1, shift2); \
-inv_txfm_fn(flipadst, dct,      w, h, shift1, shift2); \
-inv_txfm_fn(adst,     flipadst, w, h, shift1, shift2); \
-inv_txfm_fn(flipadst, adst,     w, h, shift1, shift2); \
-inv_txfm_fn(flipadst, flipadst, w, h, shift1, shift2); \
-inv_txfm_fn(identity, dct,      w, h, shift1, shift2); \
-inv_txfm_fn(dct,      identity, w, h, shift1, shift2); \
+inv_txfm_fn32(w, h, shift1, shift2) \
+inv_txfm_fn(adst,     dct,      w, h, shift1, shift2) \
+inv_txfm_fn(dct,      adst,     w, h, shift1, shift2) \
+inv_txfm_fn(adst,     adst,     w, h, shift1, shift2) \
+inv_txfm_fn(dct,      flipadst, w, h, shift1, shift2) \
+inv_txfm_fn(flipadst, dct,      w, h, shift1, shift2) \
+inv_txfm_fn(adst,     flipadst, w, h, shift1, shift2) \
+inv_txfm_fn(flipadst, adst,     w, h, shift1, shift2) \
+inv_txfm_fn(flipadst, flipadst, w, h, shift1, shift2) \
+inv_txfm_fn(identity, dct,      w, h, shift1, shift2) \
+inv_txfm_fn(dct,      identity, w, h, shift1, shift2)

 #define inv_txfm_fn84(w, h, shift1, shift2) \
-inv_txfm_fn16(w, h, shift1, shift2); \
-inv_txfm_fn(identity, flipadst, w, h, shift1, shift2); \
-inv_txfm_fn(flipadst, identity, w, h, shift1, shift2); \
-inv_txfm_fn(identity, adst,     w, h, shift1, shift2); \
-inv_txfm_fn(adst,     identity, w, h, shift1, shift2); \
-
-inv_txfm_fn84( 4,  4, 0, 4);
-inv_txfm_fn84( 4,  8, 0, 4);
-inv_txfm_fn84( 4, 16, 1, 4);
-inv_txfm_fn84( 8,  4, 0, 4);
-inv_txfm_fn84( 8,  8, 1, 4);
-inv_txfm_fn84( 8, 16, 1, 4);
-inv_txfm_fn32( 8, 32, 2, 4);
-inv_txfm_fn84(16,  4, 1, 4);
-inv_txfm_fn84(16,  8, 1, 4);
-inv_txfm_fn16(16, 16, 2, 4);
-inv_txfm_fn32(16, 32, 1, 4);
-inv_txfm_fn64(16, 64, 2, 4);
-inv_txfm_fn32(32,  8, 2, 4);
-inv_txfm_fn32(32, 16, 1, 4);
-inv_txfm_fn32(32, 32, 2, 4);
-inv_txfm_fn64(32, 64, 1, 4);
-inv_txfm_fn64(64, 16, 2, 4);
-inv_txfm_fn64(64, 32, 1, 4);
-inv_txfm_fn64(64, 64, 2, 4);
+inv_txfm_fn16(w, h, shift1, shift2) \
+inv_txfm_fn(identity, flipadst, w, h, shift1, shift2) \
+inv_txfm_fn(flipadst, identity, w, h, shift1, shift2) \
+inv_txfm_fn(identity, adst,     w, h, shift1, shift2) \
+inv_txfm_fn(adst,     identity, w, h, shift1, shift2)
+
+inv_txfm_fn84( 4,  4, 0, 4)
+inv_txfm_fn84( 4,  8, 0, 4)
+inv_txfm_fn84( 4, 16, 1, 4)
+inv_txfm_fn84( 8,  4, 0, 4)
+inv_txfm_fn84( 8,  8, 1, 4)
+inv_txfm_fn84( 8, 16, 1, 4)
+inv_txfm_fn32( 8, 32, 2, 4)
+inv_txfm_fn84(16,  4, 1, 4)
+inv_txfm_fn84(16,  8, 1, 4)
+inv_txfm_fn16(16, 16, 2, 4)
+inv_txfm_fn32(16, 32, 1, 4)
+inv_txfm_fn64(16, 64, 2, 4)
+inv_txfm_fn32(32,  8, 2, 4)
+inv_txfm_fn32(32, 16, 1, 4)
+inv_txfm_fn32(32, 32, 2, 4)
+inv_txfm_fn64(32, 64, 1, 4)
+inv_txfm_fn64(64, 16, 2, 4)
+inv_txfm_fn64(64, 32, 1, 4)
+inv_txfm_fn64(64, 64, 2, 4)

 static void inv_txfm_add_wht_wht_4x4_c(pixel *dst, const ptrdiff_t stride,
                                       coef *const coeff, const int eob)

--- a/src/itx_1d.c
+++ b/src/itx_1d.c
@@ -802,9 +802,9 @@ static void inv_flipadst##sz##_1d(const coef *const in, const ptrdiff_t in_s, \
    inv_adst##sz##_1d(in, in_s, &out[(sz - 1) * out_s], -out_s); \
 }

-flip_inv_adst(4);
-flip_inv_adst(8);
-flip_inv_adst(16);
+flip_inv_adst(4)
+flip_inv_adst(8)
+flip_inv_adst(16)

 #undef flip_inv_adst


--- a/src/levels.h
+++ b/src/levels.h
@@ -465,11 +465,17 @@ typedef struct Av1FrameHeader {
        Av1SegmentationDataSet seg_data;
        int lossless[NUM_SEGMENTS], qidx[NUM_SEGMENTS];
    } segmentation;
-    int delta_q_present;
-    int delta_q_res_log2;
-    int delta_lf_present;
-    int delta_lf_res_log2;
-    int delta_lf_multi;
+    struct {
+        struct {
+            int present;
+            int res_log2;
+        } q;
+        struct {
+            int present;
+            int res_log2;
+            int multi;
+        } lf;
+    } delta;
    int all_lossless;
    struct {
        int level_y[2];

--- a/src/lf_apply.c
+++ b/src/lf_apply.c
@@ -220,12 +220,14 @@ void bytefn(dav1d_loopfilter_sbrow)(const Dav1dFrameContext *const f,
                                    int sby, const int start_of_tile_row)
 {
    int x, have_left;
+    // Don't filter outside the frame
+    const int hy4 = (f->cur.p.p.h + 3) >> 2;
    const int have_top = sby > 0;
    const int is_sb64 = !f->seq_hdr.sb128;
    const int starty4 = (sby & is_sb64) << 4;
    const int sbsz = 32 >> is_sb64;
    const int sbl2 = 5 - is_sb64;
-    const int endy4 = starty4 + imin(f->bh - sby * f->sb_step, sbsz);
+    const int endy4 = starty4 + imin(hy4 - sby * f->sb_step, sbsz);
    const int halign = (f->bh + 31) & ~31;
    const int ss_ver = f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420;
    const int ss_hor = f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444;

--- a/src/lf_mask.c
+++ b/src/lf_mask.c
@@ -390,13 +390,13 @@ void dav1d_calc_lf_values(uint8_t (*const lflvl_values)[4][8][2],
        dav1d_calc_lf_value(lflvl_values[s][0], 0, hdr->loopfilter.level_y[0],
                            lf_delta[0], segd ? segd->delta_lf_y_v : 0, mr_deltas);
        dav1d_calc_lf_value(lflvl_values[s][1], 0, hdr->loopfilter.level_y[1],
-                            lf_delta[hdr->delta_lf_multi ? 1 : 0],
+                            lf_delta[hdr->delta.lf.multi ? 1 : 0],
                            segd ? segd->delta_lf_y_h : 0, mr_deltas);
        dav1d_calc_lf_value(lflvl_values[s][2], 1, hdr->loopfilter.level_u,
-                            lf_delta[hdr->delta_lf_multi ? 2 : 0],
+                            lf_delta[hdr->delta.lf.multi ? 2 : 0],
                            segd ? segd->delta_lf_u : 0, mr_deltas);
        dav1d_calc_lf_value(lflvl_values[s][3], 1, hdr->loopfilter.level_v,
-                            lf_delta[hdr->delta_lf_multi ? 3 : 0],
+                            lf_delta[hdr->delta.lf.multi ? 3 : 0],
                            segd ? segd->delta_lf_v : 0, mr_deltas);
    }
 }
No results found