Commits on Source (91)
-
0d02b5e4
-
bsr has 3 cycles of latency on modern x86 processors. For this function, it's possible to obtain the number of bits to shift by alternative means. I'd estimate roughly a 0.2% decrease in CPU usage, based on the percentages associated with function symbols in perf report. Benchmarks were run on a Ryzen 5 3600 (Zen 2). The clip used was the original 1080p Chimera.
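A minimal sketch of the general idea of deriving a shift count without a bit-scan instruction; this is only an illustration (the helper names and the table-based variant are not from the commit):

    #include <stdint.h>

    /* bsr/lzcnt form: what __builtin_clz typically compiles to. */
    static inline int ulog2_bsr(unsigned x) {
        return 31 - __builtin_clz(x);
    }

    /* Alternative for 16-bit inputs: two range checks plus a 16-entry table. */
    static inline int ulog2_table(unsigned x) {
        static const uint8_t log_tab[16] = {
            0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3
        };
        int n = 0;
        if (x >= 1 << 8) { x >>= 8; n += 8; }
        if (x >= 1 << 4) { x >>= 4; n += 4; }
        return n + log_tab[x];
    }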
5dc55af6 -
5d7aa26e
-
James Almer authored
Ensure that both the allocator and release callbacks point to the default functions and that no cookie was provided. This prevents the user from configuring a mix of custom and default callbacks.
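A minimal sketch of the check being described, assuming the default callbacks are the ones exposed in dav1d's public picture.h (dav1d_default_picture_alloc / dav1d_default_picture_release); the actual validation code may differ:

    #include <stddef.h>
    #include <dav1d/picture.h>

    /* Treat the allocator as "default" only when every field still holds the
     * library defaults, so custom and default callbacks can't be mixed. */
    static int allocator_is_default(const Dav1dPicAllocator *a) {
        return a->alloc_picture_callback   == dav1d_default_picture_alloc &&
               a->release_picture_callback == dav1d_default_picture_release &&
               a->cookie == NULL;
    }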
-
Martin Storsjö authored
This corresponds to what the x86 assembly does right now. This allows removing a fair bit of code, and allows marking the stores as aligned. (Previously, the writes of the narrow slice temp buffer were unaligned.)
4d1d479d -
Martin Storsjö authored
This gives a speedup of around one cycle.
6ca0e228 -
Martin Storsjö authored
Also fix the location of one comment, to be consistent with other similar comments.
01842ed3 -
Martin Storsjö authored
62f2ec41
-
Martin Storsjö authored
2df07ed7
-
Martin Storsjö authored
ab4ec8bc
-
Martin Storsjö authored
This might cause a slowdown of around one cycle on some cores, as the instructions were previously placed in a latency bubble, but it simplifies the code by moving them to the header where they'd normally be.
39dbabeb -
Martin Storsjö authored
e08b784e
-
Martin Storsjö authored
Samples of some checkasm benchmarks:
                                  Cortex A7      A8     A53     A72     A73
cfl_ac_420_w4_16bpc_neon:             258.2   130.0   187.8    88.1    99.9
cfl_ac_420_w8_16bpc_neon:             396.3   192.3   278.0   134.1   148.1
cfl_ac_420_w16_16bpc_neon:            705.9   341.5   508.4   231.2   263.0
intra_pred_filter_w32_10bpc_neon:    3450.6  3279.7  1505.6  1716.8  1631.0
intra_pred_filter_w32_12bpc_neon:    5075.2  2467.3  2027.9  1605.7  1556.0
intra_pred_paeth_w64_16bpc_neon:     7850.6  4682.9  4538.4  4640.4  4952.4
intra_pred_smooth_w64_16bpc_neon:    6807.7  4044.0  4001.4  3001.9  3131.5
Corresponding numbers for arm64:
                                 Cortex A53     A72     A73
cfl_ac_420_w4_16bpc_neon:             154.8    87.1    81.6
cfl_ac_420_w8_16bpc_neon:             235.6   124.8   133.0
cfl_ac_420_w16_16bpc_neon:            428.8   206.5   234.9
intra_pred_filter_w32_10bpc_neon:    1333.2  1485.9  1468.3
intra_pred_filter_w32_12bpc_neon:    1839.1  1429.0  1439.7
intra_pred_paeth_w64_16bpc_neon:     3691.1  3091.8  3289.7
intra_pred_smooth_w64_16bpc_neon:    3776.8  3124.4  2827.1
168c5d5e -
Konstantin Pavlov authored
Also specify amd64 to be future-proof for when we have Big Sur+ builders.
864b1995 -
3ccfc25a
-
Remove half of the masks since they are only used for cdef at an 8x8 level of granularity. Load the mask and combine the 16-bit sections into the 32-bit sections outside of the inner cdef loop. This should save some registers. Results in mild performance improvements.
0bd57c6b -
Relative speed-ups compared with gcc-9.2.0:
                                   Before     After
mc_8tap_regular_w2_h_16bpc_c:       276.6     219.9
mc_8tap_regular_w4_h_16bpc_c:       489.5     374.5
mc_8tap_regular_w8_h_16bpc_c:       897.7     686.8
mc_8tap_regular_w16_h_16bpc_c:     2573.7    2314.2
mc_8tap_regular_w32_h_16bpc_c:     7647.3    7012.4
mc_8tap_regular_w64_h_16bpc_c:    28163.8   25057.4
mc_8tap_regular_w128_h_16bpc_c:   77678.4   73570.0
b12229cc -
Victorien Le Couviour--Tuffet authored
493d2b91
-
Victorien Le Couviour--Tuffet authored
Closes #203.
63a918b4 -
Victorien Le Couviour--Tuffet authored
05d05f97
-
Before:              Cortex A53    A55    A72    A73
cdef_dir_8bpc_neon:       400.0  391.2  269.7  282.9
cdef_dir_16bpc_neon:      417.7  413.0  303.8  313.6
After:
cdef_dir_8bpc_neon:       369.0  360.2  248.4  273.4
cdef_dir_16bpc_neon:      388.7  384.0  272.2  290.7
11cb2efa -
Janne Grunau authored
oss-fuzz uses '-Denable_tools=false'.
dd32acea -
On Zen 2 and 3, vpermq is slower than vperm2i128. In some assembly we use the former to swap the lanes of a vector when we could be using the latter. On Zen 1 the relative cost is reversed, so this patch will be slower there. On current Intel CPUs these instructions are equally expensive, so there should be no impact there.
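For reference, the two lane-swap forms expressed as intrinsics; a minimal sketch, not the actual dav1d assembly:

    #include <immintrin.h>

    /* Swap the two 128-bit lanes of a 256-bit vector. */
    static __m256i swap_lanes_vpermq(__m256i v) {
        return _mm256_permute4x64_epi64(v, _MM_SHUFFLE(1, 0, 3, 2)); /* vpermq */
    }

    static __m256i swap_lanes_vperm2i128(__m256i v) {
        return _mm256_permute2x128_si256(v, v, 0x01); /* vperm2i128 */
    }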
a0e9a2e3 -
Victorien Le Couviour--Tuffet authored
5686e835
-
SGR uses edge detection to decide which pixels to modify, but if the input is pure random noise there aren't going to be many (if any) edges. As a result, the entire function call often ends up doing nothing, which isn't ideal when we want to test the code for correctness. Change the input randomization algorithm to generate a checkerboard pattern with limited noise applied to the flat areas.
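A minimal sketch of such an input pattern; the block size, levels and noise source are illustrative, not the actual checkasm harness:

    #include <stdint.h>
    #include <stdlib.h>

    /* Checkerboard of flat 8x8 blocks with limited noise, so SGR's edge
     * detection finds real edges instead of pure random noise. */
    static void fill_checkerboard(uint8_t *buf, int w, int h, int stride) {
        for (int y = 0; y < h; y++)
            for (int x = 0; x < w; x++) {
                const int flat  = ((x >> 3) ^ (y >> 3)) & 1 ? 192 : 64;
                const int noise = (rand() & 15) - 8;
                const int v = flat + noise;
                buf[y * stride + x] = v < 0 ? 0 : v > 255 ? 255 : v;
            }
    }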
f539111b -
f54cf173
-
9463c9f5
-
Matthias Dressel authored
bb3539a9
-
Matthias Dressel authored
Remnants of the code restructuring in 89ea92ba.
ecb00748 -
Victorien Le Couviour--Tuffet authored
Calling dav1d_close already takes care of flushing the internal state, so calling flush just before it is superfluous.
66c8a1ec -
Victorien Le Couviour--Tuffet authored
4db73f11
-
Victorien Le Couviour--Tuffet authored
549086e4
-
Martin Storsjö authored
52c09394
-
Martin Storsjö authored
Use a variable mask for inserting padding, instead of fixed code paths for different padding widths. This allows simplifying the filtering logic to simply always process 8 pixels at a time. Also improve scheduling of the loop subtract instruction in all these cases.
9c1f276d -
Martin Storsjö authored
This gives a minor speedup on 8 bpc and a bit bigger speedup on 16 bpc. Sample speedups from arm64:
Before:                   Cortex A53       A72       A73
wiener_7tap_8bpc_neon:      143885.7  101571.5   96187.2
wiener_7tap_10bpc_neon:     171210.8  119410.4  122447.8
After:
wiener_7tap_8bpc_neon:      142985.0   94400.8   89959.3
wiener_7tap_10bpc_neon:     168818.4  113980.2  116662.0
55e9f7a4 -
Martin Storsjö authored
Only doing this for 8bpc; for higher bitdepths, adding the input coefficients can overflow a signed 16-bit element.
Before:                   Cortex A53      A72      A73
wiener_7tap_8bpc_neon:      142985.0  94400.8  89959.3
After:
wiener_7tap_8bpc_neon:      136614.4  88828.3  86997.0
24f9304e -
Replace checks for INTER or SWITCH frames with a simple macro for increased readability and maintainability.
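An illustrative form of such a macro, built on the public frame-type enum; the actual name and location in the tree may differ:

    #include <dav1d/headers.h>

    /* INTER and SWITCH frames are handled identically in many places. */
    #define IS_INTER_OR_SWITCH(frame_hdr) \
        ((frame_hdr)->frame_type == DAV1D_FRAME_TYPE_INTER || \
         (frame_hdr)->frame_type == DAV1D_FRAME_TYPE_SWITCH)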
54747d42 -
Should make the code more readable.
6361e88d -
Signed-off-by: James Almer <jamrial@gmail.com>
7c316a70 -
Victorien Le Couviour--Tuffet authored
288ed4b8
-
Martin Storsjö authored
6660fd00
-
Use mla (8-bit -> 8-bit) instead of smlal (8-bit -> 16-bit).
Before:                        Cortex A53    A72    A73
cdef_filter_4x4_8bpc_neon:          389.7  264.0  261.7
cdef_filter_4x8_8bpc_neon:          687.2  476.2  465.5
cdef_filter_8x8_8bpc_neon:         1152.9  752.1  789.5
After:
cdef_filter_4x4_8bpc_neon:          385.2  263.4  259.2
cdef_filter_4x8_8bpc_neon:          677.5  473.8  459.8
cdef_filter_8x8_8bpc_neon:         1134.4  744.6  774.6
38d4d0bd -
Kyle Siefring authored
95c43101
-
Avoid moving between 8- and 16-bit vectors where possible.
833382b3 -
- Reorder loads of filters to benefit in-order cores.
- Use full 128-bit vectors to transpose 8x8 bytes. zip1 is called in the first stage, which will hurt performance on some older big cores.
- Rework the horizontal stage for 8-bit mode:
  * Use smull instead of mul
  * Replace existing narrow and long instructions
  * Replace the mov after calling with a right shift
Before:                 Cortex A55     A53     A72     A73
warp_8x8_8bpc_neon:         1683.2  1860.6  1065.0  1102.6
warp_8x8t_8bpc_neon:        1673.2  1846.4  1057.0  1098.4
warp_8x8_16bpc_neon:        1870.7  2031.7  1147.3  1220.7
warp_8x8t_16bpc_neon:       1848.0  2006.2  1121.6  1188.0
After:
warp_8x8_8bpc_neon:         1267.2  1446.2   807.0   871.5
warp_8x8t_8bpc_neon:        1245.4  1422.0   810.2   868.4
warp_8x8_16bpc_neon:        1769.8  1929.3  1132.0  1238.2
warp_8x8t_16bpc_neon:       1747.3  1904.1  1101.5  1207.9
a3b8157e -
Martin Storsjö authored
505e9990
-
If the postfilter task allocation fails, a deadlock would occur.
8b1a96e4 -
These functions are not thread-safe on GL, because they are not called from the thread holding the GL context. Work around this by simply disabling it. Not very optimal, but better than crashing.
06e8ed37 -
Upstream libplacebo added support for dav1d integration directly, allowing us to vastly simplify all of this code. In order to take advantage of new optimizations, I had to allow update_frame to unref the Dav1dPicture. (This is fine, since a double unref is a no-op.) In addition, some of the functions we use were deprecated in recent libplacebo versions, so since we're taking a new dependency we might as well fix the deprecation warnings.
61b65456 -
The current playback loop triggers a repaint on any single event, including spammy events such as SDL_MOUSEMOTION. Fix this by only repainting on SDL_WINDOWEVENT_EXPOSED, which is defined as the event sent when the window was damaged and needs to be repainted, as well as on new frames. Fixes videolan/dav1d#356
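In SDL2 terms the fix amounts to filtering the event loop roughly as below; repaint(), render_new_frame() and new_frame_event are placeholders for the player's own code:

    #include <SDL.h>

    void repaint(void);           /* placeholder: redraw the last frame */
    void render_new_frame(void);  /* placeholder: draw a newly decoded frame */

    static void event_loop(Uint32 new_frame_event) {
        SDL_Event e;
        while (SDL_WaitEvent(&e)) {
            if (e.type == SDL_QUIT)
                break;
            if (e.type == SDL_WINDOWEVENT &&
                e.window.event == SDL_WINDOWEVENT_EXPOSED)
                repaint();              /* window was damaged, must redraw */
            else if (e.type == new_frame_event)
                render_new_frame();     /* custom event posted per frame */
            /* other events (e.g. SDL_MOUSEMOTION) no longer repaint */
        }
    }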
eab4ef6a -
The arm32 version is less generic and has a bit more caveats, but still belongs as a shared utility in a header.
2dca9b28 -
Henrik Gramner authored
ffc4e01c
-
Janne Grunau authored
93319cef
-
Matthias Dressel authored
Verification should not succeed if the given string is too short to be a real hash. Fixes videolan/dav1d#361
061ac9ae -
The required 'xxhash.h' header can either be in a system include directory or be copied to 'tools/output'. The xxh3_128bits-based muxer shows no significant slowdown compared to the null muxer.
Decoding times for Chimera-AV1-8bit-1920x1080-6736kbps.ivf with 4 frame and 4 tile threads on a Core i7-8550U (turbo boost disabled):
null: 72.5 s
md5:  99.8 s
xxh3: 73.8 s
Decoding Chimera-AV1-10bit-1920x1080-6191kbps.ivf with 6 frame and 4 tile threads on an M1 Mac mini:
null:  27.8 s
md5:  105.9 s
xxh3:  28.3 s
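For reference, the streaming XXH3 128-bit API the muxer builds on looks roughly like this; a sketch only, as the real verify muxer hashes each plane of each frame and handles errors:

    #include <stdio.h>
    #include <stdint.h>
    #include <xxhash.h>

    static void print_xxh3_128(const uint8_t *data, size_t size) {
        XXH3_state_t *state = XXH3_createState();
        XXH3_128bits_reset(state);
        XXH3_128bits_update(state, data, size); /* called per plane in practice */
        const XXH128_hash_t hash = XXH3_128bits_digest(state);
        printf("%016llx%016llx\n", (unsigned long long)hash.high64,
                                   (unsigned long long)hash.low64);
        XXH3_freeState(state);
    }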
e6168525 -
Martin Storsjö authored
This fixes bus errors due to missing alignment, when built with GCC 9 for arm32 with -mfpu=neon.
0a577fd2 -
Martin Storsjö authored
This silences the following warning: tools/output/xxhash.c(127): warning C4244: '=': conversion from 'unsigned long' to 'unsigned char', possible loss of data
95884615 -
We currently run 'git describe --match' to obtain the current version, but meson doesn't properly quote/escape the pattern string on Windows. As a result, "fatal: Not a valid object name .ninja_log" is printed when compiling on Windows systems. Compilation still works, but the warning is annoying and misleading. Currently we don't actually need the pattern matching functionality (which is why things still work), so simply remove it as a workaround.
69268d3a -
Additionally, reschedule the load instructions to reduce stalls on in-order cores. This applies the changes from a3b8157e to the arm32 version.
Before:                Cortex A7      A8      A9     A53     A72     A73
warp_8x8_8bpc_neon:       3659.3  1746.0  1931.9  2128.8  1173.7  1188.9
warp_8x8t_8bpc_neon:      3650.8  1724.6  1919.8  2105.0  1147.7  1206.9
warp_8x8_16bpc_neon:      4039.4  2111.9  2337.1  2462.5  1334.6  1396.5
warp_8x8t_16bpc_neon:     3973.9  2137.1  2299.6  2413.2  1282.8  1369.6
After:
warp_8x8_8bpc_neon:       2920.8  1269.8  1410.3  1767.3   860.2  1004.8
warp_8x8t_8bpc_neon:      2904.9  1283.9  1397.5  1743.7   863.6  1024.7
warp_8x8_16bpc_neon:      3895.5  2060.7  2339.8  2376.6  1331.1  1394.0
warp_8x8t_16bpc_neon:     3822.7  2026.7  2298.7  2325.4  1278.1  1360.8
0477fcf1 -
Change the order of the multiply-accumulates to allow in-order cores to forward the results.
4e869495 -
Make them operate in a more cache friendly manner, interleaving horizontal and vertical filtering (reducing the amount of stack used from 51 KB to 4 KB), similar to what was done for x86 in 78d27b7d. This also adds separate 5tap versions of the filters and unrolls the vertical filter a bit more (which maybe could have been done without doing the rewrite). This does, however, increase the compiled code size by around 3.5 KB.
Before:                     Cortex A53       A72       A73
wiener_5tap_8bpc_neon:        136855.6   91446.2   87363.6
wiener_7tap_8bpc_neon:        136861.6   91454.9   87374.5
wiener_5tap_10bpc_neon:       167685.3  114720.3  116522.1
wiener_5tap_12bpc_neon:       167677.5  114724.7  116511.9
wiener_7tap_10bpc_neon:       167681.6  114738.5  116567.0
wiener_7tap_12bpc_neon:       167673.8  114720.8  116515.4
After:
wiener_5tap_8bpc_neon:         87102.1   60460.6   66803.8
wiener_7tap_8bpc_neon:        110831.7   78489.0   82015.9
wiener_5tap_10bpc_neon:       109999.2   90259.0   89238.0
wiener_5tap_12bpc_neon:       109978.3   90255.7   89220.7
wiener_7tap_10bpc_neon:       137877.6  107578.5  103435.6
wiener_7tap_12bpc_neon:       137868.8  107568.9  103390.4
2e73051c -
Martin Storsjö authored
Before:                                     Cortex A53     A72     A73
inv_txfm_add_4x4_dct_dct_0_10bpc_neon:            40.7    23.0    24.0
inv_txfm_add_4x4_dct_dct_1_10bpc_neon:           116.0    71.5    78.2
inv_txfm_add_8x8_dct_dct_0_10bpc_neon:            85.7    50.7    53.8
inv_txfm_add_8x8_dct_dct_1_10bpc_neon:           287.0   203.5   215.2
inv_txfm_add_16x16_dct_dct_0_10bpc_neon:         255.7   129.1   140.4
inv_txfm_add_16x16_dct_dct_1_10bpc_neon:        1401.4  1026.7  1039.2
inv_txfm_add_16x16_dct_dct_2_10bpc_neon:        1913.2  1407.3  1479.6
After:
inv_txfm_add_4x4_dct_dct_0_10bpc_neon:            38.7    21.5    22.2
inv_txfm_add_4x4_dct_dct_1_10bpc_neon:           116.0    71.3    77.2
inv_txfm_add_8x8_dct_dct_0_10bpc_neon:            76.7    44.7    43.5
inv_txfm_add_8x8_dct_dct_1_10bpc_neon:           278.0   203.0   203.9
inv_txfm_add_16x16_dct_dct_0_10bpc_neon:         236.9   106.2   116.2
inv_txfm_add_16x16_dct_dct_1_10bpc_neon:        1368.7   999.7  1008.4
inv_txfm_add_16x16_dct_dct_2_10bpc_neon:        1880.5  1381.2  1459.4
6f9f3391 -
Emmanuel Gil Peyrot authored
Neither --buildtype=plain nor --buildtype=debug set -ffast-math, so llround() is kept as a function call and isn’t optimised out into cvttsd2siq (on amd64), thus requiring the math lib to be linked. Note that even with -ffast-math, it isn’t guaranteed that a call to llround() will always be omitted (I have reproduced this on PowerPC), so this fix is correct even if we ever decide to enable -ffast-math in other build types.
58cb4cf0 -
Henrik Gramner authored
Large stack allocations on Windows need to use stack probing in order to guarantee that all stack memory is committed before accessing it. This is done by ensuring that the guard page(s) at the end of the currently committed pages are touched prior to any pages beyond that.
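The underlying idea, expressed as a C sketch rather than the actual assembly; PAGE_SIZE and the helper are illustrative:

    #include <stddef.h>

    #define PAGE_SIZE 4096

    /* Touch the pages of a large stack buffer starting from the end nearest
     * the already-committed stack and moving toward lower addresses, so each
     * guard page is hit (and committed) before any access beyond it. */
    static void probe_stack_buffer(volatile unsigned char *buf, size_t size) {
        for (ptrdiff_t off = (ptrdiff_t)size - 1; off >= 0; off -= PAGE_SIZE)
            buf[off] = 0;
    }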
c36b191a -
Henrik Gramner authored
Split the 5x5, 3x3, and mix cases into separate functions. Shrink some tables. Move some scalar calculations out of the DSP function. Make Wiener and SGR share the same function prototype to eliminate a branch in lr_stripe().
c290c02e -
Henrik Gramner authored
The previous implementation did multiple passes in the horizontal and vertical directions, with the intermediate values being stored in buffers on the stack. This caused bad cache thrashing. By interleaving all the different passes, in combination with a ring buffer that stores only a few rows at a time, performance is improved by a significant amount. Also slightly speed up the neighbor calculations by packing the a and b values into a single 32-bit unsigned integer, which allows calculations on both values simultaneously.
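The a/b packing trick as a small sketch; variable names are illustrative, and it relies on both halves staying well within 16 bits so no carries cross between them:

    #include <stdint.h>

    /* Pack a into the high half and b into the low half; one 32-bit addition
     * then sums the a's and the b's of several neighbors simultaneously. */
    static inline uint32_t pack_ab(unsigned a, unsigned b) {
        return (uint32_t)a << 16 | b;
    }

    static inline void sum_ab3(uint32_t n0, uint32_t n1, uint32_t n2,
                               unsigned *a_sum, unsigned *b_sum) {
        const uint32_t s = n0 + n1 + n2;
        *a_sum = s >> 16;
        *b_sum = s & 0xffff;
    }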
fe2bb774 -
b44ec453
-
ecf153b1
-
Martin Storsjö authored
On Darwin, 32-bit parameters that aren't passed in registers but on the stack are packed tightly, instead of each of them occupying an 8-byte slot.
5cf45058 -
Marvin Scholz authored
b768fdbd
-
Henrik Gramner authored
Looprestoration SIMD code may overread the input buffers by a small amount. Pad the buffer to make sure this memory is valid to access.
f2967b05 -
Cache indices of non-zero coefficients during the AC token decoding loop in order to speed up the sign decoding/dequant loop later.
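The caching idea in sketch form; decode_ac_token() and dequant_with_sign() stand in for the real decoder routines:

    #include <stdint.h>

    int decode_ac_token(int i);               /* placeholder */
    int dequant_with_sign(int level, int i);  /* placeholder */

    static void decode_block(int16_t *coefs, int n_coefs) {
        uint16_t nz_idx[1024];
        int nz_count = 0;
        /* AC token loop: remember which coefficients are non-zero. */
        for (int i = 0; i < n_coefs; i++) {
            const int level = decode_ac_token(i);
            coefs[i] = (int16_t)level;
            if (level)
                nz_idx[nz_count++] = (uint16_t)i;
        }
        /* Sign/dequant loop: only visit the cached non-zero positions. */
        for (int j = 0; j < nz_count; j++) {
            const int i = nz_idx[j];
            coefs[i] = (int16_t)dequant_with_sign(coefs[i], i);
        }
    }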
a92e307f -
Not having a quantizer matrix is the most common case, so it's worth having a separate code path for it that eliminates some calculations and table lookups. Without a qm, not only can we skip calculating dq * qm, but only Exp-Golomb-coded coefficients will have the potential to overflow, so we can also skip clipping for the vast majority of coefficients.
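A rough sketch of the split; the qm scaling follows AV1's convention where 32 is unity, but the real dav1d dequant code is more involved (e.g. Exp-Golomb-coded tokens still need clipping in the no-qm path):

    #include <stdint.h>

    static inline int dequant(int tok, int dq, const uint8_t *qm, int i,
                              int cf_max) {
        if (qm) {
            const int dqs = (dq * qm[i] + 16) >> 5;  /* per-coefficient scaling */
            const int v   = tok * dqs;
            return v < -cf_max ? -cf_max : v > cf_max ? cf_max : v;
        }
        return tok * dq;  /* no qm: no scaling, ordinary tokens can't overflow */
    }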
989057fb -
5faff383
-
It's supposed to warn about const-correctness issues, but it doesn't handle arrays of pointers correctly and will cause false positive warnings when using memset() to zero such arrays for example.
d7d125f1 -
Jean-Baptiste Kempf authored
54e43f90
-
f1aa1b0e
-
1d6aae47
-
ec95ea52
-
Relative speed-ups over C code (compared with gcc-9.3.0):
                                     C     ASM
cdef_dir_16bpc_avx2:             534.2    72.5   7.36x
cdef_dir_16bpc_ssse3:            534.2   104.8   5.10x
cdef_dir_16bpc_ssse3 (x86-32):   854.1   116.2   7.35x
bfbee860 -
Jean-Baptiste Kempf authored
baa92371
-
Nathan E. Egge authored
a3c1c676
-
Nathan E. Egge authored
Relative speed-ups over C code (compared with gcc-9.3.0):
                             C      AVX2
wiener_5tap_10bpc:    194892.0   14831.9  13.14x
wiener_5tap_12bpc:    194295.4   14828.9  13.10x
wiener_7tap_10bpc:    194391.7   19461.4   9.99x
wiener_7tap_12bpc:    194136.1   19418.7  10.00x
9ca341fe -
Martin Storsjö authored
84555a44
-
Martin Storsjö authored
In these cases, the function wrote a 64 pixel wide output, regardless of the actual width.
be5200c4 -
Martin Storsjö authored
0940cb34
-
Martin Storsjö authored
This makes these instances consistent with the rest of similar cases.
bf60da6c -
Martin Storsjö authored
27cb9dad
-
Martin Storsjö authored
While these might not be needed in practice, add them for consistency.
7f5b334b -
Martin Storsjö authored
Relative speedup vs C for a few functions:
                                          Cortex A7     A8     A9    A53    A72    A73
inv_txfm_add_4x4_dct_dct_0_10bpc_neon:         2.79   5.08   2.99   2.83   3.49   4.44
inv_txfm_add_4x4_dct_dct_1_10bpc_neon:         5.74   9.43   5.72   7.19   6.73   6.92
inv_txfm_add_8x8_dct_dct_0_10bpc_neon:         3.13   3.68   2.79   3.25   3.21   3.33
inv_txfm_add_8x8_dct_dct_1_10bpc_neon:         7.09  10.41   7.00  10.55   8.06   9.02
inv_txfm_add_16x16_dct_dct_0_10bpc_neon:       5.01   6.76   4.56   5.58   5.52   2.97
inv_txfm_add_16x16_dct_dct_1_10bpc_neon:       8.62  12.48  13.71  11.75  15.94  16.86
inv_txfm_add_16x16_dct_dct_2_10bpc_neon:       6.05   8.81   6.13   8.18   7.90  12.27
inv_txfm_add_32x32_dct_dct_0_10bpc_neon:       2.90   3.90   2.16   2.63   3.56   2.74
inv_txfm_add_32x32_dct_dct_1_10bpc_neon:      13.57  17.00  13.30  13.76  14.54  17.08
inv_txfm_add_32x32_dct_dct_2_10bpc_neon:       8.29  10.54   8.05  10.68  12.75  14.36
inv_txfm_add_32x32_dct_dct_3_10bpc_neon:       6.78   8.40   7.60  10.12   8.97  12.96
inv_txfm_add_32x32_dct_dct_4_10bpc_neon:       6.48   6.74   6.00   7.38   7.67   9.70
inv_txfm_add_64x64_dct_dct_0_10bpc_neon:       3.02   4.59   2.21   2.65   3.36   2.47
inv_txfm_add_64x64_dct_dct_1_10bpc_neon:       9.86  11.30   9.14  13.80  12.46  14.83
inv_txfm_add_64x64_dct_dct_2_10bpc_neon:       8.65   9.76   7.60  12.05  10.55  12.62
inv_txfm_add_64x64_dct_dct_3_10bpc_neon:       7.78   8.65   6.98  10.63   9.15  11.73
inv_txfm_add_64x64_dct_dct_4_10bpc_neon:       6.61   7.01   5.52   8.41   8.33   9.69
b4b225d8 -
Jean-Baptiste Kempf authored