diff --git a/demos/video-filtering.c b/demos/video-filtering.c
index 0a7342f9dcc4cced59e3037d12c03b2222fb3f81..d803161d079fd216b78cbac38776b7c6965b9d7e 100644
--- a/demos/video-filtering.c
+++ b/demos/video-filtering.c
@@ -7,21 +7,27 @@
  * For those of you too lazy to compile/run this file but still want to see
  * results, these are from my machine (RX 5700 XT + 1950X, as of 2020-05-25):
  *
- * RADV:
- *   api1: 10000 frames in 15.946306 s => 1.594631 ms/frame (627.10 fps)
- *   api2: 10000 frames in 5.398234 s => 0.539823 ms/frame (1852.46 fps)
+ * RADV+ACO:
+ *   api1: 10000 frames in 16.328440 s => 1.632844 ms/frame (612.43 fps)
+ *         render: 0.113524 ms, upload: 0.127551 ms, download: 0.146097 ms
+ *   api2: 10000 frames in 5.335634 s => 0.533563 ms/frame (1874.19 fps)
+ *         render: 0.064378 ms, upload: 0.000000 ms, download: 0.189719 ms
  *
  * AMDVLK:
- *   api1: 10000 frames in 14.766658 s => 1.476666 ms/frame (677.20 fps)
- *   api2: 10000 frames in 4.644831 s => 0.464483 ms/frame (2152.93 fps)
+ *   api1: 10000 frames in 14.921859 s => 1.492186 ms/frame (670.16 fps)
+ *         render: 0.110603 ms, upload: 0.114412 ms, download: 0.115375 ms
+ *   api2: 10000 frames in 4.667386 s => 0.466739 ms/frame (2142.53 fps)
+ *         render: 0.030781 ms, upload: 0.000000 ms, download: 0.075237 ms
  *
  * You can see that AMDVLK is still better at doing texture streaming than
  * RADV - this is because as of writing RADV still does not support
  * asynchronous texture queues / DMA engine transfers. If we disable the
  * `async_transfer` option with AMDVLK we get this:
  *
- *   api1: 10000 frames in 15.539355 s => 1.553936 ms/frame (643.53 fps)
- *   api2: 10000 frames in 6.277536 s => 0.627754 ms/frame (1592.98 fps)
+ *   api1: 10000 frames in 16.087723 s => 1.608772 ms/frame (621.59 fps)
+ *         render: 0.111154 ms, upload: 0.122476 ms, download: 0.133162 ms
+ *   api2: 10000 frames in 6.344959 s => 0.634496 ms/frame (1576.05 fps)
+ *         render: 0.031307 ms, upload: 0.000000 ms, download: 0.083520 ms
  *
  * Compiling:
  *
@@ -186,6 +192,17 @@ struct priv {
     struct pl_dispatch *dp;
     struct pl_shader_obj *dither_state;
 
+    // Timer objects
+    struct pl_timer *render_timer;
+    struct pl_timer *upload_timer;
+    struct pl_timer *download_timer;
+    uint64_t render_sum;
+    uint64_t upload_sum;
+    uint64_t download_sum;
+    int render_count;
+    int upload_count;
+    int download_count;
+
     // API #1: A simple pair of input and output textures
     const struct pl_tex *tex_in[MAX_PLANES];
     const struct pl_tex *tex_out[MAX_PLANES];
@@ -201,7 +218,11 @@ void *init(void) {
     if (!p)
         return NULL;
 
-    p->ctx = pl_context_create(PL_API_VER, NULL);
+    p->ctx = pl_context_create(PL_API_VER, &(struct pl_context_params) {
+        .log_cb = pl_log_simple,
+        .log_level = PL_LOG_WARN,
+    });
+
     if (!p->ctx) {
         fprintf(stderr, "Failed initializing libplacebo\n");
         goto error;
@@ -229,6 +250,10 @@ void *init(void) {
         goto error;
     }
 
+    p->render_timer = pl_timer_create(p->gpu);
+    p->upload_timer = pl_timer_create(p->gpu);
+    p->download_timer = pl_timer_create(p->gpu);
+
     return p;
 
 error:
@@ -257,6 +282,10 @@ void uninit(void *priv)
             image_unlock(p->entries[i].held_image);
     }
 
+    pl_timer_destroy(p->gpu, &p->render_timer);
+    pl_timer_destroy(p->gpu, &p->upload_timer);
+    pl_timer_destroy(p->gpu, &p->download_timer);
+
     pl_shader_obj_destroy(&p->dither_state);
     pl_dispatch_destroy(&p->dp);
     pl_vulkan_destroy(&p->vk);
@@ -309,9 +338,30 @@ bool do_plane(struct priv *p, const struct pl_tex *dst, const struct pl_tex *src
     return pl_dispatch_finish(p->dp, &(struct pl_dispatch_params) {
         .shader = &sh,
         .target = dst,
+        .timer  = p->render_timer,
     });
 }
 
+void check_timers(struct priv *p)
+{
+    uint64_t ret; // latest query result; 0 terminates the current loop
+    // Poll each timer until it returns 0, folding results into sum/count.
+    while ((ret = pl_timer_query(p->gpu, p->render_timer))) {
+        p->render_sum += ret;
+        p->render_count++;
+    }
+    // Same draining loop for upload_timer.
+    while ((ret = pl_timer_query(p->gpu, p->upload_timer))) {
+        p->upload_sum += ret;
+        p->upload_count++;
+    }
+    // Same draining loop for download_timer.
+    while ((ret = pl_timer_query(p->gpu, p->download_timer))) {
+        p->download_sum += ret;
+        p->download_count++;
+    }
+}
+
 // API #1 implementation:
 //
 // In this design, we will create all GPU resources inside `reconfig`, based on
@@ -371,6 +421,7 @@ bool api1_filter(void *priv, struct image *dst, struct image *src)
             .tex = p->tex_in[i],
             .stride_w = data[i].row_stride / data[i].pixel_stride,
             .ptr = src->planes[i].data,
+            .timer = p->upload_timer,
         });
 
         if (!ok) {
@@ -393,6 +444,7 @@ bool api1_filter(void *priv, struct image *dst, struct image *src)
             .tex = p->tex_out[i],
             .stride_w = dst->planes[i].stride / data[i].pixel_stride,
             .ptr = dst->planes[i].data,
+            .timer = p->download_timer,
         });
 
         if (!ok) {
@@ -401,6 +453,7 @@ bool api1_filter(void *priv, struct image *dst, struct image *src)
         }
     }
 
+    check_timers(p);
     return true;
 }
 
@@ -437,6 +490,7 @@ static enum api2_status submit_work(struct priv *p, struct entry *e,
         if (!fmt)
             return API2_ERR_FMT;
 
+        // FIXME: can we plumb a `pl_timer` in here somehow?
         if (!pl_upload_plane(p->gpu, NULL, &e->tex_in[i], &data[i]))
             return API2_ERR_UNKNOWN;
 
@@ -492,6 +546,7 @@ static enum api2_status submit_work(struct priv *p, struct entry *e,
             .stride_w = stride[i] / data[i].pixel_stride,
             .buf = e->buf,
             .buf_offset = offset[i],
+            .timer = p->download_timer,
         });
         if (!ok)
             return API2_ERR_UNKNOWN;
@@ -645,7 +700,7 @@ static const struct image example_image = {
 // API #1: Nice and simple (but slow)
 void api1_example(void)
 {
-    void *vf = init();
+    struct priv *vf = init();
     if (!vf)
         return;
 
@@ -690,6 +745,13 @@ void api1_example(void)
     printf("api1: %4u frames in %1.6f s => %2.6f ms/frame (%5.2f fps)\n",
            frames, secs, 1000 * secs / frames, frames / secs);
 
+    if (vf->render_count) {
+        printf("      render: %f ms, upload: %f ms, download: %f ms\n",
+               1e-6 * vf->render_sum / vf->render_count,
+               vf->upload_count ? (1e-6 * vf->upload_sum / vf->upload_count) : 0.0,
+               vf->download_count ? (1e-6 * vf->download_sum / vf->download_count) : 0.0);
+    }
+
 done:
     free(srcbuf);
     free(dstbuf);
@@ -708,7 +770,7 @@ static unsigned api2_frames_out = 0;
 
 void api2_example(void)
 {
-    void *vf = init();
+    struct priv *vf = init();
     if (!vf)
         return;
 
@@ -745,6 +807,7 @@ void api2_example(void)
 
         // Sleep a short time (100us) to prevent busy waiting the CPU
         nanosleep(&(struct timespec) { .tv_nsec = 100000 }, NULL);
+        check_timers(vf);
     }
 
     gettimeofday(&stop, NULL);
@@ -755,6 +818,13 @@ void api2_example(void)
            api2_frames_out, secs, 1000 * secs / api2_frames_out,
            api2_frames_out / secs);
 
+    if (vf->render_count) {
+        printf("      render: %f ms, upload: %f ms, download: %f ms\n",
+               1e-6 * vf->render_sum / vf->render_count,
+               vf->upload_count ? (1e-6 * vf->upload_sum / vf->upload_count) : 0.0,
+               vf->download_count ? (1e-6 * vf->download_sum / vf->download_count) : 0.0);
+    }
+
     for (int i = 0; i < POOLSIZE; i++) {
         if (images[i].associated_buf) {
             api2_free(vf, images[i].associated_buf);