diff --git a/src/vulkan/common.h b/src/vulkan/common.h
index daeba2019e894b366948247376d0d9888178c787..ead439c4999bbadee823d8410bf0205218f7c88d 100644
--- a/src/vulkan/common.h
+++ b/src/vulkan/common.h
@@ -46,6 +46,11 @@
 #define PL_VK_MAX_QUEUED_CMDS 1024
 #define PL_VK_MAX_PENDING_CMDS 1024
 
+// Compatibility alias for very old vulkan.h versions
+#ifndef VK_API_VERSION_1_2
+#define VK_API_VERSION_1_2 VK_MAKE_VERSION(1, 2, 0)
+#endif
+
 // Shared struct used to hold vulkan context information
 struct vk_ctx {
     void *ta; // allocations bound to the lifetime of this vk_ctx
@@ -205,6 +210,7 @@ struct vk_ctx {
     VK_FUN(QueueSubmit);
     VK_FUN(ResetEvent);
     VK_FUN(ResetFences);
+    VK_FUN(ResetQueryPoolEXT);
     VK_FUN(SetDebugUtilsObjectNameEXT);
     VK_FUN(SetHdrMetadataEXT);
     VK_FUN(UpdateDescriptorSets);
diff --git a/src/vulkan/context.c b/src/vulkan/context.c
index 89bef52118d482b37fe1c1c8ef145e117c9016c9..fac07eb818b0a9a8901f6cd17dd5c988764f6b2b 100644
--- a/src/vulkan/context.c
+++ b/src/vulkan/context.c
@@ -24,6 +24,7 @@ const struct pl_vk_inst_params pl_vk_inst_default_params = {0};
 
 struct vk_fun {
     const char *name;
+    const char *alias;
     size_t offset;
     bool device_level;
 };
@@ -45,6 +46,13 @@ struct vk_ext {
       .device_level = true,                 \
     }
 
+#define VK_DEV_FUN_ALIAS(N, ALIAS)          \
+    { .name = "vk" #N,                      \
+      .alias = #ALIAS,                      \
+      .offset = offsetof(struct vk_ctx, N), \
+      .device_level = true,                 \
+    }
+
 // Table of optional vulkan instance extensions
 static const char *vk_instance_extensions[] = {
     VK_KHR_SURFACE_EXTENSION_NAME,
@@ -156,6 +164,13 @@ static const struct vk_ext vk_device_extensions[] = {
             VK_DEV_FUN(SetHdrMetadataEXT),
             {0},
         },
+    }, {
+        .name = VK_EXT_HOST_QUERY_RESET_EXTENSION_NAME,
+        .core_ver = VK_API_VERSION_1_2,
+        .funs = (struct vk_fun[]) {
+            VK_DEV_FUN_ALIAS(ResetQueryPoolEXT, vkResetQueryPool),
+            {0},
+        },
     },
 };
 
@@ -173,13 +188,21 @@ const char * const pl_vulkan_recommended_extensions[] = {
 #endif
     VK_EXT_PCI_BUS_INFO_EXTENSION_NAME,
     VK_EXT_HDR_METADATA_EXTENSION_NAME,
+    VK_EXT_HOST_QUERY_RESET_EXTENSION_NAME,
 };
 
 const int pl_vulkan_num_recommended_extensions =
     PL_ARRAY_SIZE(pl_vulkan_recommended_extensions);
 
+// pNext chain of features we want enabled
+static const VkPhysicalDeviceHostQueryResetFeaturesEXT host_query_reset = {
+    .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_HOST_QUERY_RESET_FEATURES_EXT,
+    .hostQueryReset = true,
+};
+
 const VkPhysicalDeviceFeatures2KHR pl_vulkan_recommended_features = {
     .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2_KHR,
+    .pNext = (void *) &host_query_reset,
     .features = {
         .shaderImageGatherExtended = true,
 
@@ -1161,6 +1184,8 @@ static bool device_init(struct vk_ctx *vk, const struct pl_vulkan_params *params
         PFN_vkVoidFunction *pfn = (void *) ((uintptr_t) vk + (ptrdiff_t) fun->offset);
         if (fun->device_level) {
             *pfn = vk->GetDeviceProcAddr(vk->dev, fun->name);
+            if (fun->alias && !*pfn)
+                *pfn = vk->GetDeviceProcAddr(vk->dev, fun->alias);
         } else {
             *pfn = vk->GetInstanceProcAddr(vk->inst, fun->name);
         };
diff --git a/src/vulkan/gpu.c b/src/vulkan/gpu.c
index 8af8798d659ace8348a0da409e61f339b851112d..2e7140e94498b4b77d08056091dac675d15d9ff6 100644
--- a/src/vulkan/gpu.c
+++ b/src/vulkan/gpu.c
@@ -40,9 +40,10 @@ struct pl_vk {
     struct vk_malloc *alloc;
     struct spirv_compiler *spirv;
 
-    // Some additional cached device limits
+    // Some additional cached device limits and feature checks
     uint32_t max_push_descriptors;
     size_t min_texel_alignment;
+    bool host_query_reset;
 
     // This is a pl_dispatch used (on ourselves!) for the purposes of
     // dispatching compute shaders for performing various emulation tasks
@@ -119,7 +120,7 @@ static inline bool supports_marks(struct vk_cmd *cmd) {
     } while (0)
 
 #define MAKE_LAZY_DESTRUCTOR(fun, argtype)                                  \
-    static void fun##_lazy(const struct pl_gpu *gpu, const argtype *arg) {  \
+    static void fun##_lazy(const struct pl_gpu *gpu, argtype *arg) {        \
         struct pl_vk *p = TA_PRIV(gpu);                                     \
         struct vk_ctx *vk = p->vk;                                          \
         if (p->cmd) {                                                       \
@@ -419,6 +420,15 @@ const struct pl_gpu *pl_gpu_create_vk(struct vk_ctx *vk)
         p->max_push_descriptors = pushd.maxPushDescriptors;
     }
 
+    if (vk->ResetQueryPoolEXT) {
+        const VkPhysicalDeviceHostQueryResetFeaturesEXT *host_query_reset;
+        host_query_reset = vk_find_struct(&vk->features,
+            VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_HOST_QUERY_RESET_FEATURES_EXT);
+
+        if (host_query_reset)
+            p->host_query_reset = host_query_reset->hostQueryReset;
+    }
+
     // We ostensibly support this, although it can still fail on buffer
     // creation (for certain combinations of buffers)
     gpu->caps |= PL_GPU_CAP_MAPPED_BUFFERS;
@@ -678,7 +688,7 @@ static void vk_tex_destroy(const struct pl_gpu *gpu, struct pl_tex *tex)
     talloc_free(tex);
 }
 
-MAKE_LAZY_DESTRUCTOR(vk_tex_destroy, struct pl_tex)
+MAKE_LAZY_DESTRUCTOR(vk_tex_destroy, const struct pl_tex)
 
 static const VkFilter filters[] = {
     [PL_TEX_SAMPLE_NEAREST] = VK_FILTER_NEAREST,
@@ -1487,7 +1497,7 @@ static void buf_flush(const struct pl_gpu *gpu, struct vk_cmd *cmd,
 }
 
 #define vk_buf_destroy vk_buf_deref
-MAKE_LAZY_DESTRUCTOR(vk_buf_destroy, struct pl_buf)
+MAKE_LAZY_DESTRUCTOR(vk_buf_destroy, const struct pl_buf)
 
 static void vk_buf_write(const struct pl_gpu *gpu, const struct pl_buf *buf,
                          size_t offset, const void *data, size_t size)
@@ -2084,7 +2094,7 @@ static void vk_pass_destroy(const struct pl_gpu *gpu, struct pl_pass *pass)
     talloc_free(pass);
 }
 
-MAKE_LAZY_DESTRUCTOR(vk_pass_destroy, struct pl_pass)
+MAKE_LAZY_DESTRUCTOR(vk_pass_destroy, const struct pl_pass)
 
 static const VkDescriptorType dsType[] = {
     [PL_DESC_SAMPLED_TEX] = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
@@ -3079,27 +3089,29 @@ error:
 #define VK_QUERY_POOL_SIZE 16
 
 struct pl_timer {
-    int refcount;
     bool recording; // true between vk_cmd_timer_begin() and vk_cmd_timer_end()
     VkQueryPool qpool; // even=start, odd=stop
     int index_write; // next index to write to
     int index_read; // next index to read from
+    uint64_t pending; // bitmask of queries that are still running
 };
 
+static inline uint64_t timer_bit(int index)
+{
+    return 1llu << (index / 2);
+}
+
 static void vk_timer_destroy(const struct pl_gpu *gpu, struct pl_timer *timer)
 {
     struct pl_vk *p = TA_PRIV(gpu);
     struct vk_ctx *vk = p->vk;
 
+    pl_assert(!timer->pending);
     vk->DestroyQueryPool(vk->dev, timer->qpool, VK_ALLOC);
     talloc_free(timer);
 }
 
-static void vk_timer_deref(const struct pl_gpu *gpu, struct pl_timer *timer)
-{
-    if (--timer->refcount == 0)
-        vk_timer_destroy(gpu, timer);
-}
+MAKE_LAZY_DESTRUCTOR(vk_timer_destroy, struct pl_timer)
 
 static struct pl_timer *vk_timer_create(const struct pl_gpu *gpu)
 {
@@ -3107,9 +3119,7 @@ static struct pl_timer *vk_timer_create(const struct pl_gpu *gpu)
     struct vk_ctx *vk = p->vk;
 
     struct pl_timer *timer = talloc_ptrtype(NULL, timer);
-    *timer = (struct pl_timer) {
-        .refcount = 1,
-    };
+    *timer = (struct pl_timer) {0};
 
     struct VkQueryPoolCreateInfo qinfo = {
         .sType = VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO,
@@ -3167,19 +3177,36 @@ static void vk_cmd_timer_begin(const struct pl_gpu *gpu, struct vk_cmd *cmd,
         return;
     }
 
+    vk_poll_commands(vk, 0);
+    if (timer->pending & timer_bit(timer->index_write))
+        return; // next query is still running, skip this timer
+
     VkQueueFlags reset_flags = VK_QUEUE_GRAPHICS_BIT | VK_QUEUE_COMPUTE_BIT;
-    if (!(cmd->pool->props.queueFlags & reset_flags)) {
-        PL_TRACE(gpu, "QF %d does not support query pool resets", cmd->pool->qf);
+    if (cmd->pool->props.queueFlags & reset_flags) {
+        // Use direct command buffer resets
+        vk->CmdResetQueryPool(cmd->buf, timer->qpool, timer->index_write, 2);
+    } else if (p->host_query_reset) {
+        // Use host query resets
+        vk->ResetQueryPoolEXT(vk->dev, timer->qpool, timer->index_write, 2);
+    } else {
+        PL_TRACE(gpu, "QF %d supports no mechanism for resetting queries",
+                 cmd->pool->qf);
         return;
     }
 
-    vk->CmdResetQueryPool(cmd->buf, timer->qpool, timer->index_write, 2);
     vk->CmdWriteTimestamp(cmd->buf, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT,
                           timer->qpool, timer->index_write);
 
     timer->recording = true;
 }
 
+static void vk_timer_cb(void *ptimer, void *pindex)
+{
+    struct pl_timer *timer = ptimer;
+    int index = (uintptr_t) pindex;
+    timer->pending &= ~timer_bit(index);
+}
+
 static void vk_cmd_timer_end(const struct pl_gpu *gpu, struct vk_cmd *cmd,
                              struct pl_timer *timer)
 {
@@ -3192,18 +3219,18 @@ static void vk_cmd_timer_end(const struct pl_gpu *gpu, struct vk_cmd *cmd,
     vk->CmdWriteTimestamp(cmd->buf, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT,
                           timer->qpool, timer->index_write + 1);
 
+    timer->recording = false;
+    timer->pending |= timer_bit(timer->index_write);
+    vk_cmd_callback(cmd, (vk_cb) vk_timer_cb, timer,
+                    (void *) (uintptr_t) timer->index_write);
+
     timer->index_write = (timer->index_write + 2) % VK_QUERY_POOL_SIZE;
     if (timer->index_write == timer->index_read) {
         // forcibly drop the least recent result to make space
         timer->index_read = (timer->index_read + 2) % VK_QUERY_POOL_SIZE;
     }
-
-    timer->recording = false;
-    timer->refcount++;
-    vk_cmd_callback(cmd, (vk_cb) vk_timer_deref, gpu, timer);
 }
 
-
 static void vk_gpu_flush(const struct pl_gpu *gpu)
 {
     struct pl_vk *p = TA_PRIV(gpu);
@@ -3252,7 +3279,7 @@ static const struct pl_gpu_fns pl_fns_vk = {
     .sync_destroy           = vk_sync_deref,
     .tex_export             = vk_tex_export,
     .timer_create           = vk_timer_create,
-    .timer_destroy          = vk_timer_deref,
+    .timer_destroy          = vk_timer_destroy_lazy,
     .timer_query            = vk_timer_query,
     .gpu_flush              = vk_gpu_flush,
     .gpu_finish             = vk_gpu_finish,