/*
 * This file is part of libplacebo.
 *
 * libplacebo is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * libplacebo is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
 */

#include "gpu.h"
#include "command.h"
#include "formats.h"
#include "malloc.h"
#include "spirv.h"

#ifdef VK_HAVE_UNIX
#include <unistd.h>
#endif

static struct pl_gpu_fns pl_fns_vk;

enum queue_type {
    GRAPHICS,
    COMPUTE,
    TRANSFER,
};

// For gpu.priv
struct pl_vk {
    struct vk_ctx *vk;
    struct vk_malloc *alloc;
    struct spirv_compiler *spirv;

    // Some additional cached device limits
    uint32_t max_push_descriptors;
    size_t min_texel_alignment;

    // This is a pl_dispatch used (on ourselves!) for the purposes of
    // dispatching compute shaders for performing various emulation tasks
    // (e.g. partial clears, blits or emulated texture transfers).
    // Warning: Care must be taken to avoid recursive calls.
    struct pl_dispatch *dp;

    // The "currently recording" command. This will be queued and replaced by
    // a new command every time we need to "switch" between queue families.
    struct vk_cmd *cmd;
};

struct vk_ctx *pl_vk_get(const struct pl_gpu *gpu)
{
    if (gpu->impl != &pl_fns_vk)
        return NULL;

    struct pl_vk *p = gpu->priv;
    return p->vk;
}

static void vk_submit(const struct pl_gpu *gpu)
{
    struct pl_vk *p = gpu->priv;
    struct vk_ctx *vk = pl_vk_get(gpu);

    if (p->cmd) {
        vk_cmd_queue(vk, p->cmd);
        p->cmd = NULL;
    }
}

// Returns a command buffer, or NULL on error
static struct vk_cmd *vk_require_cmd(const struct pl_gpu *gpu,
                                     enum queue_type type)
{
    struct pl_vk *p = gpu->priv;
    struct vk_ctx *vk = pl_vk_get(gpu);

    struct vk_cmdpool *pool;
    switch (type) {
    case GRAPHICS: pool = vk->pool_graphics; break;
    case COMPUTE:  pool = vk->pool_compute;  break;

    // GRAPHICS and COMPUTE also imply TRANSFER capability (vulkan spec)
    case TRANSFER:
        pool = vk->pool_transfer;
        if (!pool)
            pool = vk->pool_compute;
        if (!pool)
            pool = vk->pool_graphics;
        break;
    default: abort();
    }

    pl_assert(pool);
    if (p->cmd && p->cmd->pool == pool)
        return p->cmd;

    vk_submit(gpu);
    p->cmd = vk_cmd_begin(vk, pool);
    return p->cmd;
}
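
/* Illustrative sketch (hypothetical, not part of this file's call graph): the
 * operations below all follow the same pattern of grabbing the current command
 * buffer for a suitable queue family, recording into it, and letting
 * vk_submit() queue it later. `src`, `dst` and `region` are placeholder names:
 *
 *     struct vk_cmd *cmd = vk_require_cmd(gpu, TRANSFER);
 *     if (!cmd)
 *         return; // failed to begin a command buffer
 *     vkCmdCopyBuffer(cmd->buf, src, dst, 1, &region);
 *     // ... more commands recorded against the same queue family ...
 */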

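// Note: resources may still be referenced by the currently recording command
// (or by work already submitted), so the *_lazy destructors generated by this
// macro never destroy anything immediately; they defer the real destructor
// through vk_cmd_callback / vk_dev_callback instead.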
#define MAKE_LAZY_DESTRUCTOR(fun, argtype)                                  \
    static void fun##_lazy(const struct pl_gpu *gpu, const argtype *arg) {  \
        struct pl_vk *p = gpu->priv;                                        \
        struct vk_ctx *vk = pl_vk_get(gpu);                                 \
        if (p->cmd) {                                                       \
            vk_cmd_callback(p->cmd, (vk_cb) fun, gpu, (void *) arg);        \
        } else {                                                            \
            vk_dev_callback(vk, (vk_cb) fun, gpu, (void *) arg);            \
        }                                                                   \
    }

static void vk_destroy_gpu(const struct pl_gpu *gpu)
{
    struct pl_vk *p = gpu->priv;
    struct vk_ctx *vk = pl_vk_get(gpu);

    pl_dispatch_destroy(&p->dp);
    vk_submit(gpu);
    vk_wait_idle(vk);

    vk_malloc_destroy(&p->alloc);
    spirv_compiler_destroy(&p->spirv);

    talloc_free((void *) gpu);
}

static void vk_setup_formats(struct pl_gpu *gpu)
{
    struct vk_ctx *vk = pl_vk_get(gpu);

    // Texture format emulation requires compute shaders and at least some
    // support for texel buffers
    bool has_emu = (gpu->caps & PL_GPU_CAP_COMPUTE) && gpu->limits.max_buffer_texels;

    for (const struct vk_format *vk_fmt = vk_formats; vk_fmt->ifmt; vk_fmt++) {
        VkFormatProperties prop;
        vkGetPhysicalDeviceFormatProperties(vk->physd, vk_fmt->ifmt, &prop);

        struct pl_fmt *fmt = talloc_ptrtype(gpu, fmt);
        *fmt = vk_fmt->fmt;
        fmt->priv = vk_fmt;

        // For sanity, clear the superfluous fields
        for (int i = fmt->num_components; i < 4; i++) {
            fmt->component_depth[i] = 0;
            fmt->sample_order[i] = 0;
            fmt->host_bits[i] = 0;
        }

        struct { VkFormatFeatureFlags flags; enum pl_fmt_caps caps; } bufbits[] = {
            {VK_FORMAT_FEATURE_VERTEX_BUFFER_BIT,        PL_FMT_CAP_VERTEX},
            {VK_FORMAT_FEATURE_UNIFORM_TEXEL_BUFFER_BIT, PL_FMT_CAP_TEXEL_UNIFORM},
            {VK_FORMAT_FEATURE_STORAGE_TEXEL_BUFFER_BIT, PL_FMT_CAP_TEXEL_STORAGE},
        };

        for (int i = 0; i < PL_ARRAY_SIZE(bufbits); i++) {
            if ((prop.bufferFeatures & bufbits[i].flags) == bufbits[i].flags)
                fmt->caps |= bufbits[i].caps;
        }

        if (fmt->caps) {
            fmt->glsl_type = pl_var_glsl_type_name(pl_var_from_fmt(fmt, ""));
            pl_assert(fmt->glsl_type);
        }

        // For the texture capabilities, try falling back to the emulation
        // format if this format is wholly unsupported.
        if (has_emu && !prop.optimalTilingFeatures && vk_fmt->emufmt) {
            fmt->emulated = true;
            vkGetPhysicalDeviceFormatProperties(vk->physd, vk_fmt->emufmt, &prop);
        }

        struct { VkFormatFeatureFlags flags; enum pl_fmt_caps caps; } bits[] = {
            {VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BLEND_BIT,      PL_FMT_CAP_BLENDABLE},
            {VK_FORMAT_FEATURE_SAMPLED_IMAGE_FILTER_LINEAR_BIT, PL_FMT_CAP_LINEAR},
            {VK_FORMAT_FEATURE_SAMPLED_IMAGE_BIT,               PL_FMT_CAP_SAMPLEABLE},
            {VK_FORMAT_FEATURE_STORAGE_IMAGE_BIT,               PL_FMT_CAP_STORABLE},
            {VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BIT,            PL_FMT_CAP_RENDERABLE},

            // We don't distinguish between the two blit modes for pl_fmt_caps
            {VK_FORMAT_FEATURE_BLIT_SRC_BIT | VK_FORMAT_FEATURE_BLIT_DST_BIT,
                PL_FMT_CAP_BLITTABLE},
        };

        for (int i = 0; i < PL_ARRAY_SIZE(bits); i++) {
            if ((prop.optimalTilingFeatures & bits[i].flags) == bits[i].flags)
                fmt->caps |= bits[i].caps;
        }

        // Disable implied capabilities where the dependencies are unavailable
        if (!(fmt->caps & PL_FMT_CAP_SAMPLEABLE))
            fmt->caps &= ~PL_FMT_CAP_LINEAR;
        if (!(gpu->caps & PL_GPU_CAP_COMPUTE))
            fmt->caps &= ~(PL_FMT_CAP_STORABLE | PL_FMT_CAP_TEXEL_STORAGE);

        enum pl_fmt_caps storable = PL_FMT_CAP_STORABLE | PL_FMT_CAP_TEXEL_STORAGE;
        if (fmt->caps & storable) {
            fmt->glsl_format = pl_fmt_glsl_format(fmt);
            if (!fmt->glsl_format) {
                PL_WARN(gpu, "Storable format '%s' has no matching GLSL format "
                        "qualifier?", fmt->name);
                fmt->caps &= ~storable;
            }
        }

        TARRAY_APPEND(gpu, gpu->formats, gpu->num_formats, fmt);
    }

    pl_gpu_sort_formats(gpu);
}

static pl_handle_caps vk_sync_handle_caps(struct vk_ctx *vk)
{
    pl_handle_caps caps = 0;

    if (!vk->vkGetPhysicalDeviceExternalSemaphorePropertiesKHR)
        return caps;

    for (int i = 0; vk_handle_list[i]; i++) {
        enum pl_handle_type type = vk_handle_list[i];

        VkPhysicalDeviceExternalSemaphoreInfoKHR info = {
            .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTERNAL_SEMAPHORE_INFO_KHR,
            .handleType = vk_handle_type(type),
        };

        VkExternalSemaphorePropertiesKHR props = {
            .sType = VK_STRUCTURE_TYPE_EXTERNAL_SEMAPHORE_PROPERTIES_KHR,
        };

        vk->vkGetPhysicalDeviceExternalSemaphorePropertiesKHR(vk->physd, &info, &props);
        VkExternalSemaphoreFeatureFlagsKHR flags = props.externalSemaphoreFeatures;
        if ((props.compatibleHandleTypes & info.handleType) &&
            (flags & VK_EXTERNAL_SEMAPHORE_FEATURE_EXPORTABLE_BIT_KHR))
        {
            caps |= type;
        }
    }

    return caps;
}

const struct pl_gpu *pl_gpu_create_vk(struct vk_ctx *vk)
{
    pl_assert(vk->dev);

    struct pl_gpu *gpu = talloc_zero(NULL, struct pl_gpu);
    gpu->ctx = vk->ctx;
    gpu->impl = &pl_fns_vk;

    struct pl_vk *p = gpu->priv = talloc_zero(gpu, struct pl_vk);
    p->vk = vk;

    p->spirv = spirv_compiler_create(vk->ctx);
    p->alloc = vk_malloc_create(vk);
    if (!p->alloc || !p->spirv)
        goto error;

    gpu->glsl = p->spirv->glsl;
    gpu->limits = (struct pl_gpu_limits) {
        .max_tex_1d_dim    = vk->limits.maxImageDimension1D,
        .max_tex_2d_dim    = vk->limits.maxImageDimension2D,
        .max_tex_3d_dim    = vk->limits.maxImageDimension3D,
        .max_pushc_size    = vk->limits.maxPushConstantsSize,
        .max_xfer_size     = SIZE_MAX, // no limit imposed by vulkan
        .max_ubo_size      = vk->limits.maxUniformBufferRange,
        .max_ssbo_size     = vk->limits.maxStorageBufferRange,
        .max_buffer_texels = vk->limits.maxTexelBufferElements,
        .min_gather_offset = vk->limits.minTexelGatherOffset,
        .max_gather_offset = vk->limits.maxTexelGatherOffset,
        .align_tex_xfer_stride = vk->limits.optimalBufferCopyRowPitchAlignment,
        .align_tex_xfer_offset = pl_lcm(vk->limits.optimalBufferCopyOffsetAlignment, 4),
    };

    gpu->handle_caps.shared_mem = vk_malloc_handle_caps(p->alloc);
    gpu->handle_caps.sync = vk_sync_handle_caps(vk);

    if (pl_gpu_supports_interop(gpu)) {
        VkPhysicalDeviceIDPropertiesKHR id_props = {
            .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ID_PROPERTIES_KHR,
        };

        VkPhysicalDeviceProperties2KHR props = {
            .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2_KHR,
            .pNext = &id_props,
        };

        vk->vkGetPhysicalDeviceProperties2KHR(vk->physd, &props);
        assert(sizeof(gpu->uuid) == VK_UUID_SIZE);
        memcpy(gpu->uuid, id_props.deviceUUID, sizeof(gpu->uuid));
    }

    if (vk->vkCmdPushDescriptorSetKHR) {
        VkPhysicalDevicePushDescriptorPropertiesKHR pushd = {
            .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PUSH_DESCRIPTOR_PROPERTIES_KHR,
        };

        VkPhysicalDeviceProperties2KHR props = {
            .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2_KHR,
            .pNext = &pushd,
        };

        vk->vkGetPhysicalDeviceProperties2KHR(vk->physd, &props);
        p->max_push_descriptors = pushd.maxPushDescriptors;
    }

    if (vk->pool_compute) {
        gpu->caps |= PL_GPU_CAP_COMPUTE;
        gpu->limits.max_shmem_size = vk->limits.maxComputeSharedMemorySize;
        gpu->limits.max_group_threads = vk->limits.maxComputeWorkGroupInvocations;
        for (int i = 0; i < 3; i++) {
            gpu->limits.max_group_size[i] = vk->limits.maxComputeWorkGroupSize[i];
            gpu->limits.max_dispatch[i] = vk->limits.maxComputeWorkGroupCount[i];
        }

        // If we have more compute queues than graphics queues, we probably
        // want to be using them. (This seems mostly relevant for AMD)
        if (vk->pool_compute->num_queues > vk->pool_graphics->num_queues)
            gpu->caps |= PL_GPU_CAP_PARALLEL_COMPUTE;
    }

    if (!vk->features.shaderImageGatherExtended) {
        gpu->limits.min_gather_offset = 0;
        gpu->limits.max_gather_offset = 0;
    }

    vk_setup_formats(gpu);

    // Compute the correct minimum texture alignment
    p->min_texel_alignment = 1;
    for (int i = 0; i < gpu->num_formats; i++) {
        size_t texel_size = gpu->formats[i]->texel_size;
        p->min_texel_alignment = pl_lcm(p->min_texel_alignment, texel_size);
    }
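    // (e.g. if the format list contains 1-, 2- and 4-byte texels, the running
    // LCM works out to 4, so any upload offset aligned to 4 bytes is valid
    // for all of these formats)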
    PL_DEBUG(gpu, "Minimum texel alignment: %zu", p->min_texel_alignment);

    // Create the dispatch last, after any setup of `gpu` is done
    p->dp = pl_dispatch_create(vk->ctx, gpu);
    pl_gpu_print_info(gpu, PL_LOG_INFO);
    pl_gpu_print_formats(gpu, PL_LOG_DEBUG);
    return gpu;

error:
    vk_destroy_gpu(gpu);
    return NULL;
}

// Boilerplate wrapper around vkCreateRenderPass to ensure passes remain
// compatible. The renderpass will automatically transition the image out of
// initialLayout and into finalLayout.
static VkResult vk_create_render_pass(VkDevice dev, const struct pl_fmt *fmt,
                                      VkAttachmentLoadOp loadOp,
                                      VkImageLayout initialLayout,
                                      VkImageLayout finalLayout,
                                      VkRenderPass *out)
{
    const struct vk_format *vk_fmt = fmt->priv;

    VkRenderPassCreateInfo rinfo = {
        .sType = VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO,
        .attachmentCount = 1,
        .pAttachments = &(VkAttachmentDescription) {
            .format = fmt->emulated ? vk_fmt->emufmt : vk_fmt->ifmt,
            .samples = VK_SAMPLE_COUNT_1_BIT,
            .loadOp = loadOp,
            .storeOp = VK_ATTACHMENT_STORE_OP_STORE,
            .initialLayout = initialLayout,
            .finalLayout = finalLayout,
        },
        .subpassCount = 1,
        .pSubpasses = &(VkSubpassDescription) {
            .pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS,
            .colorAttachmentCount = 1,
            .pColorAttachments = &(VkAttachmentReference) {
                .attachment = 0,
                .layout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL,
            },
        },
    };

    return vkCreateRenderPass(dev, &rinfo, VK_ALLOC, out);
}

// For pl_tex.priv
struct pl_tex_vk {
    bool held;
    bool external_img;
    bool may_invalidate;
    enum queue_type transfer_queue;
    VkImageType type;
    VkImage img;
    struct vk_memslice mem;
    // cached properties
    VkFormat img_fmt;
    VkImageUsageFlags usage_flags;
    // for sampling
    VkImageView view;
    VkSampler sampler;
    // for rendering
    VkFramebuffer framebuffer;
    // for transfers
    struct pl_buf_pool pbo_write;
    struct pl_buf_pool pbo_read;
    // for vk_tex_upload/download fallback code
    const struct pl_fmt *texel_fmt;
    struct pl_buf_pool tmp_write;
    struct pl_buf_pool tmp_read;
    // "current" metadata, can change during the course of execution
    VkImageLayout current_layout;
    VkAccessFlags current_access;
    // the signal guards reuse, and can be NULL
    struct vk_signal *sig;
    VkPipelineStageFlags sig_stage;
    VkSemaphore *ext_deps; // external semaphore, not owned by the pl_tex
    int num_ext_deps;
    const struct pl_sync *ext_sync; // indicates an exported image
};

void pl_tex_vk_external_dep(const struct pl_gpu *gpu, const struct pl_tex *tex,
                            VkSemaphore external_dep)
{
    if (!external_dep)
        return;

    struct pl_tex_vk *tex_vk = tex->priv;
    TARRAY_APPEND(tex_vk, tex_vk->ext_deps, tex_vk->num_ext_deps, external_dep);
}

static void vk_sync_deref(const struct pl_gpu *gpu, const struct pl_sync *sync);

// Small helper to ease image barrier creation. If the texture's `may_invalidate`
// flag is set, the contents of the image will be undefined after the barrier
static void tex_barrier(const struct pl_gpu *gpu, struct vk_cmd *cmd,
                        const struct pl_tex *tex, VkPipelineStageFlags stage,
                        VkAccessFlags newAccess, VkImageLayout newLayout,
                        bool export)
{
    struct vk_ctx *vk = pl_vk_get(gpu);
    struct pl_tex_vk *tex_vk = tex->priv;
    assert(!tex_vk->held);

    for (int i = 0; i < tex_vk->num_ext_deps; i++)
        vk_cmd_dep(cmd, tex_vk->ext_deps[i], stage);
    tex_vk->num_ext_deps = 0;

    VkImageMemoryBarrier imgBarrier = {
        .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER,
        .oldLayout = tex_vk->current_layout,
        .newLayout = newLayout,
        .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
        .dstQueueFamilyIndex = export ? VK_QUEUE_FAMILY_EXTERNAL_KHR
                                      : VK_QUEUE_FAMILY_IGNORED,
        .srcAccessMask = tex_vk->current_access,
        .dstAccessMask = newAccess,
        .image = tex_vk->img,
        .subresourceRange = {
            .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
            .levelCount = 1,
            .layerCount = 1,
        },
    };

    if (tex_vk->ext_sync) {
        if (tex_vk->current_layout != VK_IMAGE_LAYOUT_UNDEFINED)
            imgBarrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_EXTERNAL_KHR;
        vk_cmd_callback(cmd, (vk_cb) vk_sync_deref, gpu, tex_vk->ext_sync);
        tex_vk->ext_sync = NULL;
    }

    if (tex_vk->may_invalidate) {
        tex_vk->may_invalidate = false;
        imgBarrier.oldLayout = VK_IMAGE_LAYOUT_UNDEFINED;
        imgBarrier.srcAccessMask = 0;
    }

    VkEvent event = VK_NULL_HANDLE;
    enum vk_wait_type type = vk_cmd_wait(vk, cmd, &tex_vk->sig, stage, &event);

    bool need_trans = tex_vk->current_layout != newLayout ||
                      tex_vk->current_access != newAccess ||
                      (imgBarrier.srcQueueFamilyIndex !=
                       imgBarrier.dstQueueFamilyIndex);

    // Transitioning to VK_IMAGE_LAYOUT_UNDEFINED is a pseudo-operation
    // that for us means we don't need to perform the actual transition
    if (need_trans && newLayout != VK_IMAGE_LAYOUT_UNDEFINED) {
        switch (type) {
        case VK_WAIT_NONE:
            // No synchronization required, so we can safely transition out of
            // VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT
            imgBarrier.srcAccessMask = 0;
            vkCmdPipelineBarrier(cmd->buf, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT,
                                 stage, 0, 0, NULL, 0, NULL, 1, &imgBarrier);
            break;
        case VK_WAIT_BARRIER:
            // Regular pipeline barrier is required
            vkCmdPipelineBarrier(cmd->buf, tex_vk->sig_stage, stage, 0, 0, NULL,
                                 0, NULL, 1, &imgBarrier);
            break;
        case VK_WAIT_EVENT:
            // We can/should use the VkEvent for synchronization
            vkCmdWaitEvents(cmd->buf, 1, &event, tex_vk->sig_stage,
                            stage, 0, NULL, 0, NULL, 1, &imgBarrier);
            break;
        }
    }

    tex_vk->current_layout = newLayout;
    tex_vk->current_access = newAccess;
}

static void tex_signal(const struct pl_gpu *gpu, struct vk_cmd *cmd,
                       const struct pl_tex *tex, VkPipelineStageFlags stage)
{
    struct pl_tex_vk *tex_vk = tex->priv;
    struct vk_ctx *vk = pl_vk_get(gpu);
    pl_assert(!tex_vk->sig);

    tex_vk->sig = vk_cmd_signal(vk, cmd, stage);
    tex_vk->sig_stage = stage;
}
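
/* Rough sketch of how tex_barrier/tex_signal pair up in the operations below
 * (illustrative only): each use of an image is preceded by a barrier into the
 * required layout/access and followed by a signal, so the next user of the
 * image can synchronize against this command:
 *
 *     tex_barrier(gpu, cmd, tex, VK_PIPELINE_STAGE_TRANSFER_BIT,
 *                 VK_ACCESS_TRANSFER_WRITE_BIT,
 *                 VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, false);
 *     // ... record transfer commands involving tex_vk->img ...
 *     tex_signal(gpu, cmd, tex, VK_PIPELINE_STAGE_TRANSFER_BIT);
 */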

static void vk_tex_destroy(const struct pl_gpu *gpu, struct pl_tex *tex)
{
    if (!tex)
        return;

    struct vk_ctx *vk = pl_vk_get(gpu);
    struct pl_tex_vk *tex_vk = tex->priv;
    struct pl_vk *p = gpu->priv;

    pl_buf_pool_uninit(gpu, &tex_vk->tmp_write);
    pl_buf_pool_uninit(gpu, &tex_vk->tmp_read);
    pl_buf_pool_uninit(gpu, &tex_vk->pbo_write);
    pl_buf_pool_uninit(gpu, &tex_vk->pbo_read);
    vk_sync_deref(gpu, tex_vk->ext_sync);
    vk_signal_destroy(vk, &tex_vk->sig);
    vkDestroyFramebuffer(vk->dev, tex_vk->framebuffer, VK_ALLOC);
    vkDestroySampler(vk->dev, tex_vk->sampler, VK_ALLOC);
    vkDestroyImageView(vk->dev, tex_vk->view, VK_ALLOC);
    if (!tex_vk->external_img) {
        vkDestroyImage(vk->dev, tex_vk->img, VK_ALLOC);
        vk_free_memslice(p->alloc, tex_vk->mem);
    }

    talloc_free(tex);
}

MAKE_LAZY_DESTRUCTOR(vk_tex_destroy, struct pl_tex);

static const VkFilter filters[] = {
    [PL_TEX_SAMPLE_NEAREST] = VK_FILTER_NEAREST,
    [PL_TEX_SAMPLE_LINEAR]  = VK_FILTER_LINEAR,
};

// Initializes non-VkImage values like the image view, samplers, etc.
static bool vk_init_image(const struct pl_gpu *gpu, const struct pl_tex *tex)
{
    struct vk_ctx *vk = pl_vk_get(gpu);

    const struct pl_tex_params *params = &tex->params;
    struct pl_tex_vk *tex_vk = tex->priv;
    pl_assert(tex_vk->img);

    tex_vk->current_layout = VK_IMAGE_LAYOUT_UNDEFINED;
    tex_vk->current_access = 0;
    tex_vk->transfer_queue = GRAPHICS;

    // Always use the transfer pool if available, for efficiency
    if ((params->host_writable || params->host_readable) && vk->pool_transfer)
        tex_vk->transfer_queue = TRANSFER;

    // For emulated formats: force usage of the compute queue, because we
    // can't properly track cross-queue dependencies for buffers (yet?)
    if (params->format->emulated)
        tex_vk->transfer_queue = COMPUTE;

    bool ret = false;
    VkRenderPass dummyPass = VK_NULL_HANDLE;

    if (params->sampleable || params->renderable) {
        static const VkImageViewType viewType[] = {
            [VK_IMAGE_TYPE_1D] = VK_IMAGE_VIEW_TYPE_1D,
            [VK_IMAGE_TYPE_2D] = VK_IMAGE_VIEW_TYPE_2D,
            [VK_IMAGE_TYPE_3D] = VK_IMAGE_VIEW_TYPE_3D,
        };

        const struct vk_format *fmt = params->format->priv;
        VkImageViewCreateInfo vinfo = {
            .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO,
            .image = tex_vk->img,
            .viewType = viewType[tex_vk->type],
            .format = params->format->emulated ? fmt->emufmt : fmt->ifmt,
            .subresourceRange = {
                .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
                .levelCount = 1,
                .layerCount = 1,
            },
        };

        VK(vkCreateImageView(vk->dev, &vinfo, VK_ALLOC, &tex_vk->view));
    }

    if (params->sampleable) {
        static const VkSamplerAddressMode modes[] = {
            [PL_TEX_ADDRESS_CLAMP]  = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE,
            [PL_TEX_ADDRESS_REPEAT] = VK_SAMPLER_ADDRESS_MODE_REPEAT,
            [PL_TEX_ADDRESS_MIRROR] = VK_SAMPLER_ADDRESS_MODE_MIRRORED_REPEAT,
        };

        VkSamplerCreateInfo sinfo = {
            .sType = VK_STRUCTURE_TYPE_SAMPLER_CREATE_INFO,
            .magFilter = filters[params->sample_mode],
            .minFilter = filters[params->sample_mode],
            .addressModeU = modes[params->address_mode],
            .addressModeV = modes[params->address_mode],
            .addressModeW = modes[params->address_mode],
            .maxAnisotropy = 1.0,
        };

        VK(vkCreateSampler(vk->dev, &sinfo, VK_ALLOC, &tex_vk->sampler));
    }

    if (params->renderable) {
        // Framebuffers need to be created against a specific render pass
        // layout, so we need to temporarily create a skeleton/dummy render
        // pass for vulkan to figure out the compatibility
        VK(vk_create_render_pass(vk->dev, params->format,
                                 VK_ATTACHMENT_LOAD_OP_DONT_CARE,
                                 VK_IMAGE_LAYOUT_UNDEFINED,
                                 VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL,
                                 &dummyPass));

        VkFramebufferCreateInfo finfo = {
            .sType = VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO,
            .renderPass = dummyPass,
            .attachmentCount = 1,
            .pAttachments = &tex_vk->view,
            .width = tex->params.w,
            .height = tex->params.h,
            .layers = 1,
        };

        if (finfo.width > vk->limits.maxFramebufferWidth ||
            finfo.height > vk->limits.maxFramebufferHeight)
        {
            PL_ERR(gpu, "Framebuffer of size %dx%d exceeds the maximum allowed "
                   "dimensions: %dx%d", finfo.width, finfo.height,
                   vk->limits.maxFramebufferWidth,
                   vk->limits.maxFramebufferHeight);
            goto error;
        }

        VK(vkCreateFramebuffer(vk->dev, &finfo, VK_ALLOC,
                               &tex_vk->framebuffer));
    }

    ret = true;

error:
    vkDestroyRenderPass(vk->dev, dummyPass, VK_ALLOC);
    return ret;
}

static const struct pl_tex *vk_tex_create(const struct pl_gpu *gpu,
                                          const struct pl_tex_params *params)
{
    struct vk_ctx *vk = pl_vk_get(gpu);
    struct pl_vk *p = gpu->priv;

    struct pl_tex *tex = talloc_zero(NULL, struct pl_tex);
    tex->params = *params;
    tex->params.initial_data = NULL;

    struct pl_tex_vk *tex_vk = tex->priv = talloc_zero(tex, struct pl_tex_vk);

    const struct vk_format *fmt = params->format->priv;
    switch (pl_tex_params_dimension(*params)) {
    case 1: tex_vk->type = VK_IMAGE_TYPE_1D; break;
    case 2: tex_vk->type = VK_IMAGE_TYPE_2D; break;
    case 3: tex_vk->type = VK_IMAGE_TYPE_3D; break;
    default: abort();
    }

    if (params->format->emulated) {
        tex_vk->texel_fmt = pl_find_fmt(gpu, params->format->type, 1, 0,
                                        params->format->host_bits[0],
                                        PL_FMT_CAP_TEXEL_UNIFORM);
        if (!tex_vk->texel_fmt) {
            PL_ERR(gpu, "Failed picking texel format for emulated texture!");
            goto error;
        }

        // Statically check to see if we'd even be able to upload it at all
        // and refuse right away if not. In theory, uploading can still fail
        // based on the size of pl_tex_transfer_params.stride_w, but for now
        // this should be enough.
        uint64_t texels = params->w * params->h * params->d *
                          params->format->num_components;

        if (texels > gpu->limits.max_buffer_texels) {
            PL_ERR(gpu, "Failed creating texture with emulated texture format: "
                   "texture dimensions exceed maximum texel buffer size! Try "
                   "again with a different (non-emulated) format?");
            goto error;
        }

        // Our format emulation requires storage image support. In order to
        // make a bunch of checks happy, just mark it off as storable (and also
        // enable VK_IMAGE_USAGE_STORAGE_BIT, which we do below)
        tex->params.storable = true;
    }

    VkImageUsageFlags usage = 0;
    if (params->sampleable)
        usage |= VK_IMAGE_USAGE_SAMPLED_BIT;
    if (params->renderable)
        usage |= VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT;
    if (params->storable || params->format->emulated)
        usage |= VK_IMAGE_USAGE_STORAGE_BIT;
    if (params->host_readable || params->blit_src)
        usage |= VK_IMAGE_USAGE_TRANSFER_SRC_BIT;
    if (params->host_writable || params->blit_dst || params->initial_data)
        usage |= VK_IMAGE_USAGE_TRANSFER_DST_BIT;

733 734 735 736 737 738 739
    if (!usage) {
        // Vulkan requires images have at least *some* image usage set, but our
        // API is perfectly happy with a (useless) image. So just put
        // VK_IMAGE_USAGE_TRANSFER_DST_BIT since this harmless.
        usage = VK_IMAGE_USAGE_TRANSFER_DST_BIT;
    }

    // FIXME: Since we can't keep track of queue family ownership properly,
    // and we don't know in advance what types of queue families this image
    // will belong to, we're forced to share all of our images between all
    // command pools.
    uint32_t qfs[3] = {0};
    for (int i = 0; i < vk->num_pools; i++)
        qfs[i] = vk->pools[i]->qf;

    VkExternalMemoryImageCreateInfoKHR ext_info = {
        .sType = VK_STRUCTURE_TYPE_EXTERNAL_MEMORY_IMAGE_CREATE_INFO_KHR,
        .handleTypes = vk_handle_type(params->handle_type),
    };

    VkImageCreateInfo iinfo = {
        .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO,
        .pNext = params->handle_type ? &ext_info : NULL,
        .imageType = tex_vk->type,
        .format = params->format->emulated ? fmt->emufmt : fmt->ifmt,
        .extent = (VkExtent3D) {
            .width  = params->w,
            .height = PL_MAX(1, params->h),
            .depth  = PL_MAX(1, params->d)
        },
        .mipLevels = 1,
        .arrayLayers = 1,
        .samples = VK_SAMPLE_COUNT_1_BIT,
        .tiling = VK_IMAGE_TILING_OPTIMAL,
        .usage = usage,
        .initialLayout = VK_IMAGE_LAYOUT_UNDEFINED,
        .sharingMode = vk->num_pools > 1 ? VK_SHARING_MODE_CONCURRENT
                                         : VK_SHARING_MODE_EXCLUSIVE,
        .queueFamilyIndexCount = vk->num_pools,
        .pQueueFamilyIndices = qfs,
    };

    // Double-check physical image format limits and fail if invalid
    VkPhysicalDeviceExternalImageFormatInfoKHR ext_pinfo = {
        .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTERNAL_IMAGE_FORMAT_INFO_KHR,
        .handleType = ext_info.handleTypes,
    };

    VkPhysicalDeviceImageFormatInfo2KHR pinfo = {
        .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_IMAGE_FORMAT_INFO_2_KHR,
        .pNext = params->handle_type ? &ext_pinfo : NULL,
        .format = iinfo.format,
        .type = iinfo.imageType,
        .tiling = iinfo.tiling,
        .usage = iinfo.usage,
        .flags = iinfo.flags,
    };

    VkExternalImageFormatPropertiesKHR ext_props = {
        .sType = VK_STRUCTURE_TYPE_EXTERNAL_IMAGE_FORMAT_PROPERTIES_KHR,
    };

    VkImageFormatProperties2KHR props = {
        .sType = VK_STRUCTURE_TYPE_IMAGE_FORMAT_PROPERTIES_2_KHR,
        .pNext = params->handle_type ? &ext_props : NULL,
    };

    VkResult res;
    res = vk->vkGetPhysicalDeviceImageFormatProperties2KHR(vk->physd, &pinfo, &props);
    if (res == VK_ERROR_FORMAT_NOT_SUPPORTED) {
        goto error;
    } else {
        VK_ASSERT(res, "Querying image format properties");
    }

    VkExtent3D max = props.imageFormatProperties.maxExtent;
    if (params->w > max.width || params->h > max.height || params->d > max.depth)
    {
        PL_ERR(gpu, "Requested image size %dx%dx%d exceeds the maximum allowed "
               "dimensions %dx%dx%d for vulkan image format %x",
               params->w, params->h, params->d, max.width, max.height, max.depth,
               (unsigned) iinfo.format);
        goto error;
    }

    // Ensure the handle types are supported
    if (params->handle_type) {
        bool ok = vk_external_mem_check(&ext_props.externalMemoryProperties,
                                        params->handle_type);
        if (!ok) {
            PL_ERR(gpu, "Requested handle type is not compatible with the "
                   "specified combination of image parameters. Possibly the "
                   "handle type is unsupported altogether?");
            goto error;
        }
    }

    VK(vkCreateImage(vk->dev, &iinfo, VK_ALLOC, &tex_vk->img));
    tex_vk->img_fmt = iinfo.format;
    tex_vk->usage_flags = iinfo.usage;
833 834 835 836 837 838

    VkMemoryPropertyFlags memFlags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT;
    VkMemoryRequirements reqs;
    vkGetImageMemoryRequirements(vk->dev, tex_vk->img, &reqs);

    struct vk_memslice *mem = &tex_vk->mem;
    if (!vk_malloc_generic(p->alloc, reqs, memFlags, params->handle_type, mem))
        goto error;

    VK(vkBindImageMemory(vk->dev, tex_vk->img, mem->vkmem, mem->offset));

    if (!vk_init_image(gpu, tex))
        goto error;

847 848
    if (params->handle_type) {
        tex->shared_mem = tex_vk->mem.shared_mem;
        // Texture is not initially exported;
        // pl_vulkan_hold must be used to export it.
    }

    if (params->initial_data) {
        struct pl_tex_transfer_params ul_params = {
            .tex = tex,
            .ptr = (void *) params->initial_data,
            .rc = { 0, 0, 0, params->w, params->h, params->d },
            .stride_w = params->w,
            .stride_h = params->h,
        };

        // Since we re-use GPU helpers which require writable images, just fake it
        bool writable = tex->params.host_writable;
        tex->params.host_writable = true;
        if (!pl_tex_upload(gpu, &ul_params))
            goto error;
        tex->params.host_writable = writable;
    }

    return tex;

error:
    vk_tex_destroy(gpu, tex);
    return NULL;
}

static void vk_tex_invalidate(const struct pl_gpu *gpu, const struct pl_tex *tex)
{
    struct pl_tex_vk *tex_vk = tex->priv;
    tex_vk->may_invalidate = true;
}

static void vk_tex_clear(const struct pl_gpu *gpu, const struct pl_tex *tex,
                         const float color[4])
{
    struct pl_tex_vk *tex_vk = tex->priv;

    struct vk_cmd *cmd = vk_require_cmd(gpu, GRAPHICS);
    if (!cmd)
        return;

    tex_barrier(gpu, cmd, tex, VK_PIPELINE_STAGE_TRANSFER_BIT,
                VK_ACCESS_TRANSFER_WRITE_BIT,
                VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
                false);

    VkClearColorValue clearColor = {0};
    for (int c = 0; c < 4; c++)
        clearColor.float32[c] = color[c];

    static const VkImageSubresourceRange range = {
        .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
        .levelCount = 1,
        .layerCount = 1,
    };

    vkCmdClearColorImage(cmd->buf, tex_vk->img, tex_vk->current_layout,
                         &clearColor, 1, &range);

910
    tex_signal(gpu, cmd, tex, VK_PIPELINE_STAGE_TRANSFER_BIT);
Niklas Haas's avatar

913 914
static void vk_tex_blit(const struct pl_gpu *gpu,
                        const struct pl_tex *dst, const struct pl_tex *src,
Niklas Haas's avatar
{
917 918
    struct pl_tex_vk *src_vk = src->priv;
    struct pl_tex_vk *dst_vk = dst->priv;
Niklas Haas's avatar
920
    struct vk_cmd *cmd = vk_require_cmd(gpu, GRAPHICS);
Niklas Haas's avatar
        return;

924
    tex_barrier(gpu, cmd, src, VK_PIPELINE_STAGE_TRANSFER_BIT,
Niklas Haas's avatar
926 927
                VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
                false);
Niklas Haas's avatar
929
    tex_barrier(gpu, cmd, dst, VK_PIPELINE_STAGE_TRANSFER_BIT,
Niklas Haas's avatar
931 932
                VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
                false);
Niklas Haas's avatar
    static const VkImageSubresourceLayers layers = {
        .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
        .layerCount = 1,
    };

    // When the blit operation doesn't require scaling, we can use the more
    // efficient vkCmdCopyImage instead of vkCmdBlitImage
    if (pl_rect3d_eq(src_rc, dst_rc)) {
        pl_rect3d_normalize(&src_rc);
        pl_rect3d_normalize(&dst_rc);

        VkImageCopy region = {
            .srcSubresource = layers,
            .dstSubresource = layers,
            .srcOffset = {src_rc.x0, src_rc.y0, src_rc.z0},
            .dstOffset = {dst_rc.x0, dst_rc.y0, dst_rc.z0},
            .extent = {
                pl_rect_w(src_rc),
                pl_rect_h(src_rc),
                pl_rect_d(src_rc),
            },
        };

        vkCmdCopyImage(cmd->buf, src_vk->img, src_vk->current_layout,
                       dst_vk->img, dst_vk->current_layout, 1, &region);
    } else {
        VkImageBlit region = {
            .srcSubresource = layers,
            .dstSubresource = layers,
            .srcOffsets = {{src_rc.x0, src_rc.y0, src_rc.z0},
                           {src_rc.x1, src_rc.y1, src_rc.z1}},
            .dstOffsets = {{dst_rc.x0, dst_rc.y0, dst_rc.z0},
                           {dst_rc.x1, dst_rc.y1, dst_rc.z1}},
        };

        vkCmdBlitImage(cmd->buf, src_vk->img, src_vk->current_layout,
                       dst_vk->img, dst_vk->current_layout, 1, &region,
971
                       filters[src->params.sample_mode]);
Niklas Haas's avatar

974 975
    tex_signal(gpu, cmd, src, VK_PIPELINE_STAGE_TRANSFER_BIT);
    tex_signal(gpu, cmd, dst, VK_PIPELINE_STAGE_TRANSFER_BIT);
Niklas Haas's avatar

978 979 980 981
const struct pl_tex *pl_vulkan_wrap(const struct pl_gpu *gpu,
                                    VkImage image, int w, int h, int d,
                                    VkFormat imageFormat,
                                    VkImageUsageFlags imageUsage)
982
{
983
    struct pl_tex *tex = NULL;
984

985 986 987
    const struct pl_fmt *format = NULL;
    for (int i = 0; i < gpu->num_formats; i++) {
        const struct vk_format *fmt = gpu->formats[i]->priv;
988
        if (fmt->ifmt == imageFormat) {
989
            format = gpu->formats[i];
990 991 992 993 994
            break;
        }
    }

    if (!format) {
995 996
        PL_ERR(gpu, "Could not find pl_fmt suitable for wrapped image "
               "with VkFormat 0x%x\n", (unsigned) imageFormat);
997 998 999
        goto error;
    }

1000 1001
    tex = talloc_zero(NULL, struct pl_tex);
    tex->params = (struct pl_tex_params) {
1002
        .format = format,
1003 1004 1005 1006 1007 1008 1009 1010 1011 1012
        .w = w,
        .h = h,
        .d = d,
        .sampleable  = !!(imageUsage & VK_IMAGE_USAGE_SAMPLED_BIT),
        .renderable  = !!(imageUsage & VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT),
        .storable    = !!(imageUsage & VK_IMAGE_USAGE_STORAGE_BIT),
        .blit_src    = !!(imageUsage & VK_IMAGE_USAGE_TRANSFER_SRC_BIT),
        .blit_dst    = !!(imageUsage & VK_IMAGE_USAGE_TRANSFER_DST_BIT),
        .host_writable = !!(imageUsage & VK_IMAGE_USAGE_TRANSFER_DST_BIT),
        .host_readable = !!(imageUsage & VK_IMAGE_USAGE_TRANSFER_SRC_BIT),
1013 1014
    };

1015
    struct pl_tex_vk *tex_vk = tex->priv = talloc_zero(tex, struct pl_tex_vk);
1016 1017
    tex_vk->type = VK_IMAGE_TYPE_2D;
    tex_vk->external_img = true;
1018 1019
    tex_vk->held = true;
    tex_vk->img = image;
1020 1021
    tex_vk->img_fmt = imageFormat;
    tex_vk->usage_flags = imageUsage;
1022

1023
    if (!vk_init_image(gpu, tex))
1024 1025 1026 1027 1028
        goto error;

    return tex;

error:
    vk_tex_destroy(gpu, tex);
    return NULL;
}
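
/* Hypothetical usage sketch: wrapping an externally created VkImage (e.g. a
 * swapchain image) so it can be used as a regular pl_tex. All concrete values
 * here are made up for illustration:
 *
 *     const struct pl_tex *tex =
 *         pl_vulkan_wrap(gpu, vkimg, 1280, 720, 0,
 *                        VK_FORMAT_B8G8R8A8_UNORM,
 *                        VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT |
 *                        VK_IMAGE_USAGE_TRANSFER_DST_BIT);
 *
 * The VkFormat must correspond to one of the formats in gpu->formats,
 * otherwise this function returns NULL.
 */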

VkImage pl_vulkan_unwrap(const struct pl_gpu *gpu, const struct pl_tex *tex,
                         VkFormat *out_format, VkImageUsageFlags *out_flags)
{
    struct pl_tex_vk *tex_vk = tex->priv;

    if (out_format)
        *out_format = tex_vk->img_fmt;
    if (out_flags)
        *out_flags = tex_vk->usage_flags;

    return tex_vk->img;
}

bool pl_vulkan_hold(const struct pl_gpu *gpu, const struct pl_tex *tex,
                    VkImageLayout layout, VkAccessFlags access,
                    VkSemaphore sem_out)
{
    struct vk_ctx *vk = pl_vk_get(gpu);
    struct pl_tex_vk *tex_vk = tex->priv;
    pl_assert(!tex_vk->held);
    pl_assert(sem_out);

    struct vk_cmd *cmd = vk_require_cmd(gpu, GRAPHICS);
    if (!cmd) {
        PL_ERR(gpu, "Failed holding external image!");
        return false;
    }

    tex_barrier(gpu, cmd, tex, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT,
                access, layout, false);

    vk_cmd_sig(cmd, sem_out);
    vk_submit(gpu);
    tex_vk->held = vk_flush_commands(vk);

    return tex_vk->held;
}

void pl_vulkan_release(const struct pl_gpu *gpu, const struct pl_tex *tex,
                       VkImageLayout layout, VkAccessFlags access,
                       VkSemaphore sem_in)
{
    struct pl_tex_vk *tex_vk = tex->priv;
    pl_assert(tex_vk->held);
    pl_tex_vk_external_dep(gpu, tex, sem_in);

    tex_vk->current_layout = layout;
    tex_vk->current_access = access;
    tex_vk->held = false;
}
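
/* Hypothetical interop sketch: to hand an image over to external Vulkan code,
 * hold it (transitioning it into the layout/access the external user expects),
 * have the external user wait on the semaphore, and release it afterwards.
 * `sem` and `ext_done_sem` are assumed to be VkSemaphores owned by the caller:
 *
 *     if (pl_vulkan_hold(gpu, tex, VK_IMAGE_LAYOUT_GENERAL,
 *                        VK_ACCESS_MEMORY_READ_BIT, sem))
 *     {
 *         // ... external code waits on `sem`, uses the image, and signals
 *         // `ext_done_sem` when it is done ...
 *         pl_vulkan_release(gpu, tex, VK_IMAGE_LAYOUT_GENERAL,
 *                           VK_ACCESS_MEMORY_READ_BIT, ext_done_sem);
 *     }
 */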

// For pl_buf.priv
struct pl_buf_vk {
    struct vk_bufslice slice;
    int refcount; // 1 = object allocated but not in use, > 1 = in use
    enum queue_type update_queue;
    VkBufferView view; // for texel buffers
    // "current" metadata, can change during course of execution
    VkPipelineStageFlags current_stage;
    VkAccessFlags current_access;
    bool exported;
    bool needs_flush;
};

#define PL_VK_BUF_VERTEX PL_BUF_PRIVATE

static void vk_buf_deref(const struct pl_gpu *gpu, struct pl_buf *buf)
{
    if (!buf)
        return;

    struct vk_ctx *vk = pl_vk_get(gpu);
    struct pl_buf_vk *buf_vk = buf->priv;
    struct pl_vk *p = gpu->priv;

    if (--buf_vk->refcount == 0) {
        vkDestroyBufferView(vk->dev, buf_vk->view, VK_ALLOC);
        vk_free_memslice(p->alloc, buf_vk->slice.mem);
        talloc_free(buf);
    }
}

// offset: relative to pl_buf
static void buf_barrier(const struct pl_gpu *gpu, struct vk_cmd *cmd,
                        const struct pl_buf *buf, VkPipelineStageFlags newStage,
                        VkAccessFlags newAccess, size_t offset, size_t size,
                        bool export)
{
    struct pl_buf_vk *buf_vk = buf->priv;

    VkBufferMemoryBarrier buffBarrier = {
        .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER,
        .srcQueueFamilyIndex = buf_vk->exported ? VK_QUEUE_FAMILY_EXTERNAL_KHR
                                                : VK_QUEUE_FAMILY_IGNORED,
        .dstQueueFamilyIndex = export ? VK_QUEUE_FAMILY_EXTERNAL_KHR
                                      : VK_QUEUE_FAMILY_IGNORED,
        .srcAccessMask = buf_vk->current_access,
        .dstAccessMask = newAccess,
        .buffer = buf_vk->slice.buf,
        .offset = buf_vk->slice.mem.offset + offset,
        .size = size,
    };

    if ((buf_vk->needs_flush || buf->params.host_mapped) && !buf_vk->exported) {
        buffBarrier.srcAccessMask |= VK_ACCESS_HOST_WRITE_BIT;
        buf_vk->current_stage |= VK_PIPELINE_STAGE_HOST_BIT;
        buf_vk->needs_flush = false;
    }

    if (buffBarrier.srcAccessMask != buffBarrier.dstAccessMask) {
        vkCmdPipelineBarrier(cmd->buf, buf_vk->current_stage, newStage, 0,
                             0, NULL, 1, &buffBarrier, 0, NULL);
    }

    buf_vk->current_stage = newStage;
    buf_vk->current_access = newAccess;
    buf_vk->exported = export;
    buf_vk->refcount++;
    vk_cmd_callback(cmd, (vk_cb) vk_buf_deref, gpu, buf);
}

// Flush visible writes to a buffer made by the API
// offset: relative to pl_buf
static void buf_flush(const struct pl_gpu *gpu, struct vk_cmd *cmd,
                      const struct pl_buf *buf, size_t offset, size_t size)
{
    struct pl_buf_vk *buf_vk = buf->priv;

    // We need to perform a flush if the host is capable of reading back from
    // the buffer, or if we intend to overwrite it using mapped memory
    bool can_read = buf->params.host_readable;
    bool can_write = buf_vk->slice.data && buf->params.host_writable;
    if (buf->params.host_mapped)
        can_read = can_write = true;

    if (!can_read && !can_write)
        return;

    VkBufferMemoryBarrier buffBarrier = {
        .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER,
        .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
        .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
        .srcAccessMask = buf_vk->current_access,
        .dstAccessMask = (can_read ? VK_ACCESS_HOST_READ_BIT : 0) |
                         (can_write ? VK_ACCESS_HOST_WRITE_BIT : 0),
        .buffer = buf_vk->slice.buf,
        .offset = buf_vk->slice.mem.offset + offset,
        .size = size,
    };

    vkCmdPipelineBarrier(cmd->buf, buf_vk->current_stage,
                         VK_PIPELINE_STAGE_HOST_BIT, 0,
                         0, NULL, 1, &buffBarrier, 0, NULL);
}

#define vk_buf_destroy vk_buf_deref
MAKE_LAZY_DESTRUCTOR(vk_buf_destroy, struct pl_buf);

static void vk_buf_write(const struct pl_gpu *gpu, const struct pl_buf *buf,
                         size_t offset, const void *data, size_t size)
{
    struct pl_buf_vk *buf_vk = buf->priv;

    // For host-mapped buffers, we can just directly memcpy the buffer contents.
    // Otherwise, we can update the buffer from the GPU using a command buffer.
    if (buf_vk->slice.data) {
        pl_assert(buf_vk->refcount == 1);
        uintptr_t addr = (uintptr_t) buf_vk->slice.data + (ptrdiff_t) offset;
        memcpy((void *) addr, data, size);
        buf_vk->needs_flush = true;
    } else {
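        // Note: vkCmdUpdateBuffer is limited to at most 64 kB per update,
        // with the data being embedded directly into the command buffer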
        pl_assert(size <= 64 * 1024);
        struct vk_cmd *cmd = vk_require_cmd(gpu, buf_vk->update_queue);
        if (!cmd) {
            PL_ERR(gpu, "Failed updating buffer!");
            return;
        }

        buf_barrier(gpu, cmd, buf, VK_PIPELINE_STAGE_TRANSFER_BIT,
                    VK_ACCESS_TRANSFER_WRITE_BIT, offset, size, false);

        // Vulkan requires `size` to be a multiple of 4, so we need to make
        // sure to handle the end separately if the original data is not
        size_t size_rem = size % 4;
        size_t size_base = size - size_rem;
        VkDeviceSize buf_offset = buf_vk->slice.mem.offset + offset;
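        // (e.g. for size = 10, the first 8 bytes are written directly and the
        // remaining 2 bytes go through the zero-padded 4-byte tail below)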

        vkCmdUpdateBuffer(cmd->buf, buf_vk->slice.buf, buf_offset, size_base, data);
        if (size_rem) {
            uint8_t tail[4] = {0};
            memcpy(tail, (const uint8_t *) data + size_base, size_rem);
            vkCmdUpdateBuffer(cmd->buf, buf_vk->slice.buf, buf_offset + size_base,
                              sizeof(tail), tail);
        }

        pl_assert(!buf->params.host_readable); // no flush needed due to this
    }
}

static bool vk_buf_read(const struct pl_gpu *gpu, const struct pl_buf *buf,
                        size_t offset, void *dest, size_t size)
{
    struct pl_buf_vk *buf_vk = buf->priv;
    pl_assert(buf_vk->slice.data);

    uintptr_t addr = (uintptr_t) buf_vk->slice.data + (ptrdiff_t) offset;
    memcpy(dest, (void *) addr, size);
    return true;
}

static bool vk_buf_export(const struct pl_gpu *gpu, const struct pl_buf *buf)
{
    struct vk_ctx *vk = pl_vk_get(gpu);
    struct pl_vk *p = gpu->priv;
    struct pl_buf_vk *buf_vk = buf->priv;
    if (buf_vk->exported)
        return true;

    struct vk_cmd *cmd = PL_DEF(p->cmd, vk_require_cmd(gpu, GRAPHICS));
    if (!cmd) {
        PL_ERR(gpu, "Failed exporting buffer!");
        return false;
    }

    // For the queue family ownership transfer, we can ignore all pipeline
    // stages since the synchronization via fences/semaphores is required
    buf_barrier(gpu, cmd, buf, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, 0,
                0, buf->params.size, true);
    buf_vk->current_stage = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;

    vk_submit(gpu);
    return vk_flush_commands(vk);
}

static const struct pl_buf *vk_buf_create(const struct pl_gpu *gpu,
                                          const struct pl_buf_params *params)
{
    struct vk_ctx *vk = pl_vk_get(gpu);
    struct pl_vk *p = gpu->priv;

    struct pl_buf *buf = talloc_zero(NULL, struct pl_buf);
    buf->params = *params;
    buf->params.initial_data = NULL;

    struct pl_buf_vk *buf_vk = buf->priv = talloc_zero(buf, struct pl_buf_vk);
    buf_vk->current_stage = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;
    buf_vk->current_access = 0;
    buf_vk->refcount = 1;

    VkBufferUsageFlags bufFlags = 0;
    VkMemoryPropertyFlags memFlags = 0;
    VkDeviceSize align = 4; // alignment 4 is needed for vk_buf_write
    VkDeviceSize size = PL_ALIGN2(params->size, 4); // for vk_buf_write

    enum pl_buf_mem_type mem_type = params->memory_type;

    bool is_texel = false;
    switch (params->type) {
    case PL_BUF_TEX_TRANSFER:
        bufFlags |= VK_BUFFER_USAGE_TRANSFER_SRC_BIT |
                    VK_BUFFER_USAGE_TRANSFER_DST_BIT;
        align = pl_lcm(align, p->min_texel_alignment);
        // Use TRANSFER-style updates for large enough buffers for efficiency
        if (params->size > 1024*1024) // 1 MB
            buf_vk->update_queue = TRANSFER;
        break;
    case PL_BUF_UNIFORM:
        bufFlags |= VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT;
        mem_type = PL_BUF_MEM_DEVICE;
        align = pl_lcm(align, vk->limits.minUniformBufferOffsetAlignment);
        break;
    case PL_BUF_STORAGE:
        bufFlags |= VK_BUFFER_USAGE_STORAGE_BUFFER_BIT;
        mem_type = PL_BUF_MEM_DEVICE;
        align = pl_lcm(align, vk->limits.minStorageBufferOffsetAlignment);
        buf_vk->update_queue = COMPUTE;
        break;
    case PL_BUF_TEXEL_UNIFORM: // for emulated upload
        bufFlags |= VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT;
        bufFlags |= VK_BUFFER_USAGE_TRANSFER_DST_BIT;