...
 
Commits (2)
  • vulkan: permit non-coherent memory · e5cf9fbb
    Niklas Haas authored
    In theory we could try using coherent memory instead of noncoherent
    memory on platforms that support it, but this doesn't seem to be
    necessary in either case, as it isn't a performance regression on any
    driver that supports it.
    
    This should allow iGPUs (which only have noncoherent memory) to work.
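    (See the standalone flush/invalidate sketch right after this commit list.)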
  • ci: add gpu-based tests · 4bf79ea2
    Niklas Haas authored
    All hail the cloud overlords and their VMs with GPUs.
    (Note: 'the cloud' is a NUC sitting in an office somewhere)
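
For context on the first commit: Vulkan's contract for host-visible memory that is not HOST_COHERENT is that the application must flush mapped ranges after host writes (vkFlushMappedMemoryRanges), invalidate them before host reads (vkInvalidateMappedMemoryRanges), and keep those ranges aligned to nonCoherentAtomSize, which is exactly what the gpu.c and malloc.c hunks below add. The following is a minimal standalone sketch of that contract, not libplacebo code; struct mapped_slice, slice_write and slice_read are hypothetical names.

#include <stdbool.h>
#include <string.h>
#include <vulkan/vulkan.h>

/* A persistently mapped sub-allocation. Both `offset` and `size` are
 * assumed to already be aligned to nonCoherentAtomSize, mirroring the
 * pl_lcm()/PL_ALIGN() calls in the vk_buf_create hunk below. */
struct mapped_slice {
    VkDeviceMemory mem;   /* backing allocation */
    VkDeviceSize offset;  /* start of the slice within `mem` */
    VkDeviceSize size;    /* size of the slice */
    void *data;           /* host pointer to the start of the slice */
    bool coherent;        /* true if the memory type is HOST_COHERENT */
};

/* Host write followed by an explicit flush (skipped for coherent memory) */
static VkResult slice_write(VkDevice dev, const struct mapped_slice *s,
                            const void *src, size_t size)
{
    memcpy(s->data, src, size); /* assumes size <= s->size */
    if (s->coherent)
        return VK_SUCCESS;

    VkMappedMemoryRange range = {
        .sType  = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE,
        .memory = s->mem,
        .offset = s->offset,
        .size   = s->size,
    };
    return vkFlushMappedMemoryRanges(dev, 1, &range);
}

/* Invalidate before the host reads back data the GPU has written */
static VkResult slice_read(VkDevice dev, const struct mapped_slice *s,
                           void *dst, size_t size)
{
    if (!s->coherent) {
        VkMappedMemoryRange range = {
            .sType  = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE,
            .memory = s->mem,
            .offset = s->offset,
            .size   = s->size,
        };
        VkResult res = vkInvalidateMappedMemoryRanges(dev, 1, &range);
        if (res != VK_SUCCESS)
            return res;
    }

    memcpy(dst, s->data, size); /* assumes size <= s->size */
    return VK_SUCCESS;
}

The buf_barrier()/buf_flush() hunks below additionally order these host accesses against GPU work with pipeline barriers (VK_PIPELINE_STAGE_HOST_BIT); that part is omitted from the sketch.
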
@@ -14,10 +14,8 @@ build-ubuntu:
-Dtests=true
-Dshaderc=enabled
-Dglslang=enabled
-Db_coverage=true
- ninja -C build
- cd build && meson test -v
- ninja coverage-text && cat meson-logs/coverage.txt
build-ubuntu-static:
image: registry.videolan.org:5000/libplacebo-ubuntu-cosmic:20181205095959
@@ -141,3 +139,19 @@ test-ubuntu-scan:
-Dshaderc=enabled
-Dglslang=enabled
- ninja -C build scan-build
test-gpu:
image: registry.videolan.org/libplacebo-ubuntu-cosmic:20190402093455
stage: test
tags:
- gpu
script:
- meson build --buildtype release
--werror
-Dtests=true
-Dshaderc=enabled
-Dglslang=disabled
-Db_coverage=true
- ninja -C build
- cd build && meson test -v
- ninja coverage-text && cat meson-logs/coverage.txt
@@ -1204,20 +1204,39 @@ static void buf_barrier(const struct pl_gpu *gpu, struct vk_cmd *cmd,
enum vk_wait_type type = vk_cmd_wait(vk, cmd, &buf_vk->sig, stage, &event);
VkPipelineStageFlags src_stages = 0;
bool need_trans = buf_vk->current_access != newAccess ||
(buffBarrier.srcQueueFamilyIndex !=
buffBarrier.dstQueueFamilyIndex);
if (buf_vk->needs_flush || buf->params.host_mapped) {
if (!buf_vk->exported) {
buffBarrier.srcAccessMask |= VK_ACCESS_HOST_WRITE_BIT;
src_stages |= VK_PIPELINE_STAGE_HOST_BIT;
}
if (buf_vk->slice.mem.data && !buf_vk->slice.mem.coherent) {
if (buf_vk->exported) {
// TODO: figure out and clean up the semantics?
PL_WARN(vk, "Mixing host-mapped or user-writable buffers with "
"external APIs is risky and untested. If you run into "
"any issues, please try using a non-mapped buffer and "
"avoid pl_buf_write.");
}
VK(vkFlushMappedMemoryRanges(vk->dev, 1, &(struct VkMappedMemoryRange) {
.sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE,
.memory = buf_vk->slice.mem.vkmem,
.offset = buf_vk->slice.mem.offset,
.size = buf_vk->slice.mem.size,
}));
// Just ignore errors, not much we can do about them other than
// logging them and moving on...
error: ;
}
if ((buf_vk->needs_flush || buf->params.host_mapped) && !buf_vk->exported) {
buffBarrier.srcAccessMask |= VK_ACCESS_HOST_WRITE_BIT;
src_stages |= VK_PIPELINE_STAGE_HOST_BIT;
buf_vk->needs_flush = false;
need_trans = true;
if (type == VK_WAIT_EVENT)
type = VK_WAIT_BARRIER;
}
if (need_trans) {
if (buffBarrier.srcAccessMask != buffBarrier.dstAccessMask ||
buffBarrier.srcQueueFamilyIndex != buffBarrier.dstQueueFamilyIndex)
{
switch (type) {
case VK_WAIT_NONE:
// No synchronization required, so we can safely transition out of
@@ -1257,17 +1276,31 @@ static void buf_signal(const struct pl_gpu *gpu, struct vk_cmd *cmd,
buf_vk->sig_stage = stage;
}
static void invalidate_memslice(struct vk_ctx *vk, const struct vk_memslice *mem)
{
VK(vkInvalidateMappedMemoryRanges(vk->dev, 1, &(VkMappedMemoryRange) {
.sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE,
.memory = mem->vkmem,
.offset = mem->offset,
.size = mem->size,
}));
// Ignore errors (after logging), nothing useful we can do anyway
error: ;
}
// Flush visible writes to a buffer made by the API
// offset: relative to pl_buf
static void buf_flush(const struct pl_gpu *gpu, struct vk_cmd *cmd,
const struct pl_buf *buf, size_t offset, size_t size)
{
struct vk_ctx *vk = pl_vk_get(gpu);
struct pl_buf_vk *buf_vk = buf->priv;
// We need to perform a flush if the host is capable of reading back from
// the buffer, or if we intend to overwrite it using mapped memory
bool can_read = buf->params.host_readable;
bool can_write = buf_vk->slice.data && buf->params.host_writable;
bool can_write = buf_vk->slice.mem.data && buf->params.host_writable;
if (buf->params.host_mapped)
can_read = can_write = true;
@@ -1289,6 +1322,10 @@ static void buf_flush(const struct pl_gpu *gpu, struct vk_cmd *cmd,
vkCmdPipelineBarrier(cmd->buf, buf_vk->sig_stage,
VK_PIPELINE_STAGE_HOST_BIT, 0,
0, NULL, 1, &buffBarrier, 0, NULL);
// Invalidate the mapped memory as soon as this barrier completes
if (buf_vk->slice.mem.data && !buf_vk->slice.mem.coherent)
vk_cmd_callback(cmd, (vk_cb) invalidate_memslice, vk, &buf_vk->slice.mem);
}
#define vk_buf_destroy vk_buf_deref
@@ -1301,9 +1338,9 @@ static void vk_buf_write(const struct pl_gpu *gpu, const struct pl_buf *buf,
// For host-mapped buffers, we can just directly memcpy the buffer contents.
// Otherwise, we can update the buffer from the GPU using a command buffer.
if (buf_vk->slice.data) {
if (buf_vk->slice.mem.data) {
pl_assert(buf_vk->refcount == 1);
uintptr_t addr = (uintptr_t) buf_vk->slice.data + (ptrdiff_t) offset;
uintptr_t addr = (uintptr_t) buf_vk->slice.mem.data + offset;
memcpy((void *) addr, data, size);
buf_vk->needs_flush = true;
} else {
@@ -1340,9 +1377,9 @@ static bool vk_buf_read(const struct pl_gpu *gpu, const struct pl_buf *buf,
size_t offset, void *dest, size_t size)
{
struct pl_buf_vk *buf_vk = buf->priv;
pl_assert(buf_vk->slice.data);
pl_assert(buf_vk->slice.mem.data);
uintptr_t addr = (uintptr_t) buf_vk->slice.data + (ptrdiff_t) offset;
uintptr_t addr = (uintptr_t) buf_vk->slice.mem.data + (size_t) offset;
memcpy(dest, (void *) addr, size);
return true;
}
@@ -1436,19 +1473,19 @@ static const struct pl_buf *vk_buf_create(const struct pl_gpu *gpu,
default: abort();
}
bool host_mapped = params->host_mapped;
if (params->host_writable || params->initial_data) {
bufFlags |= VK_BUFFER_USAGE_TRANSFER_DST_BIT;
align = pl_lcm(align, vk->limits.optimalBufferCopyOffsetAlignment);
// Large buffers must be written using mapped memory
if (params->size > 64 * 1024)
memFlags |= VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT;
host_mapped = true;
}
if (params->host_mapped || params->host_readable) {
memFlags |= VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
VK_MEMORY_PROPERTY_HOST_COHERENT_BIT |
VK_MEMORY_PROPERTY_HOST_CACHED_BIT;
memFlags |= VK_MEMORY_PROPERTY_HOST_CACHED_BIT;
host_mapped = true;
}
if (params->host_writable || params->host_readable) {
@@ -1461,17 +1498,25 @@ static const struct pl_buf *vk_buf_create(const struct pl_gpu *gpu,
memFlags |= VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT;
break;
case PL_BUF_MEM_HOST:
memFlags |= VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT;
host_mapped = true;
break;
default: break;
}
if (host_mapped) {
// Include any alignment restraints required for possibly
// noncoherent, mapped memory
memFlags |= VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT;
align = pl_lcm(align, vk->limits.nonCoherentAtomSize);
size = PL_ALIGN(size, vk->limits.nonCoherentAtomSize);
}
if (!vk_malloc_buffer(p->alloc, bufFlags, memFlags, size, align,
params->handle_type, &buf_vk->slice))
goto error;
if (params->host_mapped)
buf->data = buf_vk->slice.data;
buf->data = buf_vk->slice.mem.data;
if (params->handle_type) {
buf->shared_mem = buf_vk->slice.mem.shared_mem;
......
@@ -71,6 +71,7 @@ struct vk_slab {
// optional, depends on the memory type:
VkBuffer buffer; // buffer spanning the entire slab
void *data; // mapped memory corresponding to `mem`
bool coherent; // mapped memory is coherent
union pl_handle handle; // handle associated with this device memory
enum pl_handle_type handle_type;
};
@@ -278,8 +279,10 @@ static struct vk_slab *slab_alloc(struct vk_malloc *ma, struct vk_heap *heap,
minfo.memoryTypeIndex = index;
VK(vkAllocateMemory(vk->dev, &minfo, VK_ALLOC, &slab->mem));
if (heap->flags & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT)
if (heap->flags & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT) {
VK(vkMapMemory(vk->dev, slab->mem, 0, VK_WHOLE_SIZE, 0, &slab->data));
slab->coherent = heap->flags & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT;
}
if (slab->buffer)
VK(vkBindBufferMemory(vk->dev, slab->buffer, slab->mem, 0));
@@ -574,14 +577,19 @@ static bool slice_heap(struct vk_malloc *ma, struct vk_heap *heap, size_t size,
.vkmem = slab->mem,
.offset = offset,
.size = size,
.priv = slab,
.shared_mem = {
.handle = slab->handle,
.offset = offset,
.size = slab->size,
},
.priv = slab,
};
if (slab->data) {
out->data = (void *) ((uintptr_t) slab->data + offset);
out->coherent = slab->coherent;
}
PL_DEBUG(vk, "Sub-allocating slice %zu + %zu from slab with size %zu",
(size_t) out->offset, (size_t) out->size, (size_t) slab->size);
@@ -613,8 +621,6 @@ bool vk_malloc_buffer(struct vk_malloc *ma, VkBufferUsageFlags bufFlags,
struct vk_slab *slab = out->mem.priv;
out->buf = slab->buffer;
if (slab->data)
out->data = (void *)((uintptr_t)slab->data + (ptrdiff_t)out->mem.offset);
return true;
}
......
@@ -33,8 +33,11 @@ struct vk_memslice {
VkDeviceMemory vkmem;
VkDeviceSize offset;
VkDeviceSize size;
struct pl_shared_mem shared_mem;
void *priv;
// depending on the type/flags:
struct pl_shared_mem shared_mem;
void *data; // pointer to slice (for persistently mapped slices)
bool coherent; // whether `data` is coherent
};
void vk_free_memslice(struct vk_malloc *ma, struct vk_memslice slice);
@@ -47,9 +50,6 @@ bool vk_malloc_generic(struct vk_malloc *ma, VkMemoryRequirements reqs,
struct vk_bufslice {
struct vk_memslice mem; // must be freed by the user when done
VkBuffer buf; // the buffer this memory was sliced from
// For persistently mapped buffers, this points to the first usable byte of
// this slice.
void *data;
};
// Allocate a buffer slice. This is more efficient than vk_malloc_generic for
......
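
To make the data-structure change at the end of the diff concrete: the mapped pointer and its coherence flag now live on vk_memslice itself (mem.data, mem.coherent) instead of on vk_bufslice, so any holder of a slice knows whether a flush/invalidate is required. Below is a rough sketch, under hypothetical names (my_slab, my_slice, slice_from_slab), of how such a slice can inherit the mapping from a persistently mapped slab; it is illustrative only, not libplacebo's actual allocator.

#include <stdbool.h>
#include <stdint.h>
#include <vulkan/vulkan.h>

struct my_slab {
    VkDeviceMemory mem;   /* one large device allocation */
    VkDeviceSize size;    /* total slab size */
    void *data;           /* non-NULL once mapped (HOST_VISIBLE memory only) */
    bool coherent;        /* set from the memory type's HOST_COHERENT bit */
};

struct my_slice {
    VkDeviceMemory vkmem; /* backing allocation (shared with the slab) */
    VkDeviceSize offset;  /* start of this slice within the slab */
    VkDeviceSize size;    /* size of this slice */
    void *data;           /* host pointer for this slice, or NULL if unmapped */
    bool coherent;        /* copied from the slab; false => flush/invalidate */
};

/* Carve a slice out of a slab, propagating the mapping and coherence info */
static struct my_slice slice_from_slab(const struct my_slab *slab,
                                       VkDeviceSize offset, VkDeviceSize size)
{
    struct my_slice out = {
        .vkmem  = slab->mem,
        .offset = offset,
        .size   = size,
    };

    if (slab->data) {
        out.data     = (void *) ((uintptr_t) slab->data + offset);
        out.coherent = slab->coherent;
    }

    return out;
}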