Commit 8009a6c5 authored by Pablo Stebler

Various fixes

parent 7dc8c453
Pipeline #9052 failed with stages
in 1 minute and 45 seconds
......@@ -60,7 +60,7 @@ layout (set = 0, binding = 1) writeonly buffer Dst {
pixel dst[];
};
layout (set = 0, binding = 2) readonly buffer Params {
int bw;
int bs;
int damping;
BlockParams bps[];
};
......@@ -87,7 +87,7 @@ void main() {
const uint by = gl_GlobalInvocationID.y;
const uint byp = PL_IDX == 0 ? by : by & (BH - 1);
BlockParams bp = bps[bw * byp + bx];
BlockParams bp = bps[bs * byp + bx];
const int lvl = PL_IDX == 0 ? bp.y_lvl : bp.uv_lvl;
int pri_lvl = (lvl >> 2) << (BITDEPTH - 8);
int sec_lvl = lvl & 3;
......@@ -99,18 +99,21 @@ void main() {
if (pri_lvl == 0) dir = 0;
if (BSZ_IDX == 0) pri_lvl = adjust_strength(pri_lvl, variance);
const int pri_tap = 4 - ((pri_lvl >> (BITDEPTH - 8)) & 1);
const uint w = BSZX * bw;
const uint s = BSZX * bs + 4; // TODO rename
for (uint y = BSZY * by + 2; y < BSZY * by + BSZY + 2; y++) {
for (uint x = BSZX * bx + 2; x < BSZX + bx + BSZX + 2; x++) {
uint y_start = BSZY * by + 2;
if (PL_IDX == 1 && by >= BH) y_start += 4;
uint y_end = y_start + BSZY;
for (uint y = y_start; y < y_end; y++) {
for (uint x = BSZX * bx + 2; x < BSZX * bx + BSZX + 2; x++) {
int sum = 0;
const int px = int(tmp[w * y + x]);
const int px = int(tmp[s * y + x]);
int vmax = px, vmin = px;
int pri_tap_k = pri_tap;
for (int k = 0; k < 2; k++) {
const int off1 = cdef_directions[dir][k];
const int p0 = int(tmp[w * y + x + off1]);
const int p1 = int(tmp[w * y + x - off1]);
const int p0 = int(tmp[s * y + x + off1]);
const int p1 = int(tmp[s * y + x - off1]);
sum += pri_tap_k * constrain(p0 - px, pri_lvl, damping);
sum += pri_tap_k * constrain(p1 - px, pri_lvl, damping);
pri_tap_k -= (pri_tap_k << 1) - 6;
......@@ -119,11 +122,11 @@ void main() {
vmin = min(p0, vmin);
vmin = min(p1, vmin);
const int off2 = cdef_directions[(dir + 2) & 7][k];
const int s0 = int(tmp[w * y + x + off2]);
const int s1 = int(tmp[w * y + x - off2]);
const int s0 = int(tmp[s * y + x + off2]);
const int s1 = int(tmp[s * y + x - off2]);
const int off3 = cdef_directions[(dir + 6) & 7][k];
const int s2 = int(tmp[w * y + x + off3]);
const int s3 = int(tmp[w * y + x - off3]);
const int s2 = int(tmp[s * y + x + off3]);
const int s3 = int(tmp[s * y + x - off3]);
if (s0 != INT16_MAX) vmax = max(s0, vmax);
if (s1 != INT16_MAX) vmax = max(s1, vmax);
if (s2 != INT16_MAX) vmax = max(s2, vmax);
......@@ -138,7 +141,7 @@ void main() {
sum += sec_tap * constrain(s2 - px, sec_lvl, damping);
sum += sec_tap * constrain(s3 - px, sec_lvl, damping);
}
dst[w * y + x] = pixel(clamp(px + ((8 + sum - int(sum < 0)) >> 4), vmin, vmax));
dst[s * y + x] = pixel(clamp(px + ((8 + sum - int(sum < 0)) >> 4), vmin, vmax));
}
}
}
......@@ -49,7 +49,7 @@ typedef const void *const_left_pixel_row_2px;
#ifdef BITDEPTH
bitfn_decls(void dav1d_cdef_padding, uint16_t *tmp, const ptrdiff_t tmp_stride,
const pixel *src, const ptrdiff_t src_stride,
const pixel (*left)[2], const pixel *const top[2],
const pixel (*left)[2], pixel *const top[2],
const int w, const int h, const enum CdefEdgeFlags edges);
#endif
......@@ -59,7 +59,7 @@ bitfn_decls(void dav1d_cdef_padding, uint16_t *tmp, const ptrdiff_t tmp_stride,
// order to get access to pre-filter top pixels, use $top.
#define decl_cdef_fn(name) \
void (name)(pixel *dst, ptrdiff_t stride, const_left_pixel_row_2px left, \
const pixel *const top[2], int pri_strength, int sec_strength, \
/*const*/ pixel *const top[2], int pri_strength, int sec_strength, \
int dir, int damping, enum CdefEdgeFlags edges HIGHBD_DECL_SUFFIX)
typedef decl_cdef_fn(*cdef_fn);
......
......@@ -33,7 +33,6 @@
#include "common/intops.h"
#include "src/cdef_apply.h"
#include "src/vk/cdef.h"
static void backup2lines(pixel *const dst[3][2],
/*const*/ pixel *const src[3],
......@@ -82,19 +81,10 @@ static int adjust_strength(const int strength, const unsigned var) {
return (strength * (4 + i) + 8) >> 4;
}
/* Store INT16_MAX into every element of a w-by-h rectangle whose top-left
 * element is tmp[0] and whose consecutive rows are `stride` elements apart.
 * Nothing is written when w or h is not positive.
 * TODO deduplicate */
static inline void fill(uint16_t *tmp, const ptrdiff_t stride,
                        const int w, const int h)
{
    for (int row_idx = 0; row_idx < h; row_idx++) {
        uint16_t *const row = tmp + row_idx * stride;
        int col = 0;
        while (col < w) {
            row[col] = INT16_MAX;
            col++;
        }
    }
}
void bytefn(dav1d_cdef_brow)(Dav1dFrameContext *const f, pixel *const p[3],
const Av1Filter *const lflvl, const int by_start,
const int by_end)
void bytefn(dav1d_cdef_brow)(Dav1dFrameContext *const f,
pixel *const p[3],
const Av1Filter *const lflvl,
const int by_start, const int by_end)
{
struct timespec ts;
clock_gettime(CLOCK_MONOTONIC, &ts);
......@@ -133,7 +123,7 @@ void bytefn(dav1d_cdef_brow)(Dav1dFrameContext *const f, pixel *const p[3],
pixel *iptrs[3] = { ptrs[0], ptrs[1], ptrs[2] };
edges &= ~CDEF_HAVE_LEFT;
edges |= CDEF_HAVE_RIGHT;
for (int sbx = 0, last_skip = 1; sbx < sb64w; sbx++, edges |= CDEF_HAVE_LEFT) { // TODO indent
for (int sbx = 0, last_skip = 1; sbx < sb64w; sbx++, edges |= CDEF_HAVE_LEFT) {
const int sb128x = sbx >> 1;
const int sb64_idx = ((by & sbsz) >> 3) + (sbx & 1);
const int cdef_idx = lflvl[sb128x].cdef_idx[sb64_idx];
......@@ -190,7 +180,7 @@ void bytefn(dav1d_cdef_brow)(Dav1dFrameContext *const f, pixel *const p[3],
if (y_lvl) {
count++;
dsp->cdef.fb[0](bptrs[0], f->cur.stride[0], lr_bak[bit][0],
(const pixel *const [2]) {
(pixel *const [2]) {
&f->lf.cdef_line_ptr[tf][0][0][bx * 4],
&f->lf.cdef_line_ptr[tf][0][1][bx * 4],
},
......@@ -206,7 +196,7 @@ void bytefn(dav1d_cdef_brow)(Dav1dFrameContext *const f, pixel *const p[3],
count++;
dsp->cdef.fb[uv_idx](bptrs[pl], f->cur.stride[1],
lr_bak[bit][pl],
(const pixel *const [2]) {
(pixel *const [2]) {
&f->lf.cdef_line_ptr[tf][pl][0][bx * 4 >> ss_hor],
&f->lf.cdef_line_ptr[tf][pl][1][bx * 4 >> ss_hor],
},
......@@ -230,8 +220,8 @@ void bytefn(dav1d_cdef_brow)(Dav1dFrameContext *const f, pixel *const p[3],
iptrs[1] += sbsz * 4 >> ss_hor;
iptrs[2] += sbsz * 4 >> ss_hor;
}
/*if (count && f->vk.state != DAV1D_VK_STATE_ENABLED) {
for (int j = 0; j < 8; j++) {
if (count) {
/*for (int j = 0; j < 8; j++) {
for (int i = 0; i < 4; i++) {
for (int k = 0; k < 8; k++) {
printf("%02x ", ptrs[0][f->cur.stride[0] * j + 8 * i + k]);
......@@ -239,11 +229,10 @@ void bytefn(dav1d_cdef_brow)(Dav1dFrameContext *const f, pixel *const p[3],
printf(" ");
}
puts("");
}
puts("");*/
/*for (int pl = 1; pl <= 2; pl++) {
}*/
/*for (int pl = 2; pl <= 2; pl++) {
for (int j = 0; j < 4; j++) {
for (int i = 0; i < 4; i++) {
for (int i = 0; i < 8; i++) {
for (int k = 0; k < 4; k++) {
printf("%02x ", ptrs[pl][f->cur.stride[1] * j + 4 * i + k]);
}
......@@ -253,7 +242,7 @@ void bytefn(dav1d_cdef_brow)(Dav1dFrameContext *const f, pixel *const p[3],
}
puts("");
}*/
//}
}
ptrs[0] += 8 * PXSTRIDE(f->cur.stride[0]);
ptrs[1] += 8 * PXSTRIDE(f->cur.stride[1]) >> ss_ver;
......
......@@ -55,7 +55,7 @@ static inline void fill(uint16_t *tmp, const ptrdiff_t stride,
void bitfn(dav1d_cdef_padding)(uint16_t *tmp, const ptrdiff_t tmp_stride,
const pixel *src, const ptrdiff_t src_stride,
const pixel (*left)[2], const pixel *const top[2],
const pixel (*left)[2], pixel *const top[2],
const int w, const int h,
const enum CdefEdgeFlags edges)
{
......@@ -94,7 +94,7 @@ void bitfn(dav1d_cdef_padding)(uint16_t *tmp, const ptrdiff_t tmp_stride,
static NOINLINE void
cdef_filter_block_c(pixel *dst, const ptrdiff_t dst_stride,
const pixel (*left)[2], const pixel *const top[2],
const pixel (*left)[2], /*const*/ pixel *const top[2],
const int w, const int h, const int pri_strength,
const int sec_strength, const int dir,
const int damping, const enum CdefEdgeFlags edges
......@@ -170,7 +170,7 @@ cdef_filter_block_c(pixel *dst, const ptrdiff_t dst_stride,
static void cdef_filter_block_##w##x##h##_c(pixel *const dst, \
const ptrdiff_t stride, \
const pixel (*left)[2], \
const pixel *const top[2], \
/*const*/ pixel *const top[2], \
const int pri_strength, \
const int sec_strength, \
const int dir, \
......
......@@ -269,7 +269,7 @@ struct Dav1dFrameContext {
VkCommandBuffer command_buffers[2];
uint16_t *tmp_y;
pixel *dst_y;
int32_t *bw;
int32_t *bs;
int32_t *damping;
struct VkCdefBlockParams {
int y_lvl;
......
......@@ -194,7 +194,7 @@ static void lr_stripe(Dav1dFrameContext *const f, pixel *p,
struct timespec ts;
clock_gettime(CLOCK_MONOTONIC, &ts);
uint64_t t0 = 1000000000ULL * ts.tv_sec + ts.tv_nsec;
int vk = f->vk.enabled;
int vk = 0;//f->vk.enabled;
if (vk) {
if (bitfn(wiener_vk)(&f->vk, p, p_stride, left, lpf, lpf_stride,
unit_w, stripe_h, filterh, filterv, edges
......@@ -216,9 +216,9 @@ static void lr_stripe(Dav1dFrameContext *const f, pixel *p,
struct timespec ts;
clock_gettime(CLOCK_MONOTONIC, &ts);
uint64_t t0 = 1000000000ULL * ts.tv_sec + ts.tv_nsec;
int vk = f->vk.enabled;
int vk = 0;//f->vk.enabled;
if (vk) {
if(bitfn(sgr_vk)(&f->vk, p, p_stride, left, lpf, lpf_stride, unit_w, stripe_h,
if (bitfn(sgr_vk)(&f->vk, p, p_stride, left, lpf, lpf_stride, unit_w, stripe_h,
lr->sgr_idx, lr->sgr_weights, edges HIGHBD_CALL_SUFFIX)) {
vk = 0;
}
......
......@@ -45,6 +45,7 @@
#include "src/scan.h"
#include "src/tables.h"
#include "src/wedge.h"
#include "src/vk/cdef.h"
static unsigned read_golomb(MsacContext *const msac) {
int len = 0;
......@@ -1685,19 +1686,27 @@ void bytefn(dav1d_filter_sbrow)(Dav1dFrameContext *const f, const int sby) {
bytefn(dav1d_lr_copy_lpf)(f, f->lf.p, sby);
}
if (f->seq_hdr->cdef) {
if (sby) {
const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
pixel *p_up[3] = {
f->lf.p[0] - 8 * PXSTRIDE(f->cur.stride[0]),
f->lf.p[1] - (8 * PXSTRIDE(f->cur.stride[1]) >> ss_ver),
f->lf.p[2] - (8 * PXSTRIDE(f->cur.stride[1]) >> ss_ver),
};
bytefn(dav1d_cdef_brow)(f, p_up, f->lf.prev_mask_ptr,
sby * sbsz - 2, sby * sbsz);
int vk = f->vk.enabled;
if (vk) {
if (bitfn(cdef_filter_sbrow_vk)(f, sby)) {
vk = 0;
}
}
if (!vk) {
if (sby) {
const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
pixel *p_up[3] = {
f->lf.p[0] - 8 * PXSTRIDE(f->cur.stride[0]),
f->lf.p[1] - (8 * PXSTRIDE(f->cur.stride[1]) >> ss_ver),
f->lf.p[2] - (8 * PXSTRIDE(f->cur.stride[1]) >> ss_ver),
};
bytefn(dav1d_cdef_brow)(f, p_up, f->lf.prev_mask_ptr,
sby * sbsz - 2, sby * sbsz);
}
const int n_blks = sbsz - 2 * (sby + 1 < sbh);
bytefn(dav1d_cdef_brow)(f, f->lf.p, f->lf.mask_ptr, sby * sbsz,
imin(sby * sbsz + n_blks, f->bh));
}
const int n_blks = sbsz - 2 * (sby + 1 < sbh);
bytefn(dav1d_cdef_brow)(f, f->lf.p, f->lf.mask_ptr, sby * sbsz,
imin(sby * sbsz + n_blks, f->bh));
}
if (f->frame_hdr->super_res.enabled) {
const int has_chroma = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400;
......
......@@ -3,24 +3,53 @@
#include "src/cdef.h"
#include "src/vk/cdef.h"
static int cdef_init_vk(struct VkContext *const cv, struct VkCdefContext *const cc, const int uv_idx, const int has_chroma, const int b8w, const int b8h, const int b8w_wg, const int w_y, const int h_y, const int w_uv, const int h_uv) {
/* Copy two rows of each plane of src into dst: the rows at y_off - 2 and
 * y_off - 1 go to dst[plane][0] and dst[plane][1] respectively, w pixels
 * each. For layouts other than I400 the two chroma planes are copied as
 * well, with w halved unless the layout is I444 and y_off halved when the
 * layout is I420. Plane 0 uses src_stride[0]; planes 1 and 2 share
 * src_stride[1].
 * TODO deduplicate */
static void backup2lines(pixel *const dst[3][2],
                         /*const*/ pixel *const src[3],
                         const ptrdiff_t src_stride[2], int y_off, int w,
                         const enum Dav1dPixelLayout layout)
{
    /* Plane 0 is always present. */
    for (int row = 0; row < 2; row++)
        pixel_copy(dst[0][row],
                   src[0] + (y_off - 2 + row) * PXSTRIDE(src_stride[0]), w);

    if (layout == DAV1D_PIXEL_LAYOUT_I400)
        return;

    /* Scale the width and row offset by the chroma subsampling shifts. */
    const int ss_ver = layout == DAV1D_PIXEL_LAYOUT_I420;
    const int ss_hor = layout != DAV1D_PIXEL_LAYOUT_I444;
    const int uv_w = w >> ss_hor;
    const int uv_y = y_off >> ss_ver;

    for (int pl = 1; pl <= 2; pl++) {
        for (int row = 0; row < 2; row++)
            pixel_copy(dst[pl][row],
                       src[pl] + (uv_y - 2 + row) * PXSTRIDE(src_stride[1]),
                       uv_w);
    }
}
/* Store INT16_MAX into every element of a w-by-h rectangle whose top-left
 * element is tmp[0] and whose consecutive rows are `stride` elements apart.
 * Nothing is written when w or h is not positive.
 * TODO deduplicate */
static inline void fill(uint16_t *tmp, const ptrdiff_t stride,
                        const int w, const int h)
{
    for (int row_idx = 0; row_idx < h; row_idx++) {
        uint16_t *const row = tmp + row_idx * stride;
        int col = 0;
        while (col < w) {
            row[col] = INT16_MAX;
            col++;
        }
    }
}
static int cdef_init_vk(struct VkContext *const cv, struct VkCdefContext *const cc, const int uv_idx, const int has_chroma, const int b8s, const int b8h, const int b8w_wg, const int n_wgs, const int s_y, const int h_y, const int s_uv, const int h_uv) {
cc->state = DAV1D_VK_STATE_DISABLED;
cc->tmp_y_stride = w_y + 4;
cc->tmp_uv_stride = w_uv + 4;
cc->tmp_y_stride = s_y;
cc->tmp_uv_stride = s_uv;
const uint64_t buffer_sizes[5] = {
sizeof(uint16_t) * (h_y + 4) * cc->tmp_y_stride,
sizeof(pixel) * (h_y + 4) * (w_y + 4), // TODO smaller?
2 * sizeof(int16_t) + sizeof(struct VkCdefBlockParams) * b8h * b8w,
sizeof(pixel) * (h_y + 4) * cc->tmp_y_stride, // TODO smaller?
2 * sizeof(int32_t) + sizeof(struct VkCdefBlockParams) * b8h * b8s,
sizeof(uint16_t) * 2 * (h_uv + 4) * cc->tmp_uv_stride,
sizeof(pixel) * 2 * (h_uv + 4) * (w_uv + 4),
sizeof(pixel) * 2 * (h_uv + 4) * cc->tmp_uv_stride,
};
const int n_bufs = 3 + 2 * has_chroma;
uint64_t offset = 0;
uint64_t buffer_offsets[5];
for (int i = 0; i < n_bufs; i++) {
buffer_offsets[i] = offset;
const VkBufferCreateInfo buffer_create_info = {
.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
.size = buffer_sizes[i],
......@@ -30,9 +59,11 @@ static int cdef_init_vk(struct VkContext *const cv, struct VkCdefContext *const
if (vkCreateBuffer(cv->device, &buffer_create_info, NULL, &cc->buffers[i])) return -1;
VkMemoryRequirements mem_reqs;
vkGetBufferMemoryRequirements(cv->device, cc->buffers[i], &mem_reqs);
// TODO check alignment and memory type
const uint64_t alignment = mem_reqs.alignment;
offset = (offset + alignment - 1) / alignment * alignment;
buffer_offsets[i] = offset;
// TODO memory type
offset += mem_reqs.size;
// TODO fix alignment
}
VkMemoryAllocateInfo memory_allocate_info = {
......@@ -46,9 +77,9 @@ static int cdef_init_vk(struct VkContext *const cv, struct VkCdefContext *const
if (vkMapMemory(cv->device, cc->memory, 0, offset, 0, &data)) return -1;
cc->tmp_y = data;
cc->dst_y = data + buffer_offsets[1];
cc->bw = data + buffer_offsets[2];
cc->damping = data + buffer_offsets[2] + sizeof(int16_t);
cc->bps = data + buffer_offsets[2] + 2 * sizeof(int16_t);
cc->bs = data + buffer_offsets[2];
cc->damping = data + buffer_offsets[2] + sizeof(int32_t);
cc->bps = data + buffer_offsets[2] + 2 * sizeof(int32_t);
cc->tmp_uv = data + buffer_offsets[3];
cc->dst_uv = data + buffer_offsets[4];
......@@ -56,9 +87,9 @@ static int cdef_init_vk(struct VkContext *const cv, struct VkCdefContext *const
if (vkBindBufferMemory(cv->device, cc->buffers[i], cc->memory, buffer_offsets[i])) return -1;
}
const uint32_t specialization_data[2][3] = {
{ b8w_wg, b8h, 0 },
{ b8w_wg, b8h, uv_idx },
const uint32_t specialization_data[2][4] = {
{ b8w_wg, b8h, 0, 0, },
{ b8w_wg / 2, b8h * 2, uv_idx, 1 }, // TODO fix
};
for (int i = 0; i < 1 + has_chroma; i++) {
const VkComputePipelineCreateInfo compute_pipeline_create_info = {
......@@ -69,8 +100,8 @@ static int cdef_init_vk(struct VkContext *const cv, struct VkCdefContext *const
.module = cc->shader_module,
.pName = "main",
.pSpecializationInfo = &(VkSpecializationInfo) {
.mapEntryCount = 3,
.pMapEntries = (const VkSpecializationMapEntry[3]) {
.mapEntryCount = 4,
.pMapEntries = (const VkSpecializationMapEntry[4]) { // TODO use loop
{
.constantID = 0,
.offset = 0,
......@@ -86,6 +117,11 @@ static int cdef_init_vk(struct VkContext *const cv, struct VkCdefContext *const
.offset = 2 * sizeof(uint32_t),
.size = sizeof(uint32_t),
},
{
.constantID = 3,
.offset = 3 * sizeof(uint32_t),
.size = sizeof(uint32_t),
},
},
.dataSize = sizeof(specialization_data[i]),
.pData = specialization_data[i],
......@@ -95,9 +131,9 @@ static int cdef_init_vk(struct VkContext *const cv, struct VkCdefContext *const
};
if (vkCreateComputePipelines(cv->device, 0, 1, &compute_pipeline_create_info, NULL, &cc->pipelines[i])) return -1;
VkDescriptorBufferInfo buffer_info[3];
VkWriteDescriptorSet write_descriptor_set[3];
for (int j = 0; j < 3; j++) {
VkDescriptorBufferInfo buffer_info[3];
VkWriteDescriptorSet write_descriptor_set[3];
int buffer_idx = 3 * i + j;
if (buffer_idx == 5) buffer_idx = 2;
buffer_info[j] = (VkDescriptorBufferInfo) {
......@@ -113,8 +149,8 @@ static int cdef_init_vk(struct VkContext *const cv, struct VkCdefContext *const
.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
.pBufferInfo = &buffer_info[j],
};
vkUpdateDescriptorSets(cv->device, 3, write_descriptor_set, 0, NULL);
}
vkUpdateDescriptorSets(cv->device, 3, write_descriptor_set, 0, NULL);
VkCommandBufferAllocateInfo command_buffer_allocate_info = {
.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO,
......@@ -130,7 +166,7 @@ static int cdef_init_vk(struct VkContext *const cv, struct VkCdefContext *const
if (vkBeginCommandBuffer(cc->command_buffers[i], &command_buffer_begin_info)) return -1;
vkCmdBindPipeline(cc->command_buffers[i], VK_PIPELINE_BIND_POINT_COMPUTE, cc->pipelines[i]);
vkCmdBindDescriptorSets(cc->command_buffers[i], VK_PIPELINE_BIND_POINT_COMPUTE, cv->pipeline_layout, 0, 1, &cc->descriptor_sets[i], 0, NULL);
vkCmdDispatch(cc->command_buffers[i], 48, 1, 1); // const for 48
vkCmdDispatch(cc->command_buffers[i], n_wgs, 1, 1);
if (vkEndCommandBuffer(cc->command_buffers[i])) return -1;
}
......@@ -144,6 +180,10 @@ static int cdef_update_vk(struct VkCdefContext *const cc) { // unsupported yet
}
int bitfn(cdef_filter_sbrow_vk)(Dav1dFrameContext *const f, const int sby) {
struct timespec ts;
clock_gettime(CLOCK_MONOTONIC, &ts);
uint64_t t0 = 1000000000ULL * ts.tv_sec + ts.tv_nsec;
struct VkContext *const cv = &f->vk;
const int bd_idx = bd_idx_from_max(f->bitdepth_max);
const int h_idx = f->seq_hdr->sb128;
......@@ -152,7 +192,7 @@ int bitfn(cdef_filter_sbrow_vk)(Dav1dFrameContext *const f, const int sby) {
if (cc->state == DAV1D_VK_STATE_DISABLED) return -1;
const int by_start = sby * f->sb_step - 2;
const int by_start = !sby ? 0 : sby * f->sb_step - 2;
const int n_blks = f->sb_step - 2 * (sby + 1 < f->sbh);
const int by_end = imin(sby * f->sb_step + n_blks, f->bh);
const int bitdepth_min_8 = BITDEPTH == 8 ? 0 : f->cur.p.bpc - 8;
......@@ -164,29 +204,40 @@ int bitfn(cdef_filter_sbrow_vk)(Dav1dFrameContext *const f, const int sby) {
const int has_chroma = layout != DAV1D_PIXEL_LAYOUT_I400;
const int ss_ver = layout == DAV1D_PIXEL_LAYOUT_I420;
const int ss_hor = layout != DAV1D_PIXEL_LAYOUT_I444;
pixel *p[3] = {
f->lf.p[0] - 8 * PXSTRIDE(f->cur.stride[0]),
f->lf.p[1] - (8 * PXSTRIDE(f->cur.stride[1]) >> ss_ver),
f->lf.p[2] - (8 * PXSTRIDE(f->cur.stride[1]) >> ss_ver),
};
pixel *p[3] = { f->lf.p[0], f->lf.p[1], f->lf.p[2] };
if (sby) {
p[0] -= 8 * PXSTRIDE(f->cur.stride[0]);
p[1] -= (8 * PXSTRIDE(f->cur.stride[1]) >> ss_ver);
p[2] -= (8 * PXSTRIDE(f->cur.stride[1]) >> ss_ver);
}
pixel *ptr = p[0];
const int b8w = f->bw >> 1;
const int b8h = 8 << f->seq_hdr->sb128;
const int b8w_wg = cv->max_cwgs / b8h;
const int w_y = b8w << 3;
const int h_y = b8w << 3;
const int h_y = b8h << 3;
const int w_uv = w_y >> ss_hor;
const int h_uv = w_y >> ss_ver;
const int h_uv = h_y >> ss_ver;
const int n_wgs = (b8w + b8w_wg - 1) / b8w_wg;
const int b8s = b8w_wg * n_wgs;
const int s_y = (b8s << 3) + 4;
const int s_uv = ((s_y - 4) >> ss_hor) + 4;
const int y_start_y = by_start << 2;
const int y_end_y = by_end << 2;
const int y_start_uv = y_start_y >> ss_ver;
const int y_end_uv = y_end_y >> ss_ver;
const int tf = f->lf.top_pre_cdef_toggle;
const int has_bottom = by_end != f->bh;
if (cc->state == DAV1D_VK_STATE_ENABLED && (uint32_t)w_y != cc->tmp_y_stride) {
if (cc->state == DAV1D_VK_STATE_ENABLED && (uint32_t)s_y != cc->tmp_y_stride) {
cc->state = DAV1D_VK_STATE_OUTDATED;
}
memset(cc->bps, 0, sizeof(struct VkCdefBlockParams) * b8h * b8w);
//memset(cc->bps, 0, sizeof(struct VkCdefBlockParams) * b8h * b8w);
int y_e = 0, uv_e = 0;
for (int by = by_start; by < by_end; by += 2) {
const int b8y = by >> 1;
const Av1Filter *const lflvl = by == by_start ? f->lf.prev_mask_ptr : f->lf.mask_ptr;
const int b8y = (by - by_start) >> 1;
const Av1Filter *const lflvl = sby && by == by_start ? f->lf.prev_mask_ptr : f->lf.mask_ptr;
for (int sbx = 0; sbx < sb64w; sbx++) {
const int sb128x = sbx >> 1;
const int sb64_idx = ((by & sbsz) >> 3) + (sbx & 1);
......@@ -195,7 +246,7 @@ int bitfn(cdef_filter_sbrow_vk)(Dav1dFrameContext *const f, const int sby) {
const int y_lvl = f->frame_hdr->cdef.y_strength[cdef_idx];
const int uv_lvl = f->frame_hdr->cdef.uv_strength[cdef_idx] && has_chroma;
if (!y_lvl && !uv_lvl) goto next_sb;
if (cc->state == DAV1D_VK_STATE_UNKNOWN && !cdef_init_vk(cv, cc, uv_idx, has_chroma, b8w, b8h, b8w_wg, w_y, h_y, w_uv, h_uv)) {
if (cc->state == DAV1D_VK_STATE_UNKNOWN && cdef_init_vk(cv, cc, uv_idx, has_chroma, b8s, b8h, b8w_wg, n_wgs, s_y, h_y, s_uv, h_uv)) {
return -1;
} else if (cc->state == DAV1D_VK_STATE_OUTDATED && cdef_update_vk(cc)) {
return -1;
......@@ -217,16 +268,25 @@ int bitfn(cdef_filter_sbrow_vk)(Dav1dFrameContext *const f, const int sby) {
ptr += 8 * PXSTRIDE(f->cur.stride[0]);
}
// TODO restore backup of last line and store current one
if (has_bottom) {
backup2lines(f->lf.cdef_line_ptr[!tf], p, f->cur.stride, y_end_y - y_start_y, w_y, layout);
}
if (!y_e && !uv_e) return 0;
*cc->bw = b8w;
*cc->bs = b8s;
*cc->damping = damping;
if (y_e) {
ptr = p[0];
uint16_t *tmp = cc->tmp_y + 2;
for (int y = 0; y < h_y; y++) {
if (by_start) {
pixel_copy(tmp, f->lf.cdef_line_ptr[tf][0][0], w_y);
pixel_copy(tmp + cc->tmp_y_stride, f->lf.cdef_line_ptr[tf][0][1], w_y);
} else {
fill(tmp, cc->tmp_y_stride, PXSTRIDE(f->cur.stride[0]), 2);
}
tmp += 2 * cc->tmp_y_stride;
for (int y = y_start_y; y < y_end_y + 2 * has_bottom; y++) {
tmp[-2] = INT16_MAX;
tmp[-1] = INT16_MAX;
for (int x = 0; x < w_y; x++) {
......@@ -234,15 +294,25 @@ int bitfn(cdef_filter_sbrow_vk)(Dav1dFrameContext *const f, const int sby) {
}
tmp[w_y] = INT16_MAX;
tmp[w_y + 1] = INT16_MAX;
ptr += w_y;
ptr += PXSTRIDE(f->cur.stride[0]);
tmp += cc->tmp_y_stride;
}
if (!has_bottom) {
fill(tmp, cc->tmp_y_stride, PXSTRIDE(f->cur.stride[0]), 2);
}
}
if (uv_e) {
uint16_t *tmp = cc->tmp_uv + 2;
for (int pl = 1; pl <= 2; pl++) {
if (by_start) {
pixel_copy(tmp, f->lf.cdef_line_ptr[tf][pl][0], w_uv);
pixel_copy(tmp + cc->tmp_uv_stride, f->lf.cdef_line_ptr[tf][pl][1], w_uv);
} else {
fill(tmp, cc->tmp_uv_stride, PXSTRIDE(f->cur.stride[1]), 2);
}
tmp += 2 * cc->tmp_uv_stride;
ptr = p[pl];
for (int y = 0; y < h_uv; y++) {
for (int y = y_start_uv; y < y_end_uv + 2 * has_bottom; y++) {
tmp[-2] = INT16_MAX;
tmp[-1] = INT16_MAX;
for (int x = 0; x < w_uv; x++) {
......@@ -250,9 +320,13 @@ int bitfn(cdef_filter_sbrow_vk)(Dav1dFrameContext *const f, const int sby) {
}
tmp[w_uv] = INT16_MAX;
tmp[w_uv + 1] = INT16_MAX;
ptr += w_uv;
ptr += PXSTRIDE(f->cur.stride[1]);
tmp += cc->tmp_uv_stride;
}
if (!has_bottom) {
fill(tmp, cc->tmp_uv_stride, PXSTRIDE(f->cur.stride[1]), 2);
tmp += 2 * cc->tmp_uv_stride;
}
}
}
......@@ -267,27 +341,51 @@ int bitfn(cdef_filter_sbrow_vk)(Dav1dFrameContext *const f, const int sby) {
if (vkResetFences(cv->device, 1, &cv->fence)) return -1;
if (y_e) {
pixel *dst = cc->dst_y + 2;
pixel *dst = cc->dst_y + 2 * cc->tmp_y_stride + 2;
ptr = p[0];
for (int y = 0; y < h_y; y++) {
for (int y = y_start_y; y < y_end_y; y++) {
for (int x = 0; x < w_y; x++) {
/*if (x < 32) {
printf("%02x ", dst[x]);
if (!((x + 1) % 8)) printf(" ");
}*/
ptr[x] = dst[x];
}
/*puts("");
if (!((y + 1) % 8)) puts("");*/
dst += cc->tmp_y_stride;
ptr += w_y;
ptr += PXSTRIDE(f->cur.stride[0]);
}
}
if (uv_e) {
pixel *dst = cc->dst_uv + 2;
for (int pl = 1; pl <= 2; pl++) {
dst += 2 * cc->tmp_uv_stride;
ptr = p[pl];
for (int x = 0; x < w_uv; x++) {
ptr[x] = dst[x];
for (int y = y_start_uv; y < y_end_uv; y++) {
for (int x = 0; x < w_uv; x++) {
/*if (pl == 2 && x < 32) {
printf("%02x ", dst[x]);
if (!((x + 1) % 4)) printf(" ");
}*/
ptr[x] = dst[x];
}
/*if (pl == 2) {
puts("");
if (!((y + 1) % 4)) puts("");
}*/
dst += cc->tmp_uv_stride;
ptr += PXSTRIDE(f->cur.stride[1]);
}
dst += cc->tmp_uv_stride;
ptr += w_uv;
dst += 2 * cc->tmp_uv_stride;
}
}
f->lf.top_pre_cdef_toggle ^= 1;
clock_gettime(CLOCK_MONOTONIC, &ts);
uint64_t t1 = 1000000000ULL * ts.tv_sec + ts.tv_nsec;
printf("cdef %d %d %d %lu\n", h_idx, by_start, by_end, t1 - t0);
return 0;
}
......@@ -131,12 +131,12 @@ COLD int init_vk(Dav1dContext *const c) {
}
if (memory_type_index == memory_type_count) return -1; // TODO fallback
const size_t code_size[3][3] = {
const size_t code_size[3][3] = {
{ sizeof(cdef_8bpc_code), sizeof(cdef_10bpc_code), sizeof(cdef_12bpc_code) },
{ sizeof(wiener_8bpc_code), sizeof(wiener_10bpc_code), sizeof(wiener_12bpc_code) },
{ sizeof(sgr_8bpc_code), sizeof(sgr_10bpc_code), sizeof(sgr_12bpc_code) },
};
const uint32_t *code[3][3] = {
const uint32_t *code[3][3] = {
{ cdef_8bpc_code, cdef_10bpc_code, cdef_12bpc_code },
{ wiener_8bpc_code, wiener_10bpc_code, wiener_12bpc_code },
{ sgr_8bpc_code, sgr_10bpc_code, sgr_12bpc_code },
......@@ -178,7 +178,7 @@ COLD int init_vk(Dav1dContext *const c) {
VkPipelineLayout pipeline_layout;
if (vkCreatePipelineLayout(device, &pipeline_layout_create_info, NULL, &pipeline_layout)) return -1;
const size_t pool_size_count = c->n_fc * 2 * 3 * 2; // TODO rename
const size_t pool_size_count = c->n_fc * (3 * 2 * 3 * 2 + 2 * 3 * 2); // TODO rename
VkDescriptorPoolSize *descriptor_pool_sizes = malloc(sizeof(VkDescriptorPoolSize) * pool_size_count);
if (!descriptor_pool_sizes) return -1;
for (size_t i = 0; i < pool_size_count; i++) {
......@@ -237,11 +237,20 @@ COLD int init_vk(Dav1dContext *const c) {
.sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO,
};
if (vkCreateFence(device, &fence_create_info, NULL, &cv->fence)) return -1;
for (int j = 0; j < 3; j++) {
for (int k = 0; k < 2; k++) {
for (int l = 0; l < 3; l++) {
cv->cdef[j][k][l].shader_module = shader_modules[0][j];
cv->cdef[j][k][l].descriptor_sets[0] = descriptor_sets[(3 * 2 * 3 * 2 + 2 * 3 * 2) * i + 2 * 3 * 2 * j + 3 * 2 * k + 2 * l];
cv->cdef[j][k][l].descriptor_sets[1] = descriptor_sets[(3 * 2 * 3 * 2 + 2 * 3 * 2) * i + 2 * 3 * 2 * j + 3 * 2 * k + 2 * l + 1];
}
}
}
for (int j = 0; j < 2; j++) {
for (int k = 0; k < 3; k++) {
for (int l = 0; l < 2; l++) {
cv->lr[j][k][l].shader_module = shader_modules[j + 1][k];
cv->lr[j][k][l].descriptor_set = descriptor_sets[3 * 2 * j + 2 * k + l];
cv->lr[j][k][l].descriptor_set = descriptor_sets[(3 * 2 * 3 * 2 + 2 * 3 * 2) * i + 3 * 2 * 3 * 2 + 3 * 2 * j + 2 * k + l];
}
}
}
......
<
......@@ -131,7 +131,7 @@ static int lr_common_vk(struct VkContext *const cv, struct VkLrContext *const lc
if (vkBeginCommandBuffer(lc->command_buffer, &command_buffer_begin_info)) return -1;
vkCmdBindPipeline(lc->command_buffer, VK_PIPELINE_BIND_POINT_COMPUTE, lc->pipeline);
vkCmdBindDescriptorSets(lc->command_buffer, VK_PIPELINE_BIND_POINT_COMPUTE, cv->pipeline_layout, 0, 1, &lc->descriptor_set, 0, NULL);