Commit 0bd57c6b authored by Kyle Siefring, committed by Jean-Baptiste Kempf

Rework the usage of noskip_mask

Remove half of the masks since they are only used for cdef at an 8x8
level of granularity.

Load the mask and combine its two 16-bit halves into a single 32-bit mask
outside of the inner cdef loop. This should save some registers.

Results in mild performance improvements.
parent 3ccfc25a
Pipeline #56587 passed with stages
in 4 minutes and 34 seconds
......@@ -117,7 +117,7 @@ void bytefn(dav1d_cdef_brow)(Dav1dFrameContext *const f,
for (int bit = 0, by = by_start; by < by_end; by += 2, edges |= CDEF_HAVE_TOP) {
const int tf = f->lf.top_pre_cdef_toggle;
const int by_idx = by & 30;
const int by_idx = (by & 30) >> 1;
if (by + 2 >= f->bh) edges &= ~CDEF_HAVE_BOTTOM;
if (edges & CDEF_HAVE_BOTTOM) // backup pre-filter data for next iteration
......@@ -140,6 +140,11 @@ void bytefn(dav1d_cdef_brow)(Dav1dFrameContext *const f,
goto next_sb;
}
// Create a complete 32-bit mask for the sb row ahead of time.
const uint16_t (*noskip_row)[2] = &lflvl[sb128x].noskip_mask[by_idx];
const unsigned noskip_mask = (unsigned) noskip_row[0][1] << 16 |
noskip_row[0][0];
const int y_lvl = f->frame_hdr->cdef.y_strength[cdef_idx];
const int uv_lvl = f->frame_hdr->cdef.uv_strength[cdef_idx];
const enum Backup2x8Flags flag = !!y_lvl + (!!uv_lvl << 1);
......@@ -162,11 +167,8 @@ void bytefn(dav1d_cdef_brow)(Dav1dFrameContext *const f,
// check if this 8x8 block had any coded coefficients; if not,
// go to the next block
const unsigned bx_mask = 3U << (bx & 14);
const int bx_idx = (bx & 16) >> 4;
if (!((lflvl[sb128x].noskip_mask[by_idx + 0][bx_idx] |
lflvl[sb128x].noskip_mask[by_idx + 1][bx_idx]) & bx_mask))
{
const uint32_t bx_mask = 3U << (bx & 30);
if (!(noskip_mask & bx_mask)) {
last_skip = 1;
goto next_b;
}
......
......@@ -1984,10 +1984,10 @@ static int decode_b(Dav1dTileContext *const t,
#undef set_ctx
}
if (!b->skip) {
uint16_t (*noskip_mask)[2] = &t->lf_mask->noskip_mask[by4];
uint16_t (*noskip_mask)[2] = &t->lf_mask->noskip_mask[by4 >> 1];
const unsigned mask = (~0U >> (32 - bw4)) << (bx4 & 15);
const int bx_idx = (bx4 & 16) >> 4;
for (int y = 0; y < bh4; y++, noskip_mask++) {
for (int y = 0; y < bh4; y += 2, noskip_mask++) {
(*noskip_mask)[bx_idx] |= mask;
if (bw4 == 32) // this should be mask >> 16, but it's 0xffffffff anyway
(*noskip_mask)[1] |= mask;
......
......@@ -53,7 +53,7 @@ typedef struct Av1Filter {
uint16_t filter_y[2 /* 0=col, 1=row */][32][3][2];
uint16_t filter_uv[2 /* 0=col, 1=row */][32][2][2];
int8_t cdef_idx[4]; // -1 means "unset"
uint16_t noskip_mask[32][2];
uint16_t noskip_mask[16][2]; // for 8x8 blocks, but stored on a 4x8 basis
} Av1Filter;
// each struct describes one 128x128 area (1 or 4 SBs), post-superres-scaling
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment