Commit e3b5d4d0 authored by Ronald S. Bultje

Use grouped context setting

Decreases the runtime of decoding the first 1000 frames of Chimera (1080p,
8-bit) from 12.227s to 12.075s (average of 6 runs) after changing decode.c,
and further down to 12.027s (1.67%) with the changes to recon_tmpl.c included.
After the changes to lf_mask.c, it goes down to 11.842s.
parent 01386d4c
......@@ -32,6 +32,12 @@
#include <stddef.h>
#ifdef __GNUC__
#define ATTR_ALIAS __attribute__((may_alias))
#else
#define ATTR_ALIAS
#endif
#if ARCH_X86
#define ALIGN_32_VAL 32
#define ALIGN_16_VAL 16
......
/*
* Copyright © 2018, VideoLAN and dav1d authors
* Copyright © 2018, Two Orioles, LLC
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __DAV1D_SRC_CTX_H__
#define __DAV1D_SRC_CTX_H__
#include <stdint.h>
#include "common/attributes.h"
/*
 * Type-punning helpers used by the set_ctx_rep* macros below: each union
 * overlays one fixed-width integer on the same number of bytes, so that
 * 8, 4, 2 or 1 consecutive uint8_t entries can be written with one store.
 * ATTR_ALIAS is not defined in this header; it is expected to come from
 * common/attributes.h (presumably a may_alias attribute on GCC-compatible
 * compilers -- confirm there).
 */
union alias64 { uint64_t u64; uint8_t u8[8]; } ATTR_ALIAS;
union alias32 { uint32_t u32; uint8_t u8[4]; } ATTR_ALIAS;
union alias16 { uint16_t u16; uint8_t u8[2]; } ATTR_ALIAS;
/* Single-byte variant, so the 1-entry case can use the same access pattern. */
union alias8 { uint8_t u8; };
/*
 * Fill the 32 bytes var[off] .. var[off + 31] using four 64-bit stores of
 * `val`.
 *
 * type: unused; present so that every set_ctx_rep* macro has the same
 *       signature and can be handed to set_ctx() as its rep_macro argument.
 * var:  byte array (or pointer to bytes) to write into.
 * off:  index of the first byte to write.
 * val:  64-bit pattern to store; evaluated exactly once.
 *
 * All arguments are parenthesized so that expressions such as
 * `cond ? 32 : 0` for `off` keep their meaning inside the expansion.
 * NOTE(review): the stores go through a union alias64 pointer, so
 * &var[off] presumably has to be suitably aligned for 64-bit access --
 * confirm at the call sites.
 */
#define set_ctx_rep4(type, var, off, val) do { \
        const uint64_t ctx_rep_val_ = (val); \
        ((union alias64 *) &(var)[(off) +  0])->u64 = ctx_rep_val_; \
        ((union alias64 *) &(var)[(off) +  8])->u64 = ctx_rep_val_; \
        ((union alias64 *) &(var)[(off) + 16])->u64 = ctx_rep_val_; \
        ((union alias64 *) &(var)[(off) + 24])->u64 = ctx_rep_val_; \
    } while (0)
/*
 * Fill the 16 bytes var[off] .. var[off + 15] using two 64-bit stores of
 * `val`.
 *
 * type: unused; kept so the signature matches the other set_ctx_rep*
 *       macros that set_ctx() receives as rep_macro.
 * var:  byte array (or pointer to bytes) to write into.
 * off:  index of the first byte to write.
 * val:  64-bit pattern to store; evaluated exactly once.
 *
 * Arguments are parenthesized so compound expressions passed as `off`,
 * `var` or `val` are not re-associated by the expansion.
 * NOTE(review): &var[off] presumably must be aligned for 64-bit access --
 * confirm at the call sites.
 */
#define set_ctx_rep2(type, var, off, val) do { \
        const uint64_t ctx_rep_val_ = (val); \
        ((union alias64 *) &(var)[(off) + 0])->u64 = ctx_rep_val_; \
        ((union alias64 *) &(var)[(off) + 8])->u64 = ctx_rep_val_; \
    } while (0)
/*
 * Write `val` with a single store of `typesz` bits (8, 16, 32 or 64) at
 * &var[off], i.e. fill typesz / 8 consecutive bytes at once.
 *
 * typesz: store width in bits; pasted into both the union name
 *         (alias<typesz>) and its integer member (u<typesz>).
 * var:    byte array (or pointer expression) to write into.
 * off:    index of the first byte to write.
 * val:    value to store; converted to the u<typesz> member type.
 *
 * The arguments and the whole expansion are parenthesized so that pointer
 * expressions such as `p + n` work as `var` and the macro stays a single
 * expression.
 * NOTE(review): &var[off] presumably must be aligned for a typesz-bit
 * access -- confirm at the call sites.
 */
#define set_ctx_rep1(typesz, var, off, val) \
    (((union alias##typesz *) &(var)[(off)])->u##typesz = (val))
/*
 * Dispatch on a run length `var` of 1, 2, 4, 8, 16 or 32 entries and invoke
 * the caller-defined set_ctx() macro with a matching store width, byte
 * replication multiplier and set_ctx_rep* helper. Any other value of `var`
 * does nothing.
 *
 * The caller must #define set_ctx(type, dir, diridx, off, mul, rep_macro)
 * before using this macro; `dir`, `diridx` and `off` are forwarded to it
 * untouched (they may be empty if set_ctx ignores them). `mul` is 0x01
 * repeated once per byte of the store, so `mul * v` replicates the byte
 * value v across the whole store.
 *
 * Wrapped in do { } while (0) so an invocation followed by ';' is exactly
 * one statement (safe as the body of an unbraced if/else).
 */
#define case_set(var, dir, diridx, off) \
    do { \
        switch (var) { \
        case  1: set_ctx( 8, dir, diridx, off, 0x01, set_ctx_rep1); break; \
        case  2: set_ctx(16, dir, diridx, off, 0x0101, set_ctx_rep1); break; \
        case  4: set_ctx(32, dir, diridx, off, 0x01010101U, set_ctx_rep1); break; \
        case  8: set_ctx(64, dir, diridx, off, 0x0101010101010101ULL, set_ctx_rep1); break; \
        case 16: set_ctx(  , dir, diridx, off, 0x0101010101010101ULL, set_ctx_rep2); break; \
        case 32: set_ctx(  , dir, diridx, off, 0x0101010101010101ULL, set_ctx_rep4); break; \
        default: break; \
        } \
    } while (0)
/*
 * Same dispatch as case_set(), but only for run lengths `var` of 1, 2, 4, 8
 * or 16 entries; any other value (including 32) does nothing.
 *
 * The caller must #define set_ctx(type, dir, diridx, off, mul, rep_macro)
 * before using this macro; `dir`, `diridx` and `off` are forwarded to it
 * untouched (they may be empty if set_ctx ignores them).
 *
 * Wrapped in do { } while (0) so an invocation followed by ';' is exactly
 * one statement (safe as the body of an unbraced if/else).
 */
#define case_set_upto16(var, dir, diridx, off) \
    do { \
        switch (var) { \
        case  1: set_ctx( 8, dir, diridx, off, 0x01, set_ctx_rep1); break; \
        case  2: set_ctx(16, dir, diridx, off, 0x0101, set_ctx_rep1); break; \
        case  4: set_ctx(32, dir, diridx, off, 0x01010101U, set_ctx_rep1); break; \
        case  8: set_ctx(64, dir, diridx, off, 0x0101010101010101ULL, set_ctx_rep1); break; \
        case 16: set_ctx(  , dir, diridx, off, 0x0101010101010101ULL, set_ctx_rep2); break; \
        default: break; \
        } \
    } while (0)
/*
 * Like case_set() (run lengths 1, 2, 4, 8, 16, 32), but any other value of
 * `var` falls back to the caller-defined default_memset().
 *
 * The caller must #define both
 *   set_ctx(type, dir, diridx, off, mul, rep_macro) and
 *   default_memset(dir, diridx, off, var)
 * before using this macro. `var` is expanded twice on the fallback path
 * (once in the switch, once as the default_memset() size), so it should be
 * free of side effects.
 *
 * Wrapped in do { } while (0) so an invocation followed by ';' is exactly
 * one statement (safe as the body of an unbraced if/else).
 */
#define case_set_upto32_with_default(var, dir, diridx, off) \
    do { \
        switch (var) { \
        case  1: set_ctx( 8, dir, diridx, off, 0x01, set_ctx_rep1); break; \
        case  2: set_ctx(16, dir, diridx, off, 0x0101, set_ctx_rep1); break; \
        case  4: set_ctx(32, dir, diridx, off, 0x01010101U, set_ctx_rep1); break; \
        case  8: set_ctx(64, dir, diridx, off, 0x0101010101010101ULL, set_ctx_rep1); break; \
        case 16: set_ctx(  , dir, diridx, off, 0x0101010101010101ULL, set_ctx_rep2); break; \
        case 32: set_ctx(  , dir, diridx, off, 0x0101010101010101ULL, set_ctx_rep4); break; \
        default: default_memset(dir, diridx, off, var); break; \
        } \
    } while (0)
/*
 * Like case_set_upto16() (run lengths 1, 2, 4, 8, 16), but any other value
 * of `var` falls back to the caller-defined default_memset().
 *
 * The caller must #define both
 *   set_ctx(type, dir, diridx, off, mul, rep_macro) and
 *   default_memset(dir, diridx, off, var)
 * before using this macro. `var` is expanded twice on the fallback path
 * (once in the switch, once as the default_memset() size), so it should be
 * free of side effects.
 *
 * Wrapped in do { } while (0) so an invocation followed by ';' is exactly
 * one statement (safe as the body of an unbraced if/else).
 */
#define case_set_upto16_with_default(var, dir, diridx, off) \
    do { \
        switch (var) { \
        case  1: set_ctx( 8, dir, diridx, off, 0x01, set_ctx_rep1); break; \
        case  2: set_ctx(16, dir, diridx, off, 0x0101, set_ctx_rep1); break; \
        case  4: set_ctx(32, dir, diridx, off, 0x01010101U, set_ctx_rep1); break; \
        case  8: set_ctx(64, dir, diridx, off, 0x0101010101010101ULL, set_ctx_rep1); break; \
        case 16: set_ctx(  , dir, diridx, off, 0x0101010101010101ULL, set_ctx_rep2); break; \
        default: default_memset(dir, diridx, off, var); break; \
        } \
    } while (0)
#endif /* __DAV1D_SRC_CTX_H__ */
......@@ -38,6 +38,7 @@
#include "common/intops.h"
#include "common/mem.h"
#include "src/ctx.h"
#include "src/decode.h"
#include "src/dequant_tables.h"
#include "src/env.h"
......@@ -171,8 +172,14 @@ static void read_tx_tree(Dav1dTileContext *const t,
}
t->by -= txsh;
} else {
memset(&t->a->tx[bx4], is_split ? TX_4X4 : txw, t_dim->w);
memset(&t->l.tx[by4], is_split ? TX_4X4 : txh, t_dim->h);
#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
rep_macro(type, t->dir tx, off, is_split ? TX_4X4 : mul * txh)
case_set_upto16(t_dim->h, l., 1, by4);
#undef set_ctx
#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
rep_macro(type, t->dir tx, off, is_split ? TX_4X4 : mul * txw)
case_set_upto16(t_dim->w, a->, 0, bx4);
#undef set_ctx
}
}
......@@ -611,13 +618,19 @@ static void read_vartx_tree(Dav1dTileContext *const t,
{
b->max_ytx = b->uvtx = TX_4X4;
if (f->frame_hdr.txfm_mode == TX_SWITCHABLE) {
memset(&t->a->tx[bx4], TX_4X4, bw4);
memset(&t->l.tx[by4], TX_4X4, bh4);
#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
rep_macro(type, t->dir tx, off, TX_4X4)
case_set(bh4, l., 1, by4);
case_set(bw4, a->, 0, bx4);
#undef set_ctx
}
} else if (f->frame_hdr.txfm_mode != TX_SWITCHABLE || b->skip) {
if (f->frame_hdr.txfm_mode == TX_SWITCHABLE) {
memset(&t->a->tx[bx4], b_dim[2], bw4);
memset(&t->l.tx[by4], b_dim[3], bh4);
#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
rep_macro(type, t->dir tx, off, mul * b_dim[2 + diridx])
case_set(bh4, l., 1, by4);
case_set(bw4, a->, 0, bx4);
#undef set_ctx
} else {
assert(f->frame_hdr.txfm_mode == TX_LARGEST);
}
......@@ -694,14 +707,22 @@ static int decode_b(Dav1dTileContext *const t,
if (b->intra) {
f->bd_fn.recon_b_intra(t, bs, intra_edge_flags, b);
if (has_chroma) {
memset(&t->l.uvmode[cby4], b->uv_mode, cbh4);
memset(&t->a->uvmode[cbx4], b->uv_mode, cbw4);
}
const enum IntraPredMode y_mode_nofilt =
b->y_mode == FILTER_PRED ? DC_PRED : b->y_mode;
memset(&t->l.mode[by4], y_mode_nofilt, bh4);
memset(&t->a->mode[bx4], y_mode_nofilt, bw4);
#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
rep_macro(type, t->dir mode, off, mul * y_mode_nofilt); \
rep_macro(type, t->dir intra, off, mul)
case_set(bh4, l., 1, by4);
case_set(bw4, a->, 0, bx4);
#undef set_ctx
if (has_chroma) {
#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
rep_macro(type, t->dir uvmode, off, mul * b->uv_mode)
case_set(cbh4, l., 1, cby4);
case_set(cbw4, a->, 0, cbx4);
#undef set_ctx
}
} else {
if (b->comp_type == COMP_INTER_NONE && b->motion_mode == MM_WARP) {
uint64_t mask[2] = { 0, 0 };
......@@ -712,17 +733,22 @@ static int decode_b(Dav1dTileContext *const t,
f->bd_fn.recon_b_inter(t, bs, b);
const uint8_t *const filter = dav1d_filter_dir[b->filter2d];
memset(&t->l.filter[0][by4], filter[0], bh4);
memset(&t->a->filter[0][bx4], filter[0], bw4);
memset(&t->l.filter[1][by4], filter[1], bh4);
memset(&t->a->filter[1][bx4], filter[1], bw4);
#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
rep_macro(type, t->dir filter[0], off, mul * filter[0]); \
rep_macro(type, t->dir filter[1], off, mul * filter[1]); \
rep_macro(type, t->dir intra, off, 0)
case_set(bh4, l., 1, by4);
case_set(bw4, a->, 0, bx4);
#undef set_ctx
if (has_chroma) {
memset(&t->l.uvmode[cby4], DC_PRED, cbh4);
memset(&t->a->uvmode[cbx4], DC_PRED, cbw4);
#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
rep_macro(type, t->dir uvmode, off, mul * DC_PRED)
case_set(cbh4, l., 1, cby4);
case_set(cbw4, a->, 0, cbx4);
#undef set_ctx
}
}
memset(&t->l.intra[by4], b->intra, bh4);
memset(&t->a->intra[bx4], b->intra, bw4);
return 0;
}
......@@ -1106,14 +1132,29 @@ static int decode_b(Dav1dTileContext *const t,
has_chroma ? &t->l.tx_lpf_uv[cby4] : NULL);
// update contexts
memset(&t->a->tx_intra[bx4], t_dim->lw, bw4);
memset(&t->l.tx_intra[by4], t_dim->lh, bh4);
/*
 * Grouped above/left context update for an intra-coded block.
 *
 * Expanded once per direction by case_set(): `dir` selects the context
 * struct (a-> or l.), `diridx` is 0 for above and 1 for left, `off` is the
 * first entry to write, `mul` replicates a byte value across the store and
 * `rep_macro` is the set_ctx_rep* helper performing the store(s).
 *
 * t_dim->lw / t_dim->lh are read as a two-byte array indexed by diridx, so
 * this relies on lh being stored immediately after lw.
 *
 * comp_type is filled with COMP_INTER_NONE, the value the memset-based code
 * this macro replaces stored for intra blocks; it must not depend on
 * b->skip.
 */
#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
    rep_macro(type, t->dir tx_intra, off, mul * (((uint8_t *) &t_dim->lw)[diridx])); \
    rep_macro(type, t->dir tx, off, mul * (((uint8_t *) &t_dim->lw)[diridx])); \
    rep_macro(type, t->dir mode, off, mul * y_mode_nofilt); \
    rep_macro(type, t->dir pal_sz, off, mul * b->pal_sz[0]); \
    rep_macro(type, t->dir seg_pred, off, mul * seg_pred); \
    rep_macro(type, t->dir skip_mode, off, 0); \
    rep_macro(type, t->dir intra, off, mul); \
    rep_macro(type, t->dir skip, off, mul * b->skip); \
    /* see aomedia bug 2183 for why we use luma coordinates here */ \
    rep_macro(type, t->pal_sz_uv[diridx], off, mul * (has_chroma ? b->pal_sz[1] : 0)); \
    if (f->frame_hdr.frame_type & 1) { \
        rep_macro(type, t->dir comp_type, off, mul * COMP_INTER_NONE); \
        rep_macro(type, t->dir ref[0], off, mul * ((uint8_t) -1)); \
        rep_macro(type, t->dir ref[1], off, mul * ((uint8_t) -1)); \
        rep_macro(type, t->dir filter[0], off, mul * N_SWITCHABLE_FILTERS); \
        rep_macro(type, t->dir filter[1], off, mul * N_SWITCHABLE_FILTERS); \
    }
const enum IntraPredMode y_mode_nofilt =
b->y_mode == FILTER_PRED ? DC_PRED : b->y_mode;
memset(&t->l.mode[by4], y_mode_nofilt, bh4);
memset(&t->a->mode[bx4], y_mode_nofilt, bw4);
memset(&t->l.pal_sz[by4], b->pal_sz[0], bh4);
memset(&t->a->pal_sz[bx4], b->pal_sz[0], bw4);
case_set(bh4, l., 1, by4);
case_set(bw4, a->, 0, bx4);
#undef set_ctx
if (b->pal_sz[0]) {
uint16_t *const pal = f->frame_thread.pass ?
f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
......@@ -1124,11 +1165,11 @@ static int decode_b(Dav1dTileContext *const t,
memcpy(t->al_pal[1][by4 + y][0], pal, 16);
}
if (has_chroma) {
memset(&t->l.uvmode[cby4], b->uv_mode, cbh4);
memset(&t->a->uvmode[cbx4], b->uv_mode, cbw4);
// see aomedia bug 2183 for why we use luma coordinates here
memset(&t->pal_sz_uv[1][by4], b->pal_sz[1], bh4);
memset(&t->pal_sz_uv[0][bx4], b->pal_sz[1], bw4);
#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
rep_macro(type, t->dir uvmode, off, mul * b->uv_mode)
case_set(cbh4, l., 1, cby4);
case_set(cbw4, a->, 0, cbx4);
#undef set_ctx
if (b->pal_sz[1]) for (int pl = 1; pl < 3; pl++) {
uint16_t *const pal = f->frame_thread.pass ?
f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
......@@ -1139,28 +1180,11 @@ static int decode_b(Dav1dTileContext *const t,
for (int y = 0; y < bh4; y++)
memcpy(t->al_pal[1][by4 + y][pl], pal, 16);
}
} else { // see aomedia bug 2183 for why we reset this
memset(&t->pal_sz_uv[1][by4], 0, bh4);
memset(&t->pal_sz_uv[0][bx4], 0, bw4);
}
if ((f->frame_hdr.frame_type & 1) || f->frame_hdr.allow_intrabc) {
memset(&t->a->tx[bx4], t_dim->lw, bw4);
memset(&t->l.tx[by4], t_dim->lh, bh4);
splat_intraref(f->mvs, f->b4_stride, t->by, t->bx, bs,
y_mode_nofilt);
}
if (f->frame_hdr.frame_type & 1) {
memset(&t->l.comp_type[by4], COMP_INTER_NONE, bh4);
memset(&t->a->comp_type[bx4], COMP_INTER_NONE, bw4);
memset(&t->l.ref[0][by4], -1, bh4);
memset(&t->a->ref[0][bx4], -1, bw4);
memset(&t->l.ref[1][by4], -1, bh4);
memset(&t->a->ref[1][bx4], -1, bw4);
memset(&t->l.filter[0][by4], N_SWITCHABLE_FILTERS, bh4);
memset(&t->a->filter[0][bx4], N_SWITCHABLE_FILTERS, bw4);
memset(&t->l.filter[1][by4], N_SWITCHABLE_FILTERS, bh4);
memset(&t->a->filter[1][bx4], N_SWITCHABLE_FILTERS, bw4);
}
} else if (!(f->frame_hdr.frame_type & 1)) {
// intra block copy
candidate_mv mvstack[8];
......@@ -1259,18 +1283,25 @@ static int decode_b(Dav1dTileContext *const t,
splat_intrabc_mv(f->mvs, f->b4_stride, t->by, t->bx, bs, b->mv[0]);
memset(&t->a->tx_intra[bx4], b_dim[2], bw4);
memset(&t->l.tx_intra[by4], b_dim[3], bh4);
memset(&t->l.mode[by4], DC_PRED, bh4);
memset(&t->a->mode[bx4], DC_PRED, bw4);
memset(&t->l.pal_sz[by4], 0, bh4);
memset(&t->a->pal_sz[bx4], 0, bw4);
// see aomedia bug 2183 for why this is outside if (has_chroma)
memset(&t->pal_sz_uv[1][by4], 0, bh4);
memset(&t->pal_sz_uv[0][bx4], 0, bw4);
/*
 * Grouped above/left context update for an intra-block-copy block.
 *
 * Expanded once per direction by case_set(): `dir` selects the context
 * struct (a-> or l.), `diridx` is 0 for above and 1 for left, `off` is the
 * first entry to write, `mul` replicates a byte value across the store and
 * `rep_macro` is the set_ctx_rep* helper performing the store(s).
 *
 * Every non-zero value has to be multiplied by `mul`; otherwise only one
 * byte of a multi-byte store receives the value and the remaining entries
 * are zeroed. This applies to seg_pred and b->skip as well.
 */
#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
    rep_macro(type, t->dir tx_intra, off, mul * b_dim[2 + diridx]); \
    rep_macro(type, t->dir mode, off, mul * DC_PRED); \
    rep_macro(type, t->dir pal_sz, off, 0); \
    /* see aomedia bug 2183 for why this is outside if (has_chroma) */ \
    rep_macro(type, t->pal_sz_uv[diridx], off, 0); \
    rep_macro(type, t->dir seg_pred, off, mul * seg_pred); \
    rep_macro(type, t->dir skip_mode, off, 0); \
    rep_macro(type, t->dir intra, off, 0); \
    rep_macro(type, t->dir skip, off, mul * b->skip)
case_set(bh4, l., 1, by4);
case_set(bw4, a->, 0, bx4);
#undef set_ctx
if (has_chroma) {
memset(&t->l.uvmode[cby4], DC_PRED, cbh4);
memset(&t->a->uvmode[cbx4], DC_PRED, cbw4);
#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
rep_macro(type, t->dir uvmode, off, mul * DC_PRED)
case_set(cbh4, l., 1, cby4);
case_set(cbw4, a->, 0, cbx4);
#undef set_ctx
}
} else {
// inter-specific mode/mv coding
......@@ -1764,29 +1795,33 @@ static int decode_b(Dav1dTileContext *const t,
b->inter_mode, b->ref[0], b->mv[0],
b->interintra_type);
}
memset(&t->l.pal_sz[by4], 0, bh4);
memset(&t->a->pal_sz[bx4], 0, bw4);
// see aomedia bug 2183 for why this is outside if (has_chroma)
memset(&t->pal_sz_uv[1][by4], 0, bh4);
memset(&t->pal_sz_uv[0][bx4], 0, bw4);
#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
rep_macro(type, t->dir seg_pred, off, mul * seg_pred); \
rep_macro(type, t->dir skip_mode, off, mul * b->skip_mode); \
rep_macro(type, t->dir intra, off, 0); \
rep_macro(type, t->dir skip, off, mul * b->skip); \
rep_macro(type, t->dir pal_sz, off, 0); \
/* see aomedia bug 2183 for why this is outside if (has_chroma) */ \
rep_macro(type, t->pal_sz_uv[diridx], off, 0); \
rep_macro(type, t->dir tx_intra, off, mul * b_dim[2 + diridx]); \
rep_macro(type, t->dir comp_type, off, mul * b->comp_type); \
rep_macro(type, t->dir filter[0], off, mul * filter[0]); \
rep_macro(type, t->dir filter[1], off, mul * filter[1]); \
rep_macro(type, t->dir mode, off, mul * b->inter_mode); \
rep_macro(type, t->dir ref[0], off, mul * b->ref[0]); \
rep_macro(type, t->dir ref[1], off, mul * ((uint8_t) b->ref[1]))
case_set(bh4, l., 1, by4);
case_set(bw4, a->, 0, bx4);
#undef set_ctx
if (has_chroma) {
memset(&t->l.uvmode[cby4], DC_PRED, cbh4);
memset(&t->a->uvmode[cbx4], DC_PRED, cbw4);
}
memset(&t->a->tx_intra[bx4], b_dim[2], bw4);
memset(&t->l.tx_intra[by4], b_dim[3], bh4);
memset(&t->l.comp_type[by4], b->comp_type, bh4);
memset(&t->a->comp_type[bx4], b->comp_type, bw4);
memset(&t->l.filter[0][by4], filter[0], bh4);
memset(&t->a->filter[0][bx4], filter[0], bw4);
memset(&t->l.filter[1][by4], filter[1], bh4);
memset(&t->a->filter[1][bx4], filter[1], bw4);
memset(&t->l.mode[by4], b->inter_mode, bh4);
memset(&t->a->mode[bx4], b->inter_mode, bw4);
memset(&t->l.ref[0][by4], b->ref[0], bh4);
memset(&t->a->ref[0][bx4], b->ref[0], bw4);
memset(&t->l.ref[1][by4], b->ref[1], bh4);
memset(&t->a->ref[1][bx4], b->ref[1], bw4);
#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
rep_macro(type, t->dir uvmode, off, mul * DC_PRED)
case_set(cbh4, l., 1, cby4);
case_set(cbw4, a->, 0, cbx4);
#undef set_ctx
}
}
// update contexts
......@@ -1794,19 +1829,14 @@ static int decode_b(Dav1dTileContext *const t,
f->frame_hdr.segmentation.update_map)
{
uint8_t *seg_ptr = &f->cur_segmap[t->by * f->b4_stride + t->bx];
for (int y = 0; y < bh4; y++) {
memset(seg_ptr, b->seg_id, bw4);
seg_ptr += f->b4_stride;
}
}
memset(&t->l.seg_pred[by4], seg_pred, bh4);
memset(&t->a->seg_pred[bx4], seg_pred, bw4);
memset(&t->l.skip_mode[by4], b->skip_mode, bh4);
memset(&t->a->skip_mode[bx4], b->skip_mode, bw4);
memset(&t->l.intra[by4], b->intra, bh4);
memset(&t->a->intra[bx4], b->intra, bw4);
memset(&t->l.skip[by4], b->skip, bh4);
memset(&t->a->skip[bx4], b->skip, bw4);
#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
for (int y = 0; y < bh4; y++) { \
rep_macro(type, seg_ptr, 0, mul * b->seg_id); \
seg_ptr += f->b4_stride; \
}
case_set(bw4, NULL, 0, 0);
#undef set_ctx
}
if (!b->skip) {
uint16_t (*noskip_mask)[2] = &t->lf_mask->noskip_mask[by4];
const unsigned mask = (~0U >> (32 - bw4)) << (bx4 & 15);
......@@ -2081,8 +2111,11 @@ static int decode_sb(Dav1dTileContext *const t, const enum BlockLevel bl,
}
if (f->frame_thread.pass != 2 && (bp != PARTITION_SPLIT || bl == BL_8X8)) {
memset(&t->a->partition[bx8], dav1d_al_part_ctx[0][bl][bp], hsz);
memset(&t->l.partition[by8], dav1d_al_part_ctx[1][bl][bp], hsz);
#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
rep_macro(type, t->a->partition, bx8, mul * dav1d_al_part_ctx[0][bl][bp]); \
rep_macro(type, t->l.partition, by8, mul * dav1d_al_part_ctx[1][bl][bp])
case_set_upto16(hsz,,,);
#undef set_ctx
}
return 0;
......
......@@ -38,23 +38,23 @@
#include "src/tables.h"
typedef struct BlockContext {
uint8_t mode[32];
uint8_t lcoef[32];
uint8_t ccoef[2][32];
uint8_t seg_pred[32];
uint8_t skip[32];
uint8_t skip_mode[32];
uint8_t intra[32];
uint8_t comp_type[32];
int8_t ref[2][32]; // -1 means intra
uint8_t filter[2][32]; // 3 means unset
int8_t tx_intra[32];
int8_t tx[32];
uint8_t tx_lpf_y[32];
uint8_t tx_lpf_uv[32];
uint8_t partition[16];
uint8_t uvmode[32];
uint8_t pal_sz[32];
uint8_t ALIGN(mode[32], 8);
uint8_t ALIGN(lcoef[32], 8);
uint8_t ALIGN(ccoef[2][32], 8);
uint8_t ALIGN(seg_pred[32], 8);
uint8_t ALIGN(skip[32], 8);
uint8_t ALIGN(skip_mode[32], 8);
uint8_t ALIGN(intra[32], 8);
uint8_t ALIGN(comp_type[32], 8);
int8_t ALIGN(ref[2][32], 8); // -1 means intra
uint8_t ALIGN(filter[2][32], 8); // 3 means unset
int8_t ALIGN(tx_intra[32], 8);
int8_t ALIGN(tx[32], 8);
uint8_t ALIGN(tx_lpf_y[32], 8);
uint8_t ALIGN(tx_lpf_uv[32], 8);
uint8_t ALIGN(partition[16], 8);
uint8_t ALIGN(uvmode[32], 8);
uint8_t ALIGN(pal_sz[32], 8);
} BlockContext;
static inline int get_intra_ctx(const BlockContext *const a,
......
......@@ -32,6 +32,7 @@
#include "common/intops.h"
#include "src/ctx.h"
#include "src/levels.h"
#include "src/lf_mask.h"
#include "src/tables.h"
......@@ -64,12 +65,18 @@ static void decomp_tx(uint8_t (*const txa)[2 /* txsz, step */][32 /* y */][32 /*
} else {
const int lw = imin(2, t_dim->lw), lh = imin(2, t_dim->lh);
for (int y = 0; y < t_dim->h; y++) {
memset(txa[0][0][y], lw, t_dim->w);
memset(txa[1][0][y], lh, t_dim->w);
txa[0][1][y][0] = t_dim->w;
#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
for (int y = 0; y < t_dim->h; y++) { \
rep_macro(type, txa[0][0][y], off, mul * lw); \
rep_macro(type, txa[1][0][y], off, mul * lh); \
txa[0][1][y][0] = t_dim->w; \
}
memset(txa[1][1][0], t_dim->h, t_dim->w);
case_set_upto16(t_dim->w,,, 0);
#undef set_ctx
#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
rep_macro(type, txa[1][1][0], off, mul * t_dim->h)
case_set_upto16(t_dim->w,,, 0);
#undef set_ctx
}
}
......@@ -190,8 +197,20 @@ static inline void mask_edges_intra(uint16_t (*const masks)[32][3][2],
if (inner2) masks[1][by4 + y][thl4c][1] |= inner2;
}
memset(a, thl4c, w4);
memset(l, twl4c, h4);
#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
rep_macro(type, a, off, mul * thl4c)
#define default_memset(dir, diridx, off, var) \
memset(a, thl4c, var)
case_set_upto32_with_default(w4,,, 0);
#undef default_memset
#undef set_ctx
#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
rep_macro(type, l, off, mul * twl4c)
#define default_memset(dir, diridx, off, var) \
memset(l, twl4c, var)
case_set_upto32_with_default(h4,,, 0);
#undef default_memset
#undef set_ctx
}
static inline void mask_edges_chroma(uint16_t (*const masks)[32][2][2],
......@@ -249,8 +268,20 @@ static inline void mask_edges_chroma(uint16_t (*const masks)[32][2][2],
}
}
memset(a, thl4c, cw4);
memset(l, twl4c, ch4);
#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
rep_macro(type, a, off, mul * thl4c)
#define default_memset(dir, diridx, off, var) \
memset(a, thl4c, var)
case_set_upto32_with_default(cw4,,, 0);
#undef default_memset
#undef set_ctx
#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
rep_macro(type, l, off, mul * twl4c)
#define default_memset(dir, diridx, off, var) \
memset(l, twl4c, var)
case_set_upto32_with_default(ch4,,, 0);
#undef default_memset
#undef set_ctx
}
void dav1d_create_lf_mask_intra(Av1Filter *const lflvl,
......
......@@ -37,6 +37,7 @@
#include "common/mem.h"
#include "src/cdef_apply.h"
#include "src/ctx.h"
#include "src/ipred_prepare.h"
#include "src/lf_apply.h"
#include "src/lr_apply.h"
......@@ -315,10 +316,22 @@ static void read_coef_tree(Dav1dTileContext *const t,
if (DEBUG_BLOCK_INFO)
printf("Post-y-cf-blk[tx=%d,txtp=%d,eob=%d]: r=%d\n",
ytx, txtp, eob, ts->msac.rng);
memset(&t->a->lcoef[bx4], cf_ctx, imin(txw, f->bw - t->bx));
memset(&t->l.lcoef[by4], cf_ctx, imin(txh, f->bh - t->by));
for (int y = 0; y < txh; y++)
memset(&t->txtp_map[(by4 + y) * 32 + bx4], txtp, txw);
#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
rep_macro(type, t->dir lcoef, off, mul * cf_ctx)
#define default_memset(dir, diridx, off, sz) \
memset(&t->dir lcoef[off], cf_ctx, sz)
case_set_upto16_with_default(imin(txh, f->bh - t->by), l., 1, by4);
case_set_upto16_with_default(imin(txw, f->bw - t->bx), a->, 0, bx4);
#undef default_memset
#undef set_ctx
#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
for (int y = 0; y < txh; y++) { \
rep_macro(type, txtp_map, 0, mul * txtp); \
txtp_map += 32; \
}
uint8_t *txtp_map = &t->txtp_map[by4 * 32 + bx4];
case_set_upto16(txw,,,);
#undef set_ctx
if (f->frame_thread.pass == 1) {
cbi->eob[0] = eob;
cbi->txtp[0] = txtp;
......@@ -356,11 +369,18 @@ void bytefn(dav1d_read_coef_blocks)(Dav1dTileContext *const t,
(bh4 > ss_ver || t->by & 1);
if (b->skip) {
memset(&t->a->lcoef[bx4], 0x40, bw4);
memset(&t->l.lcoef[by4], 0x40, bh4);
if (has_chroma) for (int pl = 0; pl < 2; pl++) {
memset(&t->a->ccoef[pl][cbx4], 0x40, cbw4);
memset(&t->l.ccoef[pl][cby4], 0x40, cbh4);
#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
rep_macro(type, t->dir lcoef, off, mul * 0x40)
case_set(bh4, l., 1, by4);
case_set(bw4, a->, 0, bx4);
#undef set_ctx
if (has_chroma) {
#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
rep_macro(type, t->dir ccoef[0], off, mul * 0x40); \
rep_macro(type, t->dir ccoef[1], off, mul * 0x40)
case_set(cbh4, l., 1, cby4);
case_set(cbw4, a->, 0, cbx4);
#undef set_ctx
}
return;
}
......@@ -402,10 +422,16 @@ void bytefn(dav1d_read_coef_blocks)(Dav1dTileContext *const t,
b->tx, txtp, eob, ts->msac.rng);
cbi[t->bx].txtp[0] = txtp;
ts->frame_thread.cf += imin(t_dim->w, 8) * imin(t_dim->h, 8) * 16;
memset(&t->a->lcoef[bx4 + x], cf_ctx,
imin(t_dim->w, f->bw - t->bx));
memset(&t->l.lcoef[by4 + y], cf_ctx,
imin(t_dim->h, f->bh - t->by));
#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
rep_macro(type, t->dir lcoef, off, mul * cf_ctx)
#define default_memset(dir, diridx, off, sz) \
memset(&t->dir lcoef[off], cf_ctx, sz)
case_set_upto16_with_default(imin(t_dim->h, f->bh - t->by),
l., 1, by4 + y);
case_set_upto16_with_default(imin(t_dim->w, f->bw - t->bx),
a->, 0, bx4 + x);
#undef default_memset
#undef set_ctx
}
}
t->bx -= x;
......@@ -441,10 +467,18 @@ void bytefn(dav1d_read_coef_blocks)(Dav1dTileContext *const t,
pl, b->uvtx, txtp, eob, ts->msac.rng);
cbi[t->bx].txtp[1 + pl] = txtp;
ts->frame_thread.cf += uv_t_dim->w * uv_t_dim->h * 16;
memset(&t->a->ccoef[pl][cbx4 + x], cf_ctx,
imin(uv_t_dim->w, (f->bw - t->bx + ss_hor) >> ss_hor));
memset(&t->l.ccoef[pl][cby4 + y], cf_ctx,
imin(uv_t_dim->h, (f->bh - t->by + ss_ver) >> ss_ver));
#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
rep_macro(type, t->dir ccoef[pl], off, mul * cf_ctx)
#define default_memset(dir, diridx, off, sz) \
memset(&t->dir ccoef[pl][off], cf_ctx, sz)
case_set_upto16_with_default( \
imin(uv_t_dim->h, (f->bh - t->by + ss_ver) >> ss_ver),
l., 1, cby4 + y);
case_set_upto16_with_default( \
imin(uv_t_dim->w, (f->bw - t->bx + ss_hor) >> ss_hor),
a->, 0, cbx4 + x);
#undef default_memset
#undef set_ctx
}
t->bx -= x << ss_hor;
}
......@@ -763,10 +797,16 @@ void bytefn(dav1d_recon_b_intra)(Dav1dTileContext *const t, const enum BlockSize
if (DEBUG_BLOCK_INFO)
printf("Post-y-cf-blk[tx=%d,txtp=%d,eob=%d]: r=%d\n",
b->tx, txtp, eob, ts->msac.rng);
memset(&t->a->lcoef[bx4 + x], cf_ctx,
imin(t_dim->w, f->bw - t->bx));
memset(&t->l.lcoef[by4 + y], cf_ctx,
imin(t_dim->h, f->bh - t->by));
#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
rep_macro(type, t->dir lcoef, off, mul * cf_ctx)
#define default_memset(dir, diridx, off, sz) \
memset(&t->dir lcoef[off], cf_ctx, sz)
case_set_upto16_with_default(imin(t_dim->h, f->bh - t->by), \
l., 1, by4 + y);
case_set_upto16_with_default(imin(t_dim->w, f->bw - t->bx), \
a->, 0, bx4 + x);
#undef default_memset
#undef set_ctx
}
if (eob >= 0) {
if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
......@@ -781,8 +821,11 @@ void bytefn(dav1d_recon_b_intra)(Dav1dTileContext *const t, const enum BlockSize
t_dim->w * 4, t_dim->h * 4, "recon");
}
} else if (!f->frame_thread.pass) {
memset(&t->a->lcoef[bx4 + x], 0x40, t_dim->w);
memset(&t->l.lcoef[by4 + y], 0x40, t_dim->h);
#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
rep_macro(type, t->dir lcoef, off, mul * 0x40)
case_set_upto16(t_dim->h, l., 1, by4 + y);
case_set_upto16(t_dim->w, a->, 0, bx4 + x);
#undef set_ctx
}
dst += 4 * t_dim->w;
}
......@@ -970,10 +1013,18 @@ void bytefn(dav1d_recon_b_intra)(Dav1dTileContext *const t, const enum BlockSize
printf("Post-uv-cf-blk[pl=%d,tx=%d,"
"txtp=%d,eob=%d]: r=%d [x=%d,cbx4=%d]\n",
pl, b->uvtx, txtp, eob, ts->msac.rng, x, cbx4);
memset(&t->a->ccoef[pl][cbx4 + x], cf_ctx,
imin(uv_t_dim->w, (f->bw - t->bx + ss_hor) >> ss_hor));
memset(&t->l.ccoef[pl][cby4 + y], cf_ctx,
imin(uv_t_dim->h, (f->bh - t->by + ss_ver) >> ss_ver));
#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
rep_macro(type, t->dir ccoef[pl], off, mul * cf_ctx)
#define default_memset(dir, diridx, off, sz) \
memset(&t->dir ccoef[pl][off], cf_ctx, sz)
case_set_upto16_with_default( \
imin(uv_t_dim->h, (f->bh - t->by + ss_ver) >> ss_ver),
l., 1, cby4 + y);
case_set_upto16_with_default( \
imin(uv_t_dim->w, (f->bw - t->bx + ss_hor) >> ss_hor),
a->, 0, cbx4 + x);
#undef default_memset
#undef set_ctx
}
if (eob >= 0) {
if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
......@@ -987,8 +1038,11 @@ void bytefn(dav1d_recon_b_intra)(Dav1dTileContext *const t, const enum BlockSize
uv_t_dim->h * 4, "recon");
}
} else if (!f->frame_thread.pass) {
memset(&t->a->ccoef[pl][cbx4 + x], 0x40, uv_t_dim->w);
memset(&t->l.ccoef[pl][cby4 + y], 0x40, uv_t_dim->h);
#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
rep_macro(type, t->dir ccoef[pl], off, mul * 0x40)
case_set_upto16(uv_t_dim->h, l., 1, cby4 + y);