Commit 989057fb, authored and committed by Henrik Gramner
Browse files

Optimize non-qmatrix coefficient decoding

Not having a quantizer matrix is the most common case, so it's
worth having a separate code path for it that eliminates some
calculations and table lookups.

Without a qm, not only can we skip calculating dq * qm, but only
Exp-Golomb-coded coefficients will have the potential to overflow,
so we can also skip clipping for the vast majority of coefficients.
parent a92e307f
......@@ -2966,15 +2966,13 @@ int dav1d_decode_frame(Dav1dFrameContext *const f) {
// setup dequant tables
init_quant_tables(f->seq_hdr, f->frame_hdr, f->frame_hdr->quant.yac, f->dq);
if (f->frame_hdr->quant.qm)
for (int j = 0; j < N_RECT_TX_SIZES; j++) {
f->qm[0][j][0] = dav1d_qm_tbl[f->frame_hdr->quant.qm_y][0][j];
f->qm[0][j][1] = dav1d_qm_tbl[f->frame_hdr->quant.qm_u][1][j];
f->qm[0][j][2] = dav1d_qm_tbl[f->frame_hdr->quant.qm_v][1][j];
}
for (int i = f->frame_hdr->quant.qm; i < 2; i++)
for (int tx = 0; tx < N_RECT_TX_SIZES; tx++)
for (int pl = 0; pl < 3; pl++)
f->qm[i][tx][pl] = dav1d_qm_tbl[15][!!pl][tx];
for (int i = 0; i < N_RECT_TX_SIZES; i++) {
f->qm[i][0] = dav1d_qm_tbl[f->frame_hdr->quant.qm_y][0][i];
f->qm[i][1] = dav1d_qm_tbl[f->frame_hdr->quant.qm_u][1][i];
f->qm[i][2] = dav1d_qm_tbl[f->frame_hdr->quant.qm_v][1][i];
}
else
memset(f->qm, 0, sizeof(f->qm));
// setup jnt_comp weights
if (f->frame_hdr->switchable_comp_refs) {
......
......@@ -29,7 +29,7 @@
#include "src/dequant_tables.h"
const uint16_t dav1d_dq_tbl[][QINDEX_RANGE][2] = {
const uint16_t dav1d_dq_tbl[3][QINDEX_RANGE][2] = {
{
{ 4, 4, }, { 8, 8, }, { 8, 9, }, { 9, 10, },
{ 10, 11, }, { 11, 12, }, { 12, 13, }, { 12, 14, },
......
......@@ -32,6 +32,6 @@
#include "src/levels.h"
extern const uint16_t dav1d_dq_tbl[][QINDEX_RANGE][2];
extern const uint16_t dav1d_dq_tbl[3][QINDEX_RANGE][2];
#endif /* DAV1D_SRC_DEQUANT_TABLES_H */
......@@ -208,7 +208,7 @@ struct Dav1dFrameContext {
ptrdiff_t b4_stride;
int w4, h4, bw, bh, sb128w, sb128h, sbh, sb_shift, sb_step, sr_sb128w;
uint16_t dq[DAV1D_MAX_SEGMENTS][3 /* plane */][2 /* dc/ac */];
const uint8_t *qm[2 /* is_1d */][N_RECT_TX_SIZES][3 /* plane */];
const uint8_t *qm[N_RECT_TX_SIZES][3 /* plane */];
BlockContext *a;
int a_sz /* w*tile_rows */;
refmvs_frame rf;
......
......@@ -3066,7 +3066,6 @@ static const uint8_t qm_tbl_32x32_t[][2][528] = {
};
const uint8_t *dav1d_qm_tbl[16][2][N_RECT_TX_SIZES];
static uint8_t pb_32x32[32 * 32];
static uint8_t qm_tbl_4x4[15][2][16];
static uint8_t qm_tbl_4x8[15][2][32];
static uint8_t qm_tbl_4x16[15][2][64];
......@@ -3145,8 +3144,5 @@ COLD void dav1d_init_qm_tables(void) {
dav1d_qm_tbl[i][j][RTX_16X64] = dav1d_qm_tbl[i][j][RTX_16X32];
}
memset(pb_32x32, 32, sizeof(pb_32x32));
for (int j = 0; j < 2; j++)
for (int k = 0; k < N_RECT_TX_SIZES; k++)
dav1d_qm_tbl[15][j][k] = pb_32x32;
// dav1d_qm_tbl[15][*][*] == NULL
}
......@@ -581,22 +581,30 @@ static int decode_coefs(Dav1dTileContext *const t,
// residual and sign
const uint16_t *const dq_tbl = ts->dq[b->seg_id][plane];
const uint8_t *const qm_tbl = f->qm[lossless || is_1d || *txtp == IDTX][tx][plane];
const uint8_t *const qm_tbl = *txtp < IDTX ? f->qm[tx][plane] : NULL;
const int dq_shift = imax(0, t_dim->ctx - 2);
const unsigned cf_max = ~(~127U << (BITDEPTH == 8 ? 8 : f->cur.p.bpc));
unsigned dc_sign = 1 << 6;
unsigned cul_level;
unsigned cul_level, dc_sign_level;
if (dc_tok) { // dc
const int dc_sign_ctx = get_dc_sign_ctx(tx, a, l);
uint16_t *const dc_sign_cdf = ts->cdf.coef.dc_sign[chroma][dc_sign_ctx];
const int sign = dav1d_msac_decode_bool_adapt(&ts->msac, dc_sign_cdf);
if (dbg)
printf("Post-dc_sign[%d][%d][%d]: r=%d\n",
chroma, dc_sign_ctx, sign, ts->msac.rng);
if (!dc_tok) {
cul_level = 0;
dc_sign_level = 1 << 6;
if (qm_tbl) goto ac_qm;
goto ac_noqm;
}
const int dc_sign_ctx = get_dc_sign_ctx(tx, a, l);
uint16_t *const dc_sign_cdf = ts->cdf.coef.dc_sign[chroma][dc_sign_ctx];
const int dc_sign = dav1d_msac_decode_bool_adapt(&ts->msac, dc_sign_cdf);
if (dbg)
printf("Post-dc_sign[%d][%d][%d]: r=%d\n",
chroma, dc_sign_ctx, dc_sign, ts->msac.rng);
unsigned dq = (dq_tbl[0] * qm_tbl[0] + 16) >> 5;
dc_sign = (sign - 1) & (2 << 6);
unsigned dc_dq = dq_tbl[0];
dc_sign_level = (dc_sign - 1) & (2 << 6);
if (qm_tbl) {
dc_dq = (dc_dq * qm_tbl[0] + 16) >> 5;
if (dc_tok == 15) {
dc_tok = read_golomb(&ts->msac) + 15;
......@@ -605,46 +613,100 @@ static int decode_coefs(Dav1dTileContext *const t,
dc_tok - 15, dc_tok, ts->msac.rng);
dc_tok &= 0xfffff;
dc_dq = (dc_dq * dc_tok) & 0xffffff;
} else {
dc_dq *= dc_tok;
assert(dc_dq <= 0xffffff);
}
cul_level = dc_tok;
dc_dq >>= dq_shift;
cf[0] = (coef) (umin(dc_dq - dc_sign, cf_max) ^ -dc_sign);
if (rc) ac_qm: {
const unsigned ac_dq = dq_tbl[1];
do {
const int sign = dav1d_msac_decode_bool_equi(&ts->msac);
if (dbg)
printf("Post-sign[%d=%d]: r=%d\n", rc, sign, ts->msac.rng);
const unsigned rc_tok = cf[rc];
unsigned tok, dq = (ac_dq * qm_tbl[rc] + 16) >> 5;
if (rc_tok >= (15 << 11)) {
tok = read_golomb(&ts->msac) + 15;
if (dbg)
printf("Post-residual[%d=%d->%d]: r=%d\n",
rc, tok - 15, tok, ts->msac.rng);
tok &= 0xfffff;
dq = (dq * tok) & 0xffffff;
} else {
tok = rc_tok >> 11;
dq *= tok;
assert(dq <= 0xffffff);
}
cul_level += tok;
dq >>= dq_shift;
cf[rc] = (coef) (umin(dq - sign, cf_max) ^ -sign);
dq = ((dq * dc_tok) & 0xffffff) >> dq_shift;
cf[0] = imin(dq - sign, cf_max) ^ -sign;
}
cul_level = dc_tok;
if (rc) { // ac
const unsigned ac_dq = dq_tbl[1];
do {
const int sign = dav1d_msac_decode_bool_equi(&ts->msac);
rc = rc_tok & 0x3ff;
} while (rc);
}
} else {
// non-qmatrix is the common case and allows for additional optimizations
if (dc_tok == 15) {
dc_tok = read_golomb(&ts->msac) + 15;
if (dbg)
printf("Post-sign[%d=%d]: r=%d\n", rc, sign, ts->msac.rng);
printf("Post-dc_residual[%d->%d]: r=%d\n",
dc_tok - 15, dc_tok, ts->msac.rng);
const unsigned rc_tok = cf[rc];
unsigned tok, dq = (ac_dq * qm_tbl[rc] + 16) >> 5;
dc_tok &= 0xfffff;
dc_dq = ((dc_dq * dc_tok) & 0xffffff) >> dq_shift;
dc_dq = umin(dc_dq - dc_sign, cf_max);
} else {
dc_dq = ((dc_dq * dc_tok) >> dq_shift) - dc_sign;
assert(dc_dq <= cf_max);
}
cul_level = dc_tok;
cf[0] = (coef) (dc_dq ^ -dc_sign);
// residual
if (rc_tok >= (15 << 11)) {
tok = read_golomb(&ts->msac) + 15;
if (rc) ac_noqm: {
const unsigned ac_dq = dq_tbl[1];
do {
const int sign = dav1d_msac_decode_bool_equi(&ts->msac);
if (dbg)
printf("Post-residual[%d=%d->%d]: r=%d\n",
rc, tok - 15, tok, ts->msac.rng);
// coefficient parsing, see 5.11.39
tok &= 0xfffff;
} else {
tok = rc_tok >> 11;
}
// dequant, see 7.12.3
cul_level += tok;
dq = ((dq * tok) & 0xffffff) >> dq_shift;
cf[rc] = imin(dq - sign, cf_max) ^ -sign;
printf("Post-sign[%d=%d]: r=%d\n", rc, sign, ts->msac.rng);
const unsigned rc_tok = cf[rc];
unsigned tok, dq;
// residual
if (rc_tok >= (15 << 11)) {
tok = read_golomb(&ts->msac) + 15;
if (dbg)
printf("Post-residual[%d=%d->%d]: r=%d\n",
rc, tok - 15, tok, ts->msac.rng);
// coefficient parsing, see 5.11.39
tok &= 0xfffff;
// dequant, see 7.12.3
dq = ((ac_dq * tok) & 0xffffff) >> dq_shift;
dq = umin(dq - sign, cf_max);
} else {
// cannot exceed cf_max, so we can avoid the clipping
tok = rc_tok >> 11;
dq = ((ac_dq * tok) >> dq_shift) - sign;
assert(dq <= cf_max);
}
cul_level += tok;
cf[rc] = (coef) (dq ^ -sign);
rc = rc_tok & 0x3ff; // next non-zero rc, zero if eob
} while (rc);
rc = rc_tok & 0x3ff; // next non-zero rc, zero if eob
} while (rc);
}
}
// context
*res_ctx = umin(cul_level, 63) | dc_sign;
*res_ctx = umin(cul_level, 63) | dc_sign_level;
return eob;
}
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment