Commit 989057fb, authored and committed by Henrik Gramner

### Optimize non-qmatrix coefficient decoding

```
Not having a quantizer matrix is the most common case, so it's
worth having a separate code path for it that eliminates some
calculations and table lookups.

Without a qm, not only can we skip calculating dq * qm, but only
Exp-Golomb-coded coefficients will have the potential to overflow,
so we can also skip clipping for the vast majority of coefficients.
```
parent a92e307f
 ... ... @@ -2966,15 +2966,13 @@ int dav1d_decode_frame(Dav1dFrameContext *const f) { // setup dequant tables init_quant_tables(f->seq_hdr, f->frame_hdr, f->frame_hdr->quant.yac, f->dq); if (f->frame_hdr->quant.qm) for (int j = 0; j < N_RECT_TX_SIZES; j++) { f->qm[0][j][0] = dav1d_qm_tbl[f->frame_hdr->quant.qm_y][0][j]; f->qm[0][j][1] = dav1d_qm_tbl[f->frame_hdr->quant.qm_u][1][j]; f->qm[0][j][2] = dav1d_qm_tbl[f->frame_hdr->quant.qm_v][1][j]; } for (int i = f->frame_hdr->quant.qm; i < 2; i++) for (int tx = 0; tx < N_RECT_TX_SIZES; tx++) for (int pl = 0; pl < 3; pl++) f->qm[i][tx][pl] = dav1d_qm_tbl[15][!!pl][tx]; for (int i = 0; i < N_RECT_TX_SIZES; i++) { f->qm[i][0] = dav1d_qm_tbl[f->frame_hdr->quant.qm_y][0][i]; f->qm[i][1] = dav1d_qm_tbl[f->frame_hdr->quant.qm_u][1][i]; f->qm[i][2] = dav1d_qm_tbl[f->frame_hdr->quant.qm_v][1][i]; } else memset(f->qm, 0, sizeof(f->qm)); // setup jnt_comp weights if (f->frame_hdr->switchable_comp_refs) { ... ...
 ... ... @@ -29,7 +29,7 @@ #include "src/dequant_tables.h" const uint16_t dav1d_dq_tbl[][QINDEX_RANGE][2] = { const uint16_t dav1d_dq_tbl[3][QINDEX_RANGE][2] = { { { 4, 4, }, { 8, 8, }, { 8, 9, }, { 9, 10, }, { 10, 11, }, { 11, 12, }, { 12, 13, }, { 12, 14, }, ... ...
 ... ... @@ -32,6 +32,6 @@ #include "src/levels.h" extern const uint16_t dav1d_dq_tbl[][QINDEX_RANGE][2]; extern const uint16_t dav1d_dq_tbl[3][QINDEX_RANGE][2]; #endif /* DAV1D_SRC_DEQUANT_TABLES_H */
 ... ... @@ -208,7 +208,7 @@ struct Dav1dFrameContext { ptrdiff_t b4_stride; int w4, h4, bw, bh, sb128w, sb128h, sbh, sb_shift, sb_step, sr_sb128w; uint16_t dq[DAV1D_MAX_SEGMENTS][3 /* plane */][2 /* dc/ac */]; const uint8_t *qm[2 /* is_1d */][N_RECT_TX_SIZES][3 /* plane */]; const uint8_t *qm[N_RECT_TX_SIZES][3 /* plane */]; BlockContext *a; int a_sz /* w*tile_rows */; refmvs_frame rf; ... ...
 ... ... @@ -3066,7 +3066,6 @@ static const uint8_t qm_tbl_32x32_t[][2][528] = { }; const uint8_t *dav1d_qm_tbl[16][2][N_RECT_TX_SIZES]; static uint8_t pb_32x32[32 * 32]; static uint8_t qm_tbl_4x4[15][2][16]; static uint8_t qm_tbl_4x8[15][2][32]; static uint8_t qm_tbl_4x16[15][2][64]; ... ... @@ -3145,8 +3144,5 @@ COLD void dav1d_init_qm_tables(void) { dav1d_qm_tbl[i][j][RTX_16X64] = dav1d_qm_tbl[i][j][RTX_16X32]; } memset(pb_32x32, 32, sizeof(pb_32x32)); for (int j = 0; j < 2; j++) for (int k = 0; k < N_RECT_TX_SIZES; k++) dav1d_qm_tbl[15][j][k] = pb_32x32; // dav1d_qm_tbl[15][*][*] == NULL }
 ... ... @@ -581,22 +581,30 @@ static int decode_coefs(Dav1dTileContext *const t, // residual and sign const uint16_t *const dq_tbl = ts->dq[b->seg_id][plane]; const uint8_t *const qm_tbl = f->qm[lossless || is_1d || *txtp == IDTX][tx][plane]; const uint8_t *const qm_tbl = *txtp < IDTX ? f->qm[tx][plane] : NULL; const int dq_shift = imax(0, t_dim->ctx - 2); const unsigned cf_max = ~(~127U << (BITDEPTH == 8 ? 8 : f->cur.p.bpc)); unsigned dc_sign = 1 << 6; unsigned cul_level; unsigned cul_level, dc_sign_level; if (dc_tok) { // dc const int dc_sign_ctx = get_dc_sign_ctx(tx, a, l); uint16_t *const dc_sign_cdf = ts->cdf.coef.dc_sign[chroma][dc_sign_ctx]; const int sign = dav1d_msac_decode_bool_adapt(&ts->msac, dc_sign_cdf); if (dbg) printf("Post-dc_sign[%d][%d][%d]: r=%d\n", chroma, dc_sign_ctx, sign, ts->msac.rng); if (!dc_tok) { cul_level = 0; dc_sign_level = 1 << 6; if (qm_tbl) goto ac_qm; goto ac_noqm; } const int dc_sign_ctx = get_dc_sign_ctx(tx, a, l); uint16_t *const dc_sign_cdf = ts->cdf.coef.dc_sign[chroma][dc_sign_ctx]; const int dc_sign = dav1d_msac_decode_bool_adapt(&ts->msac, dc_sign_cdf); if (dbg) printf("Post-dc_sign[%d][%d][%d]: r=%d\n", chroma, dc_sign_ctx, dc_sign, ts->msac.rng); unsigned dq = (dq_tbl[0] * qm_tbl[0] + 16) >> 5; dc_sign = (sign - 1) & (2 << 6); unsigned dc_dq = dq_tbl[0]; dc_sign_level = (dc_sign - 1) & (2 << 6); if (qm_tbl) { dc_dq = (dc_dq * qm_tbl[0] + 16) >> 5; if (dc_tok == 15) { dc_tok = read_golomb(&ts->msac) + 15; ... ... 
@@ -605,46 +613,100 @@ static int decode_coefs(Dav1dTileContext *const t, dc_tok - 15, dc_tok, ts->msac.rng); dc_tok &= 0xfffff; dc_dq = (dc_dq * dc_tok) & 0xffffff; } else { dc_dq *= dc_tok; assert(dc_dq <= 0xffffff); } cul_level = dc_tok; dc_dq >>= dq_shift; cf[0] = (coef) (umin(dc_dq - dc_sign, cf_max) ^ -dc_sign); if (rc) ac_qm: { const unsigned ac_dq = dq_tbl[1]; do { const int sign = dav1d_msac_decode_bool_equi(&ts->msac); if (dbg) printf("Post-sign[%d=%d]: r=%d\n", rc, sign, ts->msac.rng); const unsigned rc_tok = cf[rc]; unsigned tok, dq = (ac_dq * qm_tbl[rc] + 16) >> 5; if (rc_tok >= (15 << 11)) { tok = read_golomb(&ts->msac) + 15; if (dbg) printf("Post-residual[%d=%d->%d]: r=%d\n", rc, tok - 15, tok, ts->msac.rng); tok &= 0xfffff; dq = (dq * tok) & 0xffffff; } else { tok = rc_tok >> 11; dq *= tok; assert(dq <= 0xffffff); } cul_level += tok; dq >>= dq_shift; cf[rc] = (coef) (umin(dq - sign, cf_max) ^ -sign); dq = ((dq * dc_tok) & 0xffffff) >> dq_shift; cf[0] = imin(dq - sign, cf_max) ^ -sign; } cul_level = dc_tok; if (rc) { // ac const unsigned ac_dq = dq_tbl[1]; do { const int sign = dav1d_msac_decode_bool_equi(&ts->msac); rc = rc_tok & 0x3ff; } while (rc); } } else { // non-qmatrix is the common case and allows for additional optimizations if (dc_tok == 15) { dc_tok = read_golomb(&ts->msac) + 15; if (dbg) printf("Post-sign[%d=%d]: r=%d\n", rc, sign, ts->msac.rng); printf("Post-dc_residual[%d->%d]: r=%d\n", dc_tok - 15, dc_tok, ts->msac.rng); const unsigned rc_tok = cf[rc]; unsigned tok, dq = (ac_dq * qm_tbl[rc] + 16) >> 5; dc_tok &= 0xfffff; dc_dq = ((dc_dq * dc_tok) & 0xffffff) >> dq_shift; dc_dq = umin(dc_dq - dc_sign, cf_max); } else { dc_dq = ((dc_dq * dc_tok) >> dq_shift) - dc_sign; assert(dc_dq <= cf_max); } cul_level = dc_tok; cf[0] = (coef) (dc_dq ^ -dc_sign); // residual if (rc_tok >= (15 << 11)) { tok = read_golomb(&ts->msac) + 15; if (rc) ac_noqm: { const unsigned ac_dq = dq_tbl[1]; do { const int sign = dav1d_msac_decode_bool_equi(&ts->msac); 
if (dbg) printf("Post-residual[%d=%d->%d]: r=%d\n", rc, tok - 15, tok, ts->msac.rng); // coefficient parsing, see 5.11.39 tok &= 0xfffff; } else { tok = rc_tok >> 11; } // dequant, see 7.12.3 cul_level += tok; dq = ((dq * tok) & 0xffffff) >> dq_shift; cf[rc] = imin(dq - sign, cf_max) ^ -sign; printf("Post-sign[%d=%d]: r=%d\n", rc, sign, ts->msac.rng); const unsigned rc_tok = cf[rc]; unsigned tok, dq; // residual if (rc_tok >= (15 << 11)) { tok = read_golomb(&ts->msac) + 15; if (dbg) printf("Post-residual[%d=%d->%d]: r=%d\n", rc, tok - 15, tok, ts->msac.rng); // coefficient parsing, see 5.11.39 tok &= 0xfffff; // dequant, see 7.12.3 dq = ((ac_dq * tok) & 0xffffff) >> dq_shift; dq = umin(dq - sign, cf_max); } else { // cannot exceed cf_max, so we can avoid the clipping tok = rc_tok >> 11; dq = ((ac_dq * tok) >> dq_shift) - sign; assert(dq <= cf_max); } cul_level += tok; cf[rc] = (coef) (dq ^ -sign); rc = rc_tok & 0x3ff; // next non-zero rc, zero if eob } while (rc); rc = rc_tok & 0x3ff; // next non-zero rc, zero if eob } while (rc); } } // context *res_ctx = umin(cul_level, 63) | dc_sign; *res_ctx = umin(cul_level, 63) | dc_sign_level; return eob; } ... ...
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!