Commit e3b5d4d0 authored by Ronald S. Bultje

Use grouped context setting

Decreases runtime of decoding first 1000 frames of Chimera (1080p, 8bit)
from 12.227 to 12.075s (average of 6 runs) after changing decode.c, and
further down to 12.027s (1.67%) with the changes to recon_tmpl.c included.
After the changes to lf_mask.c, it goes down to 11.842s.
parent 01386d4c
......@@ -32,6 +32,12 @@
#include <stddef.h>
/*
 * ATTR_ALIAS: type attribute for types that are used to access storage
 * declared with a different type. Compilers that define __GNUC__ get
 * __attribute__((may_alias)); for every other compiler the macro expands
 * to nothing.
 */
#ifndef __GNUC__
#define ATTR_ALIAS
#else
#define ATTR_ALIAS __attribute__((may_alias))
#endif
#if ARCH_X86
#define ALIGN_32_VAL 32
#define ALIGN_16_VAL 16
......
/*
* Copyright © 2018, VideoLAN and dav1d authors
* Copyright © 2018, Two Orioles, LLC
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __DAV1D_SRC_CTX_H__
#define __DAV1D_SRC_CTX_H__
#include <stdint.h>
#include "common/attributes.h"
/*
 * Overlay types used by the set_ctx_rep* macros to write 8, 4, 2 or 1
 * byte(s) of a context array with a single store. The multi-byte variants
 * carry ATTR_ALIAS; alias8 only wraps a single byte and does not
 * (presumably because byte-sized accesses need no aliasing exemption).
 */
union alias64 {
    uint64_t u64;
    uint8_t u8[8];
} ATTR_ALIAS;

union alias32 {
    uint32_t u32;
    uint8_t u8[4];
} ATTR_ALIAS;

union alias16 {
    uint16_t u16;
    uint8_t u8[2];
} ATTR_ALIAS;

union alias8 {
    uint8_t u8;
};
/*
 * set_ctx_rep4(type, var, off, val)
 *
 * Store the 64-bit pattern `val` four times, at element offsets off + 0,
 * off + 8, off + 16 and off + 24 of `var` (i.e. 32 consecutive bytes when
 * `var` has one-byte elements).
 *
 *   type - unused; present so that all set_ctx_rep* macros can be invoked
 *          with the same argument list.
 *   var  - array or pointer expression that is written to.
 *   off  - index of the first element to write.
 *   val  - value to store; converted to uint64_t and evaluated exactly once.
 *
 * All arguments are parenthesized in the expansion, so expressions such as
 * `ptr + 8` or `cond ? 8 : 0` are safe to pass. `var` and `off` are expanded
 * once per store and must therefore be free of side effects.
 *
 * The stores go through `union alias64 *`; callers are expected to pass
 * storage that is suitably aligned for a 64-bit access.
 */
#define set_ctx_rep4(type, var, off, val) do { \
    const uint64_t set_ctx_rep4_val_ = (val); \
    ((union alias64 *) &(var)[(off) +  0])->u64 = set_ctx_rep4_val_; \
    ((union alias64 *) &(var)[(off) +  8])->u64 = set_ctx_rep4_val_; \
    ((union alias64 *) &(var)[(off) + 16])->u64 = set_ctx_rep4_val_; \
    ((union alias64 *) &(var)[(off) + 24])->u64 = set_ctx_rep4_val_; \
} while (0)
/*
 * set_ctx_rep2(type, var, off, val)
 *
 * Store the 64-bit pattern `val` twice, at element offsets off + 0 and
 * off + 8 of `var` (i.e. 16 consecutive bytes when `var` has one-byte
 * elements).
 *
 *   type - unused; present so that all set_ctx_rep* macros can be invoked
 *          with the same argument list.
 *   var  - array or pointer expression that is written to.
 *   off  - index of the first element to write.
 *   val  - value to store; converted to uint64_t and evaluated exactly once.
 *
 * All arguments are parenthesized in the expansion, so expressions such as
 * `ptr + 8` or `cond ? 8 : 0` are safe to pass. `var` and `off` are expanded
 * once per store and must therefore be free of side effects.
 *
 * The stores go through `union alias64 *`; callers are expected to pass
 * storage that is suitably aligned for a 64-bit access.
 */
#define set_ctx_rep2(type, var, off, val) do { \
    const uint64_t set_ctx_rep2_val_ = (val); \
    ((union alias64 *) &(var)[(off) + 0])->u64 = set_ctx_rep2_val_; \
    ((union alias64 *) &(var)[(off) + 8])->u64 = set_ctx_rep2_val_; \
} while (0)
/*
 * set_ctx_rep1(typesz, var, off, val)
 *
 * Perform a single store of `val`, `typesz` bits wide, at element offset
 * `off` of `var`.
 *
 *   typesz - store width in bits: 8, 16, 32 or 64. It is token-pasted to
 *            select `union alias<typesz>` and its `u<typesz>` member.
 *   var    - array or pointer expression that is written to.
 *   off    - index of the element at which the store starts.
 *   val    - value to store (converted to the member's type).
 *
 * Every argument and the expansion as a whole are parenthesized, so
 * expressions such as `ptr + 4` are safe to pass and the macro can be used
 * wherever an expression is allowed. Callers are expected to pass storage
 * that is suitably aligned for the chosen width.
 */
#define set_ctx_rep1(typesz, var, off, val) \
    (((union alias##typesz *) &(var)[(off)])->u##typesz = (val))
/*
 * case_set(len, dir, diridx, pos)
 *
 * Dispatch on the run length `len` and invoke the set_ctx() macro, which
 * the including code must #define before using this macro, with the store
 * width, a byte-replication multiplier (0x01, 0x0101, ...) and the
 * set_ctx_rep* helper that covers `len` entries:
 *
 *    1, 2, 4, 8 -> one 8/16/32/64-bit store (set_ctx_rep1)
 *    16         -> two 64-bit stores        (set_ctx_rep2)
 *    32         -> four 64-bit stores       (set_ctx_rep4)
 *
 * `dir`, `diridx` and `pos` are forwarded to set_ctx() untouched. Any other
 * value of `len` matches no case, so nothing is executed.
 */
#define case_set(len, dir, diridx, pos) \
    switch (len) { \
    case 1: \
        set_ctx(8, dir, diridx, pos, 0x01, set_ctx_rep1); \
        break; \
    case 2: \
        set_ctx(16, dir, diridx, pos, 0x0101, set_ctx_rep1); \
        break; \
    case 4: \
        set_ctx(32, dir, diridx, pos, 0x01010101U, set_ctx_rep1); \
        break; \
    case 8: \
        set_ctx(64, dir, diridx, pos, 0x0101010101010101ULL, set_ctx_rep1); \
        break; \
    case 16: \
        set_ctx( , dir, diridx, pos, 0x0101010101010101ULL, set_ctx_rep2); \
        break; \
    case 32: \
        set_ctx( , dir, diridx, pos, 0x0101010101010101ULL, set_ctx_rep4); \
        break; \
    }
/*
 * case_set_upto16(len, dir, diridx, pos)
 *
 * Variant of the run-length dispatch that only handles lengths 1, 2, 4, 8
 * and 16. For each of them it invokes the set_ctx() macro, which the
 * including code must #define beforehand, with the store width, a
 * byte-replication multiplier and the matching set_ctx_rep* helper
 * (set_ctx_rep1 for 1..8, set_ctx_rep2 for 16).
 *
 * `dir`, `diridx` and `pos` are forwarded to set_ctx() untouched. Any other
 * value of `len` (including 32) matches no case, so nothing is executed.
 */
#define case_set_upto16(len, dir, diridx, pos) \
    switch (len) { \
    case 1: \
        set_ctx(8, dir, diridx, pos, 0x01, set_ctx_rep1); \
        break; \
    case 2: \
        set_ctx(16, dir, diridx, pos, 0x0101, set_ctx_rep1); \
        break; \
    case 4: \
        set_ctx(32, dir, diridx, pos, 0x01010101U, set_ctx_rep1); \
        break; \
    case 8: \
        set_ctx(64, dir, diridx, pos, 0x0101010101010101ULL, set_ctx_rep1); \
        break; \
    case 16: \
        set_ctx( , dir, diridx, pos, 0x0101010101010101ULL, set_ctx_rep2); \
        break; \
    }
/*
 * case_set_upto32_with_default(len, dir, diridx, pos)
 *
 * Run-length dispatch for lengths 1, 2, 4, 8, 16 and 32 with a fallback.
 * For the listed lengths it invokes set_ctx() with the store width, a
 * byte-replication multiplier and the matching set_ctx_rep* helper
 * (set_ctx_rep1 for 1..8, set_ctx_rep2 for 16, set_ctx_rep4 for 32).
 * Every other value of `len` is handed to default_memset(dir, diridx, pos,
 * len) instead.
 *
 * Both set_ctx() and default_memset() must be #defined by the including
 * code before this macro is used; `dir`, `diridx` and `pos` are forwarded
 * to them untouched.
 */
#define case_set_upto32_with_default(len, dir, diridx, pos) \
    switch (len) { \
    case 1: \
        set_ctx(8, dir, diridx, pos, 0x01, set_ctx_rep1); \
        break; \
    case 2: \
        set_ctx(16, dir, diridx, pos, 0x0101, set_ctx_rep1); \
        break; \
    case 4: \
        set_ctx(32, dir, diridx, pos, 0x01010101U, set_ctx_rep1); \
        break; \
    case 8: \
        set_ctx(64, dir, diridx, pos, 0x0101010101010101ULL, set_ctx_rep1); \
        break; \
    case 16: \
        set_ctx( , dir, diridx, pos, 0x0101010101010101ULL, set_ctx_rep2); \
        break; \
    case 32: \
        set_ctx( , dir, diridx, pos, 0x0101010101010101ULL, set_ctx_rep4); \
        break; \
    default: \
        default_memset(dir, diridx, pos, len); \
        break; \
    }
/*
 * case_set_upto16_with_default(len, dir, diridx, pos)
 *
 * Run-length dispatch for lengths 1, 2, 4, 8 and 16 with a fallback.
 * For the listed lengths it invokes set_ctx() with the store width, a
 * byte-replication multiplier and the matching set_ctx_rep* helper
 * (set_ctx_rep1 for 1..8, set_ctx_rep2 for 16). Every other value of `len`
 * (including 32) is handed to default_memset(dir, diridx, pos, len)
 * instead.
 *
 * Both set_ctx() and default_memset() must be #defined by the including
 * code before this macro is used; `dir`, `diridx` and `pos` are forwarded
 * to them untouched.
 */
#define case_set_upto16_with_default(len, dir, diridx, pos) \
    switch (len) { \
    case 1: \
        set_ctx(8, dir, diridx, pos, 0x01, set_ctx_rep1); \
        break; \
    case 2: \
        set_ctx(16, dir, diridx, pos, 0x0101, set_ctx_rep1); \
        break; \
    case 4: \
        set_ctx(32, dir, diridx, pos, 0x01010101U, set_ctx_rep1); \
        break; \
    case 8: \
        set_ctx(64, dir, diridx, pos, 0x0101010101010101ULL, set_ctx_rep1); \
        break; \
    case 16: \
        set_ctx( , dir, diridx, pos, 0x0101010101010101ULL, set_ctx_rep2); \
        break; \
    default: \
        default_memset(dir, diridx, pos, len); \
        break; \
    }
#endif /* __DAV1D_SRC_CTX_H__ */
This diff is collapsed.
......@@ -38,23 +38,23 @@
#include "src/tables.h"
/*
 * Per-edge block context: one small array per tracked property (prediction
 * mode, coefficient contexts, skip flags, reference frames, filters,
 * transform sizes, ...).
 *
 * NOTE(review): this text is a rendered diff hunk (@@ -38,23 +38,23 @@)
 * without +/- markers, so every member appears twice. The first group looks
 * like the declarations removed by the commit and the second group, wrapped
 * in ALIGN(..., 8), like their replacements; the real struct presumably
 * contains only the aligned set. Confirm against the repository.
 */
typedef struct BlockContext {
/* Plain declarations (apparently the pre-change side of the hunk). */
uint8_t mode[32];
uint8_t lcoef[32];
uint8_t ccoef[2][32];
uint8_t seg_pred[32];
uint8_t skip[32];
uint8_t skip_mode[32];
uint8_t intra[32];
uint8_t comp_type[32];
int8_t ref[2][32]; // -1 means intra
uint8_t filter[2][32]; // 3 means unset
int8_t tx_intra[32];
int8_t tx[32];
uint8_t tx_lpf_y[32];
uint8_t tx_lpf_uv[32];
uint8_t partition[16];
uint8_t uvmode[32];
uint8_t pal_sz[32];
/*
 * Same members declared through ALIGN(..., 8) (apparently the post-change
 * side). The 8-byte alignment matches the 64-bit stores issued by the
 * set_ctx_rep* macros in src/ctx.h.
 */
uint8_t ALIGN(mode[32], 8);
uint8_t ALIGN(lcoef[32], 8);
uint8_t ALIGN(ccoef[2][32], 8);
uint8_t ALIGN(seg_pred[32], 8);
uint8_t ALIGN(skip[32], 8);
uint8_t ALIGN(skip_mode[32], 8);
uint8_t ALIGN(intra[32], 8);
uint8_t ALIGN(comp_type[32], 8);
int8_t ALIGN(ref[2][32], 8); // -1 means intra
uint8_t ALIGN(filter[2][32], 8); // 3 means unset
int8_t ALIGN(tx_intra[32], 8);
int8_t ALIGN(tx[32], 8);
uint8_t ALIGN(tx_lpf_y[32], 8);
uint8_t ALIGN(tx_lpf_uv[32], 8);
uint8_t ALIGN(partition[16], 8);
uint8_t ALIGN(uvmode[32], 8);
uint8_t ALIGN(pal_sz[32], 8);
} BlockContext;
static inline int get_intra_ctx(const BlockContext *const a,
......
......@@ -32,6 +32,7 @@
#include "common/intops.h"
#include "src/ctx.h"
#include "src/levels.h"
#include "src/lf_mask.h"
#include "src/tables.h"
......@@ -64,12 +65,18 @@ static void decomp_tx(uint8_t (*const txa)[2 /* txsz, step */][32 /* y */][32 /*
} else {
const int lw = imin(2, t_dim->lw), lh = imin(2, t_dim->lh);
for (int y = 0; y < t_dim->h; y++) {
memset(txa[0][0][y], lw, t_dim->w);
memset(txa[1][0][y], lh, t_dim->w);
txa[0][1][y][0] = t_dim->w;
#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
for (int y = 0; y < t_dim->h; y++) { \
rep_macro(type, txa[0][0][y], off, mul * lw); \
rep_macro(type, txa[1][0][y], off, mul * lh); \
txa[0][1][y][0] = t_dim->w; \
}
memset(txa[1][1][0], t_dim->h, t_dim->w);
case_set_upto16(t_dim->w,,, 0);
#undef set_ctx
#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
rep_macro(type, txa[1][1][0], off, mul * t_dim->h)
case_set_upto16(t_dim->w,,, 0);
#undef set_ctx
}
}
......@@ -190,8 +197,20 @@ static inline void mask_edges_intra(uint16_t (*const masks)[32][3][2],
if (inner2) masks[1][by4 + y][thl4c][1] |= inner2;
}
memset(a, thl4c, w4);
memset(l, twl4c, h4);
#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
rep_macro(type, a, off, mul * thl4c)
#define default_memset(dir, diridx, off, var) \
memset(a, thl4c, var)
case_set_upto32_with_default(w4,,, 0);
#undef default_memset
#undef set_ctx
#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
rep_macro(type, l, off, mul * twl4c)
#define default_memset(dir, diridx, off, var) \
memset(l, twl4c, var)
case_set_upto32_with_default(h4,,, 0);
#undef default_memset
#undef set_ctx
}
static inline void mask_edges_chroma(uint16_t (*const masks)[32][2][2],
......@@ -249,8 +268,20 @@ static inline void mask_edges_chroma(uint16_t (*const masks)[32][2][2],
}
}
memset(a, thl4c, cw4);
memset(l, twl4c, ch4);
#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
rep_macro(type, a, off, mul * thl4c)
#define default_memset(dir, diridx, off, var) \
memset(a, thl4c, var)
case_set_upto32_with_default(cw4,,, 0);
#undef default_memset
#undef set_ctx
#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
rep_macro(type, l, off, mul * twl4c)
#define default_memset(dir, diridx, off, var) \
memset(l, twl4c, var)
case_set_upto32_with_default(ch4,,, 0);
#undef default_memset
#undef set_ctx
}
void dav1d_create_lf_mask_intra(Av1Filter *const lflvl,
......
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment