Commit bf7a4786 authored by Ronald S. Bultje

Rewrite horizontal loopfilter

Loop inside SIMD (instead of in the caller) so that we can handle
multiple 4px blocks per iteration, allowing for more efficient
SIMD. To make this easier, also transpose the masks for the hor
filter.
parent 04b70ea5
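Before the per-file hunks, a minimal self-contained sketch of the calling convention this commit moves to: the DSP function receives a per-superblock bitmask plus the filter-level array and walks the 4px edges itself, instead of the caller looping and invoking a 4px kernel per edge. The names (filter_4px_edge, loop_filter_column_sketch) and the plain uint8_t pixel type are illustrative only, not dav1d's API; the real reference code is in the loopfilter_tmpl.c hunks below.

```c
#include <stdint.h>
#include <stddef.h>

/* Stand-in for the 4px edge kernel; the real one applies a 4-, 8- or
 * 16-wide filter gated by E/I/H thresholds derived from the level. */
static void filter_4px_edge(uint8_t *dst, ptrdiff_t stride, int wd) {
    (void)dst; (void)stride; (void)wd; /* omitted */
}

/* One call filters a whole column of horizontal (left|right) edges.
 * vmask[0..2] carry one bit per 4px row; lvl[0][0] is this block's
 * filter level, lvl[-1][0] the left neighbour's (used when this one
 * is zero). */
static void loop_filter_column_sketch(uint8_t *dst, ptrdiff_t stride,
                                      const uint32_t vmask[3],
                                      const uint8_t (*lvl)[4],
                                      ptrdiff_t lvl_stride, int h)
{
    const unsigned vm = (vmask[0] | vmask[1] | vmask[2]) &
                        (unsigned) ((1ULL << h) - 1);
    for (unsigned y = 1; vm & ~(y - 1);
         y <<= 1, dst += 4 * stride, lvl += lvl_stride)
    {
        if (!(vm & y)) continue;
        const int L = lvl[0][0] ? lvl[0][0] : lvl[-1][0];
        if (!L) continue;
        const int idx = (vmask[2] & y) ? 2 : !!(vmask[1] & y);
        filter_4px_edge(dst, stride, 4 << idx); /* 4, 8 or 16 px wide */
    }
}
```

The callers in lf_apply_tmpl.c then only decide which columns (or rows) to visit and hand each one its mask, as in filter_plane_cols_y below.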
@@ -34,44 +34,28 @@
#include "src/lf_apply.h"
static inline int maxifzero(const uint8_t (*const a)[4],
const uint8_t (*const b)[4], const int diridx)
{
const int a_val = (*a)[diridx];
if (a_val) return a_val;
return (*b)[diridx];
}
static inline void filter_plane_cols_y(const Dav1dFrameContext *const f,
const int have_left,
const uint8_t (*lvl)[4],
const ptrdiff_t b4_stride,
const uint32_t (*const mask)[3],
pixel *dst, const ptrdiff_t ls,
const int w,
const int starty4, const int endy4)
{
const Dav1dDSPContext *const dsp = f->dsp;
// filter edges between columns (e.g. block1 | block2)
for (int y = starty4; y < endy4;
y++, dst += 4 * PXSTRIDE(ls), lvl += b4_stride)
{
pixel *ptr = dst;
const uint8_t (*l)[4] = lvl;
const uint32_t *const hmask = mask[y];
const unsigned hm = hmask[0] | hmask[1] | hmask[2];
for (unsigned x = 1; hm & ~(x - 1); l++, x <<= 1, ptr += 4) {
if ((have_left || x > 1) && (hm & x)) {
const int L = maxifzero(l, &l[-1], 0);
if (!L) continue;
const int H = L >> 4;
const int E = f->lf.lim_lut.e[L], I = f->lf.lim_lut.i[L];
const int idx = (hmask[2] & x) ? 2 : !!(hmask[1] & x);
dsp->lf.loop_filter[idx][0](ptr, ls, E, I, H);
}
}
for (int x = 0; x < w; x++) {
if (!have_left && !x) continue;
dsp->lf.loop_filter_sb[0][0](&dst[x * 4], ls,
starty4 ? (const uint32_t[3]) {
mask[x][0] >> starty4,
mask[x][1] >> starty4,
mask[x][2] >> starty4,
} : mask[x],
(const uint8_t(*)[4]) &lvl[x][0], b4_stride,
&f->lf.lim_lut, endy4 - starty4);
}
}
@@ -93,9 +77,9 @@ static inline void filter_plane_rows_y(const Dav1dFrameContext *const f,
y++, dst += 4 * PXSTRIDE(ls), lvl += b4_stride)
{
if (!have_top && !y) continue;
dsp->lf.loop_filter_sb128y(dst, ls, mask[y],
(const uint8_t(*)[4]) &lvl[0][1], b4_stride,
&f->lf.lim_lut, w);
dsp->lf.loop_filter_sb[0][1](dst, ls, mask[y],
(const uint8_t(*)[4]) &lvl[0][1], b4_stride,
&f->lf.lim_lut, w);
}
}
@@ -105,45 +89,28 @@ static inline void filter_plane_cols_uv(const Dav1dFrameContext *const f,
const ptrdiff_t b4_stride,
const uint32_t (*const mask)[2],
pixel *const u, pixel *const v,
const ptrdiff_t ls,
const ptrdiff_t ls, const int w,
const int starty4, const int endy4)
{
const Dav1dDSPContext *const dsp = f->dsp;
int y;
ptrdiff_t off_l;
const int ss_ver = f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420;
const int ss_hor = f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444;
// filter edges between columns (e.g. block1 | block2)
for (off_l = 0, y = starty4; y < endy4;
y++, off_l += 4 * PXSTRIDE(ls), lvl += b4_stride)
{
ptrdiff_t off = off_l;
const uint8_t (*l)[4] = lvl;
const uint32_t *const hmask = mask[y];
const unsigned hm = hmask[0] | hmask[1];
for (unsigned x = 1; hm & ~(x - 1); l++, x <<= 1, off += 4) {
if ((have_left || x > 1) && (hm & x)) {
const int idx = !!(hmask[1] & x);
const int Lu = maxifzero(l, &l[-1], 2);
if (Lu) {
const int H = Lu >> 4;
const int E = f->lf.lim_lut.e[Lu], I = f->lf.lim_lut.i[Lu];
dsp->lf.loop_filter_uv[idx][0](&u[off], ls, E, I, H);
}
const int Lv = maxifzero(l, &l[-1], 3);
if (Lv) {
const int H = Lv >> 4;
const int E = f->lf.lim_lut.e[Lv], I = f->lf.lim_lut.i[Lv];
dsp->lf.loop_filter_uv[idx][0](&v[off], ls, E, I, H);
}
}
}
for (int x = 0; x < w; x++) {
if (!have_left && !x) continue;
dsp->lf.loop_filter_sb[1][0](&u[x * 4], ls,
starty4 ? (const uint32_t[2]) {
mask[x][0] >> starty4,
mask[x][1] >> starty4,
} : mask[x],
(const uint8_t(*)[4]) &lvl[x][2], b4_stride,
&f->lf.lim_lut, endy4 - starty4);
dsp->lf.loop_filter_sb[1][0](&v[x * 4], ls,
starty4 ? (const uint32_t[2]) {
mask[x][0] >> starty4,
mask[x][1] >> starty4,
} : mask[x],
(const uint8_t(*)[4]) &lvl[x][3], b4_stride,
&f->lf.lim_lut, endy4 - starty4);
}
}
@@ -167,12 +134,12 @@ static inline void filter_plane_rows_uv(const Dav1dFrameContext *const f,
y++, off_l += 4 * PXSTRIDE(ls), lvl += b4_stride)
{
if (!have_top && !y) continue;
dsp->lf.loop_filter_sb128uv(&u[off_l], ls, mask[y],
(const uint8_t(*)[4]) &lvl[0][2], b4_stride,
&f->lf.lim_lut, w);
dsp->lf.loop_filter_sb128uv(&v[off_l], ls, mask[y],
(const uint8_t(*)[4]) &lvl[0][3], b4_stride,
&f->lf.lim_lut, w);
dsp->lf.loop_filter_sb[1][1](&u[off_l], ls, mask[y],
(const uint8_t(*)[4]) &lvl[0][2], b4_stride,
&f->lf.lim_lut, w);
dsp->lf.loop_filter_sb[1][1](&v[off_l], ls, mask[y],
(const uint8_t(*)[4]) &lvl[0][3], b4_stride,
&f->lf.lim_lut, w);
}
}
@@ -200,22 +167,23 @@ void bytefn(dav1d_loopfilter_sbrow)(const Dav1dFrameContext *const f,
for (int tile_col = 1;; tile_col++) {
x = f->frame_hdr.tiling.col_start_sb[tile_col];
if ((x << sbl2) >= f->bw) break;
const int mask = x & is_sb64 ? 1 << 16 : 1;
const int uv_mask = x & is_sb64 ? 1 << (16 >> ss_hor) : 1;
const int bx4 = x & is_sb64 ? 16 : 0, cbx4 = bx4 >> ss_hor;
x >>= is_sb64;
for (int y = starty4; y < endy4; y++) {
const int idx = 2 * !!(lflvl[x].filter_y[0][y][2] & mask) +
!!(lflvl[x].filter_y[0][y][1] & mask);
lflvl[x].filter_y[0][y][2] &= ~mask;
lflvl[x].filter_y[0][y][1] &= ~mask;
lflvl[x].filter_y[0][y][0] &= ~mask;
lflvl[x].filter_y[0][y][imin(idx, lpf_y[y - starty4])] |= mask;
for (unsigned y = starty4, mask = 1 << y; y < endy4; y++, mask <<= 1) {
const int idx = 2 * !!(lflvl[x].filter_y[0][bx4][2] & mask) +
!!(lflvl[x].filter_y[0][bx4][1] & mask);
lflvl[x].filter_y[0][bx4][2] &= ~mask;
lflvl[x].filter_y[0][bx4][1] &= ~mask;
lflvl[x].filter_y[0][bx4][0] &= ~mask;
lflvl[x].filter_y[0][bx4][imin(idx, lpf_y[y - starty4])] |= mask;
}
for (int y = starty4 >> ss_ver; y < uv_endy4; y++) {
const int idx = !!(lflvl[x].filter_uv[0][y][1] & uv_mask);
lflvl[x].filter_uv[0][y][1] &= ~uv_mask;
lflvl[x].filter_uv[0][y][0] &= ~uv_mask;
lflvl[x].filter_uv[0][y][imin(idx, lpf_uv[y - (starty4 >> ss_ver)])] |= uv_mask;
for (unsigned y = starty4 >> ss_ver, uv_mask = 1 << y; y < uv_endy4;
y++, uv_mask <<= 1)
{
const int idx = !!(lflvl[x].filter_uv[0][cbx4][1] & uv_mask);
lflvl[x].filter_uv[0][cbx4][1] &= ~uv_mask;
lflvl[x].filter_uv[0][cbx4][0] &= ~uv_mask;
lflvl[x].filter_uv[0][cbx4][imin(idx, lpf_uv[y - (starty4 >> ss_ver)])] |= uv_mask;
}
lpf_y += halign;
lpf_uv += halign >> ss_ver;
@@ -257,8 +225,8 @@ void bytefn(dav1d_loopfilter_sbrow)(const Dav1dFrameContext *const f,
x++, have_left = 1, ptr += 128, level_ptr += 32)
{
filter_plane_cols_y(f, have_left, level_ptr, f->b4_stride,
lflvl[x].filter_y[0],
ptr, f->cur.p.stride[0], starty4, endy4);
lflvl[x].filter_y[0], ptr, f->cur.p.stride[0],
imin(32, f->bw - x * 32), starty4, endy4);
}
level_ptr = f->lf.level + f->b4_stride * sby * sbsz;
@@ -279,6 +247,7 @@ void bytefn(dav1d_loopfilter_sbrow)(const Dav1dFrameContext *const f,
filter_plane_cols_uv(f, have_left, level_ptr, f->b4_stride,
lflvl[x].filter_uv[0],
&p[1][uv_off], &p[2][uv_off], f->cur.p.stride[1],
(imin(32, f->bw - x * 32) + ss_hor) >> ss_hor,
starty4 >> ss_ver, uv_endy4);
}
@@ -98,30 +98,24 @@ static inline void mask_edges_inter(uint32_t (*masks)[32][3],
max_tx, 0, y_off, x_off, tx_masks);
// left block edge
unsigned mask = 1U << bx4;
for (y = 0; y < h4; y++)
masks[0][by4 + y][imin(txa[0][0][y][0], l[y])] |= mask;
unsigned mask = 1U << by4;
for (y = 0; y < h4; y++, mask <<= 1)
masks[0][bx4][imin(txa[0][0][y][0], l[y])] |= mask;
// top block edge
for (x = 0; x < w4; x++, mask <<= 1)
for (mask = 1U << bx4, x = 0; x < w4; x++, mask <<= 1)
masks[1][by4][imin(txa[1][0][0][x], a[x])] |= mask;
if (!skip) {
// inner (tx) left|right edges
for (y = 0; y < h4; y++) {
for (y = 0, mask = 1U << by4; y < h4; y++, mask <<= 1) {
int ltx = txa[0][0][y][0];
int step = txa[0][1][y][0];
if (step < w4) {
x = step;
mask = 1U << (bx4 + step);
do {
const int rtx = txa[0][0][y][x];
masks[0][by4 + y][imin(rtx, ltx)] |= mask;
ltx = rtx;
step = txa[0][1][y][x];
x += step;
mask <<= step;
} while (x < w4);
for (x = step; x < w4; x += step) {
const int rtx = txa[0][0][y][x];
masks[0][bx4 + x][imin(rtx, ltx)] |= mask;
ltx = rtx;
step = txa[0][1][y][x];
}
}
@@ -157,29 +151,27 @@ static inline void mask_edges_intra(uint32_t (*const masks)[32][3],
int y, x;
// left block edge
unsigned mask = 1U << bx4;
for (y = 0; y < h4; y++)
masks[0][by4 + y][imin(twl4c, l[y])] |= mask;
unsigned mask = 1U << by4;
for (y = 0; y < h4; y++, mask <<= 1)
masks[0][bx4][imin(twl4c, l[y])] |= mask;
// top block edge
for (mask = 1U << bx4, x = 0; x < w4; x++, mask <<= 1)
masks[1][by4][imin(thl4c, a[x])] |= mask;
static const uint32_t hstep[] = {
0xffffffff, 0x55555555, 0x11111111, 0x01010101, 0x00010001
};
// inner (tx) left|right edges
const unsigned t = 1U << bx4;
const unsigned inner = (((uint64_t) t) << w4) - t;
mask = (inner - t) & hstep[twl4];
for (y = 0; y < h4; y++)
masks[0][by4 + y][twl4c] |= mask;
const int hstep = t_dim->w;
unsigned t = 1U << by4;
unsigned inner = (((uint64_t) t) << h4) - t;
for (x = hstep; x < w4; x += hstep)
masks[0][bx4 + x][twl4c] |= inner;
// top
// inner (tx) --- edges
// bottom
const int vstep = t_dim->h;
t = 1U << bx4;
inner = (((uint64_t) t) << w4) - t;
for (y = vstep; y < h4; y += vstep)
masks[1][by4 + y][thl4c] |= inner;
@@ -200,30 +192,28 @@ static inline void mask_edges_chroma(uint32_t (*const masks)[32][2],
int y, x;
// left block edge
unsigned mask = 1U << cbx4;
for (y = 0; y < ch4; y++)
masks[0][cby4 + y][imin(twl4c, l[y])] |= mask;
unsigned mask = 1U << cby4;
for (y = 0; y < ch4; y++, mask <<= 1)
masks[0][cbx4][imin(twl4c, l[y])] |= mask;
// top block edge
for (mask = 1U << cbx4, x = 0; x < cw4; x++, mask <<= 1)
masks[1][cby4][imin(thl4c, a[x])] |= mask;
if (!skip_inter) {
static const uint32_t hstep[] = {
0xffffffff, 0x55555555, 0x11111111, 0x01010101
};
// inner (tx) left|right edges
const int t = 1U << cbx4;
const unsigned inner = (((uint64_t) t) << cw4) - t;
mask = (inner - t) & hstep[twl4];
for (y = 0; y < ch4; y++)
masks[0][cby4 + y][twl4c] |= mask;
const int hstep = t_dim->w;
int t = 1U << cby4;
unsigned inner = (((uint64_t) t) << ch4) - t;
for (x = hstep; x < cw4; x += hstep)
masks[0][cbx4 + x][twl4c] |= inner;
// top
// inner (tx) --- edges
// bottom
const int vstep = t_dim->h;
t = 1U << cbx4;
inner = (((uint64_t) t) << cw4) - t;
for (y = vstep; y < ch4; y += vstep)
masks[1][cby4 + y][thl4c] |= inner;
}
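The mask_edges_* hunks above are the "transpose the masks for the hor filter" part of the commit message: for column (vertical) edges, the word index and the bit index swap roles. A minimal sketch of the two layouts (simplified; not the real mask_edges_* signatures, and the filter-level dimension is omitted):

```c
#include <stdint.h>

/* Old layout: one 32-bit word per 4px row; bit x flags a vertical edge
 * at 4px column x of that row. */
static void set_col_edge_old(uint32_t masks[32], int x4, int y4) {
    masks[y4] |= 1u << x4;
}

/* New layout: one 32-bit word per 4px column; bit y flags a vertical
 * edge at 4px row y of that column, so the horizontal loopfilter can
 * walk a whole column top-to-bottom from a single mask word. */
static void set_col_edge_new(uint32_t masks[32], int x4, int y4) {
    masks[x4] |= 1u << y4;
}
```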
@@ -159,27 +159,26 @@ loop_filter(pixel *dst, int E, int I, int H,
}
}
#define lf_4_fn(dir, wd, stridea, strideb) \
static void loop_filter_##dir##_##wd##wd_4px_c(pixel *const dst, \
const ptrdiff_t stride, \
const int E, const int I, \
const int H) \
{ \
loop_filter(dst, E, I, H, stridea, strideb, wd); \
static void loop_filter_h_sb128y_c(pixel *dst, const ptrdiff_t stride,
const uint32_t *const vmask,
const uint8_t (*l)[4], ptrdiff_t b4_stride,
const Av1FilterLUT *lut, const int h)
{
const unsigned vm = (vmask[0] | vmask[1] | vmask[2]) & ((1ULL << h) - 1);
for (unsigned y = 1; vm & ~(y - 1);
y <<= 1, dst += 4 * PXSTRIDE(stride), l += b4_stride)
{
if (vm & y) {
const int L = l[0][0] ? l[0][0] : l[-1][0];
if (!L) continue;
const int H = L >> 4;
const int E = lut->e[L], I = lut->i[L];
const int idx = (vmask[2] & y) ? 2 : !!(vmask[1] & y);
loop_filter(dst, E, I, H, PXSTRIDE(stride), 1, 4 << idx);
}
}
}
#define lf_4_fns(wd) \
lf_4_fn(h, wd, PXSTRIDE(stride), 1) \
lf_4_fn(v, wd, 1, PXSTRIDE(stride))
lf_4_fns(4)
lf_4_fns(6)
lf_4_fns(8)
lf_4_fns(16)
#undef lf_4_fn
#undef lf_4_fns
static void loop_filter_v_sb128y_c(pixel *dst, const ptrdiff_t stride,
const uint32_t *const vmask,
const uint8_t (*l)[4], ptrdiff_t b4_stride,
@@ -198,6 +197,26 @@ static void loop_filter_v_sb128y_c(pixel *dst, const ptrdiff_t stride,
}
}
static void loop_filter_h_sb128uv_c(pixel *dst, const ptrdiff_t stride,
const uint32_t *const vmask,
const uint8_t (*l)[4], ptrdiff_t b4_stride,
const Av1FilterLUT *lut, const int h)
{
const unsigned vm = (vmask[0] | vmask[1]) & ((1ULL << h) - 1);
for (unsigned y = 1; vm & ~(y - 1);
y <<= 1, dst += 4 * PXSTRIDE(stride), l += b4_stride)
{
if (vm & y) {
const int L = l[0][0] ? l[0][0] : l[-1][0];
if (!L) continue;
const int H = L >> 4;
const int E = lut->e[L], I = lut->i[L];
const int idx = !!(vmask[1] & y);
loop_filter(dst, E, I, H, PXSTRIDE(stride), 1, 4 + 2 * idx);
}
}
}
static void loop_filter_v_sb128uv_c(pixel *dst, const ptrdiff_t stride,
const uint32_t *const vmask,
const uint8_t (*l)[4], ptrdiff_t b4_stride,
@@ -217,20 +236,10 @@ static void loop_filter_v_sb128uv_c(pixel *dst, const ptrdiff_t stride,
}
void bitfn(dav1d_loop_filter_dsp_init)(Dav1dLoopFilterDSPContext *const c) {
c->loop_filter[0][0] = loop_filter_h_4wd_4px_c;
c->loop_filter[0][1] = loop_filter_v_4wd_4px_c;
c->loop_filter[1][0] = loop_filter_h_8wd_4px_c;
c->loop_filter[1][1] = loop_filter_v_8wd_4px_c;
c->loop_filter[2][0] = loop_filter_h_16wd_4px_c;
c->loop_filter[2][1] = loop_filter_v_16wd_4px_c;
c->loop_filter_uv[0][0] = loop_filter_h_4wd_4px_c;
c->loop_filter_uv[0][1] = loop_filter_v_4wd_4px_c;
c->loop_filter_uv[1][0] = loop_filter_h_6wd_4px_c;
c->loop_filter_uv[1][1] = loop_filter_v_6wd_4px_c;
c->loop_filter_sb128y = loop_filter_v_sb128y_c;
c->loop_filter_sb128uv = loop_filter_v_sb128uv_c;
c->loop_filter_sb[0][0] = loop_filter_h_sb128y_c;
c->loop_filter_sb[0][1] = loop_filter_v_sb128y_c;
c->loop_filter_sb[1][0] = loop_filter_h_sb128uv_c;
c->loop_filter_sb[1][1] = loop_filter_v_sb128uv_c;
#if HAVE_ASM && ARCH_X86
bitfn(dav1d_loop_filter_dsp_init_x86)(c);
@@ -36,10 +36,6 @@
#include "src/levels.h"
#include "src/lf_mask.h"
#define decl_loopfilter_fn(name) \
void (name)(pixel *dst, ptrdiff_t stride, int mb_lim, int lim, int hev_thr)
typedef decl_loopfilter_fn(*loopfilter_fn);
#define decl_loopfilter_sb_fn(name) \
void (name)(pixel *dst, ptrdiff_t stride, const uint32_t *mask, \
const uint8_t (*lvl)[4], ptrdiff_t lvl_stride, \
@@ -48,15 +44,12 @@ typedef decl_loopfilter_sb_fn(*loopfilter_sb_fn);
typedef struct Dav1dLoopFilterDSPContext {
/*
* dimension 1: filter taps (0=4, 1=8, 2=16 for luma; 0=4, 1=6 for chroma)
* dimension 1: plane (0=luma, 1=chroma)
* dimension 2: 0=col-edge filter (h), 1=row-edge filter (v)
*
* dst/stride are aligned by 4
* dst/stride are aligned by 32
*/
loopfilter_fn loop_filter[3][2];
loopfilter_fn loop_filter_uv[2][2];
loopfilter_sb_fn loop_filter_sb128y;
loopfilter_sb_fn loop_filter_sb128uv;
loopfilter_sb_fn loop_filter_sb[2][2];
} Dav1dLoopFilterDSPContext;
void dav1d_loop_filter_dsp_init_8bpc(Dav1dLoopFilterDSPContext *c);
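A hedged sketch of the consolidated table shape the loopfilter.h hunk above introduces, with dav1d's types stubbed out so it stands alone; the trailing lut/count parameters are assumed to match the C implementations shown in loopfilter_tmpl.c above.

```c
#include <stdint.h>
#include <stddef.h>

typedef uint8_t pixel;                    /* stub for dav1d's pixel type */
typedef struct Av1FilterLUT Av1FilterLUT; /* opaque here */

typedef void (*loopfilter_sb_fn)(pixel *dst, ptrdiff_t stride,
                                 const uint32_t *mask,
                                 const uint8_t (*lvl)[4],
                                 ptrdiff_t lvl_stride,
                                 const Av1FilterLUT *lut, int w);

typedef struct {
    /* [0] = luma, [1] = chroma; [.][0] = column edges (h),
     * [.][1] = row edges (v). This replaces the old per-4px
     * loop_filter / loop_filter_uv tables and the separate
     * loop_filter_sb128y / loop_filter_sb128uv pointers. */
    loopfilter_sb_fn loop_filter_sb[2][2];
} LoopFilterTableSketch;
```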
@@ -37,7 +37,7 @@ void bitfn(dav1d_loop_filter_dsp_init_x86)(Dav1dLoopFilterDSPContext *const c) {
if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
#if BITDEPTH == 8 && ARCH_X86_64
c->loop_filter_sb128y = dav1d_lpf_v_sb128y_avx2;
c->loop_filter_sb128uv = dav1d_lpf_v_sb128uv_avx2;
c->loop_filter_sb[0][1] = dav1d_lpf_v_sb128y_avx2;
c->loop_filter_sb[1][1] = dav1d_lpf_v_sb128uv_avx2;
#endif
}
......@@ -161,6 +161,6 @@ void bitfn(checkasm_check_loopfilter)(void) {
bitfn(dav1d_loop_filter_dsp_init)(&c);
check_lpf_sb(c.loop_filter_sb128y, "lpf_v_sb128y", 3, 32, 1);
check_lpf_sb(c.loop_filter_sb128uv, "lpf_v_sb128uv", 2, 16, 2);
check_lpf_sb(c.loop_filter_sb[0][1], "lpf_v_sb128y", 3, 32, 1);
check_lpf_sb(c.loop_filter_sb[1][1], "lpf_v_sb128uv", 2, 16, 2);
}