Commit 58fc5165 authored by Henrik Gramner's avatar Henrik Gramner

Split MC blend

The mstride == 0, mstride == 1, and mstride == w cases are very different
from each other, and splitting them into separate functions makes it easier
to optimize them.

Also add some further optimizations to the AVX2 asm that became possible
after this change.
parent 8b8e9fe8
......@@ -81,11 +81,14 @@ void (name)(pixel *dst, ptrdiff_t dst_stride, \
typedef decl_w_mask_fn(*w_mask_fn);
#define decl_blend_fn(name) \
void (name)(pixel *dst, ptrdiff_t dst_stride, \
const pixel *tmp, int w, int h, \
const uint8_t *mask, ptrdiff_t mstride)
void (name)(pixel *dst, ptrdiff_t dst_stride, const pixel *tmp, \
int w, int h, const uint8_t *mask)
typedef decl_blend_fn(*blend_fn);
#define decl_blend_dir_fn(name) \
void (name)(pixel *dst, ptrdiff_t dst_stride, const pixel *tmp, int w, int h)
typedef decl_blend_dir_fn(*blend_dir_fn);
#define decl_emu_edge_fn(name) \
void (name)(intptr_t bw, intptr_t bh, intptr_t iw, intptr_t ih, intptr_t x, intptr_t y, \
pixel *dst, ptrdiff_t dst_stride, const pixel *src, ptrdiff_t src_stride)
......@@ -99,6 +102,8 @@ typedef struct Dav1dMCDSPContext {
mask_fn mask;
w_mask_fn w_mask[3 /* 444, 422, 420 */];
blend_fn blend;
blend_dir_fn blend_v;
blend_dir_fn blend_h;
warp8x8_fn warp8x8;
warp8x8t_fn warp8x8t;
emu_edge_fn emu_edge;
......
......@@ -373,19 +373,46 @@ static void mask_c(pixel *dst, const ptrdiff_t dst_stride,
} while (--h);
}
static void blend_c(pixel *dst, const ptrdiff_t dst_stride,
const pixel *tmp, const int w, const int h,
const uint8_t *mask, const ptrdiff_t m_stride)
#define blend_px(a, b, m) (((a * (64 - m) + b * m) + 32) >> 6)
static NOINLINE void
blend_internal_c(pixel *dst, const ptrdiff_t dst_stride, const pixel *tmp,
const int w, int h, const uint8_t *mask,
const ptrdiff_t mask_stride)
{
for (int y = 0; y < h; y++) {
do {
for (int x = 0; x < w; x++) {
#define blend_px(a, b, m) (((a * (64 - m) + b * m) + 32) >> 6)
dst[x] = blend_px(dst[x], tmp[x], mask[m_stride == 1 ? 0 : x]);
dst[x] = blend_px(dst[x], tmp[x], mask[x]);
}
dst += PXSTRIDE(dst_stride);
tmp += w;
mask += m_stride;
}
mask += mask_stride;
} while (--h);
}
/* General blend: one mask value per pixel, with the mask laid out
 * contiguously at a stride equal to the block width w. */
static void blend_c(pixel *dst, const ptrdiff_t dst_stride, const pixel *tmp,
                    const int w, const int h, const uint8_t *mask)
{
    blend_internal_c(dst, dst_stride, tmp, w, h, mask, w);
}
/* Vertical OBMC blend: a single row of per-column weights, taken from the
 * OBMC mask table indexed by block width, is reused for every line
 * (mask stride 0). */
static void blend_v_c(pixel *dst, const ptrdiff_t dst_stride, const pixel *tmp,
                      const int w, const int h)
{
    blend_internal_c(dst, dst_stride, tmp, w, h, &dav1d_obmc_masks[w], 0);
}
/* Horizontal OBMC blend: one weight per row, taken from the OBMC mask
 * table indexed by block height; every pixel in a row uses the same
 * weight. tmp is a packed w-wide intermediate buffer. */
static void blend_h_c(pixel *dst, const ptrdiff_t dst_stride, const pixel *tmp,
                      const int w, int h)
{
    const uint8_t *mask = &dav1d_obmc_masks[h];
    for (int y = h; y > 0; y--) {
        const int m = *mask++; // row-constant blend weight
        int x = 0;
        while (x < w) {
            dst[x] = blend_px(dst[x], tmp[x], m);
            x++;
        }
        tmp += w;
        dst += PXSTRIDE(dst_stride);
    }
}
static void w_mask_c(pixel *dst, const ptrdiff_t dst_stride,
......@@ -591,6 +618,8 @@ void bitfn(dav1d_mc_dsp_init)(Dav1dMCDSPContext *const c) {
c->w_avg = w_avg_c;
c->mask = mask_c;
c->blend = blend_c;
c->blend_v = blend_v_c;
c->blend_h = blend_h_c;
c->w_mask[0] = w_mask_444_c;
c->w_mask[1] = w_mask_422_c;
c->w_mask[2] = w_mask_420_c;
......
......@@ -579,9 +579,8 @@ static int obmc(Dav1dTileContext *const t,
&f->refp[a_r->ref[0] - 1],
dav1d_filter_2d[t->a->filter[1][bx4 + x + 1]][t->a->filter[0][bx4 + x + 1]]);
if (res) return res;
f->dsp->mc.blend(&dst[x * h_mul], dst_stride, lap,
h_mul * ow4, v_mul * oh4,
&dav1d_obmc_masks[v_mul * oh4], 1);
f->dsp->mc.blend_h(&dst[x * h_mul], dst_stride, lap,
h_mul * ow4, v_mul * oh4);
i++;
}
x += imax(a_b_dim[0], 2);
......@@ -603,9 +602,8 @@ static int obmc(Dav1dTileContext *const t,
&f->refp[l_r->ref[0] - 1],
dav1d_filter_2d[t->l.filter[1][by4 + y + 1]][t->l.filter[0][by4 + y + 1]]);
if (res) return res;
f->dsp->mc.blend(&dst[y * v_mul * PXSTRIDE(dst_stride)],
dst_stride, lap, h_mul * ow4, v_mul * oh4,
&dav1d_obmc_masks[h_mul * ow4], 0);
f->dsp->mc.blend_v(&dst[y * v_mul * PXSTRIDE(dst_stride)],
dst_stride, lap, h_mul * ow4, v_mul * oh4);
i++;
}
y += imax(l_b_dim[1], 2);
......@@ -1144,7 +1142,7 @@ int bytefn(dav1d_recon_b_inter)(Dav1dTileContext *const t, const enum BlockSize
dav1d_ii_masks[bs][0][b->interintra_mode] :
dav1d_wedge_masks[bs][0][0][b->wedge_idx];
dsp->mc.blend(dst, f->cur.p.stride[0], tmp,
bw4 * 4, bh4 * 4, ii_mask, bw4 * 4);
bw4 * 4, bh4 * 4, ii_mask);
}
if (!has_chroma) goto skip_inter_chroma_pred;
......@@ -1277,7 +1275,7 @@ int bytefn(dav1d_recon_b_inter)(Dav1dTileContext *const t, const enum BlockSize
dsp->ipred.intra_pred[m](tmp, cbw4 * 4 * sizeof(pixel),
tl_edge, cbw4 * 4, cbh4 * 4, 0);
dsp->mc.blend(uvdst, f->cur.p.stride[1], tmp,
cbw4 * 4, cbh4 * 4, ii_mask, cbw4 * 4);
cbw4 * 4, cbh4 * 4, ii_mask);
}
}
}
......
This diff is collapsed.
......@@ -55,6 +55,8 @@ decl_w_avg_fn(dav1d_w_avg_avx2);
decl_mask_fn(dav1d_mask_avx2);
decl_w_mask_fn(dav1d_w_mask_420_avx2);
decl_blend_fn(dav1d_blend_avx2);
decl_blend_dir_fn(dav1d_blend_v_avx2);
decl_blend_dir_fn(dav1d_blend_h_avx2);
decl_warp8x8_fn(dav1d_warp_affine_8x8_avx2);
decl_warp8x8t_fn(dav1d_warp_affine_8x8t_avx2);
......@@ -98,6 +100,8 @@ void bitfn(dav1d_mc_dsp_init_x86)(Dav1dMCDSPContext *const c) {
c->mask = dav1d_mask_avx2;
c->w_mask[2] = dav1d_w_mask_420_avx2;
c->blend = dav1d_blend_avx2;
c->blend_v = dav1d_blend_v_avx2;
c->blend_h = dav1d_blend_h_avx2;
c->warp8x8 = dav1d_warp_affine_8x8_avx2;
c->warp8x8t = dav1d_warp_affine_8x8t_avx2;
......
......@@ -237,40 +237,95 @@ static void check_w_mask(Dav1dMCDSPContext *const c) {
}
static void check_blend(Dav1dMCDSPContext *const c) {
ALIGN_STK_32(pixel, tmp, 128 * 32,);
ALIGN_STK_32(pixel, c_dst, 128 * 32,);
ALIGN_STK_32(pixel, a_dst, 128 * 32,);
ALIGN_STK_32(uint8_t, mask, 128 * 32,);
ALIGN_STK_32(pixel, tmp, 32 * 32,);
ALIGN_STK_32(pixel, c_dst, 32 * 32,);
ALIGN_STK_32(pixel, a_dst, 32 * 32,);
ALIGN_STK_32(uint8_t, mask, 32 * 32,);
for (int i = 0; i < 128 * 32; i++) {
for (int i = 0; i < 32 * 32; i++) {
tmp[i] = rand() & ((1 << BITDEPTH) - 1);
mask[i] = rand() % 65;
}
declare_func(void, pixel *dst, ptrdiff_t dst_stride, const pixel *tmp,
int w, int h, const uint8_t *mask, ptrdiff_t mstride);
int w, int h, const uint8_t *mask);
for (int w = 2; w <= 128; w <<= 1) {
for (int w = 4; w <= 32; w <<= 1) {
const ptrdiff_t dst_stride = w * sizeof(pixel);
const int h_min = (w == 128) ? 4 : 2;
const int h_max = (w > 32) ? 32 : (w == 2) ? 64 : 128;
for (int ms = 0; ms <= w; ms += ms ? w - 1 : 1)
if (check_func(c->blend, "blend_w%d_ms%d_%dbpc", w, ms, BITDEPTH))
for (int h = h_min; h <= h_max; h <<= 1) {
for (int i = 0; i < w * h; i++)
c_dst[i] = a_dst[i] = rand() & ((1 << BITDEPTH) - 1);
call_ref(c_dst, dst_stride, tmp, w, h, mask, ms);
call_new(a_dst, dst_stride, tmp, w, h, mask, ms);
if (memcmp(c_dst, a_dst, w * h * sizeof(*c_dst)))
fail();
if (check_func(c->blend, "blend_w%d_%dbpc", w, BITDEPTH))
for (int h = imax(w / 2, 4); h <= imin(w * 2, 32); h <<= 1) {
for (int i = 0; i < w * h; i++)
c_dst[i] = a_dst[i] = rand() & ((1 << BITDEPTH) - 1);
bench_new(a_dst, dst_stride, tmp, w, h, mask, ms);
}
call_ref(c_dst, dst_stride, tmp, w, h, mask);
call_new(a_dst, dst_stride, tmp, w, h, mask);
if (memcmp(c_dst, a_dst, w * h * sizeof(*c_dst)))
fail();
bench_new(a_dst, dst_stride, tmp, w, h, mask);
}
}
report("blend");
}
/* checkasm: verify blend_v against the C reference for all supported
 * width/height combinations, then benchmark the asm version. */
static void check_blend_v(Dav1dMCDSPContext *const c) {
    ALIGN_STK_32(pixel, tmp,   32 * 128,);
    ALIGN_STK_32(pixel, c_dst, 32 * 128,);
    ALIGN_STK_32(pixel, a_dst, 32 * 128,);

    /* Fill the intermediate buffer with random pixel values once. */
    for (int i = 0; i < 32 * 128; i++)
        tmp[i] = rand() & ((1 << BITDEPTH) - 1);

    declare_func(void, pixel *dst, ptrdiff_t dst_stride, const pixel *tmp,
                 int w, int h);

    for (int w = 2; w <= 32; w <<= 1) {
        const ptrdiff_t dst_stride = w * sizeof(pixel);
        if (check_func(c->blend_v, "blend_v_w%d_%dbpc", w, BITDEPTH)) {
            const int h_max = w == 2 ? 64 : 128;
            for (int h = 2; h <= h_max; h <<= 1) {
                /* Identical random destination for ref and new. */
                for (int i = 0; i < w * h; i++)
                    a_dst[i] = c_dst[i] = rand() & ((1 << BITDEPTH) - 1);

                call_ref(c_dst, dst_stride, tmp, w, h);
                call_new(a_dst, dst_stride, tmp, w, h);
                if (memcmp(c_dst, a_dst, w * h * sizeof(*c_dst)))
                    fail();

                bench_new(a_dst, dst_stride, tmp, w, h);
            }
        }
    }
    report("blend_v");
}
/* checkasm: verify blend_h against the C reference for all supported
 * width/height combinations, then benchmark the asm version. */
static void check_blend_h(Dav1dMCDSPContext *const c) {
    ALIGN_STK_32(pixel, tmp,   128 * 32,);
    ALIGN_STK_32(pixel, c_dst, 128 * 32,);
    ALIGN_STK_32(pixel, a_dst, 128 * 32,);

    /* Fill the intermediate buffer with random pixel values once. */
    for (int i = 0; i < 128 * 32; i++)
        tmp[i] = rand() & ((1 << BITDEPTH) - 1);

    declare_func(void, pixel *dst, ptrdiff_t dst_stride, const pixel *tmp,
                 int w, int h);

    for (int w = 2; w <= 128; w <<= 1) {
        const ptrdiff_t dst_stride = w * sizeof(pixel);
        if (check_func(c->blend_h, "blend_h_w%d_%dbpc", w, BITDEPTH)) {
            const int h_min = w == 128 ? 4 : 2;
            for (int h = h_min; h <= 32; h <<= 1) {
                /* Identical random destination for ref and new. */
                for (int i = 0; i < w * h; i++)
                    a_dst[i] = c_dst[i] = rand() & ((1 << BITDEPTH) - 1);

                call_ref(c_dst, dst_stride, tmp, w, h);
                call_new(a_dst, dst_stride, tmp, w, h);
                if (memcmp(c_dst, a_dst, w * h * sizeof(*c_dst)))
                    fail();

                bench_new(a_dst, dst_stride, tmp, w, h);
            }
        }
    }
    report("blend_h");
}
static void check_warp8x8(Dav1dMCDSPContext *const c) {
ALIGN_STK_32(pixel, src_buf, 15 * 15,);
ALIGN_STK_32(pixel, c_dst, 8 * 8,);
......@@ -430,6 +485,8 @@ void bitfn(checkasm_check_mc)(void) {
check_mask(&c);
check_w_mask(&c);
check_blend(&c);
check_blend_v(&c);
check_blend_h(&c);
check_warp8x8(&c);
check_warp8x8t(&c);
check_emuedge(&c);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment