Commit 4a499fd5 authored by Ronald S. Bultje

Add AVX2 implementation for SGR looprestoration

Total decoding time for the first 1000 frames of TwxVOYxoukU:
after: 0m3.761s
before: 0m6.868s

Cycle times:
selfguided_3x3_8bpc_c: 438865.8
selfguided_3x3_8bpc_avx2: 112522.6
selfguided_5x5_8bpc_c: 326938.3
selfguided_5x5_8bpc_avx2: 75850.1
selfguided_mix_8bpc_c: 755980.5
selfguided_mix_8bpc_avx2: 195930.3
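
(Per call, that works out to roughly a 3.9x speedup for selfguided_3x3, 4.3x for selfguided_5x5 and 3.9x for selfguided_mix over the C reference, and an overall decode about 1.8x faster on this clip: 6.868s / 3.761s.)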
parent bfdfd1aa
@@ -502,7 +502,7 @@ const int16_t dav1d_sgr_params[16][4] = { // r0, r1, e0, e1
     { 2, 0, 22, -1 },
 };
-const int16_t dav1d_sgr_x_by_xplus1[256] = {
+const int dav1d_sgr_x_by_xplus1[256] = {
     1, 128, 171, 192, 205, 213, 219, 224, 228, 230, 233, 235, 236, 238, 239,
     240, 241, 242, 243, 243, 244, 244, 245, 245, 246, 246, 247, 247, 247, 247,
     248, 248, 248, 248, 249, 249, 249, 249, 249, 250, 250, 250, 250, 250, 250,
......
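
(The values in this table all fit in 16 bits; widening the element type to int is presumably so the new asm can index it with plain 32-bit loads, without an extra widening step. That rationale is an inference, not stated in the commit.)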
@@ -107,7 +107,7 @@ static const unsigned interintra_allowed_mask =
 extern const WarpedMotionParams dav1d_default_wm_params;
 extern const int16_t dav1d_sgr_params[16][4];
-extern const int16_t dav1d_sgr_x_by_xplus1[256];
+extern const int dav1d_sgr_x_by_xplus1[256];
 extern const int8_t dav1d_mc_subpel_filters[5][15][8];
 extern const int8_t dav1d_mc_warp_filter[][8];
......
This diff is collapsed.
@@ -30,6 +30,7 @@
 #include "common/attributes.h"
 #include "common/intops.h"
+#include "src/tables.h"

 #if BITDEPTH == 8 && ARCH_X86_64
 void dav1d_wiener_filter_h_avx2(int16_t *dst, const pixel (*left)[4],
@@ -73,6 +74,128 @@ static void wiener_filter_avx2(pixel *const dst, const ptrdiff_t dst_stride,
    dav1d_wiener_filter_v_avx2(dst, dst_stride, &mid[2*384], w, h, fv, edges);
}
void dav1d_sgr_box3_h_avx2(int32_t *sumsq, int16_t *sum,
                           const pixel (*left)[4],
                           const pixel *src, const ptrdiff_t stride,
                           const int w, const int h,
                           const enum LrEdgeFlags edges);
void dav1d_sgr_box3_v_avx2(int32_t *sumsq, int16_t *sum,
                           const int w, const int h,
                           const enum LrEdgeFlags edges);
void dav1d_sgr_calc_ab1_avx2(int32_t *a, int16_t *b,
                             const int w, const int h, const int strength);
void dav1d_sgr_finish_filter1_avx2(coef *tmp,
                                   const pixel *src, const ptrdiff_t stride,
                                   const int32_t *a, const int16_t *b,
                                   const int w, const int h);

// filter with a 3x3 box (radius=1)
static void dav1d_sgr_filter1_avx2(coef *tmp,
                                   const pixel *src, const ptrdiff_t stride,
                                   const pixel (*left)[4],
                                   const pixel *lpf, const ptrdiff_t lpf_stride,
                                   const int w, const int h, const int strength,
                                   const enum LrEdgeFlags edges)
{
    ALIGN_STK_32(int32_t, sumsq_mem, (384 + 16) * 68 + 8,);
    int32_t *const sumsq = &sumsq_mem[(384 + 16) * 2 + 8], *const a = sumsq;
    ALIGN_STK_32(int16_t, sum_mem, (384 + 16) * 68 + 16,);
    int16_t *const sum = &sum_mem[(384 + 16) * 2 + 16], *const b = sum;

    dav1d_sgr_box3_h_avx2(sumsq, sum, left, src, stride, w, h, edges);
    if (edges & LR_HAVE_TOP)
        dav1d_sgr_box3_h_avx2(&sumsq[-2 * (384 + 16)], &sum[-2 * (384 + 16)],
                              NULL, lpf, lpf_stride, w, 2, edges);
    if (edges & LR_HAVE_BOTTOM)
        dav1d_sgr_box3_h_avx2(&sumsq[h * (384 + 16)], &sum[h * (384 + 16)],
                              NULL, lpf + 6 * PXSTRIDE(lpf_stride),
                              lpf_stride, w, 2, edges);

    dav1d_sgr_box3_v_avx2(sumsq, sum, w, h, edges);
    dav1d_sgr_calc_ab1_avx2(a, b, w, h, strength);
    dav1d_sgr_finish_filter1_avx2(tmp, src, stride, a, b, w, h);
}
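
For orientation, here is a minimal scalar sketch of what the box3 horizontal pass computes. It is not part of the commit: sgr_box3_h_ref and SUM_STRIDE are illustrative names, 8bpc (uint8_t) is assumed, and borders are simply clamped where the real asm honors the edge flags and the left/lpf columns. The vertical pass then sums three of these row results to complete the 3x3 box. The (384 + 16) row stride mirrors the buffers above: 384 covers the widest unit handled per call, and the 68 rows fit a 64-pixel-high unit plus the two extra rows above and below that the LR_HAVE_TOP/LR_HAVE_BOTTOM calls fill in.

#define SUM_STRIDE (384 + 16)

static void sgr_box3_h_ref(int32_t *sumsq, int16_t *sum,
                           const uint8_t *src, const ptrdiff_t stride,
                           const int w, const int h)
{
    for (int y = 0; y < h; y++) {
        for (int x = 0; x < w; x++) {
            // 3-wide window around x, clamped at the row borders
            const int l = src[x > 0 ? x - 1 : 0];
            const int m = src[x];
            const int r = src[x < w - 1 ? x + 1 : w - 1];
            sum[x]   = (int16_t)(l + m + r);  // box sum
            sumsq[x] = l * l + m * m + r * r; // box sum of squares
        }
        src   += stride;
        sum   += SUM_STRIDE;
        sumsq += SUM_STRIDE;
    }
}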
void dav1d_sgr_box5_h_avx2(int32_t *sumsq, int16_t *sum,
                           const pixel (*left)[4],
                           const pixel *src, const ptrdiff_t stride,
                           const int w, const int h,
                           const enum LrEdgeFlags edges);
void dav1d_sgr_box5_v_avx2(int32_t *sumsq, int16_t *sum,
                           const int w, const int h,
                           const enum LrEdgeFlags edges);
void dav1d_sgr_calc_ab2_avx2(int32_t *a, int16_t *b,
                             const int w, const int h, const int strength);
void dav1d_sgr_finish_filter2_avx2(coef *tmp,
                                   const pixel *src, const ptrdiff_t stride,
                                   const int32_t *a, const int16_t *b,
                                   const int w, const int h);

// filter with a 5x5 box (radius=2)
static void dav1d_sgr_filter2_avx2(coef *tmp,
                                   const pixel *src, const ptrdiff_t stride,
                                   const pixel (*left)[4],
                                   const pixel *lpf, const ptrdiff_t lpf_stride,
                                   const int w, const int h, const int strength,
                                   const enum LrEdgeFlags edges)
{
    ALIGN_STK_32(int32_t, sumsq_mem, (384 + 16) * 68 + 8,);
    int32_t *const sumsq = &sumsq_mem[(384 + 16) * 2 + 8], *const a = sumsq;
    ALIGN_STK_32(int16_t, sum_mem, (384 + 16) * 68 + 16,);
    int16_t *const sum = &sum_mem[(384 + 16) * 2 + 16], *const b = sum;

    dav1d_sgr_box5_h_avx2(sumsq, sum, left, src, stride, w, h, edges);
    if (edges & LR_HAVE_TOP)
        dav1d_sgr_box5_h_avx2(&sumsq[-2 * (384 + 16)], &sum[-2 * (384 + 16)],
                              NULL, lpf, lpf_stride, w, 2, edges);
    if (edges & LR_HAVE_BOTTOM)
        dav1d_sgr_box5_h_avx2(&sumsq[h * (384 + 16)], &sum[h * (384 + 16)],
                              NULL, lpf + 6 * PXSTRIDE(lpf_stride),
                              lpf_stride, w, 2, edges);

    dav1d_sgr_box5_v_avx2(sumsq, sum, w, h, edges);
    dav1d_sgr_calc_ab2_avx2(a, b, w, h, strength);
    dav1d_sgr_finish_filter2_avx2(tmp, src, stride, a, b, w, h);
}
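
The calc_ab step is where dav1d_sgr_x_by_xplus1 (the table widened from int16_t to int at the top of this commit) gets consumed: it turns the box sums into the per-pixel A/B coefficients of the guided filter. Below is an illustrative scalar version for the 3x3 case, following the rounding of the AV1 C reference rather than anything taken from this commit's asm; sgr_calc_ab1_ref and one_by_n are made-up names, imax/imin come from common/intops.h (included above), and the one-row border the real pass also covers is omitted.

static void sgr_calc_ab1_ref(int32_t *a, int16_t *b,
                             const int w, const int h, const int strength)
{
    const int n = 9;          // pixels in a 3x3 box
    const int one_by_n = 455; // round(2^12 / 9)

    for (int y = 0; y < h; y++) {
        for (int x = 0; x < w; x++) {
            // on entry, a[] holds box sums of squares and b[] box sums,
            // exactly as aliased by the callers above
            const int ssq = a[x], s = b[x];
            const unsigned p = imax(ssq * n - s * s, 0); // n^2 * variance
            // widen before scaling by the strength to avoid 32-bit overflow
            const int z = (int)(((uint64_t)p * strength + (1 << 19)) >> 20);
            const int xx = dav1d_sgr_x_by_xplus1[imin(z, 255)];
            // A and B swap roles here so the wider product lands in the
            // 32-bit array
            a[x] = ((256 - xx) * s * one_by_n + (1 << 11)) >> 12;
            b[x] = (int16_t)xx;
        }
        a += SUM_STRIDE; // same row stride as the sum buffers
        b += SUM_STRIDE;
    }
}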
void dav1d_sgr_weighted1_avx2(pixel *dst, const ptrdiff_t stride,
                              const coef *t1, const int w, const int h,
                              const int wt);
void dav1d_sgr_weighted2_avx2(pixel *dst, const ptrdiff_t stride,
                              const coef *t1, const coef *t2,
                              const int w, const int h,
                              const int16_t wt[2]);

static void sgr_filter_avx2(pixel *const dst, const ptrdiff_t dst_stride,
                            const pixel (*const left)[4],
                            const pixel *lpf, const ptrdiff_t lpf_stride,
                            const int w, const int h, const int sgr_idx,
                            const int16_t sgr_wt[7], const enum LrEdgeFlags edges)
{
    if (!dav1d_sgr_params[sgr_idx][0]) { // r0 == 0: only the 3x3 (radius-1) filter
        ALIGN_STK_32(coef, tmp, 64 * 384,);
        dav1d_sgr_filter1_avx2(tmp, dst, dst_stride, left, lpf, lpf_stride,
                               w, h, dav1d_sgr_params[sgr_idx][3], edges);
        dav1d_sgr_weighted1_avx2(dst, dst_stride, tmp, w, h, (1 << 7) - sgr_wt[1]);
    } else if (!dav1d_sgr_params[sgr_idx][1]) { // r1 == 0: only the 5x5 (radius-2) filter
        ALIGN_STK_32(coef, tmp, 64 * 384,);
        dav1d_sgr_filter2_avx2(tmp, dst, dst_stride, left, lpf, lpf_stride,
                               w, h, dav1d_sgr_params[sgr_idx][2], edges);
        dav1d_sgr_weighted1_avx2(dst, dst_stride, tmp, w, h, sgr_wt[0]);
    } else { // both radii nonzero: run both filters and blend ("mix")
        ALIGN_STK_32(coef, tmp1, 64 * 384,);
        ALIGN_STK_32(coef, tmp2, 64 * 384,);
        dav1d_sgr_filter2_avx2(tmp1, dst, dst_stride, left, lpf, lpf_stride,
                               w, h, dav1d_sgr_params[sgr_idx][2], edges);
        dav1d_sgr_filter1_avx2(tmp2, dst, dst_stride, left, lpf, lpf_stride,
                               w, h, dav1d_sgr_params[sgr_idx][3], edges);
        const int16_t wt[2] = { sgr_wt[0], 128 - sgr_wt[0] - sgr_wt[1] };
        dav1d_sgr_weighted2_avx2(dst, dst_stride, tmp1, tmp2, w, h, wt);
    }
}
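
The final step in the mix case projects the source onto the two filter outputs. Here is a hedged scalar sketch of such a weighted2-style combine, modeled on the AV1 self-guided projection rather than this commit's asm: sgr_weighted2_ref is an illustrative name, 8bpc is assumed, iclip comes from common/intops.h, and t1/t2 are taken to hold filtered pixels at 4 fractional bits with a row stride of 384 to match the 64 * 384 buffers above.

static void sgr_weighted2_ref(uint8_t *dst, const ptrdiff_t stride,
                              const int16_t *t1, const int16_t *t2,
                              const int w, const int h, const int16_t wt[2])
{
    for (int y = 0; y < h; y++) {
        for (int x = 0; x < w; x++) {
            const int u = dst[x] << 4; // source pixel at 4 fractional bits
            // 7-bit weights against a total of 128; v has 11 fractional bits
            const int v = (u << 7) + wt[0] * (t1[x] - u) + wt[1] * (t2[x] - u);
            dst[x] = (uint8_t)iclip((v + (1 << 10)) >> 11, 0, 255);
        }
        dst += stride;
        t1 += 384;
        t2 += 384;
    }
}

Expanding the expression shows the effective weight on the unfiltered source is 128 - wt[0] - wt[1]; with wt[1] = 128 - sgr_wt[0] - sgr_wt[1] as in the caller above, that source weight comes out to exactly sgr_wt[1].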
#endif
void bitfn(dav1d_loop_restoration_dsp_init_x86)(Dav1dLoopRestorationDSPContext *const c) {
@@ -82,5 +205,6 @@ void bitfn(dav1d_loop_restoration_dsp_init_x86)(Dav1dLoopRestorationDSPContext *
 #if BITDEPTH == 8 && ARCH_X86_64
     c->wiener = wiener_filter_avx2;
+    c->selfguided = sgr_filter_avx2;
 #endif
 }
@@ -73,7 +73,7 @@ int float_near_abs_eps_array_ulp(const float *a, const float *b, float eps,
 static void *func_ref, *func_new;
-#define BENCH_RUNS (1 << 16) /* Trade-off between accuracy and speed */
+#define BENCH_RUNS (1 << 12) /* Trade-off between accuracy and speed */

 /* Decide whether or not the specified function needs to be tested */
 #define check_func(func, ...)\
......
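
(Reducing BENCH_RUNS from 2^16 to 2^12, i.e. 16x fewer iterations per function, makes checkasm's benchmark mode finish much sooner at some cost in timing stability; presumably a practical concession now that the comparatively heavy selfguided functions are benchmarked as well. That rationale is an inference, not stated in the commit.)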