Commit c290c02e authored by Henrik Gramner
Browse files

Add minor SGR optimizations

Split the 5x5, 3x3, and mix cases into separate functions.

Shrink some tables.

Move some scalar calculations out of the DSP function.

Make Wiener and SGR share the same function prototype to
eliminate a branch in lr_stripe().
parent c36b191a
......@@ -27,21 +27,20 @@
#include "src/cpu.h"
#include "src/looprestoration.h"
#include "src/tables.h"
#if ARCH_AARCH64
void BF(dav1d_wiener_filter7, neon)(pixel *p, const ptrdiff_t p_stride,
const pixel (*left)[4],
const pixel *lpf, const ptrdiff_t lpf_stride,
const int w, int h,
const int16_t filter[2][8],
const LooprestorationParams *const params,
const enum LrEdgeFlags edges
HIGHBD_DECL_SUFFIX);
void BF(dav1d_wiener_filter5, neon)(pixel *p, const ptrdiff_t p_stride,
const pixel (*left)[4],
const pixel *lpf, const ptrdiff_t lpf_stride,
const int w, int h,
const int16_t filter[2][8],
const LooprestorationParams *const params,
const enum LrEdgeFlags edges
HIGHBD_DECL_SUFFIX);
#else
......@@ -81,9 +80,10 @@ static void wiener_filter_neon(pixel *const dst, const ptrdiff_t dst_stride,
const pixel (*const left)[4],
const pixel *lpf, const ptrdiff_t lpf_stride,
const int w, const int h,
const int16_t filter[2][8],
const LooprestorationParams *const params,
const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
{
const int16_t (*const filter)[8] = params->filter;
ALIGN_STK_16(int16_t, mid, 68 * 384,);
int mid_stride = (w + 7) & ~7;
......@@ -208,43 +208,50 @@ void BF(dav1d_sgr_weighted2, neon)(pixel *dst, const ptrdiff_t dst_stride,
const int w, const int h,
const int16_t wt[2] HIGHBD_DECL_SUFFIX);
static void sgr_filter_neon(pixel *const dst, const ptrdiff_t dst_stride,
const pixel (*const left)[4],
const pixel *lpf, const ptrdiff_t lpf_stride,
const int w, const int h, const int sgr_idx,
const int16_t sgr_wt[7], const enum LrEdgeFlags edges
HIGHBD_DECL_SUFFIX)
static void sgr_filter_5x5_neon(pixel *const dst, const ptrdiff_t dst_stride,
const pixel (*const left)[4],
const pixel *lpf, const ptrdiff_t lpf_stride,
const int w, const int h,
const LooprestorationParams *const params,
const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
{
if (!dav1d_sgr_params[sgr_idx][0]) {
ALIGN_STK_16(int16_t, tmp, 64 * 384,);
dav1d_sgr_filter1_neon(tmp, dst, dst_stride, left, lpf, lpf_stride,
w, h, dav1d_sgr_params[sgr_idx][3], edges
HIGHBD_TAIL_SUFFIX);
BF(dav1d_sgr_weighted1, neon)(dst, dst_stride, dst, dst_stride,
tmp, w, h, (1 << 7) - sgr_wt[1]
HIGHBD_TAIL_SUFFIX);
} else if (!dav1d_sgr_params[sgr_idx][1]) {
ALIGN_STK_16(int16_t, tmp, 64 * 384,);
dav1d_sgr_filter2_neon(tmp, dst, dst_stride, left, lpf, lpf_stride,
w, h, dav1d_sgr_params[sgr_idx][2], edges
HIGHBD_TAIL_SUFFIX);
BF(dav1d_sgr_weighted1, neon)(dst, dst_stride, dst, dst_stride,
tmp, w, h, sgr_wt[0]
HIGHBD_TAIL_SUFFIX);
} else {
ALIGN_STK_16(int16_t, tmp1, 64 * 384,);
ALIGN_STK_16(int16_t, tmp2, 64 * 384,);
dav1d_sgr_filter2_neon(tmp1, dst, dst_stride, left, lpf, lpf_stride,
w, h, dav1d_sgr_params[sgr_idx][2], edges
HIGHBD_TAIL_SUFFIX);
dav1d_sgr_filter1_neon(tmp2, dst, dst_stride, left, lpf, lpf_stride,
w, h, dav1d_sgr_params[sgr_idx][3], edges
HIGHBD_TAIL_SUFFIX);
const int16_t wt[2] = { sgr_wt[0], 128 - sgr_wt[0] - sgr_wt[1] };
BF(dav1d_sgr_weighted2, neon)(dst, dst_stride, dst, dst_stride,
tmp1, tmp2, w, h, wt
HIGHBD_TAIL_SUFFIX);
}
ALIGN_STK_16(int16_t, tmp, 64 * 384,);
dav1d_sgr_filter2_neon(tmp, dst, dst_stride, left, lpf, lpf_stride,
w, h, params->sgr.s0, edges HIGHBD_TAIL_SUFFIX);
BF(dav1d_sgr_weighted1, neon)(dst, dst_stride, dst, dst_stride,
tmp, w, h, params->sgr.w0 HIGHBD_TAIL_SUFFIX);
}
/*
 * 3x3-only self-guided restoration entry point for NEON.
 *
 * Shares the common loop restoration prototype; only params->sgr.s1 and
 * params->sgr.w1 are read. The intermediate filter output is written to a
 * 64 * 384 int16_t stack buffer and then combined with dst in place.
 */
static void sgr_filter_3x3_neon(pixel *const dst, const ptrdiff_t dst_stride,
                                const pixel (*const left)[4],
                                const pixel *lpf, const ptrdiff_t lpf_stride,
                                const int w, const int h,
                                const LooprestorationParams *const params,
                                const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
{
    const uint32_t strength = params->sgr.s1;
    const int16_t weight = params->sgr.w1;

    ALIGN_STK_16(int16_t, tmp, 64 * 384,);

    /* filter1 is the pass used for the 3x3 case; it fills tmp from dst. */
    dav1d_sgr_filter1_neon(tmp, dst, dst_stride, left, lpf, lpf_stride,
                           w, h, strength, edges HIGHBD_TAIL_SUFFIX);

    /* dst is passed as both destination and source of the weighting step. */
    BF(dav1d_sgr_weighted1, neon)(dst, dst_stride, dst, dst_stride,
                                  tmp, w, h, weight HIGHBD_TAIL_SUFFIX);
}
/*
 * Mixed (5x5 + 3x3) self-guided restoration entry point for NEON.
 *
 * Shares the common loop restoration prototype. Both passes read the same
 * dst input and write to separate 64 * 384 int16_t stack buffers; the two
 * results are then combined with dst in place using { w0, w1 }.
 */
static void sgr_filter_mix_neon(pixel *const dst, const ptrdiff_t dst_stride,
                                const pixel (*const left)[4],
                                const pixel *lpf, const ptrdiff_t lpf_stride,
                                const int w, const int h,
                                const LooprestorationParams *const params,
                                const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
{
    ALIGN_STK_16(int16_t, out_5x5, 64 * 384,);
    ALIGN_STK_16(int16_t, out_3x3, 64 * 384,);

    /* filter2 is driven by s0 (the 5x5 strength). */
    dav1d_sgr_filter2_neon(out_5x5, dst, dst_stride, left, lpf, lpf_stride,
                           w, h, params->sgr.s0, edges HIGHBD_TAIL_SUFFIX);

    /* filter1 is driven by s1 (the 3x3 strength). */
    dav1d_sgr_filter1_neon(out_3x3, dst, dst_stride, left, lpf, lpf_stride,
                           w, h, params->sgr.s1, edges HIGHBD_TAIL_SUFFIX);

    /* The weighted2 kernel takes both weights as a two-element array. */
    const int16_t weights[2] = { params->sgr.w0, params->sgr.w1 };
    BF(dav1d_sgr_weighted2, neon)(dst, dst_stride, dst, dst_stride,
                                  out_5x5, out_3x3, w, h,
                                  weights HIGHBD_TAIL_SUFFIX);
}
COLD void bitfn(dav1d_loop_restoration_dsp_init_arm)(Dav1dLoopRestorationDSPContext *const c, int bpc) {
......@@ -258,6 +265,9 @@ COLD void bitfn(dav1d_loop_restoration_dsp_init_arm)(Dav1dLoopRestorationDSPCont
#else
c->wiener[0] = c->wiener[1] = wiener_filter_neon;
#endif
if (bpc <= 10)
c->selfguided = sgr_filter_neon;
if (bpc <= 10) {
c->sgr[0] = sgr_filter_5x5_neon;
c->sgr[1] = sgr_filter_3x3_neon;
c->sgr[2] = sgr_filter_mix_neon;
}
}
......@@ -2485,15 +2485,12 @@ static void read_restoration_info(Dav1dTileContext *const t,
lr->filter_h[1], lr->filter_h[2], ts->msac.rng);
} else if (lr->type == DAV1D_RESTORATION_SGRPROJ) {
const unsigned idx = dav1d_msac_decode_bools(&ts->msac, 4);
const uint16_t *const sgr_params = dav1d_sgr_params[idx];
lr->sgr_idx = idx;
lr->sgr_weights[0] = dav1d_sgr_params[idx][0] ?
dav1d_msac_decode_subexp(&ts->msac,
ts->lr_ref[p]->sgr_weights[0] + 96, 128, 4) - 96 :
0;
lr->sgr_weights[1] = dav1d_sgr_params[idx][1] ?
dav1d_msac_decode_subexp(&ts->msac,
ts->lr_ref[p]->sgr_weights[1] + 32, 128, 4) - 32 :
95;
lr->sgr_weights[0] = sgr_params[0] ? dav1d_msac_decode_subexp(&ts->msac,
ts->lr_ref[p]->sgr_weights[0] + 96, 128, 4) - 96 : 0;
lr->sgr_weights[1] = sgr_params[1] ? dav1d_msac_decode_subexp(&ts->msac,
ts->lr_ref[p]->sgr_weights[1] + 32, 128, 4) - 32 : 95;
memcpy(lr->filter_v, ts->lr_ref[p]->filter_v, sizeof(lr->filter_v));
memcpy(lr->filter_h, ts->lr_ref[p]->filter_h, sizeof(lr->filter_h));
ts->lr_ref[p] = lr;
......
......@@ -40,11 +40,11 @@ typedef struct Av1FilterLUT {
} Av1FilterLUT;
typedef struct Av1RestorationUnit {
enum Dav1dRestorationType type;
uint8_t /* enum Dav1dRestorationType */ type;
int8_t filter_h[3];
int8_t filter_v[3];
uint8_t sgr_idx;
int16_t sgr_weights[2];
int8_t sgr_weights[2];
} Av1RestorationUnit;
// each struct describes one 128x128 area (1 or 4 SBs), pre-superres-scaling
......
......@@ -46,32 +46,32 @@ typedef const pixel (*const_left_pixel_row)[4];
typedef const void *const_left_pixel_row;
#endif
// Although the spec applies restoration filters over 4x4 blocks, the wiener
// filter can be applied to a bigger surface.
// Parameter block passed by pointer to every loop restoration filter.
// It is a union: lr_stripe() fills either `filter` (Wiener) or `sgr`
// (self-guided) before the call, and each filter reads only its own member.
typedef union LooprestorationParams {
// Wiener coefficients; lr_stripe() builds row [0] from filter_h and
// row [1] from filter_v.
ALIGN(int16_t filter[2][8], 16);
struct {
// Strength values handed to the 5x5 (s0) and 3x3 (s1) self-guided passes.
uint32_t s0, s1;
// Weights applied to the 5x5 (w0) and 3x3 (w1) filter outputs when they
// are blended with the source pixels.
int16_t w0, w1;
} sgr;
} LooprestorationParams;
// Although the spec applies restoration filters over 4x4 blocks,
// they can be applied to a bigger surface.
// * w is constrained by the restoration unit size (w <= 256)
// * h is constrained by the stripe height (h <= 64)
// The filter functions are allowed to do aligned writes past the right
// edge of the buffer, aligned up to the minimum loop restoration unit size
// (which is 32 pixels for subsampled chroma and 64 pixels for luma).
#define decl_wiener_filter_fn(name) \
#define decl_lr_filter_fn(name) \
void (name)(pixel *dst, ptrdiff_t dst_stride, \
const_left_pixel_row left, \
const pixel *lpf, ptrdiff_t lpf_stride, \
int w, int h, const int16_t filter[2][8], \
int w, int h, const LooprestorationParams *params, \
enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
typedef decl_wiener_filter_fn(*wienerfilter_fn);
#define decl_selfguided_filter_fn(name) \
void (name)(pixel *dst, ptrdiff_t dst_stride, \
const_left_pixel_row left, \
const pixel *lpf, ptrdiff_t lpf_stride, \
int w, int h, int sgr_idx, const int16_t sgr_w[2], \
const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
typedef decl_selfguided_filter_fn(*selfguided_fn);
typedef decl_lr_filter_fn(*looprestorationfilter_fn);
typedef struct Dav1dLoopRestorationDSPContext {
wienerfilter_fn wiener[2]; /* 7-tap, 5-tap */
selfguided_fn selfguided;
looprestorationfilter_fn wiener[2]; /* 7-tap, 5-tap */
looprestorationfilter_fn sgr[3]; /* 5x5, 3x3, mix */
} Dav1dLoopRestorationDSPContext;
bitfn_decls(void dav1d_loop_restoration_dsp_init, Dav1dLoopRestorationDSPContext *c, int bpc);
......
......@@ -39,10 +39,10 @@
// TODO Reuse p when no padding is needed (add and remove lpf pixels in p)
// TODO Chroma only requires 2 rows of padding.
static void padding(pixel *dst, const pixel *p, const ptrdiff_t p_stride,
const pixel (*left)[4],
const pixel *lpf, const ptrdiff_t lpf_stride,
int unit_w, const int stripe_h, const enum LrEdgeFlags edges)
static NOINLINE void
padding(pixel *dst, const pixel *p, const ptrdiff_t p_stride,
const pixel (*left)[4], const pixel *lpf, const ptrdiff_t lpf_stride,
int unit_w, const int stripe_h, const enum LrEdgeFlags edges)
{
const int have_left = !!(edges & LR_HAVE_LEFT);
const int have_right = !!(edges & LR_HAVE_RIGHT);
......@@ -135,7 +135,7 @@ static void wiener_c(pixel *p, const ptrdiff_t p_stride,
const pixel (*const left)[4],
const pixel *lpf, const ptrdiff_t lpf_stride,
const int w, const int h,
const int16_t filter[2][8],
const LooprestorationParams *const params,
const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
{
// Wiener filtering is applied to a maximum stripe height of 64 + 3 pixels
......@@ -150,6 +150,7 @@ static void wiener_c(pixel *p, const ptrdiff_t p_stride,
uint16_t hor[70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE];
uint16_t *hor_ptr = hor;
const int16_t (*const filter)[8] = params->filter;
const int bitdepth = bitdepth_from_max(bitdepth_max);
const int round_bits_h = 3 + (bitdepth == 12) * 2;
const int rounding_off_h = 1 << (round_bits_h - 1);
......@@ -347,12 +348,12 @@ static void boxsum5(int32_t *sumsq, coef *sum, const pixel *const src,
}
}
static void selfguided_filter(coef *dst, const pixel *src,
const ptrdiff_t src_stride, const int w,
const int h, const int n, const int s
HIGHBD_DECL_SUFFIX)
static NOINLINE void
selfguided_filter(coef *dst, const pixel *src, const ptrdiff_t src_stride,
const int w, const int h, const int n, const unsigned s
HIGHBD_DECL_SUFFIX)
{
const int sgr_one_by_x = n == 25 ? 164 : 455;
const unsigned sgr_one_by_x = n == 25 ? 164 : 455;
// Selfguided filter is applied to a maximum stripe height of 64 + 3 pixels
// of padding above and below
......@@ -446,71 +447,93 @@ static void selfguided_filter(coef *dst, const pixel *src,
#undef EIGHT_NEIGHBORS
}
static void selfguided_c(pixel *p, const ptrdiff_t p_stride,
const pixel (*const left)[4],
const pixel *lpf, const ptrdiff_t lpf_stride,
const int w, const int h, const int sgr_idx,
const int16_t sgr_w[2], const enum LrEdgeFlags edges
HIGHBD_DECL_SUFFIX)
static void sgr_5x5_c(pixel *p, const ptrdiff_t p_stride,
const pixel (*const left)[4], const pixel *lpf,
const ptrdiff_t lpf_stride, const int w, const int h,
const LooprestorationParams *const params,
const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
{
// Selfguided filter is applied to a maximum stripe height of 64 + 3 pixels
// of padding above and below
pixel tmp[70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE];
padding(tmp, p, p_stride, left, lpf, lpf_stride, w, h, edges);
// Selfguided filter outputs to a maximum stripe height of 64 and a
// maximum restoration width of 384 (256 * 1.5)
coef dst[64 * 384];
// both r1 and r0 can't be zero
if (!dav1d_sgr_params[sgr_idx][0]) {
const int s1 = dav1d_sgr_params[sgr_idx][3];
selfguided_filter(dst, tmp, REST_UNIT_STRIDE, w, h, 9, s1 HIGHBD_TAIL_SUFFIX);
const int w1 = (1 << 7) - sgr_w[1];
for (int j = 0; j < h; j++) {
for (int i = 0; i < w; i++) {
const int u = (p[i] << 4);
const int v = (u << 7) + w1 * (dst[j * 384 + i] - u);
p[i] = iclip_pixel((v + (1 << 10)) >> 11);
}
p += PXSTRIDE(p_stride);
padding(tmp, p, p_stride, left, lpf, lpf_stride, w, h, edges);
selfguided_filter(dst, tmp, REST_UNIT_STRIDE, w, h, 25,
params->sgr.s0 HIGHBD_TAIL_SUFFIX);
const int w0 = params->sgr.w0;
for (int j = 0; j < h; j++) {
for (int i = 0; i < w; i++) {
const int u = (p[i] << 4);
const int v = (u << 7) + w0 * (dst[j * 384 + i] - u);
p[i] = iclip_pixel((v + (1 << 10)) >> 11);
}
} else if (!dav1d_sgr_params[sgr_idx][1]) {
const int s0 = dav1d_sgr_params[sgr_idx][2];
selfguided_filter(dst, tmp, REST_UNIT_STRIDE, w, h, 25, s0 HIGHBD_TAIL_SUFFIX);
const int w0 = sgr_w[0];
for (int j = 0; j < h; j++) {
for (int i = 0; i < w; i++) {
const int u = (p[i] << 4);
const int v = (u << 7) + w0 * (dst[j * 384 + i] - u);
p[i] = iclip_pixel((v + (1 << 10)) >> 11);
}
p += PXSTRIDE(p_stride);
p += PXSTRIDE(p_stride);
}
}
static void sgr_3x3_c(pixel *p, const ptrdiff_t p_stride,
const pixel (*const left)[4], const pixel *lpf,
const ptrdiff_t lpf_stride, const int w, const int h,
const LooprestorationParams *const params,
const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
{
pixel tmp[70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE];
coef dst[64 * 384];
padding(tmp, p, p_stride, left, lpf, lpf_stride, w, h, edges);
selfguided_filter(dst, tmp, REST_UNIT_STRIDE, w, h, 9,
params->sgr.s1 HIGHBD_TAIL_SUFFIX);
const int w1 = params->sgr.w1;
for (int j = 0; j < h; j++) {
for (int i = 0; i < w; i++) {
const int u = (p[i] << 4);
const int v = (u << 7) + w1 * (dst[j * 384 + i] - u);
p[i] = iclip_pixel((v + (1 << 10)) >> 11);
}
} else {
coef dst1[64 * 384];
const int s0 = dav1d_sgr_params[sgr_idx][2];
const int s1 = dav1d_sgr_params[sgr_idx][3];
const int w0 = sgr_w[0];
const int w1 = (1 << 7) - w0 - sgr_w[1];
selfguided_filter(dst, tmp, REST_UNIT_STRIDE, w, h, 25, s0 HIGHBD_TAIL_SUFFIX);
selfguided_filter(dst1, tmp, REST_UNIT_STRIDE, w, h, 9, s1 HIGHBD_TAIL_SUFFIX);
for (int j = 0; j < h; j++) {
for (int i = 0; i < w; i++) {
const int u = (p[i] << 4);
const int v = (u << 7) + w0 * (dst[j * 384 + i] - u) +
w1 * (dst1[j * 384 + i] - u);
p[i] = iclip_pixel((v + (1 << 10)) >> 11);
}
p += PXSTRIDE(p_stride);
p += PXSTRIDE(p_stride);
}
}
/*
 * C reference for the mixed self-guided filter: runs both the 5x5 (n = 25)
 * and the 3x3 (n = 9) passes on the same padded input and blends the two
 * results into p in place using the weights in params->sgr.
 */
static void sgr_mix_c(pixel *p, const ptrdiff_t p_stride,
                      const pixel (*const left)[4], const pixel *lpf,
                      const ptrdiff_t lpf_stride, const int w, const int h,
                      const LooprestorationParams *const params,
                      const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
{
    // Padded source: up to 64 rows plus 3 rows of padding above and below
    pixel tmp[70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE];
    // One output plane per pass, each with a fixed row pitch of 384
    coef dst0[64 * 384];
    coef dst1[64 * 384];

    padding(tmp, p, p_stride, left, lpf, lpf_stride, w, h, edges);

    selfguided_filter(dst0, tmp, REST_UNIT_STRIDE, w, h, 25,
                      params->sgr.s0 HIGHBD_TAIL_SUFFIX);
    selfguided_filter(dst1, tmp, REST_UNIT_STRIDE, w, h, 9,
                      params->sgr.s1 HIGHBD_TAIL_SUFFIX);

    const int w0 = params->sgr.w0;
    const int w1 = params->sgr.w1;

    const coef *row0 = dst0;
    const coef *row1 = dst1;
    for (int y = 0; y < h; y++) {
        for (int x = 0; x < w; x++) {
            // u is the source pixel scaled by 16; each weight scales the
            // difference between a filter output and u.
            const int u = (p[x] << 4);
            const int v = (u << 7) + w0 * (row0[x] - u) +
                                     w1 * (row1[x] - u);
            // Rounded shift by 11 (4 + 7) back to pixel range
            p[x] = iclip_pixel((v + (1 << 10)) >> 11);
        }
        p += PXSTRIDE(p_stride);
        row0 += 384;
        row1 += 384;
    }
}
COLD void bitfn(dav1d_loop_restoration_dsp_init)(Dav1dLoopRestorationDSPContext *const c, int bpc) {
c->wiener[0] = c->wiener[1] = wiener_c;
c->selfguided = selfguided_c;
c->sgr[0] = sgr_5x5_c;
c->sgr[1] = sgr_3x3_c;
c->sgr[2] = sgr_mix_c;
#if HAVE_ASM
#if ARCH_AARCH64 || ARCH_ARM
......
......@@ -167,9 +167,10 @@ static void lr_stripe(const Dav1dFrameContext *const f, pixel *p,
// The first stripe of the frame is shorter by 8 luma pixel rows.
int stripe_h = imin((64 - 8 * !y) >> ss_ver, row_h - y);
ALIGN_STK_16(int16_t, filter, 2, [8]);
wienerfilter_fn wiener_fn = NULL;
looprestorationfilter_fn lr_fn;
LooprestorationParams params;
if (lr->type == DAV1D_RESTORATION_WIENER) {
int16_t (*const filter)[8] = params.filter;
filter[0][0] = filter[0][6] = lr->filter_h[0];
filter[0][1] = filter[0][5] = lr->filter_h[1];
filter[0][2] = filter[0][4] = lr->filter_h[2];
......@@ -185,21 +186,23 @@ static void lr_stripe(const Dav1dFrameContext *const f, pixel *p,
filter[1][2] = filter[1][4] = lr->filter_v[2];
filter[1][3] = 128 - (filter[1][0] + filter[1][1] + filter[1][2]) * 2;
wiener_fn = dsp->lr.wiener[!(filter[0][0] | filter[1][0])];
lr_fn = dsp->lr.wiener[!(filter[0][0] | filter[1][0])];
} else {
assert(lr->type == DAV1D_RESTORATION_SGRPROJ);
const uint16_t *const sgr_params = dav1d_sgr_params[lr->sgr_idx];
params.sgr.s0 = sgr_params[0];
params.sgr.s1 = sgr_params[1];
params.sgr.w0 = lr->sgr_weights[0];
params.sgr.w1 = 128 - (lr->sgr_weights[0] + lr->sgr_weights[1]);
lr_fn = dsp->lr.sgr[!!sgr_params[0] + !!sgr_params[1] * 2 - 1];
}
while (y + stripe_h <= row_h) {
// Change the HAVE_BOTTOM bit in edges to (sby + 1 != f->sbh || y + stripe_h != row_h)
edges ^= (-(sby + 1 != f->sbh || y + stripe_h != row_h) ^ edges) & LR_HAVE_BOTTOM;
if (wiener_fn) {
wiener_fn(p, p_stride, left, lpf, lpf_stride, unit_w, stripe_h,
filter, edges HIGHBD_CALL_SUFFIX);
} else {
dsp->lr.selfguided(p, p_stride, left, lpf, lpf_stride, unit_w, stripe_h,
lr->sgr_idx, lr->sgr_weights, edges HIGHBD_CALL_SUFFIX);
}
lr_fn(p, p_stride, left, lpf, lpf_stride, unit_w, stripe_h,
&params, edges HIGHBD_CALL_SUFFIX);
left += stripe_h;
y += stripe_h;
......
......@@ -299,7 +299,6 @@ static inline void padding(uint8_t *dst, const uint8_t *p,
}
}
// FIXME Could split into luma and chroma specific functions,
// (since first and last tops are always 0 for chroma)
// FIXME Could implement a version that requires less temporary memory
......@@ -309,9 +308,11 @@ static void wiener_filter_vsx(uint8_t *p, const ptrdiff_t p_stride,
const uint8_t *lpf,
const ptrdiff_t lpf_stride,
const int w, const int h,
const int16_t filter[2][8],
const LooprestorationParams *const params,
const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
{
const int16_t (*const filter)[8] = params->filter;
// Wiener filtering is applied to a maximum stripe height of 64 + 3 pixels
// of padding above and below
ALIGN_STK_16(uint8_t, tmp, 70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE,);
......@@ -320,7 +321,6 @@ static void wiener_filter_vsx(uint8_t *p, const ptrdiff_t p_stride,
wiener_filter_h_vsx(hor, tmp, filter[0], w, h);
wiener_filter_v_vsx(p, p_stride, hor, filter[1], w, h);
}
#endif
......
......@@ -412,13 +412,11 @@ const int8_t dav1d_cdef_directions[2 + 8 + 2 /* dir */][2 /* pass */] = {
{ 0 * 12 + 1, -1 * 12 + 2 }, // 1
};
const int16_t dav1d_sgr_params[16][4] = { // r0, r1, e0, e1
{ 2, 1, 140, 3236 }, { 2, 1, 112, 2158 }, { 2, 1, 93, 1618 },
{ 2, 1, 80, 1438 }, { 2, 1, 70, 1295 }, { 2, 1, 58, 1177 },
{ 2, 1, 47, 1079 }, { 2, 1, 37, 996 }, { 2, 1, 30, 925 },
{ 2, 1, 25, 863 }, { 0, 1, -1, 2589 }, { 0, 1, -1, 1618 },
{ 0, 1, -1, 1177 }, { 0, 1, -1, 925 }, { 2, 0, 56, -1 },
{ 2, 0, 22, -1 },
// Self-guided filter strengths indexed by sgr_idx: { s0, s1 }.
// lr_stripe() copies them into LooprestorationParams.sgr.s0 / .s1 and picks
// the 5x5-only, 3x3-only or mix filter from which of the two are nonzero,
// so a 0 entry marks the pass that is not used for that index.
const uint16_t ALIGN(dav1d_sgr_params[16][2], 4) = {
{ 140, 3236 }, { 112, 2158 }, { 93, 1618 }, { 80, 1438 },
{ 70, 1295 }, { 58, 1177 }, { 47, 1079 }, { 37, 996 },
{ 30, 925 }, { 25, 863 }, { 0, 2589 }, { 0, 1618 },
{ 0, 1177 }, { 0, 925 }, { 56, 0 }, { 22, 0 },
};
const uint8_t ALIGN(dav1d_sgr_x_by_x[256], 16) = {
......
......@@ -107,7 +107,7 @@ extern const Dav1dWarpedMotionParams dav1d_default_wm_params;
extern const int8_t dav1d_cdef_directions[12][2];
extern const int16_t dav1d_sgr_params[16][4];
extern const uint16_t dav1d_sgr_params[16][2];
extern const uint8_t dav1d_sgr_x_by_x[256];
extern const int8_t dav1d_mc_subpel_filters[5+ARCH_X86_64][15][8];
......
......@@ -29,18 +29,17 @@
#include "src/looprestoration.h"
#include "common/intops.h"
#include "src/tables.h"
#define WIENER_FILTER(ext) \
void dav1d_wiener_filter7_##ext(pixel *const dst, ptrdiff_t dst_stride, \
void dav1d_wiener_filter7_##ext(pixel *dst, ptrdiff_t dst_stride, \
const pixel (*left)[4], const pixel *lpf, \
ptrdiff_t lpf_stride, int w, int h, \
const int16_t filter[2][8], \
const LooprestorationParams *params, \
enum LrEdgeFlags edges); \
void dav1d_wiener_filter5_##ext(pixel *const dst, ptrdiff_t dst_stride, \
void dav1d_wiener_filter5_##ext(pixel *dst, ptrdiff_t dst_stride, \
const pixel (*left)[4], const pixel *lpf, \
ptrdiff_t lpf_stride, int w, int h, \
const int16_t filter[2][8], \
const LooprestorationParams *params, \
enum LrEdgeFlags edges);
#define SGR_FILTER(ext) \
......@@ -53,7 +52,7 @@ void dav1d_sgr_box3_v_##ext(int32_t *sumsq, int16_t *sum, \
const int w, const int h, \
const enum LrEdgeFlags edges); \
void dav1d_sgr_calc_ab1_##ext(int32_t *a, int16_t *b, \
const int w, const int h, const int strength); \
const int w, const int h, const unsigned s); \
void dav1d_sgr_finish_filter1_##ext(coef *tmp, \
const pixel *src, const ptrdiff_t stride, \
const int32_t *a, const int16_t *b, \
......@@ -138,32 +137,45 @@ void dav1d_sgr_weighted2_##ext(pixel *dst, const ptrdiff_t stride, \
const int w, const int h, \
const uint32_t wt); \
\
static void sgr_filter_##ext(pixel *const dst, const ptrdiff_t dst_stride, \
const pixel (*const left)[4], \
const pixel *lpf, const ptrdiff_t lpf_stride, \
const int w, const int h, const int sgr_idx, \
const int16_t sgr_wt[7], const enum LrEdgeFlags edges) \
static void sgr_filter_5x5_##ext(pixel *const dst, const ptrdiff_t dst_stride, \
const pixel (*const left)[4], \
const pixel *lpf, const ptrdiff_t lpf_stride, \
const int w, const int h, \
const LooprestorationParams *const params, \
const enum LrEdgeFlags edges) \
{ \
ALIGN_STK_32(coef, tmp, 64 * 384,); \
dav1d_sgr_filter2_##ext(tmp, dst, dst_stride, left, lpf, lpf_stride, \
w, h, params->sgr.s0, edges); \
dav1d_sgr_weighted1_##ext(dst, dst_stride, tmp, w, h, params->sgr.w0); \
} \
static void sgr_filter_3x3_##ext(pixel *const dst, const ptrdiff_t dst_stride, \
const pixel (*const left)[4], \
const pixel *lpf, const ptrdiff_t lpf_stride, \
const int w, const int h, \
const LooprestorationParams *const params, \
const enum LrEdgeFlags edges) \
{ \