Commit 371de01c authored by Martin Storsjö
Browse files

WIP: arm64: Add SGR testing code

parent d77c91e0
Pipeline #5376 passed with stages
in 6 minutes and 55 seconds
......@@ -31,6 +31,8 @@
#include "common/attributes.h"
#include "src/tables.h"
//#define COMPARE
#if BITDEPTH == 8
// This calculates things slightly differently than the reference C version.
// This version calculates roughly this:
......@@ -94,6 +96,349 @@ static void wiener_filter_neon(pixel *const dst, const ptrdiff_t dst_stride,
}
#if ARCH_AARCH64
#ifdef COMPARE
#include <stdio.h>
/*
 * Plain C model of the horizontal 3-pixel box sum, used to cross-check the
 * NEON routine when COMPARE is defined.
 *
 * For each of the h rows and each x in [-1, w], the sum of the pixels at
 * x - 1, x and x + 1 is stored in sum[] and the sum of their squares in
 * sumsq[]; the value for column x lands at index x + 1 of the output row.
 * Output rows are 384 + 16 elements apart.
 *
 * Left edge: with LR_HAVE_LEFT the two pixels preceding the row come from
 * left[y][2..3] when left is non-NULL, otherwise from src[-2] and src[-1];
 * without the flag, src[0] is replicated.
 * Right edge: without LR_HAVE_RIGHT, reads at column w or beyond are
 * replaced by src[w - 1].
 */
static
void dav1d_sgr_box3_h_test(int32_t *sumsq, int16_t *sum,
                           const pixel (*left)[4],
                           const pixel *src, const ptrdiff_t stride,
                           const int w, const int h,
                           const enum LrEdgeFlags edges) {
    const int have_left = edges & LR_HAVE_LEFT;
    const int have_right = edges & LR_HAVE_RIGHT;
    int32_t *sq_row = sumsq + 1;
    int16_t *s_row = sum + 1;
    const pixel *px = src;

    for (int row = 0; row < h; row++) {
        /* Prime the sliding window with the two pixels left of column 0. */
        int p0, p1;
        if (!have_left) {
            p0 = px[0];
            p1 = px[0];
        } else if (left) {
            p0 = left[row][2];
            p1 = left[row][3];
        } else {
            p0 = px[-2];
            p1 = px[-1];
        }

        for (int col = -1; col < w + 1; col++) {
            const int next = col + 1;
            int p2;
            if (have_right || next < w)
                p2 = px[next];
            else
                p2 = px[w - 1];

            s_row[col] = p0 + p1 + p2;
            sq_row[col] = p0 * p0 + p1 * p1 + p2 * p2;

            /* Slide the window one pixel to the right. */
            p0 = p1;
            p1 = p2;
        }

        px += PXSTRIDE(stride);
        s_row += 384 + 16;
        sq_row += 384 + 16;
    }
}
/*
 * Plain C model of the horizontal 5-pixel box sum, used to cross-check the
 * NEON routine when COMPARE is defined.
 *
 * For each of the h rows and each x in [-1, w], the sum of the five pixels
 * at x - 2 .. x + 2 is stored in sum[] and the sum of their squares in
 * sumsq[]; the value for column x lands at index x + 1 of the output row.
 * Output rows are 384 + 16 elements apart.
 *
 * Left edge: with LR_HAVE_LEFT the three pixels preceding the row come from
 * left[y][1..3] when left is non-NULL, otherwise from src[-3..-1]; without
 * the flag, src[0] is replicated.
 * Right edge: without LR_HAVE_RIGHT, reads at column w or beyond are
 * replaced by src[w - 1].
 */
static
void dav1d_sgr_box5_h_test(int32_t *sumsq, int16_t *sum,
                           const pixel (*left)[4],
                           const pixel *src, const ptrdiff_t stride,
                           const int w, const int h,
                           const enum LrEdgeFlags edges) {
    const int have_left = edges & LR_HAVE_LEFT;
    const int have_right = edges & LR_HAVE_RIGHT;
    int32_t *sq_row = sumsq + 1;
    int16_t *s_row = sum + 1;
    const pixel *px = src;

    for (int row = 0; row < h; row++) {
        /* Prime the sliding window with columns -3 .. 0. */
        int p0, p1, p2;
        if (!have_left) {
            p0 = px[0];
            p1 = px[0];
            p2 = px[0];
        } else if (left) {
            p0 = left[row][1];
            p1 = left[row][2];
            p2 = left[row][3];
        } else {
            p0 = px[-3];
            p1 = px[-2];
            p2 = px[-1];
        }
        int p3 = px[0];

        for (int col = -1; col < w + 1; col++) {
            const int ahead = col + 2;
            int p4;
            if (have_right || ahead < w)
                p4 = px[ahead];
            else
                p4 = px[w - 1];

            s_row[col] = p0 + p1 + p2 + p3 + p4;
            sq_row[col] = p0 * p0 + p1 * p1 + p2 * p2 + p3 * p3 + p4 * p4;

            /* Slide the window one pixel to the right. */
            p0 = p1;
            p1 = p2;
            p2 = p3;
            p3 = p4;
        }

        px += PXSTRIDE(stride);
        s_row += 384 + 16;
        sq_row += 384 + 16;
    }
}
/*
 * Plain C model of the vertical 3-row box sum, applied in place on the
 * sum[] / sumsq[] buffers (rows 384 + 16 elements apart).
 *
 * Processes w + 2 consecutive columns starting at the given pointers. For
 * every column and every y in [-1, h], row y is overwritten with the sum of
 * the original values of rows y - 1, y and y + 1.
 *
 * Top edge: with LR_HAVE_TOP rows -2 and -1 are read from the buffers;
 * otherwise row 0 is replicated.
 * Bottom edge: without LR_HAVE_BOTTOM, reads of row h or beyond reuse the
 * most recent row value instead.
 */
static
void dav1d_sgr_box3_v_test(int32_t *sumsq, int16_t *sum,
                           const int w, const int h,
                           const enum LrEdgeFlags edges) {
    const int row_stride = 384 + 16;
    const int have_top = edges & LR_HAVE_TOP;
    const int have_bottom = edges & LR_HAVE_BOTTOM;

    for (int col = 0; col < w + 2; col++) {
        int32_t *const sq_col = sumsq + col;
        int16_t *const s_col = sum + col;

        /* Two-row history; the third row is fetched inside the loop. */
        int sq_prev2, sq_prev1, s_prev2, s_prev1;
        if (have_top) {
            sq_prev2 = sq_col[-2 * row_stride];
            sq_prev1 = sq_col[-1 * row_stride];
            s_prev2 = s_col[-2 * row_stride];
            s_prev1 = s_col[-1 * row_stride];
        } else {
            sq_prev2 = sq_col[0];
            sq_prev1 = sq_col[0];
            s_prev2 = s_col[0];
            s_prev1 = s_col[0];
        }

        for (int y = -1; y < h + 1; y++) {
            const int readable = have_bottom || y + 1 < h;
            /* Row y + 1 is read before row y is overwritten. */
            const int sq_next = readable ? sq_col[(y + 1) * row_stride] : sq_prev1;
            const int s_next = readable ? s_col[(y + 1) * row_stride] : s_prev1;

            sq_col[y * row_stride] = sq_prev2 + sq_prev1 + sq_next;
            s_col[y * row_stride] = s_prev2 + s_prev1 + s_next;

            sq_prev2 = sq_prev1;
            sq_prev1 = sq_next;
            s_prev2 = s_prev1;
            s_prev1 = s_next;
        }
    }
}
/*
 * Plain C model of the vertical 5-row box sum, applied in place on the
 * sum[] / sumsq[] buffers (rows 384 + 16 elements apart).
 *
 * Processes w + 2 consecutive columns starting at the given pointers. A
 * five-row window slides over y in [-1, h]; only rows where (y & 1) is
 * non-zero are overwritten, with the sum of the window centred on y.
 *
 * Top edge: with LR_HAVE_TOP the window starts as rows -2, -2, -1, 0 (row
 * -2 is used twice because no row -3 is read); otherwise row 0 fills all
 * four slots.
 * Bottom edge: rows below h + 2 (with LR_HAVE_BOTTOM) or below h (without)
 * are read from the buffers; beyond that the newest window value repeats.
 */
static
void dav1d_sgr_box5_v_test(int32_t *sumsq, int16_t *sum,
                           const int w, const int h,
                           const enum LrEdgeFlags edges) {
    const int row_stride = 384 + 16;
    const int have_top = edges & LR_HAVE_TOP;
    /* First row index that must NOT be read from the buffers. */
    const int row_limit = (edges & LR_HAVE_BOTTOM) ? h + 2 : h;

    for (int col = 0; col < w + 2; col++) {
        int32_t *const sq_col = sumsq + col;
        int16_t *const s_col = sum + col;

        int sq0, sq1, sq2, sq3;
        int s0, s1, s2, s3;
        if (have_top) {
            sq0 = sq_col[-2 * row_stride];
            sq1 = sq_col[-2 * row_stride];
            sq2 = sq_col[-1 * row_stride];
            s0 = s_col[-2 * row_stride];
            s1 = s_col[-2 * row_stride];
            s2 = s_col[-1 * row_stride];
        } else {
            sq0 = sq_col[0];
            sq1 = sq_col[0];
            sq2 = sq_col[0];
            s0 = s_col[0];
            s1 = s_col[0];
            s2 = s_col[0];
        }
        /* Row 0 is the fourth window slot regardless of the top flag. */
        sq3 = sq_col[0];
        s3 = s_col[0];

        for (int y = -1; y < h + 1; y++) {
            const int readable = y + 2 < row_limit;
            const int sq4 = readable ? sq_col[(y + 2) * row_stride] : sq3;
            const int s4 = readable ? s_col[(y + 2) * row_stride] : s3;

            if (y & 1) {
                sq_col[y * row_stride] = sq0 + sq1 + sq2 + sq3 + sq4;
                s_col[y * row_stride] = s0 + s1 + s2 + s3 + s4;
            }

            sq0 = sq1;
            sq1 = sq2;
            sq2 = sq3;
            sq3 = sq4;
            s0 = s1;
            s1 = s2;
            s2 = s3;
            s3 = s4;
        }
    }
}
/*
 * Plain C model of the A/B coefficient computation for the 3x3 box
 * (n = 9, one_by_x = 455), operating in place on rows 384 + 16 elements
 * apart.
 *
 * Visits every row in [-1, h] and every column in [-1, w], addressed
 * relative to AA + 1 / BB + 1. For each element, with a = AA and b = BB:
 *   p = max(a * n - b * b, 0)
 *   z = (p * s + (1 << 19)) >> 20
 *   x = dav1d_sgr_x_by_x[min(z, 255)]
 *   AA = (x * b * one_by_x + (1 << 11)) >> 12
 *   BB = 256 - x
 */
static
void dav1d_sgr_calc_ab1_test(int32_t *AA, int16_t *BB,
                             const int w, const int h, const int s) {
    const int row_stride = 384 + 16;
    const int n = 9;
    const int sgr_one_by_x = 455;

    /* Start one column in and one row up, i.e. at element (-1 row, 0 col). */
    int32_t *a_row = AA + 1 - row_stride;
    int16_t *b_row = BB + 1 - row_stride;

    for (int row = -1; row < h + 1; row++) {
        for (int col = -1; col < w + 1; col++) {
            const int a = a_row[col];
            const int b = b_row[col];
            const unsigned p = imax(a * n - b * b, 0);
            const unsigned z = (p * s + (1 << 19)) >> 20;
            const unsigned x = dav1d_sgr_x_by_x[imin(z, 255)];

            // This is where we invert A and B, so that B is of size coef.
            a_row[col] = (x * b_row[col] * sgr_one_by_x + (1 << 11)) >> 12;
            b_row[col] = 256 - x;
        }
        a_row += row_stride;
        b_row += row_stride;
    }
}
/*
 * Plain C model of the A/B coefficient computation for the 5x5 box
 * (n = 25, one_by_x = 164), operating in place on rows 384 + 16 elements
 * apart.
 *
 * Visits every second row starting at -1 (i.e. -1, 1, 3, ... while the row
 * index is <= h) and every column in [-1, w], addressed relative to
 * AA + 1 / BB + 1. For each element, with a = AA and b = BB:
 *   p = max(a * n - b * b, 0)
 *   z = (p * s + (1 << 19)) >> 20
 *   x = dav1d_sgr_x_by_x[min(z, 255)]
 *   AA = (x * b * one_by_x + (1 << 11)) >> 12
 *   BB = 256 - x
 */
static
void dav1d_sgr_calc_ab2_test(int32_t *AA, int16_t *BB,
                             const int w, const int h, const int s) {
    const int row_stride = 384 + 16;
    const int row_step = 2;
    const int n = 25;
    const int sgr_one_by_x = 164;

    /* Start one column in and one row up, i.e. at element (-1 row, 0 col). */
    int32_t *a_row = AA + 1 - row_stride;
    int16_t *b_row = BB + 1 - row_stride;

    for (int row = -1; row < h + 1; row += row_step) {
        for (int col = -1; col < w + 1; col++) {
            const int a = a_row[col];
            const int b = b_row[col];
            const unsigned p = imax(a * n - b * b, 0);
            const unsigned z = (p * s + (1 << 19)) >> 20;
            const unsigned x = dav1d_sgr_x_by_x[imin(z, 255)];

            // This is where we invert A and B, so that B is of size coef.
            a_row[col] = (x * b_row[col] * sgr_one_by_x + (1 << 11)) >> 12;
            b_row[col] = 256 - x;
        }
        a_row += row_step * row_stride;
        b_row += row_step * row_stride;
    }
}
/*
 * Plain C model of the final 3x3 self-guided filter step.
 *
 * For each of the h rows and w columns, a weight is built from the 3x3
 * neighbourhood of B and an offset from the 3x3 neighbourhood of A (centre
 * and edge-adjacent entries weighted 4, diagonal entries weighted 3), and
 *   tmp = (weight * src + offset + (1 << 8)) >> 9
 * is stored. A and B are addressed relative to A + 1 / B + 1 with rows
 * REST_UNIT_STRIDE apart; tmp rows are 384 elements apart.
 *
 * NOTE(review): src is advanced by the raw `stride`, whereas the sibling
 * helpers in this file use PXSTRIDE(stride). Presumably equivalent for the
 * BITDEPTH == 8 build this code sits in -- confirm before reusing at other
 * bit depths.
 */
static
void dav1d_sgr_finish_filter1_test(coef *tmp,
                                   const pixel *src, const ptrdiff_t stride,
                                   const int32_t *A, const int16_t *B,
                                   const int w, const int h) {
/* These macros stay defined after this function; later code relies on
 * REST_UNIT_STRIDE. */
#define REST_UNIT_STRIDE (384+16)
#define EIGHT_NEIGHBORS(P, i)\
((P[i] + P[i - 1] + P[i + 1] + P[i - REST_UNIT_STRIDE] + P[i + REST_UNIT_STRIDE]) * 4 + \
(P[i - 1 - REST_UNIT_STRIDE] + P[i - 1 + REST_UNIT_STRIDE] + \
P[i + 1 - REST_UNIT_STRIDE] + P[i + 1 + REST_UNIT_STRIDE]) * 3)
    const int32_t *a_row = A + 1;
    const int16_t *b_row = B + 1;
    const pixel *px = src;
    coef *out = tmp;

    for (int row = 0; row < h; row++) {
        for (int col = 0; col < w; col++) {
            const int weight = EIGHT_NEIGHBORS(b_row, col);
            const int offset = EIGHT_NEIGHBORS(a_row, col);
            out[col] = (weight * px[col] + offset + (1 << 8)) >> 9;
        }
        out += 384;
        px += stride;
        b_row += REST_UNIT_STRIDE;
        a_row += REST_UNIT_STRIDE;
    }
}
/*
 * Plain C model of the final 5x5 self-guided filter step.
 *
 * A and B are addressed relative to A + 1 / B + 1 with rows
 * REST_UNIT_STRIDE apart; tmp rows are 384 elements apart. Rows alternate
 * between two kernels:
 *   - even rows use the rows directly above and below (vertical neighbours
 *     weighted 6, diagonal neighbours weighted 5) and store
 *       (weight * src + offset + (1 << 8)) >> 9
 *   - odd rows use the current row only (centre weighted 6, left and right
 *     weighted 5) and store
 *       (weight * src + offset + (1 << 7)) >> 8
 *
 * NOTE(review): src is advanced by the raw `stride`, whereas the sibling
 * helpers in this file use PXSTRIDE(stride). Presumably equivalent for the
 * BITDEPTH == 8 build this code sits in -- confirm before reusing at other
 * bit depths.
 */
static
void dav1d_sgr_finish_filter2_test(coef *tmp,
                                   const pixel *src, const ptrdiff_t stride,
                                   const int32_t *A, const int16_t *B,
                                   const int w, const int h) {
#define SIX_NEIGHBORS(P, i)\
((P[i - REST_UNIT_STRIDE] + P[i + REST_UNIT_STRIDE]) * 6 + \
(P[i - 1 - REST_UNIT_STRIDE] + P[i - 1 + REST_UNIT_STRIDE] + \
P[i + 1 - REST_UNIT_STRIDE] + P[i + 1 + REST_UNIT_STRIDE]) * 5)
    const int32_t *a_row = A + 1;
    const int16_t *b_row = B + 1;
    const pixel *px = src;
    coef *out = tmp;

    for (int row = 0; row < h; row++) {
        if (row & 1) {
            /* Odd row: horizontal three-tap kernel on the current row. */
            for (int col = 0; col < w; col++) {
                const int weight = b_row[col] * 6 + (b_row[col - 1] + b_row[col + 1]) * 5;
                const int offset = a_row[col] * 6 + (a_row[col - 1] + a_row[col + 1]) * 5;
                out[col] = (weight * px[col] + offset + (1 << 7)) >> 8;
            }
        } else {
            /* Even row (including a trailing one when h is odd). */
            for (int col = 0; col < w; col++) {
                const int weight = SIX_NEIGHBORS(b_row, col);
                const int offset = SIX_NEIGHBORS(a_row, col);
                out[col] = (weight * px[col] + offset + (1 << 8)) >> 9;
            }
        }
        out += 384 /* Maximum restoration width is 384 (256 * 1.5) */;
        px += stride;
        b_row += REST_UNIT_STRIDE;
        a_row += REST_UNIT_STRIDE;
    }
}
#if 1
/*
 * Plain C model of the single-filter weighted blend.
 *
 * For each of the h rows and w columns:
 *   u   = src << 4
 *   v   = (u << 7) + w1 * (t1 - u)
 *   dst = iclip_pixel((v + (1 << 10)) >> 11)
 * t1 rows are 384 elements apart; src and dst advance by
 * PXSTRIDE(src_stride) and PXSTRIDE(dst_stride) respectively.
 */
static
void dav1d_sgr_weighted1_test(pixel *dst, const ptrdiff_t dst_stride,
                              const pixel *src, const ptrdiff_t src_stride,
                              const coef *t1, const int w, const int h,
                              const int w1) {
    const coef *filt_row = t1;
    const pixel *in_row = src;
    pixel *out_row = dst;

    for (int row = 0; row < h; row++) {
        for (int col = 0; col < w; col++) {
            const int u = (in_row[col] << 4);
            const int delta = filt_row[col] - u;
            const int v = (u << 7) + w1 * delta;
            out_row[col] = iclip_pixel((v + (1 << 10)) >> 11);
        }
        filt_row += 384;
        in_row += PXSTRIDE(src_stride);
        out_row += PXSTRIDE(dst_stride);
    }
}
/*
 * Plain C model of the two-filter weighted blend.
 *
 * For each of the h rows and w columns:
 *   u   = src << 4
 *   v   = (u << 7) + wt[0] * (t1 - u) + wt[1] * (t2 - u)
 *   dst = iclip_pixel((v + (1 << 10)) >> 11)
 * t1 and t2 rows are 384 elements apart; src and dst advance by
 * PXSTRIDE(src_stride) and PXSTRIDE(dst_stride) respectively.
 */
static
void dav1d_sgr_weighted2_test(pixel *dst, const ptrdiff_t dst_stride,
                              const pixel *src, const ptrdiff_t src_stride,
                              const coef *t1, const coef *t2,
                              const int w, const int h,
                              const int16_t wt[2]) {
    const coef *filt1_row = t1;
    const coef *filt2_row = t2;
    const pixel *in_row = src;
    pixel *out_row = dst;

    for (int row = 0; row < h; row++) {
        for (int col = 0; col < w; col++) {
            const int u = (in_row[col] << 4);
            const int v = (u << 7) + wt[0] * (filt1_row[col] - u) +
                          wt[1] * (filt2_row[col] - u);
            out_row[col] = iclip_pixel((v + (1 << 10)) >> 11);
        }
        filt1_row += 384;
        filt2_row += 384;
        in_row += PXSTRIDE(src_stride);
        out_row += PXSTRIDE(dst_stride);
    }
}
#endif
/*
 * Debug helper: compares two sum / sumsq buffer pairs over rows
 * [offset, offset + h) and columns [-1, w], with rows 384 + 16 elements
 * apart.
 *
 * If any sum[] element differs, prints a "diff_s" header followed by one
 * line per row containing the sum values, the sum_test values and a map
 * marking mismatching columns with 'x'. The same is done independently for
 * sumsq[] under a "diff_sq" header. full_h and edges are only used in the
 * printed headers.
 */
static void compare_sums(const int32_t *sumsq, const int16_t *sum,
                         const int32_t *sumsq_test, const int16_t *sum_test,
                         int w, int full_h, int offset, int h,
                         enum LrEdgeFlags edges) {
    const int stride = 384+16;
    const int row_end = offset + h;
    int sum_differs = 0, sumsq_differs = 0;

    for (int row = offset; row < row_end; row++) {
        for (int col = -1; col < w + 1; col++) {
            const int at = row*stride + col;
            if (sumsq[at] != sumsq_test[at])
                sumsq_differs = 1;
            if (sum[at] != sum_test[at])
                sum_differs = 1;
        }
    }

    if (sum_differs) {
        printf("diff_s edges %d %d size %d %d\n", edges & LR_HAVE_LEFT, edges & LR_HAVE_RIGHT, w, full_h);
        for (int row = offset; row < row_end; row++) {
            const int base = row*stride;
            for (int col = -1; col < w + 1; col++)
                printf("%03d ", sum[base + col]);
            printf(" ");
            for (int col = -1; col < w + 1; col++)
                printf("%03d ", sum_test[base + col]);
            printf(" ");
            for (int col = -1; col < w + 1; col++)
                printf("%c", sum[base + col] != sum_test[base + col] ? 'x' : '.');
            printf("\n");
        }
    }

    if (sumsq_differs) {
        printf("diff_sq edges %d %d size %d %d\n", edges & LR_HAVE_LEFT, edges & LR_HAVE_RIGHT, w, full_h);
        for (int row = offset; row < row_end; row++) {
            const int base = row*stride;
            for (int col = -1; col < w + 1; col++)
                printf("%03d ", sumsq[base + col]);
            printf(" ");
            for (int col = -1; col < w + 1; col++)
                printf("%03d ", sumsq_test[base + col]);
            printf(" ");
            for (int col = -1; col < w + 1; col++)
                printf("%c", sumsq[base + col] != sumsq_test[base + col] ? 'x' : '.');
            printf("\n");
        }
    }
}
/*
 * Debug helper: compares two filter output buffers over h rows and w
 * columns (rows 384 elements apart).
 *
 * On any mismatch it prints a header, then the a[] and b[] coefficient
 * buffers for rows [-1, h] and columns [-1, w] (rows 384 + 16 apart), and
 * finally one line per output row with the tmp values, the tmp_test values,
 * the source pixels and a map marking mismatching columns with 'x'.
 */
static void compare_tmp(const coef *tmp, const coef *tmp_test,
                        const int32_t *a, const int16_t *b,
                        const pixel *src, ptrdiff_t stride, int w, int h) {
    int mismatch = 0;
    for (int row = 0; row < h; row++) {
        for (int col = 0; col < w; col++) {
            const int at = row*384 + col;
            if (tmp[at] != tmp_test[at])
                mismatch = 1;
        }
    }
    if (!mismatch)
        return;

    printf("diff tmp size %d %d\n", w, h);

    /* Dump the coefficient inputs, including the one-element border. */
    for (int row = -1; row < h + 1; row++) {
        const int base = row*(384+16);
        for (int col = -1; col < w + 1; col++)
            printf("%5d ", a[base + col]);
        printf(" ");
        for (int col = -1; col < w + 1; col++)
            printf("%5d ", b[base + col]);
        printf("\n");
    }

    /* Dump both outputs next to the source pixels and a mismatch map. */
    for (int row = 0; row < h; row++) {
        const int base = row*384;
        for (int col = 0; col < w; col++)
            printf("%6d ", tmp[base + col]);
        printf(" ");
        for (int col = 0; col < w; col++)
            printf("%6d ", tmp_test[base + col]);
        printf(" ");
        for (int col = 0; col < w; col++)
            printf("%3d ", src[row*PXSTRIDE(stride)+col]);
        printf(" ");
        for (int col = 0; col < w; col++)
            printf("%c", tmp[base + col] != tmp_test[base + col] ? 'x' : '.');
        printf("\n");
    }
}
#endif
void dav1d_sgr_box3_h_neon(int32_t *sumsq, int16_t *sum,
const pixel (*left)[4],
const pixel *src, const ptrdiff_t stride,
......@@ -121,20 +466,71 @@ static void dav1d_sgr_filter1_neon(coef *tmp,
int32_t *const sumsq = &sumsq_mem[(384 + 16) * 2 + 8], *const a = sumsq;
ALIGN_STK_16(int16_t, sum_mem, (384 + 16) * 68 + 16,);
int16_t *const sum = &sum_mem[(384 + 16) * 2 + 16], *const b = sum;
#ifdef COMPARE
ALIGN_STK_16(int32_t, sumsq_mem_test, (384 + 16) * 68 + 8,);
int32_t *const sumsq_test = &sumsq_mem_test[(384 + 16) * 2 + 8], *const a_test = sumsq_test;
ALIGN_STK_16(int16_t, sum_mem_test, (384 + 16) * 68 + 16,);
int16_t *const sum_test = &sum_mem_test[(384 + 16) * 2 + 16], *const b_test = sum_test;
memset(sum_mem, 128, 2*400*68);
memset(sum_mem_test, 128, 2*400*68);
memset(sumsq_mem, 128, 4*400*68);
memset(sumsq_mem_test, 128, 4*400*68);
#endif
dav1d_sgr_box3_h_neon(sumsq, sum, left, src, stride, w, h, edges);
if (edges & LR_HAVE_TOP)
#ifdef COMPARE
dav1d_sgr_box3_h_test(sumsq_test, sum_test, left, src, stride, w, h, edges);
compare_sums(sumsq, sum, sumsq_test, sum_test, w, h, 0, h, edges);
#endif
if (edges & LR_HAVE_TOP) {
#ifdef COMPARE
if (0) {
for (int y = 0; y < 2 && 1; y++) {
for (int x = 0; x < w; x++)
printf("%03d ", lpf[y*lpf_stride +x]);
printf("\n");
}
}
#endif
dav1d_sgr_box3_h_neon(&sumsq[-2 * (384 + 16)], &sum[-2 * (384 + 16)],
NULL, lpf, lpf_stride, w, 1, edges);
#ifdef COMPARE
dav1d_sgr_box3_h_test(&sumsq_test[-2 * (384 + 16)], &sum_test[-2 * (384 + 16)],
NULL, lpf, lpf_stride, w, 2, edges);
compare_sums(sumsq, sum, sumsq_test, sum_test, w, h, -2, 2, edges);
#endif
}
if (edges & LR_HAVE_BOTTOM)
if (edges & LR_HAVE_BOTTOM) {
dav1d_sgr_box3_h_neon(&sumsq[h * (384 + 16)], &sum[h * (384 + 16)],
NULL, lpf + 6 * PXSTRIDE(lpf_stride),
lpf_stride, w, 2, edges);
#ifdef COMPARE
dav1d_sgr_box3_h_test(&sumsq_test[h * (384 + 16)], &sum_test[h * (384 + 16)],
NULL, lpf + 6 * PXSTRIDE(lpf_stride),
lpf_stride, w, 2, edges);
compare_sums(sumsq, sum, sumsq_test, sum_test, w, h, h, 2, edges);
#endif
}
dav1d_sgr_box3_v_neon(sumsq, sum, w, h, edges);
#ifdef COMPARE
dav1d_sgr_box3_v_test(sumsq_test, sum_test, w, h, edges);
compare_sums(sumsq, sum, sumsq_test, sum_test, w, h, -2, h + 4, edges);
#endif
dav1d_sgr_calc_ab1_neon(a, b, w, h, strength);
#ifdef COMPARE
dav1d_sgr_calc_ab1_test(a_test, b_test, w, h, strength);
compare_sums(sumsq, sum, sumsq_test, sum_test, w, h, -1, h + 1, edges);
#endif
dav1d_sgr_finish_filter1_neon(tmp, src, stride, a, b, w, h);
#ifdef COMPARE
ALIGN_STK_16(coef, tmp_test, 64 * 384,);
dav1d_sgr_finish_filter1_test(tmp_test, src, stride, a, b, w, h);
compare_tmp(tmp, tmp_test, a, b, src, stride, w, h);
#endif
}
void dav1d_sgr_box5_h_neon(int32_t *sumsq, int16_t *sum,
......@@ -164,20 +560,61 @@ static void dav1d_sgr_filter2_neon(coef *tmp,
int32_t *const sumsq = &sumsq_mem[(384 + 16) * 2 + 8], *const a = sumsq;
ALIGN_STK_16(int16_t, sum_mem, (384 + 16) * 68 + 16,);
int16_t *const sum = &sum_mem[(384 + 16) * 2 + 16], *const b = sum;
#ifdef COMPARE
ALIGN_STK_16(int32_t, sumsq_mem_test, (384 + 16) * 68 + 8,);
int32_t *const sumsq_test = &sumsq_mem_test[(384 + 16) * 2 + 8], *const a_test = sumsq_test;
ALIGN_STK_16(int16_t, sum_mem_test, (384 + 16) * 68 + 16,);
int16_t *const sum_test = &sum_mem_test[(384 + 16) * 2 + 16], *const b_test = sum_test;
memset(sum_mem, 128, 2*400*68);
memset(sum_mem_test, 128, 2*400*68);
memset(sumsq_mem, 128, 4*400*68);
memset(sumsq_mem_test, 128, 4*400*68);
#endif
dav1d_sgr_box5_h_neon(sumsq, sum, left, src, stride, w, h, edges);
if (edges & LR_HAVE_TOP)
#ifdef COMPARE
dav1d_sgr_box5_h_test(sumsq_test, sum_test, left, src, stride, w, h, edges);
compare_sums(sumsq, sum, sumsq_test, sum_test, w, h, 0, h, edges);
#endif
if (edges & LR_HAVE_TOP) {
dav1d_sgr_box5_h_neon(&sumsq[-2 * (384 + 16)], &sum[-2 * (384 + 16)],
NULL, lpf, lpf_stride, w, 2, edges);
#ifdef COMPARE
dav1d_sgr_box5_h_test(&sumsq_test[-2 * (384 + 16)], &sum_test[-2 * (384 + 16)],
NULL, lpf, lpf_stride, w, 2, edges);
compare_sums(sumsq, sum, sumsq_test, sum_test, w, h, -2, 2, edges);
#endif
}
if (edges & LR_HAVE_BOTTOM)
if (edges & LR_HAVE_BOTTOM) {
dav1d_sgr_box5_h_neon(&sumsq[h * (384 + 16)], &sum[h * (384 + 16)],
NULL, lpf + 6 * PXSTRIDE(lpf_stride),
lpf_stride, w, 2, edges);
#ifdef COMPARE
dav1d_sgr_box5_h_test(&sumsq_test[h * (384 + 16)], &sum_test[h * (384 + 16)],
NULL, lpf + 6 * PXSTRIDE(lpf_stride),
lpf_stride, w, 2, edges);
compare_sums(sumsq, sum, sumsq_test, sum_test, w, h, h, 2, edges);
#endif
}
dav1d_sgr_box5_v_neon(sumsq, sum, w, h, edges);
#ifdef COMPARE
dav1d_sgr_box5_v_test(sumsq_test, sum_test, w, h, edges);
compare_sums(sumsq, sum, sumsq_test, sum_test, w, h, -2, h + 4, edges);
#endif
dav1d_sgr_calc_ab2_neon(a, b, w, h, strength);
#ifdef COMPARE
dav1d_sgr_calc_ab2_test(a_test, b_test, w, h, strength);
compare_sums(sumsq, sum, sumsq_test, sum_test, w, h, -1, h + 1, edges);
#endif
dav1d_sgr_finish_filter2_neon(tmp, src, stride, a, b, w, h);
#ifdef COMPARE
ALIGN_STK_16(coef, tmp_test, 64 * 384,);
dav1d_sgr_finish_filter2_test(tmp_test, src, stride, a, b, w, h);
compare_tmp(tmp, tmp_test, a, b, src, stride, w, h);
#endif
}
void dav1d_sgr_weighted1_neon(pixel *dst, const ptrdiff_t dst_stride,
......
......@@ -28,6 +28,7 @@
#include "tests/checkasm/checkasm.h"
#include <string.h>
#include <stdio.h>
#include "src/levels.h"
#include "src/looprestoration.h"
......@@ -174,6 +175,29 @@ static void check_sgr(Dav1dLoopRestorationDSPContext *const c) {
call_new(a_dst + 32, 448 * sizeof(pixel), left,
h_edge + 32, 448 * sizeof(pixel),
w, h, sgr_idx, sgr_wt, edges HIGHBD_TAIL_SUFFIX);
#if 1
int diff = 0;
for (int y = 0; y < h; y++)
for (int x = 0; x < w; x++)
if (c_dst[32+y*448+x] != a_dst[32+y*448+x])
diff = 1;
if (diff) {
printf("selfguided_%s_%dbpc %dx%d edges %x\n", sgr_idx == 6 ? "mix" : sgr_idx == 10 ? "3x3" : "5x5", BITDEPTH, w, h, edges);
for (int y = 0; y < h; y++) {
for (int x = 0; x < w; x++)
printf("%3d ", c_dst[32+y*448+x]);
printf(" ");
for (int x = 0; x < w; x++)
printf("%3d ", a_dst[32+y*448+x]);
printf(" ");
for (int x = 0; x < w; x++)
printf("%c", c_dst[32+y*448+x] != a_dst[32+y*448+x] ? 'x' : '.');
printf("\n");
}
fflush(stdout);
}
#endif
const int res = cmp2d(c_dst + 32, a_dst + 32, 448 * sizeof(pixel), w, h);
if (res != -1) fail();
}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment