Commit 1f32abd2 authored by Ronald S. Bultje

Add infrastructure for LR SIMD and unit tests.

wiener_luma_8bpc_c: 326272.1
wiener_luma_8bpc_avx2: 19841.5

Decoding time of first 1000 frames of Chimera-8bit-1920x1080.ivf goes
from 27.471 to 23.558 seconds.
parent 1d775483
@@ -65,6 +65,11 @@ static void padding(pixel *dst, const pixel *p, const ptrdiff_t p_stride,
pixel_copy(dst_l, p, unit_w);
pixel_copy(dst_l + REST_UNIT_STRIDE, p, unit_w);
pixel_copy(dst_l + 2 * REST_UNIT_STRIDE, p, unit_w);
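// when the unit has a left neighbour, the last three of the four pixels
// stored per left[] row overwrite the three leftmost columns copied from p above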
if (have_left) {
pixel_copy(dst_l, &left[0][1], 3);
pixel_copy(dst_l + REST_UNIT_STRIDE, &left[0][1], 3);
pixel_copy(dst_l + 2 * REST_UNIT_STRIDE, &left[0][1], 3);
}
}
pixel *dst_tl = dst_l + 3 * REST_UNIT_STRIDE;
@@ -81,6 +86,11 @@ static void padding(pixel *dst, const pixel *p, const ptrdiff_t p_stride,
pixel_copy(dst_tl + stripe_h * REST_UNIT_STRIDE, src, unit_w);
pixel_copy(dst_tl + (stripe_h + 1) * REST_UNIT_STRIDE, src, unit_w);
pixel_copy(dst_tl + (stripe_h + 2) * REST_UNIT_STRIDE, src, unit_w);
if (have_left) {
pixel_copy(dst_tl + stripe_h * REST_UNIT_STRIDE, &left[stripe_h - 1][1], 3);
pixel_copy(dst_tl + (stripe_h + 1) * REST_UNIT_STRIDE, &left[stripe_h - 1][1], 3);
pixel_copy(dst_tl + (stripe_h + 2) * REST_UNIT_STRIDE, &left[stripe_h - 1][1], 3);
}
}
// Inner UNIT_WxSTRIPE_H
@@ -560,4 +570,8 @@ static void selfguided_c(pixel *p, const ptrdiff_t p_stride,
void bitfn(dav1d_loop_restoration_dsp_init)(Dav1dLoopRestorationDSPContext *const c) {
c->wiener = wiener_c;
c->selfguided = selfguided_c;
#if ARCH_X86 && BITDEPTH == 8
bitfn(dav1d_loop_restoration_dsp_init_x86)(c);
#endif
}
@@ -40,21 +40,31 @@ enum LrEdgeFlags {
LR_HAVE_BOTTOM = 1 << 3,
};
#ifdef BITDEPTH
typedef const pixel (*const_left_pixel_row)[4];
#else
typedef const void *const_left_pixel_row;
#endif
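// each left[] row stores 4 pixels, of which the last 3 are used as left padding;
// the void fallback keeps this header parseable where BITDEPTH (and hence pixel)
// is not defined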
// Although the spec applies restoration filters over 4x4 blocks, the wiener
// filter can be applied to a bigger surface.
// * w is constrained by the restoration unit size (w <= 256)
// * h is constrained by the stripe height (h <= 64)
typedef void (*wienerfilter_fn)(pixel *dst, ptrdiff_t dst_stride,
const void *left /*const pixel (*left)[4]*/,
const pixel *lpf, ptrdiff_t lpf_stride,
int w, int h, const int16_t filterh[7],
const int16_t filterv[7], enum LrEdgeFlags edges);
#define decl_wiener_filter_fn(name) \
void (name)(pixel *dst, ptrdiff_t dst_stride, \
const_left_pixel_row left, \
const pixel *lpf, ptrdiff_t lpf_stride, \
int w, int h, const int16_t filterh[7], \
const int16_t filterv[7], enum LrEdgeFlags edges)
typedef decl_wiener_filter_fn(*wienerfilter_fn);
typedef void (*selfguided_fn)(pixel *dst, ptrdiff_t dst_stride,
const void *left /*const pixel (*left)[4]*/,
const pixel *lpf, ptrdiff_t lpf_stride,
int w, int h, int sgr_idx, const int16_t sgr_w[2],
const enum LrEdgeFlags edges);
#define decl_selfguided_filter_fn(name) \
void (name)(pixel *dst, ptrdiff_t dst_stride, \
const_left_pixel_row left, \
const pixel *lpf, ptrdiff_t lpf_stride, \
int w, int h, int sgr_idx, const int16_t sgr_w[2], \
const enum LrEdgeFlags edges)
typedef decl_selfguided_filter_fn(*selfguided_fn);
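// Usage sketch (the function and init names below are illustrative, not part
// of this commit): a backend declares its entry point once with the macro and
// assigns it to the DSP context; the signature then matches wienerfilter_fn:
//
//     decl_wiener_filter_fn(dav1d_wiener_filter_example_avx2);
//
//     static void loop_restoration_init_example(Dav1dLoopRestorationDSPContext *const c) {
//         c->wiener = dav1d_wiener_filter_example_avx2;
//     }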
typedef struct Dav1dLoopRestorationDSPContext {
wienerfilter_fn wiener;
@@ -64,4 +74,7 @@ typedef struct Dav1dLoopRestorationDSPContext {
void dav1d_loop_restoration_dsp_init_8bpc(Dav1dLoopRestorationDSPContext *c);
void dav1d_loop_restoration_dsp_init_10bpc(Dav1dLoopRestorationDSPContext *c);
void dav1d_loop_restoration_dsp_init_x86_8bpc(Dav1dLoopRestorationDSPContext *c);
void dav1d_loop_restoration_dsp_init_x86_10bpc(Dav1dLoopRestorationDSPContext *c);
#endif /* __DAV1D_SRC_LOOPRESTORATION_H__ */
@@ -101,6 +101,7 @@ if is_asm_enabled
'x86/ipred_init.c',
'x86/itx_init.c',
'x86/loopfilter_init.c',
'x86/looprestoration_init.c',
'x86/mc_init.c',
)
@@ -110,6 +111,7 @@ if is_asm_enabled
'x86/ipred.asm',
'x86/itx.asm',
'x86/loopfilter.asm',
'x86/looprestoration.asm',
'x86/mc.asm',
)
......
; Copyright © 2018, VideoLAN and dav1d authors
; Copyright © 2018, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
; list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
; this list of conditions and the following disclaimer in the documentation
; and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
%include "config.asm"
%include "ext/x86/x86inc.asm"
%if ARCH_X86_64
SECTION_RODATA 32
pb_right_ext_mask: times 32 db 0xff
times 32 db 0
pb_14x0_1_2: times 14 db 0
db 1, 2
pb_0_to_15_min_n: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 13, 13
db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 14
pb_15: times 16 db 15
pw_128: times 2 dw 128
pw_2048: times 2 dw 2048
pw_16380: times 2 dw 16380
pw_0_128: dw 0, 128
pd_1024: dd 1024
SECTION .text
INIT_YMM avx2
cglobal wiener_filter_h, 8, 12, 16, dst, left, src, stride, fh, w, h, edge
vpbroadcastb m15, [fhq+0]
vpbroadcastb m14, [fhq+2]
vpbroadcastb m13, [fhq+4]
vpbroadcastw m12, [fhq+6]
vpbroadcastd m9, [pw_128]
paddw m12, m9
vpbroadcastd m11, [pw_2048]
vpbroadcastd m10, [pw_16380]
lea r11, [pb_right_ext_mask]
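; r11 = &pb_right_ext_mask; it doubles as a base register for addressing the other tables below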
DEFINE_ARGS dst, left, src, stride, x, w, h, edge, srcptr, dstptr, xlim
; if (edge & has_right) align_w_to_32
; else w -= 32, and use that as limit in x loop
test edged, 2 ; has_right
jnz .align
mov xlimq, -3
jmp .loop
.align:
add wd, 31
and wd, ~31
xor xlimd, xlimd
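; xlim is 0 when the right edge is present (w rounded up to 32), -3 otherwise;
; each row's x counter starts at w+xlim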
; main y (row) loop of the horizontal filter
.loop:
mov srcptrq, srcq
mov dstptrq, dstq
lea xq, [wq+xlimq]
; load left edge pixels
test edged, 1 ; have_left
jz .emu_left
test leftq, leftq ; left == NULL for the edge-extended bottom/top
jz .load_left_combined
movd xm0, [leftq]
pinsrd xm0, [srcq], 1
pslldq xm0, 9
jmp .left_load_done
.load_left_combined:
movq xm0, [srcq-5]
jmp .left_load_done
.emu_left:
movd xm0, [srcq]
pshufb xm0, [pb_14x0_1_2]
; load right edge pixels
.left_load_done:
cmp xd, 32
jg .main_load
test xd, xd
jg .load_and_splat
je .splat_right
; for very small images (w=[1-2]), edge-extend the original cache,
; ugly, but only runs in very odd cases
add wd, wd
pshufb xm0, [r11-pb_right_ext_mask+pb_0_to_15_min_n+wq*8-16]
shr wd, 1
; main x loop, mostly this starts in .main_load
.splat_right:
; no need to load new pixels, just extend them from the (possibly previously
; extended) previous load into m0
pshufb xm1, xm0, [pb_15]
jmp .main_loop
.load_and_splat:
; load new pixels and extend edge for right-most
movu m1, [srcptrq+3]
sub r11, xq
movu m2, [r11-pb_right_ext_mask+pb_right_ext_mask+32]
add r11, xq
vpbroadcastb m3, [srcptrq+2+xq]
pand m1, m2
pandn m3, m2, m3
por m1, m3
jmp .main_loop
.main_load:
; load subsequent line
movu m1, [srcptrq+3]
.main_loop:
vinserti128 m0, xm1, 1
palignr m2, m1, m0, 10
palignr m3, m1, m0, 11
palignr m4, m1, m0, 12
palignr m5, m1, m0, 13
palignr m6, m1, m0, 14
palignr m7, m1, m0, 15
punpcklbw m0, m2, m1
punpckhbw m2, m1
punpcklbw m8, m3, m7
punpckhbw m3, m7
punpcklbw m7, m4, m6
punpckhbw m4, m6
pxor m9, m9
punpcklbw m6, m5, m9
punpckhbw m5, m9
pmaddubsw m0, m15
pmaddubsw m2, m15
pmaddubsw m8, m14
pmaddubsw m3, m14
pmaddubsw m7, m13
pmaddubsw m4, m13
pmullw m6, m12
pmullw m5, m12
; note that m6/5 are unsigned here, whereas the others are signed
psubw m0, m10
psubw m2, m10
paddw m0, m8
paddw m2, m3
paddw m0, m7
paddw m2, m4
paddw m0, m6
paddw m2, m5
psraw m0, 3
psraw m2, 3
paddw m0, m11
paddw m2, m11
mova [dstptrq], xm0
mova [dstptrq+16], xm2
vextracti128 [dstptrq+32], m0, 1
vextracti128 [dstptrq+48], m2, 1
vextracti128 xm0, m1, 1
add srcptrq, 32
add dstptrq, 64
sub xq, 32
cmp xd, 32
jg .main_load
test xd, xd
jg .load_and_splat
cmp xd, xlimd
jg .splat_right
add srcq, strideq
add dstq, 384*2
dec hd
jg .loop
RET
cglobal wiener_filter_v, 7, 10, 16, dst, stride, mid, w, h, fv, edge
vpbroadcastd m14, [fvq+4]
vpbroadcastd m15, [fvq]
vpbroadcastd m13, [pw_0_128]
paddw m14, m13
vpbroadcastd m12, [pd_1024]
DEFINE_ARGS dst, stride, mid, w, h, ylim, edge, y, mptr, dstptr
mov ylimd, edged
and ylimd, 8 ; have_bottom
shr ylimd, 2
sub ylimd, 3
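; ylim = -1 if bottom edge rows are available, -3 otherwise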
; main x loop for vertical filter, does one column of 16 pixels
.loop_x:
mova m3, [midq] ; middle line
; load top pixels
test edged, 4 ; have_top
jz .emu_top
mova m0, [midq-384*4]
mova m2, [midq-384*2]
mova m1, m0
jmp .load_bottom_pixels
.emu_top:
mova m0, m3
mova m1, m3
mova m2, m3
; load bottom pixels
.load_bottom_pixels:
mov yd, hd
mov mptrq, midq
mov dstptrq, dstq
add yd, ylimd
jg .load_threelines
; the remainder here is somewhat messy but only runs in very weird
; circumstances at the bottom of the image in very small blocks (h=[1-3]),
; so performance is not terribly important here...
je .load_twolines
cmp yd, -1
je .load_oneline
; h == 1 case
mova m5, m3
mova m4, m3
mova m6, m3
jmp .loop
.load_oneline:
; h == 2 case
mova m4, [midq+384*2]
mova m5, m4
mova m6, m4
jmp .loop
.load_twolines:
; h == 3 case
mova m4, [midq+384*2]
mova m5, [midq+384*4]
mova m6, m5
jmp .loop
.load_threelines:
; h > 3 case
mova m4, [midq+384*2]
mova m5, [midq+384*4]
; third line loaded in main loop below
; main y loop for vertical filter
.loop_load:
; load one line into m6. if that pixel is no longer available, do
; nothing, since m6 still has the data from the previous line in it. We
; try to structure the loop so that the common case is evaluated fastest
mova m6, [mptrq+384*6]
.loop:
paddw m7, m0, m6
paddw m8, m1, m5
paddw m9, m2, m4
punpcklwd m10, m7, m8
punpckhwd m7, m8
punpcklwd m11, m9, m3
punpckhwd m9, m3
pmaddwd m10, m15
pmaddwd m7, m15
pmaddwd m11, m14
pmaddwd m9, m14
paddd m10, m11
paddd m7, m9
paddd m10, m12
paddd m7, m12
psrad m10, 11
psrad m7, 11
packssdw m10, m7
packuswb m10, m10
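; the pack instructions operate per 128-bit lane, so a cross-lane permute is
; needed to get the 16 output pixels in order in the low half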
vpermq m10, m10, q3120
mova [dstptrq], xm10
; shift pixels one position
mova m0, m1
mova m1, m2
mova m2, m3
mova m3, m4
mova m4, m5
mova m5, m6
add dstptrq, strideq
add mptrq, 384*2
dec yd
jg .loop_load
; for the bottom pixels, continue using m6 (as extended edge)
cmp yd, ylimd
jg .loop
add dstq, 16
add midq, 32
sub wd, 16
jg .loop_x
RET
%endif ; ARCH_X86_64
/*
* Copyright © 2018, VideoLAN and dav1d authors
* Copyright © 2018, Two Orioles, LLC
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <assert.h>
#include "common/attributes.h"
#include "common/intops.h"
#include "src/cpu.h"
#include "src/looprestoration.h"
#if BITDEPTH == 8 && ARCH_X86_64
void dav1d_wiener_filter_h_avx2(int16_t *dst, const pixel (*left)[4],
const pixel *src, ptrdiff_t stride,
const int16_t fh[7], const intptr_t w,
int h, enum LrEdgeFlags edges);
void dav1d_wiener_filter_v_avx2(pixel *dst, ptrdiff_t stride,
const int16_t *mid, int w, int h,
const int16_t fv[7], enum LrEdgeFlags edges);
// Future potential optimizations:
// - special chroma versions which don't filter [0]/[6];
// - running filter_h_avx2 transposed (one col of 32 pixels per iteration, top
// to bottom) instead of scanline-ordered should be faster since then the
// if (have_left) and similar conditions run only once instead of per line;
// - filter_v_avx2 currently runs 16 pixels per iteration; it should be possible
// to run 32 (like filter_h_avx2), and then all vpermqs can go;
// - maybe split out the top/bottom filter_h_avx2 from the main body filter_h_avx2,
// since then the have_left condition can be inlined;
// - consider having the wrapper (wiener_filter_avx2) also in hand-written
// assembly, so the setup overhead is minimized.
static void wiener_filter_avx2(pixel *const dst, const ptrdiff_t dst_stride,
const pixel (*const left)[4],
const pixel *lpf, const ptrdiff_t lpf_stride,
const int w, const int h, const int16_t fh[7],
const int16_t fv[7], const enum LrEdgeFlags edges)
{
ALIGN_STK_32(int16_t, mid, 68 * 384,);
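// mid holds the horizontally filtered rows as int16_t with a stride of 384
// coefficients: 2 padding rows above, up to 64 body rows (h <= 64) and 2
// padding rows below, i.e. 68 rows in total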
// horizontal filter
dav1d_wiener_filter_h_avx2(&mid[2 * 384], left, dst, dst_stride,
fh, w, h, edges);
if (edges & LR_HAVE_TOP)
dav1d_wiener_filter_h_avx2(mid, NULL, lpf, lpf_stride,
fh, w, 2, edges);
if (edges & LR_HAVE_BOTTOM)
dav1d_wiener_filter_h_avx2(&mid[(2 + h) * 384], NULL,
lpf + 6 * PXSTRIDE(lpf_stride), lpf_stride,
fh, w, 2, edges);
dav1d_wiener_filter_v_avx2(dst, dst_stride, &mid[2*384], w, h, fv, edges);
}
#endif
void bitfn(dav1d_loop_restoration_dsp_init_x86)(Dav1dLoopRestorationDSPContext *const c) {
const unsigned flags = dav1d_get_cpu_flags();
if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
#if BITDEPTH == 8 && ARCH_X86_64
c->wiener = wiener_filter_avx2;
#endif
}
@@ -67,6 +67,8 @@ static const struct {
{ "itx_10bpc", checkasm_check_itx_10bpc },
{ "loopfilter_8bpc", checkasm_check_loopfilter_8bpc },
{ "loopfilter_10bpc", checkasm_check_loopfilter_10bpc },
{ "looprestoration_8bpc", checkasm_check_looprestoration_8bpc },
{ "looprestoration_10bpc", checkasm_check_looprestoration_10bpc },
{ "mc_8bpc", checkasm_check_mc_8bpc },
{ "mc_10bpc", checkasm_check_mc_10bpc },
{ 0 }
......
@@ -45,6 +45,9 @@ void checkasm_check_itx_10bpc(void);
void checkasm_check_loopfilter_8bpc(void);
void checkasm_check_loopfilter_10bpc(void);
void checkasm_check_looprestoration_8bpc(void);
void checkasm_check_looprestoration_10bpc(void);
void checkasm_check_mc_8bpc(void);
void checkasm_check_mc_10bpc(void);
......
/*
* Copyright © 2018, VideoLAN and dav1d authors
* Copyright © 2018, Two Orioles, LLC
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "tests/checkasm/checkasm.h"
#include <string.h>
#include "src/levels.h"
#include "src/looprestoration.h"
static void init_tmp(pixel *buf, const ptrdiff_t stride,
const int w, const int h)
{
for (int y = 0; y < h; y++) {
for (int x = 0; x < w; x++)
buf[x] = rand() & ((1 << BITDEPTH) - 1);
buf += PXSTRIDE(stride);
}
}
static int cmp2d(const pixel *a, const pixel *b, const ptrdiff_t stride,
const int w, const int h)
{
for (int y = 0; y < h; y++) {
for (int x = 0; x < w; x++)
if (a[x] != b[x]) return (y << 16) | x;
a += PXSTRIDE(stride);
b += PXSTRIDE(stride);
}
return -1;
}
static void check_wiener(Dav1dLoopRestorationDSPContext *const c) {
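// 448-pixel-wide buffers: a 32-pixel margin on either side of the up-to-384-pixel
// wide filter area (the calls below pass dst + 32 with a stride of 448)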
ALIGN_STK_32(pixel, c_dst, 448 * 64,);
ALIGN_STK_32(pixel, a_dst, 448 * 64,);
ALIGN_STK_32(pixel, h_edge, 448 * 8,);
pixel left[64][4];
declare_func(void, pixel *dst, ptrdiff_t dst_stride,
const pixel (*const left)[4],
const pixel *lpf, ptrdiff_t lpf_stride,
int w, int h, const int16_t filterh[7],
const int16_t filterv[7], enum LrEdgeFlags edges);
init_tmp(c_dst, 448 * sizeof(pixel), 448, 64);
init_tmp(h_edge, 448 * sizeof(pixel), 448, 8);
init_tmp(left, 4 * sizeof(pixel), 4, 64);
for (int pl = 0; pl < 2; pl++) {
if (check_func(c->wiener, "wiener_%s_%dbpc",
pl ? "chroma" : "luma", BITDEPTH))
{
int16_t filter[2][3], filter_v[7], filter_h[7];
filter[0][0] = pl ? 0 : (rand() & 15) - 5;
filter[0][1] = (rand() & 31) - 23;
filter[0][2] = (rand() & 63) - 17;
filter[1][0] = pl ? 0 : (rand() & 15) - 5;
filter[1][1] = (rand() & 31) - 23;
filter[1][2] = (rand() & 63) - 17;
filter_h[0] = filter_h[6] = filter[0][0];
filter_h[1] = filter_h[5] = filter[0][1];
filter_h[2] = filter_h[4] = filter[0][2];
filter_h[3] = -((filter_h[0] + filter_h[1] + filter_h[2]) * 2);
filter_v[0] = filter_v[6] = filter[1][0];
filter_v[1] = filter_v[5] = filter[1][1];
filter_v[2] = filter_v[4] = filter[1][2];
filter_v[3] = -((filter_v[0] + filter_v[1] + filter_v[2]) * 2);
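// the taps are symmetric and sum to zero; the implementations add the
// 128 * centre-pixel DC contribution separately (cf. pw_128 / pw_0_128 in the
// asm), giving unit DC gain at 7-bit filter precision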
const int base_w = 1 + (rand() % 384);
const int base_h = 1 + (rand() & 63);
for (enum LrEdgeFlags edges = 0; edges <= 0xf; edges++) {
const int w = edges & LR_HAVE_RIGHT ? 256 : base_w;
const int h = edges & LR_HAVE_BOTTOM ? 64 : base_h;
memcpy(a_dst, c_dst, sizeof(c_dst));
call_ref(c_dst + 32, 448 * sizeof(pixel), left,
h_edge + 32, 448 * sizeof(pixel),
w, h, filter_h, filter_v, edges);
call_new(a_dst + 32, 448 * sizeof(pixel), left,
h_edge + 32, 448 * sizeof(pixel),
w, h, filter_h, filter_v, edges);
const int res = cmp2d(c_dst + 32, a_dst + 32, 448 * sizeof(pixel), w, h);
if (res != -1) fail();
}
bench_new(a_dst + 32, 448 * sizeof(pixel), left,
h_edge + 32, 448 * sizeof(pixel),
256, 64, filter_h, filter_v, 0xf);
}
}
report("wiener");
}
void bitfn(checkasm_check_looprestoration)(void) {
Dav1dLoopRestorationDSPContext c;
bitfn(dav1d_loop_restoration_dsp_init)(&c);
check_wiener(&c);
}
@@ -38,6 +38,7 @@ if is_asm_enabled
'checkasm/ipred.c',
'checkasm/itx.c',
'checkasm/loopfilter.c',
'checkasm/looprestoration.c',
'checkasm/mc.c',
)
......