Commit a3c1c676 authored by Nathan Egge

Add bpc suffix to lr functions

parent baa92371
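For context, the rename follows dav1d's bitdepth-suffix convention: the C init template refers to each asm symbol through a BF() macro that pastes the configured bit depth and the SIMD extension onto a base name, so the same template can resolve to either an 8bpc or a 16bpc symbol. A minimal, self-contained sketch of how such a macro could expand (the macro body and the stub function below are illustrative assumptions, not the exact dav1d definitions):

#include <stdio.h>

/* Illustrative stand-in for a bitdepth/extension suffix macro:
 * BF(dav1d_wiener_filter7, avx2) -> dav1d_wiener_filter7_8bpc_avx2
 * when BITDEPTH is 8 (or unset), matching the renamed asm label
 * wiener_filter7_8bpc once cglobal adds the dav1d_ private prefix
 * and the _avx2 instruction-set suffix. */
#if BITDEPTH == 16
#define BF(name, ext) name##_16bpc_##ext
#else
#define BF(name, ext) name##_8bpc_##ext
#endif

/* Dummy C function standing in for the real assembly entry point. */
static void dav1d_wiener_filter7_8bpc_avx2(void) {
    puts("dav1d_wiener_filter7_8bpc_avx2 called");
}

int main(void) {
    BF(dav1d_wiener_filter7, avx2)();  /* resolves to the 8bpc symbol */
    return 0;
}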
@@ -88,8 +88,8 @@ SECTION .text
DECLARE_REG_TMP 4, 9, 7, 11, 12, 13, 14 ; ring buffer pointers
INIT_YMM avx2
cglobal wiener_filter7, 5, 15, 16, -384*12-16, dst, dst_stride, left, lpf, \
lpf_stride, w, edge, flt, h
cglobal wiener_filter7_8bpc, 5, 15, 16, -384*12-16, dst, dst_stride, left, lpf, \
lpf_stride, w, edge, flt, h
mov fltq, fltmp
mov edged, r8m
mov wd, wm
@@ -436,8 +436,8 @@ ALIGN function_align
add dstq, dst_strideq
ret
cglobal wiener_filter5, 5, 13, 16, 384*8+16, dst, dst_stride, left, lpf, \
lpf_stride, w, edge, flt, h
cglobal wiener_filter5_8bpc, 5, 13, 16, 384*8+16, dst, dst_stride, left, lpf, \
lpf_stride, w, edge, flt, h
mov fltq, fltmp
mov edged, r8m
mov wd, wm
@@ -554,7 +554,7 @@ cglobal wiener_filter5, 5, 13, 16, 384*8+16, dst, dst_stride, left, lpf, \
jnz .h_have_right
cmp r10d, -33
jl .h_have_right
call mangle(private_prefix %+ _wiener_filter7_avx2).extend_right
call mangle(private_prefix %+ _wiener_filter7_8bpc_avx2).extend_right
.h_have_right:
pshufb m0, m4, m6
pmaddubsw m0, m12
@@ -613,7 +613,7 @@ ALIGN function_align
jnz .hv_have_right
cmp r10d, -33
jl .hv_have_right
call mangle(private_prefix %+ _wiener_filter7_avx2).extend_right
call mangle(private_prefix %+ _wiener_filter7_8bpc_avx2).extend_right
.hv_have_right:
pshufb m0, m4, m6
pmaddubsw m0, m12
@@ -727,8 +727,8 @@ ALIGN function_align
jl .v_loop
ret
cglobal sgr_filter_5x5, 5, 13, 16, 400*24+16, dst, dst_stride, left, lpf, \
lpf_stride, w, edge, params, h
cglobal sgr_filter_5x5_8bpc, 5, 13, 16, 400*24+16, dst, dst_stride, left, lpf, \
lpf_stride, w, edge, params, h
%define base r12-sgr_x_by_x-256*4
lea r12, [sgr_x_by_x+256*4]
mov paramsq, paramsmp
@@ -1187,8 +1187,8 @@ ALIGN function_align
add dstq, dst_strideq
ret
cglobal sgr_filter_3x3, 5, 15, 15, -400*28-16, dst, dst_stride, left, lpf, \
lpf_stride, w, edge, params, h
cglobal sgr_filter_3x3_8bpc, 5, 15, 15, -400*28-16, dst, dst_stride, left, lpf, \
lpf_stride, w, edge, params, h
%define base r14-sgr_x_by_x-256*4
mov paramsq, paramsmp
mov edged, r8m
@@ -1298,7 +1298,7 @@ cglobal sgr_filter_3x3, 5, 15, 15, -400*28-16, dst, dst_stride, left, lpf, \
jnz .h_have_right
cmp r10d, -17
jl .h_have_right
call mangle(private_prefix %+ _sgr_filter_5x5_avx2).extend_right
call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).extend_right
.h_have_right:
pshufb m0, m5, m8
pmullw m2, m0, m0
@@ -1346,7 +1346,7 @@ ALIGN function_align
jnz .hv_have_right
cmp r10d, -17
jl .hv_have_right
call mangle(private_prefix %+ _sgr_filter_5x5_avx2).extend_right
call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).extend_right
.hv_have_right:
pshufb m0, m5, m8
pmullw m3, m0, m0
@@ -1546,8 +1546,8 @@ ALIGN function_align
add dstq, dst_strideq
ret
cglobal sgr_filter_mix, 5, 13, 16, 400*56+8, dst, dst_stride, left, lpf, \
lpf_stride, w, edge, params, h
cglobal sgr_filter_mix_8bpc, 5, 13, 16, 400*56+8, dst, dst_stride, left, lpf, \
lpf_stride, w, edge, params, h
%define base r12-sgr_x_by_x-256*4
lea r12, [sgr_x_by_x+256*4]
mov paramsq, paramsmp
@@ -1573,7 +1573,7 @@ cglobal sgr_filter_mix, 5, 13, 16, 400*56+8, dst, dst_stride, left, lpf, \
call .h_top
add lpfq, lpf_strideq
mov t2, t1
call mangle(private_prefix %+ _sgr_filter_5x5_avx2).top_fixup
call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).top_fixup
add t1, 400*12
call .h_top
lea r10, [lpfq+lpf_strideq*4]
@@ -1681,7 +1681,7 @@ cglobal sgr_filter_mix, 5, 13, 16, 400*56+8, dst, dst_stride, left, lpf, \
jnz .h_have_right
cmp r10d, -18
jl .h_have_right
call mangle(private_prefix %+ _sgr_filter_5x5_avx2).extend_right
call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).extend_right
.h_have_right:
pshufb m6, m5, m9
pshufb m4, m5, m10
@@ -1742,7 +1742,7 @@ ALIGN function_align
jnz .hv0_have_right
cmp r10d, -18
jl .hv0_have_right
call mangle(private_prefix %+ _sgr_filter_5x5_avx2).extend_right
call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).extend_right
.hv0_have_right:
pshufb m6, m5, m9
pshufb m4, m5, m10
@@ -1853,7 +1853,7 @@ ALIGN function_align
jnz .hv1_have_right
cmp r10d, -18
jl .hv1_have_right
call mangle(private_prefix %+ _sgr_filter_5x5_avx2).extend_right
call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).extend_right
.hv1_have_right:
pshufb m6, m5, m9
pshufb m3, m5, m10
......
@@ -30,179 +30,171 @@
#include "common/intops.h"
#define WIENER_FILTER(ext) \
void dav1d_wiener_filter7_##ext(pixel *dst, ptrdiff_t dst_stride, \
const pixel (*left)[4], const pixel *lpf, \
ptrdiff_t lpf_stride, int w, int h, \
const LooprestorationParams *params, \
enum LrEdgeFlags edges); \
void dav1d_wiener_filter5_##ext(pixel *dst, ptrdiff_t dst_stride, \
const pixel (*left)[4], const pixel *lpf, \
ptrdiff_t lpf_stride, int w, int h, \
const LooprestorationParams *params, \
enum LrEdgeFlags edges);
#define decl_wiener_filter_fns(ext) \
decl_lr_filter_fn(BF(dav1d_wiener_filter7, ext)); \
decl_lr_filter_fn(BF(dav1d_wiener_filter5, ext))
#define SGR_FILTER(ext) \
void dav1d_sgr_filter_5x5_##ext(pixel *dst, ptrdiff_t dst_stride, \
const pixel (*left)[4], const pixel *lpf, \
ptrdiff_t lpf_stride, int w, int h, \
const LooprestorationParams *params, \
enum LrEdgeFlags edges); \
void dav1d_sgr_filter_3x3_##ext(pixel *dst, ptrdiff_t dst_stride, \
const pixel (*left)[4], const pixel *lpf, \
ptrdiff_t lpf_stride, int w, int h, \
const LooprestorationParams *params, \
enum LrEdgeFlags edges); \
void dav1d_sgr_filter_mix_##ext(pixel *dst, ptrdiff_t dst_stride, \
const pixel (*left)[4], const pixel *lpf, \
ptrdiff_t lpf_stride, int w, int h, \
const LooprestorationParams *params, \
enum LrEdgeFlags edges);
#define decl_sgr_filter_fns(ext) \
void BF(dav1d_sgr_filter_5x5, ext)(pixel *dst, ptrdiff_t dst_stride, \
const pixel (*left)[4], const pixel *lpf, \
ptrdiff_t lpf_stride, int w, int h, \
const LooprestorationParams *params, \
enum LrEdgeFlags edges); \
void BF(dav1d_sgr_filter_3x3, ext)(pixel *dst, ptrdiff_t dst_stride, \
const pixel (*left)[4], const pixel *lpf, \
ptrdiff_t lpf_stride, int w, int h, \
const LooprestorationParams *params, \
enum LrEdgeFlags edges); \
void BF(dav1d_sgr_filter_mix, ext)(pixel *dst, ptrdiff_t dst_stride, \
const pixel (*left)[4], const pixel *lpf, \
ptrdiff_t lpf_stride, int w, int h, \
const LooprestorationParams *params, \
enum LrEdgeFlags edges);
/* FIXME: Replace with a port of the AVX2 code */
#define SGR_FILTER_OLD(ext) \
void dav1d_sgr_box3_h_##ext(int32_t *sumsq, int16_t *sum, \
const pixel (*left)[4], \
const pixel *src, const ptrdiff_t stride, \
const int w, const int h, \
const enum LrEdgeFlags edges); \
void dav1d_sgr_box3_v_##ext(int32_t *sumsq, int16_t *sum, \
const int w, const int h, \
const enum LrEdgeFlags edges); \
void dav1d_sgr_calc_ab1_##ext(int32_t *a, int16_t *b, \
const int w, const int h, const unsigned s); \
void dav1d_sgr_finish_filter1_##ext(coef *tmp, \
const pixel *src, const ptrdiff_t stride, \
const int32_t *a, const int16_t *b, \
const int w, const int h); \
void BF(dav1d_sgr_box3_h, ext)(int32_t *sumsq, int16_t *sum, \
const pixel (*left)[4], \
const pixel *src, const ptrdiff_t stride, \
const int w, const int h, \
const enum LrEdgeFlags edges); \
void BF(dav1d_sgr_box3_v, ext)(int32_t *sumsq, int16_t *sum, \
const int w, const int h, \
const enum LrEdgeFlags edges); \
void BF(dav1d_sgr_calc_ab1, ext)(int32_t *a, int16_t *b, \
const int w, const int h, const unsigned s); \
void BF(dav1d_sgr_finish_filter1, ext)(coef *tmp, \
const pixel *src, const ptrdiff_t stride, \
const int32_t *a, const int16_t *b, \
const int w, const int h); \
\
/* filter with a 3x3 box (radius=1) */ \
static void dav1d_sgr_filter1_##ext(coef *tmp, \
const pixel *src, const ptrdiff_t stride, \
const pixel (*left)[4], \
const pixel *lpf, const ptrdiff_t lpf_stride, \
const int w, const int h, const int strength, \
const enum LrEdgeFlags edges) \
static void BF(dav1d_sgr_filter1, ext)(coef *tmp, \
const pixel *src, const ptrdiff_t stride, \
const pixel (*left)[4], \
const pixel *lpf, const ptrdiff_t lpf_stride, \
const int w, const int h, const int strength, \
const enum LrEdgeFlags edges) \
{ \
ALIGN_STK_32(int32_t, sumsq_mem, (384 + 16) * 68 + 8,); \
int32_t *const sumsq = &sumsq_mem[(384 + 16) * 2 + 8], *const a = sumsq; \
ALIGN_STK_32(int16_t, sum_mem, (384 + 16) * 68 + 16,); \
int16_t *const sum = &sum_mem[(384 + 16) * 2 + 16], *const b = sum; \
\
dav1d_sgr_box3_h_##ext(sumsq, sum, left, src, stride, w, h, edges); \
BF(dav1d_sgr_box3_h, ext)(sumsq, sum, left, src, stride, w, h, edges); \
if (edges & LR_HAVE_TOP) \
dav1d_sgr_box3_h_##ext(&sumsq[-2 * (384 + 16)], &sum[-2 * (384 + 16)], \
NULL, lpf, lpf_stride, w, 2, edges); \
BF(dav1d_sgr_box3_h, ext)(&sumsq[-2 * (384 + 16)], &sum[-2 * (384 + 16)], \
NULL, lpf, lpf_stride, w, 2, edges); \
\
if (edges & LR_HAVE_BOTTOM) \
dav1d_sgr_box3_h_##ext(&sumsq[h * (384 + 16)], &sum[h * (384 + 16)], \
NULL, lpf + 6 * PXSTRIDE(lpf_stride), \
lpf_stride, w, 2, edges); \
BF(dav1d_sgr_box3_h, ext)(&sumsq[h * (384 + 16)], &sum[h * (384 + 16)], \
NULL, lpf + 6 * PXSTRIDE(lpf_stride), \
lpf_stride, w, 2, edges); \
\
dav1d_sgr_box3_v_##ext(sumsq, sum, w, h, edges); \
dav1d_sgr_calc_ab1_##ext(a, b, w, h, strength); \
dav1d_sgr_finish_filter1_##ext(tmp, src, stride, a, b, w, h); \
BF(dav1d_sgr_box3_v, ext)(sumsq, sum, w, h, edges); \
BF(dav1d_sgr_calc_ab1, ext)(a, b, w, h, strength); \
BF(dav1d_sgr_finish_filter1, ext)(tmp, src, stride, a, b, w, h); \
} \
\
void dav1d_sgr_box5_h_##ext(int32_t *sumsq, int16_t *sum, \
const pixel (*left)[4], \
const pixel *src, const ptrdiff_t stride, \
const int w, const int h, \
const enum LrEdgeFlags edges); \
void dav1d_sgr_box5_v_##ext(int32_t *sumsq, int16_t *sum, \
const int w, const int h, \
const enum LrEdgeFlags edges); \
void dav1d_sgr_calc_ab2_##ext(int32_t *a, int16_t *b, \
const int w, const int h, const int strength); \
void dav1d_sgr_finish_filter2_##ext(coef *tmp, \
const pixel *src, const ptrdiff_t stride, \
const int32_t *a, const int16_t *b, \
const int w, const int h); \
void BF(dav1d_sgr_box5_h, ext)(int32_t *sumsq, int16_t *sum, \
const pixel (*left)[4], \
const pixel *src, const ptrdiff_t stride, \
const int w, const int h, \
const enum LrEdgeFlags edges); \
void BF(dav1d_sgr_box5_v, ext)(int32_t *sumsq, int16_t *sum, \
const int w, const int h, \
const enum LrEdgeFlags edges); \
void BF(dav1d_sgr_calc_ab2, ext)(int32_t *a, int16_t *b, \
const int w, const int h, const int strength); \
void BF(dav1d_sgr_finish_filter2, ext)(coef *tmp, \
const pixel *src, const ptrdiff_t stride, \
const int32_t *a, const int16_t *b, \
const int w, const int h); \
\
/* filter with a 5x5 box (radius=2) */ \
static void dav1d_sgr_filter2_##ext(coef *tmp, \
const pixel *src, const ptrdiff_t stride, \
const pixel (*left)[4], \
const pixel *lpf, const ptrdiff_t lpf_stride, \
const int w, const int h, const int strength, \
const enum LrEdgeFlags edges) \
static void BF(dav1d_sgr_filter2, ext)(coef *tmp, \
const pixel *src, const ptrdiff_t stride, \
const pixel (*left)[4], \
const pixel *lpf, const ptrdiff_t lpf_stride, \
const int w, const int h, const int strength, \
const enum LrEdgeFlags edges) \
{ \
ALIGN_STK_32(int32_t, sumsq_mem, (384 + 16) * 68 + 8,); \
int32_t *const sumsq = &sumsq_mem[(384 + 16) * 2 + 8], *const a = sumsq; \
ALIGN_STK_32(int16_t, sum_mem, (384 + 16) * 68 + 16,); \
int16_t *const sum = &sum_mem[(384 + 16) * 2 + 16], *const b = sum; \
\
dav1d_sgr_box5_h_##ext(sumsq, sum, left, src, stride, w, h, edges); \
BF(dav1d_sgr_box5_h, ext)(sumsq, sum, left, src, stride, w, h, edges); \
if (edges & LR_HAVE_TOP) \
dav1d_sgr_box5_h_##ext(&sumsq[-2 * (384 + 16)], &sum[-2 * (384 + 16)], \
NULL, lpf, lpf_stride, w, 2, edges); \
BF(dav1d_sgr_box5_h, ext)(&sumsq[-2 * (384 + 16)], &sum[-2 * (384 + 16)], \
NULL, lpf, lpf_stride, w, 2, edges); \
\
if (edges & LR_HAVE_BOTTOM) \
dav1d_sgr_box5_h_##ext(&sumsq[h * (384 + 16)], &sum[h * (384 + 16)], \
NULL, lpf + 6 * PXSTRIDE(lpf_stride), \
lpf_stride, w, 2, edges); \
BF(dav1d_sgr_box5_h, ext)(&sumsq[h * (384 + 16)], &sum[h * (384 + 16)], \
NULL, lpf + 6 * PXSTRIDE(lpf_stride), \
lpf_stride, w, 2, edges); \
\
dav1d_sgr_box5_v_##ext(sumsq, sum, w, h, edges); \
dav1d_sgr_calc_ab2_##ext(a, b, w, h, strength); \
dav1d_sgr_finish_filter2_##ext(tmp, src, stride, a, b, w, h); \
BF(dav1d_sgr_box5_v, ext)(sumsq, sum, w, h, edges); \
BF(dav1d_sgr_calc_ab2, ext)(a, b, w, h, strength); \
BF(dav1d_sgr_finish_filter2, ext)(tmp, src, stride, a, b, w, h); \
} \
\
void dav1d_sgr_weighted1_##ext(pixel *dst, const ptrdiff_t stride, \
const coef *t1, const int w, const int h, \
const int wt); \
void dav1d_sgr_weighted2_##ext(pixel *dst, const ptrdiff_t stride, \
const coef *t1, const coef *t2, \
const int w, const int h, \
const uint32_t wt); \
void BF(dav1d_sgr_weighted1, ext)(pixel *dst, const ptrdiff_t stride, \
const coef *t1, const int w, const int h, \
const int wt); \
void BF(dav1d_sgr_weighted2, ext)(pixel *dst, const ptrdiff_t stride, \
const coef *t1, const coef *t2, \
const int w, const int h, \
const uint32_t wt); \
\
static void sgr_filter_5x5_##ext(pixel *const dst, const ptrdiff_t dst_stride, \
const pixel (*const left)[4], \
const pixel *lpf, const ptrdiff_t lpf_stride, \
const int w, const int h, \
const LooprestorationParams *const params, \
const enum LrEdgeFlags edges) \
static void BF(sgr_filter_5x5, ext)(pixel *const dst, const ptrdiff_t dst_stride, \
const pixel (*const left)[4], \
const pixel *lpf, const ptrdiff_t lpf_stride, \
const int w, const int h, \
const LooprestorationParams *const params, \
const enum LrEdgeFlags edges) \
{ \
ALIGN_STK_32(coef, tmp, 64 * 384,); \
dav1d_sgr_filter2_##ext(tmp, dst, dst_stride, left, lpf, lpf_stride, \
w, h, params->sgr.s0, edges); \
dav1d_sgr_weighted1_##ext(dst, dst_stride, tmp, w, h, params->sgr.w0); \
BF(dav1d_sgr_filter2, ext)(tmp, dst, dst_stride, left, lpf, lpf_stride, \
w, h, params->sgr.s0, edges); \
BF(dav1d_sgr_weighted1, ext)(dst, dst_stride, tmp, w, h, params->sgr.w0); \
} \
static void sgr_filter_3x3_##ext(pixel *const dst, const ptrdiff_t dst_stride, \
const pixel (*const left)[4], \
const pixel *lpf, const ptrdiff_t lpf_stride, \
const int w, const int h, \
const LooprestorationParams *const params, \
const enum LrEdgeFlags edges) \
static void BF(sgr_filter_3x3, ext)(pixel *const dst, const ptrdiff_t dst_stride, \
const pixel (*const left)[4], \
const pixel *lpf, const ptrdiff_t lpf_stride, \
const int w, const int h, \
const LooprestorationParams *const params, \
const enum LrEdgeFlags edges) \
{ \
ALIGN_STK_32(coef, tmp, 64 * 384,); \
dav1d_sgr_filter1_##ext(tmp, dst, dst_stride, left, lpf, lpf_stride, \
w, h, params->sgr.s1, edges); \
dav1d_sgr_weighted1_##ext(dst, dst_stride, tmp, w, h, params->sgr.w1); \
BF(dav1d_sgr_filter1, ext)(tmp, dst, dst_stride, left, lpf, lpf_stride, \
w, h, params->sgr.s1, edges); \
BF(dav1d_sgr_weighted1, ext)(dst, dst_stride, tmp, w, h, params->sgr.w1); \
} \
static void sgr_filter_mix_##ext(pixel *const dst, const ptrdiff_t dst_stride, \
const pixel (*const left)[4], \
const pixel *lpf, const ptrdiff_t lpf_stride, \
const int w, const int h, \
const LooprestorationParams *const params, \
const enum LrEdgeFlags edges) \
static void BF(sgr_filter_mix, ext)(pixel *const dst, const ptrdiff_t dst_stride, \
const pixel (*const left)[4], \
const pixel *lpf, const ptrdiff_t lpf_stride, \
const int w, const int h, \
const LooprestorationParams *const params, \
const enum LrEdgeFlags edges) \
{ \
ALIGN_STK_32(coef, tmp1, 64 * 384,); \
ALIGN_STK_32(coef, tmp2, 64 * 384,); \
dav1d_sgr_filter2_##ext(tmp1, dst, dst_stride, left, lpf, lpf_stride, \
w, h, params->sgr.s0, edges); \
dav1d_sgr_filter1_##ext(tmp2, dst, dst_stride, left, lpf, lpf_stride, \
w, h, params->sgr.s1, edges); \
BF(dav1d_sgr_filter2, ext)(tmp1, dst, dst_stride, left, lpf, lpf_stride, \
w, h, params->sgr.s0, edges); \
BF(dav1d_sgr_filter1, ext)(tmp2, dst, dst_stride, left, lpf, lpf_stride, \
w, h, params->sgr.s1, edges); \
const uint32_t wt = (params->sgr.w1 << 16) | (uint16_t) params->sgr.w0; \
dav1d_sgr_weighted2_##ext(dst, dst_stride, tmp1, tmp2, w, h, wt); \
BF(dav1d_sgr_weighted2, ext)(dst, dst_stride, tmp1, tmp2, w, h, wt); \
}
#if BITDEPTH == 8
WIENER_FILTER(sse2)
WIENER_FILTER(ssse3)
decl_wiener_filter_fns(sse2);
decl_wiener_filter_fns(ssse3);
SGR_FILTER_OLD(ssse3)
# if ARCH_X86_64
WIENER_FILTER(avx2)
SGR_FILTER(avx2)
decl_wiener_filter_fns(avx2);
decl_sgr_filter_fns(avx2)
# endif
#endif
@@ -211,25 +203,25 @@ COLD void bitfn(dav1d_loop_restoration_dsp_init_x86)(Dav1dLoopRestorationDSPCont
if (!(flags & DAV1D_X86_CPU_FLAG_SSE2)) return;
#if BITDEPTH == 8
c->wiener[0] = dav1d_wiener_filter7_sse2;
c->wiener[1] = dav1d_wiener_filter5_sse2;
c->wiener[0] = BF(dav1d_wiener_filter7, sse2);
c->wiener[1] = BF(dav1d_wiener_filter5, sse2);
#endif
if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;
#if BITDEPTH == 8
c->wiener[0] = dav1d_wiener_filter7_ssse3;
c->wiener[1] = dav1d_wiener_filter5_ssse3;
c->sgr[0] = sgr_filter_5x5_ssse3;
c->sgr[1] = sgr_filter_3x3_ssse3;
c->sgr[2] = sgr_filter_mix_ssse3;
c->wiener[0] = BF(dav1d_wiener_filter7, ssse3);
c->wiener[1] = BF(dav1d_wiener_filter5, ssse3);
c->sgr[0] = BF(sgr_filter_5x5, ssse3);
c->sgr[1] = BF(sgr_filter_3x3, ssse3);
c->sgr[2] = BF(sgr_filter_mix, ssse3);
#endif
if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
#if BITDEPTH == 8 && ARCH_X86_64
c->wiener[0] = dav1d_wiener_filter7_avx2;
c->wiener[1] = dav1d_wiener_filter5_avx2;
c->sgr[0] = dav1d_sgr_filter_5x5_avx2;
c->sgr[1] = dav1d_sgr_filter_3x3_avx2;
c->sgr[2] = dav1d_sgr_filter_mix_avx2;
c->wiener[0] = BF(dav1d_wiener_filter7, avx2);
c->wiener[1] = BF(dav1d_wiener_filter5, avx2);
c->sgr[0] = BF(dav1d_sgr_filter_5x5, avx2);
c->sgr[1] = BF(dav1d_sgr_filter_3x3, avx2);
c->sgr[2] = BF(dav1d_sgr_filter_mix, avx2);
#endif
}
@@ -97,8 +97,8 @@ SECTION .text
%macro WIENER 0
%if ARCH_X86_64
DECLARE_REG_TMP 4, 10, 7, 11, 12, 13, 14 ; ring buffer pointers
cglobal wiener_filter7, 5, 15, 16, -384*12-16, dst, dst_stride, left, lpf, \
lpf_stride, w, edge, flt, h, x
cglobal wiener_filter7_8bpc, 5, 15, 16, -384*12-16, dst, dst_stride, left, lpf, \
lpf_stride, w, edge, flt, h, x
%define base 0
mov fltq, fltmp
mov edged, r8m
@@ -139,7 +139,7 @@ DECLARE_REG_TMP 4, 0, _, 5
%define m11 [stk+96]
%define stk_off 112
%endif
cglobal wiener_filter7, 0, 7, 8, -384*12-stk_off, _, x, left, lpf, lpf_stride
cglobal wiener_filter7_8bpc, 0, 7, 8, -384*12-stk_off, _, x, left, lpf, lpf_stride
%define base r6-pb_right_ext_mask-21
%define stk esp
%define dstq leftq
@@ -245,7 +245,7 @@ cglobal wiener_filter7, 0, 7, 8, -384*12-stk_off, _, x, left, lpf, lpf_stride
add lpfq, [rsp+gprsize*1]
call .hv_bottom
.v1:
call mangle(private_prefix %+ _wiener_filter7_ssse3).v
call mangle(private_prefix %+ _wiener_filter7_8bpc_ssse3).v
RET
.no_top:
lea t3, [lpfq+lpf_strideq*4]
@@ -281,9 +281,9 @@ cglobal wiener_filter7, 0, 7, 8, -384*12-stk_off, _, x, left, lpf, lpf_stride
dec hd
jnz .main
.v3:
call mangle(private_prefix %+ _wiener_filter7_ssse3).v
call mangle(private_prefix %+ _wiener_filter7_8bpc_ssse3).v
.v2:
call mangle(private_prefix %+ _wiener_filter7_ssse3).v
call mangle(private_prefix %+ _wiener_filter7_8bpc_ssse3).v
jmp .v1
.extend_right:
movd m2, [lpfq-4]
@@ -685,8 +685,8 @@ ALIGN function_align
%endif
%if ARCH_X86_64
cglobal wiener_filter5, 5, 13, 16, 384*8+16, dst, dst_stride, left, lpf, \
lpf_stride, w, edge, flt, h, x
cglobal wiener_filter5_8bpc, 5, 13, 16, 384*8+16, dst, dst_stride, left, lpf, \
lpf_stride, w, edge, flt, h, x
mov fltq, fltmp
mov edged, r8m
mov wd, wm
@@ -720,7 +720,7 @@ cglobal wiener_filter5, 5, 13, 16, 384*8+16, dst, dst_stride, left, lpf, \
%define m11 [stk+80]
%define stk_off 96
%endif
cglobal wiener_filter5, 0, 7, 8, -384*8-stk_off, _, x, left, lpf, lpf_stride
cglobal wiener_filter5_8bpc, 0, 7, 8, -384*8-stk_off, _, x, left, lpf, lpf_stride
%define stk esp
%define leftmp [stk+28]
%define m8 [base+pw_m16380]
@@ -827,14 +827,14 @@ cglobal wiener_filter5, 0, 7, 8, -384*8-stk_off, _, x, left, lpf, lpf_stride
dec hd
jnz .main
.v2:
call mangle(private_prefix %+ _wiener_filter5_ssse3).v
call mangle(private_prefix %+ _wiener_filter5_8bpc_ssse3).v
add dstq, dst_strideq
mov t4, t3
mov t3, t2
mov t2, t1
movifnidn dstmp, dstq
.v1:
call mangle(private_prefix %+ _wiener_filter5_ssse3).v
call mangle(private_prefix %+ _wiener_filter5_8bpc_ssse3).v
jmp .end
.h:
%define stk esp+4
@@ -873,7 +873,7 @@ cglobal wiener_filter5, 0, 7, 8, -384*8-stk_off, _, x, left, lpf, lpf_stride
jnz .h_have_right
cmp xd, -17
jl .h_have_right
call mangle(private_prefix %+ _wiener_filter7 %+ SUFFIX).extend_right
call mangle(private_prefix %+ _wiener_filter7_8bpc %+ SUFFIX).extend_right
.h_have_right:
%macro %%h5 0
%if cpuflag(ssse3)
@@ -991,7 +991,7 @@ ALIGN function_align
jnz .hv_have_right
cmp xd, -17
jl .hv_have_right
call mangle(private_prefix %+ _wiener_filter7 %+ SUFFIX).extend_right
call mangle(private_prefix %+ _wiener_filter7_8bpc %+ SUFFIX).extend_right
.hv_have_right:
%%h5
mova m2, [t3+xq*2]
@@ -1161,7 +1161,7 @@ WIENER
%endmacro
%if ARCH_X86_64
cglobal sgr_box3_h, 5, 11, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim
cglobal sgr_box3_h_8bpc, 5, 11, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim
mov xlimd, edgem
movifnidn xd, xm
mov hd, hm
@@ -1170,7 +1170,7 @@ cglobal sgr_box3_h, 6, 7, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim
add xd, xlimd
xor xlimd, 2 ; 2*!have_right
%else
cglobal sgr_box3_h, 6, 7, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim
cglobal sgr_box3_h_8bpc, 6, 7, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim
%define wq r0m
%define xlimd r1m
%define hd hmp
@@ -1287,10 +1287,10 @@ cglobal sgr_box3_h, 6, 7, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim
RET
%if ARCH_X86_64
cglobal sgr_box3_v, 4, 10, 9, sumsq, sum, w, h, edge, x, y, sumsq_base, sum_base, ylim
cglobal sgr_box3_v_8bpc, 4, 10, 9, sumsq, sum, w, h, edge, x, y, sumsq_base, sum_base, ylim
movifnidn edged, edgem
%else
cglobal sgr_box3_v, 3, 7, 8, -28, sumsq, sum, w, edge, h, x, y
cglobal sgr_box3_v_8bpc, 3, 7, 8, -28, sumsq, sum, w, edge, h, x, y
%define sumsq_baseq dword [esp+0]
%define sum_baseq dword [esp+4]
%define ylimd dword [esp+8]
@@ -1383,7 +1383,7 @@ cglobal sgr_box3_v, 3, 7, 8, -28, sumsq, sum, w, edge, h, x, y
jl .loop_x
RET
cglobal sgr_calc_ab1, 4, 7, 12, a, b, w, h, s
cglobal sgr_calc_ab1_8bpc, 4, 7, 12, a, b, w, h, s
movifnidn sd, sm
sub aq, (384+16-1)*4
sub bq, (384+16-1)*2
@@ -1463,8 +1463,8 @@ cglobal sgr_calc_ab1, 4, 7, 12, a, b, w, h, s
RET
%if ARCH_X86_64
cglobal sgr_finish_filter1, 5, 13, 16, t, src, stride, a, b, w, h, \
tmp_base, src_base, a_base, b_base, x, y
cglobal sgr_finish_filter1_8bpc, 5, 13, 16, t, src, stride, a, b, w, h, \
tmp_base, src_base, a_base, b_base, x, y
movifnidn wd, wm
mov hd, hm
mova m15, [pw_16]
@@ -1474,7 +1474,7 @@ cglobal sgr_finish_filter1, 5, 13, 16, t, src, stride, a, b, w, h, \
mov b_baseq, bq
xor xd, xd
%else
cglobal sgr_finish_filter1, 7, 7, 8, -144, t, src, stride, a, b, x, y