Commit ec95ea52, authored and committed by Nathan Egge
Browse files

Add bpc suffix to cdef functions

parent 1d6aae47
......@@ -39,7 +39,7 @@
%endmacro
%macro CDEF_FILTER_JMP_TABLE 1
JMP_TABLE cdef_filter_%1, \
JMP_TABLE cdef_filter_%1_8bpc, \
d6k0, d6k1, d7k0, d7k1, \
d0k0, d0k1, d1k0, d1k1, d2k0, d2k1, d3k0, d3k1, \
d4k0, d4k1, d5k0, d5k1, d6k0, d6k1, d7k0, d7k1, \
......@@ -94,7 +94,7 @@ SECTION .text
%macro PREP_REGS 2 ; w, h
; off1/2/3[k] [6 total] from [tapq+12+(dir+0/2/6)*2+k]
mov dird, r6m
lea tableq, [cdef_filter_%1x%2_jmptable]
lea tableq, [cdef_filter_%1x%2_8bpc_jmptable]
lea dirq, [tableq+dirq*2*4]
%if %1 == 4
%if %2 == 4
......@@ -397,7 +397,7 @@ SECTION .text
%macro CDEF_FILTER 2 ; w, h
INIT_YMM avx2
cglobal cdef_filter_%1x%2, 4, 9, 0, dst, stride, left, top, \
cglobal cdef_filter_%1x%2_8bpc, 4, 9, 0, dst, stride, left, top, \
pri, sec, dir, damping, edge
%assign stack_offset_entry stack_offset
mov edged, edgem
......@@ -1592,7 +1592,7 @@ CDEF_FILTER 4, 8
CDEF_FILTER 4, 4
INIT_YMM avx2
cglobal cdef_dir, 3, 4, 15, src, stride, var, stride3
cglobal cdef_dir_8bpc, 3, 4, 15, src, stride, var, stride3
lea stride3q, [strideq*3]
movq xm0, [srcq+strideq*0]
movq xm1, [srcq+strideq*1]
......
......@@ -109,7 +109,8 @@ DECLARE_REG_TMP 8, 5
; 5e 5f 50 51 52 53 54 55
INIT_ZMM avx512icl
cglobal cdef_filter_4x4, 4, 8, 13, dst, stride, left, top, pri, sec, dir, damping, edge
cglobal cdef_filter_4x4_8bpc, 4, 8, 13, dst, stride, left, top, \
pri, sec, dir, damping, edge
%define base r7-edge_mask
movq xmm0, [dstq+strideq*0]
movhps xmm0, [dstq+strideq*1]
......@@ -269,8 +270,8 @@ DECLARE_REG_TMP 2, 7
; L8 L9 40 41 42 43 44 45 8e 8f 80 81 82 83 84 85
; La Lb 50 51 52 53 54 55 9e 9f 90 91 92 93 94 95
cglobal cdef_filter_4x8, 4, 9, 22, dst, stride, left, top, \
pri, sec, dir, damping, edge
cglobal cdef_filter_4x8_8bpc, 4, 9, 22, dst, stride, left, top, \
pri, sec, dir, damping, edge
%define base r8-edge_mask
vpbroadcastd ym21, strided
mov r6d, edgem
......@@ -504,8 +505,8 @@ ALIGN function_align
; 8e 8f 80 81 82 83 84 85 84 85 86 87 88 89 8a 8b
; 9e 9f 90 91 92 93 94 95 94 95 96 97 98 99 9a 9b
cglobal cdef_filter_8x8, 4, 11, 32, 4*64, dst, stride, left, top, \
pri, sec, dir, damping, edge
cglobal cdef_filter_8x8_8bpc, 4, 11, 32, 4*64, dst, stride, left, top, \
pri, sec, dir, damping, edge
%define base r8-edge_mask
mov r6d, edgem
lea r10, [dstq+strideq*4-2]
......
......@@ -28,20 +28,22 @@
#include "src/cpu.h"
#include "src/cdef.h"
/* Declares the cdef filter prototype for one block size `sz` (e.g. 4x4) once
 * per x86 instruction-set suffix: avx512icl, avx2, sse4, ssse3 and sse2.
 * The symbol names are built by token pasting and carry no bit-depth suffix.
 * decl_cdef_fn itself is not defined in this excerpt. */
#define decl_cdef_size_fn(sz) \
decl_cdef_fn(dav1d_cdef_filter_##sz##_avx512icl); \
decl_cdef_fn(dav1d_cdef_filter_##sz##_avx2); \
decl_cdef_fn(dav1d_cdef_filter_##sz##_sse4); \
decl_cdef_fn(dav1d_cdef_filter_##sz##_ssse3); \
decl_cdef_fn(dav1d_cdef_filter_##sz##_sse2)
/* Declares the 4x4, 4x8 and 8x8 cdef filter prototypes for a single
 * instruction-set suffix `ext` (e.g. avx2).
 * Each name is produced by BF(), which is defined outside this excerpt;
 * presumably it inserts the bit-depth suffix (such as _8bpc) ahead of `ext`
 * so the names match the renamed assembly symbols -- TODO confirm. */
#define decl_cdef_fns(ext) \
decl_cdef_fn(BF(dav1d_cdef_filter_4x4, ext)); \
decl_cdef_fn(BF(dav1d_cdef_filter_4x8, ext)); \
decl_cdef_fn(BF(dav1d_cdef_filter_8x8, ext))
decl_cdef_size_fn(4x4);
decl_cdef_size_fn(4x8);
decl_cdef_size_fn(8x8);
decl_cdef_dir_fn(dav1d_cdef_dir_avx2);
decl_cdef_dir_fn(dav1d_cdef_dir_sse4);
decl_cdef_dir_fn(dav1d_cdef_dir_ssse3);
#if BITDEPTH == 8
decl_cdef_fns(avx512icl);
decl_cdef_fns(avx2);
decl_cdef_fns(sse4);
decl_cdef_fns(ssse3);
decl_cdef_fns(sse2);
decl_cdef_dir_fn(BF(dav1d_cdef_dir, avx2));
decl_cdef_dir_fn(BF(dav1d_cdef_dir, sse4));
decl_cdef_dir_fn(BF(dav1d_cdef_dir, ssse3));
#endif
COLD void bitfn(dav1d_cdef_dsp_init_x86)(Dav1dCdefDSPContext *const c) {
const unsigned flags = dav1d_get_cpu_flags();
......@@ -49,45 +51,45 @@ COLD void bitfn(dav1d_cdef_dsp_init_x86)(Dav1dCdefDSPContext *const c) {
if (!(flags & DAV1D_X86_CPU_FLAG_SSE2)) return;
#if BITDEPTH == 8
c->fb[0] = dav1d_cdef_filter_8x8_sse2;
c->fb[1] = dav1d_cdef_filter_4x8_sse2;
c->fb[2] = dav1d_cdef_filter_4x4_sse2;
c->fb[0] = BF(dav1d_cdef_filter_8x8, sse2);
c->fb[1] = BF(dav1d_cdef_filter_4x8, sse2);
c->fb[2] = BF(dav1d_cdef_filter_4x4, sse2);
#endif
if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;
#if BITDEPTH == 8
c->dir = dav1d_cdef_dir_ssse3;
c->fb[0] = dav1d_cdef_filter_8x8_ssse3;
c->fb[1] = dav1d_cdef_filter_4x8_ssse3;
c->fb[2] = dav1d_cdef_filter_4x4_ssse3;
c->dir = BF(dav1d_cdef_dir, ssse3);
c->fb[0] = BF(dav1d_cdef_filter_8x8, ssse3);
c->fb[1] = BF(dav1d_cdef_filter_4x8, ssse3);
c->fb[2] = BF(dav1d_cdef_filter_4x4, ssse3);
#endif
if (!(flags & DAV1D_X86_CPU_FLAG_SSE41)) return;
#if BITDEPTH == 8
c->dir = dav1d_cdef_dir_sse4;
c->fb[0] = dav1d_cdef_filter_8x8_sse4;
c->fb[1] = dav1d_cdef_filter_4x8_sse4;
c->fb[2] = dav1d_cdef_filter_4x4_sse4;
c->dir = BF(dav1d_cdef_dir, sse4);
c->fb[0] = BF(dav1d_cdef_filter_8x8, sse4);
c->fb[1] = BF(dav1d_cdef_filter_4x8, sse4);
c->fb[2] = BF(dav1d_cdef_filter_4x4, sse4);
#endif
#if ARCH_X86_64
if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
#if BITDEPTH == 8
c->dir = dav1d_cdef_dir_avx2;
c->fb[0] = dav1d_cdef_filter_8x8_avx2;
c->fb[1] = dav1d_cdef_filter_4x8_avx2;
c->fb[2] = dav1d_cdef_filter_4x4_avx2;
c->dir = BF(dav1d_cdef_dir, avx2);
c->fb[0] = BF(dav1d_cdef_filter_8x8, avx2);
c->fb[1] = BF(dav1d_cdef_filter_4x8, avx2);
c->fb[2] = BF(dav1d_cdef_filter_4x4, avx2);
#endif
if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL)) return;
#if HAVE_AVX512ICL && BITDEPTH == 8
c->fb[0] = dav1d_cdef_filter_8x8_avx512icl;
c->fb[1] = dav1d_cdef_filter_4x8_avx512icl;
c->fb[2] = dav1d_cdef_filter_4x4_avx512icl;
c->fb[0] = BF(dav1d_cdef_filter_8x8, avx512icl);
c->fb[1] = BF(dav1d_cdef_filter_4x8, avx512icl);
c->fb[2] = BF(dav1d_cdef_filter_4x4, avx512icl);
#endif
#endif
......
......@@ -249,13 +249,13 @@ SECTION .text
%macro CDEF_FILTER 2 ; w, h
%if ARCH_X86_64
cglobal cdef_filter_%1x%2, 4, 9, 16, 3 * 16 + (%2+4)*32, \
dst, stride, left, top, pri, sec, edge, stride3, dst4
cglobal cdef_filter_%1x%2_8bpc, 4, 9, 16, 3 * 16 + (%2+4)*32, \
dst, stride, left, top, pri, sec, edge, stride3, dst4
%define px rsp+3*16+2*32
%define base 0
%else
cglobal cdef_filter_%1x%2, 2, 7, 8, - 7 * 16 - (%2+4)*32, \
dst, stride, left, edge, stride3
cglobal cdef_filter_%1x%2_8bpc, 2, 7, 8, - 7 * 16 - (%2+4)*32, \
dst, stride, left, edge, stride3
%define topq r2
%define dst4q r2
LEA r5, tap_table
......@@ -758,7 +758,7 @@ cglobal cdef_filter_%1x%2, 2, 7, 8, - 7 * 16 - (%2+4)*32, \
%macro CDEF_DIR 0
%if ARCH_X86_64
cglobal cdef_dir, 3, 5, 16, 32, src, stride, var, stride3
cglobal cdef_dir_8bpc, 3, 5, 16, 32, src, stride, var, stride3
lea stride3q, [strideq*3]
movq m1, [srcq+strideq*0]
movhps m1, [srcq+strideq*1]
......@@ -1030,7 +1030,7 @@ cglobal cdef_dir, 3, 5, 16, 32, src, stride, var, stride3
shr r1d, 10
mov [varq], r1d
%else
cglobal cdef_dir, 2, 4, 8, 96, src, stride, var, stride3
cglobal cdef_dir_8bpc, 2, 4, 8, 96, src, stride, var, stride3
%define base r2-shufw_6543210x
LEA r2, shufw_6543210x
pxor m0, m0
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment