x86: cdef_dir: optimize best cost finding for SSE

Port commit 65ee1233 (the AVX2 version, from Kyle Siefring) to SSE4.1,
and optimize the SSSE3 version.

---------------------
x86_64:
------------------------------------------
before: cdef_dir_8bpc_ssse3: 110.3
 after: cdef_dir_8bpc_ssse3: 105.9
   new: cdef_dir_8bpc_sse4:   96.4
------------------------------------------

---------------------
x86_32:
------------------------------------------
before: cdef_dir_8bpc_ssse3: 120.6
 after: cdef_dir_8bpc_ssse3: 110.7
   new: cdef_dir_8bpc_sse4:  106.5
------------------------------------------
parent 75e88fab
......@@ -41,6 +41,7 @@ decl_cdef_fn(dav1d_cdef_filter_4x4_sse4);
decl_cdef_fn(dav1d_cdef_filter_4x4_ssse3);
decl_cdef_dir_fn(dav1d_cdef_dir_avx2);
decl_cdef_dir_fn(dav1d_cdef_dir_sse4);
decl_cdef_dir_fn(dav1d_cdef_dir_ssse3);
void bitfn(dav1d_cdef_dsp_init_x86)(Dav1dCdefDSPContext *const c) {
......@@ -58,6 +59,7 @@ void bitfn(dav1d_cdef_dsp_init_x86)(Dav1dCdefDSPContext *const c) {
if (!(flags & DAV1D_X86_CPU_FLAG_SSE41)) return;
#if BITDEPTH == 8
c->dir = dav1d_cdef_dir_sse4;
c->fb[0] = dav1d_cdef_filter_8x8_sse4;
c->fb[1] = dav1d_cdef_filter_4x8_sse4;
c->fb[2] = dav1d_cdef_filter_4x4_sse4;
......
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment