Commit b77ea4db authored by Fiona Glaser
Browse files

Slightly faster 8x16 SAD on Penryn Core 2

Same as MMX 8x16 cacheline SAD, but calls SSE2 8x16 SAD in non-cacheline case.
Only Nehalem benefits from sizes smaller than 8x16, and Nehalem doesn't use cacheline functions, so no smaller versions are included.
parent dfe8f732
......@@ -679,14 +679,21 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
INIT_ADS( _sse2 );
pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_sse2;
pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_sse2;
#ifdef ARCH_X86
if( cpu&X264_CPU_CACHELINE_64 )
if( cpu&X264_CPU_CACHELINE_64 )
{
#ifdef ARCH_X86
INIT2( sad, _cache64_sse2 );
INIT2( sad_x3, _cache64_sse2 );
INIT2( sad_x4, _cache64_sse2 );
}
#endif
if( cpu&X264_CPU_SSE2_IS_FAST )
{
pixf->sad_x3[PIXEL_8x16] = x264_pixel_sad_x3_8x16_cache64_sse2;
pixf->sad_x4[PIXEL_8x16] = x264_pixel_sad_x4_8x16_cache64_sse2;
}
}
if( cpu&X264_CPU_SSE_MISALIGN )
{
INIT2( sad_x3, _sse2_misalign );
......
......@@ -1142,8 +1142,8 @@ cglobal x264_pixel_sad_8x%1_cache%2_mmxext
jg .split
%endmacro
%macro SADX3_CACHELINE_FUNC 5 ; width, height, cacheline, normal_ver, split_ver
cglobal x264_pixel_sad_x3_%1x%2_cache%3_%5
%macro SADX3_CACHELINE_FUNC 6 ; width, height, cacheline, normal_ver, split_ver, name
cglobal x264_pixel_sad_x3_%1x%2_cache%3_%6
CHECK_SPLIT r1m, %1, %3
CHECK_SPLIT r2m, %1, %3
CHECK_SPLIT r3m, %1, %3
......@@ -1207,8 +1207,8 @@ cglobal x264_pixel_sad_x3_%1x%2_cache%3_%5
%endif
%endmacro
%macro SADX4_CACHELINE_FUNC 5 ; width, height, cacheline, normal_ver, split_ver
cglobal x264_pixel_sad_x4_%1x%2_cache%3_%5
%macro SADX4_CACHELINE_FUNC 6 ; width, height, cacheline, normal_ver, split_ver, name
cglobal x264_pixel_sad_x4_%1x%2_cache%3_%6
CHECK_SPLIT r1m, %1, %3
CHECK_SPLIT r2m, %1, %3
CHECK_SPLIT r3m, %1, %3
......@@ -1285,9 +1285,9 @@ cglobal x264_pixel_sad_x4_%1x%2_cache%3_%5
%endif
%endmacro
%macro SADX34_CACHELINE_FUNC 5
SADX3_CACHELINE_FUNC %1, %2, %3, %4, %5
SADX4_CACHELINE_FUNC %1, %2, %3, %4, %5
%macro SADX34_CACHELINE_FUNC 1+
SADX3_CACHELINE_FUNC %1
SADX4_CACHELINE_FUNC %1
%endmacro
......@@ -1307,15 +1307,15 @@ SAD8_CACHELINE_FUNC_MMX2 8, 64
SAD8_CACHELINE_FUNC_MMX2 16, 64
%ifndef ARCH_X86_64
SADX34_CACHELINE_FUNC 16, 16, 32, mmxext, mmxext
SADX34_CACHELINE_FUNC 16, 8, 32, mmxext, mmxext
SADX34_CACHELINE_FUNC 8, 16, 32, mmxext, mmxext
SADX34_CACHELINE_FUNC 8, 8, 32, mmxext, mmxext
SADX34_CACHELINE_FUNC 16, 16, 64, mmxext, mmxext
SADX34_CACHELINE_FUNC 16, 8, 64, mmxext, mmxext
SADX34_CACHELINE_FUNC 16, 16, 32, mmxext, mmxext, mmxext
SADX34_CACHELINE_FUNC 16, 8, 32, mmxext, mmxext, mmxext
SADX34_CACHELINE_FUNC 8, 16, 32, mmxext, mmxext, mmxext
SADX34_CACHELINE_FUNC 8, 8, 32, mmxext, mmxext, mmxext
SADX34_CACHELINE_FUNC 16, 16, 64, mmxext, mmxext, mmxext
SADX34_CACHELINE_FUNC 16, 8, 64, mmxext, mmxext, mmxext
%endif ; !ARCH_X86_64
SADX34_CACHELINE_FUNC 8, 16, 64, mmxext, mmxext
SADX34_CACHELINE_FUNC 8, 8, 64, mmxext, mmxext
SADX34_CACHELINE_FUNC 8, 16, 64, mmxext, mmxext, mmxext
SADX34_CACHELINE_FUNC 8, 8, 64, mmxext, mmxext, mmxext
%ifndef ARCH_X86_64
SAD16_CACHELINE_FUNC sse2, 8
......@@ -1325,9 +1325,10 @@ SAD16_CACHELINE_FUNC sse2, 16
SAD16_CACHELINE_LOOP_SSE2 i
%assign i i+1
%endrep
SADX34_CACHELINE_FUNC 16, 16, 64, sse2, sse2
SADX34_CACHELINE_FUNC 16, 8, 64, sse2, sse2
SADX34_CACHELINE_FUNC 16, 16, 64, sse2, sse2, sse2
SADX34_CACHELINE_FUNC 16, 8, 64, sse2, sse2, sse2
%endif ; !ARCH_X86_64
SADX34_CACHELINE_FUNC 8, 16, 64, sse2, mmxext, sse2
SAD16_CACHELINE_FUNC ssse3, 8
SAD16_CACHELINE_FUNC ssse3, 16
......@@ -1336,5 +1337,6 @@ SAD16_CACHELINE_FUNC ssse3, 16
SAD16_CACHELINE_LOOP_SSSE3 i
%assign i i+1
%endrep
SADX34_CACHELINE_FUNC 16, 16, 64, sse2, ssse3
SADX34_CACHELINE_FUNC 16, 8, 64, sse2, ssse3
SADX34_CACHELINE_FUNC 16, 16, 64, sse2, ssse3, ssse3
SADX34_CACHELINE_FUNC 16, 8, 64, sse2, ssse3, ssse3
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment