Commit 1bf7228f authored by Fiona Glaser

Initial Nehalem CPU optimizations

On Nehalem, movaps/movups are no longer interchangeable with their integer counterparts (movdqa/movdqu), so that substitution is removed.
Nehalem has a much lower cacheline split penalty than previous Intel CPUs, so cacheline workarounds are no longer necessary.
Thanks to Intel for providing Avail Media with the pre-release Nehalem CPU needed to prepare these (and other not-yet-committed) optimizations.
Overall speed improvement with Nehalem vs Penryn at the same clock speed is around 40%.
parent fc321fd6
......@@ -48,7 +48,8 @@ const x264_cpu_name_t x264_cpu_names[] = {
{"SSE3", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3},
{"SSSE3", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3|X264_CPU_SSSE3},
{"PHADD", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_PHADD_IS_FAST},
{"SSE4", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4},
{"SSE4.1", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4},
{"SSE4.2", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4|X264_CPU_SSE42},
{"Cache32", X264_CPU_CACHELINE_32},
{"Cache64", X264_CPU_CACHELINE_64},
{"Slow_mod4_stack", X264_CPU_STACK_MOD4},
......@@ -91,6 +92,8 @@ uint32_t x264_cpu_detect( void )
cpu |= X264_CPU_SSSE3;
if( ecx&0x00080000 )
cpu |= X264_CPU_SSE4;
if( ecx&0x00100000 )
cpu |= X264_CPU_SSE42;
if( cpu & X264_CPU_SSSE3 )
cpu |= X264_CPU_SSE2_IS_FAST;
......@@ -131,7 +134,7 @@ uint32_t x264_cpu_detect( void )
}
}
if( !strcmp((char*)vendor, "GenuineIntel") || !strcmp((char*)vendor, "CyrixInstead") )
if( (!strcmp((char*)vendor, "GenuineIntel") || !strcmp((char*)vendor, "CyrixInstead")) && !(cpu&X264_CPU_SSE42))
{
/* cacheline size is specified in 3 places, any of which may be missing */
x264_cpu_cpuid( 1, &eax, &ebx, &ecx, &edx );
......
......@@ -474,7 +474,3 @@ INIT_MMX
%endif
%endmacro
; substitutions which are functionally identical but reduce code size
%define movdqa movaps
%define movdqu movups
......@@ -744,6 +744,9 @@ x264_t *x264_encoder_open ( x264_param_t *param )
if( !strcmp(x264_cpu_names[i].name, "SSE3")
&& (param->cpu & X264_CPU_SSSE3 || !(param->cpu & X264_CPU_CACHELINE_64)) )
continue;
if( !strcmp(x264_cpu_names[i].name, "SSE4.1")
&& (param->cpu & X264_CPU_SSE42) )
continue;
if( (param->cpu & x264_cpu_names[i].flags) == x264_cpu_names[i].flags
&& (!i || x264_cpu_names[i].flags != x264_cpu_names[i-1].flags) )
p += sprintf( p, " %s", x264_cpu_names[i].name );
......
......@@ -58,8 +58,9 @@ typedef struct x264_t x264_t;
#define X264_CPU_SSE3 0x000200
#define X264_CPU_SSSE3 0x000400
#define X264_CPU_PHADD_IS_FAST 0x000800 /* pre-Penryn Core2 have a uselessly slow PHADD instruction */
#define X264_CPU_SSE4 0x001000 /* SSE4.1 */
#define X264_CPU_STACK_MOD4 0x002000 /* if stack is only mod4 and not mod16 */
#define X264_CPU_STACK_MOD4 0x001000 /* if stack is only mod4 and not mod16 */
#define X264_CPU_SSE4 0x002000 /* SSE4.1 */
#define X264_CPU_SSE42 0x004000 /* SSE4.2 */
/* Analyse flags
*/
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment