Commit ae289e6f authored by Fiona Glaser

TBM, AVX2, FMA3, BMI1, and BMI2 CPU detection support

TBM and BMI1 are supported by Trinity/Piledriver.
The others (and BMI1) will probably appear in Intel's upcoming Haswell.
Also update x86inc with AVX2 stuff.
parent e0581e08
......@@ -62,14 +62,21 @@ const x264_cpu_name_t x264_cpu_names[] =
{"SSE4.1", SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4},
{"SSE4", SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4},
{"SSE4.2", SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4|X264_CPU_SSE42},
{"AVX", SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4|X264_CPU_SSE42|X264_CPU_AVX},
{"XOP", SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4|X264_CPU_SSE42|X264_CPU_AVX|X264_CPU_XOP},
{"FMA4", SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4|X264_CPU_SSE42|X264_CPU_AVX|X264_CPU_FMA4},
#define AVX SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4|X264_CPU_SSE42|X264_CPU_AVX
{"AVX", AVX},
{"XOP", AVX|X264_CPU_XOP},
{"FMA4", AVX|X264_CPU_FMA4},
{"AVX2", AVX|X264_CPU_AVX2},
{"FMA3", AVX|X264_CPU_FMA3},
#undef AVX
#undef SSE2
{"Cache32", X264_CPU_CACHELINE_32},
{"Cache64", X264_CPU_CACHELINE_64},
{"SSEMisalign", X264_CPU_SSE_MISALIGN},
{"LZCNT", X264_CPU_LZCNT},
{"BMI1", X264_CPU_BMI1},
{"BMI2", X264_CPU_BMI1|X264_CPU_BMI2},
{"TBM", X264_CPU_TBM},
{"Slow_mod4_stack", X264_CPU_STACK_MOD4},
{"ARMv6", X264_CPU_ARMV6},
{"NEON", X264_CPU_NEON},
......@@ -143,7 +150,22 @@ uint32_t x264_cpu_detect( void )
/* Check for OS support */
x264_cpu_xgetbv( 0, &eax, &edx );
if( (eax&0x6) == 0x6 )
{
cpu |= X264_CPU_AVX;
if( ecx&0x00001000 )
cpu |= X264_CPU_FMA3;
}
}
x264_cpu_cpuid( 7, &eax, &ebx, &ecx, &edx );
/* AVX2 requires OS support, but BMI1/2 don't. */
if( (cpu&X264_CPU_AVX) && (ebx&0x00000020) )
cpu |= X264_CPU_AVX2;
if( ebx&0x00000008 )
{
cpu |= X264_CPU_BMI1;
if( ebx&0x00000100 )
cpu |= X264_CPU_BMI2;
}
if( cpu & X264_CPU_SSSE3 )
......@@ -185,6 +207,9 @@ uint32_t x264_cpu_detect( void )
if( ecx&0x00010000 ) /* FMA4 */
cpu |= X264_CPU_FMA4;
}
if( ecx&0x00200000 )
cpu |= X264_CPU_TBM;
}
}
......
......@@ -39,6 +39,7 @@ cglobal cpu_cpuid, 5,7
push r2
push r1
mov eax, r0d
xor ecx, ecx
cpuid
pop rsi
mov [rsi], eax
......
......@@ -554,6 +554,8 @@ SECTION .note.GNU-stack noalloc noexec nowrite progbits
%assign cpuflags_avx (1<<9) | cpuflags_sse42
%assign cpuflags_xop (1<<10)| cpuflags_avx
%assign cpuflags_fma4 (1<<11)| cpuflags_avx
; AVX2 and FMA3 each get their own bit; both masks also include every bit of cpuflags_avx.
%assign cpuflags_avx2 (1<<12)| cpuflags_avx
%assign cpuflags_fma3 (1<<13)| cpuflags_avx
%assign cpuflags_cache32 (1<<16)
%assign cpuflags_cache64 (1<<17)
......@@ -561,6 +563,9 @@ SECTION .note.GNU-stack noalloc noexec nowrite progbits
%assign cpuflags_lzcnt (1<<19)
%assign cpuflags_misalign (1<<20)
%assign cpuflags_aligned (1<<21) ; not a cpu feature, but a function variant
; Bit-manipulation extensions. The bmi2 and tbm masks include the bmi1 bit, so
; cpuflag(bmi2) / cpuflag(tbm) only hold when the bmi1 bit is set in cpuflags as well.
%assign cpuflags_bmi1 (1<<22)
%assign cpuflags_bmi2 (1<<23)|cpuflags_bmi1
%assign cpuflags_tbm (1<<24)|cpuflags_bmi1
%define cpuflag(x) ((cpuflags & (cpuflags_ %+ x)) == (cpuflags_ %+ x))
%define notcpuflag(x) ((cpuflags & (cpuflags_ %+ x)) != (cpuflags_ %+ x))
......@@ -822,10 +827,10 @@ INIT_XMM
;%4 == number of operands given
;%5+: operands
%macro RUN_AVX_INSTR 6-7+
%ifid %5
%define %%sizeofreg sizeof%5
%elifid %6
%ifid %6
%define %%sizeofreg sizeof%6
%elifid %5
%define %%sizeofreg sizeof%5
%else
%define %%sizeofreg mmsize
%endif
......@@ -948,6 +953,9 @@ AVX_INSTR mulsd, 1, 0, 1
AVX_INSTR mulss, 1, 0, 1
AVX_INSTR orpd, 1, 0, 1
AVX_INSTR orps, 1, 0, 1
AVX_INSTR pabsb, 0, 0, 0
AVX_INSTR pabsw, 0, 0, 0
AVX_INSTR pabsd, 0, 0, 0
AVX_INSTR packsswb, 0, 0, 0
AVX_INSTR packssdw, 0, 0, 0
AVX_INSTR packuswb, 0, 0, 0
......@@ -999,6 +1007,7 @@ AVX_INSTR pminsd, 0, 0, 1
AVX_INSTR pminub, 0, 0, 1
AVX_INSTR pminuw, 0, 0, 1
AVX_INSTR pminud, 0, 0, 1
AVX_INSTR pmovmskb, 0, 0, 0
AVX_INSTR pmulhuw, 0, 0, 1
AVX_INSTR pmulhrsw, 0, 0, 1
AVX_INSTR pmulhw, 0, 0, 1
......@@ -1009,6 +1018,9 @@ AVX_INSTR pmuldq, 0, 0, 1
AVX_INSTR por, 0, 0, 1
AVX_INSTR psadbw, 0, 0, 1
AVX_INSTR pshufb, 0, 0, 0
AVX_INSTR pshufd, 0, 1, 0
AVX_INSTR pshufhw, 0, 1, 0
AVX_INSTR pshuflw, 0, 1, 0
AVX_INSTR psignb, 0, 0, 0
AVX_INSTR psignw, 0, 0, 0
AVX_INSTR psignd, 0, 0, 0
......@@ -1030,6 +1042,7 @@ AVX_INSTR psubsb, 0, 0, 0
AVX_INSTR psubsw, 0, 0, 0
AVX_INSTR psubusb, 0, 0, 0
AVX_INSTR psubusw, 0, 0, 0
AVX_INSTR ptest, 0, 0, 0
AVX_INSTR punpckhbw, 0, 0, 0
AVX_INSTR punpckhwd, 0, 0, 0
AVX_INSTR punpckhdq, 0, 0, 0
......
......@@ -164,6 +164,8 @@ static void print_bench(void)
if( k < j )
continue;
printf( "%s_%s%s: %"PRId64"\n", benchs[i].name,
b->cpu&X264_CPU_AVX2 ? "avx2" :
b->cpu&X264_CPU_FMA3 ? "fma3" :
b->cpu&X264_CPU_FMA4 ? "fma4" :
b->cpu&X264_CPU_XOP ? "xop" :
b->cpu&X264_CPU_AVX ? "avx" :
......@@ -182,6 +184,9 @@ static void print_bench(void)
b->cpu&X264_CPU_SHUFFLE_IS_FAST && !(b->cpu&X264_CPU_SSE4) ? "_fastshuffle" :
b->cpu&X264_CPU_SSE_MISALIGN ? "_misalign" :
b->cpu&X264_CPU_LZCNT ? "_lzcnt" :
b->cpu&X264_CPU_BMI2 ? "_bmi2" :
b->cpu&X264_CPU_TBM ? "_tbm" :
b->cpu&X264_CPU_BMI1 ? "_bmi1" :
b->cpu&X264_CPU_FAST_NEON_MRC ? "_fast_mrc" :
b->cpu&X264_CPU_SLOW_CTZ ? "_slow_ctz" :
b->cpu&X264_CPU_SLOW_ATOM ? "_slow_atom" : "",
......@@ -2405,7 +2410,32 @@ static int check_all_flags( void )
if( x264_cpu_detect() & X264_CPU_XOP )
ret |= add_flags( &cpu0, &cpu1, X264_CPU_XOP, "XOP" );
if( x264_cpu_detect() & X264_CPU_FMA4 )
{
ret |= add_flags( &cpu0, &cpu1, X264_CPU_FMA4, "FMA4" );
cpu1 &= ~X264_CPU_FMA4;
}
if( x264_cpu_detect() & X264_CPU_FMA3 )
{
ret |= add_flags( &cpu0, &cpu1, X264_CPU_FMA3, "FMA3" );
cpu1 &= ~X264_CPU_FMA3;
}
if( x264_cpu_detect() & X264_CPU_BMI1 )
{
ret |= add_flags( &cpu0, &cpu1, X264_CPU_BMI1, "BMI1" );
if( x264_cpu_detect() & X264_CPU_TBM )
{
ret |= add_flags( &cpu0, &cpu1, X264_CPU_TBM, "TBM" );
cpu1 &= ~X264_CPU_TBM;
}
if( x264_cpu_detect() & X264_CPU_BMI2 )
{
ret |= add_flags( &cpu0, &cpu1, X264_CPU_BMI2, "BMI2" );
cpu1 &= ~X264_CPU_BMI2;
}
cpu1 &= ~X264_CPU_BMI1;
}
if( x264_cpu_detect() & X264_CPU_AVX2 )
ret |= add_flags( &cpu0, &cpu1, X264_CPU_AVX2, "AVX2" );
#elif ARCH_PPC
if( x264_cpu_detect() & X264_CPU_ALTIVEC )
{
......
......@@ -127,6 +127,11 @@ typedef struct
* aren't used. */
#define X264_CPU_XOP 0x0800000 /* AMD XOP */
#define X264_CPU_FMA4 0x1000000 /* AMD FMA4 */
#define X264_CPU_AVX2 0x2000000 /* AVX2 */
#define X264_CPU_FMA3 0x4000000 /* Intel FMA3 */
#define X264_CPU_BMI1 0x8000000 /* BMI1 */
#define X264_CPU_BMI2 0x10000000 /* BMI2 */
#define X264_CPU_TBM 0x20000000 /* AMD TBM */
/* Analyse flags
*/
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment