Commit 8ae4e1cf authored by Henrik Gramner
Browse files

x86: Make AVX2 also imply FMA3

All CPUs with AVX2 support FMA3 (but not the other way around).
parent 06882793
......@@ -67,8 +67,8 @@ const x264_cpu_name_t x264_cpu_names[] =
{"AVX", AVX},
{"XOP", AVX|X264_CPU_XOP},
{"FMA4", AVX|X264_CPU_FMA4},
{"AVX2", AVX|X264_CPU_AVX2},
{"FMA3", AVX|X264_CPU_FMA3},
{"AVX2", AVX|X264_CPU_FMA3|X264_CPU_AVX2},
#undef AVX
#undef SSE2
#undef MMX2
......
......@@ -2136,7 +2136,7 @@ cglobal mbtree_propagate_cost, 6,6,%1
INIT_YMM avx
MBTREE_AVX 8
INIT_YMM avx2,fma3
INIT_YMM avx2
MBTREE_AVX 7
%macro MBTREE_PROPAGATE_LIST 0
......
......@@ -167,8 +167,8 @@ void x264_mbtree_propagate_cost_avx ( int16_t *dst, uint16_t *propagate_in, uint
uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
void x264_mbtree_propagate_cost_fma4( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs,
uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
void x264_mbtree_propagate_cost_avx2_fma3( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs,
uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
void x264_mbtree_propagate_cost_avx2( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs,
uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
#define MC_CHROMA(cpu)\
void x264_mc_chroma_##cpu( pixel *dstu, pixel *dstv, intptr_t i_dst, pixel *src, intptr_t i_src,\
......@@ -938,7 +938,5 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
if( !(cpu&X264_CPU_AVX2) )
return;
pf->get_ref = get_ref_avx2;
if( cpu&X264_CPU_FMA3 )
pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_avx2_fma3;
pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_avx2;
}
......@@ -738,8 +738,8 @@ SECTION .note.GNU-stack noalloc noexec nowrite progbits
%assign cpuflags_avx (1<<11)| cpuflags_sse42
%assign cpuflags_xop (1<<12)| cpuflags_avx
%assign cpuflags_fma4 (1<<13)| cpuflags_avx
%assign cpuflags_avx2 (1<<14)| cpuflags_avx
%assign cpuflags_fma3 (1<<15)| cpuflags_avx
%assign cpuflags_fma3 (1<<14)| cpuflags_avx
%assign cpuflags_avx2 (1<<15)| cpuflags_fma3
%assign cpuflags_cache32 (1<<16)
%assign cpuflags_cache64 (1<<17)
......
......@@ -167,12 +167,12 @@ static void print_bench(void)
continue;
printf( "%s_%s%s: %"PRId64"\n", benchs[i].name,
#if HAVE_MMX
b->cpu&X264_CPU_AVX2 && b->cpu&X264_CPU_FMA3 ? "avx2_fma3" :
b->cpu&X264_CPU_AVX2 ? "avx2" :
b->cpu&X264_CPU_FMA3 ? "fma3" :
b->cpu&X264_CPU_FMA4 ? "fma4" :
b->cpu&X264_CPU_XOP ? "xop" :
b->cpu&X264_CPU_AVX ? "avx" :
b->cpu&X264_CPU_SSE42 ? "sse42" :
b->cpu&X264_CPU_SSE4 ? "sse4" :
b->cpu&X264_CPU_SSSE3 ? "ssse3" :
b->cpu&X264_CPU_SSE3 ? "sse3" :
......@@ -2651,7 +2651,7 @@ static int check_all_flags( void )
#endif
if( cpu_detect & X264_CPU_LZCNT )
{
ret |= add_flags( &cpu0, &cpu1, X264_CPU_LZCNT, "MMX_LZCNT" );
ret |= add_flags( &cpu0, &cpu1, X264_CPU_LZCNT, "MMX LZCNT" );
cpu1 &= ~X264_CPU_LZCNT;
}
ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_CTZ, "MMX SlowCTZ" );
......@@ -2669,11 +2669,11 @@ static int check_all_flags( void )
cpu1 &= ~X264_CPU_SLOW_SHUFFLE;
ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_CTZ, "SSE2 SlowCTZ" );
cpu1 &= ~X264_CPU_SLOW_CTZ;
}
if( cpu_detect & X264_CPU_LZCNT )
{
ret |= add_flags( &cpu0, &cpu1, X264_CPU_LZCNT, "SSE_LZCNT" );
cpu1 &= ~X264_CPU_LZCNT;
if( cpu_detect & X264_CPU_LZCNT )
{
ret |= add_flags( &cpu0, &cpu1, X264_CPU_LZCNT, "SSE2 LZCNT" );
cpu1 &= ~X264_CPU_LZCNT;
}
}
if( cpu_detect & X264_CPU_SSE3 )
{
......@@ -2693,9 +2693,16 @@ static int check_all_flags( void )
ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_64, "SSSE3 Cache64 SlowAtom" );
cpu1 &= ~X264_CPU_CACHELINE_64;
cpu1 &= ~X264_CPU_SLOW_ATOM;
if( cpu_detect & X264_CPU_LZCNT )
{
ret |= add_flags( &cpu0, &cpu1, X264_CPU_LZCNT, "SSSE3 LZCNT" );
cpu1 &= ~X264_CPU_LZCNT;
}
}
if( cpu_detect & X264_CPU_SSE4 )
ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE4, "SSE4" );
if( cpu_detect & X264_CPU_SSE42 )
ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE42, "SSE4.2" );
if( cpu_detect & X264_CPU_AVX )
ret |= add_flags( &cpu0, &cpu1, X264_CPU_AVX, "AVX" );
if( cpu_detect & X264_CPU_XOP )
......@@ -2705,30 +2712,30 @@ static int check_all_flags( void )
ret |= add_flags( &cpu0, &cpu1, X264_CPU_FMA4, "FMA4" );
cpu1 &= ~X264_CPU_FMA4;
}
if( cpu_detect & X264_CPU_BMI1 )
if( cpu_detect & X264_CPU_FMA3 )
{
ret |= add_flags( &cpu0, &cpu1, X264_CPU_BMI1, "BMI1" );
cpu1 &= ~X264_CPU_BMI1;
ret |= add_flags( &cpu0, &cpu1, X264_CPU_FMA3, "FMA3" );
cpu1 &= ~X264_CPU_FMA3;
}
if( cpu_detect & X264_CPU_AVX2 )
{
ret |= add_flags( &cpu0, &cpu1, X264_CPU_AVX2, "AVX2" );
ret |= add_flags( &cpu0, &cpu1, X264_CPU_FMA3 | X264_CPU_AVX2, "AVX2" );
if( cpu_detect & X264_CPU_LZCNT )
{
ret |= add_flags( &cpu0, &cpu1, X264_CPU_LZCNT, "AVX2_LZCNT" );
ret |= add_flags( &cpu0, &cpu1, X264_CPU_LZCNT, "AVX2 LZCNT" );
cpu1 &= ~X264_CPU_LZCNT;
}
}
if( cpu_detect & X264_CPU_BMI1 )
{
ret |= add_flags( &cpu0, &cpu1, X264_CPU_BMI1, "BMI1" );
cpu1 &= ~X264_CPU_BMI1;
}
if( cpu_detect & X264_CPU_BMI2 )
{
ret |= add_flags( &cpu0, &cpu1, X264_CPU_BMI1|X264_CPU_BMI2, "BMI2" );
cpu1 &= ~(X264_CPU_BMI1|X264_CPU_BMI2);
}
if( cpu_detect & X264_CPU_FMA3 )
{
ret |= add_flags( &cpu0, &cpu1, X264_CPU_FMA3, "FMA3" );
cpu1 &= ~X264_CPU_FMA3;
}
#elif ARCH_PPC
if( cpu_detect & X264_CPU_ALTIVEC )
{
......
......@@ -41,7 +41,7 @@
#include "x264_config.h"
#define X264_BUILD 142
#define X264_BUILD 143
/* Application developers planning to link against a shared library version of
* libx264 from a Microsoft Visual Studio or similar development environment
......@@ -129,8 +129,8 @@ typedef struct
#define X264_CPU_AVX 0x0000400 /* AVX support: requires OS support even if YMM registers aren't used. */
#define X264_CPU_XOP 0x0000800 /* AMD XOP */
#define X264_CPU_FMA4 0x0001000 /* AMD FMA4 */
#define X264_CPU_AVX2 0x0002000 /* AVX2 */
#define X264_CPU_FMA3 0x0004000 /* Intel FMA3 */
#define X264_CPU_FMA3 0x0002000 /* FMA3 */
#define X264_CPU_AVX2 0x0004000 /* AVX2 */
#define X264_CPU_BMI1 0x0008000 /* BMI1 */
#define X264_CPU_BMI2 0x0010000 /* BMI2 */
/* x86 modifiers */
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment