Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in
Toggle navigation
Menu
Open sidebar
VideoLAN
x264
Commits
8ae4e1cf
Commit
8ae4e1cf
authored
Aug 02, 2014
by
Henrik Gramner
Browse files
x86: Make AVX2 also imply FMA3
All CPUs with AVX2 support FMA3 (but not the other way around).
parent
06882793
Changes
6
Hide whitespace changes
Inline
Side-by-side
common/cpu.c
View file @
8ae4e1cf
...
...
@@ -67,8 +67,8 @@ const x264_cpu_name_t x264_cpu_names[] =
{
"AVX"
,
AVX
},
{
"XOP"
,
AVX
|
X264_CPU_XOP
},
{
"FMA4"
,
AVX
|
X264_CPU_FMA4
},
{
"AVX2"
,
AVX
|
X264_CPU_AVX2
},
{
"FMA3"
,
AVX
|
X264_CPU_FMA3
},
{
"AVX2"
,
AVX
|
X264_CPU_FMA3
|
X264_CPU_AVX2
},
#undef AVX
#undef SSE2
#undef MMX2
...
...
common/x86/mc-a2.asm
View file @
8ae4e1cf
...
...
@@ -2136,7 +2136,7 @@ cglobal mbtree_propagate_cost, 6,6,%1
INIT_YMM
avx
MBTREE_AVX
8
INIT_YMM
avx2
,
fma3
INIT_YMM
avx2
MBTREE_AVX
7
%macro MBTREE_PROPAGATE_LIST 0
...
...
common/x86/mc-c.c
View file @
8ae4e1cf
...
...
@@ -167,8 +167,8 @@ void x264_mbtree_propagate_cost_avx ( int16_t *dst, uint16_t *propagate_in, uint
uint16_t
*
inter_costs
,
uint16_t
*
inv_qscales
,
float
*
fps_factor
,
int
len
);
void
x264_mbtree_propagate_cost_fma4
(
int16_t
*
dst
,
uint16_t
*
propagate_in
,
uint16_t
*
intra_costs
,
uint16_t
*
inter_costs
,
uint16_t
*
inv_qscales
,
float
*
fps_factor
,
int
len
);
void
x264_mbtree_propagate_cost_avx2
_fma3
(
int16_t
*
dst
,
uint16_t
*
propagate_in
,
uint16_t
*
intra_costs
,
uint16_t
*
inter_costs
,
uint16_t
*
inv_qscales
,
float
*
fps_factor
,
int
len
);
void
x264_mbtree_propagate_cost_avx2
(
int16_t
*
dst
,
uint16_t
*
propagate_in
,
uint16_t
*
intra_costs
,
uint16_t
*
inter_costs
,
uint16_t
*
inv_qscales
,
float
*
fps_factor
,
int
len
);
#define MC_CHROMA(cpu)\
void x264_mc_chroma_##cpu( pixel *dstu, pixel *dstv, intptr_t i_dst, pixel *src, intptr_t i_src,\
...
...
@@ -938,7 +938,5 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
if
(
!
(
cpu
&
X264_CPU_AVX2
)
)
return
;
pf
->
get_ref
=
get_ref_avx2
;
if
(
cpu
&
X264_CPU_FMA3
)
pf
->
mbtree_propagate_cost
=
x264_mbtree_propagate_cost_avx2_fma3
;
pf
->
mbtree_propagate_cost
=
x264_mbtree_propagate_cost_avx2
;
}
common/x86/x86inc.asm
View file @
8ae4e1cf
...
...
@@ -738,8 +738,8 @@ SECTION .note.GNU-stack noalloc noexec nowrite progbits
%assign cpuflags_avx (1<<11)| cpuflags_sse42
%assign cpuflags_xop (1<<12)| cpuflags_avx
%assign cpuflags_fma4 (1<<13)| cpuflags_avx
%assign cpuflags_
avx2
(1<<14)| cpuflags_avx
%assign cpuflags_
fma3
(1<<15)| cpuflags_
avx
%assign cpuflags_
fma3
(1<<14)| cpuflags_avx
%assign cpuflags_
avx2
(1<<15)| cpuflags_
fma3
%assign cpuflags_cache32 (1<<16)
%assign cpuflags_cache64 (1<<17)
...
...
tools/checkasm.c
View file @
8ae4e1cf
...
...
@@ -167,12 +167,12 @@ static void print_bench(void)
continue
;
printf
(
"%s_%s%s: %"
PRId64
"
\n
"
,
benchs
[
i
].
name
,
#if HAVE_MMX
b
->
cpu
&
X264_CPU_AVX2
&&
b
->
cpu
&
X264_CPU_FMA3
?
"avx2_fma3"
:
b
->
cpu
&
X264_CPU_AVX2
?
"avx2"
:
b
->
cpu
&
X264_CPU_FMA3
?
"fma3"
:
b
->
cpu
&
X264_CPU_FMA4
?
"fma4"
:
b
->
cpu
&
X264_CPU_XOP
?
"xop"
:
b
->
cpu
&
X264_CPU_AVX
?
"avx"
:
b
->
cpu
&
X264_CPU_SSE42
?
"sse42"
:
b
->
cpu
&
X264_CPU_SSE4
?
"sse4"
:
b
->
cpu
&
X264_CPU_SSSE3
?
"ssse3"
:
b
->
cpu
&
X264_CPU_SSE3
?
"sse3"
:
...
...
@@ -2651,7 +2651,7 @@ static int check_all_flags( void )
#endif
if
(
cpu_detect
&
X264_CPU_LZCNT
)
{
ret
|=
add_flags
(
&
cpu0
,
&
cpu1
,
X264_CPU_LZCNT
,
"MMX
_
LZCNT"
);
ret
|=
add_flags
(
&
cpu0
,
&
cpu1
,
X264_CPU_LZCNT
,
"MMX
LZCNT"
);
cpu1
&=
~
X264_CPU_LZCNT
;
}
ret
|=
add_flags
(
&
cpu0
,
&
cpu1
,
X264_CPU_SLOW_CTZ
,
"MMX SlowCTZ"
);
...
...
@@ -2669,11 +2669,11 @@ static int check_all_flags( void )
cpu1
&=
~
X264_CPU_SLOW_SHUFFLE
;
ret
|=
add_flags
(
&
cpu0
,
&
cpu1
,
X264_CPU_SLOW_CTZ
,
"SSE2 SlowCTZ"
);
cpu1
&=
~
X264_CPU_SLOW_CTZ
;
}
if
(
cpu_detect
&
X264_CPU_LZCNT
)
{
ret
|=
add_flags
(
&
cpu0
,
&
cpu1
,
X264_CPU_LZCNT
,
"SSE_LZCNT"
)
;
cpu1
&=
~
X264_CPU_LZCNT
;
if
(
cpu_detect
&
X264_CPU_LZCNT
)
{
ret
|=
add_flags
(
&
cpu0
,
&
cpu1
,
X264_CPU_LZCNT
,
"SSE2 LZCNT"
);
cpu1
&=
~
X264_CPU_LZCNT
;
}
}
if
(
cpu_detect
&
X264_CPU_SSE3
)
{
...
...
@@ -2693,9 +2693,16 @@ static int check_all_flags( void )
ret
|=
add_flags
(
&
cpu0
,
&
cpu1
,
X264_CPU_CACHELINE_64
,
"SSSE3 Cache64 SlowAtom"
);
cpu1
&=
~
X264_CPU_CACHELINE_64
;
cpu1
&=
~
X264_CPU_SLOW_ATOM
;
if
(
cpu_detect
&
X264_CPU_LZCNT
)
{
ret
|=
add_flags
(
&
cpu0
,
&
cpu1
,
X264_CPU_LZCNT
,
"SSSE3 LZCNT"
);
cpu1
&=
~
X264_CPU_LZCNT
;
}
}
if
(
cpu_detect
&
X264_CPU_SSE4
)
ret
|=
add_flags
(
&
cpu0
,
&
cpu1
,
X264_CPU_SSE4
,
"SSE4"
);
if
(
cpu_detect
&
X264_CPU_SSE42
)
ret
|=
add_flags
(
&
cpu0
,
&
cpu1
,
X264_CPU_SSE42
,
"SSE4.2"
);
if
(
cpu_detect
&
X264_CPU_AVX
)
ret
|=
add_flags
(
&
cpu0
,
&
cpu1
,
X264_CPU_AVX
,
"AVX"
);
if
(
cpu_detect
&
X264_CPU_XOP
)
...
...
@@ -2705,30 +2712,30 @@ static int check_all_flags( void )
ret
|=
add_flags
(
&
cpu0
,
&
cpu1
,
X264_CPU_FMA4
,
"FMA4"
);
cpu1
&=
~
X264_CPU_FMA4
;
}
if
(
cpu_detect
&
X264_CPU_
BMI1
)
if
(
cpu_detect
&
X264_CPU_
FMA3
)
{
ret
|=
add_flags
(
&
cpu0
,
&
cpu1
,
X264_CPU_
BMI1
,
"BMI1
"
);
cpu1
&=
~
X264_CPU_
BMI1
;
ret
|=
add_flags
(
&
cpu0
,
&
cpu1
,
X264_CPU_
FMA3
,
"FMA3
"
);
cpu1
&=
~
X264_CPU_
FMA3
;
}
if
(
cpu_detect
&
X264_CPU_AVX2
)
{
ret
|=
add_flags
(
&
cpu0
,
&
cpu1
,
X264_CPU_AVX2
,
"AVX2"
);
ret
|=
add_flags
(
&
cpu0
,
&
cpu1
,
X264_CPU_FMA3
|
X264_CPU_AVX2
,
"AVX2"
);
if
(
cpu_detect
&
X264_CPU_LZCNT
)
{
ret
|=
add_flags
(
&
cpu0
,
&
cpu1
,
X264_CPU_LZCNT
,
"AVX2
_
LZCNT"
);
ret
|=
add_flags
(
&
cpu0
,
&
cpu1
,
X264_CPU_LZCNT
,
"AVX2
LZCNT"
);
cpu1
&=
~
X264_CPU_LZCNT
;
}
}
if
(
cpu_detect
&
X264_CPU_BMI1
)
{
ret
|=
add_flags
(
&
cpu0
,
&
cpu1
,
X264_CPU_BMI1
,
"BMI1"
);
cpu1
&=
~
X264_CPU_BMI1
;
}
if
(
cpu_detect
&
X264_CPU_BMI2
)
{
ret
|=
add_flags
(
&
cpu0
,
&
cpu1
,
X264_CPU_BMI1
|
X264_CPU_BMI2
,
"BMI2"
);
cpu1
&=
~
(
X264_CPU_BMI1
|
X264_CPU_BMI2
);
}
if
(
cpu_detect
&
X264_CPU_FMA3
)
{
ret
|=
add_flags
(
&
cpu0
,
&
cpu1
,
X264_CPU_FMA3
,
"FMA3"
);
cpu1
&=
~
X264_CPU_FMA3
;
}
#elif ARCH_PPC
if
(
cpu_detect
&
X264_CPU_ALTIVEC
)
{
...
...
x264.h
View file @
8ae4e1cf
...
...
@@ -41,7 +41,7 @@
#include
"x264_config.h"
#define X264_BUILD 14
2
#define X264_BUILD 14
3
/* Application developers planning to link against a shared library version of
* libx264 from a Microsoft Visual Studio or similar development environment
...
...
@@ -129,8 +129,8 @@ typedef struct
#define X264_CPU_AVX 0x0000400
/* AVX support: requires OS support even if YMM registers aren't used. */
#define X264_CPU_XOP 0x0000800
/* AMD XOP */
#define X264_CPU_FMA4 0x0001000
/* AMD FMA4 */
#define X264_CPU_
AVX2
0x0002000
/*
AVX2
*/
#define X264_CPU_
FMA3
0x0004000
/*
Intel FMA3
*/
#define X264_CPU_
FMA3
0x0002000
/*
FMA3
*/
#define X264_CPU_
AVX2
0x0004000
/*
AVX2
*/
#define X264_CPU_BMI1 0x0008000
/* BMI1 */
#define X264_CPU_BMI2 0x0010000
/* BMI2 */
/* x86 modifiers */
...
...
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment