Commit 472ce364 authored by Henrik Gramner

x86: AVX-512 support

AVX-512 consists of a plethora of different extensions, but in order to keep
things a bit more manageable we group the following extensions together under
a single baseline cpu flag, which should cover SKL-X and future CPUs (the
corresponding CPUID feature bits are sketched after the list):
 * AVX-512 Foundation (F)
 * AVX-512 Conflict Detection Instructions (CD)
 * AVX-512 Byte and Word Instructions (BW)
 * AVX-512 Doubleword and Quadword Instructions (DQ)
 * AVX-512 Vector Length Extensions (VL)

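For reference, these map to the following CPUID.(EAX=7,ECX=0):EBX feature
bits, which is where the 0xD0030000 mask tested in x264_cpu_detect() below
comes from. A minimal sketch (not part of the commit; the XCR0/OS-support
checks are omitted here and the label name is made up):

    ; F = bit 16, DQ = bit 17, CD = bit 28, BW = bit 30, VL = bit 31
    mov   eax, 7
    xor   ecx, ecx          ; subleaf 0
    cpuid
    and   ebx, 0xD0030000
    cmp   ebx, 0xD0030000
    jne   .no_avx512        ; reject the whole group if any extension is missing
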
On x86-64, AVX-512 provides 16 additional vector registers; prefer using
those over the existing ones, since doing so allows us to avoid `vzeroupper`
unless more than 16 vector registers are required. They also happen to be
volatile on Windows, which means that we don't need to save and restore
existing xmm register contents unless more than 22 vector registers are
required.

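As a rough illustration of what this buys us, here is a hypothetical kernel
written against the updated x86inc.asm (made-up function name, not code from
this commit). On x86-64, INIT_ZMM avx512 remaps m0-m15 to zmm16-zmm31 via
AVX512_MM_PERMUTATION, so RET emits no vzeroupper and, on Windows, no xmm
saves are needed until more than 22 registers are requested:

    %include "x86inc.asm"   ; assumed build context (ARCH/prefix defines etc.)

    INIT_ZMM avx512
    ; dst[0..31] = a[0..31] + b[0..31], 16-bit lanes in one 512-bit register
    cglobal example_addw, 3,3,1 ; dst, a, b
        movu  m0, [r1]      ; m0 is really zmm16 here
        paddw m0, [r2]      ; EVEX-encoded vpaddw; ymm0-15 upper halves stay clean
        movu  [r0], m0
        RET                 ; vzeroupper_required is false -> no vzeroupper emitted
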
Also take the opportunity to drop X264_CPU_CMOV and X264_CPU_SLOW_CTZ, since
they weren't really used for anything, while we're already breaking the API
by messing with the cpu flags.

Big thanks to Intel for their support.
parent d2b5f487
@@ -47,8 +47,7 @@ const x264_cpu_name_t x264_cpu_names[] =
{
#if HAVE_MMX
// {"MMX", X264_CPU_MMX}, // we don't support asm on mmx1 cpus anymore
// {"CMOV", X264_CPU_CMOV}, // we require this unconditionally, so don't print it
#define MMX2 X264_CPU_MMX|X264_CPU_MMX2|X264_CPU_CMOV
#define MMX2 X264_CPU_MMX|X264_CPU_MMX2
{"MMX2", MMX2},
{"MMXEXT", MMX2},
{"SSE", MMX2|X264_CPU_SSE},
@@ -71,13 +70,13 @@ const x264_cpu_name_t x264_cpu_names[] =
{"BMI2", AVX|X264_CPU_LZCNT|X264_CPU_BMI1|X264_CPU_BMI2},
#define AVX2 AVX|X264_CPU_FMA3|X264_CPU_LZCNT|X264_CPU_BMI1|X264_CPU_BMI2|X264_CPU_AVX2
{"AVX2", AVX2},
{"AVX512", AVX2|X264_CPU_AVX512},
#undef AVX2
#undef AVX
#undef SSE2
#undef MMX2
{"Cache32", X264_CPU_CACHELINE_32},
{"Cache64", X264_CPU_CACHELINE_64},
{"SlowCTZ", X264_CPU_SLOW_CTZ},
{"SlowAtom", X264_CPU_SLOW_ATOM},
{"SlowPshufb", X264_CPU_SLOW_PSHUFB},
{"SlowPalignr", X264_CPU_SLOW_PALIGNR},
@@ -120,7 +119,7 @@ static void sigill_handler( int sig )
#if HAVE_MMX
int x264_cpu_cpuid_test( void );
void x264_cpu_cpuid( uint32_t op, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx );
void x264_cpu_xgetbv( uint32_t op, uint32_t *eax, uint32_t *edx );
uint64_t x264_cpu_xgetbv( int xcr );
uint32_t x264_cpu_detect( void )
{
@@ -128,15 +127,14 @@ uint32_t x264_cpu_detect( void )
uint32_t eax, ebx, ecx, edx;
uint32_t vendor[4] = {0};
uint32_t max_extended_cap, max_basic_cap;
int cache;
uint64_t xcr0 = 0;
#if !ARCH_X86_64
if( !x264_cpu_cpuid_test() )
return 0;
#endif
x264_cpu_cpuid( 0, &eax, vendor+0, vendor+2, vendor+1 );
max_basic_cap = eax;
x264_cpu_cpuid( 0, &max_basic_cap, vendor+0, vendor+2, vendor+1 );
if( max_basic_cap == 0 )
return 0;
@@ -147,28 +145,24 @@ uint32_t x264_cpu_detect( void )
return cpu;
if( edx&0x02000000 )
cpu |= X264_CPU_MMX2|X264_CPU_SSE;
if( edx&0x00008000 )
cpu |= X264_CPU_CMOV;
else
return cpu;
if( edx&0x04000000 )
cpu |= X264_CPU_SSE2;
if( ecx&0x00000001 )
cpu |= X264_CPU_SSE3;
if( ecx&0x00000200 )
cpu |= X264_CPU_SSSE3;
cpu |= X264_CPU_SSSE3|X264_CPU_SSE2_IS_FAST;
if( ecx&0x00080000 )
cpu |= X264_CPU_SSE4;
if( ecx&0x00100000 )
cpu |= X264_CPU_SSE42;
/* Check OXSAVE and AVX bits */
if( (ecx&0x18000000) == 0x18000000 )
if( ecx&0x08000000 ) /* XGETBV supported and XSAVE enabled by OS */
{
/* Check for OS support */
x264_cpu_xgetbv( 0, &eax, &edx );
if( (eax&0x6) == 0x6 )
xcr0 = x264_cpu_xgetbv( 0 );
if( (xcr0&0x6) == 0x6 ) /* XMM/YMM state */
{
cpu |= X264_CPU_AVX;
if( ecx&0x10000000 )
cpu |= X264_CPU_AVX;
if( ecx&0x00001000 )
cpu |= X264_CPU_FMA3;
}
@@ -177,20 +171,25 @@ uint32_t x264_cpu_detect( void )
if( max_basic_cap >= 7 )
{
x264_cpu_cpuid( 7, &eax, &ebx, &ecx, &edx );
/* AVX2 requires OS support, but BMI1/2 don't. */
if( (cpu&X264_CPU_AVX) && (ebx&0x00000020) )
cpu |= X264_CPU_AVX2;
if( ebx&0x00000008 )
{
cpu |= X264_CPU_BMI1;
if( ebx&0x00000100 )
cpu |= X264_CPU_BMI2;
if( ebx&0x00000100 )
cpu |= X264_CPU_BMI2;
if( (xcr0&0x6) == 0x6 ) /* XMM/YMM state */
{
if( ebx&0x00000020 )
cpu |= X264_CPU_AVX2;
if( (xcr0&0xE0) == 0xE0 ) /* OPMASK/ZMM state */
{
if( (ebx&0xD0030000) == 0xD0030000 )
cpu |= X264_CPU_AVX512;
}
}
}
if( cpu & X264_CPU_SSSE3 )
cpu |= X264_CPU_SSE2_IS_FAST;
x264_cpu_cpuid( 0x80000000, &eax, &ebx, &ecx, &edx );
max_extended_cap = eax;
@@ -230,8 +229,6 @@ uint32_t x264_cpu_detect( void )
{
if( edx&0x00400000 )
cpu |= X264_CPU_MMX2;
if( !(cpu&X264_CPU_LZCNT) )
cpu |= X264_CPU_SLOW_CTZ;
if( (cpu&X264_CPU_SSE2) && !(cpu&X264_CPU_SSE2_IS_FAST) )
cpu |= X264_CPU_SSE2_IS_SLOW; /* AMD CPUs come in two types: terrible at SSE and great at it */
}
@@ -256,7 +253,6 @@ uint32_t x264_cpu_detect( void )
else if( model == 28 )
{
cpu |= X264_CPU_SLOW_ATOM;
cpu |= X264_CPU_SLOW_CTZ;
cpu |= X264_CPU_SLOW_PSHUFB;
}
/* Conroe has a slow shuffle unit. Check the model number to make sure not
@@ -270,7 +266,7 @@ uint32_t x264_cpu_detect( void )
{
/* cacheline size is specified in 3 places, any of which may be missing */
x264_cpu_cpuid( 1, &eax, &ebx, &ecx, &edx );
cache = (ebx&0xff00)>>5; // cflush size
int cache = (ebx&0xff00)>>5; // cflush size
if( !cache && max_extended_cap >= 0x80000006 )
{
x264_cpu_cpuid( 0x80000006, &eax, &ebx, &ecx, &edx );
@@ -56,7 +56,7 @@ void x264_cpu_sfence( void );
* alignment between functions (osdep.h handles manual alignment of arrays
* if it doesn't).
*/
#if (ARCH_X86 || STACK_ALIGNMENT > 16) && HAVE_MMX
#if HAVE_MMX && (STACK_ALIGNMENT > 16 || (ARCH_X86 && STACK_ALIGNMENT > 4))
intptr_t x264_stack_align( void (*func)(), ... );
#define x264_stack_align(func,...) x264_stack_align((void (*)())func, __VA_ARGS__)
#else
@@ -65,7 +65,7 @@ intptr_t x264_stack_align( void (*func)(), ... );
typedef struct
{
const char name[16];
const char *name;
uint32_t flags;
} x264_cpu_name_t;
extern const x264_cpu_name_t x264_cpu_names[];
@@ -139,17 +139,23 @@ int x264_is_pipe( const char *path );
#define EXPAND(x) x
#if ARCH_X86 || ARCH_X86_64
#define NATIVE_ALIGN 32
#define NATIVE_ALIGN 64
#define ALIGNED_32( var ) DECLARE_ALIGNED( var, 32 )
#define ALIGNED_64( var ) DECLARE_ALIGNED( var, 64 )
#if STACK_ALIGNMENT >= 32
#define ALIGNED_ARRAY_32( type, name, sub1, ... ) ALIGNED_32( type name sub1 __VA_ARGS__ )
#else
#define ALIGNED_ARRAY_32( ... ) EXPAND( ALIGNED_ARRAY_EMU( 31, __VA_ARGS__ ) )
#endif
#if STACK_ALIGNMENT >= 64
#define ALIGNED_ARRAY_64( type, name, sub1, ... ) ALIGNED_64( type name sub1 __VA_ARGS__ )
#else
#define ALIGNED_ARRAY_64( ... ) EXPAND( ALIGNED_ARRAY_EMU( 63, __VA_ARGS__ ) )
#endif
#else
#define NATIVE_ALIGN 16
#define ALIGNED_32 ALIGNED_16
#define ALIGNED_64 ALIGNED_16
#define ALIGNED_ARRAY_32 ALIGNED_ARRAY_16
#define ALIGNED_ARRAY_64 ALIGNED_ARRAY_16
#endif
@@ -53,18 +53,16 @@ cglobal cpu_cpuid, 5,7
RET
;-----------------------------------------------------------------------------
; void cpu_xgetbv( int op, int *eax, int *edx )
; uint64_t cpu_xgetbv( int xcr )
;-----------------------------------------------------------------------------
cglobal cpu_xgetbv, 3,7
push r2
push r1
mov ecx, r0d
cglobal cpu_xgetbv
movifnidn ecx, r0m
xgetbv
pop r4
mov [r4], eax
pop r4
mov [r4], edx
RET
%if ARCH_X86_64
shl rdx, 32
or rax, rdx
%endif
ret
%if ARCH_X86_64
@@ -77,7 +75,7 @@ cglobal stack_align
%if WIN64
sub rsp, 32 ; shadow space
%endif
and rsp, ~31
and rsp, ~(STACK_ALIGNMENT-1)
mov rax, r0
mov r0, r1
mov r1, r2
@@ -118,7 +116,7 @@ cglobal stack_align
push ebp
mov ebp, esp
sub esp, 12
and esp, ~31
and esp, ~(STACK_ALIGNMENT-1)
mov ecx, [ebp+8]
mov edx, [ebp+12]
mov [esp], edx
@@ -323,6 +323,8 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
%endmacro
%define required_stack_alignment ((mmsize + 15) & ~15)
%define vzeroupper_required (mmsize > 16 && (ARCH_X86_64 == 0 || xmm_regs_used > 16 || notcpuflag(avx512)))
%define high_mm_regs (16*cpuflag(avx512))
%macro ALLOC_STACK 1-2 0 ; stack_size, n_xmm_regs (for win64 only)
%ifnum %1
@@ -436,15 +438,16 @@ DECLARE_REG 14, R13, 120
%macro WIN64_PUSH_XMM 0
; Use the shadow space to store XMM6 and XMM7, the rest needs stack space allocated.
%if xmm_regs_used > 6
%if xmm_regs_used > 6 + high_mm_regs
movaps [rstk + stack_offset + 8], xmm6
%endif
%if xmm_regs_used > 7
%if xmm_regs_used > 7 + high_mm_regs
movaps [rstk + stack_offset + 24], xmm7
%endif
%if xmm_regs_used > 8
%assign %%xmm_regs_on_stack xmm_regs_used - high_mm_regs - 8
%if %%xmm_regs_on_stack > 0
%assign %%i 8
%rep xmm_regs_used-8
%rep %%xmm_regs_on_stack
movaps [rsp + (%%i-8)*16 + stack_size + 32], xmm %+ %%i
%assign %%i %%i+1
%endrep
@@ -453,10 +456,11 @@ DECLARE_REG 14, R13, 120
%macro WIN64_SPILL_XMM 1
%assign xmm_regs_used %1
ASSERT xmm_regs_used <= 16
%if xmm_regs_used > 8
ASSERT xmm_regs_used <= 16 + high_mm_regs
%assign %%xmm_regs_on_stack xmm_regs_used - high_mm_regs - 8
%if %%xmm_regs_on_stack > 0
; Allocate stack space for callee-saved xmm registers plus shadow space and align the stack.
%assign %%pad (xmm_regs_used-8)*16 + 32
%assign %%pad %%xmm_regs_on_stack*16 + 32
%assign stack_size_padded %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1))
SUB rsp, stack_size_padded
%endif
@@ -465,9 +469,10 @@ DECLARE_REG 14, R13, 120
%macro WIN64_RESTORE_XMM_INTERNAL 0
%assign %%pad_size 0
%if xmm_regs_used > 8
%assign %%i xmm_regs_used
%rep xmm_regs_used-8
%assign %%xmm_regs_on_stack xmm_regs_used - high_mm_regs - 8
%if %%xmm_regs_on_stack > 0
%assign %%i xmm_regs_used - high_mm_regs
%rep %%xmm_regs_on_stack
%assign %%i %%i-1
movaps xmm %+ %%i, [rsp + (%%i-8)*16 + stack_size + 32]
%endrep
@@ -480,10 +485,10 @@ DECLARE_REG 14, R13, 120
%assign %%pad_size stack_size_padded
%endif
%endif
%if xmm_regs_used > 7
%if xmm_regs_used > 7 + high_mm_regs
movaps xmm7, [rsp + stack_offset - %%pad_size + 24]
%endif
%if xmm_regs_used > 6
%if xmm_regs_used > 6 + high_mm_regs
movaps xmm6, [rsp + stack_offset - %%pad_size + 8]
%endif
%endmacro
@@ -495,12 +500,12 @@ DECLARE_REG 14, R13, 120
%assign xmm_regs_used 0
%endmacro
%define has_epilogue regs_used > 7 || xmm_regs_used > 6 || mmsize == 32 || stack_size > 0
%define has_epilogue regs_used > 7 || stack_size > 0 || vzeroupper_required || xmm_regs_used > 6+high_mm_regs
%macro RET 0
WIN64_RESTORE_XMM_INTERNAL
POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7
%if mmsize == 32
%if vzeroupper_required
vzeroupper
%endif
AUTO_REP_RET
@@ -524,9 +529,10 @@ DECLARE_REG 12, R15, 56
DECLARE_REG 13, R12, 64
DECLARE_REG 14, R13, 72
%macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
%macro PROLOGUE 2-5+ 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
%assign num_args %1
%assign regs_used %2
%assign xmm_regs_used %3
ASSERT regs_used >= num_args
SETUP_STACK_POINTER %4
ASSERT regs_used <= 15
@@ -536,7 +542,7 @@ DECLARE_REG 14, R13, 72
DEFINE_ARGS_INTERNAL %0, %4, %5
%endmacro
%define has_epilogue regs_used > 9 || mmsize == 32 || stack_size > 0
%define has_epilogue regs_used > 9 || stack_size > 0 || vzeroupper_required
%macro RET 0
%if stack_size_padded > 0
@@ -547,7 +553,7 @@ DECLARE_REG 14, R13, 72
%endif
%endif
POP_IF_USED 14, 13, 12, 11, 10, 9
%if mmsize == 32
%if vzeroupper_required
vzeroupper
%endif
AUTO_REP_RET
@@ -592,7 +598,7 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
DEFINE_ARGS_INTERNAL %0, %4, %5
%endmacro
%define has_epilogue regs_used > 3 || mmsize == 32 || stack_size > 0
%define has_epilogue regs_used > 3 || stack_size > 0 || vzeroupper_required
%macro RET 0
%if stack_size_padded > 0
@@ -603,7 +609,7 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
%endif
%endif
POP_IF_USED 6, 5, 4, 3
%if mmsize == 32
%if vzeroupper_required
vzeroupper
%endif
AUTO_REP_RET
@@ -713,7 +719,7 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
%assign stack_offset 0 ; stack pointer offset relative to the return address
%assign stack_size 0 ; amount of stack space that can be freely used inside a function
%assign stack_size_padded 0 ; total amount of allocated stack space, including space for callee-saved xmm registers on WIN64 and alignment padding
%assign xmm_regs_used 0 ; number of XMM registers requested, used for dealing with callee-saved registers on WIN64
%assign xmm_regs_used 0 ; number of XMM registers requested, used for dealing with callee-saved registers on WIN64 and vzeroupper
%ifnidn %3, ""
PROLOGUE %3
%endif
@@ -788,10 +794,10 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
%assign cpuflags_bmi1 (1<<16)| cpuflags_avx|cpuflags_lzcnt
%assign cpuflags_bmi2 (1<<17)| cpuflags_bmi1
%assign cpuflags_avx2 (1<<18)| cpuflags_fma3|cpuflags_bmi2
%assign cpuflags_avx512 (1<<19)| cpuflags_avx2 ; F, CD, BW, DQ, VL
%assign cpuflags_cache32 (1<<19)
%assign cpuflags_cache64 (1<<20)
%assign cpuflags_slowctz (1<<21)
%assign cpuflags_cache32 (1<<20)
%assign cpuflags_cache64 (1<<21)
%assign cpuflags_aligned (1<<22) ; not a cpu feature, but a function variant
%assign cpuflags_atom (1<<23)
@@ -849,11 +855,12 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
%endif
%endmacro
; Merge mmx and sse*
; Merge mmx, sse*, and avx*
; m# is a simd register of the currently selected size
; xm# is the corresponding xmm register if mmsize >= 16, otherwise the same as m#
; ym# is the corresponding ymm register if mmsize >= 32, otherwise the same as m#
; (All 3 remain in sync through SWAP.)
; zm# is the corresponding zmm register if mmsize >= 64, otherwise the same as m#
; (All 4 remain in sync through SWAP.)
%macro CAT_XDEFINE 3
%xdefine %1%2 %3
@@ -863,6 +870,18 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
%undef %1%2
%endmacro
; Prefer registers 16-31 over 0-15 to avoid having to use vzeroupper
%macro AVX512_MM_PERMUTATION 0-1 0 ; start_reg
%if ARCH_X86_64 && cpuflag(avx512)
%assign %%i %1
%rep 16-%1
%assign %%i_high %%i+16
SWAP %%i, %%i_high
%assign %%i %%i+1
%endrep
%endif
%endmacro
%macro INIT_MMX 0-1+
%assign avx_enabled 0
%define RESET_MM_PERMUTATION INIT_MMX %1
@@ -878,7 +897,7 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
CAT_XDEFINE nnmm, %%i, %%i
%assign %%i %%i+1
%endrep
%rep 8
%rep 24
CAT_UNDEF m, %%i
CAT_UNDEF nnmm, %%i
%assign %%i %%i+1
@@ -892,7 +911,7 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
%define mmsize 16
%define num_mmregs 8
%if ARCH_X86_64
%define num_mmregs 16
%define num_mmregs 32
%endif
%define mova movdqa
%define movu movdqu
@@ -905,6 +924,10 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
%assign %%i %%i+1
%endrep
INIT_CPUFLAGS %1
%if WIN64
; Swap callee-saved registers with volatile registers
AVX512_MM_PERMUTATION 6
%endif
%endmacro
%macro INIT_YMM 0-1+
@@ -913,7 +936,7 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
%define mmsize 32
%define num_mmregs 8
%if ARCH_X86_64
%define num_mmregs 16
%define num_mmregs 32
%endif
%define mova movdqa
%define movu movdqu
@@ -926,6 +949,29 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
%assign %%i %%i+1
%endrep
INIT_CPUFLAGS %1
AVX512_MM_PERMUTATION
%endmacro
%macro INIT_ZMM 0-1+
%assign avx_enabled 1
%define RESET_MM_PERMUTATION INIT_ZMM %1
%define mmsize 64
%define num_mmregs 8
%if ARCH_X86_64
%define num_mmregs 32
%endif
%define mova movdqa
%define movu movdqu
%undef movh
%define movnta movntdq
%assign %%i 0
%rep num_mmregs
CAT_XDEFINE m, %%i, zmm %+ %%i
CAT_XDEFINE nnzmm, %%i, %%i
%assign %%i %%i+1
%endrep
INIT_CPUFLAGS %1
AVX512_MM_PERMUTATION
%endmacro
INIT_XMM
@@ -934,18 +980,26 @@ INIT_XMM
%define mmmm%1 mm%1
%define mmxmm%1 mm%1
%define mmymm%1 mm%1
%define mmzmm%1 mm%1
%define xmmmm%1 mm%1
%define xmmxmm%1 xmm%1
%define xmmymm%1 xmm%1
%define xmmzmm%1 xmm%1
%define ymmmm%1 mm%1
%define ymmxmm%1 xmm%1
%define ymmymm%1 ymm%1
%define ymmzmm%1 ymm%1
%define zmmmm%1 mm%1
%define zmmxmm%1 xmm%1
%define zmmymm%1 ymm%1
%define zmmzmm%1 zmm%1
%define xm%1 xmm %+ m%1
%define ym%1 ymm %+ m%1
%define zm%1 zmm %+ m%1
%endmacro
%assign i 0
%rep 16
%rep 32
DECLARE_MMCAST i
%assign i i+1
%endrep
@@ -1080,12 +1134,17 @@ INIT_XMM
;=============================================================================
%assign i 0
%rep 16
%rep 32
%if i < 8
CAT_XDEFINE sizeofmm, i, 8
CAT_XDEFINE regnumofmm, i, i
%endif
CAT_XDEFINE sizeofxmm, i, 16
CAT_XDEFINE sizeofymm, i, 32
CAT_XDEFINE sizeofzmm, i, 64
CAT_XDEFINE regnumofxmm, i, i
CAT_XDEFINE regnumofymm, i, i
CAT_XDEFINE regnumofzmm, i, i
%assign i i+1
%endrep
%undef i
@@ -1202,7 +1261,7 @@ INIT_XMM
%endmacro
%endmacro
; Instructions with both VEX and non-VEX encodings
; Instructions with both VEX/EVEX and legacy encodings
; Non-destructive instructions are written without parameters
AVX_INSTR addpd, sse2, 1, 0, 1
AVX_INSTR addps, sse, 1, 0, 1
@@ -1533,3 +1592,49 @@ FMA4_INSTR fmsub, pd, ps, sd, ss
FMA4_INSTR fmsubadd, pd, ps
FMA4_INSTR fnmadd, pd, ps, sd, ss
FMA4_INSTR fnmsub, pd, ps, sd, ss
; Macros for converting VEX instructions to equivalent EVEX ones.
%macro EVEX_INSTR 2-3 0 ; vex, evex, prefer_evex
%macro %1 2-7 fnord, fnord, %1, %2, %3
%ifidn %3, fnord
%define %%args %1, %2
%elifidn %4, fnord
%define %%args %1, %2, %3
%else
%define %%args %1, %2, %3, %4
%endif
%assign %%evex_required cpuflag(avx512) & %7
%ifnum regnumof%1
%if regnumof%1 >= 16 || sizeof%1 > 32
%assign %%evex_required 1
%endif
%endif
%ifnum regnumof%2
%if regnumof%2 >= 16 || sizeof%2 > 32
%assign %%evex_required 1
%endif
%endif
%if %%evex_required
%6 %%args
%else
%5 %%args ; Prefer VEX over EVEX due to shorter instruction length
%endif
%endmacro
%endmacro
EVEX_INSTR vbroadcastf128, vbroadcastf32x4
EVEX_INSTR vbroadcasti128, vbroadcasti32x4
EVEX_INSTR vextractf128, vextractf32x4
EVEX_INSTR vextracti128, vextracti32x4
EVEX_INSTR vinsertf128, vinsertf32x4
EVEX_INSTR vinserti128, vinserti32x4
EVEX_INSTR vmovdqa, vmovdqa32
EVEX_INSTR vmovdqu, vmovdqu32
EVEX_INSTR vpand, vpandd
EVEX_INSTR vpandn, vpandnd
EVEX_INSTR vpor, vpord
EVEX_INSTR vpxor, vpxord
EVEX_INSTR vrcpps, vrcp14ps, 1 ; EVEX versions have higher precision
EVEX_INSTR vrcpss, vrcp14ss, 1
EVEX_INSTR vrsqrtps, vrsqrt14ps, 1
EVEX_INSTR vrsqrtss, vrsqrt14ss, 1
@@ -863,7 +863,10 @@ if [ $shared = yes -a \( $ARCH = "X86_64" -o $ARCH = "PPC" -o $ARCH = "ALPHA" -o
fi
if [ $compiler = GNU -a \( $ARCH = X86 -o $ARCH = X86_64 \) ] ; then
if cc_check '' -mpreferred-stack-boundary=5 ; then
if cc_check '' -mpreferred-stack-boundary=6 ; then
CFLAGS="$CFLAGS -mpreferred-stack-boundary=6"
stack_alignment=64
elif cc_check '' -mpreferred-stack-boundary=5 ; then
CFLAGS="$CFLAGS -mpreferred-stack-boundary=5"
stack_alignment=32
elif [ $stack_alignment -lt 16 ] && cc_check '' -mpreferred-stack-boundary=4 ; then
@@ -444,11 +444,6 @@ static int x264_validate_parameters( x264_t *h, int b_open )
fail = 1;
}
#endif
if( !fail && !(cpuflags & X264_CPU_CMOV) )
{
x264_log( h, X264_LOG_ERROR, "your cpu does not support CMOV, but x264 was compiled with asm\n");
fail = 1;
}
if( fail )
{
x264_log( h, X264_LOG_ERROR, "to run x264, recompile without asm (configure --disable-asm)\n");
@@ -57,8 +57,7 @@ int quiet = 0;
if( !ok ) ret = -1; \
}
#define BENCH_RUNS 100 // tradeoff between accuracy and speed
#define BENCH_ALIGNS 16 // number of stack+heap data alignments (another accuracy vs speed tradeoff)
#define BENCH_RUNS 2000 // tradeoff between accuracy and speed
#define MAX_FUNCS 1000 // just has to be big enough to hold all the existing functions
#define MAX_CPUS 30 // number of different combinations of cpu flags
@@ -178,6 +177,7 @@ static void print_bench(void)
continue;
printf( "%s_%s%s: %"PRId64"\n", benchs[i].name,
#if HAVE_MMX
b->cpu&X264_CPU_AVX512 ? "avx512" :
b->cpu&X264_CPU_AVX2 ? "avx2" :
b->cpu&X264_CPU_BMI2 ? "bmi2" :
b->cpu&X264_CPU_BMI1 ? "bmi1" :
@@ -2602,6 +2602,11 @@ static int check_cabac( int cpu_ref, int cpu_new )
x264_quant_init( &h, cpu_new, &h.quantf );
h.quantf.coeff_last[DCT_CHROMA_DC] = h.quantf.coeff_last4;
/* Reset cabac state to avoid buffer overruns in do_bench() with large BENCH_RUNS values. */
#define GET_CB( i ) (\
x264_cabac_encode_init( &cb[i], bitstream[i], bitstream[i]+0xfff0 ),\
cb[i].f8_bits_encoded = 0, &cb[i] )
#define CABAC_RESIDUAL(name, start, end, rd)\
{\
if( bs_a.name##_internal && (bs_a.name##_internal != bs_ref.name##_internal || (cpu_new&X264_CPU_SSE2_IS_SLOW)) )\
@@ -2637,13 +2642,9 @@ static int check_cabac( int cpu_ref, int cpu_new )
x264_cabac_t cb[2];\
x264_cabac_context_init( &h, &cb[0], SLICE_TYPE_P, 26, 0 );\
x264_cabac_context_init( &h, &cb[1], SLICE_TYPE_P, 26, 0 );\
x264_cabac_encode_init( &cb[0], bitstream[0], bitstream[0]+0xfff0 );\
x264_cabac_encode_init( &cb[1], bitstream[1], bitstream[1]+0xfff0 );\
cb[0].f8_bits_encoded = 0;\
cb[1].f8_bits_encoded = 0;\
if( !rd ) memcpy( bitstream[1], bitstream[0], 0x400 );\
call_c1( x264_##name##_c, &h, &cb[0], ctx_block_cat, dct[0]+ac );\
call_a1( bs_a.name##_internal, dct[1]+ac, i, ctx_block_cat, &cb[1] );\
call_c1( x264_##name##_c, &h, GET_CB( 0 ), ctx_block_cat, dct[0]+ac );\
call_a1( bs_a.name##_internal, dct[1]+ac, i, ctx_block_cat, GET_CB( 1 ) );\
ok = cb[0].f8_bits_encoded == cb[1].f8_bits_encoded && !memcmp(cb[0].state, cb[1].state, 1024);\
if( !rd ) ok |= !memcmp( bitstream[1], bitstream[0], 0x400 ) && !memcmp( &cb[1], &cb[0], offsetof(x264_cabac_t, p_start) );\
if( !ok )\
@@ -2656,8 +2657,8 @@ static int check_cabac( int cpu_ref, int cpu_new )
}\
if( (j&15) == 0 )\
{\
call_c2( x264_##name##_c, &h, &cb[0], ctx_block_cat, dct[0]+ac );\
call_a2( bs_a.name##_internal, dct[1]+ac, i, ctx_block_cat, &cb[1] );\
call_c2( x264_##name##_c, &h, GET_CB( 0 ), ctx_block_cat, dct[0]+ac );\
call_a2( bs_a.name##_internal, dct[1]+ac, i, ctx_block_cat, GET_CB( 1 ) );\
}\
}\
}\
@@ -2794,8 +2795,6 @@ static int check_all_flags( void )
ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_32, "MMX Cache32" );
cpu1 &= ~X264_CPU_CACHELINE_32;
#endif
ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_CTZ, "MMX SlowCTZ" );
cpu1 &= ~X264_CPU_SLOW_CTZ;
}
if( cpu_detect & X264_CPU_SSE )
ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE, "SSE" );
@@ -2807,8 +2806,6 @@ static int check_all_flags( void )
cpu1 &= ~X264_CPU_CACHELINE_64;
ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_SHUFFLE, "SSE2 SlowShuffle" );
cpu1 &= ~X264_CPU_SLOW_SHUFFLE;
ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_CTZ, "SSE2 SlowCTZ" );
cpu1 &= ~X264_CPU_SLOW_CTZ;
}
if( cpu_detect & X264_CPU_LZCNT )
{
@@ -2827,8 +2824,6 @@ static int check_all_flags( void )
cpu1 &= ~X264_CPU_CACHELINE_64;
ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_SHUFFLE, "SSSE3 SlowShuffle" );
cpu1 &= ~X264_CPU_SLOW_SHUFFLE;
ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_CTZ, "SSSE3 SlowCTZ" );
cpu1 &= ~X264_CPU_SLOW_CTZ;
ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_ATOM, "SSSE3 SlowAtom" );
ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_64, "SSSE3 Cache64 SlowAtom" );
cpu1 &= ~X264_CPU_CACHELINE_64;
@@ -2860,6 +2855,8 @@ static int check_all_flags( void )
ret |= add_flags( &cpu0, &cpu1, X264_CPU_BMI2, "BMI2" );
if( cpu_detect & X264_CPU_AVX2 )
ret |= add_flags( &cpu0, &cpu1, X264_CPU_AVX2, "AVX2" );
if( cpu_detect & X264_CPU_AVX512 )
ret |= add_flags( &cpu0, &cpu1, X264_CPU_AVX512, "AVX512" );
#elif ARCH_PPC
if( cpu_detect & X264_CPU_ALTIVEC )
{
@@ -2889,8 +2886,6 @@ static int check_all_flags( void )
int main(int argc, char *argv[])
{
int ret = 0;
#ifdef _WIN32
/* Disable the Windows Error Reporting dialog */
SetErrorMode( SEM_NOGPFAULTERRORBOX );
@@ -2916,8 +2911,8 @@ int main(int argc, char *argv[])
fprintf( stderr, "x264: using random seed %u\n", seed );
srand( seed );
buf1 = x264_malloc( 0x1e00 + 0x2000*sizeof(pixel) + 32*BENCH_ALIGNS );
pbuf1 = x264_malloc( 0x1e00*sizeof(pixel) + 32*BENCH_ALIGNS );
buf1 = x264_malloc( 0x1e00 + 0x2000*sizeof(pixel) );
pbuf1 = x264_malloc( 0x1e00*sizeof(pixel) );
if( !buf1 || !pbuf1 )
{
fprintf( stderr, "malloc failed, unable to initiate tests!\n" );
@@ -2938,21 +2933,7 @@ int main(int argc, char *argv[])
}
memset( buf1+0x1e00, 0, 0x2000*sizeof(pixel) );
/* 32-byte alignment is guaranteed whenever it's useful, but some functions also vary in speed depending on %64 */
if( do_bench )
for( int i = 0; i < BENCH_ALIGNS && !ret; i++ )
{
INIT_POINTER_OFFSETS;
ret |= x264_stack_pagealign( check_all_flags, i*32 );
buf1 += 32;
pbuf1 += 32;
quiet = 1;
fprintf( stderr, "%d/%d\r", i+1, BENCH_ALIGNS );
}
else
ret = x264_stack_pagealign( check_all_flags, 0 );
if( ret )
if( x264_stack_pagealign( check_all_flags, 0 ) )
{
fprintf( stderr, "x264: at least one test has failed. Go and fix that Right Now!\n" );
return -1;