Commit 68cda11b authored by Fiona Glaser

Initial AVX support

Automatically handle 3-operand instructions and abstract between SSE and AVX.
Implement one function (denoise_dct) with this as an initial test.
x264 can't make much use of AVX's 256-bit support (it's float-only), but the 3-operand forms could give some small benefits.
parent 8fb87147
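
The core of the change is the abstraction layer appended to x86inc.asm: common SSE mnemonics become macros that accept an optional extra source operand, so one macro body assembles to either encoding. A minimal sketch of the idea (the actual expansion logic is RUN_AVX_INSTR, further down in this diff):

    paddw m0, m1, m2    ; written once in the source

    ; under INIT_XMM this assembles as:  movdqa xmm0, xmm1
    ;                                    paddw  xmm0, xmm2
    ; under INIT_AVX it assembles as:    vpaddw xmm0, xmm1, xmm2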
common/cpu.c
@@ -59,6 +59,7 @@ const x264_cpu_name_t x264_cpu_names[] = {
{"FastShuffle", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SHUFFLE_IS_FAST},
{"SSE4.1", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4},
{"SSE4.2", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4|X264_CPU_SSE42},
{"AVX", X264_CPU_AVX},
{"Cache32", X264_CPU_CACHELINE_32},
{"Cache64", X264_CPU_CACHELINE_64},
{"SSEMisalign", X264_CPU_SSE_MISALIGN},
@@ -129,6 +130,8 @@ uint32_t x264_cpu_detect( void )
cpu |= X264_CPU_SSE4;
if( ecx&0x00100000 )
cpu |= X264_CPU_SSE42;
if( ecx&0x10000000 )
cpu |= X264_CPU_AVX;
if( cpu & X264_CPU_SSSE3 )
cpu |= X264_CPU_SSE2_IS_FAST;
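For reference, 0x10000000 is 1 << 28: bit 28 of ECX from CPUID leaf 1 is the AVX feature flag. A standalone probe would look roughly like the NASM sketch below; this is illustrative only (x264's real CPUID access goes through x264_cpu_cpuid in cpu-a.asm). Note that, per the rationale in the x264.h comment at the end of this diff, OS support (OSXSAVE/XGETBV) is deliberately not checked, since only 128-bit 3-operand forms are used.

    mov  eax, 1
    cpuid
    test ecx, 0x10000000    ; ECX bit 28: AVX
    jz   .no_avx
    ; AVX instructions may be used here
.no_avx: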
common/quant.c
@@ -474,6 +474,11 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
pf->quant_4x4 = x264_quant_4x4_sse4;
pf->quant_8x8 = x264_quant_8x8_sse4;
}
if( cpu&X264_CPU_AVX )
{
pf->denoise_dct = x264_denoise_dct_avx;
}
#endif // HAVE_MMX
#if HAVE_ALTIVEC
common/x86/quant-a.asm
@@ -804,28 +804,24 @@ cglobal denoise_dct_%1, 4,5,%2
mova m3, [r0+r3*2+1*mmsize]
PABSW m0, m2
PABSW m1, m3
mova m4, m0
mova m5, m1
psubusw m0, [r2+r3*2+0*mmsize]
psubusw m1, [r2+r3*2+1*mmsize]
PSIGNW m0, m2
PSIGNW m1, m3
mova [r0+r3*2+0*mmsize], m0
mova [r0+r3*2+1*mmsize], m1
mova m2, m4
mova m3, m5
punpcklwd m4, m6
punpckhwd m2, m6
punpcklwd m5, m6
punpckhwd m3, m6
paddd m4, [r1+r3*4+0*mmsize]
paddd m2, [r1+r3*4+1*mmsize]
paddd m5, [r1+r3*4+2*mmsize]
paddd m3, [r1+r3*4+3*mmsize]
mova [r1+r3*4+0*mmsize], m4
mova [r1+r3*4+1*mmsize], m2
mova [r1+r3*4+2*mmsize], m5
mova [r1+r3*4+3*mmsize], m3
psubusw m4, m0, [r2+r3*2+0*mmsize]
psubusw m5, m1, [r2+r3*2+1*mmsize]
PSIGNW m4, m2
PSIGNW m5, m3
mova [r0+r3*2+0*mmsize], m4
mova [r0+r3*2+1*mmsize], m5
punpcklwd m2, m0, m6
punpcklwd m3, m1, m6
punpckhwd m0, m6
punpckhwd m1, m6
paddd m2, [r1+r3*4+0*mmsize]
paddd m0, [r1+r3*4+1*mmsize]
paddd m3, [r1+r3*4+2*mmsize]
paddd m1, [r1+r3*4+3*mmsize]
mova [r1+r3*4+0*mmsize], m2
mova [r1+r3*4+1*mmsize], m0
mova [r1+r3*4+2*mmsize], m3
mova [r1+r3*4+3*mmsize], m1
jg .loop
mov [r0], r4w
RET
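
The rewrite drops the explicit temporaries (mova m4, m0 / mova m5, m1, plus the copies back into m2/m3) by writing every op in 3-operand form and letting the macro layer materialize copies only where the target instruction set needs them. Assuming the RUN_AVX_INSTR rules added to x86inc.asm below, one line of the new body expands as:

    psubusw m4, m0, [r2+r3*2+0*mmsize]

    ; sse2/ssse3 (INIT_XMM):  movdqa   xmm4, xmm0
    ;                         psubusw  xmm4, [r2+r3*2+0*mmsize]
    ; avx (INIT_AVX):         vpsubusw xmm4, xmm0, [r2+r3*2+0*mmsize]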
@@ -842,6 +838,8 @@ DENOISE_DCT sse2, 7
%define PABSW PABSW_SSSE3
%define PSIGNW PSIGNW_SSSE3
DENOISE_DCT ssse3, 7
INIT_AVX
DENOISE_DCT avx, 7
%endif ; !HIGH_BIT_DEPTH
@@ -970,12 +968,14 @@ cglobal decimate_score%1_%2, 1,3
%endmacro
%ifndef ARCH_X86_64
INIT_MMX
%define DECIMATE_MASK DECIMATE_MASK_MMX
DECIMATE4x4 15, mmxext, 0, 0
DECIMATE4x4 16, mmxext, 0, 0
DECIMATE4x4 15, mmxext_slowctz, 1, 0
DECIMATE4x4 16, mmxext_slowctz, 1, 0
%endif
INIT_XMM
%define DECIMATE_MASK DECIMATE_MASK_SSE2
DECIMATE4x4 15, sse2, 0, 0
DECIMATE4x4 16, sse2, 0, 0
common/x86/quant.h
@@ -54,9 +54,10 @@ void x264_dequant_4x4_flat16_mmx( int16_t dct[16], int dequant_mf[6][16], int i_
void x264_dequant_8x8_flat16_mmx( int16_t dct[64], int dequant_mf[6][64], int i_qp );
void x264_dequant_4x4_flat16_sse2( int16_t dct[16], int dequant_mf[6][16], int i_qp );
void x264_dequant_8x8_flat16_sse2( int16_t dct[64], int dequant_mf[6][64], int i_qp );
void x264_denoise_dct_mmx( dctcoef *dct, uint32_t *sum, udctcoef *offset, int size );
void x264_denoise_dct_sse2( dctcoef *dct, uint32_t *sum, udctcoef *offset, int size );
void x264_denoise_dct_mmx ( dctcoef *dct, uint32_t *sum, udctcoef *offset, int size );
void x264_denoise_dct_sse2 ( dctcoef *dct, uint32_t *sum, udctcoef *offset, int size );
void x264_denoise_dct_ssse3( dctcoef *dct, uint32_t *sum, udctcoef *offset, int size );
void x264_denoise_dct_avx ( dctcoef *dct, uint32_t *sum, udctcoef *offset, int size );
int x264_decimate_score15_mmxext( dctcoef *dct );
int x264_decimate_score15_sse2 ( dctcoef *dct );
int x264_decimate_score15_ssse3 ( dctcoef *dct );
common/x86/x86inc.asm
@@ -500,6 +500,7 @@ SECTION .note.GNU-stack noalloc noexec nowrite progbits
%endmacro
%macro INIT_MMX 0
%assign avx_enabled 0
%define RESET_MM_PERMUTATION INIT_MMX
%define mmsize 8
%define num_mmregs 8
@@ -521,6 +522,7 @@ SECTION .note.GNU-stack noalloc noexec nowrite progbits
%endmacro
%macro INIT_XMM 0
%assign avx_enabled 0
%define RESET_MM_PERMUTATION INIT_XMM
%define mmsize 16
%define num_mmregs 8
@@ -539,6 +541,12 @@ SECTION .note.GNU-stack noalloc noexec nowrite progbits
%endrep
%endmacro
%macro INIT_AVX 0
INIT_XMM
%assign avx_enabled 1
%define RESET_MM_PERMUTATION INIT_AVX
%endmacro
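
Each INIT_* now registers itself as RESET_MM_PERMUTATION, so code that reorders the logical registers can restore the default mapping without losing the avx_enabled state. An illustrative use, assuming x86inc's existing SWAP permutation macro:

    INIT_AVX
    SWAP 0, 1               ; m0 <-> m1 in the logical register mapping
    RESET_MM_PERMUTATION    ; identity mapping again, still in AVX mode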
INIT_MMX
; I often want to use macros that permute their arguments. e.g. there's no
@@ -646,3 +654,226 @@ INIT_MMX
sub %1, %2
%endif
%endmacro
;=============================================================================
; AVX abstraction layer
;=============================================================================
%define sizeofmm0 8
%define sizeofmm1 8
%define sizeofmm2 8
%define sizeofmm3 8
%define sizeofmm4 8
%define sizeofmm5 8
%define sizeofmm6 8
%define sizeofmm7 8
%define sizeofxmm0 16
%define sizeofxmm1 16
%define sizeofxmm2 16
%define sizeofxmm3 16
%define sizeofxmm4 16
%define sizeofxmm5 16
%define sizeofxmm6 16
%define sizeofxmm7 16
%define sizeofxmm8 16
%define sizeofxmm9 16
%define sizeofxmm10 16
%define sizeofxmm11 16
%define sizeofxmm12 16
%define sizeofxmm13 16
%define sizeofxmm14 16
%define sizeofxmm15 16
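
These sizeof* tables exist so RUN_AVX_INSTR can pick the right register-register move from a logical register name: m0 is %define'd to mm0 or xmm0 by INIT_MMX/INIT_XMM, so the token paste sizeof%5 resolves through one of the defines above.

    ; under INIT_XMM:  sizeof%5 -> sizeofxmm0 -> 16  (movdqa/movaps)
    ; under INIT_MMX:  sizeof%5 -> sizeofmm0  -> 8   (movq)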
;%1 == instruction
;%2 == 1 if float, 0 if int
;%3 == 0 if 3-operand (xmm, xmm, xmm), 1 if 4-operand (xmm, xmm, xmm, imm)
;%4 == number of operands given
;%5+: operands
%macro RUN_AVX_INSTR 6-7+
%if sizeof%5==8
%define %%regmov movq
%elif %2
%define %%regmov movaps
%else
%define %%regmov movdqa
%endif
%if %4>=3+%3
%ifnidn %5, %6
%if avx_enabled && sizeof%5==16
v%1 %5, %6, %7
%else
%%regmov %5, %6
%1 %5, %7
%endif
%else
%1 %5, %7
%endif
%elif %3
%1 %5, %6, %7
%else
%1 %5, %6
%endif
%endmacro
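
Two cases are worth spelling out for an integer instruction: when the destination aliases the first source the copy is elided entirely, and plain 2-operand calls pass straight through, so existing code is unaffected.

    paddw m0, m0, m2    ; -> paddw  xmm0, xmm2         (INIT_XMM)
                        ; -> vpaddw xmm0, xmm0, xmm2   (INIT_AVX)
    paddw m0, m2        ; -> paddw  xmm0, xmm2         (unchanged)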
;%1 == instruction
;%2 == 1 if float, 0 if int
;%3 == 0 if 3-operand (xmm, xmm, xmm), 1 if 4-operand (xmm, xmm, xmm, imm)
%macro AVX_INSTR 3
%macro %1 2-8 fnord, fnord, fnord, %1, %2, %3
%ifidn %3, fnord
RUN_AVX_INSTR %6, %7, %8, 2, %1, %2
%elifidn %4, fnord
RUN_AVX_INSTR %6, %7, %8, 3, %1, %2, %3
%elifidn %5, fnord
RUN_AVX_INSTR %6, %7, %8, 4, %1, %2, %3, %4
%else
RUN_AVX_INSTR %6, %7, %8, 5, %1, %2, %3, %4, %5
%endif
%endmacro
%endmacro
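
The fnord defaults are an argument-counting trick: whichever trailing parameter still compares equal to fnord was not supplied by the caller, which tells the dispatcher how many operands were written, while the instruction name and its two flags ride along as hidden parameters %6-%8. After the definitions below, both spellings are therefore legal in the same file:

    psubusw m1, [r2]        ; 2 operands: emitted as-is
    psubusw m4, m0, [r2]    ; 3 operands: mov+op on SSE, vpsubusw on AVX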
AVX_INSTR addpd, 1, 0
AVX_INSTR addps, 1, 0
AVX_INSTR addsd, 1, 0
AVX_INSTR addss, 1, 0
AVX_INSTR addsubpd, 1, 0
AVX_INSTR addsubps, 1, 0
AVX_INSTR andpd, 1, 0
AVX_INSTR andps, 1, 0
AVX_INSTR andnpd, 1, 0
AVX_INSTR andnps, 1, 0
AVX_INSTR blendpd, 1, 0
AVX_INSTR blendps, 1, 0
AVX_INSTR blendvpd, 1, 0
AVX_INSTR blendvps, 1, 0
AVX_INSTR cmppd, 1, 0
AVX_INSTR cmpps, 1, 0
AVX_INSTR cmpsd, 1, 0
AVX_INSTR cmpss, 1, 0
AVX_INSTR divpd, 1, 0
AVX_INSTR divps, 1, 0
AVX_INSTR divsd, 1, 0
AVX_INSTR divss, 1, 0
AVX_INSTR dppd, 1, 0
AVX_INSTR dpps, 1, 0
AVX_INSTR haddpd, 1, 0
AVX_INSTR haddps, 1, 0
AVX_INSTR hsubpd, 1, 0
AVX_INSTR hsubps, 1, 0
AVX_INSTR maxpd, 1, 0
AVX_INSTR maxps, 1, 0
AVX_INSTR maxsd, 1, 0
AVX_INSTR maxss, 1, 0
AVX_INSTR minpd, 1, 0
AVX_INSTR minps, 1, 0
AVX_INSTR minsd, 1, 0
AVX_INSTR minss, 1, 0
AVX_INSTR mpsadbw, 0, 1
AVX_INSTR mulpd, 1, 0
AVX_INSTR mulps, 1, 0
AVX_INSTR mulsd, 1, 0
AVX_INSTR mulss, 1, 0
AVX_INSTR orpd, 1, 0
AVX_INSTR orps, 1, 0
AVX_INSTR packsswb, 0, 0
AVX_INSTR packssdw, 0, 0
AVX_INSTR packuswb, 0, 0
AVX_INSTR packusdw, 0, 0
AVX_INSTR paddb, 0, 0
AVX_INSTR paddw, 0, 0
AVX_INSTR paddd, 0, 0
AVX_INSTR paddq, 0, 0
AVX_INSTR paddsb, 0, 0
AVX_INSTR paddsw, 0, 0
AVX_INSTR paddusb, 0, 0
AVX_INSTR paddusw, 0, 0
AVX_INSTR palignr, 0, 1
AVX_INSTR pand, 0, 0
AVX_INSTR pandn, 0, 0
AVX_INSTR pavgb, 0, 0
AVX_INSTR pavgw, 0, 0
AVX_INSTR pblendvb, 0, 0
AVX_INSTR pblendw, 0, 1
AVX_INSTR pcmpestri, 0, 0
AVX_INSTR pcmpestrm, 0, 0
AVX_INSTR pcmpistri, 0, 0
AVX_INSTR pcmpistrm, 0, 0
AVX_INSTR pcmpeqb, 0, 0
AVX_INSTR pcmpeqw, 0, 0
AVX_INSTR pcmpeqd, 0, 0
AVX_INSTR pcmpeqq, 0, 0
AVX_INSTR pcmpgtb, 0, 0
AVX_INSTR pcmpgtw, 0, 0
AVX_INSTR pcmpgtd, 0, 0
AVX_INSTR pcmpgtq, 0, 0
AVX_INSTR phaddw, 0, 0
AVX_INSTR phaddd, 0, 0
AVX_INSTR phaddsw, 0, 0
AVX_INSTR phsubw, 0, 0
AVX_INSTR phsubd, 0, 0
AVX_INSTR phsubsw, 0, 0
AVX_INSTR pmaddwd, 0, 0
AVX_INSTR pmaddubsw, 0, 0
AVX_INSTR pmaxsb, 0, 0
AVX_INSTR pmaxsw, 0, 0
AVX_INSTR pmaxsd, 0, 0
AVX_INSTR pmaxub, 0, 0
AVX_INSTR pmaxuw, 0, 0
AVX_INSTR pmaxud, 0, 0
AVX_INSTR pminsb, 0, 0
AVX_INSTR pminsw, 0, 0
AVX_INSTR pminsd, 0, 0
AVX_INSTR pminub, 0, 0
AVX_INSTR pminuw, 0, 0
AVX_INSTR pminud, 0, 0
AVX_INSTR pmulhuw, 0, 0
AVX_INSTR pmulhrsw, 0, 0
AVX_INSTR pmulhw, 0, 0
AVX_INSTR pmullw, 0, 0
AVX_INSTR pmulld, 0, 0
AVX_INSTR pmuludq, 0, 0
AVX_INSTR pmuldq, 0, 0
AVX_INSTR por, 0, 0
AVX_INSTR psadbw, 0, 0
AVX_INSTR pshufb, 0, 0
AVX_INSTR psignb, 0, 0
AVX_INSTR psignw, 0, 0
AVX_INSTR psignd, 0, 0
AVX_INSTR psllw, 0, 0
AVX_INSTR pslld, 0, 0
AVX_INSTR psllq, 0, 0
AVX_INSTR pslldq, 0, 0
AVX_INSTR psraw, 0, 0
AVX_INSTR psrad, 0, 0
AVX_INSTR psrlw, 0, 0
AVX_INSTR psrld, 0, 0
AVX_INSTR psrlq, 0, 0
AVX_INSTR psrldq, 0, 0
AVX_INSTR psubb, 0, 0
AVX_INSTR psubw, 0, 0
AVX_INSTR psubd, 0, 0
AVX_INSTR psubq, 0, 0
AVX_INSTR psubsb, 0, 0
AVX_INSTR psubsw, 0, 0
AVX_INSTR psubusb, 0, 0
AVX_INSTR psubusw, 0, 0
AVX_INSTR punpckhbw, 0, 0
AVX_INSTR punpckhwd, 0, 0
AVX_INSTR punpckhdq, 0, 0
AVX_INSTR punpckhqdq, 0, 0
AVX_INSTR punpcklbw, 0, 0
AVX_INSTR punpcklwd, 0, 0
AVX_INSTR punpckldq, 0, 0
AVX_INSTR punpcklqdq, 0, 0
AVX_INSTR pxor, 0, 0
AVX_INSTR subpd, 1, 0
AVX_INSTR subps, 1, 0
AVX_INSTR subsd, 1, 0
AVX_INSTR subss, 1, 0
AVX_INSTR unpckhpd, 1, 0
AVX_INSTR unpckhps, 1, 0
AVX_INSTR unpcklpd, 1, 0
AVX_INSTR unpcklps, 1, 0
AVX_INSTR xorpd, 1, 0
AVX_INSTR xorps, 1, 0
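
Instructions declared with 1 in the last column (mpsadbw, palignr, pblendw, ...) carry a trailing immediate; because the last parameter of RUN_AVX_INSTR is greedy, the immediate stays attached to the second source and the 4-operand form expands cleanly (illustrative):

    palignr m0, m1, m2, 5

    ; INIT_XMM:  movdqa   xmm0, xmm1
    ;            palignr  xmm0, xmm2, 5
    ; INIT_AVX:  vpalignr xmm0, xmm1, xmm2, 5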
tools/checkasm.c
@@ -165,6 +165,7 @@ static void print_bench(void)
if( k < j )
continue;
printf( "%s_%s%s: %"PRId64"\n", benchs[i].name,
b->cpu&X264_CPU_AVX ? "avx" :
b->cpu&X264_CPU_SSE4 ? "sse4" :
b->cpu&X264_CPU_SHUFFLE_IS_FAST ? "fastshuffle" :
b->cpu&X264_CPU_SSSE3 ? "ssse3" :
@@ -2020,6 +2021,8 @@ static int check_all_flags( void )
cpu1 &= ~X264_CPU_CACHELINE_64;
ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE4, "SSE4" );
}
if( x264_cpu_detect() & X264_CPU_AVX )
ret |= add_flags( &cpu0, &cpu1, X264_CPU_AVX, "AVX" );
#elif ARCH_PPC
if( x264_cpu_detect() & X264_CPU_ALTIVEC )
{
x264.h
@@ -41,7 +41,7 @@
#include "x264_config.h"
#define X264_BUILD 112
#define X264_BUILD 113
/* x264_t:
* opaque handler for encoder */
@@ -122,6 +122,9 @@ typedef struct
#define X264_CPU_FAST_NEON_MRC 0x080000 /* Transfer from NEON to ARM register is fast (Cortex-A9) */
#define X264_CPU_SLOW_CTZ 0x100000 /* BSR/BSF x86 instructions are really slow on some CPUs */
#define X264_CPU_SLOW_ATOM 0x200000 /* The Atom just sucks */
#define X264_CPU_AVX 0x400000 /* AVX support -- we don't currently use YMM registers, just
* the 3-operand capability, so we don't require OS support
* for AVX. */
/* Analyse flags
*/