Commit ccda1ba4 authored by Fiona Glaser

AVX2/FMA3 version of mbtree_propagate

First AVX2 function for testing.
Bump yasm version to 1.2.0 for AVX2 support.
parent 8a9608bb
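For orientation, the scalar computation that both the existing AVX/FMA4 code and the new AVX2/FMA3 path vectorize is roughly the C loop below. This is an illustrative sketch reconstructed from the assembly in this diff (the 0x3fff mask mirrors pw_3fff, the /256 mirrors pf_inv256); it is not the exact reference function from the tree.

/* Illustrative C sketch of the propagate-cost kernel (not part of the patch).
 * The SIMD versions replace the division by the intra cost with a reciprocal
 * estimate plus one Newton-Raphson step and round with cvtps2dq, so their
 * output can differ from this scalar form by a small amount. */
#include <stdint.h>

static void mbtree_propagate_cost_c_sketch( int *dst, uint16_t *propagate_in,
                                            uint16_t *intra_costs, uint16_t *inter_costs,
                                            uint16_t *inv_qscales, float *fps_factor, int len )
{
    float fps = *fps_factor / 256.f;
    for( int i = 0; i < len; i++ )
    {
        float intra = intra_costs[i];
        float inter = inter_costs[i] & 0x3fff;                  /* pw_3fff mask */
        float propagate_amount = propagate_in[i] + intra * inv_qscales[i] * fps;
        dst[i] = (int)( propagate_amount * (intra - inter) / intra + 0.5f );
    }
}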
@@ -1702,7 +1702,7 @@ cglobal mbtree_propagate_cost, 7,7,7
%if cpuflag(fma4)
cvtdq2ps xmm0, xmm0
cvtdq2ps xmm1, xmm1
-   vfmaddps xmm0, xmm0, xmm6, xmm1
+   fmaddps  xmm0, xmm0, xmm6, xmm1
cvtdq2ps xmm1, xmm2
psubd xmm2, xmm3
cvtdq2ps xmm2, xmm2
@@ -1710,7 +1710,7 @@ cglobal mbtree_propagate_cost, 7,7,7
mulps xmm1, xmm3
mulps xmm0, xmm2
addps xmm2, xmm3, xmm3
-   vfnmaddps xmm3, xmm1, xmm3, xmm2
+   fnmaddps  xmm3, xmm1, xmm3, xmm2
mulps xmm0, xmm3
%else
cvtdq2ps xmm0, xmm0
@@ -1742,14 +1742,18 @@ INIT_XMM fma4
MBTREE
%macro INT16_TO_FLOAT 1
%if cpuflag(avx2)
vpmovzxwd ymm%1, xmm%1
%else
vpunpckhwd xmm4, xmm%1, xmm7
vpunpcklwd xmm%1, xmm7
vinsertf128 ymm%1, ymm%1, xmm4, 1
%endif
vcvtdq2ps ymm%1, ymm%1
%endmacro
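In intrinsics terms, the two paths of the INT16_TO_FLOAT macro above correspond roughly to the sketch below (illustrative only, not part of the patch): AVX2 zero-extends all eight words straight into a ymm register with a single vpmovzxwd, while the AVX fallback unpacks against the zeroed xmm7 in two halves and merges them with vinsertf128 before the int-to-float conversion.

/* Illustrative C intrinsics equivalent of INT16_TO_FLOAT (not part of the patch).
 * 'v' holds eight uint16 costs in an xmm register. */
#include <immintrin.h>

static inline __m256 int16_to_float_avx2( __m128i v )
{
    return _mm256_cvtepi32_ps( _mm256_cvtepu16_epi32( v ) );      /* vpmovzxwd + vcvtdq2ps */
}

static inline __m256 int16_to_float_avx( __m128i v )
{
    __m128i zero = _mm_setzero_si128();
    __m128i lo   = _mm_unpacklo_epi16( v, zero );                 /* vpunpcklwd */
    __m128i hi   = _mm_unpackhi_epi16( v, zero );                 /* vpunpckhwd */
    __m256i dq   = _mm256_insertf128_si256( _mm256_castsi128_si256( lo ), hi, 1 );
    return _mm256_cvtepi32_ps( dq );                              /* vcvtdq2ps */
}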
; FIXME: align loads/stores to 16 bytes
INIT_YMM avx
%macro MBTREE_AVX 0
cglobal mbtree_propagate_cost, 7,7,8
add r6d, r6d
lea r0, [r0+r6*2]
@@ -1761,7 +1765,9 @@ cglobal mbtree_propagate_cost, 7,7,8
vmovdqa xmm5, [pw_3fff]
vbroadcastss ymm6, [r5]
vmulps ymm6, ymm6, [pf_inv256]
%if notcpuflag(avx2)
vpxor xmm7, xmm7
%endif
.loop:
vmovdqu xmm0, [r2+r6] ; intra
vmovdqu xmm1, [r4+r6] ; invq
@@ -1771,6 +1777,17 @@ cglobal mbtree_propagate_cost, 7,7,8
INT16_TO_FLOAT 1
INT16_TO_FLOAT 2
INT16_TO_FLOAT 3
%if cpuflag(fma3)
vmulps ymm1, ymm1, ymm0
vsubps ymm4, ymm0, ymm3
fmaddps ymm1, ymm1, ymm6, ymm2
vrcpps ymm3, ymm0
vmulps ymm2, ymm0, ymm3
vmulps ymm1, ymm1, ymm4
vaddps ymm4, ymm3, ymm3
fnmaddps ymm4, ymm2, ymm3, ymm4
vmulps ymm1, ymm1, ymm4
%else
vmulps ymm1, ymm1, ymm0
vsubps ymm4, ymm0, ymm3
vmulps ymm1, ymm1, ymm6 ; intra*invq*fps_factor>>8
@@ -1782,8 +1799,15 @@ cglobal mbtree_propagate_cost, 7,7,8
vaddps ymm3, ymm3, ymm3 ; 2 * (1/intra 1st approx)
vsubps ymm3, ymm3, ymm2 ; 2nd approximation for 1/intra
vmulps ymm1, ymm1, ymm3 ; / intra
%endif
vcvtps2dq ymm1, ymm1
vmovdqu [r0+r6*2], ymm1
add r6, 16
jl .loop
RET
%endmacro
INIT_YMM avx
MBTREE_AVX
INIT_YMM avx2,fma3
MBTREE_AVX
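The "1/intra 1st approx" and "2nd approximation" comments above refer to replacing the division by the intra cost with vrcpps (a rough reciprocal estimate) followed by one Newton-Raphson refinement, r1 = r0*(2 - a*r0); the FMA3 path folds the final multiply-subtract into a single fnmaddps. A hedged intrinsics sketch of that step (illustrative, not part of the patch):

/* One Newton-Raphson refinement of vrcpps, standing in for a real division:
 * r1 = 2*r0 - a*r0*r0.  Illustrative sketch only. */
#include <immintrin.h>

static inline __m256 approx_recip( __m256 a )
{
    __m256 r0 = _mm256_rcp_ps( a );                       /* vrcpps: rough 1/a estimate */
    __m256 ar = _mm256_mul_ps( a, r0 );                   /* a * r0                     */
    __m256 r2 = _mm256_add_ps( r0, r0 );                  /* 2 * r0                     */
#ifdef __FMA__
    return _mm256_fnmadd_ps( ar, r0, r2 );                /* 2*r0 - (a*r0)*r0, one FMA  */
#else
    return _mm256_sub_ps( r2, _mm256_mul_ps( ar, r0 ) );  /* same value without FMA     */
#endif
}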
@@ -139,6 +139,8 @@ void x264_mbtree_propagate_cost_avx ( int *dst, uint16_t *propagate_in, uint16_t
uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
void x264_mbtree_propagate_cost_fma4( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
void x264_mbtree_propagate_cost_avx2_fma3( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
#define MC_CHROMA(cpu)\
void x264_mc_chroma_##cpu( pixel *dstu, pixel *dstv, intptr_t i_dst, pixel *src, intptr_t i_src,\
@@ -754,7 +756,12 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
        return;
    pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_avx;
-   if( !(cpu&X264_CPU_FMA4) )
+   if( cpu&X264_CPU_FMA4 )
+       pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_fma4;
+   if( !(cpu&X264_CPU_AVX2) )
        return;
-   pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_fma4;
+   if( cpu&X264_CPU_FMA3 )
+       pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_avx2_fma3;
}
@@ -1361,3 +1361,45 @@ FMA_INSTR pmadcswd, pmaddwd, paddd
; tzcnt is equivalent to "rep bsf" and is backwards-compatible with bsf.
; This lets us use tzcnt without bumping the yasm version requirement yet.
%define tzcnt rep bsf
; convert FMA4 to FMA3 if possible
%macro FMA4_INSTR 4
%macro %1 4-8 %1, %2, %3, %4
%if cpuflag(fma4)
v%5 %1, %2, %3, %4
%elifidn %1, %2
v%6 %1, %4, %3 ; %1 = %1 * %3 + %4
%elifidn %1, %3
v%7 %1, %2, %4 ; %1 = %2 * %1 + %4
%elifidn %1, %4
v%8 %1, %2, %3 ; %1 = %2 * %3 + %1
%else
%error fma3 emulation of ``%5 %1, %2, %3, %4'' is not supported
%endif
%endmacro
%endmacro
FMA4_INSTR fmaddpd, fmadd132pd, fmadd213pd, fmadd231pd
FMA4_INSTR fmaddps, fmadd132ps, fmadd213ps, fmadd231ps
FMA4_INSTR fmaddsd, fmadd132sd, fmadd213sd, fmadd231sd
FMA4_INSTR fmaddss, fmadd132ss, fmadd213ss, fmadd231ss
FMA4_INSTR fmaddsubpd, fmaddsub132pd, fmaddsub213pd, fmaddsub231pd
FMA4_INSTR fmaddsubps, fmaddsub132ps, fmaddsub213ps, fmaddsub231ps
FMA4_INSTR fmsubaddpd, fmsubadd132pd, fmsubadd213pd, fmsubadd231pd
FMA4_INSTR fmsubaddps, fmsubadd132ps, fmsubadd213ps, fmsubadd231ps
FMA4_INSTR fmsubpd, fmsub132pd, fmsub213pd, fmsub231pd
FMA4_INSTR fmsubps, fmsub132ps, fmsub213ps, fmsub231ps
FMA4_INSTR fmsubsd, fmsub132sd, fmsub213sd, fmsub231sd
FMA4_INSTR fmsubss, fmsub132ss, fmsub213ss, fmsub231ss
FMA4_INSTR fnmaddpd, fnmadd132pd, fnmadd213pd, fnmadd231pd
FMA4_INSTR fnmaddps, fnmadd132ps, fnmadd213ps, fnmadd231ps
FMA4_INSTR fnmaddsd, fnmadd132sd, fnmadd213sd, fnmadd231sd
FMA4_INSTR fnmaddss, fnmadd132ss, fnmadd213ss, fnmadd231ss
FMA4_INSTR fnmsubpd, fnmsub132pd, fnmsub213pd, fnmsub231pd
FMA4_INSTR fnmsubps, fnmsub132ps, fnmsub213ps, fnmsub231ps
FMA4_INSTR fnmsubsd, fnmsub132sd, fnmsub213sd, fnmsub231sd
FMA4_INSTR fnmsubss, fnmsub132ss, fnmsub213ss, fnmsub231ss
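The mapping works because the FMA3 132/213/231 forms all compute the same ±(a*b)+c value and differ only in which source register the result overwrites, so a 4-operand FMA4 op can only be translated when its destination aliases one of its sources. For example, the new loop's fnmaddps ymm4, ymm2, ymm3, ymm4 assembles to vfnmaddps on FMA4 hardware and to vfnmadd231ps ymm4, ymm2, ymm3 under AVX2/FMA3. A small C intrinsics illustration of the fused operations themselves (not part of the patch; build with -mfma, the compiler picks whichever 132/213/231 form suits its register allocation):

/* Fused multiply-add semantics used above: fmadd = a*b + c, fnmadd = -(a*b) + c.
 * Illustrative only, not part of the patch. */
#include <immintrin.h>
#include <stdio.h>

int main( void )
{
    __m256 a = _mm256_set1_ps( 3.0f );
    __m256 b = _mm256_set1_ps( 5.0f );
    __m256 c = _mm256_set1_ps( 7.0f );
    float fma[8], fnma[8];
    _mm256_storeu_ps( fma,  _mm256_fmadd_ps( a, b, c ) );   /* 3*5 + 7 = 22    */
    _mm256_storeu_ps( fnma, _mm256_fnmadd_ps( a, b, c ) );  /* -(3*5) + 7 = -8 */
    printf( "fmadd: %g  fnmadd: %g\n", fma[0], fnma[0] );
    return 0;
}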
@@ -687,10 +687,10 @@ if [ $shared = yes -a \( $ARCH = "X86_64" -o $ARCH = "PPC" -o $ARCH = "ALPHA" -o
fi
if [ $asm = auto -a \( $ARCH = X86 -o $ARCH = X86_64 \) ] ; then
-   if ! as_check "vpperm xmm0, xmm0, xmm0, xmm0" ; then
+   if ! as_check "vpmovzxwd ymm0, xmm0" ; then
VER=`($AS --version || echo no assembler) 2>/dev/null | head -n 1`
echo "Found $VER"
echo "Minimum version is yasm-1.0.0"
echo "Minimum version is yasm-1.2.0"
echo "If you really want to compile without asm, configure with --disable-asm."
exit 1
fi
@@ -164,6 +164,7 @@ static void print_bench(void)
if( k < j )
continue;
printf( "%s_%s%s: %"PRId64"\n", benchs[i].name,
b->cpu&X264_CPU_AVX2 && b->cpu&X264_CPU_FMA3 ? "avx2_fma3" :
b->cpu&X264_CPU_AVX2 ? "avx2" :
b->cpu&X264_CPU_FMA3 ? "fma3" :
b->cpu&X264_CPU_FMA4 ? "fma4" :
@@ -2444,11 +2445,6 @@ static int check_all_flags( void )
ret |= add_flags( &cpu0, &cpu1, X264_CPU_FMA4, "FMA4" );
cpu1 &= ~X264_CPU_FMA4;
}
-   if( x264_cpu_detect() & X264_CPU_FMA3 )
-   {
-       ret |= add_flags( &cpu0, &cpu1, X264_CPU_FMA3, "FMA3" );
-       cpu1 &= ~X264_CPU_FMA3;
-   }
if( x264_cpu_detect() & X264_CPU_BMI1 )
{
ret |= add_flags( &cpu0, &cpu1, X264_CPU_BMI1, "BMI1" );
@@ -2466,6 +2462,11 @@ static int check_all_flags( void )
}
if( x264_cpu_detect() & X264_CPU_AVX2 )
ret |= add_flags( &cpu0, &cpu1, X264_CPU_AVX2, "AVX2" );
+   if( x264_cpu_detect() & X264_CPU_FMA3 )
+   {
+       ret |= add_flags( &cpu0, &cpu1, X264_CPU_FMA3, "FMA3" );
+       cpu1 &= ~X264_CPU_FMA3;
+   }
#elif ARCH_PPC
if( x264_cpu_detect() & X264_CPU_ALTIVEC )
{