Commit 75f6f9b2 authored by Henrik Gramner
Browse files

x86: AVX-512 coeff_last

parent c3a1d1d8
......@@ -558,6 +558,14 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
pf->denoise_dct = x264_denoise_dct_avx2;
pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_avx2;
}
/* AVX-512 coeff_last routines for the HIGH_BIT_DEPTH build.
 * This block runs after the AVX2 assignments, so when X264_CPU_AVX512 is set
 * the pointers stored here replace any installed earlier (for example
 * coeff_last[DCT_LUMA_8x8], which the AVX2 block also sets). */
if( cpu&X264_CPU_AVX512 )
{
pf->coeff_last4 = x264_coeff_last4_avx512;
pf->coeff_last8 = x264_coeff_last8_avx512;
pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_avx512;
pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_avx512;
pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_avx512;
}
#endif // HAVE_MMX
#else // !HIGH_BIT_DEPTH
#if HAVE_MMX
......@@ -717,6 +725,13 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_avx2;
pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_avx2;
}
/* AVX-512 coeff_last routines.
 * NOTE(review): this hunk appears to sit in the !HIGH_BIT_DEPTH branch of
 * x264_quant_init -- confirm against the full file.
 * Placed after the AVX2 block, so these pointers take precedence when
 * X264_CPU_AVX512 is set. Only coeff_last entries are written here;
 * coeff_last4 and the coeff_level_run pointers keep whatever value was
 * assigned before this block. */
if( cpu&X264_CPU_AVX512 )
{
pf->coeff_last8 = x264_coeff_last8_avx512;
pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_avx512;
pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_avx512;
pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_avx512;
}
#endif // HAVE_MMX
#if HAVE_ALTIVEC
......
......@@ -1756,6 +1756,70 @@ cglobal coeff_last64, 1,3
RET
%endif
;-----------------------------------------------------------------------------
; COEFF_LAST_AVX512 num, w/d
;   Emits the function coeff_last%1, which returns in eax the index of the
;   last (highest-indexed) nonzero coefficient of the block pointed to by r0.
;     %1 = coefficient count used in the function name
;     %2 = element-size suffix for vptestm: w (16-bit) or d (32-bit)
;   The width of m0 is whatever INIT_XMM/INIT_YMM/INIT_ZMM is in effect where
;   the macro is instantiated; a single vector load covers the whole block.
;   NOTE(review): if every tested element is zero, lzcnt returns 32 and the
;   result is not a valid index -- callers presumably never pass such a
;   block; confirm.
;-----------------------------------------------------------------------------
%macro COEFF_LAST_AVX512 2 ; num, w/d
; x86inc prologue: 1 argument (dct pointer in r0), 2 general-purpose registers
cglobal coeff_last%1, 1,2
; For an odd count (15) the load starts one coefficient before r0, so vector
; element 0 is dct[-1] and dct[i] lands in element i+1.
mova m0, [r0-(%1&1)*SIZEOF_DCTCOEF]
; k0 bit i is set where element i is nonzero (element AND element != 0)
vptestm%2 k0, m0, m0
%if %1 == 15
; eax = 30 - lzcnt(mask) = (31 - lzcnt(mask)) - 1:
; position of the highest set mask bit, minus the one-element load offset.
mov eax, 30
kmovw r1d, k0
lzcnt r1d, r1d
sub eax, r1d
%else
; eax = lzcnt(mask) ^ 31 = 31 - lzcnt(mask): position of the highest set bit
kmovw eax, k0
lzcnt eax, eax
xor eax, 31
%endif
RET
%endmacro
;-----------------------------------------------------------------------------
; COEFF_LAST64_AVX512 w/d
;   Emits the function coeff_last64, which returns in eax the index of the
;   last (highest-indexed) nonzero coefficient of the 64 coefficients at r0.
;     %1 = element-size suffix for vpcmp: w (16-bit) or d (32-bit)
;   Each 64-byte compare produces one chunk of a nonzero mask; the chunks are
;   concatenated so that mask bit i corresponds to coefficient i, and the
;   highest set bit is located with lzcnt.
;   NOTE(review): the result for an all-zero block is not a valid index;
;   callers presumably never pass one -- confirm.
;-----------------------------------------------------------------------------
%macro COEFF_LAST64_AVX512 1 ; w/d
; x86inc prologue: 1 argument (dct pointer in r0), 2 general-purpose registers
cglobal coeff_last64, 1,2
; m0 = 0, the reference value for the compares below.
; NOTE(review): relies on the xmm-width write clearing the upper part of the
; register (VEX-encoded behaviour) -- confirm.
pxor xm0, xm0
; predicate 4 = not-equal: mask bit set where the coefficient differs from 0
vpcmp%1 k0, m0, [r0+0*64], 4
vpcmp%1 k1, m0, [r0+1*64], 4
%if HIGH_BIT_DEPTH
; The remaining two 64-byte chunks are compared as well in this build, and the
; four results are packed pairwise into two 32-bit masks. The first source
; operand of kunpckwd supplies the upper 16 bits: k0 = k1:k0, k1 = k3:k2.
vpcmp%1 k2, m0, [r0+2*64], 4
vpcmp%1 k3, m0, [r0+3*64], 4
kunpckwd k0, k1, k0
kunpckwd k1, k3, k2
%endif
%if ARCH_X86_64
; Join both halves into one 64-bit mask (k1 in the upper 32 bits), then
; eax = lzcnt(mask) ^ 63 = 63 - lzcnt(mask).
kunpckdq k0, k1, k0
kmovq rax, k0
lzcnt rax, rax
xor eax, 63
%else
; No 64-bit GPR: scan the two 32-bit halves separately.
; r1d = lzcnt(high) ^ 32 is zero (ZF set) only when the high half is empty;
; otherwise it equals 32 + lzcnt(high) and replaces eax via cmovnz.
; The final xor with 31 yields 31 - lzcnt(low) or 63 - lzcnt(high).
kmovd r1d, k1
kmovd eax, k0
lzcnt r1d, r1d
lzcnt eax, eax
xor r1d, 32
cmovnz eax, r1d
xor eax, 31
%endif
RET
%endmacro
; Instantiate the AVX-512 coeff_last functions. Each INIT_* line selects the
; vector width that m0 refers to inside the macros instantiated after it.
%if HIGH_BIT_DEPTH
; dword elements (d): 4 fit in an XMM, 8 in a YMM and 16 in a ZMM register
INIT_XMM avx512
COEFF_LAST_AVX512 4, d
INIT_YMM avx512
COEFF_LAST_AVX512 8, d
INIT_ZMM avx512
; coeff_last15 loads 16 elements starting one coefficient before its pointer
COEFF_LAST_AVX512 15, d
COEFF_LAST_AVX512 16, d
COEFF_LAST64_AVX512 d
%else ; !HIGH_BIT_DEPTH
; word elements (w): 8 fit in an XMM and 16 in a YMM register;
; no coeff_last4 is instantiated in this configuration
INIT_XMM avx512
COEFF_LAST_AVX512 8, w
INIT_YMM avx512
; coeff_last15 loads 16 elements starting one coefficient before its pointer
COEFF_LAST_AVX512 15, w
COEFF_LAST_AVX512 16, w
INIT_ZMM avx512
COEFF_LAST64_AVX512 w
%endif ; !HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; int coeff_level_run( dctcoef *dct, run_level_t *runlevel )
;-----------------------------------------------------------------------------
......
......@@ -110,6 +110,11 @@ int x264_coeff_last15_lzcnt( dctcoef *dct );
int x264_coeff_last16_lzcnt( dctcoef *dct );
int x264_coeff_last64_lzcnt( dctcoef *dct );
int x264_coeff_last64_avx2 ( dctcoef *dct );
/* AVX-512 coeff_last variants. Each takes a pointer to a coefficient block
 * and returns an int.
 * NOTE(review): coeff_last4 is declared with int32_t* instead of dctcoef*;
 * the assembly only instantiates it under HIGH_BIT_DEPTH, where dctcoef is
 * presumably int32_t -- confirm. */
int x264_coeff_last4_avx512( int32_t *dct );
int x264_coeff_last8_avx512( dctcoef *dct );
int x264_coeff_last15_avx512( dctcoef *dct );
int x264_coeff_last16_avx512( dctcoef *dct );
int x264_coeff_last64_avx512( dctcoef *dct );
int x264_coeff_level_run16_mmx2( dctcoef *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run16_sse2( dctcoef *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run16_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
......
......@@ -2008,7 +2008,7 @@ static int check_quant( int cpu_ref, int cpu_new )
x264_quant_function_t qf_c;
x264_quant_function_t qf_ref;
x264_quant_function_t qf_a;
ALIGNED_ARRAY_32( dctcoef, dct1,[64] );
ALIGNED_ARRAY_64( dctcoef, dct1,[64] );
ALIGNED_ARRAY_32( dctcoef, dct2,[64] );
ALIGNED_ARRAY_32( dctcoef, dct3,[8],[16] );
ALIGNED_ARRAY_32( dctcoef, dct4,[8],[16] );
......@@ -2631,7 +2631,7 @@ static int check_cabac( int cpu_ref, int cpu_new )
{\
for( int j = 0; j < 256; j++ )\
{\
ALIGNED_ARRAY_32( dctcoef, dct, [2],[64] );\
ALIGNED_ARRAY_64( dctcoef, dct, [2],[64] );\
uint8_t bitstream[2][1<<16];\
static const uint8_t ctx_ac[14] = {0,1,0,0,1,0,0,1,0,0,0,1,0,0};\
int ac = ctx_ac[ctx_block_cat];\
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment