Commit 74f7802b authored by Henrik Gramner
Browse files

x86: AVX-512 dequant_4x4

parent 3451ba3a
......@@ -560,6 +560,7 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
}
if( cpu&X264_CPU_AVX512 )
{
pf->dequant_4x4 = x264_dequant_4x4_avx512;
pf->coeff_last4 = x264_coeff_last4_avx512;
pf->coeff_last8 = x264_coeff_last8_avx512;
pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_avx512;
......@@ -727,6 +728,8 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
}
if( cpu&X264_CPU_AVX512 )
{
if( h->param.i_cqm_preset != X264_CQM_FLAT )
pf->dequant_4x4 = x264_dequant_4x4_avx512;
pf->coeff_last8 = x264_coeff_last8_avx512;
pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_avx512;
pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_avx512;
......
......@@ -743,6 +743,65 @@ DEQUANT 4, 4, 4
DEQUANT 8, 6, 4
%endif
;-----------------------------------------------------------------------------
; DEQUANT_START_AVX512 shift
;
; Prologue for the AVX-512 dequant code: splits the third argument (i_qp)
; into a shift amount and a byte offset into the dequant_mf table.
;
;   %1 = left shift applied to i_mf to turn it into a byte offset, i.e. log2
;        of the size in bytes of one dequant_mf[] row
;
; On exit:
;   t0d = i_qbits = i_qp / 6
;   t2d = (i_qp % 6) << %1
;   dmf = single-line %define giving the address expression of
;         dequant_mf[i_mf]; it stays defined after the macro ends
;   r0  = dct, loaded from r0mp on non-x86-64 builds
; Clobbers t1 and the flags.
;
; NOTE(review): the t0/t1/t2 -> r# mapping (DECLARE_REG_TMP) is outside this
; view. On x86-64 the code presumably relies on r0/r1 already holding the
; first two arguments in registers - confirm against the register setup.
;-----------------------------------------------------------------------------
%macro DEQUANT_START_AVX512 1
; Fetch i_qp into t2d; movifnidn only emits the mov when source and
; destination are not already the same register.
movifnidn t2d, r2m
; Division by 6 without div: 0x2b = 43 and 43/256 ~= 1/6, so (i_qp*43) >> 8
; equals i_qp / 6 exactly for 0 <= i_qp <= 130.
imul t0d, t2d, 0x2b
shr t0d, 8 ; i_qbits = i_qp / 6
; t1 = 5*i_qbits, so the two subtractions below remove 6*i_qbits from i_qp.
lea t1d, [t0*5]
sub t2d, t0d
sub t2d, t1d ; i_mf = i_qp % 6
; Scale the row index to a byte offset.
shl t2d, %1
%if ARCH_X86_64
; Table base (r1) plus row offset (t2) are combined in the addressing mode.
%define dmf r1+t2
%else
; The table pointer is added from its memory argument slot (r1mp) into r1.
; NOTE(review): this only yields base+offset if t2 aliases r1 on this
; target - presumably it does; confirm against DECLARE_REG_TMP.
%define dmf r1
add r1, r1mp ; dequant_mf[i_mf]
mov r0, r0mp ; dct
%endif
%endmacro
;-----------------------------------------------------------------------------
; void dequant_4x4( dctcoef dct[16], int dequant_mf[6][16], int i_qp )
;
; In-place dequantization of the 16 coefficients of one 4x4 block, all
; processed at once in a single zmm register (16 dwords).
;
; With i_mf = i_qp % 6 and s = i_qp / 6 - 4:
;   s >= 0:  dct[i] = ( dct[i] * dequant_mf[i_mf][i] ) << s
;   s <  0:  dct[i] = ( dct[i] * dequant_mf[i_mf][i] + (1 << (-s-1)) ) >> -s
;
; The stores show the element size: coefficients are written back as 16
; dwords when HIGH_BIT_DEPTH is set and as 16 words otherwise.
;
; mova is an aligned access, so dct and the dequant_mf rows must be aligned
; to the vector width used (presumably 64 bytes for the zmm accesses).
;
; NOTE(review): the pmaddwd-based multiplies treat each dword as a pair of
; signed 16-bit values; they equal the full product only if the dequant_mf
; entries (and, for HIGH_BIT_DEPTH, the coefficients) fit in a signed 16-bit
; range - confirm against the table generator.
;-----------------------------------------------------------------------------
INIT_ZMM avx512
; x86inc: 0 arguments loaded automatically, 3 general-purpose registers
; used; the arguments are fetched by DEQUANT_START_AVX512 instead.
cglobal dequant_4x4, 0,3
; Shift of 6: each dequant_mf row is 16 ints, assuming 4-byte int -> 64 bytes.
; Leaves t0d = i_qp / 6 and defines dmf.
DEQUANT_START_AVX512 6
; m0 = the 16 dword multipliers of row i_mf.
mova m0, [dmf]
%if HIGH_BIT_DEPTH
; 32-bit coefficients: multiply now, the product is shared by both paths.
pmaddwd m0, [r0]
%endif
; t0d = i_qp / 6 - 4; a negative value selects the rounding right-shift path.
sub t0d, 4
jl .rshift
%if HIGH_BIT_DEPTH
; Left shift every dword product by the same count and store 16 dwords.
vpbroadcastd m1, t0d
vpsllvd m0, m1
mova [r0], m0
%else
; 16-bit coefficients: narrow the multipliers to words (signed saturation),
; multiply keeping the low 16 bits, shift left, store 16 words.
vpbroadcastw ym1, t0d
vpmovsdw ym0, m0
pmullw ym0, [r0]
vpsllvw ym0, ym1
mova [r0], ym0
%endif
RET
.rshift:
%if HIGH_BIT_DEPTH == 0
; Widen the 16 word coefficients to dwords (upper word zero) so pmaddwd
; produces a full 32-bit product per coefficient before rounding.
pmovzxwd m1, [r0]
pmaddwd m0, m1
%endif
; Rounding constant: shrx masks its count to 5 bits, so the negative t0d
; acts as a shift by 32 + t0d, giving (1<<31) >> (32+t0d) = 1 << (-t0d-1).
; "i_qbits" in the comment below refers to t0d after the subtraction of 4.
; r1d may be overwritten here because [dmf] has already been loaded.
mov r1d, 1<<31
shrx r1d, r1d, t0d ; 1 << (-i_qbits-1)
; Turn t0d into the positive right-shift count.
neg t0d
vpbroadcastd m1, r1d
vpbroadcastd m2, t0d
; Add the rounding term, then arithmetic (sign-preserving) shift right.
paddd m0, m1
vpsravd m0, m2
%if HIGH_BIT_DEPTH
mova [r0], m0
%else
; Narrow the dword results to words with signed saturation while storing.
vpmovsdw [r0], m0
%endif
RET
; Drop the address alias defined by DEQUANT_START_AVX512.
%undef dmf
%macro DEQUANT_DC 2
cglobal dequant_4x4dc, 0,3,6
DEQUANT_START 6, 6
......
......@@ -66,6 +66,7 @@ void x264_dequant_8x8_xop( dctcoef dct[64], int dequant_mf[6][64], int i_qp );
void x264_dequant_4x4_avx2( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
void x264_dequant_4x4dc_avx2( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
void x264_dequant_8x8_avx2( dctcoef dct[64], int dequant_mf[6][64], int i_qp );
void x264_dequant_4x4_avx512( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
void x264_dequant_4x4_flat16_mmx( int16_t dct[16], int dequant_mf[6][16], int i_qp );
void x264_dequant_8x8_flat16_mmx( int16_t dct[64], int dequant_mf[6][64], int i_qp );
void x264_dequant_4x4_flat16_sse2( int16_t dct[16], int dequant_mf[6][16], int i_qp );
......
......@@ -2009,7 +2009,7 @@ static int check_quant( int cpu_ref, int cpu_new )
x264_quant_function_t qf_ref;
x264_quant_function_t qf_a;
ALIGNED_ARRAY_64( dctcoef, dct1,[64] );
ALIGNED_ARRAY_32( dctcoef, dct2,[64] );
ALIGNED_ARRAY_64( dctcoef, dct2,[64] );
ALIGNED_ARRAY_32( dctcoef, dct3,[8],[16] );
ALIGNED_ARRAY_32( dctcoef, dct4,[8],[16] );
ALIGNED_ARRAY_32( uint8_t, cqm_buf,[64] );
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment