Commit 23d1d8e8 authored by Henrik Gramner
Browse files

x86: SSE2/AVX idct_dequant_2x4_(dc|dconly)

Only used in 4:2:2. Both 8-bit and high bit-depth implemented.

Approximate performance improvement compared to C on Ivy Bridge:

                         x86-32  x86-64
idct_dequant_2x4_dc      2.1x    1.7x
idct_dequant_2x4_dconly  2.7x    2.0x

Helps more on 32-bit due to the C versions being register starved.
parent dbbf1dd2
......@@ -486,6 +486,8 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
pf->dequant_4x4 = x264_dequant_4x4_sse2;
pf->dequant_8x8 = x264_dequant_8x8_sse2;
pf->dequant_4x4_dc = x264_dequant_4x4dc_sse2;
pf->idct_dequant_2x4_dc = x264_idct_dequant_2x4_dc_sse2;
pf->idct_dequant_2x4_dconly = x264_idct_dequant_2x4_dconly_sse2;
pf->denoise_dct = x264_denoise_dct_sse2;
pf->decimate_score15 = x264_decimate_score15_sse2;
pf->decimate_score16 = x264_decimate_score16_sse2;
......@@ -532,6 +534,8 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
}
if( cpu&X264_CPU_AVX )
{
pf->idct_dequant_2x4_dc = x264_idct_dequant_2x4_dc_avx;
pf->idct_dequant_2x4_dconly = x264_idct_dequant_2x4_dconly_avx;
pf->denoise_dct = x264_denoise_dct_avx;
}
if( cpu&X264_CPU_XOP )
......@@ -618,6 +622,8 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
pf->dequant_4x4 = x264_dequant_4x4_flat16_sse2;
pf->dequant_8x8 = x264_dequant_8x8_flat16_sse2;
}
pf->idct_dequant_2x4_dc = x264_idct_dequant_2x4_dc_sse2;
pf->idct_dequant_2x4_dconly = x264_idct_dequant_2x4_dconly_sse2;
pf->optimize_chroma_2x2_dc = x264_optimize_chroma_2x2_dc_sse2;
pf->denoise_dct = x264_denoise_dct_sse2;
pf->decimate_score15 = x264_decimate_score15_sse2;
......@@ -680,6 +686,8 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
pf->dequant_4x4 = x264_dequant_4x4_avx;
pf->dequant_8x8 = x264_dequant_8x8_avx;
}
pf->idct_dequant_2x4_dc = x264_idct_dequant_2x4_dc_avx;
pf->idct_dequant_2x4_dconly = x264_idct_dequant_2x4_dconly_avx;
pf->optimize_chroma_2x2_dc = x264_optimize_chroma_2x2_dc_avx;
pf->denoise_dct = x264_denoise_dct_avx;
}
......
......@@ -829,6 +829,150 @@ INIT_YMM avx2
DEQUANT_DC w, pmullw
%endif
; PEXTRW dst, src, idx, tmp
;   %1 = 16-bit memory destination
;   %2 = source vector register
;   %3 = constant index of the word to extract
;   %4 = scratch GPR, only written when SSE4 is not enabled
; Stores word %3 of %2 to %1.
%macro PEXTRW 4
%if cpuflag(sse4)
    pextrw %1, %2, %3
%elif %3 == 0
    ; Without SSE4.1 pextrw cannot target memory, so bounce through the GPR.
    ; Word 0 is the low half of the low dword, hence a plain movd suffices.
    movd %4d, %2
    mov %1, %4w
%else
    ; Without SSE4.1 pextrw cannot target memory, so bounce through the GPR.
    pextrw %4d, %2, %3
    mov %1, %4w
%endif
%endmacro
;-----------------------------------------------------------------------------
; void idct_dequant_2x4_dc( dctcoef dct[8], dctcoef dct4x4[8][16], int dequant_mf[6][16], int i_qp )
; void idct_dequant_2x4_dconly( dctcoef dct[8], int dequant_mf[6][16], int i_qp )
;-----------------------------------------------------------------------------
; DEQUANT_2x4_DC <dc|dconly>
;   Emits idct_dequant_2x4_%1 for the instruction set currently selected.
;
;   Both variants read the eight coefficients at dct, run them through three
;   butterfly stages and scale the result by dmf = dequant_mf[i_qp%6][0]:
;     i_qp/6 >= 6:  out = x * (dmf << (i_qp/6 - 6))
;     i_qp/6 <  6:  out = (x * dmf + (1 << (5 - i_qp/6))) >> (6 - i_qp/6)
;   dconly: the eight results are written back to dct[0..7].
;   dc:     the eight results are scattered to dct4x4[0..7][0].
;
;   Temporaries: t0 = i_qp/6, later the shift count
;                t1 = scratch for the modulo computation
;                t2 = i_qp -> byte offset of row i_qp%6 -> packed multiplier
;   Vector registers m0-m4 are used.
%macro DEQUANT_2x4_DC 1
; NOTE(review): the x86-64 temp numbering differs per variant, presumably so
; that t2 lands on the register that already holds the qp argument (which is
; one position later in the dc variant) - confirm against x86inc.
%ifidn %1, dconly
DECLARE_REG_TMP 6,3,2
%define %%args dct, dmf, qp
%else
DECLARE_REG_TMP 6,4,3
%define %%args dct, dct4x4, dmf, qp
%endif
; 32-bit builds override the mapping with the three low GPRs.
%if ARCH_X86_64 == 0
DECLARE_REG_TMP 2,0,1
%endif
; NOTE(review): "0,3,5" is presumably x86inc's args-to-load, GPR count and
; vector register count; no argument is auto-loaded, the code fetches them
; itself (qpm / dctmp / dmfmp / dct4x4mp below).
cglobal idct_dequant_2x4_%1, 0,3,5, %%args
movifnidn t2d, qpm
; Division by 6 via multiply-and-shift: (qp * 43) >> 8.
imul t0d, t2d, 0x2b
shr t0d, 8 ; qp / 6
; qp - (qp/6) - 5*(qp/6) = qp - 6*(qp/6)
lea t1d, [t0*5]
sub t2d, t0d
sub t2d, t1d ; qp % 6
shl t2d, 6 ; 16 * sizeof(int)
; Load dequant_mf[qp%6][0] and multiply by -0xffff = 1 - 0x10000, which leaves
; dmf in the low word and -dmf in the high word of t2d.
%if ARCH_X86_64
imul t2d, [dmfq+t2], -0xffff ; (-dmf) << 16 | dmf
%else
; No arguments were loaded by cglobal, so fetch dct and dmf explicitly here.
mov dctq, dctmp
add t2, dmfmp
imul t2d, [t2], -0xffff
%endif
; First butterfly stage between the two halves of dct[]. SUMSUB_BA is defined
; outside this macro; presumably it leaves sum and difference in its two
; register operands, using the third as scratch.
%if HIGH_BIT_DEPTH
; 32-bit coefficients: two full registers, packed down to words afterwards.
mova m0, [dctq]
mova m1, [dctq+16]
SUMSUB_BA d, 1, 0, 2 ; 16-bit intermediate precision is enough for the first two sumsub steps,
packssdw m1, m0 ; and by packing to words we can use pmaddwd instead of pmulld later.
%else
; 16-bit coefficients: four words per half, recombined into one register.
movq m0, [dctq]
movq m1, [dctq+8]
SUMSUB_BA w, 1, 0, 2
punpcklqdq m1, m0 ; a0 a1 a2 a3 a4 a5 a6 a7
%endif
; Second butterfly stage, against a copy with adjacent dwords swapped.
pshufd m0, m1, q2301 ; a2 a3 a0 a1 a6 a7 a4 a5
; Build the multiplier vector: words (dmf, dmf, dmf, -dmf) in each qword.
movd m3, t2d
pshuflw m3, m3, q1000 ; + + + -
SUMSUB_BA w, 0, 1, 2
punpcklqdq m3, m3 ; + + + - + + + -
; Replicate dwords 2 and 0 of m1 so that both registers have the layout the
; pmaddwd below expects.
pshufd m1, m1, q0022
; Select the shift direction from qp/6 - 6; t0d holds that difference.
sub t0d, 6
jl .rshift
; Left shift: fold the shift into the multiplier. pmaddwd then multiplies each
; word by +-dmf and adds adjacent products, i.e. it performs the last
; butterfly stage and the dequant multiply in one instruction.
movd m2, t0d
psllw m3, m2
pmaddwd m0, m3
pmaddwd m1, m3
jmp .end
.rshift:
; Right shift by s = 6 - qp/6 with rounding.
neg t0d
movd m2, t0d
; m4 = -1, shifted to -(1 << s) and then halved to -(1 << (s-1)).
pcmpeqd m4, m4
pmaddwd m0, m3
pmaddwd m1, m3
pslld m4, m2
psrad m4, 1
psubd m0, m4 ; subtracting the negative bias adds the rounding term 1 << (5 - qp/6)
psubd m1, m4
psrad m0, m2
psrad m1, m2
.end:
; m0 and m1 now hold the eight results as dwords.
%ifidn %1, dconly
; dconly: write the results back over the input coefficients.
%if HIGH_BIT_DEPTH
mova [dctq], m0
mova [dctq+16], m1
%else
; Narrow to words with signed saturation before storing.
packssdw m0, m1
mova [dctq], m0
%endif
%else
; dc: store result i to the first coefficient of dct4x4[i]. The row stride is
; 16 coefficients, i.e. 64 bytes in high bit-depth and 32 bytes otherwise.
movifnidn dct4x4q, dct4x4mp
%if HIGH_BIT_DEPTH
movd [dct4x4q+0*64], m0
%if cpuflag(sse4)
; NOTE(review): the pointer is advanced by four rows part-way through,
; presumably to keep more displacements encodable as a signed byte.
pextrd [dct4x4q+1*64], m0, 1
add dct4x4q, 4*64
pextrd [dct4x4q-2*64], m0, 2
pextrd [dct4x4q-1*64], m0, 3
movd [dct4x4q+0*64], m1
pextrd [dct4x4q+1*64], m1, 1
pextrd [dct4x4q+2*64], m1, 2
pextrd [dct4x4q+3*64], m1, 3
%else
; No pextrd available: bring each dword down to lane 0 with MOVHL (defined
; outside this macro, presumably high qword -> low qword) and psrlq by 32,
; then store it with movd.
MOVHL m2, m0
psrlq m0, 32
movd [dct4x4q+1*64], m0
add dct4x4q, 4*64
movd [dct4x4q-2*64], m2
psrlq m2, 32
movd [dct4x4q-1*64], m2
movd [dct4x4q+0*64], m1
MOVHL m2, m1
psrlq m1, 32
movd [dct4x4q+1*64], m1
movd [dct4x4q+2*64], m2
psrlq m2, 32
movd [dct4x4q+3*64], m2
%endif
%else
; 16-bit coefficients: store the low word of each dword result (word indices
; 0, 2, 4, 6). eax is the scratch register for PEXTRW's non-SSE4 fallback.
PEXTRW [dct4x4q+0*32], m0, 0, eax
PEXTRW [dct4x4q+1*32], m0, 2, eax
PEXTRW [dct4x4q+2*32], m0, 4, eax
PEXTRW [dct4x4q+3*32], m0, 6, eax
add dct4x4q, 4*32
PEXTRW [dct4x4q+0*32], m1, 0, eax
PEXTRW [dct4x4q+1*32], m1, 2, eax
PEXTRW [dct4x4q+2*32], m1, 4, eax
PEXTRW [dct4x4q+3*32], m1, 6, eax
%endif
%endif
RET
%endmacro
; sse4 reduces code size compared to sse2 but isn't any faster, so just go with sse2+avx
; Instantiate both variants (dc and dconly) once per instruction set. The
; sse4-only paths inside DEQUANT_2x4_DC and PEXTRW are enabled only where the
; selected instruction set makes cpuflag(sse4) true.
INIT_XMM sse2
DEQUANT_2x4_DC dc
DEQUANT_2x4_DC dconly
INIT_XMM avx
DEQUANT_2x4_DC dc
DEQUANT_2x4_DC dconly
; t4 is eax for return value.
%if ARCH_X86_64
DECLARE_REG_TMP 0,1,2,3,6,4 ; Identical for both Windows and *NIX
......
......@@ -72,6 +72,10 @@ void x264_dequant_4x4_flat16_sse2( int16_t dct[16], int dequant_mf[6][16], int i
void x264_dequant_8x8_flat16_sse2( int16_t dct[64], int dequant_mf[6][64], int i_qp );
void x264_dequant_4x4_flat16_avx2( int16_t dct[16], int dequant_mf[6][16], int i_qp );
void x264_dequant_8x8_flat16_avx2( int16_t dct[64], int dequant_mf[6][64], int i_qp );
/* SSE2/AVX versions of the 2x4 DC idct+dequant pair (only used in 4:2:2).
 * The dc variant takes an additional dct4x4 array; dconly takes only
 * dct, the dequant table and the qp. */
void x264_idct_dequant_2x4_dc_sse2( dctcoef dct[8], dctcoef dct4x4[8][16], int dequant_mf[6][16], int i_qp );
void x264_idct_dequant_2x4_dc_avx ( dctcoef dct[8], dctcoef dct4x4[8][16], int dequant_mf[6][16], int i_qp );
void x264_idct_dequant_2x4_dconly_sse2( dctcoef dct[8], int dequant_mf[6][16], int i_qp );
void x264_idct_dequant_2x4_dconly_avx ( dctcoef dct[8], int dequant_mf[6][16], int i_qp );
int x264_optimize_chroma_2x2_dc_sse2( dctcoef dct[4], int dequant_mf );
int x264_optimize_chroma_2x2_dc_ssse3( dctcoef dct[4], int dequant_mf );
int x264_optimize_chroma_2x2_dc_sse4( dctcoef dct[4], int dequant_mf );
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment