Commit eeb9b66d authored by Henrik Gramner
Browse files

x86: dct2x4dc asm

Only used in 4:2:2. MMX2 version implemented for 8-bit, SSE2 and AVX
versions implemented for high bit-depth.

2.5x faster on 32-bit and 1.6x faster on 64-bit compared to C on Ivy Bridge.
parent 23d1d8e8
......@@ -576,6 +576,7 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
dctf->add4x4_idct = x264_add4x4_idct_sse2;
dctf->dct4x4dc = x264_dct4x4dc_sse2;
dctf->idct4x4dc = x264_idct4x4dc_sse2;
dctf->dct2x4dc = x264_dct2x4dc_sse2;
dctf->sub8x8_dct8 = x264_sub8x8_dct8_sse2;
dctf->sub16x16_dct8 = x264_sub16x16_dct8_sse2;
dctf->add8x8_idct = x264_add8x8_idct_sse2;
......@@ -597,6 +598,7 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
dctf->add4x4_idct = x264_add4x4_idct_avx;
dctf->dct4x4dc = x264_dct4x4dc_avx;
dctf->idct4x4dc = x264_idct4x4dc_avx;
dctf->dct2x4dc = x264_dct2x4dc_avx;
dctf->sub8x8_dct8 = x264_sub8x8_dct8_avx;
dctf->sub16x16_dct8 = x264_sub16x16_dct8_avx;
dctf->add8x8_idct = x264_add8x8_idct_avx;
......@@ -633,6 +635,7 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
if( cpu&X264_CPU_MMX2 )
{
dctf->dct4x4dc = x264_dct4x4dc_mmx2;
dctf->dct2x4dc = x264_dct2x4dc_mmx2;
dctf->add8x8_idct_dc = x264_add8x8_idct_dc_mmx2;
dctf->add16x16_idct_dc = x264_add16x16_idct_dc_mmx2;
}
......
......@@ -209,6 +209,78 @@ cglobal idct4x4dc, 1,1
RET
%endif ; HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; void dct2x4dc( dctcoef dct[8], dctcoef dct4x4[8][16] )
;-----------------------------------------------------------------------------
; Choose which general-purpose register the scratch alias t0 refers to.
; t0 is the only temporary GPR the dct2x4dc code below uses.
%if WIN64 == 0
    DECLARE_REG_TMP 2
%else
    DECLARE_REG_TMP 6 ; Avoid some REX prefixes to reduce code size
%endif
;-----------------------------------------------------------------------------
; INSERT_COEFF dst, src, imm
;   %1 (dst): vector register being assembled
;   %2 (src): memory operand holding the coefficient to fetch
;   %3 (imm): element index within dst that the coefficient should land in
;
; Loads one coefficient from src into element %3 of dst, then stores t0 to
; src. DCT2x4DC zeroes t0 before the first invocation, so in that use the
; source coefficient is cleared after being read.
;
; Clobbers m2 in the HIGH_BIT_DEPTH path when sse4 is not available.
;
; NOTE: the non-SSE4 paths are not independent inserts. They build the
; register up across several invocations and only produce the intended
; layout when index 0 is inserted first and index 3 last, with indices 1 and
; 2 in between (DCT2x4DC uses the order 0, 2, 1, 3).
;-----------------------------------------------------------------------------
%macro INSERT_COEFF 3 ; dst, src, imm
; Non-zero index: insert into a register that already holds element 0.
%if %3
%if HIGH_BIT_DEPTH
; 32-bit coefficients.
%if cpuflag(sse4)
; SSE4.1 has a direct dword insert.
pinsrd %1, %2, %3
%elif %3 == 2
; Index 2 is only staged in m2 here; dst is left untouched. The later
; index-3 invocation merges m2 into the upper half of dst.
movd m2, %2
%elif %3 == 1
; Interleave low dwords: the dword at src ends up in dword 1 of dst.
punpckldq %1, %2
%else
; Index 3: pair the new dword with the one staged in m2 (dwords 0/1 of m2),
; then move that pair into dwords 2/3 of dst.
punpckldq m2, %2
punpcklqdq %1, m2
%endif
%else
; 16-bit coefficients.
%if %3 == 2
; Interleave low dwords: the 32 bits at src land in words 2 and 3 of dst.
; Word 3 then holds a neighbouring value from memory until a later index-3
; insert replaces it.
punpckldq %1, %2
%else
; Direct word insert at index %3.
pinsrw %1, %2, %3
%endif
%endif
%else
; Index 0: initialise dst with a 32-bit load from src. In the 16-bit case
; this also picks up the following word, which a later index-1 insert
; overwrites.
movd %1, %2
%endif
; Write t0 back over the coefficient just read, using the coefficient width.
%if HIGH_BIT_DEPTH
mov %2, t0d
%else
mov %2, t0w
%endif
%endmacro
;-----------------------------------------------------------------------------
; DCT2x4DC elem, interleave
;   %1: element-size suffix (w or d in this file); passed to SUMSUB_BA and
;       used to form the pshufw/pshufd mnemonic
;   %2: suffix passed to SBUTTERFLY (wd or dq in this file)
;
; Emits the dct2x4dc function for the currently selected instruction set.
; It gathers the first coefficient of each of the eight 16-coefficient blocks
; addressed through r1, replaces each of them in memory with zero, transforms
; the eight values and stores the result as two vectors at r0.
; r0/r1 are presumably the dct / dct4x4 arguments of the prototype above
; (x86inc argument numbering) -- the code itself only shows stores to r0 and
; loads/stores through r1.
; SUMSUB_BA, SBUTTERFLY and q1032 are defined outside this view.
; Uses m0-m2 and the scratch register t0.
;-----------------------------------------------------------------------------
%macro DCT2x4DC 2
cglobal dct2x4dc, 2,3
; t0 = 0; INSERT_COEFF stores it over every coefficient it reads.
xor t0d, t0d
; Blocks 0..3 -> m0. Insert order 0, 2, 1, 3 is required by INSERT_COEFF,
; giving the element layout { blk0, blk2, blk1, blk3 }.
INSERT_COEFF m0, [r1+0*16*SIZEOF_DCTCOEF], 0
INSERT_COEFF m0, [r1+1*16*SIZEOF_DCTCOEF], 2
; Re-base r1 to block 4 so the remaining blocks are addressed with offsets
; -2..+3 blocks (presumably to keep displacements short -- TODO confirm).
add r1, 4*16*SIZEOF_DCTCOEF
INSERT_COEFF m0, [r1-2*16*SIZEOF_DCTCOEF], 1
INSERT_COEFF m0, [r1-1*16*SIZEOF_DCTCOEF], 3
; Blocks 4..7 -> m1 with the same layout { blk4, blk6, blk5, blk7 }.
INSERT_COEFF m1, [r1+0*16*SIZEOF_DCTCOEF], 0
INSERT_COEFF m1, [r1+1*16*SIZEOF_DCTCOEF], 2
INSERT_COEFF m1, [r1+2*16*SIZEOF_DCTCOEF], 1
INSERT_COEFF m1, [r1+3*16*SIZEOF_DCTCOEF], 3
; Three SUMSUB_BA stages separated by two SBUTTERFLY interleaves, with m2 as
; the temporary. Going by the macro names these are sum/difference
; butterflies and element interleaves -- see their definitions to confirm.
SUMSUB_BA %1, 1, 0, 2
SBUTTERFLY %2, 1, 0, 2
SUMSUB_BA %1, 0, 1, 2
SBUTTERFLY %2, 0, 1, 2
SUMSUB_BA %1, 1, 0, 2
; Reorder the elements of m0 into the output order before storing.
pshuf%1 m0, m0, q1032
; Output: m1 is the first vector, m0 the second (mmsize bytes further on).
mova [r0], m1
mova [r0+mmsize], m0
RET
%endmacro
; Instantiate dct2x4dc once per supported instruction set.
; 8-bit builds get a single MMX2 version working on 16-bit coefficients;
; high-bit-depth builds get SSE2 and AVX versions working on 32-bit ones.
%if HIGH_BIT_DEPTH == 0
    INIT_MMX mmx2
    DCT2x4DC w, wd
%else
    INIT_XMM sse2
    DCT2x4DC d, dq
    INIT_XMM avx
    DCT2x4DC d, dq
%endif
%if HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; void sub4x4_dct( dctcoef dct[4][4], pixel *pix1, pixel *pix2 )
......
......@@ -77,6 +77,10 @@ void x264_idct4x4dc_mmx ( int16_t d[16] );
void x264_idct4x4dc_sse2 ( int32_t d[16] );
void x264_idct4x4dc_avx ( int32_t d[16] );
/* Assembly implementations of dct2x4dc (used for 4:2:2 only).
 * dct receives the 8 output coefficients; dct4x4 is the array of eight
 * 16-coefficient blocks whose first coefficients are the input.
 * Note that the asm also overwrites dct4x4[i][0] with zero for every block.
 * The mmx2 version is built for 8-bit depth, sse2/avx for high bit depth. */
void x264_dct2x4dc_mmx2( dctcoef dct[8], dctcoef dct4x4[8][16] );
void x264_dct2x4dc_sse2( dctcoef dct[8], dctcoef dct4x4[8][16] );
void x264_dct2x4dc_avx ( dctcoef dct[8], dctcoef dct4x4[8][16] );
void x264_sub8x8_dct8_mmx ( int16_t dct [64], uint8_t *pix1, uint8_t *pix2 );
void x264_sub16x16_dct8_mmx ( int16_t dct[4][64], uint8_t *pix1, uint8_t *pix2 );
void x264_sub8x8_dct8_sse2 ( dctcoef dct [64], pixel *pix1, pixel *pix2 );
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment