Commit e9fbd8db authored by Holger Lubitz, committed by Fiona Glaser

SSE4 version of 4x4 idct

27->24 clocks on Nehalem.
This is really just an excuse to use "movsd" in a real function.
Add some comments to the sumsub-related macros in x86util.
parent 7639d496
......@@ -482,6 +482,10 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
dctf->add8x8_idct_dc = x264_add8x8_idct_dc_ssse3;
dctf->add16x16_idct_dc = x264_add16x16_idct_dc_ssse3;
}
if( cpu&X264_CPU_SSE4 )
dctf->add4x4_idct = x264_add4x4_idct_sse4;
#endif //HAVE_MMX
#ifdef ARCH_PPC
......
......@@ -27,6 +27,8 @@
%include "x86util.asm"
SECTION_RODATA
; pw_32_0: 32 in the four low words and 0 in the four high words, so adding it
; touches only the low half of a register (loaded by x264_add4x4_idct_sse4).
pw_32_0: times 4 dw 32
times 4 dw 0
pw_32: times 8 dw 32 ; eight words, each 32
pw_8000: times 8 dw 0x8000 ; eight words, each 0x8000
hsub_mul: times 8 db 1, -1 ; the byte pair (1, -1) repeated eight times
......@@ -148,6 +150,59 @@ cglobal x264_add4x4_idct_mmx, 2,2
STORE_DIFF m3, m4, m7, [r0+3*FDEC_STRIDE]
RET
;-----------------------------------------------------------------------------
; void x264_add4x4_idct_sse4( uint8_t *p_dst, int16_t dct[4][4] )
;
; r0 = p_dst: four rows of 4 pixels, FDEC_STRIDE bytes apart (read, then written)
; r1 = dct:   coefficients, fetched as two 16-byte loads (two rows per register)
; Uses m0-m5. Register annotations are written "high half/low half".
; NOTE(review): mova presumably assembles to an aligned load under INIT_XMM,
; which would require dct to be 16-byte aligned - confirm against x86inc.
;-----------------------------------------------------------------------------
INIT_XMM
cglobal x264_add4x4_idct_sse4, 2,2,6
; ---- first 1-D pass ----
mova m0, [r1+0x00] ; row1/row0
mova m2, [r1+0x10] ; row3/row2
mova m1, m0 ; row1/row0
psraw m0, 1 ; row1>>1/...
mova m3, m2 ; row3/row2
psraw m2, 1 ; row3>>1/...
; register-form movsd replaces only the low 64 bits, so the low row gets its
; unshifted value back while the high row keeps the >>1 result
movsd m0, m1 ; row1>>1/row0
movsd m2, m3 ; row3>>1/row2
psubw m0, m3 ; row1>>1-row3/row0-row2
paddw m2, m1 ; row3>>1+row1/row0+row2
SBUTTERFLY2 wd, 0, 2, 1 ; word interleave of m0 and m2, m1 is scratch
SUMSUB_BA m2, m0, m1 ; sum/difference butterfly, m1 is scratch
; ---- regroup for the second pass ----
; 10110001b swaps the two words inside each dword of the shuffled half
pshuflw m1, m2, 10110001b
pshufhw m2, m2, 10110001b
punpckldq m1, m0 ; interleave low dwords with m0
punpckhdq m2, m0 ; interleave high dwords with m0
SWAP 0, 1 ; the annotations below treat m0 as row1/row0 after this rename
; ---- second 1-D pass (same shape as the first) ----
; pw_32_0 holds 32 in the low four words only: rounding term for the final >>6
mova m1, [pw_32_0 GLOBAL]
paddw m1, m0 ; row1/row0 corrected
psraw m0, 1 ; row1>>1/...
mova m3, m2 ; row3/row2
psraw m2, 1 ; row3>>1/...
movsd m0, m1 ; row1>>1/row0
movsd m2, m3 ; row3>>1/row2
psubw m0, m3 ; row1>>1-row3/row0-row2
paddw m2, m1 ; row3>>1+row1/row0+row2
SBUTTERFLY2 qdq, 0, 2, 1 ; 64-bit interleave of m0 and m2, m1 is scratch
SUMSUB_BA m2, m0, m1 ; sum/difference butterfly, m1 is scratch
; ---- add the residual to the destination pixels ----
movd m4, [r0+FDEC_STRIDE*0] ; 4 pixels of dst row 0
movd m1, [r0+FDEC_STRIDE*1] ; 4 pixels of dst row 1
movd m3, [r0+FDEC_STRIDE*2] ; 4 pixels of dst row 2
movd m5, [r0+FDEC_STRIDE*3] ; 4 pixels of dst row 3
punpckldq m1, m4 ; row0/row1
pxor m4, m4 ; zero, used to widen bytes to words
punpckldq m3, m5 ; row3/row2
punpcklbw m1, m4 ; pixels row0/row1 as words
psraw m2, 6 ; scale the residual down
punpcklbw m3, m4 ; pixels row3/row2 as words
psraw m0, 6 ; scale the residual down
paddsw m2, m1 ; residual + pixels, signed saturating
paddsw m0, m3 ; residual + pixels, signed saturating
packuswb m0, m2 ; row0/row1/row3/row2
; dword 3 = row0, dword 2 = row1, dword 1 = row3, dword 0 = row2
; (pextrd is an SSE4.1 instruction)
pextrd [r0+FDEC_STRIDE*0], m0, 3
pextrd [r0+FDEC_STRIDE*1], m0, 2
movd [r0+FDEC_STRIDE*2], m0
pextrd [r0+FDEC_STRIDE*3], m0, 1
RET
INIT_MMX
;-----------------------------------------------------------------------------
; void x264_sub8x8_dct_mmx( int16_t dct[4][4][4], uint8_t *pix1, uint8_t *pix2 )
;-----------------------------------------------------------------------------
......
......@@ -36,6 +36,7 @@ void x264_sub8x8_dct_dc_mmxext( int16_t dct[2][2], uint8_t *pix1, uint8_t *pix2
void x264_sub8x8_dct_dc_sse2 ( int16_t dct[2][2], uint8_t *pix1, uint8_t *pix2 );
void x264_add4x4_idct_mmx ( uint8_t *p_dst, int16_t dct[ 4][4] );
void x264_add4x4_idct_sse4 ( uint8_t *p_dst, int16_t dct[ 4][4] );
void x264_add8x8_idct_mmx ( uint8_t *p_dst, int16_t dct[ 4][4][4] );
void x264_add8x8_idct_dc_mmx ( uint8_t *p_dst, int16_t dct[2][2] );
void x264_add16x16_idct_mmx ( uint8_t *p_dst, int16_t dct[16][4][4] );
......
......@@ -28,6 +28,13 @@
SWAP %3, %4
%endmacro
; SBUTTERFLY2 type, a, b, tmp
; Interleaves registers a and b with the punpckh/punpckl variant selected by
; "type" (for example wd or qdq). tmp is overwritten.
; The results are then reassigned to register names by SWAP.
; NOTE(review): which name ends up holding which half depends on the three-way
; SWAP semantics defined in the include - confirm there before relying on it.
%macro SBUTTERFLY2 4
mova m%4, m%2 ; tmp = copy of a
punpckh%1 m%2, m%3 ; a = high-half interleave of a and b
punpckl%1 m%4, m%3 ; tmp = low-half interleave of (original) a and b
SWAP %2, %4, %3 ; assemble-time register renaming, no instruction emitted by this line itself (presumably)
%endmacro
%macro TRANSPOSE4x4W 5
SBUTTERFLY wd, %1, %2, %5
SBUTTERFLY wd, %3, %4, %5
......@@ -386,10 +393,10 @@
; SUMSUBD2_AB a, b, tmp_b, tmp_a
;   a = (a>>1) - b
;   b = (b>>1) + a
; Both right-hand sides use the values a and b had on entry.
; tmp_a and tmp_b receive copies of the original a and b; they may be
; registers or memory operands.
; Each shift/add/sub appears exactly once: a second copy of the four
; arithmetic instructions would shift by two and accumulate twice, which
; contradicts the per-line annotations.
%macro SUMSUBD2_AB 4
mova %4, %1 ; keep original a
mova %3, %2 ; keep original b
psraw %2, 1 ; %2: %2>>1
psraw %1, 1 ; %1: %1>>1
paddw %2, %4 ; %2: %2>>1+%1
psubw %1, %3 ; %1: %1>>1-%2
%endmacro
%macro DCT4_1D 5
......@@ -410,14 +417,24 @@
; IDCT4_1D a, b, c, d, tmp [, tmp2]
; One 1-D inverse-transform pass over the four registers named by the first
; four arguments (given as register numbers).
; The fifth argument selects the scratch storage:
;   - a register number: it and the sixth argument are scratch registers;
;   - anything else: it is used as an address, with the 16 bytes at that
;     address and the 16 bytes after it serving as scratch, and the
;     SUMSUB_BA / SUMSUB_BADC steps are invoked without a scratch register.
; In the annotations below ">>1" applies only to the term directly before it.
%macro IDCT4_1D 5-6
%ifnum %5
SUMSUBD2_AB m%2, m%4, m%6, m%5
; %2: %2>>1-%4 %4: %2+%4>>1
SUMSUB_BA m%3, m%1, m%6
; %3: %1+%3 %1: %1-%3
SUMSUB_BADC m%4, m%3, m%2, m%1, m%6
; %4: %1+%3 + (%2+%4>>1)
; %3: %1+%3 - (%2+%4>>1)
; %2: %1-%3 + (%2>>1-%4)
; %1: %1-%3 - (%2>>1-%4)
%else
; same three steps, with memory scratch for the first one
SUMSUBD2_AB m%2, m%4, [%5], [%5+16]
SUMSUB_BA m%3, m%1
SUMSUB_BADC m%4, m%3, m%2, m%1
%endif
; rename registers so the results are in row order
SWAP %1, %4, %3
; %1: %1+%3 + (%2+%4>>1) row0
; %2: %1-%3 + (%2>>1-%4) row1
; %3: %1-%3 - (%2>>1-%4) row2
; %4: %1+%3 - (%2+%4>>1) row3
%endmacro
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment