Commit 539103a5 authored by Loren Merritt

dct4 sse2

parent 9168abfa
@@ -420,6 +420,13 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
dctf->add8x8_idct8 = x264_add8x8_idct8_sse2;
dctf->add16x16_idct8 = x264_add16x16_idct8_sse2;
}
if( (cpu&X264_CPU_SSE2) && !(cpu&X264_CPU_3DNOW) )
{
dctf->sub8x8_dct = x264_sub8x8_dct_sse2;
dctf->sub16x16_dct = x264_sub16x16_dct_sse2;
dctf->add8x8_idct = x264_add8x8_idct_sse2;
dctf->add16x16_idct = x264_add16x16_idct_sse2;
}
#endif //HAVE_MMX
#ifdef ARCH_PPC
......
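For reference, the four pointers added above (x264_sub8x8_dct_sse2 and friends) compute the same per-4x4-block transform as the existing MMX and C paths. Below is a plain-C sketch of that 4x4 forward butterfly; the _ref name and the stride parameters are this sketch's own (the real functions use the fixed FENC_STRIDE/FDEC_STRIDE), it is not code from this commit.

#include <stdint.h>

/* 4x4 forward transform sketch: compute the residual, run the DCT4_1D
 * butterfly on rows, then the same butterfly on columns (the asm does a
 * transpose between the two passes instead of indexing columns). */
static void sub4x4_dct_ref( int16_t dct[4][4],
                            const uint8_t *pix1, int i_pix1,   /* source block    */
                            const uint8_t *pix2, int i_pix2 )  /* predicted block */
{
    int d[4][4], tmp[4][4];

    for( int y = 0; y < 4; y++ )
        for( int x = 0; x < 4; x++ )
            d[y][x] = pix1[y*i_pix1 + x] - pix2[y*i_pix2 + x];

    /* horizontal pass (DCT4_1D on each row) */
    for( int i = 0; i < 4; i++ )
    {
        int s03 = d[i][0] + d[i][3], s12 = d[i][1] + d[i][2];
        int d03 = d[i][0] - d[i][3], d12 = d[i][1] - d[i][2];
        tmp[i][0] =   s03 +   s12;
        tmp[i][1] = 2*d03 +   d12;
        tmp[i][2] =   s03 -   s12;
        tmp[i][3] =   d03 - 2*d12;
    }

    /* vertical pass (same butterfly on each column) */
    for( int j = 0; j < 4; j++ )
    {
        int s03 = tmp[0][j] + tmp[3][j], s12 = tmp[1][j] + tmp[2][j];
        int d03 = tmp[0][j] - tmp[3][j], d12 = tmp[1][j] - tmp[2][j];
        dct[0][j] =   s03 +   s12;
        dct[1][j] = 2*d03 +   d12;
        dct[2][j] =   s03 -   s12;
        dct[3][j] =   d03 - 2*d12;
    }
}

The SSE2 versions in the asm below gain speed by packing two 4x4 blocks side by side in each xmm register; the TRANSPOSE2x4x4W macro added in this commit performs the between-pass transpose on both blocks at once.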
@@ -32,9 +32,9 @@ pb_zigzag4: db 0,1,4,8,5,2,3,6,9,12,13,10,7,11,14,15
SECTION .text
%macro LOAD_DIFF_4P 5
movd %1, %4
movh %1, %4
punpcklbw %1, %3
movd %2, %5
movh %2, %5
punpcklbw %2, %3
psubw %1, %2
%endmacro
@@ -55,7 +55,7 @@ SECTION .text
%endmacro
%macro SUMSUB2_AB 3
movq %3, %1
mova %3, %1
paddw %1, %1
paddw %1, %2
psubw %3, %2
@@ -63,8 +63,8 @@ SECTION .text
%endmacro
%macro SUMSUBD2_AB 4
movq %4, %1
movq %3, %2
mova %4, %1
mova %3, %2
psraw %2, 1
psraw %4, 1
paddw %1, %2
@@ -86,14 +86,22 @@ SECTION .text
SWAP %2, %3
%endmacro
%macro STORE_DIFF_4P 5
paddw %1, %3
%macro TRANSPOSE2x4x4W 5
SBUTTERFLY wd, %1, %2, %5
SBUTTERFLY wd, %3, %4, %5
SBUTTERFLY dq, %1, %3, %5
SBUTTERFLY dq, %2, %4, %5
SBUTTERFLY qdq, %1, %2, %5
SBUTTERFLY qdq, %3, %4, %5
%endmacro
%macro STORE_DIFF_4P 4
psraw %1, 6
movd %2, %5
punpcklbw %2, %4
movh %2, %4
punpcklbw %2, %3
paddsw %1, %2
packuswb %1, %1
movd %5, %1
movh %4, %1
%endmacro
%macro HADAMARD4_1D 4
@@ -164,17 +172,20 @@ cglobal x264_idct4x4dc_mmx, 1,1
;-----------------------------------------------------------------------------
cglobal x264_sub4x4_dct_mmx, 3,3
.skip_prologue:
%macro SUB_DCT4 1
LOAD_DIFF_4P m0, m6, m7, [r1+0*FENC_STRIDE], [r2+0*FDEC_STRIDE]
LOAD_DIFF_4P m1, m6, m7, [r1+1*FENC_STRIDE], [r2+1*FDEC_STRIDE]
LOAD_DIFF_4P m2, m6, m7, [r1+2*FENC_STRIDE], [r2+2*FDEC_STRIDE]
LOAD_DIFF_4P m3, m6, m7, [r1+3*FENC_STRIDE], [r2+3*FDEC_STRIDE]
DCT4_1D 0,1,2,3,4
TRANSPOSE4x4W 0,1,2,3,4
TRANSPOSE%1 0,1,2,3,4
DCT4_1D 0,1,2,3,4
movq [r0+ 0], m0
movq [r0+ 8], m1
movq [r0+16], m2
movq [r0+24], m3
%endmacro
SUB_DCT4 4x4W
RET
;-----------------------------------------------------------------------------
@@ -186,18 +197,52 @@ cglobal x264_add4x4_idct_mmx, 2,2,1
movq m1, [r1+ 8]
movq m2, [r1+16]
movq m3, [r1+24]
%macro ADD_IDCT4 1
IDCT4_1D 0,1,2,3,4,5
TRANSPOSE4x4W 0,1,2,3,4
TRANSPOSE%1 0,1,2,3,4
paddw m0, [pw_32 GLOBAL]
IDCT4_1D 0,1,2,3,4,5
pxor m7, m7
movq m6, [pw_32 GLOBAL]
STORE_DIFF_4P m0, m4, m6, m7, [r0+0*FDEC_STRIDE]
STORE_DIFF_4P m1, m4, m6, m7, [r0+1*FDEC_STRIDE]
STORE_DIFF_4P m2, m4, m6, m7, [r0+2*FDEC_STRIDE]
STORE_DIFF_4P m3, m4, m6, m7, [r0+3*FDEC_STRIDE]
STORE_DIFF_4P m0, m4, m7, [r0+0*FDEC_STRIDE]
STORE_DIFF_4P m1, m4, m7, [r0+1*FDEC_STRIDE]
STORE_DIFF_4P m2, m4, m7, [r0+2*FDEC_STRIDE]
STORE_DIFF_4P m3, m4, m7, [r0+3*FDEC_STRIDE]
%endmacro
ADD_IDCT4 4x4W
RET
INIT_XMM
cglobal x264_sub8x8_dct_sse2, 3,3
.skip_prologue:
call .8x4
add r0, 64
add r1, 4*FENC_STRIDE
add r2, 4*FDEC_STRIDE
.8x4:
SUB_DCT4 2x4x4W
movhps [r0+32], m0
movhps [r0+40], m1
movhps [r0+48], m2
movhps [r0+56], m3
ret
cglobal x264_add8x8_idct_sse2, 2,2,1
.skip_prologue:
call .8x4
add r1, 64
add r0, 4*FDEC_STRIDE
.8x4:
movq m0, [r1+ 0]
movq m1, [r1+ 8]
movq m2, [r1+16]
movq m3, [r1+24]
movhps m0, [r1+32]
movhps m1, [r1+40]
movhps m2, [r1+48]
movhps m3, [r1+56]
ADD_IDCT4 2x4x4W
ret
;-----------------------------------------------------------------------------
; void x264_sub8x8_dct_mmx( int16_t dct[4][4][4], uint8_t *pix1, uint8_t *pix2 )
@@ -207,16 +252,16 @@ cglobal %1, 3,3
.skip_prologue:
call %2
add r0, %3
add r1, %4-%5*FENC_STRIDE
add r2, %4-%5*FDEC_STRIDE
add r1, %4-%5-%6*FENC_STRIDE
add r2, %4-%5-%6*FDEC_STRIDE
call %2
add r0, %3
add r1, %4*FENC_STRIDE-%6
add r2, %4*FDEC_STRIDE-%6
add r1, (%4-%6)*FENC_STRIDE-%5-%4
add r2, (%4-%6)*FDEC_STRIDE-%5-%4
call %2
add r0, %3
add r1, %4-%5*FENC_STRIDE
add r2, %4-%5*FDEC_STRIDE
add r1, %4-%5-%6*FENC_STRIDE
add r2, %4-%5-%6*FDEC_STRIDE
jmp %2
%endmacro
@@ -227,36 +272,39 @@ cglobal %1, 3,3
cglobal %1, 2,2,1
.skip_prologue:
call %2
add r0, %4-%5*FDEC_STRIDE
add r0, %4-%5-%6*FDEC_STRIDE
add r1, %3
call %2
add r0, %4*FDEC_STRIDE-%6
add r0, (%4-%6)*FDEC_STRIDE-%5-%4
add r1, %3
call %2
add r0, %4-%5*FDEC_STRIDE
add r0, %4-%5-%6*FDEC_STRIDE
add r1, %3
jmp %2
%endmacro
SUB_NxN_DCT x264_sub8x8_dct_mmx, x264_sub4x4_dct_mmx %+ .skip_prologue, 32, 4, 0, 4
ADD_NxN_IDCT x264_add8x8_idct_mmx, x264_add4x4_idct_mmx %+ .skip_prologue, 32, 4, 0, 4
SUB_NxN_DCT x264_sub8x8_dct_mmx, x264_sub4x4_dct_mmx %+ .skip_prologue, 32, 4, 0, 0
ADD_NxN_IDCT x264_add8x8_idct_mmx, x264_add4x4_idct_mmx %+ .skip_prologue, 32, 4, 0, 0
SUB_NxN_DCT x264_sub16x16_dct_mmx, x264_sub8x8_dct_mmx %+ .skip_prologue, 32, 8, 4, 4
ADD_NxN_IDCT x264_add16x16_idct_mmx, x264_add8x8_idct_mmx %+ .skip_prologue, 32, 8, 4, 4
SUB_NxN_DCT x264_sub16x16_dct_mmx, x264_sub8x8_dct_mmx %+ .skip_prologue, 32, 4, 4, 12
ADD_NxN_IDCT x264_add16x16_idct_mmx, x264_add8x8_idct_mmx %+ .skip_prologue, 32, 4, 4, 12
SUB_NxN_DCT x264_sub16x16_dct_sse2, x264_sub8x8_dct_sse2 %+ .skip_prologue, 64, 8, 0, 4
ADD_NxN_IDCT x264_add16x16_idct_sse2, x264_add8x8_idct_sse2 %+ .skip_prologue, 64, 8, 0, 4
%ifndef ARCH_X86_64
cextern x264_sub8x8_dct8_mmx.skip_prologue
cextern x264_add8x8_idct8_mmx.skip_prologue
SUB_NxN_DCT x264_sub16x16_dct8_mmx, x264_sub8x8_dct8_mmx %+ .skip_prologue, 128, 8, 0, 8
ADD_NxN_IDCT x264_add16x16_idct8_mmx, x264_add8x8_idct8_mmx %+ .skip_prologue, 128, 8, 0, 8
SUB_NxN_DCT x264_sub16x16_dct8_mmx, x264_sub8x8_dct8_mmx %+ .skip_prologue, 128, 8, 0, 0
ADD_NxN_IDCT x264_add16x16_idct8_mmx, x264_add8x8_idct8_mmx %+ .skip_prologue, 128, 8, 0, 0
%define x264_sub8x8_dct8_sse2 x264_sub8x8_dct8_sse2.skip_prologue
%define x264_add8x8_idct8_sse2 x264_add8x8_idct8_sse2.skip_prologue
%endif
cextern x264_sub8x8_dct8_sse2
cextern x264_add8x8_idct8_sse2
SUB_NxN_DCT x264_sub16x16_dct8_sse2, x264_sub8x8_dct8_sse2, 128, 8, 0, 8
ADD_NxN_IDCT x264_add16x16_idct8_sse2, x264_add8x8_idct8_sse2, 128, 8, 0, 8
SUB_NxN_DCT x264_sub16x16_dct8_sse2, x264_sub8x8_dct8_sse2, 128, 8, 0, 0
ADD_NxN_IDCT x264_add16x16_idct8_sse2, x264_add8x8_idct8_sse2, 128, 8, 0, 0
......
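Similarly, the ADD_IDCT4 / STORE_DIFF_4P path above amounts to the following per-4x4-block operation in plain C (again a sketch with illustrative names, not code from this commit). Note the asm folds the +32 rounding into row 0 via paddw m0, [pw_32 GLOBAL] before the second IDCT4_1D pass, which is equivalent to adding 32 to every coefficient before the final >>6 done by psraw %1, 6 in STORE_DIFF_4P.

#include <stdint.h>

/* Saturate to the 8-bit pixel range (illustrative helper). */
static inline uint8_t clip_uint8_ref( int x )
{
    return x < 0 ? 0 : x > 255 ? 255 : (uint8_t)x;
}

/* 4x4 inverse transform sketch: IDCT4_1D on rows, IDCT4_1D on columns with
 * +32 rounding and >>6, then add the residual to the prediction and clip. */
static void add4x4_idct_ref( uint8_t *dst, int i_dst, const int16_t dct[4][4] )
{
    int tmp[4][4], d[4][4];

    /* horizontal pass */
    for( int i = 0; i < 4; i++ )
    {
        int s02 =  dct[i][0]       + dct[i][2];
        int d02 =  dct[i][0]       - dct[i][2];
        int s13 =  dct[i][1]       + (dct[i][3] >> 1);
        int d13 = (dct[i][1] >> 1) -  dct[i][3];
        tmp[i][0] = s02 + s13;
        tmp[i][1] = d02 + d13;
        tmp[i][2] = d02 - d13;
        tmp[i][3] = s02 - s13;
    }

    /* vertical pass plus the +32, >>6 rounding */
    for( int j = 0; j < 4; j++ )
    {
        int s02 =  tmp[0][j]       + tmp[2][j];
        int d02 =  tmp[0][j]       - tmp[2][j];
        int s13 =  tmp[1][j]       + (tmp[3][j] >> 1);
        int d13 = (tmp[1][j] >> 1) -  tmp[3][j];
        d[0][j] = ( s02 + s13 + 32 ) >> 6;
        d[1][j] = ( d02 + d13 + 32 ) >> 6;
        d[2][j] = ( d02 - d13 + 32 ) >> 6;
        d[3][j] = ( s02 - s13 + 32 ) >> 6;
    }

    /* add to the predicted pixels with saturation (paddsw + packuswb in asm) */
    for( int y = 0; y < 4; y++ )
        for( int x = 0; x < 4; x++ )
            dst[y*i_dst + x] = clip_uint8_ref( dst[y*i_dst + x] + d[y][x] );
}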
@@ -26,23 +26,25 @@
void x264_sub4x4_dct_mmx( int16_t dct[4][4], uint8_t *pix1, uint8_t *pix2 );
void x264_sub8x8_dct_mmx( int16_t dct[4][4][4], uint8_t *pix1, uint8_t *pix2 );
void x264_sub16x16_dct_mmx( int16_t dct[16][4][4], uint8_t *pix1, uint8_t *pix2 );
void x264_sub8x8_dct_sse2( int16_t dct[4][4][4], uint8_t *pix1, uint8_t *pix2 );
void x264_sub16x16_dct_sse2( int16_t dct[16][4][4], uint8_t *pix1, uint8_t *pix2 );
void x264_add4x4_idct_mmx( uint8_t *p_dst, int16_t dct[4][4] );
void x264_add8x8_idct_mmx( uint8_t *p_dst, int16_t dct[4][4][4] );
void x264_add16x16_idct_mmx( uint8_t *p_dst, int16_t dct[16][4][4] );
void x264_add8x8_idct_sse2( uint8_t *p_dst, int16_t dct[4][4][4] );
void x264_add16x16_idct_sse2( uint8_t *p_dst, int16_t dct[16][4][4] );
void x264_dct4x4dc_mmx( int16_t d[4][4] );
void x264_idct4x4dc_mmx( int16_t d[4][4] );
void x264_sub8x8_dct8_mmx( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 );
void x264_sub16x16_dct8_mmx( int16_t dct[4][8][8], uint8_t *pix1, uint8_t *pix2 );
void x264_add8x8_idct8_mmx( uint8_t *dst, int16_t dct[8][8] );
void x264_add16x16_idct8_mmx( uint8_t *dst, int16_t dct[4][8][8] );
void x264_sub8x8_dct8_sse2( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 );
void x264_sub16x16_dct8_sse2( int16_t dct[4][8][8], uint8_t *pix1, uint8_t *pix2 );
void x264_add8x8_idct8_mmx( uint8_t *dst, int16_t dct[8][8] );
void x264_add16x16_idct8_mmx( uint8_t *dst, int16_t dct[4][8][8] );
void x264_add8x8_idct8_sse2( uint8_t *dst, int16_t dct[8][8] );
void x264_add16x16_idct8_sse2( uint8_t *dst, int16_t dct[4][8][8] );
......