Commit 79389771 authored by Loren Merritt

keep transposed dct coefs. ~1% overall speedup.



git-svn-id: svn://svn.videolan.org/x264/trunk@463 df754926-b1dd-0310-bc7b-ec298dee348c
parent ce9b3336
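
What the change does, before the diff: the C reference code and the MMX/SSE2 code now agree on keeping DCT coefficient blocks in transposed order. The second 1-D pass stores rows directly (d[i][k] rather than d[k][i]), the matching idct reads them back in the same order, and the final transpose in the SIMD paths goes away (the x86-32 file drops its dedicated x264_xdct8_mmxext/x264_xidct8_mmxext row passes in the same commit). Below is a minimal C sketch of the new dct4x4dc layout; the function name is made up for illustration, the four d[i][...] stores are taken from the dct.c hunk further down, and the first loop is reconstructed, so read it as an assumption rather than the committed code.

    #include <stdint.h>

    /* Sketch of the transposed-output 4x4 DC transform (simplified, not the
     * committed code). First pass: rows of d -> columns of tmp, as before.
     * Second pass: writes d[i][k] instead of d[k][i], so the result stays in
     * transposed order and the SIMD version needs no final transpose. */
    static void dct4x4dc_transposed( int16_t d[4][4] )
    {
        int16_t tmp[4][4];
        int i;

        for( i = 0; i < 4; i++ )
        {
            int s01 = d[i][0] + d[i][1];
            int d01 = d[i][0] - d[i][1];
            int s23 = d[i][2] + d[i][3];
            int d23 = d[i][2] - d[i][3];

            tmp[0][i] = s01 + s23;   /* assumed: this loop is untouched by the commit */
            tmp[1][i] = s01 - s23;
            tmp[2][i] = d01 - d23;
            tmp[3][i] = d01 + d23;
        }

        for( i = 0; i < 4; i++ )
        {
            int s01 = tmp[i][0] + tmp[i][1];
            int d01 = tmp[i][0] - tmp[i][1];
            int s23 = tmp[i][2] + tmp[i][3];
            int d23 = tmp[i][2] - tmp[i][3];

            /* the four stores below are the actual change shown in the dct.c
             * hunk: d[i][k] (transposed) instead of d[k][i] */
            d[i][0] = ( s01 + s23 + 1 ) >> 1;
            d[i][1] = ( s01 - s23 + 1 ) >> 1;
            d[i][2] = ( d01 - d23 + 1 ) >> 1;
            d[i][3] = ( d01 + d23 + 1 ) >> 1;
        }
    }
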
@@ -177,21 +177,19 @@ x264_dct4x4dc_mmxext:
MMX_SUMSUB_BADC mm2, mm3, mm0, mm4 ; mm2=s01 mm3=d01 mm0=s23 mm4=d23
MMX_SUMSUB_BADC mm0, mm2, mm4, mm3 ; mm0=s01+s23 mm2=s01-s23 mm4=d01+d23 mm3=d01-d23
MMX_TRANSPOSE mm0, mm2, mm3, mm4, mm1 ; in: mm0, mm2, mm3, mm4 out: mm0, mm4, mm1, mm3
movq mm6, [pw_1 GLOBAL]
paddw mm0, mm6
paddw mm4, mm6
paddw mm2, mm6
psraw mm0, 1
movq [parm1q+ 0],mm0
psraw mm4, 1
movq [parm1q+ 8],mm4
paddw mm1, mm6
psraw mm2, 1
movq [parm1q+ 8],mm2
paddw mm3, mm6
psraw mm1, 1
movq [parm1q+16],mm1
paddw mm4, mm6
psraw mm3, 1
movq [parm1q+24],mm3
movq [parm1q+16],mm3
psraw mm4, 1
movq [parm1q+24],mm4
ret
cglobal x264_idct4x4dc_mmxext
@@ -214,12 +212,10 @@ x264_idct4x4dc_mmxext:
MMX_SUMSUB_BADC mm2, mm3, mm0, mm4 ; mm2=s01 mm3=d01 mm0=s23 mm4=d23
MMX_SUMSUB_BADC mm0, mm2, mm4, mm3 ; mm0=s01+s23 mm2=s01-s23 mm4=d01+d23 mm3=d01-d23
MMX_TRANSPOSE mm0, mm2, mm3, mm4, mm1 ; in: mm0, mm2, mm3, mm4 out: mm0, mm4, mm1, mm3
movq [parm1q+ 0], mm0
movq [parm1q+ 8], mm4
movq [parm1q+16], mm1
movq [parm1q+24], mm3
movq [parm1q+ 8], mm2
movq [parm1q+16], mm3
movq [parm1q+24], mm4
ret
cglobal x264_sub4x4_dct_mmxext
@@ -267,13 +263,10 @@ x264_sub4x4_dct_mmxext:
MMX_SUMSUB_BA mm1, mm3 ; mm1=s03+s12 mm3=s03-s12
MMX_SUMSUB2_AB mm2, mm4, mm0 ; mm2=2.d03+d12 mm0=d03-2.d12
; transpose in: mm1, mm2, mm3, mm0, out: mm1, mm0, mm4, mm3
MMX_TRANSPOSE mm1, mm2, mm3, mm0, mm4
movq [r10+ 0], mm1 ; dct
movq [r10+ 8], mm0
movq [r10+16], mm4
movq [r10+24], mm3
movq [r10+ 8], mm2
movq [r10+16], mm3
movq [r10+24], mm0
pop rbx
ret
@@ -288,17 +281,14 @@ ALIGN 16
x264_add4x4_idct_mmxext:
; Load dct coeffs
movq mm0, [parm3q+ 0] ; dct
movq mm4, [parm3q+ 8]
movq mm3, [parm3q+16]
movq mm1, [parm3q+24]
movq mm1, [parm3q+ 8]
movq mm2, [parm3q+16]
movq mm3, [parm3q+24]
mov rax, parm1q ; p_dst
movsxd rcx, parm2d ; i_dst
lea rdx, [rcx+rcx*2]
; out:mm0, mm1, mm2, mm3
MMX_TRANSPOSE mm0, mm4, mm3, mm1, mm2
MMX_SUMSUB_BA mm2, mm0 ; mm2=s02 mm0=d02
MMX_SUMSUBD2_AB mm1, mm3, mm5, mm4 ; mm1=s13 mm4=d13 ( well 1 + 3>>1 and 1>>1 + 3)
@@ -408,19 +398,18 @@ x264_sub8x8_dct8_sse2:
MMX_LOAD_DIFF_8P xmm6, xmm8, xmm9, [rsi+r9 ], [rcx+r10]
MMX_LOAD_DIFF_8P xmm7, xmm8, xmm9, [rsi+rdx*4], [rcx+r8*4]
SSE2_TRANSPOSE8x8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8
DCT8_1D xmm0, xmm5, xmm7, xmm3, xmm8, xmm4, xmm2, xmm1, xmm6, xmm9
SSE2_TRANSPOSE8x8 xmm4, xmm5, xmm7, xmm2, xmm8, xmm3, xmm1, xmm6, xmm0
DCT8_1D xmm4, xmm3, xmm6, xmm2, xmm0, xmm8, xmm7, xmm5, xmm1, xmm9
DCT8_1D xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9
SSE2_TRANSPOSE8x8 xmm5, xmm1, xmm2, xmm6, xmm4, xmm3, xmm7, xmm8, xmm0
DCT8_1D xmm5, xmm3, xmm8, xmm6, xmm0, xmm4, xmm2, xmm1, xmm7, xmm9
movdqa [rdi+0x00], xmm8
movdqa [rdi+0x00], xmm4
movdqa [rdi+0x10], xmm3
movdqa [rdi+0x20], xmm6
movdqa [rdi+0x30], xmm7
movdqa [rdi+0x20], xmm8
movdqa [rdi+0x30], xmm2
movdqa [rdi+0x40], xmm0
movdqa [rdi+0x50], xmm2
movdqa [rdi+0x60], xmm5
movdqa [rdi+0x70], xmm1
movdqa [rdi+0x50], xmm6
movdqa [rdi+0x60], xmm1
movdqa [rdi+0x70], xmm7
ret
@@ -494,22 +483,21 @@ x264_add8x8_idct8_sse2:
movdqa xmm6, [rdx+0x60]
movdqa xmm7, [rdx+0x70]
SSE2_TRANSPOSE8x8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8
IDCT8_1D xmm0, xmm5, xmm7, xmm3, xmm8, xmm4, xmm2, xmm1, xmm9, xmm6
SSE2_TRANSPOSE8x8 xmm9, xmm5, xmm1, xmm3, xmm8, xmm0, xmm7, xmm2, xmm4
IDCT8_1D xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm9, xmm8
SSE2_TRANSPOSE8x8 xmm9, xmm1, xmm7, xmm3, xmm4, xmm0, xmm2, xmm6, xmm5
paddw xmm9, [pw_32 GLOBAL] ; rounding for the >>6 at the end
IDCT8_1D xmm9, xmm0, xmm2, xmm3, xmm4, xmm8, xmm1, xmm5, xmm6, xmm7
IDCT8_1D xmm9, xmm0, xmm6, xmm3, xmm5, xmm4, xmm7, xmm1, xmm8, xmm2
MMX_ZERO xmm15
MMX_STORE_DIFF_8P xmm6, xmm14, xmm15, [rdi]
MMX_STORE_DIFF_8P xmm8, xmm14, xmm15, [rdi]
MMX_STORE_DIFF_8P xmm0, xmm14, xmm15, [rdi+rsi]
MMX_STORE_DIFF_8P xmm5, xmm14, xmm15, [rdi+rsi*2]
MMX_STORE_DIFF_8P xmm1, xmm14, xmm15, [rdi+rsi*2]
lea rax, [rsi+rsi*2]
add rdi, rax
MMX_STORE_DIFF_8P xmm3, xmm14, xmm15, [rdi]
MMX_STORE_DIFF_8P xmm4, xmm14, xmm15, [rdi+rsi]
MMX_STORE_DIFF_8P xmm5, xmm14, xmm15, [rdi+rsi]
MMX_STORE_DIFF_8P xmm9, xmm14, xmm15, [rdi+rsi*2]
MMX_STORE_DIFF_8P xmm2, xmm14, xmm15, [rdi+rax]
MMX_STORE_DIFF_8P xmm1, xmm14, xmm15, [rdi+rsi*4]
MMX_STORE_DIFF_8P xmm6, xmm14, xmm15, [rdi+rax]
MMX_STORE_DIFF_8P xmm7, xmm14, xmm15, [rdi+rsi*4]
ret
@@ -52,8 +52,8 @@ static void dct2x2dc( int16_t d[2][2] )
tmp[1][1] = d[1][0] - d[1][1];
d[0][0] = tmp[0][0] + tmp[0][1];
d[0][1] = tmp[1][0] + tmp[1][1];
d[1][0] = tmp[0][0] - tmp[0][1];
d[1][0] = tmp[1][0] + tmp[1][1];
d[0][1] = tmp[0][0] - tmp[0][1];
d[1][1] = tmp[1][0] - tmp[1][1];
}
@@ -84,10 +84,10 @@ static void dct4x4dc( int16_t d[4][4] )
s23 = tmp[i][2] + tmp[i][3];
d23 = tmp[i][2] - tmp[i][3];
d[0][i] = ( s01 + s23 + 1 ) >> 1;
d[1][i] = ( s01 - s23 + 1 ) >> 1;
d[2][i] = ( d01 - d23 + 1 ) >> 1;
d[3][i] = ( d01 + d23 + 1 ) >> 1;
d[i][0] = ( s01 + s23 + 1 ) >> 1;
d[i][1] = ( s01 - s23 + 1 ) >> 1;
d[i][2] = ( d01 - d23 + 1 ) >> 1;
d[i][3] = ( d01 + d23 + 1 ) >> 1;
}
}
@@ -100,10 +100,10 @@ static void idct4x4dc( int16_t d[4][4] )
for( i = 0; i < 4; i++ )
{
s01 = d[0][i] + d[1][i];
d01 = d[0][i] - d[1][i];
s23 = d[2][i] + d[3][i];
d23 = d[2][i] - d[3][i];
s01 = d[i][0] + d[i][1];
d01 = d[i][0] - d[i][1];
s23 = d[i][2] + d[i][3];
d23 = d[i][2] - d[i][3];
tmp[0][i] = s01 + s23;
tmp[1][i] = s01 - s23;
@@ -168,10 +168,10 @@ static void sub4x4_dct( int16_t dct[4][4], uint8_t *pix1, int i_pix1, uint8_t *p
const int d03 = tmp[i][0] - tmp[i][3];
const int d12 = tmp[i][1] - tmp[i][2];
dct[0][i] = s03 + s12;
dct[1][i] = 2*d03 + d12;
dct[2][i] = s03 - s12;
dct[3][i] = d03 - 2*d12;
dct[i][0] = s03 + s12;
dct[i][1] = 2*d03 + d12;
dct[i][2] = s03 - s12;
dct[i][3] = d03 - 2*d12;
}
}
@@ -201,10 +201,10 @@ static void add4x4_idct( uint8_t *p_dst, int i_dst, int16_t dct[4][4] )
for( i = 0; i < 4; i++ )
{
const int s02 = dct[i][0] + dct[i][2];
const int d02 = dct[i][0] - dct[i][2];
const int s13 = dct[i][1] + (dct[i][3]>>1);
const int d13 = (dct[i][1]>>1) - dct[i][3];
const int s02 = dct[0][i] + dct[2][i];
const int d02 = dct[0][i] - dct[2][i];
const int s13 = dct[1][i] + (dct[3][i]>>1);
const int d13 = (dct[1][i]>>1) - dct[3][i];
tmp[i][0] = s02 + s13;
tmp[i][1] = d02 + d13;
@@ -217,7 +217,7 @@ static void add4x4_idct( uint8_t *p_dst, int i_dst, int16_t dct[4][4] )
const int s02 = tmp[0][i] + tmp[2][i];
const int d02 = tmp[0][i] - tmp[2][i];
const int s13 = tmp[1][i] + (tmp[3][i]>>1);
const int d13 = (tmp[1][i]>>1) - tmp[3][i];
const int d13 = (tmp[1][i]>>1) - tmp[3][i];
d[0][i] = ( s02 + s13 + 32 ) >> 6;
d[1][i] = ( d02 + d13 + 32 ) >> 6;
@@ -273,31 +273,36 @@ static void add16x16_idct( uint8_t *p_dst, int i_dst, int16_t dct[16][4][4] )
const int a5 = d07 - d34 - (d25 + (d25>>1));\
const int a6 = d07 + d34 - (d16 + (d16>>1));\
const int a7 = d16 - d25 + (d34 + (d34>>1));\
SRC(0) = a0 + a1 ;\
SRC(1) = a4 + (a7>>2);\
SRC(2) = a2 + (a3>>1);\
SRC(3) = a5 + (a6>>2);\
SRC(4) = a0 - a1 ;\
SRC(5) = a6 - (a5>>2);\
SRC(6) = (a2>>1) - a3 ;\
SRC(7) = (a4>>2) - a7 ;\
DST(0) = a0 + a1 ;\
DST(1) = a4 + (a7>>2);\
DST(2) = a2 + (a3>>1);\
DST(3) = a5 + (a6>>2);\
DST(4) = a0 - a1 ;\
DST(5) = a6 - (a5>>2);\
DST(6) = (a2>>1) - a3 ;\
DST(7) = (a4>>2) - a7 ;\
}
static void sub8x8_dct8( int16_t dct[8][8], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
{
int i;
int16_t tmp[8][8];
pixel_sub_wxh( (int16_t*)dct, 8, pix1, i_pix1, pix2, i_pix2 );
pixel_sub_wxh( (int16_t*)tmp, 8, pix1, i_pix1, pix2, i_pix2 );
#define SRC(x) dct[x][i]
#define SRC(x) tmp[x][i]
#define DST(x) tmp[x][i]
for( i = 0; i < 8; i++ )
DCT8_1D
#undef SRC
#undef DST
#define SRC(x) dct[i][x]
#define SRC(x) tmp[i][x]
#define DST(x) dct[x][i]
for( i = 0; i < 8; i++ )
DCT8_1D
#undef SRC
#undef DST
}
static void sub16x16_dct8( int16_t dct[4][8][8], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
@@ -341,14 +346,14 @@ static void add8x8_idct8( uint8_t *dst, int i_dst, int16_t dct[8][8] )
dct[0][0] += 32; // rounding for the >>6 at the end
#define SRC(x) dct[i][x]
#define DST(x,rhs) dct[i][x] = (rhs)
#define SRC(x) dct[x][i]
#define DST(x,rhs) dct[x][i] = (rhs)
for( i = 0; i < 8; i++ )
IDCT8_1D
#undef SRC
#undef DST
#define SRC(x) dct[x][i]
#define SRC(x) dct[i][x]
#define DST(x,rhs) dst[i + x*i_dst] = clip_uint8( dst[i + x*i_dst] + ((rhs) >> 6) );
for( i = 0; i < 8; i++ )
IDCT8_1D
@@ -404,16 +409,19 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
dctf->dct4x4dc = x264_dct4x4dc_mmxext;
dctf->idct4x4dc = x264_idct4x4dc_mmxext;
}
#ifndef ARCH_X86_64
dctf->sub8x8_dct8 = x264_sub8x8_dct8_mmxext;
dctf->sub16x16_dct8 = x264_sub16x16_dct8_mmxext;
if( cpu&X264_CPU_MMX )
{
dctf->sub8x8_dct8 = x264_sub8x8_dct8_mmx;
dctf->sub16x16_dct8 = x264_sub16x16_dct8_mmx;
dctf->add8x8_idct8 = x264_add8x8_idct8_mmxext;
dctf->add16x16_idct8= x264_add16x16_idct8_mmxext;
#endif
dctf->add8x8_idct8 = x264_add8x8_idct8_mmx;
dctf->add16x16_idct8= x264_add16x16_idct8_mmx;
}
#endif
#endif
#if defined(HAVE_SSE2) && defined(ARCH_X86_64)
if( cpu&X264_CPU_SSE2 )
@@ -425,7 +433,7 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
dctf->add16x16_idct8= x264_add16x16_idct8_sse2;
}
#endif
/* FIXME altivec dct is not transposed yet
#ifdef ARCH_PPC
if( cpu&X264_CPU_ALTIVEC )
{
@@ -434,5 +442,6 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
dctf->sub16x16_dct = x264_sub16x16_dct_altivec;
}
#endif
*/
}
@@ -167,21 +167,19 @@ x264_dct4x4dc_mmxext:
MMX_SUMSUB_BADC mm2, mm3, mm0, mm4 ; mm2=s01 mm3=d01 mm0=s23 mm4=d23
MMX_SUMSUB_BADC mm0, mm2, mm4, mm3 ; mm0=s01+s23 mm2=s01-s23 mm4=d01+d23 mm3=d01-d23
MMX_TRANSPOSE mm0, mm2, mm3, mm4, mm1 ; in: mm0, mm2, mm3, mm4 out: mm0, mm4, mm1, mm3
movq mm6, [x264_mmx_1 GOT_ebx]
paddw mm0, mm6
paddw mm4, mm6
paddw mm2, mm6
psraw mm0, 1
movq [eax+ 0], mm0
psraw mm4, 1
movq [eax+ 8], mm4
paddw mm1, mm6
psraw mm2, 1
movq [eax+ 8], mm2
paddw mm3, mm6
psraw mm1, 1
movq [eax+16], mm1
paddw mm4, mm6
psraw mm3, 1
movq [eax+24], mm3
movq [eax+16], mm3
psraw mm4, 1
movq [eax+24], mm4
picpop ebx
ret
@@ -206,12 +204,10 @@ x264_idct4x4dc_mmxext:
MMX_SUMSUB_BADC mm2, mm3, mm0, mm4 ; mm2=s01 mm3=d01 mm0=s23 mm4=d23
MMX_SUMSUB_BADC mm0, mm2, mm4, mm3 ; mm0=s01+s23 mm2=s01-s23 mm4=d01+d23 mm3=d01-d23
MMX_TRANSPOSE mm0, mm2, mm3, mm4, mm1 ; in: mm0, mm2, mm3, mm4 out: mm0, mm4, mm1, mm3
movq [eax+ 0], mm0
movq [eax+ 8], mm4
movq [eax+16], mm1
movq [eax+24], mm3
movq [eax+ 8], mm2
movq [eax+16], mm3
movq [eax+24], mm4
ret
cglobal x264_sub4x4_dct_mmxext
@@ -250,14 +246,11 @@ x264_sub4x4_dct_mmxext:
MMX_SUMSUB_BA mm1, mm3 ; mm1=s03+s12 mm3=s03-s12
MMX_SUMSUB2_AB mm2, mm4, mm0 ; mm2=2.d03+d12 mm0=d03-2.d12
; transpose in: mm1, mm2, mm3, mm0, out: mm1, mm0, mm4, mm3
MMX_TRANSPOSE mm1, mm2, mm3, mm0, mm4
mov eax, [esp+ 8] ; dct
movq [eax+ 0], mm1
movq [eax+ 8], mm0
movq [eax+16], mm4
movq [eax+24], mm3
movq [eax+ 8], mm2
movq [eax+16], mm3
movq [eax+24], mm0
pop ebx
ret
@@ -272,9 +265,9 @@ x264_add4x4_idct_mmxext:
; Load dct coeffs
mov eax, [esp+12] ; dct
movq mm0, [eax+ 0]
movq mm4, [eax+ 8]
movq mm3, [eax+16]
movq mm1, [eax+24]
movq mm1, [eax+ 8]
movq mm2, [eax+16]
movq mm3, [eax+24]
mov eax, [esp+ 4] ; p_dst
mov ecx, [esp+ 8] ; i_dst
@@ -283,9 +276,6 @@ x264_add4x4_idct_mmxext:
picpush ebx
picgetgot ebx
; out:mm0, mm1, mm2, mm3
MMX_TRANSPOSE mm0, mm4, mm3, mm1, mm2
MMX_SUMSUB_BA mm2, mm0 ; mm2=s02 mm0=d02
MMX_SUMSUBD2_AB mm1, mm3, mm5, mm4 ; mm1=s13 mm4=d13 ( well 1 + 3>>1 and 1>>1 + 3)
@@ -338,24 +328,11 @@ x264_add4x4_idct_mmxext:
MMX_SUMSUB_BA %1, %2
%endmacro
%macro MMX_STORE_DIFF_8P 6
movq %1, %3
movq %2, %1
punpcklbw %1, %6
punpckhbw %2, %6
paddw %1, %4
paddw %2, %5
packuswb %1, %2
movq %3, %1
%endmacro
cglobal x264_pixel_sub_8x8_mmx
cglobal x264_xdct8_mmxext
cglobal x264_pixel_add_8x8_mmx
cglobal x264_transpose_8x8_mmx
cglobal x264_ydct8_mmx
cglobal x264_xidct8_mmxext
cglobal x264_yidct8_mmx
cglobal x264_pixel_add_8x8_mmx
ALIGN 16
;-----------------------------------------------------------------------------
@@ -387,78 +364,6 @@ x264_pixel_sub_8x8_mmx:
pop ebx
ret
ALIGN 16
;-----------------------------------------------------------------------------
; void __cdecl x264_xdct8_mmxext( int16_t dest[8][8] );
;-----------------------------------------------------------------------------
x264_xdct8_mmxext:
mov eax, [esp+04] ; dest
picpush ebx
picgetgot ebx
movq mm5, [x264_mmx_PPNN GOT_ebx]
movq mm6, [x264_mmx_PNNP GOT_ebx]
movq mm4, [x264_mmx_PPPN GOT_ebx]
movq mm7, [x264_mmx_PPNP GOT_ebx]
;-------------------------------------------------------------------------
; horizontal dct ( compute 1 row at a time -> 8 loops )
;-------------------------------------------------------------------------
%assign disp 0
%rep 8
movq mm0, [eax+disp]
movq mm1, [eax+disp+8]
pshufw mm2, mm1, 00011011b
movq mm1, mm0
paddw mm0, mm2 ; (low)s07/s16/d25/s34(high)
psubw mm1, mm2 ; (low)d07/d16/d25/d34(high)
pshufw mm2, mm0, 00011011b ; (low)s34/s25/s16/s07(high)
pmullw mm0, mm5 ; (low)s07/s16/-s25/-s34(high)
paddw mm0, mm2 ; (low)a0/a1/a3/a2(high)
movq mm3, mm1
psraw mm1, 1 ; (low)d07/d16/d25/d34(high) (x>>1)
pshufw mm2, mm3, 10110001b ; (low)d16/d07/d34/d25(high)
paddw mm1, mm3 ; (low)d07/d16/d25/d34(high) (x+(x>>1))
pshufw mm3, mm2, 00011011b ; (low)d25/d34/d07/d16(high)
pmullw mm2, mm5 ; (low)d16/d07/-d34/-d25(high)
pmullw mm1, mm6 ; (low)d07/-d16/-d25/d34(high) (x+(x>>1))
paddw mm3, mm2
paddw mm1, mm3 ; (low)a4/a6/a5/a7(high)
pshufw mm2, mm0, 11001001b ; (low)a1/a3/a0/a2(high)
pshufw mm0, mm0, 10011100b ; (low)a0/a2/a1/a3(high)
pmullw mm2, [x264_mmx_2121 GOT_ebx]
pmullw mm0, mm5 ; (low)a0/a2/-a1/-a3(high)
psraw mm2, 1 ; (low)a1/a3>>1/a0/a2>>1(high)
paddw mm0, mm2 ; (low)dst0/dst2/dst4/dst6(high)
pshufw mm1, mm1, 00100111b ; (low)a7/a6/a5/a4(high)
pshufw mm2, mm1, 00011011b ; (low)a4/a5/a6/a7(high)
psraw mm1, 2 ; (low)a7>>2/a6>>2/a5>>2/a4>>2(high)
pmullw mm2, mm4 ; (low)a4/a5/a6/-a7(high)
pmullw mm1, mm7 ; (low)a7>>2/a6>>2/-a5>>2/a4>>2(high)
paddw mm1, mm2 ; (low)dst1/dst3/dst5/dst7(high)
movq mm2, mm0
punpcklwd mm0, mm1 ; (low)dst0/dst1/dst2/dst3(high)
punpckhwd mm2, mm1 ; (low)dst4/dst5/dst6/dst7(high)
movq [eax+disp], mm0
movq [eax+disp+8], mm2
%assign disp disp+16
%endrep
picpop ebx
ret
ALIGN 16
;-----------------------------------------------------------------------------
; void __cdecl x264_ydct8_mmx( int16_t dest[8][8] );
@@ -544,73 +449,6 @@ x264_ydct8_mmx:
ret
ALIGN 16
;-----------------------------------------------------------------------------
; void __cdecl x264_xidct8_mmxext( int16_t dest[8][8] );
;-----------------------------------------------------------------------------
x264_xidct8_mmxext:
mov eax, [esp+04] ; dest
picpush ebx
picgetgot ebx
movq mm4, [x264_mmx_PPNN GOT_ebx]
movq mm5, [x264_mmx_PNPN GOT_ebx]
movq mm6, [x264_mmx_PPNP GOT_ebx]
movq mm7, [x264_mmx_PPPN GOT_ebx]
;-------------------------------------------------------------------------
; horizontal idct ( compute 1 row at a time -> 8 loops )
;-------------------------------------------------------------------------
%assign disp 0
%rep 8
pshufw mm0, [eax+disp], 11011000b ; (low)d0,d2,d1,d3(high)
pshufw mm2, [eax+disp+8], 11011000b ; (low)d4,d6,d5,d7(high)
movq mm1, mm0
punpcklwd mm0, mm2 ; (low)d0,d4,d2,d6(high)
punpckhwd mm1, mm2 ; (low)d1,d5,d3,d7(high)
pshufw mm2, mm0, 10110001b ; (low)d4,d0,d6,d2(high)
pmullw mm0, [x264_mmx_p2n2p1p1 GOT_ebx]; (low)2*d0,-2*d4,d2,d6(high)
pmullw mm2, mm6 ; (low)d4,d0,-d6,d2(high)
psraw mm0, 1 ; (low)d0,-d4,d2>>1,d6>>1(high)
paddw mm0, mm2 ; (low)e0,e2,e4,e6(high)
movq mm3, mm1 ; (low)d1,d5,d3,d7(high)
psraw mm1, 1 ; (low)d1>>1,d5>>1,d3>>1,d7>>1(high)
pshufw mm2, mm3, 10110001b ; (low)d5,d1,d7,d3(high)
paddw mm1, mm3 ; (low)d1+(d1>>1),d5+(d5>>1),d3+(d3>>1),d7+(d7>>1)(high)
pshufw mm3, mm2, 00011011b ; (low)d3,d7,d1,d5(high)
pmullw mm1, mm4 ; (low)d1+(d1>>1),d5+(d5>>1),-d3-(d3>>1),-d7-(d7>>1)(high)
pmullw mm2, mm5 ; (low)d5,-d1,d7,-d3(high)
paddw mm1, mm3
paddw mm1, mm2 ; (low)e7,e5,e3,e1(high)
pshufw mm2, mm0, 00011011b ; (low)e6,e4,e2,e0(high)
pmullw mm0, mm4 ; (low)e0,e2,-e4,-e6(high)
pshufw mm3, mm1, 00011011b ; (low)e1,e3,e5,e7(high)
psraw mm1, 2 ; (low)e7>>2,e5>>2,e3>>2,e1>>2(high)
pmullw mm3, mm6 ; (low)e1,e3,-e5,e7(high)
pmullw mm1, mm7 ; (low)e7>>2,e5>>2,e3>>2,-e1>>2(high)
paddw mm0, mm2 ; (low)f0,f2,f4,f6(high)
paddw mm1, mm3 ; (low)f1,f3,f5,f7(high)
pshufw mm3, mm0, 00011011b ; (low)f6,f4,f2,f0(high)
pshufw mm2, mm1, 00011011b ; (low)f7,f5,f3,f1(high)
psubw mm3, mm1
paddw mm0, mm2
movq [eax+disp], mm0
movq [eax+disp+8], mm3
%assign disp disp+16
%endrep
picpop ebx
ret
ALIGN 16
;-----------------------------------------------------------------------------
; void __cdecl x264_yidct8_mmx( int16_t dest[8][8] );
@@ -691,15 +529,6 @@ x264_yidct8_mmx:
MMX_SUMSUB_BA mm3, mm2 ; mm3 = g2, mm2 = g5
MMX_SUMSUB_BA mm1, mm0 ; mm1 = g3, mm0 = g4
psraw mm7, 6
psraw mm6, 6
psraw mm5, 6
psraw mm4, 6
psraw mm3, 6
psraw mm2, 6
psraw mm1, 6
psraw mm0, 6
movq [eax+disp+0*16], mm7
movq [eax+disp+1*16], mm5
movq [eax+disp+2*16], mm3
@@ -716,7 +545,7 @@ x264_yidct8_mmx:
ALIGN 16
;-----------------------------------------------------------------------------
; void __cdecl x264_pixel_add_8x8_mmx( unit8_t *dst, int i_dst, int16_t src[8][8] );
; void __cdecl x264_pixel_add_8x8_mmx( uint8_t *dst, int i_dst, int16_t src[8][8] );
;-----------------------------------------------------------------------------
x264_pixel_add_8x8_mmx:
mov eax, [esp+04] ; dst
@@ -727,9 +556,69 @@ x264_pixel_add_8x8_mmx:
%assign disp 0
%rep 8