VideoLAN / x264
Commit 79389771 authored Mar 09, 2006 by Loren Merritt
parent ce9b3336

keep transposed dct coefs. ~1% overall speedup.

git-svn-id: svn://svn.videolan.org/x264/trunk@463 df754926-b1dd-0310-bc7b-ec298dee348c

Changes: 8 files
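What the change does, in scalar terms (a sketch distilled from the common/dct.c hunks below; the helper function is mine, not x264 code): the forward transforms now write their output transposed, and the inverse transforms read transposed input, so each asm routine can drop one register-transpose pass.

    #include <stdint.h>

    /* Sketch of the new orientation, assuming the 4x4 DC transform shown in
     * common/dct.c below: the second pass now stores to d[i][k] instead of
     * d[k][i], leaving the coefficients transposed. Quantization is
     * per-coefficient, so nothing downstream has to undo it. */
    static void dct4x4dc_pass2( int16_t d[4][4], int i,
                                int s01, int d01, int s23, int d23 )
    {
        d[i][0] = ( s01 + s23 + 1 ) >> 1;   /* was d[0][i] */
        d[i][1] = ( s01 - s23 + 1 ) >> 1;   /* was d[1][i] */
        d[i][2] = ( d01 - d23 + 1 ) >> 1;   /* was d[2][i] */
        d[i][3] = ( d01 + d23 + 1 ) >> 1;   /* was d[3][i] */
    }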
common/amd64/dct-a.asm

@@ -177,21 +177,19 @@ x264_dct4x4dc_mmxext:
     MMX_SUMSUB_BADC     mm2, mm3, mm0, mm4          ; mm2=s01  mm3=d01  mm0=s23  mm4=d23
     MMX_SUMSUB_BADC     mm0, mm2, mm4, mm3          ; mm0=s01+s23  mm2=s01-s23  mm4=d01+d23  mm3=d01-d23
 
-    MMX_TRANSPOSE       mm0, mm2, mm3, mm4, mm1     ; in: mm0, mm2, mm3, mm4  out: mm0, mm4, mm1, mm3
-
     movq    mm6,        [pw_1 GLOBAL]
     paddw   mm0,        mm6
-    paddw   mm4,        mm6
+    paddw   mm2,        mm6
     psraw   mm0,        1
     movq    [parm1q+ 0], mm0
-    psraw   mm4,        1
-    movq    [parm1q+ 8], mm4
-    paddw   mm1,        mm6
+    psraw   mm2,        1
+    movq    [parm1q+ 8], mm2
     paddw   mm3,        mm6
-    psraw   mm1,        1
-    movq    [parm1q+16], mm1
+    paddw   mm4,        mm6
     psraw   mm3,        1
-    movq    [parm1q+24], mm3
+    movq    [parm1q+16], mm3
+    psraw   mm4,        1
+    movq    [parm1q+24], mm4
     ret
 
 cglobal x264_idct4x4dc_mmxext
@@ -214,12 +212,10 @@ x264_idct4x4dc_mmxext:
     MMX_SUMSUB_BADC     mm2, mm3, mm0, mm4          ; mm2=s01  mm3=d01  mm0=s23  mm4=d23
     MMX_SUMSUB_BADC     mm0, mm2, mm4, mm3          ; mm0=s01+s23  mm2=s01-s23  mm4=d01+d23  mm3=d01-d23
 
-    MMX_TRANSPOSE       mm0, mm2, mm3, mm4, mm1     ; in: mm0, mm2, mm3, mm4  out: mm0, mm4, mm1, mm3
-
     movq    [parm1q+ 0], mm0
-    movq    [parm1q+ 8], mm4
-    movq    [parm1q+16], mm1
-    movq    [parm1q+24], mm3
+    movq    [parm1q+ 8], mm2
+    movq    [parm1q+16], mm3
+    movq    [parm1q+24], mm4
     ret
 
 cglobal x264_sub4x4_dct_mmxext
@@ -267,13 +263,10 @@ x264_sub4x4_dct_mmxext:
     MMX_SUMSUB_BA       mm1, mm3                    ; mm1=s03+s12      mm3=s03-s12
     MMX_SUMSUB2_AB      mm2, mm4, mm0               ; mm2=2.d03+d12    mm0=d03-2.d12
 
-    ; transpose in: mm1, mm2, mm3, mm0, out: mm1, mm0, mm4, mm3
-    MMX_TRANSPOSE       mm1, mm2, mm3, mm0, mm4
-
     movq    [r10+ 0],   mm1 ; dct
-    movq    [r10+ 8],   mm0
-    movq    [r10+16],   mm4
-    movq    [r10+24],   mm3
+    movq    [r10+ 8],   mm2
+    movq    [r10+16],   mm3
+    movq    [r10+24],   mm0
 
     pop     rbx
     ret
@@ -288,17 +281,14 @@ ALIGN 16
 x264_add4x4_idct_mmxext:
     ; Load dct coeffs
     movq    mm0,        [parm3q+ 0] ; dct
-    movq    mm4,        [parm3q+ 8]
-    movq    mm3,        [parm3q+16]
-    movq    mm1,        [parm3q+24]
+    movq    mm1,        [parm3q+ 8]
+    movq    mm2,        [parm3q+16]
+    movq    mm3,        [parm3q+24]
 
     mov     rax,        parm1q      ; p_dst
     movsxd  rcx,        parm2d      ; i_dst
     lea     rdx,        [rcx+rcx*2]
 
-    ; out:mm0, mm1, mm2, mm3
-    MMX_TRANSPOSE       mm0, mm4, mm3, mm1, mm2
-
     MMX_SUMSUB_BA       mm2, mm0                    ; mm2=s02  mm0=d02
     MMX_SUMSUBD2_AB     mm1, mm3, mm5, mm4          ; mm1=s13  mm4=d13 ( well 1 + 3>>1 and 1>>1 + 3)
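The comment on MMX_SUMSUBD2_AB above is terse; here is a hedged scalar restatement (my code, mirroring the s13/d13 expressions that appear in common/dct.c further down):

    /* Scalar form of the odd-part idct butterfly that MMX_SUMSUBD2_AB
     * vectorizes: coefficient 1 plus half of 3, and half of 1 minus 3. */
    static inline void sumsubd2( int c1, int c3, int *s13, int *d13 )
    {
        *s13 =  c1 + (c3 >> 1);
        *d13 = (c1 >> 1) - c3;
    }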
@@ -408,19 +398,18 @@ x264_sub8x8_dct8_sse2:
     MMX_LOAD_DIFF_8P    xmm6, xmm8, xmm9, [rsi+r9],    [rcx+r10]
     MMX_LOAD_DIFF_8P    xmm7, xmm8, xmm9, [rsi+rdx*4], [rcx+r8*4]
 
-    SSE2_TRANSPOSE8x8   xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8
-    DCT8_1D             xmm0, xmm5, xmm7, xmm3, xmm8, xmm4, xmm2, xmm1, xmm6, xmm9
-    SSE2_TRANSPOSE8x8   xmm4, xmm5, xmm7, xmm2, xmm8, xmm3, xmm1, xmm6, xmm0
-    DCT8_1D             xmm4, xmm3, xmm6, xmm2, xmm0, xmm8, xmm7, xmm5, xmm1, xmm9
+    DCT8_1D             xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9
+    SSE2_TRANSPOSE8x8   xmm5, xmm1, xmm2, xmm6, xmm4, xmm3, xmm7, xmm8, xmm0
+    DCT8_1D             xmm5, xmm3, xmm8, xmm6, xmm0, xmm4, xmm2, xmm1, xmm7, xmm9
 
-    movdqa  [rdi+0x00], xmm8
+    movdqa  [rdi+0x00], xmm4
     movdqa  [rdi+0x10], xmm3
-    movdqa  [rdi+0x20], xmm6
-    movdqa  [rdi+0x30], xmm7
+    movdqa  [rdi+0x20], xmm8
+    movdqa  [rdi+0x30], xmm2
     movdqa  [rdi+0x40], xmm0
-    movdqa  [rdi+0x50], xmm2
-    movdqa  [rdi+0x60], xmm5
-    movdqa  [rdi+0x70], xmm1
+    movdqa  [rdi+0x50], xmm6
+    movdqa  [rdi+0x60], xmm1
+    movdqa  [rdi+0x70], xmm7
     ret
@@ -494,22 +483,21 @@ x264_add8x8_idct8_sse2:
     movdqa  xmm6, [rdx+0x60]
     movdqa  xmm7, [rdx+0x70]
 
-    SSE2_TRANSPOSE8x8   xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8
-    IDCT8_1D            xmm0, xmm5, xmm7, xmm3, xmm8, xmm4, xmm2, xmm1, xmm9, xmm6
-    SSE2_TRANSPOSE8x8   xmm9, xmm5, xmm1, xmm3, xmm8, xmm0, xmm7, xmm2, xmm4
+    IDCT8_1D            xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm9, xmm8
+    SSE2_TRANSPOSE8x8   xmm9, xmm1, xmm7, xmm3, xmm4, xmm0, xmm2, xmm6, xmm5
     paddw   xmm9, [pw_32 GLOBAL]    ; rounding for the >>6 at the end
-    IDCT8_1D            xmm9, xmm0, xmm2, xmm3, xmm4, xmm8, xmm1, xmm5, xmm6, xmm7
+    IDCT8_1D            xmm9, xmm0, xmm6, xmm3, xmm5, xmm4, xmm7, xmm1, xmm8, xmm2
 
     MMX_ZERO            xmm15
-    MMX_STORE_DIFF_8P   xmm6, xmm14, xmm15, [rdi]
+    MMX_STORE_DIFF_8P   xmm8, xmm14, xmm15, [rdi]
     MMX_STORE_DIFF_8P   xmm0, xmm14, xmm15, [rdi+rsi]
-    MMX_STORE_DIFF_8P   xmm5, xmm14, xmm15, [rdi+rsi*2]
+    MMX_STORE_DIFF_8P   xmm1, xmm14, xmm15, [rdi+rsi*2]
     lea     rax, [rsi+rsi*2]
     add     rdi, rax
     MMX_STORE_DIFF_8P   xmm3, xmm14, xmm15, [rdi]
-    MMX_STORE_DIFF_8P   xmm4, xmm14, xmm15, [rdi+rsi]
+    MMX_STORE_DIFF_8P   xmm5, xmm14, xmm15, [rdi+rsi]
     MMX_STORE_DIFF_8P   xmm9, xmm14, xmm15, [rdi+rsi*2]
-    MMX_STORE_DIFF_8P   xmm2, xmm14, xmm15, [rdi+rax]
-    MMX_STORE_DIFF_8P   xmm1, xmm14, xmm15, [rdi+rsi*4]
+    MMX_STORE_DIFF_8P   xmm6, xmm14, xmm15, [rdi+rax]
+    MMX_STORE_DIFF_8P   xmm7, xmm14, xmm15, [rdi+rsi*4]
     ret
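MMX_STORE_DIFF_8P, used above (its MMX definition is visible in the i386 file below), adds a row of idct output to eight destination pixels with unsigned saturation. A scalar sketch of the operation; the >>6 matches the pw_32 rounding bias added before the last IDCT8_1D pass, but the exact macro arguments differ between the amd64 and i386 files, so treat the details as an assumption:

    #include <stdint.h>

    /* dst[i] = clip255( dst[i] + (coef[i] >> 6) ); packuswb gives the
     * MMX/SSE2 version the 0..255 clipping for free. */
    static void store_diff_8p( uint8_t *dst, const int16_t *coef )
    {
        for( int i = 0; i < 8; i++ )
        {
            int v = dst[i] + ( coef[i] >> 6 );
            dst[i] = v < 0 ? 0 : ( v > 255 ? 255 : v );
        }
    }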
common/dct.c

@@ -52,8 +52,8 @@ static void dct2x2dc( int16_t d[2][2] )
     tmp[1][1] = d[1][0] - d[1][1];
 
     d[0][0] = tmp[0][0] + tmp[0][1];
-    d[0][1] = tmp[1][0] + tmp[1][1];
-    d[1][0] = tmp[0][0] - tmp[0][1];
+    d[1][0] = tmp[1][0] + tmp[1][1];
+    d[0][1] = tmp[0][0] - tmp[0][1];
     d[1][1] = tmp[1][0] - tmp[1][1];
 }
@@ -84,10 +84,10 @@ static void dct4x4dc( int16_t d[4][4] )
         s23 = tmp[i][2] + tmp[i][3];
         d23 = tmp[i][2] - tmp[i][3];
 
-        d[0][i] = ( s01 + s23 + 1 ) >> 1;
-        d[1][i] = ( s01 - s23 + 1 ) >> 1;
-        d[2][i] = ( d01 - d23 + 1 ) >> 1;
-        d[3][i] = ( d01 + d23 + 1 ) >> 1;
+        d[i][0] = ( s01 + s23 + 1 ) >> 1;
+        d[i][1] = ( s01 - s23 + 1 ) >> 1;
+        d[i][2] = ( d01 - d23 + 1 ) >> 1;
+        d[i][3] = ( d01 + d23 + 1 ) >> 1;
     }
 }
@@ -100,10 +100,10 @@ static void idct4x4dc( int16_t d[4][4] )
     for( i = 0; i < 4; i++ )
     {
-        s01 = d[0][i] + d[1][i];
-        d01 = d[0][i] - d[1][i];
-        s23 = d[2][i] + d[3][i];
-        d23 = d[2][i] - d[3][i];
+        s01 = d[i][0] + d[i][1];
+        d01 = d[i][0] - d[i][1];
+        s23 = d[i][2] + d[i][3];
+        d23 = d[i][2] - d[i][3];
 
         tmp[0][i] = s01 + s23;
         tmp[1][i] = s01 - s23;
@@ -168,10 +168,10 @@ static void sub4x4_dct( int16_t dct[4][4], uint8_t *pix1, int i_pix1, uint8_t *p
         const int d03 = tmp[i][0] - tmp[i][3];
         const int d12 = tmp[i][1] - tmp[i][2];
 
-        dct[0][i] = s03 + s12;
-        dct[1][i] = 2*d03 + d12;
-        dct[2][i] = s03 - s12;
-        dct[3][i] = d03 - 2*d12;
+        dct[i][0] = s03 + s12;
+        dct[i][1] = 2*d03 + d12;
+        dct[i][2] = s03 - s12;
+        dct[i][3] = d03 - 2*d12;
     }
 }
@@ -201,10 +201,10 @@ static void add4x4_idct( uint8_t *p_dst, int i_dst, int16_t dct[4][4] )
     for( i = 0; i < 4; i++ )
     {
-        const int s02 =  dct[i][0] +  dct[i][2];
-        const int d02 =  dct[i][0] -  dct[i][2];
-        const int s13 =  dct[i][1] + (dct[i][3]>>1);
-        const int d13 = (dct[i][1]>>1) -  dct[i][3];
+        const int s02 =  dct[0][i] +  dct[2][i];
+        const int d02 =  dct[0][i] -  dct[2][i];
+        const int s13 =  dct[1][i] + (dct[3][i]>>1);
+        const int d13 = (dct[1][i]>>1) -  dct[3][i];
 
         tmp[i][0] = s02 + s13;
         tmp[i][1] = d02 + d13;
@@ -217,7 +217,7 @@ static void add4x4_idct( uint8_t *p_dst, int i_dst, int16_t dct[4][4] )
         const int s02 =  tmp[0][i] +  tmp[2][i];
         const int d02 =  tmp[0][i] -  tmp[2][i];
         const int s13 =  tmp[1][i] + (tmp[3][i]>>1);
-        const int d13 = (tmp[1][i]>>1) -  tmp[3][i];
+        const int d13 = (tmp[1][i]>>1) - tmp[3][i];
 
         d[0][i] = ( s02 + s13 + 32 ) >> 6;
         d[1][i] = ( d02 + d13 + 32 ) >> 6;
@@ -273,31 +273,36 @@ static void add16x16_idct( uint8_t *p_dst, int i_dst, int16_t dct[16][4][4] )
     const int a5 = d07 - d34 - (d25 + (d25>>1));\
     const int a6 = d07 + d34 - (d16 + (d16>>1));\
     const int a7 = d16 - d25 + (d34 + (d34>>1));\
-    SRC(0) =  a0 +  a1     ;\
-    SRC(1) =  a4 + (a7>>2) ;\
-    SRC(2) =  a2 + (a3>>1) ;\
-    SRC(3) =  a5 + (a6>>2) ;\
-    SRC(4) =  a0 -  a1     ;\
-    SRC(5) =  a6 - (a5>>2) ;\
-    SRC(6) = (a2>>1) -  a3 ;\
-    SRC(7) = (a4>>2) -  a7 ;\
+    DST(0) =  a0 +  a1     ;\
+    DST(1) =  a4 + (a7>>2) ;\
+    DST(2) =  a2 + (a3>>1) ;\
+    DST(3) =  a5 + (a6>>2) ;\
+    DST(4) =  a0 -  a1     ;\
+    DST(5) =  a6 - (a5>>2) ;\
+    DST(6) = (a2>>1) -  a3 ;\
+    DST(7) = (a4>>2) -  a7 ;\
 }
 
 static void sub8x8_dct8( int16_t dct[8][8], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
 {
     int i;
+    int16_t tmp[8][8];
 
-    pixel_sub_wxh( (int16_t*)dct, 8, pix1, i_pix1, pix2, i_pix2 );
+    pixel_sub_wxh( (int16_t*)tmp, 8, pix1, i_pix1, pix2, i_pix2 );
 
-#define SRC(x) dct[x][i]
+#define SRC(x) tmp[x][i]
+#define DST(x) tmp[x][i]
     for( i = 0; i < 8; i++ )
         DCT8_1D
 #undef SRC
+#undef DST
 
-#define SRC(x) dct[i][x]
+#define SRC(x) tmp[i][x]
+#define DST(x) dct[x][i]
     for( i = 0; i < 8; i++ )
         DCT8_1D
 #undef SRC
+#undef DST
 }
 
 static void sub16x16_dct8( int16_t dct[4][8][8], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
@@ -341,14 +346,14 @@ static void add8x8_idct8( uint8_t *dst, int i_dst, int16_t dct[8][8] )
     dct[0][0] += 32; // rounding for the >>6 at the end
 
-#define SRC(x) dct[i][x]
-#define DST(x,rhs) dct[i][x] = (rhs)
+#define SRC(x) dct[x][i]
+#define DST(x,rhs) dct[x][i] = (rhs)
     for( i = 0; i < 8; i++ )
         IDCT8_1D
 #undef SRC
 #undef DST
 
-#define SRC(x) dct[x][i]
+#define SRC(x) dct[i][x]
 #define DST(x,rhs) dst[i + x*i_dst] = clip_uint8( dst[i + x*i_dst] + ((rhs) >> 6) );
     for( i = 0; i < 8; i++ )
         IDCT8_1D
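The SRC/DST #define pattern above is what implements the transposed layout in C: DCT8_1D and IDCT8_1D are bare macros whose bodies are written in terms of SRC(x) and DST(...), and each loop re-#defines those to select row or column access. A pass whose SRC walks rows while DST walks columns transposes the block as a side effect of transforming it. A toy version of the pattern (my code, a 2-point butterfly instead of the real 8-point transform):

    #include <stdint.h>

    #define BUTTERFLY_1D {\
        int a = SRC(0) + SRC(1);\
        int b = SRC(0) - SRC(1);\
        DST(0) = a;\
        DST(1) = b;\
    }

    /* 2-D toy transform that leaves its output transposed, the same way
     * sub8x8_dct8 does: pass 2 reads rows of tmp but writes columns of out. */
    static void toy_dct2x2( int16_t out[2][2], const int16_t in[2][2] )
    {
        int16_t tmp[2][2];
        int i;

    #define SRC(x) in[x][i]
    #define DST(x) tmp[x][i]
        for( i = 0; i < 2; i++ )
            BUTTERFLY_1D
    #undef SRC
    #undef DST

    #define SRC(x) tmp[i][x]
    #define DST(x) out[x][i]
        for( i = 0; i < 2; i++ )
            BUTTERFLY_1D
    #undef SRC
    #undef DST
    }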
@@ -404,16 +409,19 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
         dctf->dct4x4dc  = x264_dct4x4dc_mmxext;
         dctf->idct4x4dc = x264_idct4x4dc_mmxext;
     }
 #ifndef ARCH_X86_64
-    dctf->sub8x8_dct8   = x264_sub8x8_dct8_mmxext;
-    dctf->sub16x16_dct8 = x264_sub16x16_dct8_mmxext;
+    if( cpu&X264_CPU_MMX )
+    {
+        dctf->sub8x8_dct8   = x264_sub8x8_dct8_mmx;
+        dctf->sub16x16_dct8 = x264_sub16x16_dct8_mmx;
 
-    dctf->add8x8_idct8   = x264_add8x8_idct8_mmxext;
-    dctf->add16x16_idct8 = x264_add16x16_idct8_mmxext;
-#endif
+        dctf->add8x8_idct8   = x264_add8x8_idct8_mmx;
+        dctf->add16x16_idct8 = x264_add16x16_idct8_mmx;
+    }
 #endif
 #endif
 
 #if defined(HAVE_SSE2) && defined(ARCH_X86_64)
     if( cpu&X264_CPU_SSE2 )
@@ -425,7 +433,7 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
         dctf->add16x16_idct8 = x264_add16x16_idct8_sse2;
     }
 #endif
-
+/* FIXME altivec dct is not transposed yet
 #ifdef ARCH_PPC
     if( cpu&X264_CPU_ALTIVEC )
     {
@@ -434,5 +442,6 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
         dctf->sub16x16_dct = x264_sub16x16_dct_altivec;
     }
 #endif
+*/
 }
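x264_dct_init fills a table of function pointers according to CPU flags, which is why the commit can swap implementations in this one place while every caller stays unchanged. A stripped-down sketch of the pattern (struct, function names, and flag are invented for the example, not the real API):

    #include <stdint.h>

    typedef struct
    {
        void (*dct4x4dc)( int16_t d[4][4] );
    } toy_dct_function_t;

    static void dct4x4dc_c( int16_t d[4][4] )   { (void)d; /* portable fallback */ }
    static void dct4x4dc_mmx( int16_t d[4][4] ) { (void)d; /* stand-in for asm */ }

    enum { TOY_CPU_MMX = 1 };

    static void toy_dct_init( int cpu, toy_dct_function_t *dctf )
    {
        dctf->dct4x4dc = dct4x4dc_c;        /* always-correct default first */
        if( cpu & TOY_CPU_MMX )
            dctf->dct4x4dc = dct4x4dc_mmx;  /* override when the CPU allows */
    }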
common/i386/dct-a.asm

@@ -167,21 +167,19 @@ x264_dct4x4dc_mmxext:
     MMX_SUMSUB_BADC     mm2, mm3, mm0, mm4          ; mm2=s01  mm3=d01  mm0=s23  mm4=d23
     MMX_SUMSUB_BADC     mm0, mm2, mm4, mm3          ; mm0=s01+s23  mm2=s01-s23  mm4=d01+d23  mm3=d01-d23
 
-    MMX_TRANSPOSE       mm0, mm2, mm3, mm4, mm1     ; in: mm0, mm2, mm3, mm4  out: mm0, mm4, mm1, mm3
-
     movq    mm6,        [x264_mmx_1 GOT_ebx]
     paddw   mm0,        mm6
-    paddw   mm4,        mm6
+    paddw   mm2,        mm6
     psraw   mm0,        1
     movq    [eax+ 0],   mm0
-    psraw   mm4,        1
-    movq    [eax+ 8],   mm4
-    paddw   mm1,        mm6
+    psraw   mm2,        1
+    movq    [eax+ 8],   mm2
     paddw   mm3,        mm6
-    psraw   mm1,        1
-    movq    [eax+16],   mm1
+    paddw   mm4,        mm6
     psraw   mm3,        1
-    movq    [eax+24],   mm3
+    movq    [eax+16],   mm3
+    psraw   mm4,        1
+    movq    [eax+24],   mm4
 
     picpop  ebx
     ret
@@ -206,12 +204,10 @@ x264_idct4x4dc_mmxext:
     MMX_SUMSUB_BADC     mm2, mm3, mm0, mm4          ; mm2=s01  mm3=d01  mm0=s23  mm4=d23
     MMX_SUMSUB_BADC     mm0, mm2, mm4, mm3          ; mm0=s01+s23  mm2=s01-s23  mm4=d01+d23  mm3=d01-d23
 
-    MMX_TRANSPOSE       mm0, mm2, mm3, mm4, mm1     ; in: mm0, mm2, mm3, mm4  out: mm0, mm4, mm1, mm3
-
     movq    [eax+ 0],   mm0
-    movq    [eax+ 8],   mm4
-    movq    [eax+16],   mm1
-    movq    [eax+24],   mm3
+    movq    [eax+ 8],   mm2
+    movq    [eax+16],   mm3
+    movq    [eax+24],   mm4
     ret
 
 cglobal x264_sub4x4_dct_mmxext
@@ -250,14 +246,11 @@ x264_sub4x4_dct_mmxext:
     MMX_SUMSUB_BA       mm1, mm3                    ; mm1=s03+s12      mm3=s03-s12
     MMX_SUMSUB2_AB      mm2, mm4, mm0               ; mm2=2.d03+d12    mm0=d03-2.d12
 
-    ; transpose in: mm1, mm2, mm3, mm0, out: mm1, mm0, mm4, mm3
-    MMX_TRANSPOSE       mm1, mm2, mm3, mm0, mm4
-
     mov     eax,        [esp+ 8]    ; dct
     movq    [eax+ 0],   mm1
-    movq    [eax+ 8],   mm0
-    movq    [eax+16],   mm4
-    movq    [eax+24],   mm3
+    movq    [eax+ 8],   mm2
+    movq    [eax+16],   mm3
+    movq    [eax+24],   mm0
 
     pop     ebx
     ret
@@ -272,9 +265,9 @@ x264_add4x4_idct_mmxext:
     ; Load dct coeffs
     mov     eax,        [esp+12]    ; dct
     movq    mm0,        [eax+ 0]
-    movq    mm4,        [eax+ 8]
-    movq    mm3,        [eax+16]
-    movq    mm1,        [eax+24]
+    movq    mm1,        [eax+ 8]
+    movq    mm2,        [eax+16]
+    movq    mm3,        [eax+24]
 
     mov     eax,        [esp+ 4]    ; p_dst
     mov     ecx,        [esp+ 8]    ; i_dst
@@ -283,9 +276,6 @@ x264_add4x4_idct_mmxext:
     picpush ebx
     picgetgot ebx
 
-    ; out:mm0, mm1, mm2, mm3
-    MMX_TRANSPOSE       mm0, mm4, mm3, mm1, mm2
-
     MMX_SUMSUB_BA       mm2, mm0                    ; mm2=s02  mm0=d02
     MMX_SUMSUBD2_AB     mm1, mm3, mm5, mm4          ; mm1=s13  mm4=d13 ( well 1 + 3>>1 and 1>>1 + 3)
@@ -338,24 +328,11 @@ x264_add4x4_idct_mmxext:
     MMX_SUMSUB_BA       %1, %2
 %endmacro
 
-%macro MMX_STORE_DIFF_8P 6
-    movq        %1, %3
-    movq        %2, %1
-    punpcklbw   %1, %6
-    punpckhbw   %2, %6
-    paddw       %1, %4
-    paddw       %2, %5
-    packuswb    %1, %2
-    movq        %3, %1
-%endmacro
-
 cglobal x264_pixel_sub_8x8_mmx
-cglobal x264_xdct8_mmxext
-cglobal x264_pixel_add_8x8_mmx
+cglobal x264_transpose_8x8_mmx
 cglobal x264_ydct8_mmx
-cglobal x264_xidct8_mmxext
 cglobal x264_yidct8_mmx
+cglobal x264_pixel_add_8x8_mmx
 
 ALIGN 16
 ;-----------------------------------------------------------------------------
@@ -387,78 +364,6 @@ x264_pixel_sub_8x8_mmx:
     pop     ebx
     ret
 
-ALIGN 16
-;-----------------------------------------------------------------------------
-;   void __cdecl x264_xdct8_mmxext( int16_t dest[8][8] );
-;-----------------------------------------------------------------------------
-x264_xdct8_mmxext:
-    mov     eax,        [esp+04]    ; dest
-    picpush ebx
-    picgetgot ebx
-
-    movq    mm5,        [x264_mmx_PPNN GOT_ebx]
-    movq    mm6,        [x264_mmx_PNNP GOT_ebx]
-    movq    mm4,        [x264_mmx_PPPN GOT_ebx]
-    movq    mm7,        [x264_mmx_PPNP GOT_ebx]
-
-;-------------------------------------------------------------------------
-; horizontal dct ( compute 1 row at a time -> 8 loops )
-;-------------------------------------------------------------------------
-%assign disp 0
-%rep 8
-    movq    mm0,        [eax+disp]
-    movq    mm1,        [eax+disp+8]
-
-    pshufw  mm2,        mm1, 00011011b
-    movq    mm1,        mm0
-    paddw   mm0,        mm2             ; (low)s07/s16/d25/s34(high)
-    psubw   mm1,        mm2             ; (low)d07/d16/d25/d34(high)
-    pshufw  mm2,        mm0, 00011011b  ; (low)s34/s25/s16/s07(high)
-    pmullw  mm0,        mm5             ; (low)s07/s16/-s25/-s34(high)
-    paddw   mm0,        mm2             ; (low)a0/a1/a3/a2(high)
-
-    movq    mm3,        mm1
-    psraw   mm1,        1               ; (low)d07/d16/d25/d34(high) (x>>1)
-    pshufw  mm2,        mm3, 10110001b  ; (low)d16/d07/d34/d25(high)
-    paddw   mm1,        mm3             ; (low)d07/d16/d25/d34(high) (x+(x>>1))
-    pshufw  mm3,        mm2, 00011011b  ; (low)d25/d34/d07/d16(high)
-    pmullw  mm2,        mm5             ; (low)d16/d07/-d34/-d25(high)
-    pmullw  mm1,        mm6             ; (low)d07/-d16/-d25/d34(high) (x+(x>>1))
-    paddw   mm3,        mm2
-    paddw   mm1,        mm3             ; (low)a4/a6/a5/a7(high)
-
-    pshufw  mm2,        mm0, 11001001b  ; (low)a1/a3/a0/a2(high)
-    pshufw  mm0,        mm0, 10011100b  ; (low)a0/a2/a1/a3(high)
-    pmullw  mm2,        [x264_mmx_2121 GOT_ebx]
-    pmullw  mm0,        mm5             ; (low)a0/a2/-a1/-a3(high)
-    psraw   mm2,        1               ; (low)a1/a3>>1/a0/a2>>1(high)
-    paddw   mm0,        mm2             ; (low)dst0/dst2/dst4/dst6(high)
-
-    pshufw  mm1,        mm1, 00100111b  ; (low)a7/a6/a5/a4(high)
-    pshufw  mm2,        mm1, 00011011b  ; (low)a4/a5/a6/a7(high)
-    psraw   mm1,        2               ; (low)a7>>2/a6>>2/a5>>2/a4>>2(high)
-    pmullw  mm2,        mm4             ; (low)a4/a5/a6/-a7(high)
-    pmullw  mm1,        mm7             ; (low)a7>>2/a6>>2/-a5>>2/a4>>2(high)
-    paddw   mm1,        mm2             ; (low)dst1/dst3/dst5/dst7(high)
-
-    movq    mm2,        mm0
-    punpcklwd mm0,      mm1             ; (low)dst0/dst1/dst2/dst3(high)
-    punpckhwd mm2,      mm1             ; (low)dst4/dst5/dst6/dst7(high)
-
-    movq    [eax+disp],   mm0
-    movq    [eax+disp+8], mm2
-%assign disp disp+16
-%endrep
-
-    picpop  ebx
-    ret
-
 ALIGN 16
 ;-----------------------------------------------------------------------------
 ;   void __cdecl x264_ydct8_mmx( int16_t dest[8][8] );
@@ -544,73 +449,6 @@ x264_ydct8_mmx:
     ret
 
-ALIGN 16
-;-----------------------------------------------------------------------------
-;   void __cdecl x264_xidct8_mmxext( int16_t dest[8][8] );
-;-----------------------------------------------------------------------------
-x264_xidct8_mmxext:
-    mov     eax,        [esp+04]    ; dest
-    picpush ebx
-    picgetgot ebx
-
-    movq    mm4,        [x264_mmx_PPNN GOT_ebx]
-    movq    mm5,        [x264_mmx_PNPN GOT_ebx]
-    movq    mm6,        [x264_mmx_PPNP GOT_ebx]
-    movq    mm7,        [x264_mmx_PPPN GOT_ebx]
-
-;-------------------------------------------------------------------------
-; horizontal idct ( compute 1 row at a time -> 8 loops )
-;-------------------------------------------------------------------------
-%assign disp 0
-%rep 8
-    pshufw  mm0,        [eax+disp], 11011000b    ; (low)d0,d2,d1,d3(high)
-    pshufw  mm2,        [eax+disp+8], 11011000b  ; (low)d4,d6,d5,d7(high)
-    movq    mm1,        mm0
-    punpcklwd mm0,      mm2             ; (low)d0,d4,d2,d6(high)
-    punpckhwd mm1,      mm2             ; (low)d1,d5,d3,d7(high)
-
-    pshufw  mm2,        mm0, 10110001b  ; (low)d4,d0,d6,d2(high)
-    pmullw  mm0,        [x264_mmx_p2n2p1p1 GOT_ebx] ; (low)2*d0,-2*d4,d2,d6(high)
-    pmullw  mm2,        mm6             ; (low)d4,d0,-d6,d2(high)
-    psraw   mm0,        1               ; (low)d0,-d4,d2>>1,d6>>1(high)
-    paddw   mm0,        mm2             ; (low)e0,e2,e4,e6(high)
-
-    movq    mm3,        mm1             ; (low)d1,d5,d3,d7(high)
-    psraw   mm1,        1               ; (low)d1>>1,d5>>1,d3>>1,d7>>1(high)
-    pshufw  mm2,        mm3, 10110001b  ; (low)d5,d1,d7,d3(high)
-    paddw   mm1,        mm3             ; (low)d1+(d1>>1),d5+(d5>>1),d3+(d3>>1),d7+(d7>>1)(high)
-    pshufw  mm3,        mm2, 00011011b  ; (low)d3,d7,d1,d5(high)
-    pmullw  mm1,        mm4             ; (low)d1+(d1>>1),d5+(d5>>1),-d3-(d3>>1),-d7-(d7>>1)(high)
-    pmullw  mm2,        mm5             ; (low)d5,-d1,d7,-d3(high)
-    paddw   mm1,        mm3
-    paddw   mm1,        mm2             ; (low)e7,e5,e3,e1(high)
-
-    pshufw  mm2,        mm0, 00011011b  ; (low)e6,e4,e2,e0(high)
-    pmullw  mm0,        mm4             ; (low)e0,e2,-e4,-e6(high)
-    pshufw  mm3,        mm1, 00011011b  ; (low)e1,e3,e5,e7(high)
-    psraw   mm1,        2               ; (low)e7>>2,e5>>2,e3>>2,e1>>2(high)
-    pmullw  mm3,        mm6             ; (low)e1,e3,-e5,e7(high)
-    pmullw  mm1,        mm7             ; (low)e7>>2,e5>>2,e3>>2,-e1>>2(high)
-    paddw   mm0,        mm2             ; (low)f0,f2,f4,f6(high)
-    paddw   mm1,        mm3             ; (low)f1,f3,f5,f7(high)
-
-    pshufw  mm3,        mm0, 00011011b  ; (low)f6,f4,f2,f0(high)
-    pshufw  mm2,        mm1, 00011011b  ; (low)f7,f5,f3,f1(high)
-    psubw   mm3,        mm1
-    paddw   mm0,        mm2
-    movq    [eax+disp],   mm0
-    movq    [eax+disp+8], mm3
-%assign disp disp+16
-%endrep
-
-    picpop  ebx
-    ret
-
 ALIGN 16
 ;-----------------------------------------------------------------------------
 ;   void __cdecl x264_yidct8_mmx( int16_t dest[8][8] );
@@ -691,15 +529,6 @@ x264_yidct8_mmx:
     MMX_SUMSUB_BA       mm3, mm2        ; mm3 = g2, mm2 = g5
     MMX_SUMSUB_BA       mm1, mm0        ; mm1 = g3, mm0 = g4
 
-    psraw   mm7,        6
-    psraw   mm6,        6
-    psraw   mm5,        6
-    psraw   mm4,        6
-    psraw   mm3,        6
-    psraw   mm2,        6
-    psraw   mm1,        6
-    psraw   mm0,        6
-
     movq    [eax+disp+0*16],    mm7
     movq    [eax+disp+1*16],    mm5
     movq    [eax+disp+2*16],    mm3
@@ -716,7 +545,7 @@ x264_yidct8_mmx:
 ALIGN 16
 ;-----------------------------------------------------------------------------
-;   void __cdecl x264_pixel_add_8x8_mmx( unit8_t *dst, int i_dst, int16_t src[8][8] );
+;   void __cdecl x264_pixel_add_8x8_mmx( uint8_t *dst, int i_dst, int16_t src[8][8] );
 ;-----------------------------------------------------------------------------
 x264_pixel_add_8x8_mmx:
     mov     eax,        [esp+04]    ; dst

@@ -727,9 +556,69 @@ x264_pixel_add_8x8_mmx: