Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
What's new
7
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in / Register
Toggle navigation
Open sidebar
Steve Lhomme
VLC
Commits
bad0a366
Commit
bad0a366
authored
Jul 26, 2001
by
Renaud Dartus
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
* Format asm functions for gcc
-> fixed the segfaults with imdct_sse -> sound is ugly with imdct_sse in debug mode
parent
332c81bb
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
177 additions
and
225 deletions
+177
-225
doc/vlc.1
doc/vlc.1
+2
-2
plugins/imdct/ac3_imdct_3dn.c
plugins/imdct/ac3_imdct_3dn.c
+79
-108
plugins/imdct/ac3_imdct_sse.c
plugins/imdct/ac3_imdct_sse.c
+73
-86
plugins/imdct/ac3_srfft_sse.c
plugins/imdct/ac3_srfft_sse.c
+23
-29
No files found.
doc/vlc.1
View file @
bad0a366
...
...
@@ -53,10 +53,10 @@ Choose stereo or mono audio output.
Activate hardware AC3 pass-through mode.
.TP
.B \-\-downmix <module>
Specify a module for AC3 downmix: "downmix", "
downmixsse",
for instance.
Specify a module for AC3 downmix: "downmix", "
sse" or "3dn"
for instance.
.TP
.B \-\-imdct <module>
Specify a module for AC3 IMDCT: "imdct",
"imdctsse",
for instance.
Specify a module for AC3 IMDCT: "imdct",
"sse" or "3dn"
for instance.
.TP
.B \-\-novideo
Disable video output.
...
...
plugins/imdct/ac3_imdct_3dn.c
View file @
bad0a366
...
...
@@ -2,7 +2,7 @@
* ac3_imdct_3dn.c: accelerated 3D Now! ac3 DCT
*****************************************************************************
* Copyright (C) 1999, 2000 VideoLAN
* $Id: ac3_imdct_3dn.c,v 1.
5
2001/07/
08 23:15:11
reno Exp $
* $Id: ac3_imdct_3dn.c,v 1.
6
2001/07/
26 20:00:33
reno Exp $
*
* Authors: Renaud Dartus <reno@videolan.org>
*
...
...
@@ -90,23 +90,11 @@ static void imdct512_pre_ifft_twiddle_3dn (const int *pmt, complex_t *buf, float
{
__asm__
__volatile__
(
".align 16
\n
"
"pushl %%ebp
\n
"
"movl %%esp, %%ebp
\n
"
"addl $-4, %%esp
\n
"
/* local variable, loop counter */
"pushl %%eax
\n
"
"pushl %%ebx
\n
"
"pushl %%ecx
\n
"
"pushl %%edx
\n
"
"pushl %%edi
\n
"
"pushl %%esi
\n
"
"movl $128, %%ebx
\n
"
/* loop counter */
"movl 8(%%ebp), %%eax
\n
"
/* pmt */
"movl 12(%%ebp), %%ebx
\n
"
/* buf */
"movl 16(%%ebp), %%ecx
\n
"
/* data */
"movl 20(%%ebp), %%edx
\n
"
/* xcos_sin_sse */
"movl $128, -4(%%ebp)
\n
"
".align 16
\n
"
".loop:
\n
"
"movl (%%eax), %%esi
\n
"
...
...
@@ -126,24 +114,19 @@ static void imdct512_pre_ifft_twiddle_3dn (const int *pmt, complex_t *buf, float
"pfmul %%mm4, %%mm0
\n
"
/* 255-2j * -s_j | 255-2j * c_j */
"pfmul %%mm1, %%mm2
\n
"
/* 2j * -c_j | 2j * -s_j */
"addl $8, %%e
bx
\n
"
"addl $8, %%e
di
\n
"
"pfadd %%mm2, %%mm0
\n
"
/* 2j * -c_j + 255-2j * -s_j | 2j * -s_j + 255-2j * c_j */
"movq %%mm0, -8(%%e
bx
)
\n
"
"decl
-4(
%%eb
p)
\n
"
"movq %%mm0, -8(%%e
di
)
\n
"
"decl %%eb
x
\n
"
"jnz .loop
\n
"
"popl %%esi
\n
"
"popl %%edi
\n
"
"popl %%edx
\n
"
"popl %%ecx
\n
"
"popl %%ebx
\n
"
"popl %%eax
\n
"
"addl $4, %%esp
\n
"
"popl %%ebp
\n
"
"femms
\n
"
::
);
:
"=D"
(
buf
)
:
"a"
(
pmt
),
"c"
(
data
),
"d"
(
xcos_sin_sse
),
"D"
(
buf
));
}
static
void
imdct512_post_ifft_twiddle_3dn
(
complex_t
*
buf
,
float
*
xcos_sin_sse
)
...
...
@@ -205,25 +188,21 @@ static void imdct512_window_delay_3dn (complex_t *buf, float *data_ptr, float *w
{
__asm__
__volatile__
(
".align 16
\n
"
"pushl %%ebp
\n
"
"movl %%esp, %%ebp
\n
"
"pushl %%eax
\n
"
"pushl %%ebx
\n
"
"pushl %%ecx
\n
"
"pushl %%edx
\n
"
"pushl %%esi
\n
"
"pushl %%edi
\n
"
"pushl %%esi
\n
"
"pushl %%ebp
\n
"
"movl 20(%%ebp), %%ebx
\n
"
/* delay */
"movl 16(%%ebp), %%edx
\n
"
/* window */
"movl 8(%%ebp), %%eax
\n
"
/* buf */
"movl $32, %%ecx
\n
"
/* loop count */
"leal 516(%%eax), %%esi
\n
"
/* buf[64].im */
"leal 504(%%eax), %%edi
\n
"
/* buf[63].re */
"movl 12(%%ebp), %%eax
\n
"
/* data */
"movl %%esi, %%ebp
\n
"
/* buf */
"movl $32, %%ebx
\n
"
/* loop count */
"leal 516(%%ebp), %%esi
\n
"
/* buf[64].im */
"leal 504(%%ebp), %%edi
\n
"
/* buf[63].re */
".align 16
\n
"
".first_128_samples:
\n
"
"movd (%%esi), %%mm0
\n
"
/* im0 */
...
...
@@ -241,8 +220,8 @@ static void imdct512_window_delay_3dn (complex_t *buf, float *data_ptr, float *w
"movq (%%edx), %%mm0
\n
"
/* w1 | w0 */
"movq 8(%%edx), %%mm1
\n
"
/* w3 | w2 */
"movq (%%e
b
x), %%mm2
\n
"
/* d1 | d0 */
"movq 8(%%e
b
x), %%mm3
\n
"
/* d3 | d2 */
"movq (%%e
c
x), %%mm2
\n
"
/* d1 | d0 */
"movq 8(%%e
c
x), %%mm3
\n
"
/* d3 | d2 */
"pfmul %%mm4, %%mm0
\n
"
/* w1*re0 | -w0*im0 */
"pfmul %%mm5, %%mm1
\n
"
/* w3*re1 | -w2*im1 */
...
...
@@ -253,16 +232,16 @@ static void imdct512_window_delay_3dn (complex_t *buf, float *data_ptr, float *w
"addl $16, %%edx
\n
"
"movq %%mm0, (%%eax)
\n
"
"movq %%mm1, 8(%%eax)
\n
"
"addl $16, %%e
b
x
\n
"
"addl $16, %%e
c
x
\n
"
"addl $16, %%esi
\n
"
"addl $16, %%eax
\n
"
"addl $-16, %%edi
\n
"
"decl %%e
c
x
\n
"
"decl %%e
b
x
\n
"
"jnz .first_128_samples
\n
"
"movl
8(
%%ebp
)
, %%esi
\n
"
/* buf[0].re */
"
leal 1020(%%esi)
, %%e
di
\n
"
/* buf[127].im
*/
"
movl $32
, %%e
cx
\n
"
/* loop count
*/
"movl %%ebp, %%esi
\n
"
/* buf[0].re */
"
movl $32
, %%e
bx
\n
"
/* loop count
*/
"
leal 1020(%%ebp)
, %%e
di
\n
"
/* buf[127].im
*/
".align 16
\n
"
".second_128_samples:
\n
"
...
...
@@ -270,7 +249,7 @@ static void imdct512_window_delay_3dn (complex_t *buf, float *data_ptr, float *w
"movd 8(%%esi), %%mm2
\n
"
/* re1 */
"movd (%%edi), %%mm1
\n
"
/* buf[127-i].im */
"movd -8(%%edi), %%mm3
\n
"
/* im1 */
"pxor %%mm4, %%mm4
\n
"
"pxor %%mm5, %%mm5
\n
"
"pfsub %%mm0, %%mm4
\n
"
/* -re0 */
...
...
@@ -281,8 +260,8 @@ static void imdct512_window_delay_3dn (complex_t *buf, float *data_ptr, float *w
"movq (%%edx), %%mm0
\n
"
/* w1 | w0 */
"movq 8(%%edx), %%mm1
\n
"
/* w3 | w2 */
"movq (%%e
b
x), %%mm2
\n
"
/* d1 | d0 */
"movq 8(%%e
b
x), %%mm3
\n
"
/* d3 | d2 */
"movq (%%e
c
x), %%mm2
\n
"
/* d1 | d0 */
"movq 8(%%e
c
x), %%mm3
\n
"
/* d3 | d2 */
"addl $16, %%esi
\n
"
...
...
@@ -299,15 +278,14 @@ static void imdct512_window_delay_3dn (complex_t *buf, float *data_ptr, float *w
"addl $16, %%edx
\n
"
"addl $16, %%eax
\n
"
"addl $16, %%e
b
x
\n
"
"decl %%e
c
x
\n
"
"addl $16, %%e
c
x
\n
"
"decl %%e
b
x
\n
"
"jnz .second_128_samples
\n
"
"movl 8(%%ebp), %%eax
\n
"
"leal 512(%%eax), %%esi
\n
"
/* buf[64].re */
"leal 508(%%eax), %%edi
\n
"
/* buf[63].im */
"movl $32, %%ecx
\n
"
/* loop count */
"movl 20(%%ebp), %%eax
\n
"
/* delay */
"leal 512(%%ebp), %%esi
\n
"
/* buf[64].re */
"leal 508(%%ebp), %%edi
\n
"
/* buf[63].im */
"movl $32, %%ebx
\n
"
/* loop count */
"addl $-1024, %%ecx
\n
"
/* delay */
".align 16
\n
"
".first_128_delay:
\n
"
...
...
@@ -333,19 +311,17 @@ static void imdct512_window_delay_3dn (complex_t *buf, float *data_ptr, float *w
"pfmul %%mm4, %%mm0
\n
"
/* w1*im0 | -w0*re0 */
"pfmul %%mm5, %%mm1
\n
"
/* w3*im1 | -w2*re1 */
"movq %%mm0, (%%eax)
\n
"
"movq %%mm1, 8(%%eax)
\n
"
"movq %%mm0, (%%ecx)
\n
"
"movq %%mm1, 8(%%ecx)
\n
"
"addl $16, %%esi
\n
"
"addl $-16, %%edi
\n
"
"addl $16, %%e
a
x
\n
"
"decl %%e
c
x
\n
"
"addl $16, %%e
c
x
\n
"
"decl %%e
b
x
\n
"
"jnz .first_128_delay
\n
"
"movl 8(%%ebp), %%ebx
\n
"
"leal 4(%%ebx), %%esi
\n
"
/* buf[0].im */
"leal 1016(%%ebx), %%edi
\n
"
/* buf[127].re */
"movl $32, %%ecx
\n
"
/* loop count */
"leal 4(%%ebp), %%esi
\n
"
/* buf[0].im */
"leal 1016(%%ebp), %%edi
\n
"
/* buf[127].re */
"movl $32, %%ebx
\n
"
/* loop count */
".align 16
\n
"
".second_128_delay:
\n
"
...
...
@@ -372,48 +348,44 @@ static void imdct512_window_delay_3dn (complex_t *buf, float *data_ptr, float *w
"pfmul %%mm2, %%mm3
\n
"
/* -w3*re1 | w2*im1 */
"movq %%mm1, (%%e
a
x)
\n
"
"movq %%mm3, 8(%%e
a
x)
\n
"
"movq %%mm1, (%%e
c
x)
\n
"
"movq %%mm3, 8(%%e
c
x)
\n
"
"addl $16, %%esi
\n
"
"addl $-16, %%edi
\n
"
"addl $16, %%e
a
x
\n
"
"decl %%e
c
x
\n
"
"addl $16, %%e
c
x
\n
"
"decl %%e
b
x
\n
"
"jnz .second_128_delay
\n
"
"popl %%e
di
\n
"
"popl %%e
bp
\n
"
"popl %%esi
\n
"
"popl %%edi
\n
"
"popl %%edx
\n
"
"popl %%ecx
\n
"
"popl %%ebx
\n
"
"popl %%eax
\n
"
"leave
\n
"
"femms
\n
"
::
);
:
"=S"
(
buf
),
"=a"
(
data_ptr
),
"=c"
(
delay_prt
),
"=d"
(
window_prt
)
:
"S"
(
buf
),
"a"
(
data_ptr
),
"c"
(
delay_prt
),
"d"
(
window_prt
));
}
static
void
imdct512_window_delay_nol_3dn
(
complex_t
*
buf
,
float
*
data_ptr
,
float
*
window_prt
,
float
*
delay_prt
)
{
__asm__
__volatile__
(
".align 16
\n
"
"pushl %%ebp
\n
"
"movl %%esp, %%ebp
\n
"
"pushl %%eax
\n
"
"pushl %%ebx
\n
"
"pushl %%ecx
\n
"
"pushl %%edx
\n
"
"pushl %%esi
\n
"
"pushl %%edi
\n
"
"pushl %%esi
\n
"
"pushl %%ebp
\n
"
"movl 20(%%ebp), %%ebx
\n
"
/* delay */
"movl 16(%%ebp), %%edx
\n
"
/* window */
"movl 8(%%ebp), %%eax
\n
"
/* buf */
"movl $32, %%ecx
\n
"
/* loop count */
"leal 516(%%eax), %%esi
\n
"
/* buf[64].im */
"leal 504(%%eax), %%edi
\n
"
/* buf[63].re */
"movl 12(%%ebp), %%eax
\n
"
/* data */
"movl %%esi, %%ebp
\n
"
/* buf */
"movl $32, %%ebx
\n
"
/* loop count */
"leal 516(%%ebp), %%esi
\n
"
/* buf[64].im */
"leal 504(%%ebp), %%edi
\n
"
/* buf[63].re */
".align 16
\n
"
".first_128_samples2:
\n
"
...
...
@@ -439,16 +411,16 @@ static void imdct512_window_delay_nol_3dn (complex_t *buf, float *data_ptr, floa
"addl $16, %%edx
\n
"
"movq %%mm0, (%%eax)
\n
"
"movq %%mm1, 8(%%eax)
\n
"
"addl $16, %%e
b
x
\n
"
"addl $16, %%e
c
x
\n
"
"addl $16, %%esi
\n
"
"addl $16, %%eax
\n
"
"addl $-16, %%edi
\n
"
"decl %%e
c
x
\n
"
"decl %%e
b
x
\n
"
"jnz .first_128_samples2
\n
"
"movl
8(
%%ebp
)
, %%esi
\n
"
/* buf[0].re */
"
leal 1020(%%esi)
, %%e
di
\n
"
/* buf[127].im
*/
"
movl $32
, %%e
cx
\n
"
/* loop count
*/
"movl %%ebp, %%esi
\n
"
/* buf[0].re */
"
movl $32
, %%e
bx
\n
"
/* loop count
*/
"
leal 1020(%%ebp)
, %%e
di
\n
"
/* buf[127].im
*/
".align 16
\n
"
".second_128_samples2:
\n
"
...
...
@@ -480,15 +452,14 @@ static void imdct512_window_delay_nol_3dn (complex_t *buf, float *data_ptr, floa
"addl $16, %%edx
\n
"
"addl $16, %%eax
\n
"
"addl $16, %%e
b
x
\n
"
"decl %%e
c
x
\n
"
"addl $16, %%e
c
x
\n
"
"decl %%e
b
x
\n
"
"jnz .second_128_samples2
\n
"
"movl 8(%%ebp), %%eax
\n
"
"leal 512(%%eax), %%esi
\n
"
/* buf[64].re */
"leal 508(%%eax), %%edi
\n
"
/* buf[63].im */
"movl $32, %%ecx
\n
"
/* loop count */
"movl 20(%%ebp), %%eax
\n
"
/* delay */
"leal 512(%%ebp), %%esi
\n
"
/* buf[64].re */
"leal 508(%%ebp), %%edi
\n
"
/* buf[63].im */
"movl $32, %%ebx
\n
"
/* loop count */
"addl $-1024, %%ecx
\n
"
/* delay */
".align 16
\n
"
".first_128_delays:
\n
"
...
...
@@ -515,18 +486,17 @@ static void imdct512_window_delay_nol_3dn (complex_t *buf, float *data_ptr, floa
"pfmul %%mm5, %%mm1
\n
"
/* w3*im1 | -w2*re1 */
"movq %%mm0, (%%e
a
x)
\n
"
"movq %%mm1, 8(%%e
a
x)
\n
"
"movq %%mm0, (%%e
c
x)
\n
"
"movq %%mm1, 8(%%e
c
x)
\n
"
"addl $16, %%esi
\n
"
"addl $-16, %%edi
\n
"
"addl $16, %%e
a
x
\n
"
"decl %%e
c
x
\n
"
"addl $16, %%e
c
x
\n
"
"decl %%e
b
x
\n
"
"jnz .first_128_delays
\n
"
"movl 8(%%ebp), %%ebx
\n
"
"leal 4(%%ebx), %%esi
\n
"
/* buf[0].im */
"leal 1016(%%ebx), %%edi
\n
"
/* buf[127].re */
"movl $32, %%ecx
\n
"
/* loop count */
"leal 4(%%ebp), %%esi
\n
"
/* buf[0].im */
"leal 1016(%%ebp), %%edi
\n
"
/* buf[127].re */
"movl $32, %%ebx
\n
"
/* loop count */
".align 16
\n
"
".second_128_delays:
\n
"
...
...
@@ -553,23 +523,24 @@ static void imdct512_window_delay_nol_3dn (complex_t *buf, float *data_ptr, floa
"pfmul %%mm2, %%mm3
\n
"
/* -w3*re1 | w2*im1 */
"movq %%mm1, (%%e
a
x)
\n
"
"movq %%mm3, 8(%%e
a
x)
\n
"
"movq %%mm1, (%%e
c
x)
\n
"
"movq %%mm3, 8(%%e
c
x)
\n
"
"addl $16, %%esi
\n
"
"addl $-16, %%edi
\n
"
"addl $16, %%e
a
x
\n
"
"decl %%e
c
x
\n
"
"addl $16, %%e
c
x
\n
"
"decl %%e
b
x
\n
"
"jnz .second_128_delays
\n
"
"popl %%e
di
\n
"
"popl %%e
bp
\n
"
"popl %%esi
\n
"
"popl %%edi
\n
"
"popl %%edx
\n
"
"popl %%ecx
\n
"
"popl %%ebx
\n
"
"popl %%eax
\n
"
"leave
\n
"
"femms
\n
"
::
);
:
"=S"
(
buf
),
"=a"
(
data_ptr
),
"=c"
(
delay_prt
),
"=d"
(
window_prt
)
:
"S"
(
buf
),
"a"
(
data_ptr
),
"c"
(
delay_prt
),
"d"
(
window_prt
));
}
plugins/imdct/ac3_imdct_sse.c
View file @
bad0a366
...
...
@@ -2,7 +2,7 @@
* ac3_imdct_sse.c: accelerated SSE ac3 DCT
*****************************************************************************
* Copyright (C) 1999, 2000 VideoLAN
* $Id: ac3_imdct_sse.c,v 1.
4
2001/07/
08 23:15:11
reno Exp $
* $Id: ac3_imdct_sse.c,v 1.
5
2001/07/
26 20:00:33
reno Exp $
*
* Authors: Renaud Dartus <reno@videolan.org>
* Aaron Holtzman <aholtzma@engr.uvic.ca>
...
...
@@ -103,10 +103,7 @@ static void imdct512_pre_ifft_twiddle_sse (const int *pmt, complex_t *buf, float
"pushl %%edi
\n
"
"pushl %%esi
\n
"
"movl 8(%%ebp), %%eax
\n
"
/* pmt */
"movl 12(%%ebp), %%ebx
\n
"
/* buf */
"movl 16(%%ebp), %%ecx
\n
"
/* data */
"movl 20(%%ebp), %%edx
\n
"
/* xcos_sin_sse */
"movl %%edi, %%ebx
\n
"
/* buf */
"movl $64, -4(%%ebp)
\n
"
".align 16
\n
"
...
...
@@ -153,7 +150,9 @@ static void imdct512_pre_ifft_twiddle_sse (const int *pmt, complex_t *buf, float
"addl $4, %%esp
\n
"
"popl %%ebp
\n
"
::
);
:
"=D"
(
buf
)
:
"a"
(
pmt
),
"c"
(
data
),
"d"
(
xcos_sin_sse
),
"D"
(
buf
));
}
static
void
imdct512_post_ifft_twiddle_sse
(
complex_t
*
buf
,
float
*
xcos_sin_sse
)
...
...
@@ -226,24 +225,19 @@ static void imdct512_window_delay_sse (complex_t *buf, float *data_ptr, float *w
{
__asm__
__volatile__
(
".align 16
\n
"
"pushl %%ebp
\n
"
"movl %%esp, %%ebp
\n
"
"pushl %%eax
\n
"
"pushl %%ebx
\n
"
"pushl %%ecx
\n
"
"pushl %%edx
\n
"
"pushl %%esi
\n
"
"pushl %%edi
\n
"
"pushl %%esi
\n
"
"pushl %%ebp
\n
"
"movl 20(%%ebp), %%ebx
\n
"
/* delay */
"movl 16(%%ebp), %%edx
\n
"
/* window */
"movl 8(%%ebp), %%eax
\n
"
/* buf */
"movl $16, %%ecx
\n
"
/* loop count */
"leal 516(%%eax), %%esi
\n
"
/* buf[64].im */
"leal 504(%%eax), %%edi
\n
"
/* buf[63].re */
"movl 12(%%ebp), %%eax
\n
"
/* data */
"movl %%esi, %%ebp
\n
"
/* buf */
"movl $16, %%ebx
\n
"
/* loop count */
"leal 516(%%ebp), %%esi
\n
"
/* buf[64].im */
"leal 504(%%ebp), %%edi
\n
"
/* buf[63].re */
".align 16
\n
"
".first_128_samples:
\n
"
...
...
@@ -256,7 +250,7 @@ static void imdct512_window_delay_sse (complex_t *buf, float *data_ptr, float *w
"movlhps %%xmm3, %%xmm1
\n
"
/* 0.0 | re1 | 0.0 | re0 */
"movups (%%edx), %%xmm4
\n
"
/* w3 | w2 | w1 | w0 */
"movaps (%%e
b
x), %%xmm5
\n
"
/* d3 | d2 | d1 | d0 */
"movaps (%%e
c
x), %%xmm5
\n
"
/* d3 | d2 | d1 | d0 */
"shufps $0xb1, %%xmm1, %%xmm1
\n
"
/* re1 | 0.0 | re0 | 0.0 */
"movss 16(%%esi), %%xmm6
\n
"
/* im2 */
...
...
@@ -270,23 +264,23 @@ static void imdct512_window_delay_sse (complex_t *buf, float *data_ptr, float *w
"addps %%xmm5, %%xmm0
\n
"
"shufps $0xb1, %%xmm2, %%xmm2
\n
"
/* re3 | 0.0 | re2 | 0.0 */
"movups 16(%%edx), %%xmm4
\n
"
/* w7 | w6 | w5 | w4 */
"movaps 16(%%e
b
x), %%xmm5
\n
"
/* d7 | d6 | d5 | d4 */
"movaps 16(%%e
c
x), %%xmm5
\n
"
/* d7 | d6 | d5 | d4 */
"subps %%xmm2, %%xmm6
\n
"
/* -re3 | im3 | -re2 | im2 */
"addl $32, %%edx
\n
"
"movaps %%xmm0, (%%eax)
\n
"
"addl $32, %%e
b
x
\n
"
"addl $32, %%e
c
x
\n
"
"mulps %%xmm4, %%xmm6
\n
"
"addl $32, %%esi
\n
"
"addl $32, %%eax
\n
"
"addps %%xmm5, %%xmm6
\n
"
"addl $-32, %%edi
\n
"
"movaps %%xmm6, -16(%%eax)
\n
"
"decl %%e
c
x
\n
"
"decl %%e
b
x
\n
"
"jnz .first_128_samples
\n
"
"movl
8(
%%ebp
)
, %%esi
\n
"
/* buf[0].re */
"
leal 1020(%%esi)
, %%e
di
\n
"
/* buf[127].im
*/
"
movl $16
, %%e
cx
\n
"
/* loop count
*/
"movl %%ebp, %%esi
\n
"
/* buf[0].re */
"
movl $16
, %%e
bx
\n
"
/* loop count
*/
"
leal 1020(%%ebp)
, %%e
di
\n
"
/* buf[127].im
*/
".align 16
\n
"
".second_128_samples:
\n
"
...
...
@@ -299,7 +293,7 @@ static void imdct512_window_delay_sse (complex_t *buf, float *data_ptr, float *w
"movlhps %%xmm3, %%xmm1
\n
"
/* 0.0 | im1 | 0.0 | im0 */
"movups (%%edx), %%xmm4
\n
"
/* w3 | w2 | w1 | w0 */
"movaps (%%e
b
x), %%xmm5
\n
"
/* d3 | d2 | d1 | d0 */
"movaps (%%e
c
x), %%xmm5
\n
"
/* d3 | d2 | d1 | d0 */
"shufps $0xb1, %%xmm1, %%xmm1
\n
"
/* im1 | 0.0 | im0 | 0.0 */
"movss 16(%%esi), %%xmm6
\n
"
/* re2 */
...
...
@@ -317,21 +311,20 @@ static void imdct512_window_delay_sse (complex_t *buf, float *data_ptr, float *w
"addps %%xmm5, %%xmm0
\n
"
"mulps %%xmm4, %%xmm6
\n
"
"addl $-32, %%edi
\n
"
"movaps 16(%%e
b
x), %%xmm5
\n
"
/* d7 | d6 | d5 | d4 */
"movaps 16(%%e
c
x), %%xmm5
\n
"
/* d7 | d6 | d5 | d4 */
"movaps %%xmm0, (%%eax)
\n
"
"addps %%xmm5, %%xmm6
\n
"
"addl $32, %%edx
\n
"
"addl $32, %%eax
\n
"
"addl $32, %%e
b
x
\n
"
"addl $32, %%e
c
x
\n
"
"movaps %%xmm6, -16(%%eax)
\n
"
"decl %%e
c
x
\n
"
"decl %%e
b
x
\n
"
"jnz .second_128_samples
\n
"
"movl 8(%%ebp), %%eax
\n
"
"leal 512(%%eax), %%esi
\n
"
/* buf[64].re */
"leal 508(%%eax), %%edi
\n
"
/* buf[63].im */
"movl $16, %%ecx
\n
"
/* loop count */
"movl 20(%%ebp), %%eax
\n
"
/* delay */
"leal 512(%%ebp), %%esi
\n
"
/* buf[64].re */
"leal 508(%%ebp), %%edi
\n
"
/* buf[63].im */
"movl $16, %%ebx
\n
"
/* loop count */
"addl $-1024, %%ecx
\n
"
/* delay */
".align 16
\n
"
".first_128_delay:
\n
"
...
...
@@ -356,20 +349,19 @@ static void imdct512_window_delay_sse (complex_t *buf, float *data_ptr, float *w
"mulps %%xmm4, %%xmm0
\n
"
"movups (%%edx), %%xmm5
\n
"
/* w7 | w6 | w5 | w4 */
"shufps $0xb1, %%xmm2, %%xmm2
\n
"
/* im3 | 0.0 | im2 | 0.0 */
"movaps %%xmm0, (%%e
a
x)
\n
"
"movaps %%xmm0, (%%e
c
x)
\n
"
"addl $32, %%esi
\n
"
"subps %%xmm2, %%xmm6
\n
"
/* -im3 | re3 | -im2 | re2 */
"addl $-32, %%edi
\n
"
"mulps %%xmm5, %%xmm6
\n
"
"addl $32, %%e
a
x
\n
"
"movaps %%xmm6, -16(%%e
a
x)
\n
"
"decl %%e
c
x
\n
"
"addl $32, %%e
c
x
\n
"
"movaps %%xmm6, -16(%%e
c
x)
\n
"
"decl %%e
b
x
\n
"
"jnz .first_128_delay
\n
"
"movl 8(%%ebp), %%ebx
\n
"
"leal 4(%%ebx), %%esi
\n
"
/* buf[0].im */
"leal 1016(%%ebx), %%edi
\n
"
/* buf[127].re */
"movl $16, %%ecx
\n
"
/* loop count */
"leal 4(%%ebp), %%esi
\n
"
/* buf[0].im */
"leal 1016(%%ebp), %%edi
\n
"
/* buf[127].re */
"movl $16, %%ebx
\n
"
/* loop count */
".align 16
\n
"
".second_128_delay:
\n
"
...
...
@@ -394,49 +386,45 @@ static void imdct512_window_delay_sse (complex_t *buf, float *data_ptr, float *w
"mulps %%xmm4, %%xmm1
\n
"
"movups (%%edx), %%xmm5
\n
"
/* w7 | w6 | w5 | w4 */
"shufps $0xb1, %%xmm2, %%xmm2
\n
"
/* re3 | 0.0 | re2 | 0.0 */
"movaps %%xmm1, (%%e
a
x)
\n
"
"movaps %%xmm1, (%%e
c
x)
\n
"
"addl $32, %%esi
\n
"
"subps %%xmm6, %%xmm2
\n
"
/* re3 | -im3 | re2 | -im2 */
"addl $-32, %%edi
\n
"
"mulps %%xmm5, %%xmm2
\n
"
"addl $32, %%e
a
x
\n
"
"movaps %%xmm2, -16(%%e
a
x)
\n
"
"decl %%e
c
x
\n
"
"addl $32, %%e
c
x
\n
"
"movaps %%xmm2, -16(%%e
c
x)
\n
"
"decl %%e
b
x
\n
"
"jnz .second_128_delay
\n
"
"popl %%e
di
\n
"
"popl %%e
bp
\n
"
"popl %%esi
\n
"
"popl %%edi
\n
"
"popl %%edx
\n
"
"popl %%ecx
\n
"
"popl %%ebx
\n
"
"popl %%eax
\n
"
:
"=S"
(
buf
),
"=a"
(
data_ptr
),
"=c"
(
delay_prt
),
"=d"
(
window_prt
)
:
"S"
(
buf
),
"a"
(
data_ptr
),
"c"
(
delay_prt
),
"d"
(
window_prt
));
"leave
\n
"
::
);
}
static
void
imdct512_window_delay_nol_sse
(
complex_t
*
buf
,
float
*
data_ptr
,
float
*
window_prt
,
float
*
delay_prt
)
{
__asm__
__volatile__
(
".align 16
\n
"
"pushl %%ebp
\n
"
"movl %%esp, %%ebp
\n
"
"pushl %%eax
\n
"
"pushl %%ebx
\n
"
"pushl %%ecx
\n
"
"pushl %%edx
\n
"
"pushl %%esi
\n
"
"pushl %%edi
\n
"
"pushl %%esi
\n
"
"pushl %%ebp
\n
"
/* movl 20(%%ebp), %%ebx delay */
"movl 16(%%ebp), %%edx
\n
"
/* window */
"movl 8(%%ebp), %%eax
\n
"
/* buf */
"movl $16, %%ecx
\n
"
/* loop count */
"leal 516(%%eax), %%esi
\n
"
/* buf[64].im */
"leal 504(%%eax), %%edi
\n
"
/* buf[63].re */
"movl 12(%%ebp), %%eax
\n
"
/* data */
"movl %%esi, %%ebp
\n
"
/* buf */
"movl $16, %%ebx
\n
"
/* loop count */
"leal 516(%%ebp), %%esi
\n
"
/* buf[64].im */
"leal 504(%%ebp), %%edi
\n
"
/* buf[63].re */
".align 16
\n
"
".first_128_sample:
\n
"
...
...
@@ -469,12 +457,12 @@ static void imdct512_window_delay_nol_sse (complex_t *buf, float *data_ptr, floa
"addl $32, %%eax
\n
"
"addl $-32, %%edi
\n
"
"movaps %%xmm6, -16(%%eax)
\n
"
"decl %%e
c
x
\n
"
"decl %%e
b
x
\n
"
"jnz .first_128_sample
\n
"
"movl
8(
%%ebp
)
, %%esi
\n
"
/* buf[0].re */
"
leal 1020(%%esi)
, %%e
di
\n
"
/* buf[127].im
*/
"
movl $16
, %%e
cx
\n
"
/* loop count
*/
"movl %%ebp, %%esi
\n
"
/* buf[0].re */
"
movl $16
, %%e
bx
\n
"
/* loop count
*/
"
leal 1020(%%ebp)
, %%e
di
\n
"
/* buf[127].im
*/
".align 16
\n
"
".second_128_sample:
\n
"
...
...
@@ -507,14 +495,13 @@ static void imdct512_window_delay_nol_sse (complex_t *buf, float *data_ptr, floa
"addl $32, %%edx
\n
"
"addl $32, %%eax
\n
"
"movaps %%xmm6, -16(%%eax)
\n
"
"decl %%e
c
x
\n
"
"decl %%e
b
x
\n
"
"jnz .second_128_sample
\n
"
"movl 8(%%ebp), %%eax
\n
"
"leal 512(%%eax), %%esi
\n
"
/* buf[64].re */
"leal 508(%%eax), %%edi
\n
"
/* buf[63].im */
"movl $16, %%ecx
\n
"
/* loop count */
"movl 20(%%ebp), %%eax
\n
"
/* delay */
"leal 512(%%ebp), %%esi
\n
"
/* buf[64].re */
"leal 508(%%ebp), %%edi
\n
"
/* buf[63].im */
"movl $16, %%ebx
\n
"
/* loop count */
"addl $-1024, %%ecx
\n
"
/* delay */
".align 16
\n
"