Commit dee3179d authored by Renaud Dartus

* Alignment in asm functions

* 16-byte alignment for data (needed for SSE)
* Optimization in SSE
parent 5b49dba8
......@@ -2,7 +2,7 @@
* ac3_imdct.h : AC3 IMDCT types
*****************************************************************************
* Copyright (C) 1999, 2000 VideoLAN
* $Id: ac3_imdct.h,v 1.4 2001/06/12 00:30:41 reno Exp $
* $Id: ac3_imdct.h,v 1.5 2001/07/08 23:15:11 reno Exp $
*
* Authors: Michel Kaempf <maxx@via.ecp.fr>
* Renaud Dartus <reno@videolan.org>
......@@ -42,18 +42,19 @@ typedef struct imdct_s
float xsin1[N/4] __attribute__ ((aligned(16)));
float xcos2[N/8] __attribute__ ((aligned(16)));
float xsin2[N/8] __attribute__ ((aligned(16)));
float xcos_sin_sse[128 * 4] __attribute__ ((aligned(16)));
/* Twiddle factor LUT */
complex_t *w[7] __attribute__ ((aligned(16)));
complex_t w_1[1] __attribute__ ((aligned(16)));
float used_for_alignement1;
float used_for_alignement2;
complex_t w_2[2] __attribute__ ((aligned(16)));
complex_t w_4[4] __attribute__ ((aligned(16)));
complex_t w_8[8] __attribute__ ((aligned(16)));
complex_t w_16[16] __attribute__ ((aligned(16)));
complex_t w_32[32] __attribute__ ((aligned(16)));
complex_t w_64[64] __attribute__ ((aligned(16)));
float xcos_sin_sse[128 * 4] __attribute__ ((aligned(16)));
complex_t *w[7] __attribute__ ((aligned(16)));
/* Module used and shortcuts */
struct module_s * p_module;
......
......@@ -2,7 +2,7 @@
* ac3_downmix_3dn.c: accelerated 3D Now! ac3 downmix functions
*****************************************************************************
* Copyright (C) 1999, 2000, 2001 VideoLAN
* $Id: ac3_downmix_3dn.c,v 1.3 2001/07/01 08:49:09 gbazin Exp $
* $Id: ac3_downmix_3dn.c,v 1.4 2001/07/08 23:15:11 reno Exp $
*
* Authors: Renaud Dartus <reno@videolan.org>
*
......@@ -46,6 +46,7 @@ void sqrt2_3dn (void)
void _M( downmix_3f_2r_to_2ch ) (float * samples, dm_par_t * dm_par)
{
__asm__ __volatile__ (
".align 16\n"
"pushl %%ebx\n"
"movl $128, %%ebx\n" /* loop counter */
......@@ -58,6 +59,7 @@ void _M( downmix_3f_2r_to_2ch ) (float * samples, dm_par_t * dm_par)
"movd 8(%%ecx), %%mm7\n" /* slev */
"punpckldq %%mm7, %%mm7\n" /* slev | slev */
".align 16\n"
".loop:\n"
"movq (%%eax), %%mm0\n" /* left */
"movq 2048(%%eax), %%mm1\n" /* right */
......@@ -90,6 +92,7 @@ void _M( downmix_3f_2r_to_2ch ) (float * samples, dm_par_t * dm_par)
void _M( downmix_2f_2r_to_2ch ) (float *samples, dm_par_t * dm_par)
{
__asm__ __volatile__ (
".align 16\n"
"pushl %%ebx\n"
"movl $128, %%ebx\n" /* loop counter */
......@@ -99,6 +102,7 @@ void _M( downmix_2f_2r_to_2ch ) (float *samples, dm_par_t * dm_par)
"movd 8(%%ecx), %%mm7\n" /* slev */
"punpckldq %%mm7, %%mm7\n" /* slev | slev */
".align 16\n"
".loop3:\n"
"movq (%%eax), %%mm0\n" /* left */
"movq 1024(%%eax), %%mm1\n" /* right */
......@@ -127,7 +131,7 @@ void _M( downmix_2f_2r_to_2ch ) (float *samples, dm_par_t * dm_par)
void _M( downmix_3f_1r_to_2ch ) (float *samples, dm_par_t * dm_par)
{
__asm__ __volatile__ (
".align 16\n"
"pushl %%ebx\n"
"movl $128, %%ebx\n" /* loop counter */
......@@ -140,6 +144,7 @@ void _M( downmix_3f_1r_to_2ch ) (float *samples, dm_par_t * dm_par)
"movd 8(%%ecx), %%mm7\n" /* slev */
"punpckldq %%mm7, %%mm7\n" /* slev | slev */
".align 16\n"
".loop4:\n"
"movq (%%eax), %%mm0\n" /* left */
"movq 2048(%%eax), %%mm1\n" /* right */
......@@ -170,6 +175,7 @@ void _M( downmix_3f_1r_to_2ch ) (float *samples, dm_par_t * dm_par)
void _M( downmix_2f_1r_to_2ch ) (float *samples, dm_par_t * dm_par)
{
__asm__ __volatile__ (
".align 16\n"
"pushl %%ebx\n"
"movl $128, %%ebx\n" /* loop counter */
......@@ -179,6 +185,7 @@ void _M( downmix_2f_1r_to_2ch ) (float *samples, dm_par_t * dm_par)
"movd 8(%%ecx), %%mm7\n" /* slev */
"punpckldq %%mm7, %%mm7\n" /* slev | slev */
".align 16\n"
".loop5:\n"
"movq (%%eax), %%mm0\n" /* left */
"movq 1024(%%eax), %%mm1\n" /* right */
......@@ -205,6 +212,7 @@ void _M( downmix_2f_1r_to_2ch ) (float *samples, dm_par_t * dm_par)
void _M( downmix_3f_0r_to_2ch ) (float *samples, dm_par_t * dm_par)
{
__asm__ __volatile__ (
".align 16\n"
"pushl %%ebx\n"
"movl $128, %%ebx\n" /* loop counter */
......@@ -214,6 +222,7 @@ void _M( downmix_3f_0r_to_2ch ) (float *samples, dm_par_t * dm_par)
"movd 4(%%ecx), %%mm6\n" /* clev */
"punpckldq %%mm6, %%mm6\n" /* clev | clev */
".align 16\n"
".loop6:\n"
"movq (%%eax), %%mm0\n" /*left */
"movq 2048(%%eax), %%mm1\n" /* right */
......@@ -240,6 +249,7 @@ void _M( downmix_3f_0r_to_2ch ) (float *samples, dm_par_t * dm_par)
void _M( stream_sample_1ch_to_s16 ) (s16 *s16_samples, float *left)
{
__asm__ __volatile__ (
".align 16\n"
"pushl %%ebx\n"
"pushl %%edx\n"
......@@ -248,6 +258,7 @@ void _M( stream_sample_1ch_to_s16 ) (s16 *s16_samples, float *left)
"punpckldq %%mm7, %%mm7\n" /* sqrt2 | sqrt2 */
"movl $128, %%ebx\n"
".align 16\n"
".loop2:\n"
"movq (%%ecx), %%mm0\n" /* c1 | c0 */
"pfmul %%mm7, %%mm0\n"
......@@ -274,9 +285,11 @@ void _M( stream_sample_2ch_to_s16 ) (s16 *s16_samples, float *left, float *right
{
__asm__ __volatile__ (
".align 16\n"
"pushl %%ebx\n"
"movl $128, %%ebx\n"
".align 16\n"
".loop1:\n"
"movq (%%ecx), %%mm0\n" /* l1 | l0 */
"movq (%%edx), %%mm1\n" /* r1 | r0 */
......
......@@ -2,7 +2,7 @@
* ac3_downmix_sse.c: accelerated SSE ac3 downmix functions
*****************************************************************************
* Copyright (C) 1999, 2000, 2001 VideoLAN
* $Id: ac3_downmix_sse.c,v 1.3 2001/07/01 08:49:09 gbazin Exp $
* $Id: ac3_downmix_sse.c,v 1.4 2001/07/08 23:15:11 reno Exp $
*
* Authors: Renaud Dartus <reno@videolan.org>
* Aaron Holtzman <aholtzma@engr.uvic.ca>
......@@ -41,48 +41,51 @@
/*
 * sqrt2_sse: carrier for the single-precision constant 0.7071068
 * (approximately 1/sqrt(2)).
 *
 * The asm label on the declaration fixes the assembler-level symbol name to
 * "sqrt2_sse", so inline assembly can refer to it by that name; the 1-channel
 * s16 conversion routine does "movl $sqrt2_sse, %edx" and then reads a float
 * through %edx. The body contains no C statements, only assembler directives
 * that emit the constant.
 *
 * NOTE(review): the float is only found at the symbol address if nothing is
 * emitted before it (no function prologue, no padding from .align) - confirm
 * with the build flags in use.
 * NOTE(review): two statements emit the constant here (one bare, one preceded
 * by ".align 16"). This looks like the before/after lines of a change shown
 * together - confirm against the real source file.
 */
void sqrt2_sse (void) __asm__ ("sqrt2_sse");
void sqrt2_sse (void)
{
__asm__ (".float 0f0.7071068");
__asm__ (".align 16\n"
".float 0f0.7071068");
}
/*
 * downmix_3f_2r_to_2ch (SSE): mixes five channel buffers down to two, in place.
 *
 * Register inputs (from the operand list): %eax = samples, %ecx = dm_par.
 * Three floats are read from dm_par at byte offsets 0, 4 and 8 - labelled
 * unit, clev and slev by the inline comments - and each is broadcast to all
 * four lanes of an xmm register.
 *
 * Channel buffers are addressed relative to samples, by byte offset:
 *   +0 left, +1024 center, +2048 right, +3072 left surround,
 *   +4096 right surround.
 * Each pass processes 4 floats (16 bytes) per channel and computes
 *   left'  = unit*left  + clev*center + slev*left_surround   -> stored at +0
 *   right' = unit*right + clev*center + slev*right_surround  -> stored at +1024
 * %ebx is the pass counter, initialised to 64 (64 * 16 bytes = 1024 bytes per
 * channel). %ebx is saved and restored by hand with pushl/popl rather than
 * being declared as a clobber.
 *
 * The movaps form requires every accessed address to be 16-byte aligned
 * (movaps faults on unaligned memory operands); movups has no such
 * requirement.
 *
 * NOTE(review): the constant setup, the loop body (a movups form followed by
 * a movaps form) and the final popl each appear twice below, with a single
 * pushl. This looks like the before/after lines of a change shown together -
 * confirm against the real source file before relying on this text.
 */
void _M( downmix_3f_2r_to_2ch ) (float * samples, dm_par_t * dm_par)
{
__asm__ __volatile__ (
".align 16\n"
"pushl %%ebx\n"
"movl $64, %%ebx\n" /* loop counter */
"movl $64, %%ebx\n" /* loop counter */
"movss (%%ecx), %%xmm5\n" /* unit */
"shufps $0, %%xmm5, %%xmm5\n" /* unit | unit | unit | unit */
"movss (%%ecx), %%xmm5\n" /* unit */
"shufps $0, %%xmm5, %%xmm5\n" /* unit | unit | unit | unit */
"movss 4(%%ecx), %%xmm6\n" /* clev */
"shufps $0, %%xmm6, %%xmm6\n" /* clev | clev | clev | clev */
"movss 4(%%ecx), %%xmm6\n" /* clev */
"shufps $0, %%xmm6, %%xmm6\n" /* clev | clev | clev | clev */
"movss 8(%%ecx), %%xmm7\n" /* slev */
"shufps $0, %%xmm7, %%xmm7\n" /* slev | slev | slev | slev */
"movss 8(%%ecx), %%xmm7\n" /* slev */
"shufps $0, %%xmm7, %%xmm7\n" /* slev | slev | slev | slev */
".align 16\n"
".loop:\n"
"movups (%%eax), %%xmm0\n" /* left */
"movups 2048(%%eax), %%xmm1\n" /* right */
"movups 1024(%%eax), %%xmm2\n" /* center */
"movups 3072(%%eax), %%xmm3\n" /* left surround */
"movups 4096(%%eax), %%xmm4\n" /* right surround */
"mulps %%xmm5, %%xmm0\n"
"mulps %%xmm5, %%xmm1\n"
"mulps %%xmm6, %%xmm2\n"
"addps %%xmm2, %%xmm0\n"
"addps %%xmm2, %%xmm1\n"
"mulps %%xmm7, %%xmm3\n"
"mulps %%xmm7, %%xmm4\n"
"addps %%xmm3, %%xmm0\n"
"addps %%xmm4, %%xmm1\n"
"movups %%xmm0, (%%eax)\n" /* mixed left overwrites the left buffer */
"movups %%xmm1, 1024(%%eax)\n" /* mixed right goes to offset 1024 */
"addl $16, %%eax\n" /* next 4 floats */
"decl %%ebx\n"
"jnz .loop\n"
"movaps (%%eax), %%xmm0\n" /* left */
"movaps 2048(%%eax), %%xmm1\n" /* right */
"movaps 1024(%%eax), %%xmm2\n" /* center */
"movaps 3072(%%eax), %%xmm3\n" /* left surround */
"movaps 4096(%%eax), %%xmm4\n" /* right surround */
"mulps %%xmm5, %%xmm0\n"
"mulps %%xmm5, %%xmm1\n"
"mulps %%xmm6, %%xmm2\n"
"addps %%xmm2, %%xmm0\n"
"addps %%xmm2, %%xmm1\n"
"mulps %%xmm7, %%xmm3\n"
"mulps %%xmm7, %%xmm4\n"
"addps %%xmm3, %%xmm0\n"
"addps %%xmm4, %%xmm1\n"
"movaps %%xmm0, (%%eax)\n" /* mixed left overwrites the left buffer */
"movaps %%xmm1, 1024(%%eax)\n" /* mixed right goes to offset 1024 */
"addl $16, %%eax\n" /* next 4 floats */
"decl %%ebx\n"
"jnz .loop\n"
"popl %%ebx\n"
"popl %%ebx\n"
: "=a" (samples)
: "a" (samples), "c" (dm_par));
}
......@@ -90,35 +93,37 @@ void _M( downmix_3f_2r_to_2ch ) (float * samples, dm_par_t * dm_par)
/*
 * downmix_2f_2r_to_2ch (SSE): mixes four channel buffers down to two, in place.
 *
 * Register inputs (from the operand list): %eax = samples, %ecx = dm_par.
 * Two floats are read from dm_par at byte offsets 0 and 8 - labelled unit and
 * slev by the inline comments - and broadcast to all four xmm lanes.
 *
 * Channel buffers, by byte offset from samples:
 *   +0 left, +1024 right, +2048 left surround, +3072 right surround.
 * Each pass processes 4 floats (16 bytes) per channel and computes
 *   left'  = unit*left  + slev*left_surround   -> stored at +0
 *   right' = unit*right + slev*right_surround  -> stored at +1024
 * %ebx is the pass counter, initialised to 64, and is saved/restored by hand
 * with pushl/popl.
 *
 * The movaps form requires 16-byte aligned addresses; movups does not.
 *
 * NOTE(review): parts of the setup and the whole loop body (a movups form
 * followed by a movaps form, each ending in popl) appear twice below, with a
 * single pushl. This looks like the before/after lines of a change shown
 * together - confirm against the real source file.
 */
void _M( downmix_2f_2r_to_2ch ) (float *samples, dm_par_t * dm_par)
{
__asm__ __volatile__ (
".align 16\n"
"pushl %%ebx\n"
"movl $64, %%ebx\n" /* loop counter */
"movss (%%ecx), %%xmm5\n" /* unit */
"movss (%%ecx), %%xmm5\n" /* unit */
"shufps $0, %%xmm5, %%xmm5\n" /* unit | unit | unit | unit */
"movss 8(%%ecx), %%xmm7\n" /* slev */
"shufps $0, %%xmm7, %%xmm7\n" /* slev | slev | slev | slev */
"movss 8(%%ecx), %%xmm7\n" /* slev */
"shufps $0, %%xmm7, %%xmm7\n" /* slev | slev | slev | slev */
".align 16\n"
".loop3:\n"
"movups (%%eax), %%xmm0\n" /* left */
"movups 1024(%%eax), %%xmm1\n" /* right */
"movups 2048(%%eax), %%xmm3\n" /* left surround */
"movups 3072(%%eax), %%xmm4\n" /* right surround */
"mulps %%xmm5, %%xmm0\n"
"mulps %%xmm5, %%xmm1\n"
"mulps %%xmm7, %%xmm3\n"
"mulps %%xmm7, %%xmm4\n"
"addps %%xmm3, %%xmm0\n"
"addps %%xmm4, %%xmm1\n"
"movups %%xmm0, (%%eax)\n" /* mixed left */
"movups %%xmm1, 1024(%%eax)\n" /* mixed right */
"addl $16, %%eax\n" /* next 4 floats */
"decl %%ebx\n"
"jnz .loop3\n"
"popl %%ebx\n"
"movaps (%%eax), %%xmm0\n" /* left */
"movaps 1024(%%eax), %%xmm1\n" /* right */
"movaps 2048(%%eax), %%xmm3\n" /* left surround */
"movaps 3072(%%eax), %%xmm4\n" /* right surround */
"mulps %%xmm5, %%xmm0\n"
"mulps %%xmm5, %%xmm1\n"
"mulps %%xmm7, %%xmm3\n"
"mulps %%xmm7, %%xmm4\n"
"addps %%xmm3, %%xmm0\n"
"addps %%xmm4, %%xmm1\n"
"movaps %%xmm0, (%%eax)\n" /* mixed left */
"movaps %%xmm1, 1024(%%eax)\n" /* mixed right */
"addl $16, %%eax\n" /* next 4 floats */
"decl %%ebx\n"
"jnz .loop3\n"
"popl %%ebx\n"
: "=a" (samples)
: "a" (samples), "c" (dm_par));
}
......@@ -126,112 +131,114 @@ void _M( downmix_2f_2r_to_2ch ) (float *samples, dm_par_t * dm_par)
/*
 * downmix_3f_1r_to_2ch (SSE): mixes four channel buffers down to two, in place.
 *
 * Register inputs (from the operand list): %eax = samples, %ecx = dm_par.
 * Three floats are read from dm_par at byte offsets 0, 4 and 8 - labelled
 * unit, clev and slev by the inline comments - and broadcast to all four xmm
 * lanes.
 *
 * Channel buffers, by byte offset from samples:
 *   +0 left, +1024 center, +2048 right, +3072 surround.
 * Each pass processes 4 floats (16 bytes) per channel and computes
 *   left'  = unit*left  + clev*center - slev*surround  -> stored at +0
 *   right' = unit*right + clev*center + slev*surround  -> stored at +1024
 * Note the surround term is subtracted on the left and added on the right.
 * %ebx is the pass counter, initialised to 64, and is saved/restored by hand
 * with pushl/popl.
 *
 * The movaps form requires 16-byte aligned addresses; movups does not.
 *
 * NOTE(review): the pushl/movl setup, the constant loads and the loop body
 * (a movups form followed by a movaps form, each ending in popl) all appear
 * twice below. This looks like the before/after lines of a change shown
 * together - confirm against the real source file.
 */
void _M( downmix_3f_1r_to_2ch ) (float *samples, dm_par_t * dm_par)
{
__asm__ __volatile__ (
".align 16\n"
"pushl %%ebx\n"
"movl $64, %%ebx\n" /* loop counter */
"pushl %%ebx\n"
"movl $64, %%ebx\n" /* loop counter */
"movss (%%ecx), %%xmm5\n" /* unit */
"shufps $0, %%xmm5, %%xmm5\n" /* unit | unit | unit | unit */
"movss (%%ecx), %%xmm5\n" /* unit */
"shufps $0, %%xmm5, %%xmm5\n" /* unit | unit | unit | unit */
"movss 4(%%ecx), %%xmm6\n" /* clev */
"shufps $0, %%xmm6, %%xmm6\n" /* clev | clev | clev | clev */
"movss 4(%%ecx), %%xmm6\n" /* clev */
"shufps $0, %%xmm6, %%xmm6\n" /* clev | clev | clev | clev */
"movss 8(%%ecx), %%xmm7\n" /* slev */
"shufps $0, %%xmm7, %%xmm7\n" /* slev | slev | slev | slev */
"movss 8(%%ecx), %%xmm7\n" /* slev */
"shufps $0, %%xmm7, %%xmm7\n" /* slev | slev | slev | slev */
".align 16\n"
".loop4:\n"
"movups (%%eax), %%xmm0\n" /* left */
"movups 2048(%%eax), %%xmm1\n" /* right */
"movups 1024(%%eax), %%xmm2\n" /* center */
"movups 3072(%%eax), %%xmm3\n" /* sur */
"mulps %%xmm5, %%xmm0\n"
"mulps %%xmm5, %%xmm1\n"
"mulps %%xmm6, %%xmm2\n"
"addps %%xmm2, %%xmm0\n"
"mulps %%xmm7, %%xmm3\n"
"addps %%xmm2, %%xmm1\n"
"subps %%xmm3, %%xmm0\n" /* left: surround subtracted */
"addps %%xmm3, %%xmm1\n" /* right: surround added */
"movups %%xmm0, (%%eax)\n" /* mixed left */
"movups %%xmm1, 1024(%%eax)\n" /* mixed right */
"addl $16, %%eax\n" /* next 4 floats */
"decl %%ebx\n"
"jnz .loop4\n"
"popl %%ebx\n"
"movaps (%%eax), %%xmm0\n" /* left */
"movaps 2048(%%eax), %%xmm1\n" /* right */
"movaps 1024(%%eax), %%xmm2\n" /* center */
"movaps 3072(%%eax), %%xmm3\n" /* sur */
"mulps %%xmm5, %%xmm0\n"
"mulps %%xmm5, %%xmm1\n"
"mulps %%xmm6, %%xmm2\n"
"addps %%xmm2, %%xmm0\n"
"mulps %%xmm7, %%xmm3\n"
"addps %%xmm2, %%xmm1\n"
"subps %%xmm3, %%xmm0\n" /* left: surround subtracted */
"addps %%xmm3, %%xmm1\n" /* right: surround added */
"movaps %%xmm0, (%%eax)\n" /* mixed left */
"movaps %%xmm1, 1024(%%eax)\n" /* mixed right */
"addl $16, %%eax\n" /* next 4 floats */
"decl %%ebx\n"
"jnz .loop4\n"
"popl %%ebx\n"
: "=a" (samples)
: "a" (samples), "c" (dm_par));
}
/*
 * downmix_2f_1r_to_2ch (SSE): mixes three channel buffers down to two, in
 * place.
 *
 * Register inputs (from the operand list): %eax = samples, %ecx = dm_par.
 * Two floats are read from dm_par at byte offsets 0 and 8 - labelled unit and
 * slev by the inline comments - and broadcast to all four xmm lanes.
 *
 * Channel buffers, by byte offset from samples:
 *   +0 left, +1024 right, +2048 surround.
 * Each pass processes 4 floats (16 bytes) per channel and computes
 *   left'  = unit*left  - slev*surround  -> stored at +0
 *   right' = unit*right + slev*surround  -> stored at +1024
 * %ebx is the pass counter, initialised to 64, and is saved/restored by hand
 * with pushl/popl.
 *
 * The movaps form requires 16-byte aligned addresses; movups does not.
 *
 * NOTE(review): the setup, the loop body (a movups form and a movaps form)
 * and the operand list all appear twice below, and the first operand list
 * sits in the middle of the instruction strings, so this text is not valid C
 * as written. It looks like the before/after lines of a change shown
 * together - confirm against the real source file.
 */
void _M( downmix_2f_1r_to_2ch ) (float *samples, dm_par_t * dm_par)
{
__asm__ __volatile__ (
"pushl %%ebx\n"
"movl $64, %%ebx\n" /* loop counter */
".align 16\n"
"pushl %%ebx\n"
"movl $64, %%ebx\n" /* loop counter */
"movss (%%ecx), %%xmm5\n" /* unit */
"shufps $0, %%xmm5, %%xmm5\n" /* unit | unit | unit | unit */
"movss (%%ecx), %%xmm5\n" /* unit */
"shufps $0, %%xmm5, %%xmm5\n" /* unit | unit | unit | unit */
"movss 8(%%ecx), %%xmm7\n" /* slev */
"shufps $0, %%xmm7, %%xmm7\n" /* slev | slev | slev | slev */
"movss 8(%%ecx), %%xmm7\n" /* slev */
"shufps $0, %%xmm7, %%xmm7\n" /* slev | slev | slev | slev */
".align 16\n"
".loop5:\n"
"movups (%%eax), %%xmm0\n" /* left */
"movups 1024(%%eax), %%xmm1\n" /* right */
"movups 2048(%%eax), %%xmm3\n" /* sur */
"mulps %%xmm5, %%xmm0\n"
"mulps %%xmm5, %%xmm1\n"
"mulps %%xmm7, %%xmm3\n"
"subps %%xmm3, %%xmm0\n" /* left: surround subtracted */
"addps %%xmm3, %%xmm1\n" /* right: surround added */
"movups %%xmm0, (%%eax)\n" /* mixed left */
"movups %%xmm1, 1024(%%eax)\n" /* mixed right */
"addl $16, %%eax\n" /* next 4 floats */
"decl %%ebx\n"
"jnz .loop5\n"
"popl %%ebx\n"
: "=a" (samples)
: "a" (samples), "c" (dm_par));
"movaps (%%eax), %%xmm0\n" /* left */
"movaps 1024(%%eax), %%xmm1\n" /* right */
"movaps 2048(%%eax), %%xmm3\n" /* sur */
"mulps %%xmm5, %%xmm0\n"
"mulps %%xmm5, %%xmm1\n"
"mulps %%xmm7, %%xmm3\n"
"subps %%xmm3, %%xmm0\n" /* left: surround subtracted */
"addps %%xmm3, %%xmm1\n" /* right: surround added */
"movaps %%xmm0, (%%eax)\n" /* mixed left */
"movaps %%xmm1, 1024(%%eax)\n" /* mixed right */
"addl $16, %%eax\n" /* next 4 floats */
"decl %%ebx\n"
"jnz .loop5\n"
"popl %%ebx\n"
: "=a" (samples)
: "a" (samples), "c" (dm_par));
}
/*
 * downmix_3f_0r_to_2ch (SSE): mixes three channel buffers down to two, in
 * place.
 *
 * Register inputs (from the operand list): %eax = samples, %ecx = dm_par.
 * Two floats are read from dm_par at byte offsets 0 and 4 - labelled unit and
 * clev by the inline comments - and broadcast to all four xmm lanes.
 *
 * Channel buffers, by byte offset from samples:
 *   +0 left, +1024 center, +2048 right.
 * Each pass processes 4 floats (16 bytes) per channel and computes
 *   left'  = unit*left  + clev*center  -> stored at +0
 *   right' = unit*right + clev*center  -> stored at +1024
 * %ebx is the pass counter, initialised to 64, and is saved/restored by hand
 * with pushl/popl.
 *
 * The movaps form requires 16-byte aligned addresses; movups does not.
 *
 * NOTE(review): the pushl/movl setup, the constant loads and the loop body
 * (a movups form followed by a movaps form, each ending in popl) all appear
 * twice below. This looks like the before/after lines of a change shown
 * together - confirm against the real source file.
 */
void _M( downmix_3f_0r_to_2ch ) (float *samples, dm_par_t * dm_par)
{
__asm__ __volatile__ (
"pushl %%ebx\n"
"movl $64, %%ebx\n" /* loop counter */
".align 16\n"
"pushl %%ebx\n"
"movl $64, %%ebx\n" /* loop counter */
"movss (%%ecx), %%xmm5\n" /* unit */
"shufps $0, %%xmm5, %%xmm5\n" /* unit | unit | unit | unit */
"movss (%%ecx), %%xmm5\n" /* unit */
"shufps $0, %%xmm5, %%xmm5\n" /* unit | unit | unit | unit */
"movss 4(%%ecx), %%xmm6\n" /* clev */
"shufps $0, %%xmm6, %%xmm6\n" /* clev | clev | clev | clev */
"movss 4(%%ecx), %%xmm6\n" /* clev */
"shufps $0, %%xmm6, %%xmm6\n" /* clev | clev | clev | clev */
".align 16\n"
".loop6:\n"
"movups (%%eax), %%xmm0\n" /* left */
"movups 2048(%%eax), %%xmm1\n" /* right */
"movups 1024(%%eax), %%xmm2\n" /* center */
"mulps %%xmm5, %%xmm0\n"
"mulps %%xmm5, %%xmm1\n"
"mulps %%xmm6, %%xmm2\n"
"addps %%xmm2, %%xmm0\n"
"addps %%xmm2, %%xmm1\n"
"movups %%xmm0, (%%eax)\n" /* mixed left */
"movups %%xmm1, 1024(%%eax)\n" /* mixed right */
"addl $16, %%eax\n" /* next 4 floats */
"decl %%ebx\n"
"jnz .loop6\n"
"popl %%ebx\n"
"movaps (%%eax), %%xmm0\n" /* left */
"movaps 2048(%%eax), %%xmm1\n" /* right */
"movaps 1024(%%eax), %%xmm2\n" /* center */
"mulps %%xmm5, %%xmm0\n"
"mulps %%xmm5, %%xmm1\n"
"mulps %%xmm6, %%xmm2\n"
"addps %%xmm2, %%xmm0\n"
"addps %%xmm2, %%xmm1\n"
"movaps %%xmm0, (%%eax)\n" /* mixed left */
"movaps %%xmm1, 1024(%%eax)\n" /* mixed right */
"addl $16, %%eax\n" /* next 4 floats */
"decl %%ebx\n"
"jnz .loop6\n"
"popl %%ebx\n"
: "=a" (samples)
: "a" (samples), "c" (dm_par));
}
......@@ -239,24 +246,26 @@ void _M( downmix_3f_0r_to_2ch ) (float *samples, dm_par_t * dm_par)
void _M( stream_sample_1ch_to_s16 ) (s16 *s16_samples, float *left)
{
__asm__ __volatile__ (
".align 16\n"
"pushl %%ebx\n"
"pushl %%edx\n"
"movl $sqrt2_sse, %%edx\n"
"movss (%%edx), %%xmm7\n"
"shufps $0, %%xmm7, %%xmm7\n" /* sqrt2 | sqrt2 | sqrt2 | sqrt2 */
"movl $64, %%ebx\n"
"movss (%%edx), %%xmm7\n"
"shufps $0, %%xmm7, %%xmm7\n" /* sqrt2 | sqrt2 | sqrt2 | sqrt2 */
"movl $64, %%ebx\n"
".align 16\n"
".loop2:\n"
"movups (%%ecx), %%xmm0\n" /* c3 | c2 | c1 | c0 */
"movaps (%%ecx), %%xmm0\n" /* c3 | c2 | c1 | c0 */
"mulps %%xmm7, %%xmm0\n"
"movhlps %%xmm0, %%xmm2\n" /* c3 | c2 */
"movhlps %%xmm0, %%xmm2\n" /* c3 | c2 */
"cvtps2pi %%xmm0, %%mm0\n" /* c1 c0 --> mm0, int_32 */
"cvtps2pi %%xmm2, %%mm1\n" /* c3 c2 --> mm1, int_32 */
"cvtps2pi %%xmm0, %%mm0\n" /* c1 c0 --> mm0, int_32 */
"cvtps2pi %%xmm2, %%mm1\n" /* c3 c2 --> mm1, int_32 */
"packssdw %%mm0, %%mm0\n" /* c1 c1 c0 c0 --> mm0, int_16 */
"packssdw %%mm1, %%mm1\n" /* c3 c3 c2 c2 --> mm1, int_16 */
"packssdw %%mm0, %%mm0\n" /* c1 c1 c0 c0 --> mm0, int_16 */
"packssdw %%mm1, %%mm1\n" /* c3 c3 c2 c2 --> mm1, int_16 */
"movq %%mm0, (%%eax)\n"
"movq %%mm1, 8(%%eax)\n"
......@@ -275,18 +284,19 @@ void _M( stream_sample_1ch_to_s16 ) (s16 *s16_samples, float *left)
void _M( stream_sample_2ch_to_s16 ) (s16 *s16_samples, float *left, float *right)
{
__asm__ __volatile__ (
".align 16\n"
"pushl %%ebx\n"
"movl $64, %%ebx\n"
"movl $64, %%ebx\n"
".align 16\n"
".loop1:\n"
"movups (%%ecx), %%xmm0\n" /* l3 | l2 | l1 | l0 */
"movups (%%edx), %%xmm1\n" /* r3 | r2 | r1 | r0 */
"movhlps %%xmm0, %%xmm2\n" /* l3 | l2 */
"movhlps %%xmm1, %%xmm3\n" /* r3 | r2 */
"unpcklps %%xmm1, %%xmm0\n" /* r1 | l1 | r0 | l0 */
"unpcklps %%xmm3, %%xmm2\n" /* r3 | l3 | r2 | l2 */
"movaps (%%ecx), %%xmm0\n" /* l3 | l2 | l1 | l0 */
"movaps (%%edx), %%xmm1\n" /* r3 | r2 | r1 | r0 */
"movhlps %%xmm0, %%xmm2\n" /* l3 | l2 */
"movhlps %%xmm1, %%xmm3\n" /* r3 | r2 */
"unpcklps %%xmm1, %%xmm0\n" /* r1 | l1 | r0 | l0 */
"unpcklps %%xmm3, %%xmm2\n" /* r3 | l3 | r2 | l2 */
"cvtps2pi %%xmm0, %%mm0\n" /* r0 l0 --> mm0, int_32 */
"movhlps %%xmm0, %%xmm0\n"
......@@ -295,8 +305,8 @@ void _M( stream_sample_2ch_to_s16 ) (s16 *s16_samples, float *left, float *right
"movhlps %%xmm2, %%xmm2\n"
"cvtps2pi %%xmm2, %%mm3\n" /* r3 l3 --> mm3, int_32 */
"packssdw %%mm1, %%mm0\n" /* r1 l1 r0 l0 --> mm0, int_16 */
"packssdw %%mm3, %%mm2\n" /* r3 l3 r2 l2 --> mm2, int_16 */
"packssdw %%mm1, %%mm0\n" /* r1 l1 r0 l0 --> mm0, int_16 */
"packssdw %%mm3, %%mm2\n" /* r3 l3 r2 l2 --> mm2, int_16 */
"movq %%mm0, (%%eax)\n"
"movq %%mm2, 8(%%eax)\n"
......
......@@ -2,7 +2,7 @@
* ac3_imdct_3dn.c: accelerated 3D Now! ac3 DCT
*****************************************************************************
* Copyright (C) 1999, 2000 VideoLAN
* $Id: ac3_imdct_3dn.c,v 1.4 2001/06/03 12:47:21 sam Exp $
* $Id: ac3_imdct_3dn.c,v 1.5 2001/07/08 23:15:11 reno Exp $
*
* Authors: Renaud Dartus <reno@videolan.org>
*
......@@ -89,6 +89,7 @@ void _M( imdct_do_512_nol ) (imdct_t * p_imdct, float data[], float delay[])
static void imdct512_pre_ifft_twiddle_3dn (const int *pmt, complex_t *buf, float *data, float *xcos_sin_sse)
{
__asm__ __volatile__ (
".align 16\n"
"pushl %%ebp\n"
"movl %%esp, %%ebp\n"
"addl $-4, %%esp\n" /* local variable, loop counter */
......@@ -106,6 +107,7 @@ static void imdct512_pre_ifft_twiddle_3dn (const int *pmt, complex_t *buf, float
"movl 20(%%ebp), %%edx\n" /* xcos_sin_sse */
"movl $128, -4(%%ebp)\n"
".align 16\n"
".loop:\n"
"movl (%%eax), %%esi\n"
"movd (%%ecx, %%esi, 8), %%mm1\n" /* 2j */
......@@ -147,9 +149,11 @@ static void imdct512_pre_ifft_twiddle_3dn (const int *pmt, complex_t *buf, float
static void imdct512_post_ifft_twiddle_3dn (complex_t *buf, float *xcos_sin_sse)
{
__asm__ __volatile__ (
".align 16\n"
"pushl %%ebx\n"
"movl $64, %%ebx\n" /* loop counter */
".align 16\n"
".loop1:\n"
"movq (%%eax), %%mm0\n" /* im0 | re0 */
"movq %%mm0, %%mm1\n" /* im0 | re0 */
......@@ -200,6 +204,7 @@ static void imdct512_post_ifft_twiddle_3dn (complex_t *buf, float *xcos_sin_sse)
static void imdct512_window_delay_3dn (complex_t *buf, float *data_ptr, float *window_prt, float *delay_prt)
{
__asm__ __volatile__ (
".align 16\n"
"pushl %%ebp\n"
"movl %%esp, %%ebp\n"
......@@ -219,6 +224,7 @@ static void imdct512_window_delay_3dn (complex_t *buf, float *data_ptr, float *w
"leal 504(%%eax), %%edi\n" /* buf[63].re */
"movl 12(%%ebp), %%eax\n" /* data */
".align 16\n"
".first_128_samples:\n"
"movd (%%esi), %%mm0\n" /* im0 */
"movd 8(%%esi), %%mm2\n" /* im1 */
......@@ -258,6 +264,7 @@ static void imdct512_window_delay_3dn (complex_t *buf, float *data_ptr, float *w
"leal 1020(%%esi), %%edi\n" /* buf[127].im */
"movl $32, %%ecx\n" /* loop count */
".align 16\n"
".second_128_samples:\n"
"movd (%%esi), %%mm0\n" /* buf[i].re */
"movd 8(%%esi), %%mm2\n" /* re1 */
......@@ -302,6 +309,7 @@ static void imdct512_window_delay_3dn (complex_t *buf, float *data_ptr, float *w
"movl $32, %%ecx\n" /* loop count */
"movl 20(%%ebp), %%eax\n" /* delay */
".align 16\n"
".first_128_delay:\n"
"movd (%%esi), %%mm0\n" /* re0 */