Commit 5f232025 authored by Renaud Dartus's avatar Renaud Dartus

* Add 3D Now! imdct

* Remove kludge for ac3 on MacOS X
parent 1ac785a2
......@@ -26,7 +26,7 @@ PLUGINS_TARGETS := alsa/alsa beos/beos darwin/darwin dsp/dsp dummy/dummy \
dvd/dvd esd/esd fb/fb ggi/ggi glide/glide gnome/gnome gtk/gtk \
downmix/downmix downmix/downmixsse downmix/downmix3dn \
idct/idct idct/idctclassic idct/idctmmx idct/idctmmxext \
imdct/imdct imdct/imdctsse \
imdct/imdct imdct/imdct3dn imdct/imdctsse \
macosx/macosx mga/mga \
motion/motion motion/motionmmx motion/motionmmxext \
mpeg/es mpeg/ps mpeg/ts null/null qt/qt sdl/sdl \
......@@ -317,12 +317,7 @@ endif
$(C_OBJ): %.o: Makefile.opts Makefile.dep Makefile
$(C_OBJ): %.o: .dep/%.d
$(C_OBJ): %.o: %.c
ifneq (,$(findstring darwin,$(SYS)))
#this is uglier of all
@if test "src/ac3_decoder/ac3_imdct.c" = "$<"; then $(CC) `echo $(CFLAGS) | sed -e 's/-O3/-O/'` -c -o $@ $<; echo "(CC) `echo $(CFLAGS) | sed -e 's/-O3/-O/'` -c -o $@ $<"; else $(CC) $(CFLAGS) -c -o $@ $<; echo "$(CC) $(CFLAGS) -c -o $@ $<"; fi
else
$(CC) $(CFLAGS) -c -o $@ $<
endif
$(CPP_OBJ): %.o: Makefile.opts Makefile.dep Makefile
$(CPP_OBJ): %.o: .dep/%.dpp
......
......@@ -3285,7 +3285,7 @@ int main() {
EOF
if { (eval echo configure:3287: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
rm -rf conftest*
ACCEL_PLUGINS="${ACCEL_PLUGINS} idctmmxext motionmmxext imdctsse downmix3dn downmixsse"
ACCEL_PLUGINS="${ACCEL_PLUGINS} idctmmxext motionmmxext imdct3dn imdctsse downmix3dn downmixsse"
echo "$ac_t""yes" 1>&6
else
echo "configure: failed program was:" >&5
......
......@@ -162,7 +162,7 @@ AC_TRY_COMPILE([void quux(){void *p;asm("packuswb %%mm1,%%mm2"::"r"(p));}],,
AC_MSG_CHECKING([if \$CC groks MMX EXT or SSE inline assembly])
AC_TRY_COMPILE([void quux(){void *p;asm("maskmovq %%mm1,%%mm2"::"r"(p));}],,
ACCEL_PLUGINS="${ACCEL_PLUGINS} idctmmxext motionmmxext imdctsse downmix3dn downmixsse"
ACCEL_PLUGINS="${ACCEL_PLUGINS} idctmmxext motionmmxext imdct3dn imdctsse downmix3dn downmixsse"
AC_MSG_RESULT(yes), AC_MSG_RESULT(no))
dnl
......
......@@ -9,15 +9,18 @@
PLUGIN_IMDCT = imdct.o ac3_imdct_c.o ac3_srfft_c.o
PLUGIN_IMDCTSSE = imdctsse.o ac3_imdct_sse.o ac3_srfft_sse.o
PLUGIN_IMDCT3DN = imdct3dn.o ac3_imdct_3dn.o ac3_srfft_3dn.o
PLUGIN_IMDCTCOMMON = ac3_imdct_common.o
BUILTIN_IMDCT = $(PLUGIN_IMDCT:%.o=BUILTIN_IMDCT_%.o) \
$(PLUGIN_IMDCTCOMMON:%.o=BUILTIN_IMDCT_%.o)
BUILTIN_IMDCTSSE = $(PLUGIN_IMDCTSSE:%.o=BUILTIN_IMDCTSSE_%.o) \
$(PLUGIN_IMDCTCOMMON:%.o=BUILTIN_IMDCTSSE_%.o)
BUILTIN_IMDCT3DN = $(PLUGIN_IMDCT3DN:%.o=BUILTIN_IMDCT3DN_%.o) \
$(PLUGIN_IMDCTCOMMON:%.o=BUILTIN_IMDCT3DN_%.o)
PLUGIN_C = $(PLUGIN_IMDCT) $(PLUGIN_IMDCTSSE) $(PLUGIN_IMDCTCOMMON)
ALL_OBJ = $(PLUGIN_C) $(BUILTIN_IMDCT) $(BUILTIN_IMDCTSSE)
PLUGIN_C = $(PLUGIN_IMDCT) $(PLUGIN_IMDCTSSE) $(PLUGIN_IMDCT3DN) $(PLUGIN_IMDCTCOMMON)
ALL_OBJ = $(PLUGIN_C) $(BUILTIN_IMDCT) $(BUILTIN_IMDCTSSE) $(BUILTIN_IMDCT3DN)
#
# Virtual targets
......@@ -33,6 +36,10 @@ $(BUILTIN_IMDCTSSE): BUILTIN_IMDCTSSE_%.o: .dep/%.d
$(BUILTIN_IMDCTSSE): BUILTIN_IMDCTSSE_%.o: %.c
$(CC) $(CFLAGS) -DBUILTIN -DMODULE_NAME=imdctsse -c -o $@ $<
# Each BUILTIN_IMDCT3DN_<name>.o depends on .dep/<name>.d and is compiled from
# <name>.c with -DBUILTIN and -DMODULE_NAME=imdct3dn added to $(CFLAGS).
$(BUILTIN_IMDCT3DN): BUILTIN_IMDCT3DN_%.o: .dep/%.d
$(BUILTIN_IMDCT3DN): BUILTIN_IMDCT3DN_%.o: %.c
$(CC) $(CFLAGS) -DBUILTIN -DMODULE_NAME=imdct3dn -c -o $@ $<
#
# Real targets
#
......@@ -51,3 +58,10 @@ $(BUILTIN_IMDCTSSE): BUILTIN_IMDCTSSE_%.o: %.c
ar r $@ $^
$(RANLIB) $@
# Shared plugin: links the 3D Now! IMDCT objects together with the common
# IMDCT object(s) listed in $(PLUGIN_IMDCTCOMMON).
../../lib/imdct3dn.so: $(PLUGIN_IMDCT3DN) $(PLUGIN_IMDCTCOMMON)
$(CC) $(PCFLAGS) -o $@ $^ $(PLCFLAGS)
# Static archive: collects the BUILTIN_IMDCT3DN_* objects with ar, then runs
# $(RANLIB) on the result.
../../lib/imdct3dn.a: $(BUILTIN_IMDCT3DN)
ar r $@ $^
$(RANLIB) $@
/*****************************************************************************
* ac3_imdct_3dn.c: accelerated 3D Now! ac3 DCT
*****************************************************************************
* Copyright (C) 1999, 2000 VideoLAN
* $Id: ac3_imdct_3dn.c,v 1.1 2001/05/16 14:51:29 reno Exp $
*
* Authors: Renaud Dartus <reno@videolan.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
*****************************************************************************/
#define MODULE_NAME imdct3dn
#include "modules_inner.h"
/*****************************************************************************
* Preamble
*****************************************************************************/
#include "defs.h"
#include <math.h>
#include <stdio.h>
#include "config.h"
#include "common.h"
#include "threads.h"
#include "mtime.h"
#include "ac3_imdct.h"
#include "ac3_imdct_common.h"
#include "ac3_retables.h"
/* FFT entry points declared here but not defined in the visible part of this
 * file (presumably provided by ac3_srfft_3dn.c -- TODO confirm). Only
 * fft_128p is called by the code visible below. */
void _M( fft_64p ) ( complex_t *x );
void _M( fft_128p ) ( complex_t *a );
/* Forward declarations of the inline-assembly stages defined further down
 * and called, in this order, by imdct_do_512 / imdct_do_512_nol.
 * Note: the parameter names "window_prt" and "delay_prt" are spelled this way
 * in the definitions as well. */
static void imdct512_pre_ifft_twiddle_3dn (const int *pmt, complex_t *buf, float *data, float *xcos_sin_sse);
static void imdct512_post_ifft_twiddle_3dn (complex_t *buf, float *xcos_sin_sse);
static void imdct512_window_delay_3dn (complex_t *buf, float *data_ptr, float *window_prt, float *delay_prt);
static void imdct512_window_delay_nol_3dn (complex_t *buf, float *data_ptr, float *window_prt, float *delay_prt);
/*****************************************************************************
 * imdct_init: fill the twiddle table used by the 3D Now! IMDCT stages
 *****************************************************************************
 * For every i in [0, 128) four consecutive floats are written to
 * p_imdct->xcos_sin_sse, starting at index 4*i:
 *
 *      [4i] =  c_i    [4i+1] = -s_i    [4i+2] = -s_i    [4i+3] = -c_i
 *
 * with c_i = scale * cos(2*pi*(8i+1)/(8N)) and
 *      s_i = scale * sin(2*pi*(8i+1)/(8N)).
 *
 * The pre- and post-IFFT twiddle routines load these as two packed pairs
 * (-s|c) and (-c|-s) per entry.
 *
 * The function no longer writes trace messages to stderr: the two
 * unconditional fprintf() calls were debugging leftovers.
 *****************************************************************************/
void _M( imdct_init ) (imdct_t * p_imdct)
{
    int i;
    /* NOTE(review): 181.019 is numerically close to 128 * sqrt(2); the
     * intended normalisation is not stated in this file -- confirm. */
    float scale = 181.019;

    for (i=0; i < 128; i++)
    {
        /* The angle is evaluated in double precision, exactly as before;
         * only the final product is narrowed to float. */
        float xcos_i = cos(2.0f * M_PI * (8*i+1)/(8*N)) * scale;
        float xsin_i = sin(2.0f * M_PI * (8*i+1)/(8*N)) * scale;

        p_imdct->xcos_sin_sse[i * 4]     =  xcos_i;
        p_imdct->xcos_sin_sse[i * 4 + 1] = -xsin_i;
        p_imdct->xcos_sin_sse[i * 4 + 2] = -xsin_i;
        p_imdct->xcos_sin_sse[i * 4 + 3] = -xcos_i;
    }
}
/*****************************************************************************
 * imdct_do_512: run the 3D Now! IMDCT stages on one block
 *****************************************************************************
 * Stages, in order: pre-IFFT twiddle (using the pm128 table), fft_128p,
 * post-IFFT twiddle, then imdct512_window_delay_3dn, which windows the
 * result into data[] adding the previous contents of delay[] and then
 * refills delay[].
 *****************************************************************************/
void _M( imdct_do_512 ) (imdct_t * p_imdct, float data[], float delay[])
{
    /* Twiddle table shared by the pre- and post-IFFT stages. */
    float *p_twiddle = p_imdct->xcos_sin_sse;

    imdct512_pre_ifft_twiddle_3dn (pm128, p_imdct->buf, data, p_twiddle);
    _M( fft_128p ) (p_imdct->buf);
    imdct512_post_ifft_twiddle_3dn (p_imdct->buf, p_twiddle);

    imdct512_window_delay_3dn (p_imdct->buf, data, window, delay);
}
/*****************************************************************************
 * imdct_do_512_nol: same pipeline as imdct_do_512, "nol" windowing variant
 *****************************************************************************
 * Stages, in order: pre-IFFT twiddle (using the pm128 table), fft_128p,
 * post-IFFT twiddle, then imdct512_window_delay_nol_3dn. In the visible part
 * of that routine the windowed samples are stored to data[] without adding
 * delay[] (presumably "nol" means "no overlap" -- TODO confirm).
 *****************************************************************************/
void _M( imdct_do_512_nol ) (imdct_t * p_imdct, float data[], float delay[])
{
    /* Twiddle table shared by the pre- and post-IFFT stages. */
    float *p_twiddle = p_imdct->xcos_sin_sse;

    imdct512_pre_ifft_twiddle_3dn (pm128, p_imdct->buf, data, p_twiddle);
    _M( fft_128p ) (p_imdct->buf);
    imdct512_post_ifft_twiddle_3dn (p_imdct->buf, p_twiddle);

    imdct512_window_delay_nol_3dn (p_imdct->buf, data, window, delay);
}
/*****************************************************************************
 * imdct512_pre_ifft_twiddle_3dn: pre-IFFT twiddle, 3D Now! inline assembly
 *****************************************************************************
 * Runs 128 iterations. In iteration k, with j = pmt[k], it stores one packed
 * pair of floats (8 bytes) at buf + 8*k:
 *
 *   low  float = data[255-2j] *  c_j  +  data[2j] * -s_j
 *   high float = data[255-2j] * -s_j  +  data[2j] * -c_j
 *
 * where (c_j, -s_j, -s_j, -c_j) are the four floats at xcos_sin_sse[4j],
 * the layout written by imdct_init. (The low/high floats presumably map to
 * the real/imaginary members of complex_t -- its layout is not visible here.)
 *
 * NOTE(review): the asm statement declares no operands and no clobbers. It
 * builds its own frame (pushl %ebp / movl %esp,%ebp) and then reads the four
 * C arguments from 8..20(%ebp). Those offsets match the arguments only if
 * %esp pointed at the return address when the asm started, i.e. the compiler
 * emitted no prologue of its own and did not inline this function -- confirm
 * the build flags guarantee that. The label ".loop" is not a numeric local
 * label, so the statement cannot be emitted twice in one object file.
 *****************************************************************************/
static void imdct512_pre_ifft_twiddle_3dn (const int *pmt, complex_t *buf, float *data, float *xcos_sin_sse)
{
__asm__ __volatile__ (
/* hand-made frame; every register used below is saved and restored */
"pushl %%ebp\n"
"movl %%esp, %%ebp\n"
"addl $-4, %%esp\n" /* local variable, loop counter */
"pushl %%eax\n"
"pushl %%ebx\n"
"pushl %%ecx\n"
"pushl %%edx\n"
"pushl %%edi\n"
"pushl %%esi\n"
"movl 8(%%ebp), %%eax\n" /* pmt */
"movl 12(%%ebp), %%ebx\n" /* buf */
"movl 16(%%ebp), %%ecx\n" /* data */
"movl 20(%%ebp), %%edx\n" /* xcos_sin_sse */
"movl $128, -4(%%ebp)\n"
".loop:\n"
/* esi = j = *pmt; scale 8 with 4-byte floats addresses data[2j] */
"movl (%%eax), %%esi\n"
"movd (%%ecx, %%esi, 8), %%mm1\n" /* 2j */
"punpckldq %%mm1, %%mm1\n" /* 2j | 2j */
/* esi = 2j; scale 8 gives byte offset 16*j, i.e. xcos_sin_sse[4j] */
"shll $1, %%esi\n"
"movq (%%edx, %%esi, 8), %%mm0\n" /* -s_j | c_j */
"movq 8(%%edx, %%esi, 8), %%mm2\n" /* -c_j | -s_j */
/* esi = -2j; 1020 - 8j bytes is data[255-2j] */
"negl %%esi\n"
"movd 1020(%%ecx, %%esi, 4), %%mm4\n" /* 255-2j */
"punpckldq %%mm4, %%mm4\n" /* 255-2j | 255-2j */
"addl $4, %%eax\n"
"pfmul %%mm4, %%mm0\n" /* 255-2j * -s_j | 255-2j * c_j */
"pfmul %%mm1, %%mm2\n" /* 2j * -c_j | 2j * -s_j */
/* buf is advanced before the store, hence the -8 displacement below */
"addl $8, %%ebx\n"
"pfadd %%mm2, %%mm0\n" /* 2j * -c_j + 255-2j * -s_j | 2j * -s_j + 255-2j * c_j */
"movq %%mm0, -8(%%ebx)\n"
"decl -4(%%ebp)\n"
"jnz .loop\n"
"popl %%esi\n"
"popl %%edi\n"
"popl %%edx\n"
"popl %%ecx\n"
"popl %%ebx\n"
"popl %%eax\n"
"addl $4, %%esp\n"
"popl %%ebp\n"
/* leave MMX/3D Now! state so that x87 code can run again */
"femms\n"
::);
}
/*****************************************************************************
 * imdct512_post_ifft_twiddle_3dn: post-IFFT twiddle, 3D Now! inline assembly
 *****************************************************************************
 * Rewrites 128 packed float pairs of buf in place, two pairs per iteration
 * (64 iterations). For a pair (lo, hi) at buf + 8*k and the table entry
 * (c, -s, -s, -c) at xcos_sin_sse[4k] the result is
 *
 *   new lo = -c * lo - s * hi
 *   new hi = -s * lo + c * hi
 *
 * The inline comments call lo "re" and hi "im".
 *
 * Operand handling: the loop advances both %eax (buf) and %ecx (table), so
 * both are declared as outputs tied to their inputs; modifying an input-only
 * operand is not allowed by GCC. "memory" is clobbered because buf is read
 * and written through the pointer, "cc" because addl/decl change the flags.
 * The loop label is a numeric local label so the statement may be emitted
 * more than once in an object file. %ebx is saved and restored by hand
 * inside the asm, as before.
 *
 * NOTE(review): the MMX registers (and the x87 state emptied by femms) are
 * still not listed as clobbers -- confirm the targeted compilers accept
 * "mm0".."mm7" before adding them.
 *****************************************************************************/
static void imdct512_post_ifft_twiddle_3dn (complex_t *buf, float *xcos_sin_sse)
{
    __asm__ __volatile__ (
    "pushl %%ebx\n"
    "movl $64, %%ebx\n"             /* loop counter */

    "1:\n"
    "movq (%%eax), %%mm0\n"         /* im0 | re0 */
    "movq %%mm0, %%mm1\n"           /* im0 | re0 */
    "punpckldq %%mm0, %%mm0\n"      /* re0 | re0 */
    "punpckhdq %%mm1, %%mm1\n"      /* im0 | im0 */

    "movq (%%ecx), %%mm2\n"         /* -s | c */
    "movq 8(%%ecx), %%mm3\n"        /* -c | -s */
    "movq %%mm3, %%mm4\n"

    "punpckhdq %%mm2,%%mm3\n"       /* -s | -c */
    "punpckldq %%mm2,%%mm4\n"       /* c | -s */

    "movq 8(%%eax), %%mm2\n"        /* im1 | re1 */
    "movq %%mm2, %%mm5\n"           /* im1 | re1 */
    "punpckldq %%mm2, %%mm2\n"      /* re1 | re1 */
    "punpckhdq %%mm5, %%mm5\n"      /* im1 | im1 */

    "pfmul %%mm3, %%mm0\n"          /* -s * re0 | -c * re0 */
    "pfmul %%mm4, %%mm1\n"          /* c * im0 | -s * im0 */

    "movq 16(%%ecx), %%mm6\n"       /* -s1 | c1 */
    "movq 24(%%ecx), %%mm7\n"       /* -c1 | -s1 */
    "movq %%mm7, %%mm4\n"

    "punpckhdq %%mm6,%%mm7\n"       /* -s1 | -c1 */
    "punpckldq %%mm6,%%mm4\n"       /* c1 | -s1 */

    "pfmul %%mm7, %%mm2\n"          /* -s1*re1 | -c1*re1 */
    "pfmul %%mm4, %%mm5\n"          /* c1*im1 | -s1*im1 */

    "pfadd %%mm1, %%mm0\n"          /* -s * re0 + c * im0 | -c * re0 - s * im0 */
    "pfadd %%mm5, %%mm2\n"          /* -s1 * re1 + c1 * im1 | -c1 * re1 - s1 * im1 */

    "movq %%mm0, (%%eax)\n"
    "movq %%mm2, 8(%%eax)\n"
    "addl $32, %%ecx\n"             /* two table entries of four floats */
    "addl $16, %%eax\n"             /* two packed pairs of buf */
    "decl %%ebx\n"
    "jnz 1b\n"

    "popl %%ebx\n"
    "femms\n"                       /* leave MMX/3D Now! state */
    : "=a" (buf), "=c" (xcos_sin_sse)
    : "0" (buf), "1" (xcos_sin_sse)
    : "memory", "cc" );
}
/*****************************************************************************
 * imdct512_window_delay_3dn: windowing + delay update, 3D Now! inline asm
 *****************************************************************************
 * Four loops of 32 iterations, each iteration producing four floats:
 *
 *  1. ".first_128_samples":  data[0..127]   = window * (-im, re pairs taken
 *     from buf[64..127] ascending and buf[63..0] descending) + delay
 *  2. ".second_128_samples": data[128..255] = window * (-re, im pairs taken
 *     from buf[0..] ascending and buf[127..] descending) + delay
 *  3. ".first_128_delay":    delay[0..127]  = window (walked backwards)
 *     * (-re, im pairs from buf[64..] ascending / buf[63..] descending)
 *  4. ".second_128_delay":   delay[128..255] = window (walked backwards)
 *     * (im, -re pairs from buf[0..] ascending / buf[127..] descending)
 *
 * The data, window and delay pointers run on from loop 1 into loop 2; the
 * window pointer then runs backwards through loops 3 and 4.
 *
 * The C parameter names are never referenced: the asm fetches the arguments
 * from 8(%ebp) (buf), 12(%ebp) (data), 16(%ebp) (window), 20(%ebp) (delay).
 *
 * NOTE(review): as in the pre-IFFT routine, the asm builds its own frame and
 * declares no operands or clobbers, so the offsets above are only right when
 * the compiler emitted no prologue of its own and did not inline this
 * function -- confirm. The labels are named, not numeric local labels.
 *
 * NOTE(review): loops 3 and 4 step the window back 16 bytes per iteration
 * but pair window floats with outputs differently: loop 3 stores the product
 * with -8(%edx) first and -16(%edx) second, loop 4 stores the product with
 * -16(%edx) first and -8(%edx) second, and neither reverses the two floats
 * inside a quadword. Check against the reference C implementation whether a
 * fully reversed window was intended.
 *****************************************************************************/
static void imdct512_window_delay_3dn (complex_t *buf, float *data_ptr, float *window_prt, float *delay_prt)
{
__asm__ __volatile__ (
/* hand-made frame; every register used below is saved and restored */
"pushl %%ebp\n"
"movl %%esp, %%ebp\n"
"pushl %%eax\n"
"pushl %%ebx\n"
"pushl %%ecx\n"
"pushl %%edx\n"
"pushl %%esi\n"
"pushl %%edi\n"
"movl 20(%%ebp), %%ebx\n" /* delay */
"movl 16(%%ebp), %%edx\n" /* window */
"movl 8(%%ebp), %%eax\n" /* buf */
"movl $32, %%ecx\n" /* loop count */
"leal 516(%%eax), %%esi\n" /* buf[64].im */
"leal 504(%%eax), %%edi\n" /* buf[63].re */
"movl 12(%%ebp), %%eax\n" /* data */
/* Loop 1: esi walks up, edi walks down, two buf entries each per pass */
".first_128_samples:\n"
"movd (%%esi), %%mm0\n" /* im0 */
"movd 8(%%esi), %%mm2\n" /* im1 */
"movd (%%edi), %%mm1\n" /* re0 */
"movd -8(%%edi), %%mm3\n" /* re1 */
/* negate by subtracting from zero */
"pxor %%mm4, %%mm4\n"
"pxor %%mm5, %%mm5\n"
"pfsub %%mm0, %%mm4\n" /* -im0 */
"pfsub %%mm2, %%mm5\n" /* -im1 */
"punpckldq %%mm1, %%mm4\n" /* re0 | -im0 */
"punpckldq %%mm3, %%mm5\n" /* re1 | -im1 */
"movq (%%edx), %%mm0\n" /* w1 | w0 */
"movq 8(%%edx), %%mm1\n" /* w3 | w2 */
"movq (%%ebx), %%mm2\n" /* d1 | d0 */
"movq 8(%%ebx), %%mm3\n" /* d3 | d2 */
"pfmul %%mm4, %%mm0\n" /* w1*re0 | -w0*im0 */
"pfmul %%mm5, %%mm1\n" /* w3*re1 | -w2*im1 */
"pfadd %%mm2, %%mm0\n" /* w1*re0+d1 | -w0*im0+d0 */
"pfadd %%mm3, %%mm1\n" /* w3*re1+d3 | -w2*im1+d2 */
"addl $16, %%edx\n"
"movq %%mm0, (%%eax)\n"
"movq %%mm1, 8(%%eax)\n"
"addl $16, %%ebx\n"
"addl $16, %%esi\n"
"addl $16, %%eax\n"
"addl $-16, %%edi\n"
"decl %%ecx\n"
"jnz .first_128_samples\n"
/* Loop 2: eax (data), edx (window) and ebx (delay) carry on from loop 1 */
"movl 8(%%ebp), %%esi\n" /* buf[0].re */
"leal 1020(%%esi), %%edi\n" /* buf[127].im */
"movl $32, %%ecx\n" /* loop count */
".second_128_samples:\n"
"movd (%%esi), %%mm0\n" /* buf[i].re */
"movd 8(%%esi), %%mm2\n" /* re1 */
"movd (%%edi), %%mm1\n" /* buf[127-i].im */
"movd -8(%%edi), %%mm3\n" /* im1 */
"pxor %%mm4, %%mm4\n"
"pxor %%mm5, %%mm5\n"
"pfsub %%mm0, %%mm4\n" /* -re0 */
"pfsub %%mm2, %%mm5\n" /* -re1 */
"punpckldq %%mm1, %%mm4\n" /* im0 | -re0 */
"punpckldq %%mm3, %%mm5\n" /* im1 | -re1 */
"movq (%%edx), %%mm0\n" /* w1 | w0 */
"movq 8(%%edx), %%mm1\n" /* w3 | w2 */
"movq (%%ebx), %%mm2\n" /* d1 | d0 */
"movq 8(%%ebx), %%mm3\n" /* d3 | d2 */
"addl $16, %%esi\n"
"pfmul %%mm4, %%mm0\n" /* w1*im0 | -w0*re0 */
"pfmul %%mm5, %%mm1\n" /* w3*im1 | -w2*re1 */
"pfadd %%mm2, %%mm0\n" /* w1*im0+d1 | -w0*re0+d0 */
"pfadd %%mm3, %%mm1\n" /* w3*im1+d3 | -w2*re1+d2 */
"addl $-16, %%edi\n"
"movq %%mm0, (%%eax)\n"
"movq %%mm1, 8(%%eax)\n"
"addl $16, %%edx\n"
"addl $16, %%eax\n"
"addl $16, %%ebx\n"
"decl %%ecx\n"
"jnz .second_128_samples\n"
/* Loop 3: eax now points at the start of delay; edx is left where loop 2
 * stopped and is read at negative offsets, then stepped back */
"movl 8(%%ebp), %%eax\n"
"leal 512(%%eax), %%esi\n" /* buf[64].re */
"leal 508(%%eax), %%edi\n" /* buf[63].im */
"movl $32, %%ecx\n" /* loop count */
"movl 20(%%ebp), %%eax\n" /* delay */
".first_128_delay:\n"
"movd (%%esi), %%mm0\n" /* re0 */
"movd 8(%%esi), %%mm2\n" /* re1 */
"movd (%%edi), %%mm1\n" /* im0 */
"movd -8(%%edi), %%mm3\n" /* im1 */
"pxor %%mm4, %%mm4\n"
"pxor %%mm5, %%mm5\n"
"pfsub %%mm0, %%mm4\n" /* -re0 */
"pfsub %%mm2, %%mm5\n" /* -re1 */
"punpckldq %%mm1, %%mm4\n" /* im0 | -re0 */
"punpckldq %%mm3, %%mm5\n" /* im1 | -re1 */
"movq -16(%%edx), %%mm1\n" /* w3 | w2 */
"movq -8(%%edx), %%mm0\n" /* w1 | w0 */
"addl $-16, %%edx\n"
"pfmul %%mm4, %%mm0\n" /* w1*im0 | -w0*re0 */
"pfmul %%mm5, %%mm1\n" /* w3*im1 | -w2*re1 */
"movq %%mm0, (%%eax)\n"
"movq %%mm1, 8(%%eax)\n"
"addl $16, %%esi\n"
"addl $-16, %%edi\n"
"addl $16, %%eax\n"
"decl %%ecx\n"
"jnz .first_128_delay\n"
/* Loop 4: eax (delay) and edx (window, backwards) carry on from loop 3;
 * ebx is reloaded with buf only to derive esi and edi */
"movl 8(%%ebp), %%ebx\n"
"leal 4(%%ebx), %%esi\n" /* buf[0].im */
"leal 1016(%%ebx), %%edi\n" /* buf[127].re */
"movl $32, %%ecx\n" /* loop count */
".second_128_delay:\n"
"movd (%%esi), %%mm0\n" /* im0 */
"movd 8(%%esi), %%mm2\n" /* im1 */
"movd (%%edi), %%mm1\n" /* re0 */
"movd -8(%%edi), %%mm3\n" /* re1 */
"pxor %%mm4, %%mm4\n"
"pxor %%mm5, %%mm5\n"
"pfsub %%mm1, %%mm4\n" /* -re0 */
"pfsub %%mm3, %%mm5\n" /* -re1 */
"punpckldq %%mm4, %%mm0\n" /* -re0 | im0 */
"punpckldq %%mm5, %%mm2\n" /* -re1 | im1 */
"movq -16(%%edx), %%mm1\n" /* w3 | w2 */
"movq -8(%%edx), %%mm3\n" /* w1 | w0 */
"addl $-16, %%edx\n"
"pfmul %%mm0, %%mm1\n" /* -w1*re0 | w0*im0 */
"pfmul %%mm2, %%mm3\n" /* -w3*re1 | w2*im1 */
"movq %%mm1, (%%eax)\n"
"movq %%mm3, 8(%%eax)\n"
"addl $16, %%esi\n"
"addl $-16, %%edi\n"
"addl $16, %%eax\n"
"decl %%ecx\n"
"jnz .second_128_delay\n"
"popl %%edi\n"
"popl %%esi\n"
"popl %%edx\n"
"popl %%ecx\n"
"popl %%ebx\n"
"popl %%eax\n"
/* leave undoes the hand-made frame (movl %ebp,%esp + popl %ebp) */
"leave\n"
/* leave MMX/3D Now! state so that x87 code can run again */
"femms\n"
::);
}
static void imdct512_window_delay_nol_3dn (complex_t *buf, float *data_ptr, float *window_prt, float *delay_prt)
{
__asm__ __volatile__ (
"pushl %%ebp\n"
"movl %%esp, %%ebp\n"
"pushl %%eax\n"
"pushl %%ebx\n"
"pushl %%ecx\n"
"pushl %%edx\n"
"pushl %%esi\n"
"pushl %%edi\n"
"movl 20(%%ebp), %%ebx\n" /* delay */
"movl 16(%%ebp), %%edx\n" /* window */
"movl 8(%%ebp), %%eax\n" /* buf */
"movl $32, %%ecx\n" /* loop count */
"leal 516(%%eax), %%esi\n" /* buf[64].im */
"leal 504(%%eax), %%edi\n" /* buf[63].re */
"movl 12(%%ebp), %%eax\n" /* data */
".first_128_samples2:\n"
"movd (%%esi), %%mm0\n" /* im0 */
"movd 8(%%esi), %%mm2\n" /* im1 */
"movd (%%edi), %%mm1\n" /* re0 */
"movd -8(%%edi), %%mm3\n" /* re1 */
"pxor %%mm4, %%mm4\n"
"pxor %%mm5, %%mm5\n"
"pfsub %%mm0, %%mm4\n" /* -im0 */
"pfsub %%mm2, %%mm5\n" /* -im1 */
"punpckldq %%mm1, %%mm4\n" /* re0 | -im0 */
"punpckldq %%mm3, %%mm5\n" /* re1 | -im1 */
"movq (%%edx), %%mm0\n" /* w1 | w0 */
"movq 8(%%edx), %%mm1\n" /* w3 | w2 */
"pfmul %%mm4, %%mm0\n" /* w1*re0 | -w0*im0 */
"pfmul %%mm5, %%mm1\n" /* w3*re1 | -w2*im1 */
"addl $16, %%edx\n"
"movq %%mm0, (%%eax)\n"
"movq %%mm1, 8(%%eax)\n"
"addl $16, %%ebx\n"
"addl $16, %%esi\n"
"addl $16, %%eax\n"
"addl $-16, %%edi\n"
"decl %%ecx\n"
"jnz .first_128_samples2\n"
"movl 8(%%ebp), %%esi\n" /* buf[0].re */
"leal 1020(%%esi), %%edi\n" /* buf[127].im */
"movl $32, %%ecx\n" /* loop count */
".second_128_samples2:\n"
"movd (%%esi), %%mm0\n" /* buf[i].re */
"movd 8(%%esi), %%mm2\n" /* re1 */
"movd (%%edi), %%mm1\n" /* buf[127-i].im */
"movd -8(%%edi), %%mm3\n" /* im1 */
"pxor %%mm4, %%mm4\n"
"pxor %%mm5, %%mm5\n"
"pfsub %%mm0, %%mm4\n" /* -re0 */
"pfsub %%mm2, %%mm5\n" /* -re1 */
"punpckldq %%mm1, %%mm4\n" /* im0 | -re0 */
"punpckldq %%mm3, %%mm5\n" /* im1 | -re1 */
"movq (%%edx), %%mm0\n" /* w1 | w0 */
"movq 8(%%edx), %%mm1\n" /* w3 | w2 */
"addl $16, %%esi\n"
"pfmul %%mm4, %%mm0\n" /* w1*im0 | -w0*re0 */
"pfmul %%mm5, %%mm1\n" /* w3*im1 | -w2*re1 */
"addl $-16, %%edi\n"
"movq %%mm0, (%%eax)\n"
"movq %%mm1, 8(%%eax)\n"
"addl $16, %%edx\n"
"addl $16, %%eax\n"
"addl $16, %%ebx\n"
"decl %%ecx\n"
"jnz .second_128_samples2\n"
"movl 8(%%ebp), %%eax\n"
"leal 512(%%eax), %%esi\n" /* buf[64].re */
"leal 508(%%eax), %%edi\n" /* buf[63].im */
"movl $32, %%ecx\n" /* loop count */
"movl 20(%%ebp), %%eax\n" /* delay */
".first_128_delays:\n"
"movd (%%esi), %%mm0\n" /* re0 */
"movd 8(%%esi), %%mm2\n" /* re1 */
"movd (%%edi), %%mm1\n" /* im0 */
"movd -8(%%edi), %%mm3\n" /* im1 */
"pxor %%mm4, %%mm4\n"
"pxor %%mm5, %%mm5\n"
"pfsub %%mm0, %%mm4\n" /* -re0 */
"pfsub %%mm2, %%mm5\n" /* -re1 */
"punpckldq %%mm1, %%mm4\n" /* im0 | -re0 */
"punpckldq %%mm3, %%mm5\n" /* im1 | -re1 */
"movq -16(%%edx), %%mm1\n" /* w3 | w2 */
"movq -8(%%edx), %%mm0\n" /* w1 | w0 */
"addl $-16, %%edx\n"
"pfmul %%mm4, %%mm0\n" /* w1*im0 | -w0*re0 */
"pfmul %%mm5, %%mm1\n" /* w3*im1 | -w2*re1 */
"movq %%mm0, (%%eax)\n"
"movq %%mm1, 8(%%eax)\n"
"addl $16, %%esi\n"
"addl $-16, %%edi\n"
"addl $16, %%eax\n"
"decl %%ecx\n"
"jnz .first_128_delays\n"
"movl 8(%%ebp), %%ebx\n"
"leal 4(%%ebx), %%esi\n" /* buf[0].im */
"leal 1016(%%ebx), %%edi\n" /* buf[127].re */
"movl $32, %%ecx\n" /* loop count */
".second_128_delays:\n"
"movd (%%esi), %%mm0\n" /* im0 */
"movd 8(%%esi), %%mm2\n" /* im1 */
"movd (%%edi), %%mm1\n" /* re0 */
"movd -8(%%edi), %%mm3\n" /* re1 */
"pxor %%mm4, %%mm4\n"
"pxor %%mm5, %%mm5\n"
"pfsub %%mm1, %%mm4\n" /* -re0 */
"pfsub %%mm3, %%mm5\n" /* -re1 */
"punpckldq %%mm4, %%mm0\n" /* -re0 | im0 */
"punpckldq %%mm5, %%mm2\n" /* -re1 | im1 */
"movq -16(%%edx), %%mm1\n" /* w3 | w2 */
"movq -8(%%edx), %%mm3\n" /* w1 | w0 */
"addl $-16, %%edx\n"
"pfmul %%mm0, %%mm1\n" /* -w1*re0 | w0*im0 */
"pfmul %%mm2, %%mm3\n" /* -w3*re1 | w2*im1 */
"movq %%mm1, (%%eax)\n"
"movq %%mm3, 8(%%eax)\n"
"addl $16, %%esi\n"
"addl $-16, %%edi\n"