Commit 746e6074 authored by Christophe Massiot's avatar Christophe Massiot

Altivec IDCT and motion compensation, courtesy of Michel Lespinasse for

mpeg2dec (untested).
parent b0ef1133
......@@ -74,7 +74,6 @@ LIB_GGI = @LIB_GGI@
LIB_GLIDE = @LIB_GLIDE@
LIB_GNOME = @LIB_GNOME@
LIB_GTK = @LIB_GTK@
LIB_ALTIVEC = @LIB_ALTIVEC@
LIB_LIBDVDCSS = @LIB_LIBDVDCSS@
LIB_KDE = @LIB_KDE@
LIB_MACOSX = @LIB_MACOSX@
......@@ -237,7 +236,7 @@ endif
# Optimizations for PowerPC
ifneq (,$(findstring powerpc,$(ARCH)))
CFLAGS += -mmultiple -mhard-float -mstring
CFLAGS += -mmultiple -mhard-float -mstring -Wa,-m7400
endif
# Optimizations for Sparc
......
This diff is collapsed.
......@@ -56,6 +56,7 @@ AC_CHECK_FUNC(inet_aton,,[
])
AC_CHECK_FUNCS(vasprintf)
AC_CHECK_FUNCS(swab)
AC_CHECK_FUNCS(memalign)
AC_CHECK_FUNCS(sigrelse)
......@@ -619,14 +620,12 @@ AC_ARG_ENABLE(macosx,
LIB_MACOSX="-framework CoreAudio -framework Carbon -framework AGL"
LIB_TS="${LIB_TS} -framework AGL -framework Carbon"
LIB_SDL="${LIB_SDL} -framework AGL -framework Carbon"
LIB_ALTIVEC="-framework vecLib"
fi],
[AC_CHECK_HEADERS(Carbon/Carbon.h,
BUILTINS="${BUILTINS} macosx"
LIB_MACOSX="-framework CoreAudio -framework Carbon -framework AGL"
LIB_TS="${LIB_TS} -framework AGL -framework Carbon"
LIB_SDL="${LIB_SDL} -framework AGL -framework Carbon"
LIB_ALTIVEC="-framework vecLib"
)])
dnl
......@@ -959,7 +958,6 @@ AC_SUBST(LIB_GGI)
AC_SUBST(LIB_GLIDE)
AC_SUBST(LIB_GNOME)
AC_SUBST(LIB_GTK)
AC_SUBST(LIB_ALTIVEC)
AC_SUBST(LIB_LIBDVDCSS)
AC_SUBST(LIB_KDE)
AC_SUBST(LIB_MACOSX)
......
......@@ -3,7 +3,7 @@
* Collection of useful common types and macros definitions
*****************************************************************************
* Copyright (C) 1998, 1999, 2000 VideoLAN
* $Id: common.h,v 1.39 2001/08/22 17:21:45 massiot Exp $
* $Id: common.h,v 1.40 2001/09/25 11:46:13 massiot Exp $
*
* Authors: Samuel Hocevar <sam@via.ecp.fr>
* Vincent Seguin <seguin@via.ecp.fr>
......@@ -199,6 +199,15 @@ struct pgrm_descriptor_s;
# define ATTR_ALIGN(align)
#endif
/* Alignment of critical dynamic data structure */
#ifdef HAVE_MEMALIGN
/* Some systems have memalign() but no declaration for it */
void * memalign( size_t align, size_t size );
#else
/* Assume malloc alignment is sufficient */
# define memalign(align,size) malloc(size)
#endif
/* win32, cl and icl support */
#if defined( _MSC_VER )
# define __attribute__(x)
......
......@@ -28,6 +28,9 @@
/* Define if you have the gettimeofday function. */
#undef HAVE_GETTIMEOFDAY
/* Define if you have the memalign function. */
#undef HAVE_MEMALIGN
/* Define if you have the putenv function. */
#undef HAVE_PUTENV
......@@ -94,6 +97,9 @@
/* Define if you have the <getopt.h> header file. */
#undef HAVE_GETOPT_H
/* Define if you have the <gnome.h> header file. */
#undef HAVE_GNOME_H
/* Define if you have the <gtk/gtk.h> header file. */
#undef HAVE_GTK_GTK_H
......
......@@ -2,7 +2,7 @@
* vdec_ext-plugins.h : structures from the video decoder exported to plug-ins
*****************************************************************************
* Copyright (C) 1999, 2000 VideoLAN
* $Id: vdec_ext-plugins.h,v 1.4 2001/09/05 16:07:49 massiot Exp $
* $Id: vdec_ext-plugins.h,v 1.5 2001/09/25 11:46:13 massiot Exp $
*
* Authors: Christophe Massiot <massiot@via.ecp.fr>
*
......@@ -27,7 +27,8 @@
*****************************************************************************/
typedef struct idct_inner_s
{
dctelem_t pi_block[64]; /* block */
/* Should be kept aligned ! */
dctelem_t * pi_block; /* block */
void ( * pf_idct ) ( dctelem_t *, yuv_data_t *, int,
void *, int );
/* sparse IDCT or not, add or copy ? */
......
......@@ -12,17 +12,15 @@ PLUGIN_IDCTCLASSIC = idctclassic.o
PLUGIN_IDCTMMX = idctmmx.o
PLUGIN_IDCTMMXEXT = idctmmxext.o
PLUGIN_IDCTALTIVEC = idctaltivec.o
PLUGIN_IDCTALTIVECASM = idctaltivecasm.o
BUILTIN_IDCT = $(PLUGIN_IDCT:%.o=BUILTIN_IDCT_%.o)
BUILTIN_IDCTCLASSIC = $(PLUGIN_IDCTCLASSIC:%.o=BUILTIN_IDCTCLASSIC_%.o)
BUILTIN_IDCTMMX = $(PLUGIN_IDCTMMX:%.o=BUILTIN_IDCTMMX_%.o)
BUILTIN_IDCTMMXEXT = $(PLUGIN_IDCTMMXEXT:%.o=BUILTIN_IDCTMMXEXT_%.o)
BUILTIN_IDCTALTIVEC = $(PLUGIN_IDCTALTIVEC:%.o=BUILTIN_IDCTALTIVEC_%.o)
BUILTIN_IDCTALTIVECASM = $(PLUGIN_IDCTALTIVECASM:%.o=BUILTIN_IDCTALTIVEC_%.o)
PLUGIN_C = $(PLUGIN_IDCT) $(PLUGIN_IDCTCLASSIC) $(PLUGIN_IDCTMMX) $(PLUGIN_IDCTMMXEXT)
ALL_OBJ = $(PLUGIN_C) $(PLUGIN_IDCTALTIVEC) $(BUILTIN_IDCT) $(BUILTIN_IDCTCLASSIC) $(BUILTIN_IDCTMMX) $(BUILTIN_IDCTMMXEXT) $(BUILTIN_IDCTALTIVEC)
PLUGIN_C = $(PLUGIN_IDCT) $(PLUGIN_IDCTCLASSIC) $(PLUGIN_IDCTMMX) $(PLUGIN_IDCTMMXEXT) $(PLUGIN_IDCTALTIVEC)
ALL_OBJ = $(PLUGIN_C) $(BUILTIN_IDCT) $(BUILTIN_IDCTCLASSIC) $(BUILTIN_IDCTMMX) $(BUILTIN_IDCTMMXEXT) $(BUILTIN_IDCTALTIVEC)
#
# Virtual targets
......@@ -30,13 +28,6 @@ ALL_OBJ = $(PLUGIN_C) $(PLUGIN_IDCTALTIVEC) $(BUILTIN_IDCT) $(BUILTIN_IDCTCLASSI
include ../../Makefile.modules
$(PLUGIN_IDCTALTIVEC): %.o: .dep/%.d
$(PLUGIN_IDCTALTIVEC): %.o: %.c
$(CC) $(CFLAGS) -DPLUGIN $(PCFLAGS) -faltivec -c -o $@ $<
$(PLUGIN_IDCTALTIVECASM): %.o: %.S
$(CC) $(CFLAGS) -DPLUGIN $(PCFLAGS) -faltivec -c -o $@ $<
$(BUILTIN_IDCT): BUILTIN_IDCT_%.o: .dep/%.d
$(BUILTIN_IDCT): BUILTIN_IDCT_%.o: %.c
$(CC) $(CFLAGS) -DBUILTIN -DMODULE_NAME=idct -c -o $@ $<
......@@ -55,10 +46,7 @@ $(BUILTIN_IDCTMMXEXT): BUILTIN_IDCTMMXEXT_%.o: %.c
$(BUILTIN_IDCTALTIVEC): BUILTIN_IDCTALTIVEC_%.o: .dep/%.d
$(BUILTIN_IDCTALTIVEC): BUILTIN_IDCTALTIVEC_%.o: %.c
$(CC) $(CFLAGS) -DBUILTIN -DMODULE_NAME=idctaltivec -faltivec -c -o $@ $<
$(BUILTIN_IDCTALTIVECASM): BUILTIN_IDCTALTIVEC_%.o: %.S
$(CC) $(CFLAGS) -DBUILTIN -DMODULE_NAME=idctaltivec -faltivec -c -o $@ $<
$(CC) $(CFLAGS) -DBUILTIN -DMODULE_NAME=idctaltivec -c -o $@ $<
#
# Real targets
......@@ -92,10 +80,10 @@ $(BUILTIN_IDCTALTIVECASM): BUILTIN_IDCTALTIVEC_%.o: %.S
ar r $@ $^
$(RANLIB) $@
../idctaltivec.so: $(PLUGIN_IDCTALTIVEC) $(PLUGIN_IDCTALTIVECASM)
$(CC) $(PCFLAGS) -o $@ $^ $(PLCFLAGS) $(LIB_ALTIVEC)
../idctaltivec.so: $(PLUGIN_IDCTALTIVEC)
$(CC) $(PCFLAGS) -o $@ $^ $(PLCFLAGS)
../idctaltivec.a: $(BUILTIN_IDCTALTIVEC) $(BUILTIN_IDCTALTIVECASM)
../idctaltivec.a: $(BUILTIN_IDCTALTIVEC)
ar r $@ $^
$(RANLIB) $@
......@@ -2,7 +2,7 @@
* idctaltivec.c : Altivec IDCT module
*****************************************************************************
* Copyright (C) 1999, 2000 VideoLAN
* $Id: idctaltivec.c,v 1.14 2001/09/11 22:22:31 massiot Exp $
* $Id: idctaltivec.c,v 1.15 2001/09/25 11:46:14 massiot Exp $
*
* Authors: Christophe Massiot <massiot@via.ecp.fr>
*
......@@ -21,6 +21,8 @@
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
*****************************************************************************/
#ifndef __ALTIVEC__
#define MODULE_NAME idctaltivec
#include "modules_inner.h"
......@@ -31,6 +33,7 @@
#include <stdlib.h>
#include <string.h>
#include <inttypes.h>
#include "config.h"
#include "common.h"
......@@ -47,8 +50,6 @@
* Local prototypes.
*****************************************************************************/
static void idct_getfunctions( function_list_t * p_function_list );
void idct_block_copy_altivec( dctelem_t *, yuv_data_t *, int, void *, int );
void idct_block_add_altivec( dctelem_t *, yuv_data_t *, int, void *, int );
/*****************************************************************************
* Build configuration tree.
......@@ -93,16 +94,647 @@ static int idct_Probe( probedata_t *p_data )
}
/*****************************************************************************
* Placeholders for unused functions
* NormScan : This IDCT uses reordered coeffs, so we patch the scan table
*****************************************************************************/
static void NormScan( u8 ppi_scan[2][64] )
{
    /* Every entry of both scan tables is rewritten in place: bits 3-5 of
     * the index are kept, and the low three bits are rotated right by one
     * position (bits 2..1 move to 1..0, bit 0 moves to bit 2). */
    int i_table, i_coeff;

    for( i_table = 0; i_table < 2; i_table++ )
    {
        for( i_coeff = 0; i_coeff < 64; i_coeff++ )
        {
            const int i_pos  = ppi_scan[i_table][i_coeff];
            const int i_high = i_pos & 0x38;
            const int i_low  = ((i_pos & 6) >> 1) | ((i_pos & 1) << 2);

            ppi_scan[i_table][i_coeff] = i_high | i_low;
        }
    }
}
/*****************************************************************************
* Placeholders for unused functions
*****************************************************************************/
static void InitIDCT( void ** pp_idct_data )
{
    /* Intentionally empty placeholder: the pointer handed in by the caller
     * is neither read nor written. */
    (void)pp_idct_data;
}
/*****************************************************************************
* IDCT in Altivec
*****************************************************************************/
/* Coefficient table read by the inline asm of idct_block_copy_altivec and
 * idct_block_add_altivec, which address it by name (constants@ha /
 * constants@l) and by hard-coded byte offsets. Each row is 8 x int16_t =
 * 16 bytes and is fetched with a single lvx:
 *   - row 0 (offset 0) is loaded into one vector register whose lanes are
 *     then broadcast individually with vsplth / vspltw;
 *   - rows 1-4 (offsets 16, 32, 48, 64) are used as multiplier operands of
 *     vmhraddshs applied to the shifted input rows.
 * The row order and the 16-byte alignment request must therefore not be
 * changed without updating the asm. */
static int16_t constants[5][8] ATTR_ALIGN(16) = {
{23170, 13573, 6518, 21895, -23170, -21895, 32, 31},
{16384, 22725, 21407, 19266, 16384, 19266, 21407, 22725},
{22725, 31521, 29692, 26722, 22725, 26722, 29692, 31521},
{21407, 29692, 27969, 25172, 21407, 25172, 27969, 29692},
{19266, 26722, 25172, 22654, 19266, 22654, 25172, 26722}
};
/*
* The asm code is generated with:
*
* gcc-2.95 -fvec -D__ALTIVEC__ -O9 -fomit-frame-pointer -mregnames -S
* idct_altivec.c
*
* awk '{args=""; len=split ($2, arg, ",");
* for (i=1; i<=len; i++) { a=arg[i]; if (i<len) a=a",";
* args = args sprintf ("%-6s", a) }
* printf ("\t\"\t%-16s%-24s\\n\"\n", $1, args) }' idct_altivec.s |
* unexpand -a
*
* I then trim the compiler-generated function prologues/epilogues by hand
*/
/*****************************************************************************
 * idct_block_copy_altivec : AltiVec IDCT of one block, result stored to dest
 *****************************************************************************
 * block  : coefficients; eight 16-byte vectors are loaded with lvx from
 *          byte offsets 0..112, so 64 int16_t values are read.
 * dest   : output pixels; 8 bytes are written per row (two stvewx, at
 *          offsets 0 and 4), for 8 rows.
 * stride : added to the destination pointer after each of the first 7 rows.
 *
 * Outline of the asm: every input vector is shifted left by 4 and scaled
 * with rows 1-4 of constants[]; a first butterfly pass is run; the results
 * are interleaved with vmrghh/vmrglh (presumably an 8x8 transpose - TODO
 * confirm); a second butterfly pass is run, with the splatted word 3 of
 * constants[0] (halfwords 32, 31) added to one vector beforehand; results
 * are shifted right arithmetically by 6, packed to unsigned bytes with
 * saturation (vpkshus) and stored.
 *
 * NOTE(review): the asm statement has no operand or clobber lists. It names
 * %r3, %r4 and %r5 directly, i.e. it assumes block, dest and stride are
 * still in the registers they were passed in - confirm for the target ABI
 * and compiler flags. It also overwrites r8-r11, v0-v19 and v25-v31 without
 * telling the compiler.
 * NOTE(review): the lines starting with '#' inside the strings are the
 * compiler-generated prologue/epilogue (stack frame, _savev25/_restv25)
 * turned into assembler comments, so v25-v31 are not saved or restored
 * here - verify this is acceptable under the ABI in use.
 * NOTE(review): assumes block is 16-byte aligned (lvx) and that dest rows
 * are suitably aligned for stvewx - TODO confirm against callers.
 *****************************************************************************/
void idct_block_copy_altivec (int16_t * block, uint8_t * dest, int stride)
{
asm (" \n"
/* Disabled prologue: frame setup and save of v25-v31. */
"# stwu %r1, -128(%r1) \n"
"# mflr %r0 \n"
"# stw %r0, 196(%r1) \n"
"# addi %r0, %r1, 128 \n"
"# bl _savev25 \n"
/* Load the eight input vectors and the constant rows, prescale (<< 4),
 * then run the first butterfly pass. r10 = &constants. */
" addi %r9, %r3, 112 \n"
" vspltish %v25, 4 \n"
" vxor %v13, %v13, %v13 \n"
" lis %r10, constants@ha \n"
" lvx %v1, 0, %r9 \n"
" la %r10, constants@l(%r10) \n"
" lvx %v5, 0, %r3 \n"
" addi %r9, %r3, 16 \n"
" lvx %v8, 0, %r10 \n"
" addi %r11, %r10, 32 \n"
" lvx %v12, 0, %r9 \n"
" lvx %v6, 0, %r11 \n"
" addi %r8, %r3, 48 \n"
" vslh %v1, %v1, %v25 \n"
" addi %r9, %r3, 80 \n"
" lvx %v11, 0, %r8 \n"
" vslh %v5, %v5, %v25 \n"
" lvx %v0, 0, %r9 \n"
" addi %r11, %r10, 64 \n"
" vsplth %v3, %v8, 2 \n"
" lvx %v7, 0, %r11 \n"
" addi %r9, %r3, 96 \n"
" vslh %v12, %v12, %v25 \n"
" vmhraddshs %v27, %v1, %v6, %v13 \n"
" addi %r8, %r3, 32 \n"
" vsplth %v2, %v8, 5 \n"
" lvx %v1, 0, %r9 \n"
" vslh %v11, %v11, %v25 \n"
" addi %r3, %r3, 64 \n"
" lvx %v9, 0, %r8 \n"
" addi %r9, %r10, 48 \n"
" vslh %v0, %v0, %v25 \n"
" lvx %v4, 0, %r9 \n"
" vmhraddshs %v31, %v12, %v6, %v13 \n"
" addi %r10, %r10, 16 \n"
" vmhraddshs %v30, %v0, %v7, %v13 \n"
" lvx %v10, 0, %r3 \n"
" vsplth %v19, %v8, 3 \n"
" vmhraddshs %v15, %v11, %v7, %v13 \n"
" lvx %v12, 0, %r10 \n"
" vsplth %v6, %v8, 4 \n"
" vslh %v1, %v1, %v25 \n"
" vsplth %v11, %v8, 1 \n"
" li %r9, 4 \n"
" vslh %v9, %v9, %v25 \n"
" vsplth %v7, %v8, 0 \n"
" vmhraddshs %v18, %v1, %v4, %v13 \n"
" vspltw %v8, %v8, 3 \n"
" vsubshs %v0, %v13, %v27 \n"
" vmhraddshs %v1, %v9, %v4, %v13 \n"
" vmhraddshs %v17, %v3, %v31, %v0 \n"
" vmhraddshs %v4, %v2, %v15, %v30 \n"
" vslh %v10, %v10, %v25 \n"
" vmhraddshs %v9, %v5, %v12, %v13 \n"
" vspltish %v25, 6 \n"
" vmhraddshs %v5, %v10, %v12, %v13 \n"
" vmhraddshs %v28, %v19, %v30, %v15 \n"
" vmhraddshs %v27, %v3, %v27, %v31 \n"
" vsubshs %v0, %v13, %v18 \n"
" vmhraddshs %v18, %v11, %v18, %v1 \n"
" vaddshs %v30, %v17, %v4 \n"
" vmhraddshs %v12, %v11, %v1, %v0 \n"
" vsubshs %v4, %v17, %v4 \n"
" vaddshs %v10, %v9, %v5 \n"
" vsubshs %v17, %v27, %v28 \n"
" vaddshs %v27, %v27, %v28 \n"
" vsubshs %v1, %v9, %v5 \n"
" vaddshs %v28, %v10, %v18 \n"
" vsubshs %v18, %v10, %v18 \n"
" vaddshs %v10, %v1, %v12 \n"
" vsubshs %v1, %v1, %v12 \n"
" vsubshs %v12, %v17, %v4 \n"
" vaddshs %v4, %v17, %v4 \n"
" vmhraddshs %v5, %v7, %v12, %v1 \n"
" vmhraddshs %v26, %v6, %v4, %v10 \n"
" vmhraddshs %v29, %v6, %v12, %v1 \n"
" vmhraddshs %v14, %v7, %v4, %v10 \n"
" vsubshs %v12, %v18, %v30 \n"
" vaddshs %v9, %v28, %v27 \n"
" vaddshs %v16, %v18, %v30 \n"
" vsubshs %v10, %v28, %v27 \n"
/* Interleave the halfwords of the first-pass results (vmrghh/vmrglh);
 * the second butterfly pass is scheduled in between the last merges. */
" vmrglh %v31, %v9, %v12 \n"
" vmrglh %v30, %v5, %v26 \n"
" vmrglh %v15, %v14, %v29 \n"
" vmrghh %v5, %v5, %v26 \n"
" vmrglh %v27, %v16, %v10 \n"
" vmrghh %v9, %v9, %v12 \n"
" vmrghh %v18, %v16, %v10 \n"
" vmrghh %v1, %v14, %v29 \n"
" vmrglh %v14, %v9, %v5 \n"
" vmrglh %v16, %v31, %v30 \n"
" vmrglh %v10, %v15, %v27 \n"
" vmrghh %v9, %v9, %v5 \n"
" vmrghh %v26, %v15, %v27 \n"
" vmrglh %v27, %v16, %v10 \n"
" vmrghh %v12, %v1, %v18 \n"
" vmrglh %v29, %v1, %v18 \n"
" vsubshs %v0, %v13, %v27 \n"
" vmrghh %v5, %v31, %v30 \n"
" vmrglh %v31, %v9, %v12 \n"
" vmrglh %v30, %v5, %v26 \n"
" vmrglh %v15, %v14, %v29 \n"
" vmhraddshs %v17, %v3, %v31, %v0 \n"
" vmrghh %v18, %v16, %v10 \n"
" vmhraddshs %v27, %v3, %v27, %v31 \n"
" vmhraddshs %v4, %v2, %v15, %v30 \n"
" vmrghh %v1, %v14, %v29 \n"
" vmhraddshs %v28, %v19, %v30, %v15 \n"
" vmrghh %v0, %v9, %v12 \n"
" vsubshs %v13, %v13, %v18 \n"
" vmrghh %v5, %v5, %v26 \n"
" vmhraddshs %v18, %v11, %v18, %v1 \n"
" vaddshs %v9, %v0, %v8 \n"
" vaddshs %v30, %v17, %v4 \n"
" vmhraddshs %v12, %v11, %v1, %v13 \n"
" vsubshs %v4, %v17, %v4 \n"
" vaddshs %v10, %v9, %v5 \n"
" vsubshs %v17, %v27, %v28 \n"
" vaddshs %v27, %v27, %v28 \n"
" vsubshs %v1, %v9, %v5 \n"
" vaddshs %v28, %v10, %v18 \n"
" vsubshs %v18, %v10, %v18 \n"
" vaddshs %v10, %v1, %v12 \n"
" vsubshs %v1, %v1, %v12 \n"
" vsubshs %v12, %v17, %v4 \n"
" vaddshs %v4, %v17, %v4 \n"
/* Output: each row is shifted right by 6 (v25), packed to unsigned bytes
 * and written as two words at dest+0 and dest+4 (r9 = 4); r4 then advances
 * by the stride in r5. */
" vaddshs %v9, %v28, %v27 \n"
" vmhraddshs %v14, %v7, %v4, %v10 \n"
" vsrah %v9, %v9, %v25 \n"
" vmhraddshs %v5, %v7, %v12, %v1 \n"
" vpkshus %v0, %v9, %v9 \n"
" vmhraddshs %v29, %v6, %v12, %v1 \n"
" stvewx %v0, 0, %r4 \n"
" vaddshs %v16, %v18, %v30 \n"
" vsrah %v31, %v14, %v25 \n"
" stvewx %v0, %r9, %r4 \n"
" add %r4, %r4, %r5 \n"
" vsrah %v15, %v16, %v25 \n"
" vpkshus %v0, %v31, %v31 \n"
" vsrah %v1, %v5, %v25 \n"
" stvewx %v0, 0, %r4 \n"
" vsubshs %v12, %v18, %v30 \n"
" stvewx %v0, %r9, %r4 \n"
" vmhraddshs %v26, %v6, %v4, %v10 \n"
" vpkshus %v0, %v1, %v1 \n"
" add %r4, %r4, %r5 \n"
" vsrah %v5, %v12, %v25 \n"
" stvewx %v0, 0, %r4 \n"
" vsrah %v30, %v29, %v25 \n"
" stvewx %v0, %r9, %r4 \n"
" vsubshs %v10, %v28, %v27 \n"
" vpkshus %v0, %v15, %v15 \n"
" add %r4, %r4, %r5 \n"
" stvewx %v0, 0, %r4 \n"
" vsrah %v18, %v26, %v25 \n"
" stvewx %v0, %r9, %r4 \n"
" vsrah %v27, %v10, %v25 \n"
" vpkshus %v0, %v5, %v5 \n"
" add %r4, %r4, %r5 \n"
" stvewx %v0, 0, %r4 \n"
" stvewx %v0, %r9, %r4 \n"
" vpkshus %v0, %v30, %v30 \n"
" add %r4, %r4, %r5 \n"
" stvewx %v0, 0, %r4 \n"
" stvewx %v0, %r9, %r4 \n"
" vpkshus %v0, %v18, %v18 \n"
" add %r4, %r4, %r5 \n"
" stvewx %v0, 0, %r4 \n"
" stvewx %v0, %r9, %r4 \n"
" add %r4, %r4, %r5 \n"
" vpkshus %v0, %v27, %v27 \n"
" stvewx %v0, 0, %r4 \n"
" stvewx %v0, %r9, %r4 \n"
/* Disabled epilogue: restore of v25-v31 and frame teardown. */
"# addi %r0, %r1, 128 \n"
"# bl _restv25 \n"
"# lwz %r0, 132(%r1) \n"
"# mtlr %r0 \n"
"# la %r1, 128(%r1) \n"
);
}
void idct_block_add_altivec (int16_t * block, uint8_t * dest, int stride)
{
asm (" \n"
"# stwu %r1, -192(%r1) \n"
"# mflr %r0 \n"
"# stw %r0, 196(%r1) \n"
"# addi %r0, %r1, 192 \n"
"# bl _savev21 \n"
" addi %r9, %r3, 112 \n"
" vspltish %v21, 4 \n"
" vxor %v1, %v1, %v1 \n"
" lvx %v13, 0, %r9 \n"
" lis %r10, constants@ha \n"
" vspltisw %v3, -1 \n"
" la %r10, constants@l(%r10) \n"
" lvx %v5, 0, %r3 \n"
" addi %r9, %r3, 16 \n"
" lvx %v8, 0, %r10 \n"
" lvx %v12, 0, %r9 \n"
" addi %r11, %r10, 32 \n"
" lvx %v6, 0, %r11 \n"
" addi %r8, %r3, 48 \n"
" vslh %v13, %v13, %v21 \n"
" addi %r9, %r3, 80 \n"
" lvx %v11, 0, %r8 \n"
" vslh %v5, %v5, %v21 \n"
" lvx %v0, 0, %r9 \n"
" addi %r11, %r10, 64 \n"
" vsplth %v2, %v8, 2 \n"
" lvx %v7, 0, %r11 \n"
" vslh %v12, %v12, %v21 \n"
" addi %r9, %r3, 96 \n"
" vmhraddshs %v24, %v13, %v6, %v1 \n"
" addi %r8, %r3, 32 \n"
" vsplth %v17, %v8, 5 \n"
" lvx %v13, 0, %r9 \n"
" vslh %v11, %v11, %v21 \n"
" addi %r3, %r3, 64 \n"
" lvx %v10, 0, %r8 \n"
" vslh %v0, %v0, %v21 \n"
" addi %r9, %r10, 48 \n"
" vmhraddshs %v31, %v12, %v6, %v1 \n"
" lvx %v4, 0, %r9 \n"
" addi %r10, %r10, 16 \n"
" vmhraddshs %v26, %v0, %v7, %v1 \n"
" lvx %v9, 0, %r3 \n"
" vsplth %v16, %v8, 3 \n"
" vmhraddshs %v22, %v11, %v7, %v1 \n"
" lvx %v6, 0, %r10 \n"
" lvsl %v19, 0, %r4 \n"
" vsubshs %v12, %v1, %v24 \n"
" lvsl %v0, %r5, %r4 \n"
" vsplth %v11, %v8, 1 \n"
" vslh %v10, %v10, %v21 \n"
" vmrghb %v19, %v3, %v19 \n"
" lvx %v15, 0, %r4 \n"
" vslh %v13, %v13, %v21 \n"
" vmrghb %v3, %v3, %v0 \n"
" li %r9, 4 \n"
" vmhraddshs %v14, %v2, %v31, %v12 \n"
" vsplth %v7, %v8, 0 \n"
" vmhraddshs %v23, %v13, %v4, %v1 \n"
" vsplth %v18, %v8, 4 \n"
" vmhraddshs %v27, %v10, %v4, %v1 \n"
" vspltw %v8, %v8, 3 \n"
" vmhraddshs %v12, %v17, %v22, %v26 \n"
" vperm %v15, %v15, %v1, %v19 \n"
" vslh %v9, %v9, %v21 \n"
" vmhraddshs %v10, %v5, %v6, %v1 \n"
" vspltish %v21, 6 \n"
" vmhraddshs %v30, %v9, %v6, %v1 \n"
" vmhraddshs %v26, %v16, %v26, %v22 \n"
" vmhraddshs %v24, %v2, %v24, %v31 \n"
" vmhraddshs %v31, %v11, %v23, %v27 \n"
" vsubshs %v0, %v1, %v23 \n"
" vaddshs %v23, %v14, %v12 \n"
" vmhraddshs %v9, %v11, %v27, %v0 \n"
" vsubshs %v12, %v14, %v12 \n"
" vaddshs %v6, %v10, %v30 \n"
" vsubshs %v14, %v24, %v26 \n"
" vaddshs %v24, %v24, %v26 \n"
" vsubshs %v13, %v10, %v30 \n"
" vaddshs %v26, %v6, %v31 \n"
" vsubshs %v31, %v6, %v31 \n"
" vaddshs %v6, %v13, %v9 \n"
" vsubshs %v13, %v13, %v9 \n"
" vsubshs %v9, %v14, %v12 \n"
" vaddshs %v12, %v14, %v12 \n"
" vmhraddshs %v30, %v7, %v9, %v13 \n"
" vmhraddshs %v25, %v18, %v12, %v6 \n"
" vmhraddshs %v28, %v18, %v9, %v13 \n"
" vmhraddshs %v29, %v7, %v12, %v6 \n"
" vaddshs %v10, %v26, %v24 \n"
" vsubshs %v5, %v31, %v23 \n"
" vsubshs %v13, %v26, %v24 \n"
" vaddshs %v4, %v31, %v23 \n"
" vmrglh %v26, %v30, %v25 \n"
" vmrglh %v31, %v10, %v5 \n"
" vmrglh %v22, %v29, %v28 \n"
" vmrghh %v30, %v30, %v25 \n"
" vmrglh %v24, %v4, %v13 \n"
" vmrghh %v10, %v10, %v5 \n"
" vmrghh %v23, %v4, %v13 \n"
" vmrghh %v27, %v29, %v28 \n"
" vmrglh %v29, %v10, %v30 \n"
" vmrglh %v4, %v31, %v26 \n"
" vmrglh %v13, %v22, %v24 \n"
" vmrghh %v10, %v10, %v30 \n"
" vmrghh %v25, %v22, %v24 \n"
" vmrglh %v24, %v4, %v13 \n"
" vmrghh %v5, %v27, %v23 \n"
" vmrglh %v28, %v27, %v23 \n"
" vsubshs %v0, %v1, %v24 \n"
" vmrghh %v30, %v31, %v26 \n"
" vmrglh %v31, %v10, %v5 \n"
" vmrglh %v26, %v30, %v25 \n"
" vmrglh %v22, %v29, %v28 \n"
" vmhraddshs %v14, %v2, %v31, %v0 \n"
" vmrghh %v23, %v4, %v13 \n"
" vmhraddshs %v24, %v2, %v24, %v31 \n"
" vmhraddshs %v12, %v17, %v22, %v26 \n"
" vmrghh %v27, %v29, %v28 \n"
" vmhraddshs %v26, %v16, %v26, %v22 \n"
" vmrghh %v0, %v10, %v5 \n"
" vmhraddshs %v31, %v11, %v23, %v27 \n"
" vmrghh %v30, %v30, %v25 \n"
" vsubshs %v13, %v1, %v23 \n"
" vaddshs %v10, %v0, %v8 \n"
" vaddshs %v23, %v14, %v12 \n"
" vsubshs %v12, %v14, %v12 \n"
" vaddshs %v6, %v10, %v30 \n"
" vsubshs %v14, %v24, %v26 \n"
" vmhraddshs %v9, %v11, %v27, %v13 \n"
" vaddshs %v24, %v24, %v26 \n"
" vaddshs %v26, %v6, %v31 \n"
" vsubshs %v13, %v10, %v30 \n"
" vaddshs %v10, %v26, %v24 \n"
" vsubshs %v31, %v6, %v31 \n"
" vaddshs %v6, %v13, %v9 \n"
" vsrah %v10, %v10, %v21 \n"
" vsubshs %v13, %v13, %v9 \n"
" vaddshs %v0, %v15, %v10 \n"
" vsubshs %v9, %v14, %v12 \n"
" vaddshs %v12, %v14, %v12 \n"
" vpkshus %v15, %v0, %v0 \n"
" stvewx %v15, 0, %r4 \n"
" vaddshs %v4, %v31, %v23 \n"
" vmhraddshs %v29, %v7, %v12, %v6 \n"
" stvewx %v15, %r9, %r4 \n"
" add %r4, %r4, %r5 \n"
" vsubshs %v5, %v31, %v23 \n"
" lvx %v15, 0, %r4 \n"
" vmhraddshs %v30, %v7, %v9, %v13 \n"
" vsrah %v22, %v4, %v21 \n"
" vperm %v15, %v15, %v1, %v3 \n"
" vmhraddshs %v28, %v18, %v9, %v13 \n"
" vsrah %v31, %v29, %v21 \n"
" vsubshs %v13, %v26, %v24 \n"
" vaddshs %v0, %v15, %v31 \n"
" vsrah %v27, %v30, %v21 \n"
" vpkshus %v15, %v0, %v0 \n"
" vsrah %v30, %v5, %v21 \n"
" stvewx %v15, 0, %r4 \n"
" vsrah %v26, %v28, %v21 \n"
" stvewx %v15, %r9, %r4 \n"
" vmhraddshs %v25, %v18, %v12, %v6 \n"
" add %r4, %r4, %r5 \n"
" vsrah %v24, %v13, %v21 \n"
" lvx %v15, 0, %r4 \n"
" vperm %v15, %v15, %v1, %v19 \n"
" vsrah %v23, %v25, %v21 \n"
" vaddshs %v0, %v15, %v27 \n"
" vpkshus %v15, %v0, %v0 \n"
" stvewx %v15, 0, %r4 \n"
" stvewx %v15, %r9, %r4 \n"
" add %r4, %r4, %r5 \n"
" lvx %v15, 0, %r4 \n"
" vperm %v15, %v15, %v1, %v3 \n"
" vaddshs %v0, %v15, %v22 \n"
" vpkshus %v15, %v0, %v0 \n"
" stvewx %v15, 0, %r4 \n"
" stvewx %v15, %r9, %r4 \n"
" add %r4, %r4, %r5 \n"
" lvx %v15, 0, %r4 \n"
" vperm %v15, %v15, %v1, %v19 \n"
" vaddshs %v0, %v15, %v30 \n"
" vpkshus %v15, %v0, %v0 \n"
" stvewx %v15, 0, %r4 \n"
" stvewx %v15, %r9, %r4 \n"
" add %r4, %r4, %r5 \n"
" lvx %v15, 0, %r4 \n"
" vperm %v15, %v15, %v1, %v3 \n"
" vaddshs %v0, %v15, %v26 \n"
" vpkshus %v15, %v0, %v0 \n"
" stvewx %v15, 0, %r4 \n"
" stvewx %v15, %r9, %r4 \n"
" add %r4, %r4, %r5 \n"
" lvx %v15, 0, %r4 \n"
" vperm %v15, %v15, %v1, %v19 \n"
" vaddshs %v0, %v15, %v23 \n"
" vpkshus %v15, %v0, %v0 \n"
" stvewx %v15, 0, %r4 \n"
" stvewx %v15, %r9, %r4 \n"
" add %r4, %r4, %r5 \n"
" lvx %v15, 0, %r4 \n"
" vperm %v15, %v15, %v1, %v3 \n"