Commit 2ac5fe04 authored by Loren Merritt

cosmetics: remove #if0'ed code

patch by Robert Swain.



git-svn-id: svn://svn.videolan.org/x264/trunk@332 df754926-b1dd-0310-bc7b-ec298dee348c
parent 1647e6d6
@@ -97,48 +97,6 @@ static inline uint32_t bs_read( bs_t *s, int i_count )
return( i_result );
}
#if 0
/* Only > i386 */
static uint32_t bswap32( uint32_t x )
{
asm( "bswap %0": "=r" (x):"0" (x));
return x;
}
/* work only for i_count <= 32 - 7 */
static inline uint32_t bs_read( bs_t *s, int i_count )
{
if( s->p < s->p_end && i_count > 0 )
{
#if 0
uint32_t i_cache = ((s->p[0] << 24)+(s->p[1] << 16)+(s->p[2] << 8)+s->p[3]) << (8-s->i_left);
#else
uint32_t i_cache = bswap32( *((uint32_t*)s->p) ) << (8-s->i_left);
#endif
uint32_t i_ret = i_cache >> ( 32 - i_count);
s->i_left -= i_count;
#if 0
if( s->i_left <= 0 )
{
int i_skip = (8-s->i_left) >> 3;
s->p += i_skip;
s->i_left += i_skip << 3;
}
#else
while( s->i_left <= 0 )
{
s->p++;
s->i_left += 8;
}
#endif
return i_ret;
}
return 0;
}
#endif
static inline uint32_t bs_read1( bs_t *s )
{
@@ -160,17 +118,12 @@ static inline uint32_t bs_read1( bs_t *s )
}
static inline uint32_t bs_show( bs_t *s, int i_count )
{
#if 0
bs_t s_tmp = *s;
return bs_read( &s_tmp, i_count );
#else
if( s->p < s->p_end && i_count > 0 )
{
uint32_t i_cache = ((s->p[0] << 24)+(s->p[1] << 16)+(s->p[2] << 8)+s->p[3]) << (8-s->i_left);
return( i_cache >> ( 32 - i_count) );
}
return 0;
#endif
}
/* TODO optimize */
@@ -106,99 +106,6 @@ void x264_cpu_restore( uint32_t cpu )
}
}
#if 0
/*
* XXX: adapted from libmpeg2 */
#if 0
#define cpuid(op,eax,ebx,ecx,edx) \
__asm__ ("push %%ebx\n\t" \
"cpuid\n\t" \
"movl %%ebx,%1\n\t" \
"pop %%ebx" \
: "=a" (eax), \
"=r" (ebx), \
"=c" (ecx), \
"=d" (edx) \
: "a" (op) \
: "cc")
#endif
uint32_t x264_cpu_detect( void )
{
uint32_t cpu = 0;
uint32_t eax, ebx, ecx, edx;
int b_amd;
/* Test if cpuid is supported */
asm volatile(
"pushf\n"
"pushf\n"
"pop %0\n"
"movl %0,%1\n"
"xorl $0x200000,%0\n"
"push %0\n"
"popf\n"
"pushf\n"
"pop %0\n"
"popf\n"
: "=r" (eax), "=r" (ebx) : : "cc");
if( eax == ebx )
{
/* No cpuid */
return 0;
}
cpuid( 0, eax, ebx, ecx, edx);
if( eax == 0 )
{
return 0;
}
b_amd = (ebx == 0x68747541) && (ecx == 0x444d4163) && (edx == 0x69746e65);
cpuid( 1, eax, ebx, ecx, edx );
if( (edx&0x00800000) == 0 )
{
/* No MMX */
return 0;
}
cpu = X264_CPU_MMX;
if( (edx&0x02000000) )
{
/* SSE - identical to AMD MMX extensions */
cpu |= X264_CPU_MMXEXT|X264_CPU_SSE;
}
if( (edx&0x04000000) )
{
/* Is it OK ? */
cpu |= X264_CPU_SSE2;
}
cpuid( 0x80000000, eax, ebx, ecx, edx );
if( eax < 0x80000001 )
{
/* no extended capabilities */
return cpu;
}
cpuid( 0x80000001, eax, ebx, ecx, edx );
if( edx&0x80000000 )
{
cpu |= X264_CPU_3DNOW;
}
if( b_amd && (edx&0x00400000) )
{
/* AMD MMX extensions */
cpu |= X264_CPU_MMXEXT;
}
return cpu;
}
#endif
#elif defined( ARCH_PPC )
#ifdef SYS_MACOSX
@@ -244,4 +151,3 @@ void x264_cpu_restore( uint32_t cpu )
}
#endif
@@ -35,196 +35,6 @@
#include "dct.h"
#if 0
#define MMX_ZERO( MMZ ) \
asm volatile( "pxor " #MMZ ", " #MMZ "\n" :: )
/* MMP : diff, MMT: temp */
#define MMX_LOAD_DIFF_4P( MMP, MMT, MMZ, pix1, pix2 ) \
asm volatile( "movd (%0), " #MMP "\n" \
"punpcklbw " #MMZ ", " #MMP "\n" \
"movd (%1), " #MMT "\n" \
"punpcklbw " #MMZ ", " #MMT "\n" \
"psubw " #MMT ", " #MMP "\n" : : "r"(pix1), "r"(pix2) )
/* in: out: mma=mma+mmb, mmb=mmb-mma */
#define MMX_SUMSUB_BA( MMA, MMB ) \
asm volatile( "paddw " #MMB ", " #MMA "\n"\
"paddw " #MMB ", " #MMB "\n"\
"psubw " #MMA ", " #MMB "\n" :: )
#define MMX_SUMSUB_BADC( MMA, MMB, MMC, MMD ) \
asm volatile( "paddw " #MMB ", " #MMA "\n"\
"paddw " #MMD ", " #MMC "\n"\
"paddw " #MMB ", " #MMB "\n"\
"paddw " #MMD ", " #MMD "\n"\
"psubw " #MMA ", " #MMB "\n"\
"psubw " #MMC ", " #MMD "\n" :: )
/* inputs MMA, MMB output MMA MMT */
#define MMX_SUMSUB2_AB( MMA, MMB, MMT ) \
asm volatile( "movq " #MMA ", " #MMT "\n" \
"paddw " #MMA ", " #MMA "\n" \
"paddw " #MMB ", " #MMA "\n" \
"psubw " #MMB ", " #MMT "\n" \
"psubw " #MMB ", " #MMT "\n" :: )
/* inputs MMA, MMB output MMA MMS */
#define MMX_SUMSUBD2_AB( MMA, MMB, MMT, MMS ) \
asm volatile( "movq " #MMA ", " #MMS "\n" \
"movq " #MMB ", " #MMT "\n" \
"psraw $1 , " #MMB "\n" \
"psraw $1 , " #MMS "\n" \
"paddw " #MMB ", " #MMA "\n" \
"psubw " #MMT ", " #MMS "\n" :: )
#define SBUTTERFLYwd(a,b,t )\
asm volatile( "movq " #a ", " #t " \n\t" \
"punpcklwd " #b ", " #a " \n\t" \
"punpckhwd " #b ", " #t " \n\t" :: )
#define SBUTTERFLYdq(a,b,t )\
asm volatile( "movq " #a ", " #t " \n\t" \
"punpckldq " #b ", " #a " \n\t" \
"punpckhdq " #b ", " #t " \n\t" :: )
/* input ABCD output ADTC */
#define MMX_TRANSPOSE( MMA, MMB, MMC, MMD, MMT ) \
SBUTTERFLYwd( MMA, MMB, MMT ); \
SBUTTERFLYwd( MMC, MMD, MMB ); \
SBUTTERFLYdq( MMA, MMC, MMD ); \
SBUTTERFLYdq( MMT, MMB, MMC )
#define MMX_STORE_DIFF_4P( MMP, MMT, MM32, MMZ, dst ) \
asm volatile( "paddw " #MM32 "," #MMP "\n" \
"psraw $6, " #MMP "\n" \
"movd (%0), " #MMT "\n" \
"punpcklbw " #MMZ ", " #MMT "\n" \
"paddsw " #MMT ", " #MMP "\n" \
"packuswb " #MMZ ", " #MMP "\n" \
"movd " #MMP ", (%0)\n" :: "r"(dst) )
#define UNUSED_LONGLONG( foo ) \
static const unsigned long long foo __asm__ (#foo) __attribute__((unused)) __attribute__((aligned(16)))
UNUSED_LONGLONG( x264_mmx_32 ) = 0x0020002000200020ULL;
UNUSED_LONGLONG( x264_mmx_1 ) = 0x0001000100010001ULL;
/*
* XXX For all dct dc : input could be equal to output so ...
*/
void x264_dct4x4dc_mmxext( int16_t d[4][4] )
{
/* load DCT */
asm volatile(
"movq (%0), %%mm0\n"
"movq 8(%0), %%mm1\n"
"movq 16(%0), %%mm2\n"
"movq 24(%0), %%mm3\n" :: "r"(d) );
MMX_SUMSUB_BADC( %%mm1, %%mm0, %%mm3, %%mm2 ); /* mm1=s01 mm0=d01 mm3=s23 mm2=d23 */
MMX_SUMSUB_BADC( %%mm3, %%mm1, %%mm2, %%mm0 ); /* mm3=s01+s23 mm1=s01-s23 mm2=d01+d23 mm0=d01-d23 */
/* in: mm3, mm1, mm0, mm2 out: mm3, mm2, mm4, mm0 */
MMX_TRANSPOSE ( %%mm3, %%mm1, %%mm0, %%mm2, %%mm4 );
MMX_SUMSUB_BADC( %%mm2, %%mm3, %%mm0, %%mm4 ); /* mm2=s01 mm3=d01 mm0=s23 mm4=d23 */
MMX_SUMSUB_BADC( %%mm0, %%mm2, %%mm4, %%mm3 ); /* mm0=s01+s23 mm2=s01-s23 mm4=d01+d23 mm3=d01-d23 */
/* in: mm0, mm2, mm3, mm4 out: mm0, mm4, mm1, mm3 */
MMX_TRANSPOSE ( %%mm0, %%mm2, %%mm3, %%mm4, %%mm1 );
asm volatile( "movq x264_mmx_1, %%mm6" :: );
/* Store back */
asm volatile(
"paddw %%mm6, %%mm0\n"
"paddw %%mm6, %%mm4\n"
"psraw $1, %%mm0\n"
"movq %%mm0, (%0)\n"
"psraw $1, %%mm4\n"
"movq %%mm4, 8(%0)\n"
"paddw %%mm6, %%mm1\n"
"paddw %%mm6, %%mm3\n"
"psraw $1, %%mm1\n"
"movq %%mm1, 16(%0)\n"
"psraw $1, %%mm3\n"
"movq %%mm3, 24(%0)\n" :: "r"(d) );
}
void x264_idct4x4dc_mmxext( int16_t d[4][4] )
{
/* load DCT */
asm volatile(
"movq (%0), %%mm0\n"
"movq 8(%0), %%mm1\n"
"movq 16(%0), %%mm2\n"
"movq 24(%0), %%mm3\n" :: "r"(d) );
MMX_SUMSUB_BADC( %%mm1, %%mm0, %%mm3, %%mm2 ); /* mm1=s01 mm0=d01 mm3=s23 mm2=d23 */
MMX_SUMSUB_BADC( %%mm3, %%mm1, %%mm2, %%mm0 ); /* mm3=s01+s23 mm1=s01-s23 mm2=d01+d23 mm0=d01-d23 */
/* in: mm3, mm1, mm0, mm2 out: mm3, mm2, mm4, mm0 */
MMX_TRANSPOSE( %%mm3, %%mm1, %%mm0, %%mm2, %%mm4 );
MMX_SUMSUB_BADC( %%mm2, %%mm3, %%mm0, %%mm4 ); /* mm2=s01 mm3=d01 mm0=s23 mm4=d23 */
MMX_SUMSUB_BADC( %%mm0, %%mm2, %%mm4, %%mm3 ); /* mm0=s01+s23 mm2=s01-s23 mm4=d01+d23 mm3=d01-d23 */
/* in: mm0, mm2, mm3, mm4 out: mm0, mm4, mm1, mm3 */
MMX_TRANSPOSE( %%mm0, %%mm2, %%mm3, %%mm4, %%mm1 );
/* Store back */
asm volatile(
"movq %%mm0, (%0)\n"
"movq %%mm4, 8(%0)\n"
"movq %%mm1, 16(%0)\n"
"movq %%mm3, 24(%0)\n" :: "r"(d) );
}
/****************************************************************************
* subXxX_dct:
****************************************************************************/
inline void x264_sub4x4_dct_mmxext( int16_t dct[4][4], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
{
/* Reset mm7 */
MMX_ZERO( %%mm7 );
/* Load 4 lines */
MMX_LOAD_DIFF_4P( %%mm0, %%mm6, %%mm7, &pix1[0*i_pix1], &pix2[0*i_pix2] );
MMX_LOAD_DIFF_4P( %%mm1, %%mm6, %%mm7, &pix1[1*i_pix1], &pix2[1*i_pix2] );
MMX_LOAD_DIFF_4P( %%mm2, %%mm6, %%mm7, &pix1[2*i_pix1], &pix2[2*i_pix2] );
MMX_LOAD_DIFF_4P( %%mm3, %%mm6, %%mm7, &pix1[3*i_pix1], &pix2[3*i_pix2] );
MMX_SUMSUB_BADC( %%mm3, %%mm0, %%mm2, %%mm1 ); /* mm3=s03 mm0=d03 mm2=s12 mm1=d12 */
MMX_SUMSUB_BA( %%mm2, %%mm3 ); /* mm2=s03+s12 mm3=s03-s12 */
MMX_SUMSUB2_AB( %%mm0, %%mm1, %%mm4 ); /* mm0=2.d03+d12 mm4=d03-2.d12 */
/* transpose in: mm2, mm0, mm3, mm4, out: mm2, mm4, mm1, mm3 */
MMX_TRANSPOSE( %%mm2, %%mm0, %%mm3, %%mm4, %%mm1 );
MMX_SUMSUB_BADC( %%mm3, %%mm2, %%mm1, %%mm4 ); /* mm3=s03 mm2=d03 mm1=s12 mm4=d12 */
MMX_SUMSUB_BA( %%mm1, %%mm3 ); /* mm1=s03+s12 mm3=s03-s12 */
MMX_SUMSUB2_AB( %%mm2, %%mm4, %%mm0 ); /* mm2=2.d03+d12 mm0=d03-2.d12 */
/* transpose in: mm1, mm2, mm3, mm0, out: mm1, mm0, mm4, mm3 */
MMX_TRANSPOSE( %%mm1, %%mm2, %%mm3, %%mm0, %%mm4 );
/* Store back */
asm volatile(
"movq %%mm1, (%0)\n"
"movq %%mm0, 8(%0)\n"
"movq %%mm4, 16(%0)\n"
"movq %%mm3, 24(%0)\n" :: "r"(dct) );
}
#endif
void x264_sub8x8_dct_mmxext( int16_t dct[4][4][4], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
{
x264_sub4x4_dct_mmxext( dct[0], &pix1[0], i_pix1, &pix2[0], i_pix2 );
@@ -246,41 +56,6 @@ void x264_sub16x16_dct_mmxext( int16_t dct[16][4][4], uint8_t *pix1, int i_pix1,
/****************************************************************************
* addXxX_idct:
****************************************************************************/
#if 0
inline void x264_add4x4_idct_mmxext( uint8_t *p_dst, int i_dst, int16_t dct[4][4] )
{
/* Load dct coeffs */
asm volatile(
"movq (%0), %%mm0\n"
"movq 8(%0), %%mm1\n"
"movq 16(%0), %%mm2\n"
"movq 24(%0), %%mm3\n" :: "r"(dct) );
MMX_SUMSUB_BA ( %%mm2, %%mm0 ); /* mm2=s02 mm0=d02 */
MMX_SUMSUBD2_AB( %%mm1, %%mm3, %%mm5, %%mm4 ); /* mm1=s13 mm4=d13 ( well 1 + 3>>1 and 1>>1 + 3) */
MMX_SUMSUB_BADC( %%mm1, %%mm2, %%mm4, %%mm0 ); /* mm1=s02+s13 mm2=s02-s13 mm4=d02+d13 mm0=d02-d13 */
/* in: mm1, mm4, mm0, mm2 out: mm1, mm2, mm3, mm0 */
MMX_TRANSPOSE ( %%mm1, %%mm4, %%mm0, %%mm2, %%mm3 );
MMX_SUMSUB_BA ( %%mm3, %%mm1 ); /* mm3=s02 mm1=d02 */
MMX_SUMSUBD2_AB( %%mm2, %%mm0, %%mm5, %%mm4 ); /* mm2=s13 mm4=d13 ( well 1 + 3>>1 and 1>>1 + 3) */
MMX_SUMSUB_BADC( %%mm2, %%mm3, %%mm4, %%mm1 ); /* mm2=s02+s13 mm3=s02-s13 mm4=d02+d13 mm1=d02-d13 */
/* in: mm2, mm4, mm1, mm3 out: mm2, mm3, mm0, mm1 */
MMX_TRANSPOSE ( %%mm2, %%mm4, %%mm1, %%mm3, %%mm0 );
MMX_ZERO( %%mm7 );
asm volatile( "movq x264_mmx_32, %%mm6\n" :: );
MMX_STORE_DIFF_4P( %%mm2, %%mm4, %%mm6, %%mm7, &p_dst[0*i_dst] );
MMX_STORE_DIFF_4P( %%mm3, %%mm4, %%mm6, %%mm7, &p_dst[1*i_dst] );
MMX_STORE_DIFF_4P( %%mm0, %%mm4, %%mm6, %%mm7, &p_dst[2*i_dst] );
MMX_STORE_DIFF_4P( %%mm1, %%mm4, %%mm6, %%mm7, &p_dst[3*i_dst] );
}
#endif
void x264_add8x8_idct_mmxext( uint8_t *p_dst, int i_dst, int16_t dct[4][4][4] )
{
@@ -75,983 +75,6 @@ AVG_WEIGHT(8,16)
AVG_WEIGHT(8,8)
AVG_WEIGHT(8,4)
#if 0
#if __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 3)
#define USED_UINT64(foo) \
static const uint64_t foo __asm__ (#foo) __attribute__((used))
#else
#define USED_UINT64(foo) \
static const uint64_t foo __asm__ (#foo) __attribute__((unused))
#endif
USED_UINT64( x264_w0x10 ) = 0x0010001000100010ULL;
#define MMX_ZERO( MMZ ) \
asm volatile( "pxor " #MMZ ", " #MMZ "\n" :: )
#define MMX_INIT( MMV, NAME ) \
asm volatile( "movq " #NAME ", " #MMV "\n" :: )
#define MMX_SAVE_4P( MMP, MMZ, dst ) \
asm volatile( "packuswb " #MMZ "," #MMP "\n" \
"movd " #MMP ", (%0)" :: "r"(dst) )
#define MMX_LOAD_4P( MMP, MMZ, pix ) \
asm volatile( "movd (%0), " #MMP "\n" \
"punpcklbw " #MMZ ", " #MMP "\n" : : "r"(pix) )
#define MMX_LOAD_4x4( MMP1, MMP2, MMP3, MMP4, MMZ, pix, i_pix )\
MMX_LOAD_4P( MMP1, MMZ, &(pix)[0*(i_pix)] ); \
MMX_LOAD_4P( MMP2, MMZ, &(pix)[1*(i_pix)] ); \
MMX_LOAD_4P( MMP3, MMZ, &(pix)[2*(i_pix)] ); \
MMX_LOAD_4P( MMP4, MMZ, &(pix)[3*(i_pix)] )
#define MMX_LOAD_2x4( MMP1, MMP2, MMZ, pix, i_pix )\
MMX_LOAD_4P( MMP1, MMZ, &(pix)[0*(i_pix)] ); \
MMX_LOAD_4P( MMP2, MMZ, &(pix)[1*(i_pix)] )
#define MMX_SAVEPACK_8P( MMP1, MMP2, MMZ, dst ) \
asm volatile( "packuswb " #MMP2 "," #MMP1 "\n" \
"movq " #MMP1 ", (%0)\n" :: "r"(dst) )
#define MMX_LOAD_8P( MMP1, MMP2, MMZ, pix ) \
asm volatile( "movq (%0) , " #MMP1 "\n" \
"movq " #MMP1 ", " #MMP2 "\n" \
"punpcklbw " #MMZ ", " #MMP1 "\n" \
"punpckhbw " #MMZ ", " #MMP2 "\n" : : "r"(pix) )
#define MMX_LOAD_2x8( MMP1, MMP2, MMP3, MMP4, MMZ, pix, i_pix )\
MMX_LOAD_8P( MMP1, MMP2, MMZ, &(pix)[0*(i_pix)] ); \
MMX_LOAD_8P( MMP3, MMP4, MMZ, &(pix)[1*(i_pix)] )
#define SBUTTERFLYwd(a,b,t )\
asm volatile( "movq " #a ", " #t " \n\t" \
"punpcklwd " #b ", " #a " \n\t" \
"punpckhwd " #b ", " #t " \n\t" :: )
#define SBUTTERFLYdq(a,b,t )\
asm volatile( "movq " #a ", " #t " \n\t" \
"punpckldq " #b ", " #a " \n\t" \
"punpckhdq " #b ", " #t " \n\t" :: )
/* input ABCD output ADTC ( or 0?31-2->0123 ) */
#define MMX_TRANSPOSE( MMA, MMB, MMC, MMD, MMT ) \
SBUTTERFLYwd( MMA, MMB, MMT ); \
SBUTTERFLYwd( MMC, MMD, MMB ); \
SBUTTERFLYdq( MMA, MMC, MMD ); \
SBUTTERFLYdq( MMT, MMB, MMC )
/* first pass MM0 = MM0 -5*MM1 */
#define MMX_FILTERTAP_P1( MMP0, MMP1 ) \
asm volatile( "psubw " #MMP1 "," #MMP0 "\n" \
"psllw $2, " #MMP1 "\n" \
"psubw " #MMP1 "," #MMP0 "\n" :: )
\
/* second pass MM0 = MM0 + 20*(MM2+MM3) */
#define MMX_FILTERTAP_P2( MMP0, MMP2, MMP3 ) \
asm volatile( "paddw " #MMP3 "," #MMP2 "\n" \
\
"psllw $2, " #MMP2 "\n" \
"paddw " #MMP2 "," #MMP0 "\n" \
"psllw $2, " #MMP2 "\n" \
"paddw " #MMP2 "," #MMP0 "\n" :: )
/* last pass: MM0 = ( MM0 -5*MM1 + MM2 + MMV ) >> 5 */
#define MMX_FILTERTAP_P3( MMP0, MMP1, MMP2, MMV, MMZ ) \
asm volatile( "psubw " #MMP1 "," #MMP0 "\n" \
"psllw $2, " #MMP1 "\n" \
"psubw " #MMP1 "," #MMP0 "\n" \
\
"paddw " #MMP2 "," #MMP0 "\n" \
"paddw " #MMV "," #MMP0 "\n" \
"psraw $5, " #MMP0 "\n" :: )
#define MMX_FILTERTAP2_P1( MMP0, MMP1, MMP2, MMP3 ) \
asm volatile( "psubw " #MMP1 "," #MMP0 "\n" \
"psubw " #MMP3 "," #MMP2 "\n" \
"psllw $2, " #MMP1 "\n" \
"psllw $2, " #MMP3 "\n" \
"psubw " #MMP1 "," #MMP0 "\n" \
"psubw " #MMP3 "," #MMP2 "\n" :: )
/* second pass MM0 = MM0 + 20*(MM1+MM2) */
#define MMX_FILTERTAP2_P2( MMP0, MMP1, MMP2, MMP3, MMP4, MMP5 ) \
asm volatile( "paddw " #MMP2 "," #MMP1 "\n" \
"paddw " #MMP5 "," #MMP4 "\n" \
\
"psllw $2, " #MMP1 "\n" \
"psllw $2, " #MMP4 "\n" \
"paddw " #MMP1 "," #MMP0 "\n" \
"paddw " #MMP4 "," #MMP3 "\n" \
"psllw $2, " #MMP1 "\n" \
"psllw $2, " #MMP4 "\n" \
"paddw " #MMP1 "," #MMP0 "\n" \
"paddw " #MMP4 "," #MMP3 "\n" :: )
#define MMX_LOAD_1r( m1, dst ) \
asm volatile( "movq (%0), " #m1 "\n" :: "r"(dst) ); \
#define MMX_SAVE_1r( m1, dst ) \
asm volatile( "movq " #m1 ", (%0)\n" :: "r"(dst) ); \
#define MMX_LOAD_2r( m1, m2, dst, i_dst ) \
asm volatile( "movq (%0), " #m1 "\n" :: "r"(&((uint8_t*)dst)[0*(i_dst)]) ); \
asm volatile( "movq (%0), " #m2 "\n" :: "r"(&((uint8_t*)dst)[1*(i_dst)]) )
#define MMX_SAVE_2r( m1, m2, dst, i_dst ) \
asm volatile( "movq " #m1 ", (%0)\n" :: "r"(&((uint8_t*)dst)[0*(i_dst)]) ); \
asm volatile( "movq " #m2 ", (%0)\n" :: "r"(&((uint8_t*)dst)[1*(i_dst)]) )
#define MMX_SAVE_4r( m1, m2, m3, m4, dst, i_dst ) \
asm volatile( "movq " #m1 ", (%0)\n" :: "r"(&((uint8_t*)dst)[0*(i_dst)]) ); \
asm volatile( "movq " #m2 ", (%0)\n" :: "r"(&((uint8_t*)dst)[1*(i_dst)]) ); \
asm volatile( "movq " #m3 ", (%0)\n" :: "r"(&((uint8_t*)dst)[2*(i_dst)]) ); \
asm volatile( "movq " #m4 ", (%0)\n" :: "r"(&((uint8_t*)dst)[3*(i_dst)]) )
#define MMX_LOAD_4r( m1, m2, m3, m4, dst, i_dst ) \
asm volatile( "movq (%0), " #m1 "\n" :: "r"(&((uint8_t*)dst)[0*(i_dst)]) ); \
asm volatile( "movq (%0), " #m2 "\n" :: "r"(&((uint8_t*)dst)[1*(i_dst)]) ); \
asm volatile( "movq (%0), " #m3 "\n" :: "r"(&((uint8_t*)dst)[2*(i_dst)]) ); \
asm volatile( "movq (%0), " #m4 "\n" :: "r"(&((uint8_t*)dst)[3*(i_dst)]) )
static inline int x264_tapfilter( uint8_t *pix, int i_pix_next )
{
return pix[-2*i_pix_next] - 5*pix[-1*i_pix_next] + 20*(pix[0] + pix[1*i_pix_next]) - 5*pix[ 2*i_pix_next] + pix[ 3*i_pix_next];
}
static inline int x264_tapfilter1( uint8_t *pix )
{
return pix[-2] - 5*pix[-1] + 20*(pix[0] + pix[1]) - 5*pix[ 2] + pix[ 3];
}
typedef void (*pf_mc_t)(uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height );
/* Macro to define NxM functions */
/* mc I+H */
#define MC_IH( name, cpu, width, height, off ) \
static void name##_w##width##_##cpu( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) \
{ \
DECLARE_ALIGNED( uint8_t, tmp[width*height], width ); \
\
mc_hh_w##width( src, i_src_stride, tmp, width, i_height ); \
x264_pixel_avg_w##width##_##cpu( dst, i_dst_stride, \
src+(off), i_src_stride, \
tmp, width, i_height ); \
}
/* mc I+V */
#define MC_IV( name, cpu, width, height, off ) \
static void name##_w##width##_##cpu( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) \
{ \
DECLARE_ALIGNED( uint8_t, tmp[width*height], width ); \
\
mc_hv_w##width( src, i_src_stride, tmp, width, i_height ); \
x264_pixel_avg_w##width##_##cpu( dst, i_dst_stride, \
src+(off), i_src_stride, \
tmp, width, i_height ); \
}
/* mc H+V */
#define MC_HV( name, cpu, width, height, off1, off2 ) \
static void name##_w##width##_##cpu( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) \
{ \
DECLARE_ALIGNED( uint8_t, tmp1[width*height], width ); \
DECLARE_ALIGNED( uint8_t, tmp2[width*height], width ); \
\
mc_hv_w##width( src+(off1), i_src_stride, tmp1, width, i_height ); \
mc_hh_w##width( src+(off2), i_src_stride, tmp2, width, i_height ); \
x264_pixel_avg_w##width##_##cpu( dst, i_dst_stride, \
tmp1, width, tmp2, width, \
i_height ); \
}
/* mc C+H */
#define MC_CH( name, cpu, width, height, off ) \
static void name##_w##width##_##cpu( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) \
{ \
DECLARE_ALIGNED( uint8_t, tmp1[width*height], width ); \
DECLARE_ALIGNED( uint8_t, tmp2[width*height], width ); \
\
mc_hc_w##width( src, i_src_stride, tmp1, width, i_height ); \
mc_hh_w##width( src+(off), i_src_stride, tmp2, width, i_height ); \
x264_pixel_avg_w##width##_##cpu( dst, i_dst_stride, \
tmp1, width, tmp2, width, \
i_height ); \
}
/* mc C+V */
#define MC_CV( name, cpu, width, height, off ) \
static void name##_w##width##_##cpu( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) \
{ \
DECLARE_ALIGNED( uint8_t, tmp1[width*height], width ); \
DECLARE_ALIGNED( uint8_t, tmp2[width*height], width ); \
\
mc_hc_w##width( src, i_src_stride, tmp1, width, i_height ); \
mc_hv_w##width( src+(off), i_src_stride, tmp2, width, i_height ); \
x264_pixel_avg_w##width##_##cpu( dst, i_dst_stride, \
tmp1, width, tmp2, width, \
i_height ); \
}
/*****************************************************************************
* MC with width == 4 (height <= 8)
*****************************************************************************/
static inline void mc_hh_w4( uint8_t *src, int i_src, uint8_t *dst, int i_dst, int i_height )
{
const int h4 = i_height / 4;
uint8_t srct[4*8*3];
uint64_t tmp[4];
int y;
src -= 2;
MMX_ZERO( %%mm7 );
MMX_INIT( %%mm6, x264_w0x10 );
for( y = 0; y < h4; y++ )
{
int i;
/* Preload data and transpose them */
MMX_LOAD_4x4 ( %%mm0, %%mm4, %%mm3, %%mm1, %%mm7, &src[0], i_src );
MMX_TRANSPOSE( %%mm0, %%mm4, %%mm3, %%mm1, %%mm2 ); /* 0123 */
MMX_SAVE_4r( %%mm0, %%mm1, %%mm2, %%mm3, &srct[4*8*0], 8 );
MMX_LOAD_4x4 ( %%mm0, %%mm4, %%mm3, %%mm1, %%mm7, &src[4], i_src );
MMX_TRANSPOSE( %%mm0, %%mm4, %%mm3, %%mm1, %%mm2 ); /* 0123 */
MMX_SAVE_4r( %%mm0, %%mm1, %%mm2, %%mm3, &srct[4*8*1], 8 );
/* we read 2 more bytes that needed */
MMX_LOAD_4x4 ( %%mm0, %%mm4, %%mm3, %%mm1, %%mm7, &src[8], i_src );
MMX_TRANSPOSE( %%mm0, %%mm4, %%mm3, %%mm1, %%mm2 ); /* 0123 */
MMX_SAVE_2r( %%mm0, %%mm1, &srct[4*8*2], 8 );
/* tap filter */
for( i = 0; i < 4; i++ )
{
MMX_LOAD_4r( %%mm0, %%mm1, %%mm2, %%mm3, &srct[8*(i+0)], 8 );
MMX_FILTERTAP_P1( %%mm0, %%mm1 );
MMX_FILTERTAP_P2( %%mm0, %%mm2, %%mm3 );
MMX_LOAD_2r( %%mm1, %%mm2, &srct[8*(i+4)], 8 );
MMX_FILTERTAP_P3( %%mm0, %%mm1, %%mm2, %%mm6, %%mm7 );
MMX_SAVE_1r( %%mm0, &tmp[i] );
}
MMX_LOAD_4r( %%mm0, %%mm4, %%mm3, %%mm1, tmp, 8 );
MMX_TRANSPOSE( %%mm0, %%mm4, %%mm3, %%mm1, %%mm2 ); /* 0123 */
MMX_SAVE_4P( %%mm0, %%mm7, &dst[0*i_dst] );
MMX_SAVE_4P( %%mm1, %%mm7, &dst[1*i_dst] );
MMX_SAVE_4P( %%mm2, %%mm7, &dst[2*i_dst] );
MMX_SAVE_4P( %%mm3, %%mm7, &dst[3*i_dst] );
src += 4 * i_src;
dst += 4 * i_dst;
}
}
static inline void mc_hv_w4( uint8_t *src, int i_src, uint8_t *dst, int i_dst, int i_height )
{
int y;
src -= 2 * i_src;
MMX_ZERO( %%mm7 );
MMX_INIT( %%mm6, x264_w0x10 );
for( y = 0; y < i_height; y++ )
{
MMX_LOAD_4x4( %%mm0, %%mm1, %%mm2, %%mm3, %%mm7, src, i_src );
MMX_FILTERTAP_P1( %%mm0, %%mm1 );
MMX_FILTERTAP_P2( %%mm0, %%mm2, %%mm3 );
MMX_LOAD_2x4( %%mm4, %%mm5, %%mm7, &src[4*i_src], i_src );
MMX_FILTERTAP_P3( %%mm0, %%mm4, %%mm5, %%mm6, %%mm7 );
MMX_SAVE_4P( %%mm0, %%mm7, dst );
src += i_src;
dst += i_dst;
}
}
static inline void mc_hc_w4( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
{
int i, x, y;
for( y = 0; y < i_height; y++ )
{
int16_t tap[5+4];
for( i = 0; i < 5+4; i++ )
{
tap[i] = x264_tapfilter( &src[-2+i], i_src_stride );
}
for( x = 0; x < 4; x++ )
{
dst[x] = x264_mc_clip1( ( tap[0+x] - 5*tap[1+x] + 20 * tap[2+x] + 20 * tap[3+x] -5*tap[4+x] + tap[5+x] + 512 ) >> 10 );
}
src += i_src_stride;
dst += i_dst_stride;
}
}
MC_IH( mc_xy10, mmxext, 4, 8, 0 )
MC_IH( mc_xy30, mmxext, 4, 8, 1 )
MC_IV( mc_xy01, mmxext, 4, 8, 0 )
MC_IV( mc_xy03, mmxext, 4, 8, i_src_stride )
MC_HV( mc_xy11, mmxext, 4, 8, 0, 0 )
MC_HV( mc_xy31, mmxext, 4, 8, 1, 0 )
MC_HV( mc_xy13, mmxext, 4, 8, 0, i_src_stride )
MC_HV( mc_xy33, mmxext, 4, 8, 1, i_src_stride )
MC_CH( mc_xy21, mmxext, 4, 8, 0 )
MC_CH( mc_xy23, mmxext, 4, 8, i_src_stride )
MC_CV( mc_xy12, mmxext, 4, 8, 0 )
MC_CV( mc_xy32, mmxext, 4, 8, 1 )