Commit fe905276 authored by Eric Petit

configure: use -falign-loops=16 on OS X

 common/ppc/: added AltiVecized mc_chroma + cleaning
 checkasm.c:  really fixed MC tests


git-svn-id: svn://svn.videolan.org/x264/trunk@199 df754926-b1dd-0310-bc7b-ec298dee348c
parent a1b95317
@@ -87,7 +87,7 @@ static inline void pixel_avg_w16( uint8_t *dst, int i_dst,
int i_height )
{
int y;
vector_u8_t src1v, src2v;
vec_u8_t src1v, src2v;
for( y = 0; y < i_height; y++ )
{
LOAD_16( src1, src1v );
@@ -119,8 +119,8 @@ MC_COPY( mc_copy_w8, 8 )
MC_COPY( mc_copy_w16, 16 )
/* TAP_FILTER:
a is source (vector_s16_t [6])
b is a temporary vector_s16_t
a is source (vec_s16_t [6])
b is a temporary vec_s16_t
c is the result
c = src[0] + a[5] - 5 * ( a[1] + a[4] ) + 20 * ( a[2] + a[3] );
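For reference, a minimal scalar sketch of the half-pel filter that TAP_FILTER vectorizes (not part of the patch; reading "src[0]" above as a[0], since a is the source array). The value is the unrounded, unclipped intermediate that the macro produces:

#include <stdint.h>

static inline int tap_filter_ref( const int16_t a[6] )
{
    return a[0] + a[5] - 5 * ( a[1] + a[4] ) + 20 * ( a[2] + a[3] );
}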
@@ -167,11 +167,11 @@ static inline void mc_hh_w8( uint8_t *src, int i_src,
DECLARE_ALIGNED( int16_t, tmp[8], 16 );
LOAD_ZERO;
vector_u8_t loadv;
vector_s16_t srcv[6];
vector_u8_t * _srcv = (vector_u8_t*) srcv;
vector_s16_t dstv;
vector_s16_t tmpv;
vec_u8_t loadv;
vec_s16_t srcv[6];
vec_u8_t * _srcv = (vec_u8_t*) srcv;
vec_s16_t dstv;
vec_s16_t tmpv;
for( y = 0; y < i_height; y++ )
{
@@ -179,9 +179,9 @@ static inline void mc_hh_w8( uint8_t *src, int i_src,
for( x = 0; x < 6; x++ )
{
_srcv[x] = vec_perm( loadv, zero_u8,
_srcv[x] = vec_perm( loadv, zero_u8v,
vec_lvsl( 0, (int*) x ) );
CONVERT_U8_TO_S16( srcv[x] );
CONVERT_U8_TO_S16( srcv[x], srcv[x] );
}
TAP_FILTER( srcv, tmpv, dstv );
@@ -226,10 +226,10 @@ static inline void mc_hv_w8( uint8_t *src, int i_src,
DECLARE_ALIGNED( int16_t, tmp[8], 16 );
LOAD_ZERO;
vector_s16_t srcv[6];
vector_u8_t * _srcv = (vector_u8_t*) srcv;
vector_s16_t dstv;
vector_s16_t tmpv;
vec_s16_t srcv[6];
vec_u8_t * _srcv = (vec_u8_t*) srcv;
vec_s16_t dstv;
vec_s16_t tmpv;
for( y = 0; y < i_height; y++ )
{
@@ -240,14 +240,14 @@ static inline void mc_hv_w8( uint8_t *src, int i_src,
srcv[x] = srcv[x+1];
}
LOAD_8( &src[3*i_src], _srcv[5] );
CONVERT_U8_TO_S16( srcv[5] );
CONVERT_U8_TO_S16( srcv[5], srcv[5] );
}
else
{
for( x = 0; x < 6; x++ )
{
LOAD_8( &src[(x-2)*i_src], _srcv[x] );
CONVERT_U8_TO_S16( srcv[x] );
CONVERT_U8_TO_S16( srcv[x], srcv[x] );
}
}
@@ -787,8 +787,94 @@ uint8_t *get_ref_altivec( uint8_t *src[4], int i_src_stride,
}
}
static void mc_chroma_altivec( uint8_t *src, int i_src_stride,
uint8_t *dst, int i_dst_stride,
int mvx, int mvy,
int i_width, int i_height )
{
uint8_t *srcp;
int x, y;
int d8x = mvx & 0x07;
int d8y = mvy & 0x07;
DECLARE_ALIGNED( uint16_t, coeff[4], 16 );
coeff[0] = (8-d8x)*(8-d8y);
coeff[1] = d8x *(8-d8y);
coeff[2] = (8-d8x)*d8y;
coeff[3] = d8x *d8y;
src += (mvy >> 3) * i_src_stride + (mvx >> 3);
srcp = &src[i_src_stride];
if( i_width < 8 )
{
/* TODO: optimize */
for( y = 0; y < i_height; y++ )
{
for( x = 0; x < i_width; x++ )
{
dst[x] = ( coeff[0]*src[x] + coeff[1]*src[x+1] +
coeff[2]*srcp[x] + coeff[3]*srcp[x+1] + 32 ) >> 6;
}
dst += i_dst_stride;
src = srcp;
srcp += i_src_stride;
}
return;
}
/* We now assume that i_width == 8 */
LOAD_ZERO;
vec_u16_t coeffv[4];
vec_u16_t k32v;
vec_u8_t srcv_8[4];
vec_u16_t srcv_16[4];
vec_u8_t dstv_8;
vec_u16_t dstv_16;
vec_u8_t permv;
vec_u16_t shiftv;
coeffv[0] = vec_ld( 0, coeff );
coeffv[3] = vec_splat( coeffv[0], 3 );
coeffv[2] = vec_splat( coeffv[0], 2 );
coeffv[1] = vec_splat( coeffv[0], 1 );
coeffv[0] = vec_splat( coeffv[0], 0 );
k32v = vec_sl( vec_splat_u16( 1 ), vec_splat_u16( 5 ) );
permv = vec_lvsl( 0, (uint8_t *) 1 );
shiftv = vec_splat_u16( 6 );
LOAD_16( src, srcv_8[2] );
srcv_8[3] = vec_perm( srcv_8[2], srcv_8[2], permv );
for( y = 0; y < i_height; y++ )
{
int i;
srcv_8[0] = srcv_8[2];
srcv_8[1] = srcv_8[3];
LOAD_16( srcp, srcv_8[2] );
srcv_8[3] = vec_perm( srcv_8[2], srcv_8[2], permv );
dstv_16 = k32v;
for( i = 0; i < 4; i++ )
{
CONVERT_U8_TO_U16( srcv_8[i], srcv_16[i] );
srcv_16[i] = vec_mladd( coeffv[i], srcv_16[i], zero_u16v );
dstv_16 = vec_add( dstv_16, srcv_16[i] );
}
dstv_16 = vec_sr( dstv_16, shiftv );
CONVERT_U16_TO_U8( dstv_16, dstv_8 );
STORE_8( dstv_8, dst );
dst += i_dst_stride;
srcp += i_src_stride;
}
}
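A quick sanity sketch of the bilinear weights used above (not part of the patch): for hypothetical values mvx = 3, mvy = 5 we get d8x = 3, d8y = 5 and coeff = {15, 9, 25, 15}. The four weights always sum to 64, which is why adding 32 and shifting right by 6 yields a correctly rounded result in the 0..255 range:

#include <assert.h>

static void check_chroma_weights( void )
{
    int d8x, d8y;
    for( d8x = 0; d8x < 8; d8x++ )
        for( d8y = 0; d8y < 8; d8y++ )
            assert( (8-d8x)*(8-d8y) + d8x*(8-d8y)
                  + (8-d8x)*d8y     + d8x*d8y == 64 );
}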
void x264_mc_altivec_init( x264_mc_functions_t *pf )
{
pf->mc_luma = mc_luma_altivec;
pf->get_ref = get_ref_altivec;
pf->mc_chroma = mc_chroma_altivec;
}
@@ -44,21 +44,21 @@ static int name( uint8_t *pix1, int i_pix1, \
int y; \
DECLARE_ALIGNED( int, sum, 16 ); \
\
LOAD_ZERO; \
vector_u8_t pix1v, pix2v; \
vector_s32_t sumv = zero_s32; \
LOAD_ZERO; \
vec_u8_t pix1v, pix2v; \
vec_s32_t sumv = zero_s32v; \
for( y = 0; y < ly; y++ ) \
{ \
LOAD_##lx( pix1, pix1v ); \
LOAD_##lx( pix2, pix2v ); \
sumv = (vector_s32_t) vec_sum4s( \
sumv = (vec_s32_t) vec_sum4s( \
vec_sub( vec_max( pix1v, pix2v ), \
vec_min( pix1v, pix2v ) ), \
(vector_u32_t) sumv ); \
(vec_u32_t) sumv ); \
pix1 += i_pix1; \
pix2 += i_pix2; \
} \
sumv = vec_sum##a( sumv, zero_s32 ); \
sumv = vec_sum##a( sumv, zero_s32v ); \
vec_ste( vec_splat( sumv, b ), 0, &sum ); \
return sum; \
}
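A scalar view of the per-byte step in the SAD macro above (a sketch, not part of the patch): for unsigned operands, max(a,b) - min(a,b) equals |a - b| and can never wrap, which is why the vector code combines vec_max, vec_min and vec_sub before accumulating with vec_sum4s:

#include <stdint.h>

static inline unsigned sad_step( uint8_t a, uint8_t b )
{
    uint8_t hi = a > b ? a : b;
    uint8_t lo = a > b ? b : a;
    return (unsigned)( hi - lo );
}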
@@ -76,12 +76,12 @@ static inline int pixel_satd_8x8_altivec( uint8_t *pix1, int i_pix1,
DECLARE_ALIGNED( int, i_satd, 16 );
LOAD_ZERO;
vector_s32_t satdv = zero_s32;
vector_u8_t pix1u8v, pix2u8v;
vector_s16_t pix1s16v, pix2s16v;
vector_s16_t diffv[8];
vector_s16_t tmpv[8];
vector_s16_t s01v, s23v, d01v, d23v;
vec_s32_t satdv = zero_s32v;
vec_u8_t pix1u8v, pix2u8v;
vec_s16_t pix1s16v, pix2s16v;
vec_s16_t diffv[8];
vec_s16_t tmpv[8];
vec_s16_t s01v, s23v, d01v, d23v;
/* Diff 8x8 */
for( i = 0; i < 8; i++ )
@@ -90,8 +90,8 @@ static inline int pixel_satd_8x8_altivec( uint8_t *pix1, int i_pix1,
LOAD_8( pix2, pix2u8v );
/* u8 -> s16 conversion */
pix1s16v = (vector_s16_t) vec_mergeh( zero_u8, pix1u8v );
pix2s16v = (vector_s16_t) vec_mergeh( zero_u8, pix2u8v );
CONVERT_U8_TO_S16( pix1u8v, pix1s16v );
CONVERT_U8_TO_S16( pix2u8v, pix2s16v );
diffv[i] = vec_sub( pix1s16v, pix2s16v );
@@ -115,7 +115,7 @@ static inline int pixel_satd_8x8_altivec( uint8_t *pix1, int i_pix1,
{
satdv = vec_sum4s( vec_abs( tmpv[i] ), satdv );
}
satdv = vec_sums( satdv, zero_s32 );
satdv = vec_sums( satdv, zero_s32v );
/* Done */
vec_ste( vec_splat( satdv, 3 ), 0, &i_satd );
@@ -158,12 +158,12 @@ static inline int pixel_satd_4x4_altivec( uint8_t *pix1, int i_pix1,
DECLARE_ALIGNED( int, i_satd, 16 );
LOAD_ZERO;
vector_s32_t satdv = zero_s32;
vector_u8_t pix1u8v, pix2u8v;
vector_s16_t pix1s16v, pix2s16v;
vector_s16_t diffv[4];
vector_s16_t tmpv[4];
vector_s16_t s01v, s23v, d01v, d23v;
vec_s32_t satdv = zero_s32v;
vec_u8_t pix1u8v, pix2u8v;
vec_s16_t pix1s16v, pix2s16v;
vec_s16_t diffv[4];
vec_s16_t tmpv[4];
vec_s16_t s01v, s23v, d01v, d23v;
/* Diff 4x8 */
for( i = 0; i < 4; i++ )
@@ -171,9 +171,8 @@ static inline int pixel_satd_4x4_altivec( uint8_t *pix1, int i_pix1,
LOAD_4( pix1, pix1u8v );
LOAD_4( pix2, pix2u8v );
/* u8 -> s16 conversion */
pix1s16v = (vector_s16_t) vec_mergeh( zero_u8, pix1u8v );
pix2s16v = (vector_s16_t) vec_mergeh( zero_u8, pix2u8v );
CONVERT_U8_TO_S16( pix1u8v, pix1s16v );
CONVERT_U8_TO_S16( pix2u8v, pix2s16v );
diffv[i] = vec_sub( pix1s16v, pix2s16v );
@@ -195,7 +194,7 @@ static inline int pixel_satd_4x4_altivec( uint8_t *pix1, int i_pix1,
{
satdv = vec_sum4s( vec_abs( tmpv[i] ), satdv );
}
satdv = vec_sum2s( satdv, zero_s32 );
satdv = vec_sum2s( satdv, zero_s32v );
/* Done */
vec_ste( vec_splat( satdv, 1 ), 0, &i_satd );
@@ -21,22 +21,47 @@
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
*****************************************************************************/
/* Handy */
#define vector_u8_t vector unsigned char
#define vector_s16_t vector signed short
#define vector_u32_t vector unsigned int
#define vector_s32_t vector signed int
/***********************************************************************
* Vector types
**********************************************************************/
#define vec_u8_t vector unsigned char
#define vec_s8_t vector signed char
#define vec_u16_t vector unsigned short
#define vec_s16_t vector signed short
#define vec_u32_t vector unsigned int
#define vec_s32_t vector signed int
#define LOAD_ZERO vector_s32_t zero = vec_splat_s32( 0 )
#define zero_u8 (vector_u8_t) zero
#define zero_s16 (vector_s16_t) zero
#define zero_s32 (vector_s32_t) zero
/***********************************************************************
* Null vector
**********************************************************************/
#define LOAD_ZERO vec_u8_t zerov = vec_splat_u8( 0 )
#define CONVERT_U8_TO_S16( a ) \
a = (vector_s16_t) vec_mergeh( zero_u8, (vector_u8_t) a )
#define zero_u8v (vec_u8_t) zerov
#define zero_s8v (vec_s8_t) zerov
#define zero_u16v (vec_u16_t) zerov
#define zero_s16v (vec_s16_t) zerov
#define zero_u32v (vec_u32_t) zerov
#define zero_s32v (vec_s32_t) zerov
/* Macros to load aligned or unaligned data without risking buffer
overflows. */
/***********************************************************************
* CONVERT_*
**********************************************************************/
#define CONVERT_U8_TO_U16( s, d ) \
d = (vec_u16_t) vec_mergeh( zero_u8v, (vec_u8_t) s )
#define CONVERT_U8_TO_S16( s, d ) \
d = (vec_s16_t) vec_mergeh( zero_u8v, (vec_u8_t) s )
#define CONVERT_U16_TO_U8( s, d ) \
d = (vec_u8_t) vec_pack( (vec_u16_t) s, zero_u16v )
#define CONVERT_S16_TO_U8( s, d ) \
d = (vec_u8_t) vec_pack( (vec_s16_t) s, zero_s16v )
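These conversions rely on the big-endian lane order of the PowerPC targets this code runs on: vec_mergeh interleaves a 0x00 byte in front of each data byte, which is exactly zero-extension to 16 bits, so no shift or mask is needed. A hypothetical helper (not part of the patch, assumes this header and AltiVec support) showing the pattern the MC and SATD kernels above follow:

static inline void load8_widen( uint8_t *pix, vec_s16_t *out )
{
    LOAD_ZERO;
    vec_u8_t pixv;
    LOAD_8( pix, pixv );             /* 8 bytes land in the first half */
    CONVERT_U8_TO_S16( pixv, *out ); /* zero byte merged in front of
                                        each pixel == zero-extension   */
}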
/***********************************************************************
* LOAD_16
***********************************************************************
* p: uint8_t *
* v: vec_u8_t
* Loads 16 bytes from p into v
**********************************************************************/
#define LOAD_16( p, v ) \
if( (long) p & 0xF ) \
{ \
@@ -48,62 +73,87 @@
v = vec_ld( 0, p ); \
}
#define LOAD_8( p, v ) \
if( !( (long) p & 0xF ) ) \
{ \
v = vec_ld( 0, p ); \
} \
else if( ( (long) p & 0xF ) < 9 ) \
{ \
v = vec_perm( vec_ld( 0, p ), (vector unsigned char) zero, \
vec_lvsl( 0, p ) ); \
} \
else \
{ \
v = vec_perm( vec_ld( 0, p ), vec_ld( 16, p ), \
vec_lvsl( 0, p ) ); \
/***********************************************************************
* LOAD_8
***********************************************************************
* p: uint8_t *
* v: vec_u8_t
* Loads 8 bytes from p into the first half of v
**********************************************************************/
#define LOAD_8( p, v ) \
if( !( (long) p & 0xF ) ) \
{ \
v = vec_ld( 0, p ); \
} \
else if( ( (long) p & 0xF ) < 9 ) \
{ \
v = vec_perm( vec_ld( 0, p ), zero_u8v, \
vec_lvsl( 0, p ) ); \
} \
else \
{ \
v = vec_perm( vec_ld( 0, p ), vec_ld( 16, p ), \
vec_lvsl( 0, p ) ); \
}
/***********************************************************************
* LOAD_4
***********************************************************************
* p: uint8_t *
* v: vec_u8_t
* Loads 4 bytes from p into the first quarter of v
**********************************************************************/
#define LOAD_4( p, v ) \
if( !( (long) p & 0xF ) ) \
{ \
v = vec_ld( 0, p ); \
} \
else if( ( (long) p & 0xF ) < 13 ) \
{ \
v = vec_perm( vec_ld( 0, p ), zero_u8v, \
vec_lvsl( 0, p ) ); \
} \
else \
{ \
v = vec_perm( vec_ld( 0, p ), vec_ld( 16, p ), \
vec_lvsl( 0, p ) ); \
}
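The alignment thresholds in LOAD_8 and LOAD_4 come from a simple bound: n bytes starting at offset (p & 0xF) stay inside one aligned 16-byte block as long as offset + n <= 16, so only larger offsets need the second vec_ld (which is what the header's "without risking buffer overflows" comment is about). A small sketch of that test, not part of the patch:

#include <stdint.h>

static inline int needs_second_load( const uint8_t *p, int n )
{
    /* true for offsets >= 9 with n == 8, and >= 13 with n == 4 */
    return (int)( (long) p & 0xF ) + n > 16;
}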
#define LOAD_4( p, v ) \
if( !( (long) p & 0xF ) ) \
{ \
v = vec_ld( 0, p ); \
} \
else if( ( (long) p & 0xF ) < 13 ) \
{ \
v = vec_perm( vec_ld( 0, p ), (vector unsigned char) zero, \
vec_lvsl( 0, p ) ); \
} \
else \
{ \
v = vec_perm( vec_ld( 0, p ), vec_ld( 16, p ), \
vec_lvsl( 0, p ) ); \
/***********************************************************************
* STORE_16
***********************************************************************
* v: vec_u8_t
* p: uint8_t *
* Stores the 16 bytes from v at address p
**********************************************************************/
#define STORE_16( v, p ) \
if( (long) p & 0xF ) \
{ \
vec_u8_t hv, lv, tmp1, tmp2; \
hv = vec_ld( 0, p ); \
lv = vec_ld( 16, p ); \
tmp2 = vec_lvsl( 0, p ); \
tmp1 = vec_perm( lv, hv, tmp2 ); \
tmp2 = vec_lvsr( 0, p ); \
hv = vec_perm( tmp1, v, tmp2 ); \
lv = vec_perm( v, tmp1, tmp2 ); \
vec_st( lv, 16, p ); \
vec_st( hv, 0, p ); \
} \
else \
{ \
vec_st( v, 0, p ); \
}
/* Store aligned or unaligned data */
#define STORE_16( v, p ) \
if( (long) p & 0xF ) \
{ \
vector unsigned char tmp1, tmp2; \
vector unsigned char align, mask; \
tmp1 = vec_ld( 0, p ); \
tmp2 = vec_ld( 16, p ); \
align = vec_lvsr( 0, p ); \
mask = vec_perm( (vector unsigned char) {0}, \
(vector unsigned char) {1}, \
align); \
v = vec_perm( v, v, align); \
tmp1 = vec_sel( tmp1, v, mask ); \
tmp2 = vec_sel( v, tmp2, mask ); \
vec_st( tmp1, 0, p ); \
vec_st( tmp2, 16, p ); \
} \
else \
{ \
vec_st( v, 0, p ); \
/* FIXME We can do better than that */
#define STORE_8( v, p ) \
{ \
DECLARE_ALIGNED( uint8_t, _p[16], 16 ); \
vec_st( v, 0, _p ); \
memcpy( p, _p, 8 ); \
}
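A rough scalar model of the unaligned STORE_16 path above (a sketch under the stated assumptions, not a drop-in replacement): the two aligned 16-byte blocks covering [p, p+16) are read, the new bytes are spliced in at offset p & 0xF, and both blocks are written back, so the neighbouring bytes that the vec_perm/vec_st sequence preserves are preserved here as well:

#include <string.h>
#include <stdint.h>

static void store16_model( const uint8_t v[16], uint8_t *p )
{
    uint8_t *base = (uint8_t *)( (uintptr_t) p & ~(uintptr_t) 0xF );
    uint8_t block[32];
    memcpy( block, base, 32 );                        /* both aligned blocks */
    memcpy( block + ( (uintptr_t) p & 0xF ), v, 16 ); /* splice in new bytes */
    memcpy( base, block, 32 );                        /* write them back     */
}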
/* Transpose 8x8 (vector_s16_t [8]) */
/* Transpose 8x8 (vec_s16_t [8]) */
#define TRANSPOSE8x8( a, b ) \
b[0] = vec_mergeh( a[0], a[4] ); \
b[1] = vec_mergel( a[0], a[4] ); \
@@ -130,12 +180,12 @@
b[6] = vec_mergeh( a[3], a[7] ); \
b[7] = vec_mergel( a[3], a[7] );
/* Transpose 4x4 (vector_s16_t [4]) */
/* Transpose 4x4 (vec_s16_t [4]) */
#define TRANSPOSE4x4( a, b ) \
(b)[0] = vec_mergeh( (a)[0], zero_s16 ); \
(b)[1] = vec_mergeh( (a)[1], zero_s16 ); \
(b)[2] = vec_mergeh( (a)[2], zero_s16 ); \
(b)[3] = vec_mergeh( (a)[3], zero_s16 ); \
(b)[0] = vec_mergeh( (a)[0], zero_s16v ); \
(b)[1] = vec_mergeh( (a)[1], zero_s16v ); \
(b)[2] = vec_mergeh( (a)[2], zero_s16v ); \
(b)[3] = vec_mergeh( (a)[3], zero_s16v ); \
(a)[0] = vec_mergeh( (b)[0], (b)[2] ); \
(a)[1] = vec_mergel( (b)[0], (b)[2] ); \
(a)[2] = vec_mergeh( (b)[1], (b)[3] ); \
@@ -145,7 +195,7 @@
(b)[2] = vec_mergeh( (a)[1], (a)[3] ); \
(b)[3] = vec_mergel( (a)[1], (a)[3] );
/* Hadamar (vector_s16_t [4]) */
/* Hadamar (vec_s16_t [4]) */
#define HADAMAR( a, b ) \
s01v = vec_add( (a)[0], (a)[1] ); \
s23v = vec_add( (a)[2], (a)[3] ); \
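The HADAMAR macro is only partially visible in this hunk. As a hedged scalar sketch, the 4-point butterfly it performs on each lane looks like the following (the exact output ordering is an assumption here; it does not matter for SATD, which only sums absolute values afterwards):

#include <stdint.h>

static inline void hadamard4( const int16_t a[4], int16_t b[4] )
{
    int16_t s01 = a[0] + a[1], s23 = a[2] + a[3];
    int16_t d01 = a[0] - a[1], d23 = a[2] - a[3];
    b[0] = s01 + s23;
    b[1] = d01 + d23;
    b[2] = s01 - s23;
    b[3] = d01 - d23;
}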
@@ -35,6 +35,7 @@ case "$UNAMES" in
;;
Darwin)
SYS="MACOSX"
CFLAGS="$CFLAGS -falign-loops=16"
LDFLAGS="$LDFLAGS -lm -lmx"
;;
FreeBSD)
@@ -11,6 +11,7 @@
#endif
#ifdef ARCH_PPC
#include "common/ppc/pixel.h"
#include "common/ppc/mc.h"
#endif
/* buf1, buf2: initialised to random data; the code under test shouldn't write into them */
@@ -262,12 +263,11 @@ static int check_mc()
#define MC_TEST_LUMA( w, h ) \
if( mc_a.mc_luma ) \
{ \
memset(buf1, 0xCD, 1024); \
memset(buf3, 0xCD, 1024); \
memset(buf4, 0xCD, 1024); \
mc_c.mc_luma( src2, 32, dst1, 16, dx, dy, w, h ); \
mc_a.mc_luma( src2, 32, dst2, 16, dx, dy, w, h ); \
if( memcmp( dst1, dst2, 16*16 ) ) \
if( memcmp( buf3, buf4, 1024 ) ) \
{ \
fprintf( stderr, "mc_luma[mv(%d,%d) %2dx%-2d] [FAILED]\n", dx, dy, w, h ); \
ok[0] = 0; \
@@ -277,11 +277,11 @@ static int check_mc()
#define MC_TEST_CHROMA( w, h ) \
if( mc_a.mc_chroma ) \
{ \
memset(dst1, 0xCD, (h) * 16); \
memset(buf3, 0xCD, 1024); \
memset(buf4, 0xCD, 1024); \
mc_c.mc_chroma( src, 32, dst1, 16, dx, dy, w, h ); \
memset(dst2, 0xCD, (h) * 16); \
mc_a.mc_chroma( src, 32, dst2, 16, dx, dy, w, h ); \
if( memcmp( dst1, dst2, 16*16 ) ) \
if( memcmp( buf3, buf4, 1024 ) ) \
{ \
fprintf( stderr, "mc_chroma[mv(%d,%d) %2dx%-2d] [FAILED]\n", dx, dy, w, h ); \
ok[1] = 0; \
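Condensed from the macro above, the "really fixed" test pattern works like this: both destination buffers (which back dst1 and dst2, as the comparison implies) are filled with a known byte before the C and optimized versions run, and then the whole 1024-byte buffers are compared, so stray writes outside the intended w x h block are caught as well, not just mismatches inside the first 16x16 bytes:

memset( buf3, 0xCD, 1024 );                        /* backs dst1 */
memset( buf4, 0xCD, 1024 );                        /* backs dst2 */
mc_c.mc_chroma( src, 32, dst1, 16, dx, dy, w, h );
mc_a.mc_chroma( src, 32, dst2, 16, dx, dy, w, h );
if( memcmp( buf3, buf4, 1024 ) )                   /* full-buffer compare */
    ok[1] = 0;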