Commit 1014aa4e authored by Eric Petit

common/ppc: more cleaning, optimized a bit


git-svn-id: svn://svn.videolan.org/x264/trunk@201 df754926-b1dd-0310-bc7b-ec298dee348c
parent 77404162
@@ -90,10 +90,10 @@ static inline void pixel_avg_w16( uint8_t *dst, int i_dst,
vec_u8_t src1v, src2v;
for( y = 0; y < i_height; y++ )
{
LOAD_16( src1, src1v );
LOAD_16( src2, src2v );
src1v = vec_load16( src1 );
src2v = vec_load16( src2 );
src1v = vec_avg( src1v, src2v );
STORE_16( src1v, dst );
vec_store16( src1v, dst );
dst += i_dst;
src1 += i_src1;
@@ -175,13 +175,13 @@ static inline void mc_hh_w8( uint8_t *src, int i_src,
for( y = 0; y < i_height; y++ )
{
LOAD_16( &src[-2], loadv );
loadv = vec_load16( &src[-2] );
for( x = 0; x < 6; x++ )
{
_srcv[x] = vec_perm( loadv, zero_u8v,
vec_lvsl( 0, (int*) x ) );
CONVERT_U8_TO_S16( srcv[x], srcv[x] );
srcv[x] = vec_u8_to_s16( _srcv[x] );
}
TAP_FILTER( srcv, tmpv, dstv );
@@ -225,7 +225,6 @@ static inline void mc_hv_w8( uint8_t *src, int i_src,
int x, y;
DECLARE_ALIGNED( int16_t, tmp[8], 16 );
LOAD_ZERO;
vec_s16_t srcv[6];
vec_u8_t * _srcv = (vec_u8_t*) srcv;
vec_s16_t dstv;
@@ -239,15 +238,15 @@ static inline void mc_hv_w8( uint8_t *src, int i_src,
{
srcv[x] = srcv[x+1];
}
LOAD_8( &src[3*i_src], _srcv[5] );
CONVERT_U8_TO_S16( srcv[5], srcv[5] );
_srcv[5] = vec_load8( &src[3*i_src] );
srcv[5] = vec_u8_to_s16( _srcv[5] );
}
else
{
for( x = 0; x < 6; x++ )
{
LOAD_8( &src[(x-2)*i_src], _srcv[x] );
CONVERT_U8_TO_S16( srcv[x], srcv[x] );
_srcv[x] = vec_load8( &src[(x-2)*i_src] );
srcv[x] = vec_u8_to_s16( _srcv[x] );
}
}
@@ -844,7 +843,7 @@ static void mc_chroma_altivec( uint8_t *src, int i_src_stride,
permv = vec_lvsl( 0, (uint8_t *) 1 );
shiftv = vec_splat_u16( 6 );
LOAD_16( src, srcv_8[2] );
srcv_8[2] = vec_load16( src );
srcv_8[3] = vec_perm( srcv_8[2], srcv_8[2], permv );
for( y = 0; y < i_height; y++ )
@@ -853,19 +852,19 @@ static void mc_chroma_altivec( uint8_t *src, int i_src_stride,
srcv_8[0] = srcv_8[2];
srcv_8[1] = srcv_8[3];
LOAD_16( srcp, srcv_8[2] );
srcv_8[2] = vec_load16( srcp );
srcv_8[3] = vec_perm( srcv_8[2], srcv_8[2], permv );
dstv_16 = k32v;
for( i = 0; i < 4; i++ )
{
CONVERT_U8_TO_U16( srcv_8[i], srcv_16[i] );
srcv_16[i] = vec_u8_to_u16( srcv_8[i] );
srcv_16[i] = vec_mladd( coeffv[i], srcv_16[i], zero_u16v );
dstv_16 = vec_add( dstv_16, srcv_16[i] );
}
dstv_16 = vec_sr( dstv_16, shiftv );
CONVERT_U16_TO_U8( dstv_16, dstv_8 );
STORE_8( dstv_8, dst );
dstv_8 = vec_u16_to_u8( dstv_16 );
vec_store8( dstv_8, dst );
dst += i_dst_stride;
srcp += i_src_stride;
@@ -44,13 +44,13 @@ static int name( uint8_t *pix1, int i_pix1, \
int y; \
DECLARE_ALIGNED( int, sum, 16 ); \
\
LOAD_ZERO; \
LOAD_ZERO; \
vec_u8_t pix1v, pix2v; \
vec_s32_t sumv = zero_s32v; \
for( y = 0; y < ly; y++ ) \
{ \
LOAD_##lx( pix1, pix1v ); \
LOAD_##lx( pix2, pix2v ); \
pix1v = vec_load##lx( pix1 ); \
pix2v = vec_load##lx( pix2 ); \
sumv = (vec_s32_t) vec_sum4s( \
vec_sub( vec_max( pix1v, pix2v ), \
vec_min( pix1v, pix2v ) ), \
@@ -81,17 +81,15 @@ static inline int pixel_satd_8x8_altivec( uint8_t *pix1, int i_pix1,
vec_s16_t pix1s16v, pix2s16v;
vec_s16_t diffv[8];
vec_s16_t tmpv[8];
vec_s16_t s01v, s23v, d01v, d23v;
/* Diff 8x8 */
for( i = 0; i < 8; i++ )
{
LOAD_8( pix1, pix1u8v );
LOAD_8( pix2, pix2u8v );
pix1u8v = vec_load8( pix1 );
pix2u8v = vec_load8( pix2 );
/* u8 -> s16 conversion */
CONVERT_U8_TO_S16( pix1u8v, pix1s16v );
CONVERT_U8_TO_S16( pix2u8v, pix2s16v );
pix1s16v = vec_u8_to_s16( pix1u8v );
pix2s16v = vec_u8_to_s16( pix2u8v );
diffv[i] = vec_sub( pix1s16v, pix2s16v );
@@ -100,15 +98,15 @@ static inline int pixel_satd_8x8_altivec( uint8_t *pix1, int i_pix1,
}
/* Hadamar H */
HADAMAR( &diffv[0], &tmpv[0] );
HADAMAR( &diffv[4], &tmpv[4] );
vec_hadamar( &diffv[0], &tmpv[0] );
vec_hadamar( &diffv[4], &tmpv[4] );
/* Transpose */
TRANSPOSE8x8( tmpv, diffv );
vec_transpose8x8( tmpv, diffv );
/* Hadamar V */
HADAMAR( &diffv[0], &tmpv[0] );
HADAMAR( &diffv[4], &tmpv[4] );
vec_hadamar( &diffv[0], &tmpv[0] );
vec_hadamar( &diffv[4], &tmpv[4] );
/* Sum of absolute values */
for( i = 0; i < 8; i++ )
@@ -163,16 +161,15 @@ static inline int pixel_satd_4x4_altivec( uint8_t *pix1, int i_pix1,
vec_s16_t pix1s16v, pix2s16v;
vec_s16_t diffv[4];
vec_s16_t tmpv[4];
vec_s16_t s01v, s23v, d01v, d23v;
/* Diff 4x8 */
for( i = 0; i < 4; i++ )
{
LOAD_4( pix1, pix1u8v );
LOAD_4( pix2, pix2u8v );
pix1u8v = vec_load4( pix1 );
pix2u8v = vec_load4( pix2 );
CONVERT_U8_TO_S16( pix1u8v, pix1s16v );
CONVERT_U8_TO_S16( pix2u8v, pix2s16v );
pix1s16v = vec_u8_to_s16( pix1u8v );
pix2s16v = vec_u8_to_s16( pix2u8v );
diffv[i] = vec_sub( pix1s16v, pix2s16v );
@@ -181,13 +178,13 @@ static inline int pixel_satd_4x4_altivec( uint8_t *pix1, int i_pix1,
}
/* Hadamar H */
HADAMAR( diffv, tmpv );
vec_hadamar( diffv, tmpv );
/* Transpose */
TRANSPOSE4x4( tmpv, diffv );
vec_transpose4x4( tmpv, diffv );
/* Hadamar V */
HADAMAR( diffv, tmpv );
vec_hadamar( diffv, tmpv );
/* Sum of absolute values */
for( i = 0; i < 4; i++ )
@@ -21,6 +21,15 @@
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
*****************************************************************************/
/***********************************************************************
* For constant vectors, use parentheses on OS X and braces on Linux
**********************************************************************/
#ifdef SYS_MACOSX
#define CV(a...) (a)
#else
#define CV(a...) {a}
#endif
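As a minimal usage sketch (the helper below is hypothetical and not part of the tree), the CV wrapper lets one constant-vector literal compile under both the Apple and the GNU syntax:
static inline vec_u8_t vec_u8_ramp( void )
{
    /* Expands to (vec_u8_t) (0,1,...,15) on OS X
     * and to     (vec_u8_t) {0,1,...,15} on Linux */
    return (vec_u8_t) CV(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15);
}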
/***********************************************************************
* Vector types
**********************************************************************/
@@ -44,165 +53,214 @@
#define zero_s32v (vec_s32_t) zerov
/***********************************************************************
* CONVERT_*
* Conversions
**********************************************************************/
#define CONVERT_U8_TO_U16( s, d ) \
d = (vec_u16_t) vec_mergeh( zero_u8v, (vec_u8_t) s )
#define CONVERT_U8_TO_S16( s, d ) \
d = (vec_s16_t) vec_mergeh( zero_u8v, (vec_u8_t) s )
#define CONVERT_U16_TO_U8( s, d ) \
d = (vec_u8_t) vec_pack( (vec_u16_t) s, zero_u16v )
#define CONVERT_S16_TO_U8( s, d ) \
d = (vec_u8_t) vec_pack( (vec_s16_t) s, zero_s16v )
static inline vec_u16_t vec_u8_to_u16( vec_u8_t v )
{
LOAD_ZERO;
return (vec_u16_t) vec_mergeh( zero_u8v, v );
}
static inline vec_u8_t vec_u16_to_u8( vec_u16_t v )
{
LOAD_ZERO;
return (vec_u8_t) vec_pack( v, zero_u16v );
}
#define vec_u8_to_s16(v) (vec_s16_t) vec_u8_to_u16(v)
#define vec_s16_to_u8(v) vec_u16_to_u8( (vec_u16_t) v )
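A hedged scalar sketch of what the widening helper computes (the reference function below is illustrative only, not part of the tree): on big-endian PPC, vec_mergeh( zero_u8v, v ) places a zero byte in front of each of the first eight data bytes, which is exactly zero-extension to 16 bits; vec_pack then truncates each 16-bit lane back to its low byte for the reverse direction.
static inline void u8_to_u16_ref( const uint8_t *src, uint16_t *dst, int n )
{
    int i;
    for( i = 0; i < n; i++ )
        dst[i] = (uint16_t) src[i]; /* high byte 0x00, low byte src[i] */
}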
/***********************************************************************
* LOAD_16
***********************************************************************
* p: uint8_t *
* v: vec_u8_t
* Loads 16 bytes from p into v
* vec_load16
**********************************************************************/
#define LOAD_16( p, v ) \
if( (long) p & 0xF ) \
{ \
v = vec_perm( vec_ld( 0, p ), vec_ld( 16, p ), \
vec_lvsl( 0, p ) ); \
} \
else \
{ \
v = vec_ld( 0, p ); \
static inline vec_u8_t vec_load16( uint8_t * p )
{
if( (long) p & 0xF )
{
vec_u8_t hv, lv, perm;
hv = vec_ld( 0, p );
lv = vec_ld( 16, p );
perm = vec_lvsl( 0, p );
return vec_perm( hv, lv, perm );
}
return vec_ld( 0, p );
}
/***********************************************************************
* LOAD_8
***********************************************************************
* p: uint8_t *
* v: vec_u8_t
* Loads 8 bytes from p into the first half of v
* vec_load8
**********************************************************************/
#define LOAD_8( p, v ) \
if( !( (long) p & 0xF ) ) \
{ \
v = vec_ld( 0, p ); \
} \
else if( ( (long) p & 0xF ) < 9 ) \
{ \
v = vec_perm( vec_ld( 0, p ), zero_u8v, \
vec_lvsl( 0, p ) ); \
} \
else \
{ \
v = vec_perm( vec_ld( 0, p ), vec_ld( 16, p ), \
vec_lvsl( 0, p ) ); \
static inline vec_u8_t vec_load8( uint8_t * p )
{
long align = (long) p & 0xF;
if( align )
{
vec_u8_t hv, perm;
hv = vec_ld( 0, p );
perm = vec_lvsl( 0, p );
if( align > 8 )
{
vec_u8_t lv;
lv = vec_ld( 16, p );
return vec_perm( hv, lv, perm );
}
return vec_perm( hv, hv, perm );
}
return vec_ld( 0, p );
}
/***********************************************************************
* LOAD_4
***********************************************************************
* p: uint8_t *
* v: vec_u8_t
* Loads 4 bytes from p into the first quarter of v
* vec_load4
**********************************************************************/
#define LOAD_4( p, v ) \
if( !( (long) p & 0xF ) ) \
{ \
v = vec_ld( 0, p ); \
} \
else if( ( (long) p & 0xF ) < 13 ) \
{ \
v = vec_perm( vec_ld( 0, p ), zero_u8v, \
vec_lvsl( 0, p ) ); \
} \
else \
{ \
v = vec_perm( vec_ld( 0, p ), vec_ld( 16, p ), \
vec_lvsl( 0, p ) ); \
static inline vec_u8_t vec_load4( uint8_t * p )
{
long align = (long) p & 0xF;
if( align )
{
vec_u8_t hv, perm;
hv = vec_ld( 0, p );
perm = vec_lvsl( 0, p );
if( align > 12 )
{
vec_u8_t lv;
lv = vec_ld( 16, p );
return vec_perm( hv, lv, perm );
}
return vec_perm( hv, hv, perm );
}
return vec_ld( 0, p );
}
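The "align > 8" and "align > 12" tests above encode a simple span check; a small sketch of the reasoning, under the assumption that vec_ld always fetches the aligned 16-byte block containing the address:
/* An n-byte load starting at offset `align` inside a 16-byte block stays
 * within that block iff align + n <= 16; only then can the second vec_ld
 * be skipped. (Illustrative helper, not part of the tree.) */
static inline int load_crosses_block( long align, int n )
{
    return ( align + n ) > 16;
}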
/***********************************************************************
* STORE_16
***********************************************************************
* v: vec_u8_t
* p: uint8_t *
* Stores the 16 bytes from v at address p
* vec_store16
**********************************************************************/
#define STORE_16( v, p ) \
if( (long) p & 0xF ) \
{ \
vec_u8_t hv, lv, tmp1, tmp2; \
hv = vec_ld( 0, p ); \
lv = vec_ld( 16, p ); \
tmp2 = vec_lvsl( 0, p ); \
tmp1 = vec_perm( lv, hv, tmp2 ); \
tmp2 = vec_lvsr( 0, p ); \
hv = vec_perm( tmp1, v, tmp2 ); \
lv = vec_perm( v, tmp1, tmp2 ); \
vec_st( lv, 16, p ); \
vec_st( hv, 0, p ); \
} \
else \
{ \
vec_st( v, 0, p ); \
static inline void vec_store16( vec_u8_t v, uint8_t * p )
{
if( (long) p & 0xF )
{
vec_u8_t hv, lv, tmp1, tmp2;
hv = vec_ld( 0, p );
lv = vec_ld( 16, p );
tmp2 = vec_lvsl( 0, p );
tmp1 = vec_perm( lv, hv, tmp2 );
tmp2 = vec_lvsr( 0, p );
hv = vec_perm( tmp1, v, tmp2 );
lv = vec_perm( v, tmp1, tmp2 );
vec_st( lv, 16, p );
vec_st( hv, 0, p );
return;
}
vec_st( v, 0, p );
}
/***********************************************************************
* vec_store8
**********************************************************************/
static inline void vec_store8( vec_u8_t v, uint8_t * p )
{
LOAD_ZERO;
long align;
vec_u8_t hv, sel;
align = (long) p & 0xF;
hv = vec_ld( 0, p );
sel = (vec_u8_t) CV(-1,-1,-1,-1,-1,-1,-1,-1,0,0,0,0,0,0,0,0);
/* FIXME We can do better than that */
#define STORE_8( v, p ) \
{ \
DECLARE_ALIGNED( uint8_t, _p[16], 16 ); \
vec_st( v, 0, _p ); \
memcpy( p, _p, 8 ); \
if( align )
{
vec_u8_t perm;
perm = vec_lvsr( 0, p );
v = vec_perm( v, v, perm );
if( align > 8 )
{
vec_u8_t lv, hsel, lsel;
lv = vec_ld( 16, p );
hsel = vec_perm( zero_u8v, sel, perm );
lsel = vec_perm( sel, zero_u8v, perm );
hv = vec_sel( hv, v, hsel );
lv = vec_sel( lv, v, lsel );
vec_st( lv, 16, p );
}
else
{
sel = vec_perm( sel, sel, perm );
hv = vec_sel( hv, v, sel );
}
}
else
{
hv = vec_sel( hv, v, sel );
}
vec_st( hv, 0, p );
}
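A hedged scalar model of the vec_sel merge used above (illustrative only): vec_sel( old, v, mask ) takes bits from v where the mask is 1 and keeps old elsewhere, so with the all-0xFF/all-0x00 byte mask built by CV(-1,...,0,...) it splices the eight stored bytes into the existing memory contents.
static inline uint8_t sel_byte_ref( uint8_t old, uint8_t v, uint8_t mask )
{
    return (uint8_t)( ( old & ~mask ) | ( v & mask ) );
}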
/* Transpose 8x8 (vec_s16_t [8]) */
#define TRANSPOSE8x8( a, b ) \
b[0] = vec_mergeh( a[0], a[4] ); \
b[1] = vec_mergel( a[0], a[4] ); \
b[2] = vec_mergeh( a[1], a[5] ); \
b[3] = vec_mergel( a[1], a[5] ); \
b[4] = vec_mergeh( a[2], a[6] ); \
b[5] = vec_mergel( a[2], a[6] ); \
b[6] = vec_mergeh( a[3], a[7] ); \
b[7] = vec_mergel( a[3], a[7] ); \
a[0] = vec_mergeh( b[0], b[4] ); \
a[1] = vec_mergel( b[0], b[4] ); \
a[2] = vec_mergeh( b[1], b[5] ); \
a[3] = vec_mergel( b[1], b[5] ); \
a[4] = vec_mergeh( b[2], b[6] ); \
a[5] = vec_mergel( b[2], b[6] ); \
a[6] = vec_mergeh( b[3], b[7] ); \
a[7] = vec_mergel( b[3], b[7] ); \
b[0] = vec_mergeh( a[0], a[4] ); \
b[1] = vec_mergel( a[0], a[4] ); \
b[2] = vec_mergeh( a[1], a[5] ); \
b[3] = vec_mergel( a[1], a[5] ); \
b[4] = vec_mergeh( a[2], a[6] ); \
b[5] = vec_mergel( a[2], a[6] ); \
b[6] = vec_mergeh( a[3], a[7] ); \
/***********************************************************************
* vec_transpose8x8
**********************************************************************/
static inline void vec_transpose8x8( vec_s16_t * a, vec_s16_t * b )
{
b[0] = vec_mergeh( a[0], a[4] );
b[1] = vec_mergel( a[0], a[4] );
b[2] = vec_mergeh( a[1], a[5] );
b[3] = vec_mergel( a[1], a[5] );
b[4] = vec_mergeh( a[2], a[6] );
b[5] = vec_mergel( a[2], a[6] );
b[6] = vec_mergeh( a[3], a[7] );
b[7] = vec_mergel( a[3], a[7] );
a[0] = vec_mergeh( b[0], b[4] );
a[1] = vec_mergel( b[0], b[4] );
a[2] = vec_mergeh( b[1], b[5] );
a[3] = vec_mergel( b[1], b[5] );
a[4] = vec_mergeh( b[2], b[6] );
a[5] = vec_mergel( b[2], b[6] );
a[6] = vec_mergeh( b[3], b[7] );
a[7] = vec_mergel( b[3], b[7] );
b[0] = vec_mergeh( a[0], a[4] );
b[1] = vec_mergel( a[0], a[4] );
b[2] = vec_mergeh( a[1], a[5] );
b[3] = vec_mergel( a[1], a[5] );
b[4] = vec_mergeh( a[2], a[6] );
b[5] = vec_mergel( a[2], a[6] );
b[6] = vec_mergeh( a[3], a[7] );
b[7] = vec_mergel( a[3], a[7] );
}
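For reference, the three merge passes compute an ordinary 8x8 transpose of the 16-bit lanes; a scalar equivalent, assuming each vec_s16_t holds one row of the matrix (illustrative helper only):
static void transpose8x8_ref( const int16_t a[8][8], int16_t b[8][8] )
{
    int i, j;
    for( i = 0; i < 8; i++ )
        for( j = 0; j < 8; j++ )
            b[j][i] = a[i][j];
}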
/* Transpose 4x4 (vec_s16_t [4]) */
#define TRANSPOSE4x4( a, b ) \
(b)[0] = vec_mergeh( (a)[0], zero_s16v ); \
(b)[1] = vec_mergeh( (a)[1], zero_s16v ); \
(b)[2] = vec_mergeh( (a)[2], zero_s16v ); \
(b)[3] = vec_mergeh( (a)[3], zero_s16v ); \
(a)[0] = vec_mergeh( (b)[0], (b)[2] ); \
(a)[1] = vec_mergel( (b)[0], (b)[2] ); \
(a)[2] = vec_mergeh( (b)[1], (b)[3] ); \
(a)[3] = vec_mergel( (b)[1], (b)[3] ); \
(b)[0] = vec_mergeh( (a)[0], (a)[2] ); \
(b)[1] = vec_mergel( (a)[0], (a)[2] ); \
(b)[2] = vec_mergeh( (a)[1], (a)[3] ); \
(b)[3] = vec_mergel( (a)[1], (a)[3] );
/* Hadamar (vec_s16_t [4]) */
#define HADAMAR( a, b ) \
s01v = vec_add( (a)[0], (a)[1] ); \
s23v = vec_add( (a)[2], (a)[3] ); \
d01v = vec_sub( (a)[0], (a)[1] ); \
d23v = vec_sub( (a)[2], (a)[3] ); \
(b)[0] = vec_add( s01v, s23v ); \
(b)[1] = vec_sub( s01v, s23v ); \
(b)[2] = vec_sub( d01v, d23v ); \
(b)[3] = vec_add( d01v, d23v );
/***********************************************************************
* vec_transpose4x4
**********************************************************************/
static inline void vec_transpose4x4( vec_s16_t * a, vec_s16_t * b )
{
#define WHATEVER a[0]
b[0] = vec_mergeh( a[0], WHATEVER );
b[1] = vec_mergeh( a[1], WHATEVER );
b[2] = vec_mergeh( a[2], WHATEVER );
b[3] = vec_mergeh( a[3], WHATEVER );
a[0] = vec_mergeh( b[0], b[2] );
a[1] = vec_mergel( b[0], b[2] );
a[2] = vec_mergeh( b[1], b[3] );
a[3] = vec_mergel( b[1], b[3] );
b[0] = vec_mergeh( a[0], a[2] );
b[1] = vec_mergel( a[0], a[2] );
b[2] = vec_mergeh( a[1], a[3] );
b[3] = vec_mergel( a[1], a[3] );
#undef WHATEVER
}
/***********************************************************************
* vec_hadamar
***********************************************************************
* b[0] = a[0] + a[1] + a[2] + a[3]
* b[1] = a[0] + a[1] - a[2] - a[3]
* b[2] = a[0] - a[1] - a[2] + a[3]
* b[3] = a[0] - a[1] + a[2] - a[3]
**********************************************************************/
static inline void vec_hadamar( vec_s16_t * a, vec_s16_t * b )
{
b[2] = vec_add( a[0], a[1] );
b[3] = vec_add( a[2], a[3] );
a[0] = vec_sub( a[0], a[1] );
a[2] = vec_sub( a[2], a[3] );
b[0] = vec_add( b[2], b[3] );
b[1] = vec_sub( b[2], b[3] );
b[2] = vec_sub( a[0], a[2] );
b[3] = vec_add( a[0], a[2] );
}
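A per-lane scalar model of the butterfly, useful as a sanity check against the formulas in the header comment (illustrative helper, not part of the tree); for a = {1,2,3,4} it yields b = {10,-4,0,-2}.
static void hadamar4_ref( const int16_t a[4], int16_t b[4] )
{
    int16_t s01 = (int16_t)( a[0] + a[1] );
    int16_t s23 = (int16_t)( a[2] + a[3] );
    int16_t d01 = (int16_t)( a[0] - a[1] );
    int16_t d23 = (int16_t)( a[2] - a[3] );
    b[0] = (int16_t)( s01 + s23 );   /* a0 + a1 + a2 + a3 */
    b[1] = (int16_t)( s01 - s23 );   /* a0 + a1 - a2 - a3 */
    b[2] = (int16_t)( d01 - d23 );   /* a0 - a1 - a2 + a3 */
    b[3] = (int16_t)( d01 + d23 );   /* a0 - a1 + a2 - a3 */
}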