Commit 2d5dcf8c authored by Guillaume Poirier

Add AltiVec version of hadamard_ac. 2.4x faster than the C version.

Note that this implementation is pretty naive and should be improved
by implementing what's discussed in this ML thread:
date: Mon, Feb 2, 2009 at 6:58 PM
subject: Re: [x264-devel] [PATCH] AltiVec implementation of hadamard_ac routines
parent 2669f7dd
......@@ -98,6 +98,8 @@ PIXEL_SAD_ALTIVEC( pixel_sad_8x8_altivec, 8, 8, 2s, 1 )
/* Statement form: overwrites `a` with max( a, zero_s16v - a ), i.e. its
 * element-wise absolute value. zero_s16v must be visible at the point of use.
 * The expansion already ends with ';' and names `a` three times, so pass a
 * plain variable, not an expression with side effects. */
#define VEC_ABS(a) \
a = vec_max( a, vec_sub( zero_s16v, a ) );
/* Expression form: yields max( a, zero_s16v - a ) cast to vec_u16_t and leaves
 * `a` untouched. `a` is evaluated twice. */
#define VEC_ABSOLUTE(a) (vec_u16_t)vec_max( a, vec_sub( zero_s16v, a ) )
/***********************************************************************
* VEC_ADD_ABS
***********************************************************************
......@@ -1850,6 +1852,144 @@ static int pixel_sa8d_16x16_altivec( uint8_t *pix1, int i_pix1,
return i_satd;
}
/* 4-point Hadamard butterfly applied to whole vec_s16_t vectors:
 *   d0 = (s0+s1) + (s2+s3)      d1 = (s0-s1) + (s2-s3)
 *   d2 = (s0+s1) - (s2+s3)      d3 = (s0-s1) - (s2-s3)
 * Every input is consumed into a temporary before any output is written. */
#define HADAMARD4_ALTIVEC(d0,d1,d2,d3,s0,s1,s2,s3) {\
    vec_s16_t had_sum01  = vec_add(s0, s1);         \
    vec_s16_t had_sum23  = vec_add(s2, s3);         \
    vec_s16_t had_diff01 = vec_sub(s0, s1);         \
    vec_s16_t had_diff23 = vec_sub(s2, s3);         \
    d0 = vec_add(had_sum01,  had_sum23);            \
    d1 = vec_add(had_diff01, had_diff23);           \
    d2 = vec_sub(had_sum01,  had_sum23);            \
    d3 = vec_sub(had_diff01, had_diff23);           \
}
/* Declares three variables for row `num` in the enclosing scope:
 *   pix8_<num>    - the vector fetched by vec_ld at byte offset stride*num from p
 *   pix16_s<num>  - that vector permuted together with zero_u8v through `perm`,
 *                   viewed as eight signed 16-bit lanes
 *   pix16_d<num>  - left uninitialised; used by the caller as a transform output
 * `stride`, `perm` and zero_u8v are taken from the enclosing scope, and `num`
 * must be a single token because it is pasted into the variable names.
 * vec_perm on vec_u8_t operands yields a vec_u8_t, so the reinterpretation as
 * vec_s16_t is spelled out with a cast instead of relying on the compiler
 * accepting implicit conversions between different vector types. */
#define VEC_LOAD_HIGH( p, num ) \
vec_u8_t pix8_##num = vec_ld( stride*num, p ); \
vec_s16_t pix16_s##num = (vec_s16_t)vec_perm(pix8_##num, zero_u8v, perm); \
vec_s16_t pix16_d##num;
static uint64_t pixel_hadamard_ac_altivec( uint8_t *pix, int stride, const vec_u8_t perm )
{
DECLARE_ALIGNED_16( int32_t sum4_tab[4] );
DECLARE_ALIGNED_16( int32_t sum8_tab[4] );
LOAD_ZERO;
VEC_LOAD_HIGH( pix, 0 );
VEC_LOAD_HIGH( pix, 1 );
VEC_LOAD_HIGH( pix, 2 );
VEC_LOAD_HIGH( pix, 3 );
HADAMARD4_ALTIVEC(pix16_d0,pix16_d1,pix16_d2,pix16_d3,
pix16_s0,pix16_s1,pix16_s2,pix16_s3);
VEC_LOAD_HIGH( pix, 4 );
VEC_LOAD_HIGH( pix, 5 );
VEC_LOAD_HIGH( pix, 6 );
VEC_LOAD_HIGH( pix, 7 );
HADAMARD4_ALTIVEC(pix16_d4,pix16_d5,pix16_d6,pix16_d7,
pix16_s4,pix16_s5,pix16_s6,pix16_s7);
VEC_TRANSPOSE_8(pix16_d0, pix16_d1, pix16_d2, pix16_d3,
pix16_d4, pix16_d5, pix16_d6, pix16_d7,
pix16_s0, pix16_s1, pix16_s2, pix16_s3,
pix16_s4, pix16_s5, pix16_s6, pix16_s7);
HADAMARD4_ALTIVEC(pix16_d0,pix16_d1,pix16_d2,pix16_d3,
pix16_s0,pix16_s1,pix16_s2,pix16_s3);
HADAMARD4_ALTIVEC(pix16_d4,pix16_d5,pix16_d6,pix16_d7,
pix16_s4,pix16_s5,pix16_s6,pix16_s7);
vec_u16_t addabs01 = vec_add( VEC_ABSOLUTE(pix16_d0), VEC_ABSOLUTE(pix16_d1) );
vec_u16_t addabs23 = vec_add( VEC_ABSOLUTE(pix16_d2), VEC_ABSOLUTE(pix16_d3) );
vec_u16_t addabs45 = vec_add( VEC_ABSOLUTE(pix16_d4), VEC_ABSOLUTE(pix16_d5) );
vec_u16_t addabs67 = vec_add( VEC_ABSOLUTE(pix16_d6), VEC_ABSOLUTE(pix16_d7) );
vec_u16_t sum4_v = vec_add(vec_add(addabs01, addabs23), vec_add(addabs45, addabs67));
vec_ste(vec_sums(vec_sum4s(sum4_v, zero_s32v), zero_s32v), 12, sum4_tab);
vec_s16_t tmpi0 = vec_add(pix16_d0, pix16_d4);
vec_s16_t tmpi4 = vec_sub(pix16_d0, pix16_d4);
vec_s16_t tmpi1 = vec_add(pix16_d1, pix16_d5);
vec_s16_t tmpi5 = vec_sub(pix16_d1, pix16_d5);
vec_s16_t tmpi2 = vec_add(pix16_d2, pix16_d6);
vec_s16_t tmpi6 = vec_sub(pix16_d2, pix16_d6);
vec_s16_t tmpi3 = vec_add(pix16_d3, pix16_d7);
vec_s16_t tmpi7 = vec_sub(pix16_d3, pix16_d7);
int sum4 = sum4_tab[3];
VEC_TRANSPOSE_8(tmpi0, tmpi1, tmpi2, tmpi3,
tmpi4, tmpi5, tmpi6, tmpi7,
pix16_d0, pix16_d1, pix16_d2, pix16_d3,
pix16_d4, pix16_d5, pix16_d6, pix16_d7);
vec_u16_t addsum04 = vec_add( VEC_ABSOLUTE( vec_add(pix16_d0, pix16_d4) ),
VEC_ABSOLUTE( vec_sub(pix16_d0, pix16_d4) ) );
vec_u16_t addsum15 = vec_add( VEC_ABSOLUTE( vec_add(pix16_d1, pix16_d5) ),
VEC_ABSOLUTE( vec_sub(pix16_d1, pix16_d5) ) );
vec_u16_t addsum26 = vec_add( VEC_ABSOLUTE( vec_add(pix16_d2, pix16_d6) ),
VEC_ABSOLUTE( vec_sub(pix16_d2, pix16_d6) ) );
vec_u16_t addsum37 = vec_add( VEC_ABSOLUTE( vec_add(pix16_d3, pix16_d7) ),
VEC_ABSOLUTE( vec_sub(pix16_d3, pix16_d7) ) );
vec_u16_t sum8_v = vec_add( vec_add(addsum04, addsum15), vec_add(addsum26, addsum37) );
vec_ste(vec_sums( vec_sum4s(sum8_v, zero_s32v), zero_s32v), 12, sum8_tab);
int sum8 = sum8_tab[3];
DECLARE_ALIGNED_16( int16_t tmp0_4_tab[8] );
vec_ste(vec_add(pix16_d0, pix16_d4), 0, tmp0_4_tab);
sum4 -= tmp0_4_tab[0];
sum8 -= tmp0_4_tab[0];
return ((uint64_t)sum8<<32) + sum4;
}
/*
 * Permute vectors used by VEC_LOAD_HIGH to widen eight pixels to 16 bits.
 * Each byte pair is (0x1n, 0x0m): 0x1n indexes the second vec_perm operand
 * (zero_u8v in VEC_LOAD_HIGH) and 0x0m indexes the first (the loaded pixels).
 *   [0] takes pixel bytes 0x00..0x07   (original note: pix = mod16)
 *   [1] takes pixel bytes 0x08..0x0F   (original note: pix = mod8)
 */
static const vec_u8_t hadamard_permtab[] = {
    CV( 0x10,0x00, 0x11,0x01, 0x12,0x02, 0x13,0x03,
        0x14,0x04, 0x15,0x05, 0x16,0x06, 0x17,0x07 ),

    CV( 0x18,0x08, 0x19,0x09, 0x1A,0x0A, 0x1B,0x0B,
        0x1C,0x0C, 0x1D,0x0D, 0x1E,0x0E, 0x1F,0x0F )
};
/*
 * 16x16 block: adds up the packed results of the four 8x8 sub-blocks
 * (top-left, top-right, bottom-left, bottom-right), then rescales them:
 * the upper 32-bit field is shifted right by 2 and the lower one by 1.
 */
static uint64_t x264_pixel_hadamard_ac_16x16_altivec( uint8_t *pix, int stride )
{
    /* Bit 3 of the address picks the permute vector for the left column;
     * the right column, 8 bytes further on, uses the other one. */
    const int half = (int)( ( (uintptr_t)pix >> 3 ) & 1 );
    vec_u8_t perm_left  = hadamard_permtab[half];
    vec_u8_t perm_right = hadamard_permtab[half ^ 1];
    uint8_t *lower = pix + 8*stride;

    uint64_t acc = pixel_hadamard_ac_altivec( pix, stride, perm_left );
    acc += pixel_hadamard_ac_altivec( pix + 8, stride, perm_right );
    acc += pixel_hadamard_ac_altivec( lower, stride, perm_left );
    acc += pixel_hadamard_ac_altivec( lower + 8, stride, perm_right );

    const uint64_t upper = ( acc >> 34 ) << 32;
    const uint32_t lower_sum = (uint32_t)acc >> 1;
    return upper + lower_sum;
}
/*
 * 16x8 block: adds up the packed results of the left and right 8x8
 * sub-blocks, then shifts the upper 32-bit field right by 2 and the lower
 * one right by 1.
 */
static uint64_t x264_pixel_hadamard_ac_16x8_altivec( uint8_t *pix, int stride )
{
    /* Bit 3 of the address picks the permute vector for the left half;
     * the right half, 8 bytes further on, uses the other one. */
    const int half = (int)( ( (uintptr_t)pix >> 3 ) & 1 );
    vec_u8_t perm_left  = hadamard_permtab[half];
    vec_u8_t perm_right = hadamard_permtab[half ^ 1];

    uint64_t acc = pixel_hadamard_ac_altivec( pix, stride, perm_left );
    acc += pixel_hadamard_ac_altivec( pix + 8, stride, perm_right );

    const uint64_t upper = ( acc >> 34 ) << 32;
    const uint32_t lower_sum = (uint32_t)acc >> 1;
    return upper + lower_sum;
}
/*
 * 8x16 block: adds up the packed results of the upper and lower 8x8
 * sub-blocks (same permute vector for both), then shifts the upper 32-bit
 * field right by 2 and the lower one right by 1.
 */
static uint64_t x264_pixel_hadamard_ac_8x16_altivec( uint8_t *pix, int stride )
{
    /* Bit 3 of the address selects the permute vector. */
    const int half = (int)( ( (uintptr_t)pix >> 3 ) & 1 );
    vec_u8_t sel = hadamard_permtab[half];

    uint64_t acc = pixel_hadamard_ac_altivec( pix, stride, sel );
    acc += pixel_hadamard_ac_altivec( pix + 8*stride, stride, sel );

    const uint64_t upper = ( acc >> 34 ) << 32;
    const uint32_t lower_sum = (uint32_t)acc >> 1;
    return upper + lower_sum;
}
/*
 * 8x8 block: a single call to the 8x8 kernel, after which the upper 32-bit
 * field of the packed result is shifted right by 2 and the lower one by 1.
 */
static uint64_t x264_pixel_hadamard_ac_8x8_altivec( uint8_t *pix, int stride )
{
    /* Bit 3 of the address selects the permute vector. */
    const int half = (int)( ( (uintptr_t)pix >> 3 ) & 1 );
    const uint64_t acc = pixel_hadamard_ac_altivec( pix, stride, hadamard_permtab[half] );

    const uint64_t upper = ( acc >> 34 ) << 32;
    const uint32_t lower_sum = (uint32_t)acc >> 1;
    return upper + lower_sum;
}
/****************************************************************************
* structural similarity metric
****************************************************************************/
......@@ -1932,5 +2072,10 @@ void x264_pixel_altivec_init( x264_pixel_function_t *pixf )
pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_altivec;
pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_altivec;
pixf->hadamard_ac[PIXEL_16x16] = x264_pixel_hadamard_ac_16x16_altivec;
pixf->hadamard_ac[PIXEL_16x8] = x264_pixel_hadamard_ac_16x8_altivec;
pixf->hadamard_ac[PIXEL_8x16] = x264_pixel_hadamard_ac_8x16_altivec;
pixf->hadamard_ac[PIXEL_8x8] = x264_pixel_hadamard_ac_8x8_altivec;
pixf->ssim_4x4x2_core = ssim_4x4x2_core_altivec;
}
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment