Commit a8650641 authored by Guillaume Poirier

Add AltiVec implementations of add4x4_idct, add8x8_idct and add16x16_idct; 3.2x faster on average

1.05x faster overall with default encoding options
Patch by Noboru Asai % noboru DD asai AA gmail DD com %


git-svn-id: svn://svn.videolan.org/x264/trunk@685 df754926-b1dd-0310-bc7b-ec298dee348c
parent 3b6b4c41
@@ -440,6 +440,10 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
        dctf->sub8x8_dct    = x264_sub8x8_dct_altivec;
        dctf->sub16x16_dct  = x264_sub16x16_dct_altivec;

        dctf->add4x4_idct   = x264_add4x4_idct_altivec;
        dctf->add8x8_idct   = x264_add8x8_idct_altivec;
        dctf->add16x16_idct = x264_add16x16_idct_altivec;

        dctf->sub8x8_dct8   = x264_sub8x8_dct8_altivec;
        dctf->sub16x16_dct8 = x264_sub16x16_dct8_altivec;
......
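These assignments sit inside x264_dct_init()'s AltiVec branch, next to the existing sub*_dct entries; the three add*_idct pointers are what this commit adds. A minimal sketch of the surrounding dispatch, with the guard written from memory of the x264 tree rather than shown in this hunk:

#ifdef ARCH_PPC
    if( cpu&X264_CPU_ALTIVEC )
    {
        /* existing sub*_dct entries ... */
        dctf->add4x4_idct   = x264_add4x4_idct_altivec;
        dctf->add8x8_idct   = x264_add8x8_idct_altivec;
        dctf->add16x16_idct = x264_add16x16_idct_altivec;
    }
#endif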
@@ -233,6 +233,99 @@ void x264_sub16x16_dct8_altivec( int16_t dct[4][8][8], uint8_t *pix1, uint8_t *p
 * IDCT transform:
 ****************************************************************************/
#define IDCT_1D_ALTIVEC(s0, s1, s2, s3, d0, d1, d2, d3) \
{ \
    /* a0 = SRC(0) + SRC(2); */ \
    vec_s16_t a0v = vec_add(s0, s2); \
    /* a1 = SRC(0) - SRC(2); */ \
    vec_s16_t a1v = vec_sub(s0, s2); \
    /* a2 = (SRC(1)>>1) - SRC(3); */ \
    vec_s16_t a2v = vec_sub(vec_sra(s1, onev), s3); \
    /* a3 = (SRC(3)>>1) + SRC(1); */ \
    vec_s16_t a3v = vec_add(vec_sra(s3, onev), s1); \
    /* DST(0, a0 + a3); */ \
    d0 = vec_add(a0v, a3v); \
    /* DST(1, a1 + a2); */ \
    d1 = vec_add(a1v, a2v); \
    /* DST(2, a1 - a2); */ \
    d2 = vec_sub(a1v, a2v); \
    /* DST(3, a0 - a3); */ \
    d3 = vec_sub(a0v, a3v); \
}
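The macro above applies the standard H.264 4-point inverse-transform butterfly to whole vectors of int16 lanes at once (onev is the shift count declared by the caller). For reference, a scalar sketch of the same butterfly on a single row or column; illustrative only, not part of the patch:

static void idct4_1d_ref( const int16_t src[4], int16_t dst[4] )
{
    int a0 = src[0] + src[2];
    int a1 = src[0] - src[2];
    int a2 = (src[1] >> 1) - src[3];
    int a3 = (src[3] >> 1) + src[1];
    dst[0] = a0 + a3;
    dst[1] = a1 + a2;
    dst[2] = a1 - a2;
    dst[3] = a0 - a3;
}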
#define VEC_LOAD_U8_ADD_S16_STORE_U8(va) \
    vdst_orig = vec_ld(0, dst); \
    vdst = vec_perm(vdst_orig, zero_u8v, vdst_mask); \
    vdst_ss = (vec_s16_t)vec_mergeh(zero_u8v, vdst); \
    va = vec_add(va, vdst_ss); \
    va_u8 = vec_s16_to_u8(va); \
    va_u32 = vec_splat((vec_u32_t)va_u8, 0); \
    vec_ste(va_u32, element, (uint32_t*)dst);
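VEC_LOAD_U8_ADD_S16_STORE_U8 adds the 16-bit residual lanes in va to four pixels at dst with unsigned saturation, but it deliberately leaves its temporaries (vdst_orig, vdst, vdst_mask, vdst_ss, va_u8, va_u32, element) and the zero vectors to be declared at the expansion site. A hypothetical, minimal expansion context, assuming dst is at least 4-byte aligned; the wrapper name is illustrative, not part of the patch:

static void add4_residual_example( uint8_t *dst, vec_s16_t va )
{
    LOAD_ZERO;                                      /* zero_u8v etc. from the vector helpers */
    vec_u8_t  vdst_orig, vdst, va_u8;
    vec_s16_t vdst_ss;
    vec_u32_t va_u32;
    vec_u8_t  vdst_mask = vec_lvsl( 0, dst );       /* permute mask for the unaligned load   */
    int element = ((unsigned long)dst & 0xf) >> 2;  /* which 32-bit word vec_ste writes      */
    /* effect: dst[i] = clip_uint8( dst[i] + va[i] ) for i = 0..3 */
    VEC_LOAD_U8_ADD_S16_STORE_U8( va );
}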
#define ALTIVEC_STORE4_SUM_CLIP(dest, idctv, perm_ldv) \
{ \
    /* unaligned load */ \
    vec_u8_t lv = vec_ld(0, dest); \
    vec_u8_t dstv = vec_perm(lv, zero_u8v, (vec_u8_t)perm_ldv); \
    vec_s16_t idct_sh6 = vec_sra(idctv, sixv); \
    vec_u16_t dst16 = (vec_u16_t)vec_mergeh(zero_u8v, dstv); \
    vec_s16_t idstsum = vec_adds(idct_sh6, (vec_s16_t)dst16); \
    vec_u8_t idstsum8 = vec_s16_to_u8(idstsum); \
    /* unaligned store */ \
    vec_u32_t bodyv = vec_splat((vec_u32_t)idstsum8, 0); \
    int element = ((unsigned long)dest & 0xf) >> 2; \
    vec_ste(bodyv, element, (uint32_t *)dest); \
}
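ALTIVEC_STORE4_SUM_CLIP reconstructs one row of four pixels: it shifts the transform output right by 6 (the +32 added to dct[0][0] before the transform reaches every sample, so the shift rounds instead of truncating), adds the current destination bytes, saturates to 8 bits, and writes exactly one 32-bit word back via vec_ste. A scalar sketch of the per-row effect, assuming dest is 4-byte aligned; illustrative only:

static void store4_sum_clip_ref( uint8_t *dest, const int16_t idct_row[4] )
{
    for( int i = 0; i < 4; i++ )
    {
        int v = dest[i] + (idct_row[i] >> 6);    /* descale and add the prediction */
        dest[i] = v < 0 ? 0 : v > 255 ? 255 : v; /* saturate to [0,255]            */
    }
}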
void x264_add4x4_idct_altivec( uint8_t *dst, int16_t dct[4][4] )
{
    vec_u16_t onev = vec_splat_u16(1);

    dct[0][0] += 32; // rounding for the >>6 at the end

    vec_s16_t s0, s1, s2, s3;
    s0 = vec_ld( 0x00, (int16_t*)dct );
    s1 = vec_sld( s0, s0, 8 );
    s2 = vec_ld( 0x10, (int16_t*)dct );
    s3 = vec_sld( s2, s2, 8 );

    vec_s16_t d0, d1, d2, d3;
    IDCT_1D_ALTIVEC( s0, s1, s2, s3, d0, d1, d2, d3 );

    vec_s16_t tr0, tr1, tr2, tr3;
    VEC_TRANSPOSE_4( d0, d1, d2, d3, tr0, tr1, tr2, tr3 );

    vec_s16_t idct0, idct1, idct2, idct3;
    IDCT_1D_ALTIVEC( tr0, tr1, tr2, tr3, idct0, idct1, idct2, idct3 );

    vec_u8_t perm_ldv = vec_lvsl( 0, dst );
    vec_u16_t sixv = vec_splat_u16(6);
    LOAD_ZERO;

    ALTIVEC_STORE4_SUM_CLIP( &dst[0*FDEC_STRIDE], idct0, perm_ldv );
    ALTIVEC_STORE4_SUM_CLIP( &dst[1*FDEC_STRIDE], idct1, perm_ldv );
    ALTIVEC_STORE4_SUM_CLIP( &dst[2*FDEC_STRIDE], idct2, perm_ldv );
    ALTIVEC_STORE4_SUM_CLIP( &dst[3*FDEC_STRIDE], idct3, perm_ldv );
}

void x264_add8x8_idct_altivec( uint8_t *p_dst, int16_t dct[4][4][4] )
{
    x264_add4x4_idct_altivec( &p_dst[0],               dct[0] );
    x264_add4x4_idct_altivec( &p_dst[4],               dct[1] );
    x264_add4x4_idct_altivec( &p_dst[4*FDEC_STRIDE+0], dct[2] );
    x264_add4x4_idct_altivec( &p_dst[4*FDEC_STRIDE+4], dct[3] );
}

void x264_add16x16_idct_altivec( uint8_t *p_dst, int16_t dct[16][4][4] )
{
    x264_add8x8_idct_altivec( &p_dst[0],               &dct[0] );
    x264_add8x8_idct_altivec( &p_dst[8],               &dct[4] );
    x264_add8x8_idct_altivec( &p_dst[8*FDEC_STRIDE+0], &dct[8] );
    x264_add8x8_idct_altivec( &p_dst[8*FDEC_STRIDE+8], &dct[12] );
}
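x264_add8x8_idct_altivec and x264_add16x16_idct_altivec only fan the work out over 4x4 blocks: the 16 blocks of dct[16][4][4] are grouped four per 8x8 quadrant, with quadrants and blocks both scanned left-to-right, top-to-bottom. A flattened sketch of the same destination arithmetic (illustrative only, not part of the patch):

static void add16x16_idct_ref_order( uint8_t *p_dst, int16_t dct[16][4][4] )
{
    for( int i = 0; i < 16; i++ )
    {
        int q = i >> 2;                      /* 8x8 quadrant, 0..3        */
        int b = i & 3;                       /* 4x4 block inside it, 0..3 */
        int x = (q & 1) * 8 + (b & 1) * 4;   /* horizontal pixel offset   */
        int y = (q >> 1) * 8 + (b >> 1) * 4; /* vertical pixel offset     */
        x264_add4x4_idct_altivec( &p_dst[y * FDEC_STRIDE + x], dct[i] );
    }
}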
#define IDCT8_1D_ALTIVEC(s0, s1, s2, s3, s4, s5, s6, s7, d0, d1, d2, d3, d4, d5, d6, d7)\
{ \
    /* a0 = SRC(0) + SRC(4); */ \
@@ -362,4 +455,3 @@ void x264_add16x16_idct8_altivec( uint8_t *dst, int16_t dct[4][8][8] )
    x264_add8x8_idct8_altivec( &dst[8*FDEC_STRIDE+0], dct[2] );
    x264_add8x8_idct8_altivec( &dst[8*FDEC_STRIDE+8], dct[3] );
}
@@ -32,6 +32,10 @@ void x264_sub8x8_dct_altivec( int16_t dct[4][4][4],
void x264_sub16x16_dct_altivec( int16_t dct[16][4][4],
                                uint8_t *pix1, uint8_t *pix2 );
void x264_add4x4_idct_altivec( uint8_t *p_dst, int16_t dct[4][4] );
void x264_add8x8_idct_altivec( uint8_t *p_dst, int16_t dct[4][4][4] );
void x264_add16x16_idct_altivec( uint8_t *p_dst, int16_t dct[16][4][4] );
void x264_sub8x8_dct8_altivec( int16_t dct[8][8],
                               uint8_t *pix1, uint8_t *pix2 );
void x264_sub16x16_dct8_altivec( int16_t dct[4][8][8],
......
@@ -64,7 +64,7 @@
#define vec_u8_to_s16(v) vec_u8_to_s16_h(v)
#define vec_u16_to_u8(v) vec_pack( v, zero_u16v )
#define vec_s16_to_u8(v) vec_pack( v, zero_u16v )
#define vec_s16_to_u8(v) vec_packsu( v, zero_s16v )
/***********************************************************************
* PREP_LOAD: declares two vectors required to perform unaligned loads
......
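The final hunk is the one-line helper change the new store macros depend on for clipping: per its -64,7 +64,7 header, the vec_pack definition of vec_s16_to_u8 is the old line and the vec_packsu one replaces it. vec_pack keeps only the low byte of each signed 16-bit lane, so residual-plus-prediction sums outside 0..255 would wrap; vec_packsu saturates them into range, which is what reconstruction needs. In scalar terms (illustrative only):

/* old behaviour: keep the low byte, wraps   ( -1 -> 255, 300 -> 44  ) */
static uint8_t pack_truncate( int16_t v ) { return (uint8_t)v; }

/* new behaviour: saturate to 8 bits         ( -1 -> 0,   300 -> 255 ) */
static uint8_t pack_saturate( int16_t v ) { return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v; }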