Commit 3b6b4c41 authored by Guillaume Poirier

Add AltiVec implementations of dequant_4x4 and dequant_8x8: 2.8x faster than C, and 1.01x faster than the previous revision with default encoding options.
Patch by Noboru Asai % noboru DD asai AA gmail DD com %


git-svn-id: svn://svn.videolan.org/x264/trunk@684 df754926-b1dd-0310-bc7b-ec298dee348c
parent 09334c1a
......@@ -237,3 +237,136 @@ void x264_quant_8x8_altivec( int16_t dct[8][8], uint16_t mf[64], uint16_t bias[6
}
}
/*
 * DEQUANT_SHL: dequantize one 16-byte vector of coefficients starting at
 * dct[y], for the case where the scaling shift is non-negative
 * (coefficient * factor, then shift left).
 *
 * This macro is not self-contained. It expands inside a scope that must
 * provide: dct, dequant_mf, i_mf, y, the scratch vectors dctv, mf1v, mf2v,
 * mfv, multEvenvA, multOddvA, and i_qbitsv (per-lane left-shift counts for
 * 16-bit lanes).
 *
 * Steps:
 *  1. Load the 16-bit coefficients at dct[y] and the two consecutive vectors
 *     of 32-bit factors at dequant_mf[i_mf][y] (byte offsets 0 and 16).
 *  2. vec_packs narrows the 32-bit factors to 16 bits (saturating) so they
 *     line up lane-for-lane with the coefficients.
 *  3. vec_mule / vec_mulo form the 32-bit products of the even and odd lanes;
 *     vec_mergeh / vec_mergel re-interleave them into the original lane
 *     order and vec_packs narrows the products back to 16 bits (saturating).
 *  4. Shift each lane left by i_qbitsv and store the result back to dct[y].
 *
 * NOTE(review): vec_ld / vec_st access 16-byte-aligned addresses; presumably
 * callers guarantee that dct and dequant_mf are 16-byte aligned -- confirm.
 *
 * The expansion is a brace block; call sites follow it with ';'.
 */
#define DEQUANT_SHL() \
{ \
dctv = vec_ld(0, dct[y]); \
mf1v = vec_ld(0, dequant_mf[i_mf][y]); \
mf2v = vec_ld(16, dequant_mf[i_mf][y]); \
mfv = vec_packs(mf1v, mf2v); \
\
multEvenvA = vec_mule(dctv, mfv); \
multOddvA = vec_mulo(dctv, mfv); \
dctv = (vec_s16_t) vec_packs(vec_mergeh(multEvenvA, multOddvA), \
vec_mergel(multEvenvA, multOddvA)); \
dctv = vec_sl(dctv, i_qbitsv); \
vec_st(dctv, 0, dct[y]); \
}
/*
 * DEQUANT_SHR: dequantize one 16-byte vector of coefficients starting at
 * dct[y], for the case where the scaling shift is negative
 * ((coefficient * factor + rounding) >> shift).
 *
 * This macro is not self-contained. It expands inside a scope that must
 * provide: dct, dequant_mf, i_mf, y, the scratch vectors dctv, dct1v, dct2v,
 * mf1v, mf2v, multEvenvA, multOddvA, temp1v, temp2v, and the constants
 * fv (rounding term in every 32-bit lane), i_qbitsv (right-shift count in
 * every 32-bit lane) and sixteenv (the value 16 in every 32-bit lane).
 *
 * Steps, performed once for each half of the coefficient vector:
 *  1. vec_mergeh / vec_mergel of dctv with itself duplicate every coefficient
 *     into two adjacent 16-bit lanes, so each coefficient is paired with
 *     both 16-bit halves of the matching 32-bit factor.
 *  2. The factor vector is reinterpreted as 16-bit lanes. vec_mule multiplies
 *     the coefficient by the even halves, vec_mulo by the odd halves.
 *  3. (even << 16) + odd recombines the two partial products into a 32-bit
 *     product of coefficient and factor.
 *  4. Add the rounding term fv and shift right arithmetically by i_qbitsv.
 * Finally both 32-bit halves are narrowed to 16 bits with vec_packs
 * (saturating) and stored back to dct[y].
 *
 * NOTE(review): step 3 treats the even lane as the high half of each factor
 * (big-endian lane order), and the odd-lane multiply is signed, so the low
 * half of each factor presumably must be below 0x8000 -- confirm the range
 * of dequant_mf entries.
 * NOTE(review): vec_ld / vec_st access 16-byte-aligned addresses; presumably
 * callers guarantee that dct and dequant_mf are 16-byte aligned -- confirm.
 *
 * The expansion is a brace block; call sites follow it with ';'.
 */
#define DEQUANT_SHR() \
{ \
dctv = vec_ld(0, dct[y]); \
dct1v = vec_mergeh(dctv, dctv); \
dct2v = vec_mergel(dctv, dctv); \
mf1v = vec_ld(0, dequant_mf[i_mf][y]); \
mf2v = vec_ld(16, dequant_mf[i_mf][y]); \
\
multEvenvA = vec_mule(dct1v, (vec_s16_t)mf1v); \
multOddvA = vec_mulo(dct1v, (vec_s16_t)mf1v); \
temp1v = vec_add(vec_sl(multEvenvA, sixteenv), multOddvA); \
temp1v = vec_add(temp1v, fv); \
temp1v = vec_sra(temp1v, i_qbitsv); \
\
multEvenvA = vec_mule(dct2v, (vec_s16_t)mf2v); \
multOddvA = vec_mulo(dct2v, (vec_s16_t)mf2v); \
temp2v = vec_add(vec_sl(multEvenvA, sixteenv), multOddvA); \
temp2v = vec_add(temp2v, fv); \
temp2v = vec_sra(temp2v, i_qbitsv); \
\
dctv = (vec_s16_t)vec_packs(temp1v, temp2v); \
vec_st(dctv, 0, dct[y]); \
}
/*
 * Dequantize a 4x4 block of coefficients in place with AltiVec.
 *
 *   dct         coefficients, overwritten with the scaled values
 *   dequant_mf  scaling factors; the table at index i_qp % 6 is used
 *   i_qp        quantizer; i_qp / 6 - 4 is the shift applied after scaling
 *
 * A non-negative shift goes through DEQUANT_SHL (multiply, shift left);
 * a negative one goes through DEQUANT_SHR (multiply, round, shift right).
 *
 * The locals i_mf, y, dctv, dct1v, dct2v, mf1v, mf2v, mfv, multEvenvA,
 * multOddvA, temp1v, temp2v, fv, sixteenv and i_qbitsv are referenced by
 * name from the DEQUANT_* macros and must keep these names.
 */
void x264_dequant_4x4_altivec( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp )
{
    const int i_qbits = i_qp/6 - 4;
    const int i_mf = i_qp%6;
    int y;

    /* Scratch vectors used by both macros. */
    vec_s16_t dctv;
    vec_s32_t mf1v, mf2v;
    vec_s32_t multEvenvA, multOddvA;

    if( i_qbits < 0 )
    {
        /* Right-shift path: (coef * mf + rounding) >> -i_qbits. */
        const int rounding = 1 << (-i_qbits-1);
        vec_s16_t dct1v, dct2v;
        vec_s32_t temp1v, temp2v;
        vec_s32_t fv;
        vec_u32_t i_qbitsv;
        vec_u32_t sixteenv;
        vect_int_u round_lanes;
        vect_int_u shift_lanes;
        vect_int_u sixteen_lanes;

        /* Only lane 0 is written; vec_splat copies it to every lane. */
        round_lanes.s[0] = rounding;
        shift_lanes.s[0] = -i_qbits;
        sixteen_lanes.s[0] = 16;

        fv = (vec_s32_t)vec_splat(round_lanes.v, 0);
        i_qbitsv = vec_splat(shift_lanes.v, 0);
        sixteenv = vec_splat(sixteen_lanes.v, 0);

        /* A 16-byte vector at dct[y] spans two 4-coefficient rows, hence y += 2. */
        for( y = 0; y < 4; y += 2 )
        {
            DEQUANT_SHR();
        }
        return;
    }

    /* Left-shift path: (coef * mf) << i_qbits. */
    {
        vec_s16_t mfv;
        vec_u16_t i_qbitsv;
        vect_ushort_u shift_lanes;

        /* Only lane 0 is written; vec_splat copies it to every lane. */
        shift_lanes.s[0] = i_qbits;
        i_qbitsv = vec_splat(shift_lanes.v, 0);

        /* A 16-byte vector at dct[y] spans two 4-coefficient rows, hence y += 2. */
        for( y = 0; y < 4; y += 2 )
        {
            DEQUANT_SHL();
        }
    }
}
/*
 * Dequantize an 8x8 block of coefficients in place with AltiVec.
 *
 *   dct         coefficients, overwritten with the scaled values
 *   dequant_mf  scaling factors; the table at index i_qp % 6 is used
 *   i_qp        quantizer; i_qp / 6 - 6 is the shift applied after scaling
 *
 * A non-negative shift goes through DEQUANT_SHL (multiply, shift left);
 * a negative one goes through DEQUANT_SHR (multiply, round, shift right).
 *
 * The locals i_mf, y, dctv, dct1v, dct2v, mf1v, mf2v, mfv, multEvenvA,
 * multOddvA, temp1v, temp2v, fv, sixteenv and i_qbitsv are referenced by
 * name from the DEQUANT_* macros and must keep these names.
 */
void x264_dequant_8x8_altivec( int16_t dct[8][8], int dequant_mf[6][8][8], int i_qp )
{
    const int i_qbits = i_qp/6 - 6;
    const int i_mf = i_qp%6;
    int y;

    /* Scratch vectors used by both macros. */
    vec_s16_t dctv;
    vec_s32_t mf1v, mf2v;
    vec_s32_t multEvenvA, multOddvA;

    if( i_qbits < 0 )
    {
        /* Right-shift path: (coef * mf + rounding) >> -i_qbits. */
        const int rounding = 1 << (-i_qbits-1);
        vec_s16_t dct1v, dct2v;
        vec_s32_t temp1v, temp2v;
        vec_s32_t fv;
        vec_u32_t i_qbitsv;
        vec_u32_t sixteenv;
        vect_int_u round_lanes;
        vect_int_u shift_lanes;
        vect_int_u sixteen_lanes;

        /* Only lane 0 is written; vec_splat copies it to every lane. */
        round_lanes.s[0] = rounding;
        shift_lanes.s[0] = -i_qbits;
        sixteen_lanes.s[0] = 16;

        fv = (vec_s32_t)vec_splat(round_lanes.v, 0);
        i_qbitsv = vec_splat(shift_lanes.v, 0);
        sixteenv = vec_splat(sixteen_lanes.v, 0);

        /* One 8-coefficient row fills a 16-byte vector, so rows go one at a time. */
        for( y = 0; y < 8; y++ )
        {
            DEQUANT_SHR();
        }
        return;
    }

    /* Left-shift path: (coef * mf) << i_qbits. */
    {
        vec_s16_t mfv;
        vec_u16_t i_qbitsv;
        vect_ushort_u shift_lanes;

        /* Only lane 0 is written; vec_splat copies it to every lane. */
        shift_lanes.s[0] = i_qbits;
        i_qbitsv = vec_splat(shift_lanes.v, 0);

        /* One 8-coefficient row fills a 16-byte vector, so rows go one at a time. */
        for( y = 0; y < 8; y++ )
        {
            DEQUANT_SHL();
        }
    }
}
......@@ -26,4 +26,7 @@ void x264_quant_8x8_altivec( int16_t dct[8][8], uint16_t mf[64], uint16_t bias[6
void x264_quant_4x4_dc_altivec( int16_t dct[4][4], int mf, int bias );
void x264_quant_2x2_dc_altivec( int16_t dct[2][2], int mf, int bias );
/* In-place AltiVec dequantization of a 4x4 coefficient block: uses the factor
 * table dequant_mf[i_qp % 6] and a shift derived from i_qp / 6. */
void x264_dequant_4x4_altivec( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp );
/* In-place AltiVec dequantization of an 8x8 coefficient block: uses the factor
 * table dequant_mf[i_qp % 6] and a shift derived from i_qp / 6. */
void x264_dequant_8x8_altivec( int16_t dct[8][8], int dequant_mf[6][8][8], int i_qp );
#endif
......@@ -245,6 +245,9 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
pf->quant_4x4_dc = x264_quant_4x4_dc_altivec;
pf->quant_4x4 = x264_quant_4x4_altivec;
pf->quant_8x8 = x264_quant_8x8_altivec;
pf->dequant_4x4 = x264_dequant_4x4_altivec;
pf->dequant_8x8 = x264_dequant_8x8_altivec;
}
#endif
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment