Commit 09334c1a authored by Guillaume Poirier

Add AltiVec implementation of quant_2x2_dc,

fix AltiVec implementation of quant_(4x4|8x8)(|_dc) wrt the current C implementation.
Patch by Noboru Asai % noboru DD asai AA gmail DD com %


git-svn-id: svn://svn.videolan.org/x264/trunk@683 df754926-b1dd-0310-bc7b-ec298dee348c
parent 57461cb4
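
For reference, every routine in this patch implements the same per-coefficient operation: quantize the magnitude with a scale factor (mf) and rounding bias, shift right by a fixed 16, then restore the sign. A minimal scalar sketch of that operation, derived from the vector code below (quant_one_scalar and its argument names are illustrative, not the exact C reference this patch tracks):

#include <stdint.h>
#include <stdlib.h>

/* One coefficient, mirroring the 4x4 vector path below: saturating add of
 * the rounding bias to the magnitude, 16-bit scale by mf, fixed shift of
 * 16, then sign restoration. (The vector code additionally saturates the
 * final pack down to int16.) */
static void quant_one_scalar( int16_t *coef, uint16_t mf, uint16_t bias )
{
    uint32_t mag = (uint32_t)abs( *coef ) + bias;
    if( mag > 0xFFFF )
        mag = 0xFFFF;                        /* vec_adds saturates at u16 max */
    int32_t level = (int32_t)( ( mag * mf ) >> 16 );
    *coef = ( *coef < 0 ) ? -level : level;
}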
--- a/common/ppc/quant.c
+++ b/common/ppc/quant.c
@@ -37,23 +37,23 @@ typedef union {
 #include "quant.h"
 // quant of a whole 4x4 block, unrolled 2x and "pre-scheduled"
-#define QUANT_16_U( dct0, dct1, quant_mf0, quant_mf1, quant_mf2, quant_mf3 ) \
-temp1v = vec_ld((dct0), *dct); \
-temp2v = vec_ld((dct1), *dct); \
-mfvA = (vec_u16_t) vec_packs((vec_u32_t)vec_ld((quant_mf0), *quant_mf), (vec_u32_t)vec_ld((quant_mf1), *quant_mf)); \
-mfvB = (vec_u16_t) vec_packs((vec_u32_t)vec_ld((quant_mf2), *quant_mf), (vec_u32_t)vec_ld((quant_mf3), *quant_mf)); \
+#define QUANT_16_U( idx0, idx1 ) \
+temp1v = vec_ld((idx0), *dct); \
+temp2v = vec_ld((idx1), *dct); \
+mfvA = vec_ld((idx0), mf); \
+mfvB = vec_ld((idx1), mf); \
+biasvA = vec_ld((idx0), bias); \
+biasvB = vec_ld((idx1), bias); \
 mskA = vec_cmplt(temp1v, zerov); \
 mskB = vec_cmplt(temp2v, zerov); \
 coefvA = (vec_u16_t)vec_max(vec_sub(zerov, temp1v), temp1v); \
 coefvB = (vec_u16_t)vec_max(vec_sub(zerov, temp2v), temp2v); \
+coefvA = vec_adds(coefvA, biasvA); \
+coefvB = vec_adds(coefvB, biasvB); \
 multEvenvA = vec_mule(coefvA, mfvA); \
 multOddvA = vec_mulo(coefvA, mfvA); \
 multEvenvB = vec_mule(coefvB, mfvB); \
 multOddvB = vec_mulo(coefvB, mfvB); \
-multEvenvA = vec_adds(multEvenvA, fV); \
-multOddvA = vec_adds(multOddvA, fV); \
-multEvenvB = vec_adds(multEvenvB, fV); \
-multOddvB = vec_adds(multOddvB, fV); \
 multEvenvA = vec_sr(multEvenvA, i_qbitsv); \
 multOddvA = vec_sr(multOddvA, i_qbitsv); \
 multEvenvB = vec_sr(multEvenvB, i_qbitsv); \
@@ -62,58 +62,53 @@
 temp1v = (vec_s16_t) vec_packs(vec_mergeh(multEvenvA, multOddvA), vec_mergel(multEvenvA, multOddvA)); \
 temp2v = (vec_s16_t) vec_packs(vec_mergeh(multEvenvB, multOddvB), vec_mergel(multEvenvB, multOddvB)); \
 temp1v = vec_xor(temp1v, mskA); \
 temp2v = vec_xor(temp2v, mskB); \
-temp1v = vec_adds(temp1v, vec_and(mskA, one)); \
-vec_st(temp1v, (dct0), (int16_t*)dct); \
-temp2v = vec_adds(temp2v, vec_and(mskB, one)); \
-vec_st(temp2v, (dct1), (int16_t*)dct);
+temp1v = vec_adds(temp1v, vec_and(mskA, one)); \
+vec_st(temp1v, (idx0), (int16_t*)dct); \
+temp2v = vec_adds(temp2v, vec_and(mskB, one)); \
+vec_st(temp2v, (idx1), (int16_t*)dct);
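/* Note (aside, not part of the patch): AltiVec has no full-precision 16-bit
 * multiply, so vec_mule/vec_mulo produce 32-bit products of the even- and
 * odd-numbered u16 lanes; vec_mergeh/vec_mergel re-interleave those products
 * back into lane order, and vec_packs saturates them down to eight 16-bit
 * results per vector. */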
-void x264_quant_4x4_altivec( int16_t dct[4][4], int quant_mf[4][4], int const i_qbits, int const f ) {
+void x264_quant_4x4_altivec( int16_t dct[4][4], uint16_t mf[16], uint16_t bias[16] )
+{
     vector bool short mskA;
     vec_u32_t i_qbitsv;
     vec_u16_t coefvA;
     vec_u32_t multEvenvA, multOddvA;
     vec_u16_t mfvA;
+    vec_u16_t biasvA;
     vec_s16_t zerov, one;
-    vec_u32_t fV;
     vector bool short mskB;
     vec_u16_t coefvB;
     vec_u32_t multEvenvB, multOddvB;
     vec_u16_t mfvB;
+    vec_u16_t biasvB;
     vec_s16_t temp1v, temp2v;
     vect_int_u qbits_u;
-    qbits_u.s[0]=i_qbits;
+    qbits_u.s[0]=16;
     i_qbitsv = vec_splat(qbits_u.v, 0);
-    vect_int_u f_u;
-    f_u.s[0]=f;
-    fV = vec_splat(f_u.v, 0);
     zerov = vec_splat_s16(0);
     one = vec_splat_s16(1);
-    QUANT_16_U( 0, 16, 0, 16, 32, 48 );
+    QUANT_16_U( 0, 16 );
 }
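/* Note (aside, not part of the patch): the mskA/mskB handling is a branchless
 * conditional negation. Where dct was negative the compare mask is all ones,
 * so the xor yields the one's complement and adding (mask & 1) completes the
 * two's-complement negation; where dct was non-negative both steps are no-ops.
 * A scalar sketch (restore_sign is a hypothetical helper for illustration): */
static inline int16_t restore_sign( int16_t level, int16_t mask )
{
    return ( level ^ mask ) + ( mask & 1 );   /* mask: 0 or -1 per lane */
}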
 // DC quant of a whole 4x4 block, unrolled 2x and "pre-scheduled"
-#define QUANT_16_U_DC( dct0, dct1 ) \
-temp1v = vec_ld((dct0), *dct); \
-temp2v = vec_ld((dct1), *dct); \
+#define QUANT_16_U_DC( idx0, idx1 ) \
+temp1v = vec_ld((idx0), *dct); \
+temp2v = vec_ld((idx1), *dct); \
 mskA = vec_cmplt(temp1v, zerov); \
 mskB = vec_cmplt(temp2v, zerov); \
 coefvA = (vec_u16_t) vec_max(vec_sub(zerov, temp1v), temp1v); \
 coefvB = (vec_u16_t) vec_max(vec_sub(zerov, temp2v), temp2v); \
+coefvA = vec_add(coefvA, biasv); \
+coefvB = vec_add(coefvB, biasv); \
 multEvenvA = vec_mule(coefvA, mfv); \
 multOddvA = vec_mulo(coefvA, mfv); \
 multEvenvB = vec_mule(coefvB, mfv); \
 multOddvB = vec_mulo(coefvB, mfv); \
-multEvenvA = vec_add(multEvenvA, fV); \
-multOddvA = vec_add(multOddvA, fV); \
-multEvenvB = vec_add(multEvenvB, fV); \
-multOddvB = vec_add(multOddvB, fV); \
 multEvenvA = vec_sr(multEvenvA, i_qbitsv); \
 multOddvA = vec_sr(multOddvA, i_qbitsv); \
 multEvenvB = vec_sr(multEvenvB, i_qbitsv); \
@@ -123,18 +118,17 @@
 temp2v = (vec_s16_t) vec_packs(vec_mergeh(multEvenvB, multOddvB), vec_mergel(multEvenvB, multOddvB)); \
 temp1v = vec_xor(temp1v, mskA); \
 temp2v = vec_xor(temp2v, mskB); \
 temp1v = vec_add(temp1v, vec_and(mskA, one)); \
-vec_st(temp1v, (dct0), (int16_t*)dct); \
+vec_st(temp1v, (idx0), (int16_t*)dct); \
 temp2v = vec_add(temp2v, vec_and(mskB, one)); \
-vec_st(temp2v, (dct1), (int16_t*)dct);
+vec_st(temp2v, (idx1), (int16_t*)dct);
-void x264_quant_4x4_dc_altivec( int16_t dct[4][4], int i_quant_mf, int const i_qbits, int const f ) {
+void x264_quant_4x4_dc_altivec( int16_t dct[4][4], int mf, int bias )
+{
     vector bool short mskA;
     vec_u32_t i_qbitsv;
     vec_u16_t coefvA;
     vec_u32_t multEvenvA, multOddvA;
     vec_s16_t zerov, one;
-    vec_u32_t fV;
     vector bool short mskB;
     vec_u16_t coefvB;
@@ -143,17 +137,19 @@ void x264_quant_4x4_dc_altivec( int16_t dct[4][4], int i_quant_mf, int const i_qbits, int const f )
     vec_s16_t temp1v, temp2v;
     vec_u16_t mfv;
+    vec_u16_t biasv;
     vect_ushort_u mf_u;
-    mf_u.s[0]=i_quant_mf;
+    mf_u.s[0]=mf;
     mfv = vec_splat( mf_u.v, 0 );
     vect_int_u qbits_u;
-    qbits_u.s[0]=i_qbits;
+    qbits_u.s[0]=16;
     i_qbitsv = vec_splat(qbits_u.v, 0);
-    vect_int_u f_u;
-    f_u.s[0]=f;
-    fV = vec_splat(f_u.v, 0);
+    vect_ushort_u bias_u;
+    bias_u.s[0]=bias;
+    biasv = vec_splat(bias_u.v, 0);
     zerov = vec_splat_s16(0);
     one = vec_splat_s16(1);
@@ -161,38 +157,83 @@ void x264_quant_4x4_dc_altivec( int16_t dct[4][4], int i_quant_mf, int const i_qbits, int const f )
     QUANT_16_U_DC( 0, 16 );
 }
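/* Note (aside, not part of the patch): in the DC variants mf and bias are
 * plain scalars -- every coefficient of a DC block shares one quantization
 * step -- so they are broadcast to all lanes once with vec_splat through the
 * vect_ushort_u union, rather than loaded per coefficient as in
 * quant_4x4/quant_8x8. */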
+// DC quant of a whole 2x2 block
+#define QUANT_4_U_DC( idx0 ) \
+const vec_u16_t sel = (vec_u16_t) CV(-1,-1,-1,-1,0,0,0,0); \
+temp1v = vec_ld((idx0), *dct); \
+mskA = vec_cmplt(temp1v, zerov); \
+coefvA = (vec_u16_t) vec_max(vec_sub(zerov, temp1v), temp1v); \
+coefvA = vec_add(coefvA, biasv); \
+multEvenvA = vec_mule(coefvA, mfv); \
+multOddvA = vec_mulo(coefvA, mfv); \
+multEvenvA = vec_sr(multEvenvA, i_qbitsv); \
+multOddvA = vec_sr(multOddvA, i_qbitsv); \
+temp2v = (vec_s16_t) vec_packs(vec_mergeh(multEvenvA, multOddvA), vec_mergel(multEvenvA, multOddvA)); \
+temp2v = vec_xor(temp2v, mskA); \
+temp2v = vec_add(temp2v, vec_and(mskA, one)); \
+temp1v = vec_sel(temp1v, temp2v, sel); \
+vec_st(temp1v, (idx0), (int16_t*)dct);
+
+void x264_quant_2x2_dc_altivec( int16_t dct[2][2], int mf, int bias )
+{
+    vector bool short mskA;
+    vec_u32_t i_qbitsv;
+    vec_u16_t coefvA;
+    vec_u32_t multEvenvA, multOddvA;
+    vec_s16_t zerov, one;
+    vec_s16_t temp1v, temp2v;
+    vec_u16_t mfv;
+    vec_u16_t biasv;
+    vect_ushort_u mf_u;
+    mf_u.s[0]=mf;
+    mfv = vec_splat( mf_u.v, 0 );
+    vect_int_u qbits_u;
+    qbits_u.s[0]=16;
+    i_qbitsv = vec_splat(qbits_u.v, 0);
+    vect_ushort_u bias_u;
+    bias_u.s[0]=bias;
+    biasv = vec_splat(bias_u.v, 0);
+    zerov = vec_splat_s16(0);
+    one = vec_splat_s16(1);
+    QUANT_4_U_DC(0);
+}
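/* Note (aside, not part of the patch): a 2x2 DC block holds only four int16
 * coefficients, but vec_ld/vec_st move eight lanes at a time. The select
 * mask CV(-1,-1,-1,-1,0,0,0,0) therefore takes the quantized result in the
 * low four lanes and keeps the original memory contents in the high four,
 * so the store does not clobber whatever follows the block. */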
-void x264_quant_8x8_altivec( int16_t dct[8][8], int quant_mf[8][8], int const i_qbits, int const f ) {
+void x264_quant_8x8_altivec( int16_t dct[8][8], uint16_t mf[64], uint16_t bias[64] )
+{
     vector bool short mskA;
     vec_u32_t i_qbitsv;
     vec_u16_t coefvA;
     vec_u32_t multEvenvA, multOddvA;
     vec_u16_t mfvA;
+    vec_u16_t biasvA;
     vec_s16_t zerov, one;
-    vec_u32_t fV;
     vector bool short mskB;
     vec_u16_t coefvB;
     vec_u32_t multEvenvB, multOddvB;
     vec_u16_t mfvB;
+    vec_u16_t biasvB;
     vec_s16_t temp1v, temp2v;
     vect_int_u qbits_u;
-    qbits_u.s[0]=i_qbits;
+    qbits_u.s[0]=16;
     i_qbitsv = vec_splat(qbits_u.v, 0);
-    vect_int_u f_u;
-    f_u.s[0]=f;
-    fV = vec_splat(f_u.v, 0);
     zerov = vec_splat_s16(0);
     one = vec_splat_s16(1);
     int i;
     for ( i=0; i<4; i++ ) {
-        QUANT_16_U( i*2*16, i*2*16+16, i*4*16, i*4*16+16, i*4*16+32, i*4*16+48 );
+        QUANT_16_U( i*2*16, i*2*16+16 );
     }
 }
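/* Note (aside, not part of the patch): the QUANT_16_U arguments are byte
 * offsets for vec_ld/vec_st. Each call handles two 16-byte vectors
 * (16 coefficients), so four iterations at offsets i*32 and i*32+16 cover
 * all 64 coefficients of the 8x8 block; the old variant needed separate,
 * doubled offsets for the 32-bit quant_mf table. */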
--- a/common/ppc/quant.h
+++ b/common/ppc/quant.h
@@ -21,8 +21,9 @@
 #ifndef _PPC_QUANT_H
 #define _PPC_QUANT_H 1
-void x264_quant_4x4_altivec( int16_t dct[4][4], int quant_mf[4][4], int const i_qbits, int const f );
-void x264_quant_8x8_altivec( int16_t dct[8][8], int quant_mf[8][8], int const i_qbits, int const f );
+void x264_quant_4x4_altivec( int16_t dct[4][4], uint16_t mf[16], uint16_t bias[16] );
+void x264_quant_8x8_altivec( int16_t dct[8][8], uint16_t mf[64], uint16_t bias[64] );
-void x264_quant_4x4_dc_altivec( int16_t dct[4][4], int i_quant_mf, int const i_qbits, int const f );
+void x264_quant_4x4_dc_altivec( int16_t dct[4][4], int mf, int bias );
+void x264_quant_2x2_dc_altivec( int16_t dct[2][2], int mf, int bias );
 #endif
--- a/common/quant.c
+++ b/common/quant.c
@@ -238,4 +238,13 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
         pf->quant_8x8 = x264_quant_8x8_ssse3;
     }
 #endif
+#ifdef ARCH_PPC
+    if( cpu&X264_CPU_ALTIVEC ) {
+        pf->quant_2x2_dc = x264_quant_2x2_dc_altivec;
+        pf->quant_4x4_dc = x264_quant_4x4_dc_altivec;
+        pf->quant_4x4 = x264_quant_4x4_altivec;
+        pf->quant_8x8 = x264_quant_8x8_altivec;
+    }
+#endif
 }
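
Callers are unaffected by the new signatures at the dispatch level: the encoder fills the function table once and calls through it, so the AltiVec versions are picked up automatically when X264_CPU_ALTIVEC is set. A minimal usage sketch (the local dct/mf/bias buffers are hypothetical stand-ins for x264's real per-QP tables, and h is an initialized encoder context):

x264_quant_function_t qf;
x264_quant_init( h, h->param.cpu, &qf );  /* fills the table per CPU flags */

int16_t  dct[4][4];                       /* transformed residual          */
uint16_t mf[16], bias[16];                /* per-coefficient scale/bias    */
/* ... fill dct, mf and bias ... */
qf.quant_4x4( dct, mf, bias );            /* dispatches to the AltiVec
                                             version on PPC with AltiVec  */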