Commit c494e9c5 authored by Eric Petit

Merges Guillaume Poirier's AltiVec changes:

 * Adds optimized quant and sub*dct8 routines
 * Speeds up the existing sub*dct routines
 * ~8% overall speed-up with the default settings


git-svn-id: svn://svn.videolan.org/x264/trunk@601 df754926-b1dd-0310-bc7b-ec298dee348c
parent 41c111bc
@@ -26,6 +26,9 @@ N: Christian Heine
E: sennindemokrit AT gmx DOT net
D: x86 asm
N: Placeholder
D: Altivec optimizations
N: Eric Petit
E: titer AT videolan DOT org
C: titer
@@ -36,6 +39,11 @@ S: France
N: Francesco Corriga
D: VfW
N: Guillaume Poirier
E: gpoirier CHEZ mplayerhq POINT hu
D: Altivec optimizations
S: Brittany, France
N: Justin Clay
E: justin_clay AT hotmail DOT com
C: wheatgerm
@@ -43,7 +43,8 @@ endif
# AltiVec optims
ifeq ($(ARCH),PPC)
SRCS += common/ppc/mc.c common/ppc/pixel.c common/ppc/dct.c
SRCS += common/ppc/mc.c common/ppc/pixel.c common/ppc/dct.c \
common/ppc/quant.c
endif
# VIS optims
@@ -439,6 +439,9 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
dctf->sub4x4_dct = x264_sub4x4_dct_altivec;
dctf->sub8x8_dct = x264_sub8x8_dct_altivec;
dctf->sub16x16_dct = x264_sub16x16_dct_altivec;
dctf->sub8x8_dct8 = x264_sub8x8_dct8_altivec;
dctf->sub16x16_dct8 = x264_sub16x16_dct8_altivec;
}
#endif
}
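/* Context: these assignments sit inside x264_dct_init()'s PPC-only branch
 * (the #endif above closes the architecture guard) and behind the runtime
 * AltiVec CPU-flag check, so encoders pick up the new sub*_dct8 routines
 * automatically once AltiVec is detected; no caller changes are needed. */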
@@ -5,6 +5,7 @@
* $Id$
*
* Authors: Eric Petit <titer@m0k.org>
* Guillaume Poirier <gpoirier@mplayerhq.hu>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@@ -43,210 +44,187 @@
void x264_sub4x4_dct_altivec( int16_t dct[4][4],
uint8_t *pix1, uint8_t *pix2 )
{
PREP_DIFF;
PREP_STORE8;
PREP_DIFF_8BYTEALIGNED;
vec_s16_t dct0v, dct1v, dct2v, dct3v;
vec_s16_t tmp0v, tmp1v, tmp2v, tmp3v;
VEC_DIFF_H( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 4, dct0v );
VEC_DIFF_H( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 4, dct1v );
VEC_DIFF_H( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 4, dct2v );
VEC_DIFF_H( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 4, dct3v );
vec_u8_t permHighv;
VEC_DIFF_H_8BYTE_ALIGNED( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 4, dct0v );
VEC_DIFF_H_8BYTE_ALIGNED( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 4, dct1v );
VEC_DIFF_H_8BYTE_ALIGNED( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 4, dct2v );
VEC_DIFF_H_8BYTE_ALIGNED( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 4, dct3v );
VEC_DCT( dct0v, dct1v, dct2v, dct3v, tmp0v, tmp1v, tmp2v, tmp3v );
VEC_TRANSPOSE_4( tmp0v, tmp1v, tmp2v, tmp3v,
dct0v, dct1v, dct2v, dct3v );
permHighv = (vec_u8_t) CV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17);
VEC_DCT( dct0v, dct1v, dct2v, dct3v, tmp0v, tmp1v, tmp2v, tmp3v );
VEC_STORE8( tmp0v, dct[0] );
VEC_STORE8( tmp1v, dct[1] );
VEC_STORE8( tmp2v, dct[2] );
VEC_STORE8( tmp3v, dct[3] );
vec_st(vec_perm(tmp0v, tmp1v, permHighv), 0, dct);
vec_st(vec_perm(tmp2v, tmp3v, permHighv), 16, dct);
}
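/* For reference, a scalar sketch of the 1-D butterfly that VEC_DCT applies
 * to each row (the standard H.264 forward 4x4 core transform); the vector
 * code runs it once on rows, transposes, then once more on columns before
 * the permuted stores. The helper name below is illustrative only. */
static inline void dct4_1d_ref( int16_t d[4] )
{
    const int s03 = d[0] + d[3], s12 = d[1] + d[2];
    const int d03 = d[0] - d[3], d12 = d[1] - d[2];
    d[0] =   s03 +   s12;
    d[1] = 2*d03 +   d12;
    d[2] =   s03 -   s12;
    d[3] =   d03 - 2*d12;
}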
void x264_sub8x8_dct_altivec( int16_t dct[4][4][4],
uint8_t *pix1, uint8_t *pix2 )
{
PREP_DIFF;
PREP_STORE8_HL;
PREP_DIFF_8BYTEALIGNED;
vec_s16_t dct0v, dct1v, dct2v, dct3v, dct4v, dct5v, dct6v, dct7v;
vec_s16_t tmp0v, tmp1v, tmp2v, tmp3v, tmp4v, tmp5v, tmp6v, tmp7v;
VEC_DIFF_H( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct0v );
VEC_DIFF_H( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct1v );
VEC_DIFF_H( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct2v );
VEC_DIFF_H( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct3v );
VEC_DIFF_H( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct4v );
VEC_DIFF_H( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct5v );
VEC_DIFF_H( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct6v );
VEC_DIFF_H( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct7v );
vec_u8_t permHighv, permLowv;
VEC_DIFF_H_8BYTE_ALIGNED( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct0v );
VEC_DIFF_H_8BYTE_ALIGNED( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct1v );
VEC_DIFF_H_8BYTE_ALIGNED( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct2v );
VEC_DIFF_H_8BYTE_ALIGNED( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct3v );
VEC_DIFF_H_8BYTE_ALIGNED( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct4v );
VEC_DIFF_H_8BYTE_ALIGNED( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct5v );
VEC_DIFF_H_8BYTE_ALIGNED( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct6v );
VEC_DIFF_H_8BYTE_ALIGNED( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct7v );
VEC_DCT( dct0v, dct1v, dct2v, dct3v, tmp0v, tmp1v, tmp2v, tmp3v );
VEC_DCT( dct4v, dct5v, dct6v, dct7v, tmp4v, tmp5v, tmp6v, tmp7v );
VEC_TRANSPOSE_8( tmp0v, tmp1v, tmp2v, tmp3v,
tmp4v, tmp5v, tmp6v, tmp7v,
dct0v, dct1v, dct2v, dct3v,
dct4v, dct5v, dct6v, dct7v );
permHighv = (vec_u8_t) CV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17);
permLowv = (vec_u8_t) CV(0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F);
VEC_DCT( dct0v, dct1v, dct2v, dct3v, tmp0v, tmp1v, tmp2v, tmp3v );
VEC_STORE8_H( tmp0v, dct[0][0] );
VEC_STORE8_H( tmp1v, dct[0][1] );
VEC_STORE8_H( tmp2v, dct[0][2] );
VEC_STORE8_H( tmp3v, dct[0][3] );
VEC_STORE8_L( tmp0v, dct[2][0] );
VEC_STORE8_L( tmp1v, dct[2][1] );
VEC_STORE8_L( tmp2v, dct[2][2] );
VEC_STORE8_L( tmp3v, dct[2][3] );
VEC_DCT( dct4v, dct5v, dct6v, dct7v, tmp4v, tmp5v, tmp6v, tmp7v );
VEC_STORE8_H( tmp4v, dct[1][0] );
VEC_STORE8_H( tmp5v, dct[1][1] );
VEC_STORE8_H( tmp6v, dct[1][2] );
VEC_STORE8_H( tmp7v, dct[1][3] );
VEC_STORE8_L( tmp4v, dct[3][0] );
VEC_STORE8_L( tmp5v, dct[3][1] );
VEC_STORE8_L( tmp6v, dct[3][2] );
VEC_STORE8_L( tmp7v, dct[3][3] );
vec_st(vec_perm(tmp0v, tmp1v, permHighv), 0, dct);
vec_st(vec_perm(tmp2v, tmp3v, permHighv), 16, dct);
vec_st(vec_perm(tmp4v, tmp5v, permHighv), 32, dct);
vec_st(vec_perm(tmp6v, tmp7v, permHighv), 48, dct);
vec_st(vec_perm(tmp0v, tmp1v, permLowv), 64, dct);
vec_st(vec_perm(tmp2v, tmp3v, permLowv), 80, dct);
vec_st(vec_perm(tmp4v, tmp5v, permLowv), 96, dct);
vec_st(vec_perm(tmp6v, tmp7v, permLowv), 112, dct);
}
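/* Note on the stores above: with vec_perm, byte indices 0x00-0x0F select
 * from the first operand and 0x10-0x1F from the second, so permHighv packs
 * the first 8 bytes (four int16 coefficients) of two row vectors into one
 * aligned 16-byte store and permLowv packs the last 8 bytes. Each vec_st
 * therefore replaces two of the old VEC_STORE8_H/VEC_STORE8_L calls. */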
void x264_sub16x16_dct_altivec( int16_t dct[16][4][4],
uint8_t *pix1, uint8_t *pix2 )
{
PREP_DIFF;
PREP_STORE8_HL;
vec_s16_t dcth0v, dcth1v, dcth2v, dcth3v,
dcth4v, dcth5v, dcth6v, dcth7v,
dctl0v, dctl1v, dctl2v, dctl3v,
dctl4v, dctl5v, dctl6v, dctl7v;
vec_s16_t temp0v, temp1v, temp2v, temp3v,
temp4v, temp5v, temp6v, temp7v;
VEC_DIFF_HL( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, dcth0v, dctl0v );
VEC_DIFF_HL( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, dcth1v, dctl1v );
VEC_DIFF_HL( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, dcth2v, dctl2v );
VEC_DIFF_HL( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, dcth3v, dctl3v );
VEC_DIFF_HL( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, dcth4v, dctl4v );
VEC_DIFF_HL( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, dcth5v, dctl5v );
VEC_DIFF_HL( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, dcth6v, dctl6v );
VEC_DIFF_HL( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, dcth7v, dctl7v );
VEC_DCT( dcth0v, dcth1v, dcth2v, dcth3v,
temp0v, temp1v, temp2v, temp3v );
VEC_DCT( dcth4v, dcth5v, dcth6v, dcth7v,
temp4v, temp5v, temp6v, temp7v );
VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v,
temp4v, temp5v, temp6v, temp7v,
dcth0v, dcth1v, dcth2v, dcth3v,
dcth4v, dcth5v, dcth6v, dcth7v );
VEC_DCT( dcth0v, dcth1v, dcth2v, dcth3v,
temp0v, temp1v, temp2v, temp3v );
VEC_STORE8_H( temp0v, dct[0][0] );
VEC_STORE8_H( temp1v, dct[0][1] );
VEC_STORE8_H( temp2v, dct[0][2] );
VEC_STORE8_H( temp3v, dct[0][3] );
VEC_STORE8_L( temp0v, dct[2][0] );
VEC_STORE8_L( temp1v, dct[2][1] );
VEC_STORE8_L( temp2v, dct[2][2] );
VEC_STORE8_L( temp3v, dct[2][3] );
VEC_DCT( dcth4v, dcth5v, dcth6v, dcth7v,
temp4v, temp5v, temp6v, temp7v );
VEC_STORE8_H( temp4v, dct[1][0] );
VEC_STORE8_H( temp5v, dct[1][1] );
VEC_STORE8_H( temp6v, dct[1][2] );
VEC_STORE8_H( temp7v, dct[1][3] );
VEC_STORE8_L( temp4v, dct[3][0] );
VEC_STORE8_L( temp5v, dct[3][1] );
VEC_STORE8_L( temp6v, dct[3][2] );
VEC_STORE8_L( temp7v, dct[3][3] );
VEC_DCT( dctl0v, dctl1v, dctl2v, dctl3v,
temp0v, temp1v, temp2v, temp3v );
VEC_DCT( dctl4v, dctl5v, dctl6v, dctl7v,
temp4v, temp5v, temp6v, temp7v );
VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v,
temp4v, temp5v, temp6v, temp7v,
dctl0v, dctl1v, dctl2v, dctl3v,
dctl4v, dctl5v, dctl6v, dctl7v );
VEC_DCT( dctl0v, dctl1v, dctl2v, dctl3v,
temp0v, temp1v, temp2v, temp3v );
VEC_STORE8_H( temp0v, dct[4][0] );
VEC_STORE8_H( temp1v, dct[4][1] );
VEC_STORE8_H( temp2v, dct[4][2] );
VEC_STORE8_H( temp3v, dct[4][3] );
VEC_STORE8_L( temp0v, dct[6][0] );
VEC_STORE8_L( temp1v, dct[6][1] );
VEC_STORE8_L( temp2v, dct[6][2] );
VEC_STORE8_L( temp3v, dct[6][3] );
VEC_DCT( dctl4v, dctl5v, dctl6v, dctl7v,
temp4v, temp5v, temp6v, temp7v );
VEC_STORE8_H( temp4v, dct[5][0] );
VEC_STORE8_H( temp5v, dct[5][1] );
VEC_STORE8_H( temp6v, dct[5][2] );
VEC_STORE8_H( temp7v, dct[5][3] );
VEC_STORE8_L( temp4v, dct[7][0] );
VEC_STORE8_L( temp5v, dct[7][1] );
VEC_STORE8_L( temp6v, dct[7][2] );
VEC_STORE8_L( temp7v, dct[7][3] );
VEC_DIFF_HL( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, dcth0v, dctl0v );
VEC_DIFF_HL( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, dcth1v, dctl1v );
VEC_DIFF_HL( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, dcth2v, dctl2v );
VEC_DIFF_HL( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, dcth3v, dctl3v );
VEC_DIFF_HL( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, dcth4v, dctl4v );
VEC_DIFF_HL( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, dcth5v, dctl5v );
VEC_DIFF_HL( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, dcth6v, dctl6v );
VEC_DIFF_HL( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, dcth7v, dctl7v );
VEC_DCT( dcth0v, dcth1v, dcth2v, dcth3v,
temp0v, temp1v, temp2v, temp3v );
VEC_DCT( dcth4v, dcth5v, dcth6v, dcth7v,
temp4v, temp5v, temp6v, temp7v );
VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v,
temp4v, temp5v, temp6v, temp7v,
dcth0v, dcth1v, dcth2v, dcth3v,
dcth4v, dcth5v, dcth6v, dcth7v );
VEC_DCT( dcth0v, dcth1v, dcth2v, dcth3v,
temp0v, temp1v, temp2v, temp3v );
VEC_STORE8_H( temp0v, dct[8][0] );
VEC_STORE8_H( temp1v, dct[8][1] );
VEC_STORE8_H( temp2v, dct[8][2] );
VEC_STORE8_H( temp3v, dct[8][3] );
VEC_STORE8_L( temp0v, dct[10][0] );
VEC_STORE8_L( temp1v, dct[10][1] );
VEC_STORE8_L( temp2v, dct[10][2] );
VEC_STORE8_L( temp3v, dct[10][3] );
VEC_DCT( dcth4v, dcth5v, dcth6v, dcth7v,
temp4v, temp5v, temp6v, temp7v );
VEC_STORE8_H( temp4v, dct[9][0] );
VEC_STORE8_H( temp5v, dct[9][1] );
VEC_STORE8_H( temp6v, dct[9][2] );
VEC_STORE8_H( temp7v, dct[9][3] );
VEC_STORE8_L( temp4v, dct[11][0] );
VEC_STORE8_L( temp5v, dct[11][1] );
VEC_STORE8_L( temp6v, dct[11][2] );
VEC_STORE8_L( temp7v, dct[11][3] );
VEC_DCT( dctl0v, dctl1v, dctl2v, dctl3v,
temp0v, temp1v, temp2v, temp3v );
VEC_DCT( dctl4v, dctl5v, dctl6v, dctl7v,
temp4v, temp5v, temp6v, temp7v );
VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v,
temp4v, temp5v, temp6v, temp7v,
dctl0v, dctl1v, dctl2v, dctl3v,
dctl4v, dctl5v, dctl6v, dctl7v );
VEC_DCT( dctl0v, dctl1v, dctl2v, dctl3v,
temp0v, temp1v, temp2v, temp3v );
VEC_STORE8_H( temp0v, dct[12][0] );
VEC_STORE8_H( temp1v, dct[12][1] );
VEC_STORE8_H( temp2v, dct[12][2] );
VEC_STORE8_H( temp3v, dct[12][3] );
VEC_STORE8_L( temp0v, dct[14][0] );
VEC_STORE8_L( temp1v, dct[14][1] );
VEC_STORE8_L( temp2v, dct[14][2] );
VEC_STORE8_L( temp3v, dct[14][3] );
VEC_DCT( dctl4v, dctl5v, dctl6v, dctl7v,
temp4v, temp5v, temp6v, temp7v );
VEC_STORE8_H( temp4v, dct[13][0] );
VEC_STORE8_H( temp5v, dct[13][1] );
VEC_STORE8_H( temp6v, dct[13][2] );
VEC_STORE8_H( temp7v, dct[13][3] );
VEC_STORE8_L( temp4v, dct[15][0] );
VEC_STORE8_L( temp5v, dct[15][1] );
VEC_STORE8_L( temp6v, dct[15][2] );
VEC_STORE8_L( temp7v, dct[15][3] );
x264_sub8x8_dct_altivec( &dct[ 0], &pix1[0], &pix2[0] );
x264_sub8x8_dct_altivec( &dct[ 4], &pix1[8], &pix2[8] );
x264_sub8x8_dct_altivec( &dct[ 8], &pix1[8*FENC_STRIDE+0], &pix2[8*FDEC_STRIDE+0] );
x264_sub8x8_dct_altivec( &dct[12], &pix1[8*FENC_STRIDE+8], &pix2[8*FDEC_STRIDE+8] );
}
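/* The 16x16 case now simply reuses the 8x8 routine on the four quadrants:
 * +8 selects the right half of a row, while 8*FENC_STRIDE and 8*FDEC_STRIDE
 * step down eight rows in the encoded and reconstructed planes. */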
/***************************************************************************
* 8x8 transform:
***************************************************************************/
/* DCT8_1D unrolled by 8 in Altivec */
#define DCT8_1D_ALTIVEC( dct0v, dct1v, dct2v, dct3v, dct4v, dct5v, dct6v, dct7v ) \
{ \
/* int s07 = SRC(0) + SRC(7); */ \
vec_s16_t s07v = vec_add( dct0v, dct7v); \
/* int s16 = SRC(1) + SRC(6); */ \
vec_s16_t s16v = vec_add( dct1v, dct6v); \
/* int s25 = SRC(2) + SRC(5); */ \
vec_s16_t s25v = vec_add( dct2v, dct5v); \
/* int s34 = SRC(3) + SRC(4); */ \
vec_s16_t s34v = vec_add( dct3v, dct4v); \
\
/* int a0 = s07 + s34; */ \
vec_s16_t a0v = vec_add(s07v, s34v); \
/* int a1 = s16 + s25; */ \
vec_s16_t a1v = vec_add(s16v, s25v); \
/* int a2 = s07 - s34; */ \
vec_s16_t a2v = vec_sub(s07v, s34v); \
/* int a3 = s16 - s25; */ \
vec_s16_t a3v = vec_sub(s16v, s25v); \
\
/* int d07 = SRC(0) - SRC(7); */ \
vec_s16_t d07v = vec_sub( dct0v, dct7v); \
/* int d16 = SRC(1) - SRC(6); */ \
vec_s16_t d16v = vec_sub( dct1v, dct6v); \
/* int d25 = SRC(2) - SRC(5); */ \
vec_s16_t d25v = vec_sub( dct2v, dct5v); \
/* int d34 = SRC(3) - SRC(4); */ \
vec_s16_t d34v = vec_sub( dct3v, dct4v); \
\
/* int a4 = d16 + d25 + (d07 + (d07>>1)); */ \
vec_s16_t a4v = vec_add( vec_add(d16v, d25v), vec_add(d07v, vec_sra(d07v, onev)) );\
/* int a5 = d07 - d34 - (d25 + (d25>>1)); */ \
vec_s16_t a5v = vec_sub( vec_sub(d07v, d34v), vec_add(d25v, vec_sra(d25v, onev)) );\
/* int a6 = d07 + d34 - (d16 + (d16>>1)); */ \
vec_s16_t a6v = vec_sub( vec_add(d07v, d34v), vec_add(d16v, vec_sra(d16v, onev)) );\
/* int a7 = d16 - d25 + (d34 + (d34>>1)); */ \
vec_s16_t a7v = vec_add( vec_sub(d16v, d25v), vec_add(d34v, vec_sra(d34v, onev)) );\
\
/* DST(0) = a0 + a1; */ \
dct0v = vec_add( a0v, a1v ); \
/* DST(1) = a4 + (a7>>2); */ \
dct1v = vec_add( a4v, vec_sra(a7v, twov) ); \
/* DST(2) = a2 + (a3>>1); */ \
dct2v = vec_add( a2v, vec_sra(a3v, onev) ); \
/* DST(3) = a5 + (a6>>2); */ \
dct3v = vec_add( a5v, vec_sra(a6v, twov) ); \
/* DST(4) = a0 - a1; */ \
dct4v = vec_sub( a0v, a1v ); \
/* DST(5) = a6 - (a5>>2); */ \
dct5v = vec_sub( a6v, vec_sra(a5v, twov) ); \
/* DST(6) = (a2>>1) - a3 ; */ \
dct6v = vec_sub( vec_sra(a2v, onev), a3v ); \
/* DST(7) = (a4>>2) - a7 ; */ \
dct7v = vec_sub( vec_sra(a4v, twov), a7v ); \
}
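/* DCT8_1D_ALTIVEC expects `onev` and `twov` (splatted 16-bit shift counts of
 * 1 and 2) to be declared by the caller, as x264_sub8x8_dct8_altivec does
 * below with vec_splat_u16. */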
void x264_sub8x8_dct8_altivec( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
{
vec_u16_t onev = vec_splat_u16(1);
vec_u16_t twov = vec_add( onev, onev );
PREP_DIFF_8BYTEALIGNED;
vec_s16_t dct0v, dct1v, dct2v, dct3v,
dct4v, dct5v, dct6v, dct7v;
VEC_DIFF_H_8BYTE_ALIGNED( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct0v );
VEC_DIFF_H_8BYTE_ALIGNED( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct1v );
VEC_DIFF_H_8BYTE_ALIGNED( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct2v );
VEC_DIFF_H_8BYTE_ALIGNED( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct3v );
VEC_DIFF_H_8BYTE_ALIGNED( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct4v );
VEC_DIFF_H_8BYTE_ALIGNED( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct5v );
VEC_DIFF_H_8BYTE_ALIGNED( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct6v );
VEC_DIFF_H_8BYTE_ALIGNED( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct7v );
DCT8_1D_ALTIVEC( dct0v, dct1v, dct2v, dct3v,
dct4v, dct5v, dct6v, dct7v );
vec_s16_t dct_tr0v, dct_tr1v, dct_tr2v, dct_tr3v,
dct_tr4v, dct_tr5v, dct_tr6v, dct_tr7v;
VEC_TRANSPOSE_8(dct0v, dct1v, dct2v, dct3v,
dct4v, dct5v, dct6v, dct7v,
dct_tr0v, dct_tr1v, dct_tr2v, dct_tr3v,
dct_tr4v, dct_tr5v, dct_tr6v, dct_tr7v );
DCT8_1D_ALTIVEC( dct_tr0v, dct_tr1v, dct_tr2v, dct_tr3v,
dct_tr4v, dct_tr5v, dct_tr6v, dct_tr7v );
vec_st( dct_tr0v, 0, (signed short *)dct );
vec_st( dct_tr1v, 16, (signed short *)dct );
vec_st( dct_tr2v, 32, (signed short *)dct );
vec_st( dct_tr3v, 48, (signed short *)dct );
vec_st( dct_tr4v, 64, (signed short *)dct );
vec_st( dct_tr5v, 80, (signed short *)dct );
vec_st( dct_tr6v, 96, (signed short *)dct );
vec_st( dct_tr7v, 112, (signed short *)dct );
}
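/* The eight vec_st calls above assume dct[8][8] is 16-byte aligned (x264
 * declares its DCT buffers with 16-byte alignment), so the 64 transformed
 * coefficients (128 bytes) are written row by row with aligned stores. */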
void x264_sub16x16_dct8_altivec( int16_t dct[4][8][8], uint8_t *pix1, uint8_t *pix2 )
{
x264_sub8x8_dct8_altivec( dct[0], &pix1[0], &pix2[0] );
x264_sub8x8_dct8_altivec( dct[1], &pix1[8], &pix2[8] );
x264_sub8x8_dct8_altivec( dct[2], &pix1[8*FENC_STRIDE+0], &pix2[8*FDEC_STRIDE+0] );
x264_sub8x8_dct8_altivec( dct[3], &pix1[8*FENC_STRIDE+8], &pix2[8*FDEC_STRIDE+8] );
}
@@ -31,4 +31,9 @@ void x264_sub8x8_dct_altivec( int16_t dct[4][4][4],
void x264_sub16x16_dct_altivec( int16_t dct[16][4][4],
uint8_t *pix1, uint8_t *pix2 );
void x264_sub8x8_dct8_altivec( int16_t dct[8][8],
uint8_t *pix1, uint8_t *pix2 );
void x264_sub16x16_dct8_altivec( int16_t dct[4][8][8],
uint8_t *pix1, uint8_t *pix2 );
#endif
@@ -100,8 +100,7 @@ PIXEL_SAD_ALTIVEC( pixel_sad_8x8_altivec, 8, 8, 2s, 1 )
* actually also calls vec_splat(0), but we already have a null vector.
**********************************************************************/
#define VEC_ABS(a) \
pix1v = vec_sub( zero_s16v, a ); \
a = vec_max( a, pix1v ); \
a = vec_max( a, vec_sub( zero_s16v, a ) );
/***********************************************************************
* VEC_ADD_ABS
@@ -252,3 +252,31 @@
dl = vec_sub( temp1v, temp3v ); \
p1 += i1; \
p2 += i2
/***********************************************************************
* VEC_DIFF_H_8BYTE_ALIGNED
***********************************************************************
* p1, p2: u8 *
* i1, i2, n: int
* d: s16v
*
* Loads n bytes from p1 and p2, takes the difference of the high
* elements into d, and increments p1 and p2 by i1 and i2.
* Slightly faster when we know we are loading/diffing 8 bytes that
* are 8-byte aligned: it avoids two extra loads and two vec_lvsl() calls.
**********************************************************************/
#define PREP_DIFF_8BYTEALIGNED \
LOAD_ZERO; \
vec_s16_t pix1v, pix2v; \
vec_u8_t permPix1, permPix2; \
permPix1 = vec_lvsl(0, pix1); \
permPix2 = vec_lvsl(0, pix2);
#define VEC_DIFF_H_8BYTE_ALIGNED(p1,i1,p2,i2,n,d) \
pix1v = vec_perm(vec_ld(0,p1), zero_u8v, permPix1); \
pix2v = vec_perm(vec_ld(0, p2), zero_u8v, permPix2); \
pix1v = vec_u8_to_s16( pix1v ); \
pix2v = vec_u8_to_s16( pix2v ); \
d = vec_sub( pix1v, pix2v); \
p1 += i1; \
p2 += i2;
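/* A scalar sketch of what one VEC_DIFF_H_8BYTE_ALIGNED invocation computes,
 * shown for reference only (the helper name is illustrative): widen eight
 * unsigned pixels from each source to 16 bits, subtract, then advance both
 * pointers by their strides. */
static inline void diff_8px_ref( int16_t d[8],
                                 const uint8_t **p1, int i1,
                                 const uint8_t **p2, int i2 )
{
    int i;
    for( i = 0; i < 8; i++ )
        d[i] = (int16_t)(*p1)[i] - (int16_t)(*p2)[i];
    *p1 += i1;
    *p2 += i2;
}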
/*****************************************************************************
* quant.c: h264 encoder
*****************************************************************************
* Authors: Guillaume Poirier <poirierg@gmail.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
*****************************************************************************/
#ifdef HAVE_ALTIVEC_H
#include <altivec.h>
#endif
#include "common/common.h"
#include "ppccommon.h"
#include "quant.h"
// quant of a whole 4x4 block, unrolled 2x and "pre-scheduled"
#define QUANT_16_U( dct0, dct1, quant_mf0, quant_mf1, quant_mf2, quant_mf3 ) \
temp1v = vec_ld((dct0), *dct); \
temp2v = vec_ld((dct1), *dct); \
mfvA = (vec_u16_t) vec_packs((vec_u32_t)vec_ld((quant_mf0), *quant_mf), (vec_u32_t)vec_ld((quant_mf1), *quant_mf)); \
mfvB = (vec_u16_t) vec_packs((vec_u32_t)vec_ld((quant_mf2), *quant_mf), (vec_u32_t)vec_ld((quant_mf3), *quant_mf)); \
mskA = vec_cmplt(temp1v, zerov); \
mskB = vec_cmplt(temp2v, zerov); \
coefvA = (vec_u16_t)vec_max(vec_sub(zerov, temp1v), temp1v); \
coefvB = (vec_u16_t)vec_max(vec_sub(zerov, temp2v), temp2v); \
multEvenvA = vec_mule(coefvA, mfvA); \
multOddvA = vec_mulo(coefvA, mfvA); \
multEvenvB = vec_mule(coefvB, mfvB); \
multOddvB = vec_mulo(coefvB, mfvB); \
multEvenvA = vec_adds(multEvenvA, fV); \
multOddvA = vec_adds(multOddvA, fV); \
multEvenvB = vec_adds(multEvenvB, fV); \
multOddvB = vec_adds(multOddvB, fV); \
multEvenvA = vec_sr(multEvenvA, i_qbitsv); \
multOddvA = vec_sr(multOddvA, i_qbitsv); \
multEvenvB = vec_sr(multEvenvB, i_qbitsv); \
multOddvB = vec_sr(multOddvB, i_qbitsv); \
temp1v = (vec_s16_t) vec_packs(vec_mergeh(multEvenvA, multOddvA), vec_mergel(multEvenvA, multOddvA)); \
temp2v = (vec_s16_t) vec_packs(vec_mergeh(multEvenvB, multOddvB), vec_mergel(multEvenvB, multOddvB)); \
temp1v = vec_xor(temp1v, mskA); \
temp2v = vec_xor(temp2v, mskB); \
temp1v = vec_adds(temp1v, vec_and(mskA, one)); \
vec_st(temp1v, (dct0), dct); \
temp2v = vec_adds(temp2v, vec_and(mskB, one)); \
vec_st(temp2v, (dct1), dct);
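/* Scalar reference for what QUANT_16_U computes per coefficient (helper name
 * illustrative, not from the patch):
 *     q = (abs(coef) * mf + f) >> i_qbits, with the original sign restored.
 * The vector code restores the sign branchlessly: xor-ing with the
 * "coef < 0" mask gives the one's complement for negative values, and adding
 * (mask & 1) completes the two's-complement negation. */
static inline int16_t quant_one_ref( int16_t coef, int mf, int f, int i_qbits )
{
    const int a = coef < 0 ? -coef : coef;
    const int q = ( a * mf + f ) >> i_qbits;
    return (int16_t)( coef < 0 ? -q : q );
}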
void x264_quant_4x4_altivec( int16_t dct[4][4], int quant_mf[4][4], int const i_qbits, int const f ) {
vector bool short mskA;
vec_s32_t i_qbitsv;
vec_u16_t coefvA;
vec_u32_t multEvenvA, multOddvA;
vec_u32_t mfvA;
vec_s16_t zerov, one;
vec_s32_t fV;
vector bool short mskB;
vec_u16_t coefvB;
vec_u32_t multEvenvB, multOddvB;
vec_u32_t mfvB;
vec_s16_t temp1v, temp2v;
vect_sint_u qbits_u;
qbits_u.s[0]=i_qbits;
i_qbitsv = vec_splat(qbits_u.v, 0);
vect_sint_u f_u;
f_u.s[0]=f;
fV = vec_splat(f_u.v, 0);
zerov = vec_splat_s16(0);
one = vec_splat_s16(1);
QUANT_16_U( 0, 16, 0, 16, 32, 48 );
}
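/* The arguments of QUANT_16_U( 0, 16, 0, 16, 32, 48 ) are byte offsets:
 * 0 and 16 cover the sixteen int16_t coefficients of the 4x4 block in two
 * vectors, while 0/16/32/48 walk the four int rows of quant_mf, which
 * vec_packs narrows from 32-bit to 16-bit factors on the fly. */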
// DC quant of a whole 4x4 block, unrolled 2x and "pre-scheduled"
#define QUANT_16_U_DC( dct0, dct1 ) \
temp1v = vec_ld((dct0), *dct); \
temp2v = vec_ld((dct1), *dct); \
mskA = vec_cmplt(temp1v, zerov); \
mskB = vec_cmplt(temp2v, zerov); \
coefvA = (vec_u16_t) vec_max(vec_sub(zerov, temp1v), temp1v); \
coefvB = (vec_u16_t) vec_max(vec_sub(zerov, temp2v), temp2v); \
multEvenvA = vec_mule(coefvA, mfv); \
multOddvA = vec_mulo(coefvA, mfv); \
multEvenvB = vec_mule(coefvB, mfv); \
multOddvB = vec_mulo(coefvB, mfv); \
multEvenvA = vec_add(multEvenvA, fV); \
multOddvA = vec_add(multOddvA, fV); \
multEvenvB = vec_add(multEvenvB, fV); \
multOddvB = vec_add(multOddvB, fV); \
multEvenvA = vec_sr(multEvenvA, i_qbitsv); \
multOddvA = vec_sr(multOddvA, i_qbitsv); \
multEvenvB = vec_sr(multEvenvB, i_qbitsv); \
multOddvB = vec_sr(multOddvB, i_qbitsv); \
temp1v = (vec_s16_t) vec_packs(vec_mergeh(multEvenvA, multOddvA), vec_mergel(multEvenvA, multOddvA)); \
temp2v = (vec_s16_t) vec_packs(vec_mergeh(multEvenvB, multOddvB), vec_mergel(multEvenvB, multOddvB)); \
temp1v = vec_xor(temp1v, mskA); \
temp2v = vec_xor(temp2v, mskB); \
temp1v = vec_add(temp1v, vec_and(mskA, one)); \
vec_st(temp1v, (dct0), dct); \
temp2v = vec_add(temp2v, vec_and(mskB, one)); \
vec_st(temp2v, (dct1), dct);
void x264_quant_4x4_dc_altivec( int16_t dct[4][4], int i_quant_mf, int const i_qbits, int const f ) {
vector bool short mskA;
vec_s32_t i_qbitsv;
vec_u16_t coefvA;
vec_u32_t multEvenvA, multOddvA;
vec_s16_t zerov, one;
vec_s32_t fV;
vector bool short mskB;
vec_u16_t coefvB;
vec_u32_t multEvenvB, multOddvB;
vec_s16_t temp1v, temp2v;
vec_u32_t mfv;
vect_int_u mf_u;
mf_u.s[0]=i_quant_mf;
mfv = vec_splat( mf_u.v, 0 );
mfv = vec_packs( mfv, mfv);
vect_sint_u qbits_u;
qbits_u.s[0]=i_qbits;
i_qbitsv = vec_splat(qbits_u.v, 0);
vect_sint_u f_u;
f_u.s[0]=f;
fV = vec_splat(f_u.v, 0);
zerov = vec_splat_s16(0);
one = vec_splat_s16(1);
QUANT_16_U_DC( 0, 16 );
}
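/* For the DC block there is a single quantization factor: i_quant_mf is
 * splatted across a vector and vec_packs narrows it to eight identical
 * 16-bit lanes, so the same multiply/shift/sign-restore path is reused with
 * one mf vector for every coefficient. */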
void x264_quant_8x8_altivec( int16_t dct[8][8], int quant_mf[8][8], int const i_qbits, int const f ) {
vector bool short mskA;
vec_s32_t i_qbitsv;
vec_u16_t coefvA;
vec_s32_t multEvenvA, multOddvA, mfvA;
vec_s16_t zerov, one;
vec_s32_t fV;
vector bool short mskB;
vec_u16_t coefvB;
vec_u32_t multEvenvB, multOddvB, mfvB;
vec_s16_t temp1v, temp2v;
vect_int_u qbits_u;
qbits_u.s[0]=i_qbits;
i_qbitsv = vec_splat(qbits_u.v, 0);
vect_sint_u f_u;
f_u.s[0]=f;
fV = vec_splat(f_u.v, 0);
zerov = vec_splat_s16(0);
one = vec_splat_s16(1);
int i;
for ( i=0; i<4; i++ ) {
QUANT_16_U( i*2*16, i*2*16+16, i*4*16, i*4*16+16, i*4*16+32, i*4*16+48 );
}
}
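/* Each loop iteration quantizes 16 coefficients, i.e. two rows of the 8x8
 * block: the dct offsets advance by 32 bytes (2 x 8 int16_t) per iteration
 * and the quant_mf offsets by 64 bytes (2 x 8 int), with the four 16-byte
 * quant_mf loads packed down to two vectors of 16-bit factors as above. */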
/*****************************************************************************
* quant.h: h264 encoder library
*****************************************************************************
* Authors: Guillaume Poirier <poirierg@gmail.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
*****************************************************************************/
#ifndef _PPC_QUANT_H
#define _PPC_QUANT_H 1
typedef union {
unsigned int s[4];
vector unsigned int v;
} vect_int_u;
typedef union {
signed int s[4];
vector signed int v;
} vect_sint_u;
void x264_quant_4x4_altivec( int16_t dct[4][4], int quant_mf[4][4], int const i_qbits, int const f );
void x264_quant_8x8_altivec( int16_t dct[8][8], int quant_mf[8][8], int const i_qbits, int const f );
void x264_quant_4x4_dc_altivec( int16_t dct[4][4], int i_quant_mf, int const i_qbits, int const f );
#endif
@@ -25,6 +25,9 @@
#ifdef HAVE_MMXEXT
#include "i386/quant.h"