Commit cc002bd5 authored by Rong Yan, committed by Henrik Gramner

ppc: Add little-endian PowerPC support

parent 145f3a62
......@@ -264,7 +264,7 @@ void x264_sub16x16_dct8_altivec( int16_t dct[4][64], uint8_t *pix1, uint8_t *pix
vec_u8_t lv = vec_ld(0, dest); \
vec_u8_t dstv = vec_perm(lv, zero_u8v, (vec_u8_t)perm_ldv); \
vec_s16_t idct_sh6 = vec_sra(idctv, sixv); \
vec_u16_t dst16 = (vec_u16_t)vec_mergeh(zero_u8v, dstv); \
vec_u16_t dst16 = vec_u8_to_u16_h(dstv); \
vec_s16_t idstsum = vec_adds(idct_sh6, (vec_s16_t)dst16); \
vec_u8_t idstsum8 = vec_s16_to_u8(idstsum); \
/* unaligned store */ \
......@@ -384,7 +384,7 @@ void x264_add16x16_idct_altivec( uint8_t *p_dst, int16_t dct[16][16] )
vec_u8_t lv = vec_ld( 7, dest ); \
vec_u8_t dstv = vec_perm( hv, lv, (vec_u8_t)perm_ldv ); \
vec_s16_t idct_sh6 = vec_sra(idctv, sixv); \
vec_u16_t dst16 = (vec_u16_t)vec_mergeh(zero_u8v, dstv); \
vec_u16_t dst16 = vec_u8_to_u16_h(dstv); \
vec_s16_t idstsum = vec_adds(idct_sh6, (vec_s16_t)dst16); \
vec_u8_t idstsum8 = vec_packsu(zero_s16v, idstsum); \
/* unaligned store */ \
......
......@@ -291,6 +291,12 @@ static void mc_chroma_2xh( uint8_t *dstu, uint8_t *dstv, intptr_t i_dst_stride,
}
}
/* VSLD(a,b,n): endian-neutral wrapper around vec_sld.
 * Big-endian builds expand to vec_sld(a,b,n) unchanged; little-endian builds
 * swap the two vector operands and use a shift count of 16-(n).
 * n is parenthesized so that an expression argument (e.g. k+1) is subtracted
 * as a whole instead of being split by operator precedence.
 * NOTE(review): n == 0 would give a count of 16 in the little-endian form;
 * the call sites visible here only pass small non-zero literals - confirm
 * before using it with 0. */
#ifdef WORDS_BIGENDIAN
#define VSLD(a,b,n) vec_sld(a,b,n)
#else
#define VSLD(a,b,n) vec_sld(b,a,16-(n))
#endif
static void mc_chroma_altivec_4xh( uint8_t *dstu, uint8_t *dstv, intptr_t i_dst_stride,
uint8_t *src, intptr_t i_src_stride,
int mvx, int mvy, int i_height )
......@@ -316,8 +322,13 @@ static void mc_chroma_altivec_4xh( uint8_t *dstu, uint8_t *dstv, intptr_t i_dst_
vec_u16_t src0v_16, src1v_16, src2v_16, src3v_16, dstv16;
vec_u16_t shiftv, k32v;
#ifdef WORDS_BIGENDIAN
static const vec_u8_t perm0v = CV(1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13);
static const vec_u8_t perm1v = CV(3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15);
#else
static const vec_u8_t perm0v = CV(0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12);
static const vec_u8_t perm1v = CV(2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14);
#endif
coeff0v = vec_ld( 0, coeff );
coeff3v = vec_splat( coeff0v, 3 );
......@@ -329,7 +340,7 @@ static void mc_chroma_altivec_4xh( uint8_t *dstu, uint8_t *dstv, intptr_t i_dst_
VEC_LOAD( src, src2v_8, 9, vec_u8_t, src );
src2v_16 = vec_u8_to_u16( src2v_8 );
src3v_16 = vec_u8_to_u16( vec_sld( src2v_8, src2v_8, 2 ) );
src3v_16 = vec_u8_to_u16( VSLD( src2v_8, src2v_8, 2 ) );
for( int y = 0; y < i_height; y += 2 )
{
......@@ -337,7 +348,7 @@ static void mc_chroma_altivec_4xh( uint8_t *dstu, uint8_t *dstv, intptr_t i_dst_
src1v_16 = src3v_16;
VEC_LOAD( srcp, src2v_8, 9, vec_u8_t, src );
src2v_16 = vec_u8_to_u16( src2v_8 );
src3v_16 = vec_u8_to_u16( vec_sld( src2v_8, src2v_8, 2 ) );
src3v_16 = vec_u8_to_u16( VSLD( src2v_8, src2v_8, 2 ) );
dstv16 = vec_mladd( coeff0v, src0v_16, k32v );
dstv16 = vec_mladd( coeff1v, src1v_16, dstv16 );
......@@ -359,7 +370,7 @@ static void mc_chroma_altivec_4xh( uint8_t *dstu, uint8_t *dstv, intptr_t i_dst_
src1v_16 = src3v_16;
VEC_LOAD( srcp, src2v_8, 9, vec_u8_t, src );
src2v_16 = vec_u8_to_u16( src2v_8 );
src3v_16 = vec_u8_to_u16( vec_sld( src2v_8, src2v_8, 2 ) );
src3v_16 = vec_u8_to_u16( VSLD( src2v_8, src2v_8, 2 ) );
dstv16 = vec_mladd( coeff0v, src0v_16, k32v );
dstv16 = vec_mladd( coeff1v, src1v_16, dstv16 );
......@@ -415,12 +426,17 @@ static void mc_chroma_altivec_8xh( uint8_t *dstu, uint8_t *dstv, intptr_t i_dst_
k32v = vec_sl( vec_splat_u16( 1 ), vec_splat_u16( 5 ) );
shiftv = vec_splat_u16( 6 );
#ifdef WORDS_BIGENDIAN
static const vec_u8_t perm0v = CV(1,5,9,13,17,21,25,29,0,0,0,0,0,0,0,0);
static const vec_u8_t perm1v = CV(3,7,11,15,19,23,27,31,0,0,0,0,0,0,0,0);
#else
static const vec_u8_t perm0v = CV(0,4,8,12,16,20,24,28,1,1,1,1,1,1,1,1);
static const vec_u8_t perm1v = CV(2,6,10,14,18,22,26,30,1,1,1,1,1,1,1,1);
#endif
VEC_LOAD( src, src2v_8, 16, vec_u8_t, src );
VEC_LOAD( src+16, src3v_8, 2, vec_u8_t, src );
src3v_8 = vec_sld( src2v_8, src3v_8, 2 );
src3v_8 = VSLD( src2v_8, src3v_8, 2 );
for( int y = 0; y < i_height; y += 2 )
{
......@@ -429,7 +445,7 @@ static void mc_chroma_altivec_8xh( uint8_t *dstu, uint8_t *dstv, intptr_t i_dst_
VEC_LOAD( srcp, src2v_8, 16, vec_u8_t, src );
VEC_LOAD( srcp+16, src3v_8, 2, vec_u8_t, src );
src3v_8 = vec_sld( src2v_8, src3v_8, 2 );
src3v_8 = VSLD( src2v_8, src3v_8, 2 );
src0v_16h = vec_u8_to_u16_h( src0v_8 );
src0v_16l = vec_u8_to_u16_l( src0v_8 );
......@@ -467,7 +483,7 @@ static void mc_chroma_altivec_8xh( uint8_t *dstu, uint8_t *dstv, intptr_t i_dst_
VEC_LOAD( srcp, src2v_8, 16, vec_u8_t, src );
VEC_LOAD( srcp+16, src3v_8, 2, vec_u8_t, src );
src3v_8 = vec_sld( src2v_8, src3v_8, 2 );
src3v_8 = VSLD( src2v_8, src3v_8, 2 );
src0v_16h = vec_u8_to_u16_h( src0v_8 );
src0v_16l = vec_u8_to_u16_l( src0v_8 );
......@@ -550,11 +566,11 @@ static void mc_chroma_altivec( uint8_t *dstu, uint8_t *dstv, intptr_t i_dst_stri
VEC_LOAD_G( &src[x- 2+i_stride*y], src1v, 16, vec_u8_t); \
VEC_LOAD_G( &src[x+14+i_stride*y], src6v, 16, vec_u8_t); \
\
src2v = vec_sld( src1v, src6v, 1 ); \
src3v = vec_sld( src1v, src6v, 2 ); \
src4v = vec_sld( src1v, src6v, 3 ); \
src5v = vec_sld( src1v, src6v, 4 ); \
src6v = vec_sld( src1v, src6v, 5 ); \
src2v = VSLD( src1v, src6v, 1 ); \
src3v = VSLD( src1v, src6v, 2 ); \
src4v = VSLD( src1v, src6v, 3 ); \
src5v = VSLD( src1v, src6v, 4 ); \
src6v = VSLD( src1v, src6v, 5 ); \
\
temp1v = vec_u8_to_s16_h( src1v ); \
temp2v = vec_u8_to_s16_h( src2v ); \
......@@ -629,12 +645,12 @@ static void mc_chroma_altivec( uint8_t *dstu, uint8_t *dstv, intptr_t i_dst_stri
#define HPEL_FILTER_CENTRAL() \
{ \
temp1v = vec_sld( tempav, tempbv, 12 ); \
temp2v = vec_sld( tempav, tempbv, 14 ); \
temp1v = VSLD( tempav, tempbv, 12 ); \
temp2v = VSLD( tempav, tempbv, 14 ); \
temp3v = tempbv; \
temp4v = vec_sld( tempbv, tempcv, 2 ); \
temp5v = vec_sld( tempbv, tempcv, 4 ); \
temp6v = vec_sld( tempbv, tempcv, 6 ); \
temp4v = VSLD( tempbv, tempcv, 2 ); \
temp5v = VSLD( tempbv, tempcv, 4 ); \
temp6v = VSLD( tempbv, tempcv, 6 ); \
\
HPEL_FILTER_2( temp1v, temp2v, temp3v, \
temp4v, temp5v, temp6v ); \
......@@ -642,12 +658,12 @@ static void mc_chroma_altivec( uint8_t *dstu, uint8_t *dstv, intptr_t i_dst_stri
dest1v = vec_add( temp1v, thirtytwov ); \
dest1v = vec_sra( dest1v, sixv ); \
\
temp1v = vec_sld( tempbv, tempcv, 12 ); \
temp2v = vec_sld( tempbv, tempcv, 14 ); \
temp1v = VSLD( tempbv, tempcv, 12 ); \
temp2v = VSLD( tempbv, tempcv, 14 ); \
temp3v = tempcv; \
temp4v = vec_sld( tempcv, tempdv, 2 ); \
temp5v = vec_sld( tempcv, tempdv, 4 ); \
temp6v = vec_sld( tempcv, tempdv, 6 ); \
temp4v = VSLD( tempcv, tempdv, 2 ); \
temp5v = VSLD( tempcv, tempdv, 4 ); \
temp6v = VSLD( tempcv, tempdv, 6 ); \
\
HPEL_FILTER_2( temp1v, temp2v, temp3v, \
temp4v, temp5v, temp6v ); \
......@@ -764,6 +780,9 @@ static void frame_init_lowres_core_altivec( uint8_t *src0, uint8_t *dst0, uint8_
vec_u8_t lv, hv, src1p1v;
vec_u8_t avg0v, avg1v, avghv, avghp1v, avgleftv, avgrightv;
static const vec_u8_t inverse_bridge_shuffle = CV(0x00, 0x02, 0x04, 0x06, 0x08, 0x0A, 0x0C, 0x0E, 0x10, 0x12, 0x14, 0x16, 0x18, 0x1A, 0x1C, 0x1E );
#ifndef WORDS_BIGENDIAN
static const vec_u8_t inverse_bridge_shuffle_1 = CV(0x01, 0x03, 0x05, 0x07, 0x09, 0x0B, 0x0D, 0x0F, 0x11, 0x13, 0x15, 0x17, 0x19, 0x1B, 0x1D, 0x1F );
#endif
for( int y = 0; y < height; y++ )
{
......@@ -788,11 +807,15 @@ static void frame_init_lowres_core_altivec( uint8_t *src0, uint8_t *dst0, uint8_
src1p1v = vec_ld(16*(x*2+2), src1);
avghp1v = vec_avg(lv, src1p1v);
avgleftv = vec_avg(vec_sld(avg0v, avghv, 1), avg0v);
avgrightv = vec_avg(vec_sld(avghv, avghp1v, 1), avghv);
avgleftv = vec_avg(VSLD(avg0v, avghv, 1), avg0v);
avgrightv = vec_avg(VSLD(avghv, avghp1v, 1), avghv);
vec_st(vec_perm(avgleftv, avgrightv, inverse_bridge_shuffle), 16*x, dst0);
#ifdef WORDS_BIGENDIAN
vec_st((vec_u8_t)vec_pack((vec_u16_t)avgleftv,(vec_u16_t)avgrightv), 16*x, dsth);
#else
vec_st(vec_perm(avgleftv, avgrightv, inverse_bridge_shuffle_1), 16*x, dsth);
#endif
avg0v = avghp1v;
......@@ -802,11 +825,15 @@ static void frame_init_lowres_core_altivec( uint8_t *src0, uint8_t *dst0, uint8_
hv = vec_ld(16*(x*2+2), src2);
avghp1v = vec_avg(src1p1v, hv);
avgleftv = vec_avg(vec_sld(avg1v, avghv, 1), avg1v);
avgrightv = vec_avg(vec_sld(avghv, avghp1v, 1), avghv);
avgleftv = vec_avg(VSLD(avg1v, avghv, 1), avg1v);
avgrightv = vec_avg(VSLD(avghv, avghp1v, 1), avghv);
vec_st(vec_perm(avgleftv, avgrightv, inverse_bridge_shuffle), 16*x, dstv);
#ifdef WORDS_BIGENDIAN
vec_st((vec_u8_t)vec_pack((vec_u16_t)avgleftv,(vec_u16_t)avgrightv), 16*x, dstc);
#else
vec_st(vec_perm(avgleftv, avgrightv, inverse_bridge_shuffle_1), 16*x, dstc);
#endif
avg1v = avghp1v;
......@@ -820,11 +847,15 @@ static void frame_init_lowres_core_altivec( uint8_t *src0, uint8_t *dst0, uint8_
lv = vec_ld(16*(x*2+1), src2);
avghp1v = vec_avg(src1v, lv);
avgleftv = vec_avg(vec_sld(avg0v, avghv, 1), avg0v);
avgrightv = vec_avg(vec_sld(avg1v, avghp1v, 1), avg1v);
avgleftv = vec_avg(VSLD(avg0v, avghv, 1), avg0v);
avgrightv = vec_avg(VSLD(avg1v, avghp1v, 1), avg1v);
lv = vec_perm(avgleftv, avgrightv, inverse_bridge_shuffle);
#ifdef WORDS_BIGENDIAN
hv = (vec_u8_t)vec_pack((vec_u16_t)avgleftv,(vec_u16_t)avgrightv);
#else
hv = vec_perm(avgleftv, avgrightv, inverse_bridge_shuffle_1);
#endif
vec_ste((vec_u32_t)lv,16*x,(uint32_t*)dst0);
vec_ste((vec_u32_t)lv,16*x+4,(uint32_t*)dst0);
......
......@@ -81,10 +81,17 @@ typedef union {
/***********************************************************************
* 8 <-> 16 bits conversions
**********************************************************************/
/* Widen 8-bit elements to 16-bit by merging the input with zero_u8v.
 * The _h variants use vec_mergeh and the _l variants use vec_mergel; the
 * result is cast to the unsigned (u16) or signed (s16) vector type.
 * On little-endian builds the two merge operands are swapped.
 * The argument and the whole expansion are parenthesized so that an
 * expression argument is cast as a single operand and the result can be used
 * inside a larger expression. */
#ifdef WORDS_BIGENDIAN
#define vec_u8_to_u16_h(v) ((vec_u16_t) vec_mergeh( zero_u8v, (vec_u8_t) (v) ))
#define vec_u8_to_u16_l(v) ((vec_u16_t) vec_mergel( zero_u8v, (vec_u8_t) (v) ))
#define vec_u8_to_s16_h(v) ((vec_s16_t) vec_mergeh( zero_u8v, (vec_u8_t) (v) ))
#define vec_u8_to_s16_l(v) ((vec_s16_t) vec_mergel( zero_u8v, (vec_u8_t) (v) ))
#else
#define vec_u8_to_u16_h(v) ((vec_u16_t) vec_mergeh( (vec_u8_t) (v), zero_u8v ))
#define vec_u8_to_u16_l(v) ((vec_u16_t) vec_mergel( (vec_u8_t) (v), zero_u8v ))
#define vec_u8_to_s16_h(v) ((vec_s16_t) vec_mergeh( (vec_u8_t) (v), zero_u8v ))
#define vec_u8_to_s16_l(v) ((vec_s16_t) vec_mergel( (vec_u8_t) (v), zero_u8v ))
#endif
/* The unsuffixed forms are aliases for the _h variants. */
#define vec_u8_to_u16(v) vec_u8_to_u16_h(v)
#define vec_u8_to_s16(v) vec_u8_to_s16_h(v)
......@@ -96,10 +103,17 @@ typedef union {
/***********************************************************************
* 16 <-> 32 bits conversions
**********************************************************************/
/* Widen 16-bit elements to 32-bit by merging the input with zero_u16v.
 * The _h variants use vec_mergeh and the _l variants use vec_mergel; the
 * result is cast to the unsigned (u32) or signed (s32) vector type.
 * On little-endian builds the two merge operands are swapped.
 * The argument and the whole expansion are parenthesized so that an
 * expression argument is cast as a single operand and the result can be used
 * inside a larger expression. */
#ifdef WORDS_BIGENDIAN
#define vec_u16_to_u32_h(v) ((vec_u32_t) vec_mergeh( zero_u16v, (vec_u16_t) (v) ))
#define vec_u16_to_u32_l(v) ((vec_u32_t) vec_mergel( zero_u16v, (vec_u16_t) (v) ))
#define vec_u16_to_s32_h(v) ((vec_s32_t) vec_mergeh( zero_u16v, (vec_u16_t) (v) ))
#define vec_u16_to_s32_l(v) ((vec_s32_t) vec_mergel( zero_u16v, (vec_u16_t) (v) ))
#else
#define vec_u16_to_u32_h(v) ((vec_u32_t) vec_mergeh( (vec_u16_t) (v), zero_u16v ))
#define vec_u16_to_u32_l(v) ((vec_u32_t) vec_mergel( (vec_u16_t) (v), zero_u16v ))
#define vec_u16_to_s32_h(v) ((vec_s32_t) vec_mergeh( (vec_u16_t) (v), zero_u16v ))
#define vec_u16_to_s32_l(v) ((vec_s32_t) vec_mergel( (vec_u16_t) (v), zero_u16v ))
#endif
/* The unsuffixed forms are aliases for the _h variants. */
#define vec_u16_to_u32(v) vec_u16_to_u32_h(v)
#define vec_u16_to_s32(v) vec_u16_to_s32_h(v)
......
......@@ -251,6 +251,14 @@ int x264_quant_8x8_altivec( int16_t dct[64], uint16_t mf[64], uint16_t bias[64]
vec_st(dctv, 8*y, dct); \
}
/* Endian-dependent aliases for the even/odd multiply intrinsics.
 * Little-endian builds exchange the two names (VEC_MULE -> vec_mulo,
 * VEC_MULO -> vec_mule); big-endian builds map each name straight through.
 * NOTE(review): presumably the swap compensates for the even/odd element
 * numbering being reversed on little-endian - confirm against the compiler's
 * AltiVec documentation. */
#ifndef WORDS_BIGENDIAN
#define VEC_MULE vec_mulo
#define VEC_MULO vec_mule
#else
#define VEC_MULE vec_mule
#define VEC_MULO vec_mulo
#endif
#define DEQUANT_SHR() \
{ \
dctv = vec_ld(8*y, dct); \
......@@ -259,14 +267,14 @@ int x264_quant_8x8_altivec( int16_t dct[64], uint16_t mf[64], uint16_t bias[64]
mf1v = vec_ld(16*y, dequant_mf[i_mf]); \
mf2v = vec_ld(16+16*y, dequant_mf[i_mf]); \
\
multEvenvA = vec_mule(dct1v, (vec_s16_t)mf1v); \
multOddvA = vec_mulo(dct1v, (vec_s16_t)mf1v); \
multEvenvA = VEC_MULE(dct1v, (vec_s16_t)mf1v); \
multOddvA = VEC_MULO(dct1v, (vec_s16_t)mf1v); \
temp1v = vec_add(vec_sl(multEvenvA, sixteenv), multOddvA); \
temp1v = vec_add(temp1v, fv); \
temp1v = vec_sra(temp1v, i_qbitsv); \
\
multEvenvA = vec_mule(dct2v, (vec_s16_t)mf2v); \
multOddvA = vec_mulo(dct2v, (vec_s16_t)mf2v); \
multEvenvA = VEC_MULE(dct2v, (vec_s16_t)mf2v); \
multOddvA = VEC_MULO(dct2v, (vec_s16_t)mf2v); \
temp2v = vec_add(vec_sl(multEvenvA, sixteenv), multOddvA); \
temp2v = vec_add(temp2v, fv); \
temp2v = vec_sra(temp2v, i_qbitsv); \
......
......@@ -979,6 +979,9 @@ EOF
ppc64:Linux:*:*)
echo powerpc64-unknown-linux-gnu
exit ;;
ppc64le:Linux:*:*)
echo powerpc64le-unknown-linux-gnu
exit ;;
ppc:Linux:*:*)
echo powerpc-unknown-linux-gnu
exit ;;
......
......@@ -711,7 +711,7 @@ case $host_cpu in
ASFLAGS="$ASFLAGS -f elf64"
fi
;;
powerpc|powerpc64)
powerpc*)
ARCH="PPC"
if [ $asm = auto ] ; then
define HAVE_ALTIVEC
......@@ -881,11 +881,13 @@ define STACK_ALIGNMENT $stack_alignment
ASFLAGS="$ASFLAGS -DSTACK_ALIGNMENT=$stack_alignment"
# Skip the endianness check for the Intel Compiler and MSVS, as all of their supported platforms are little-endian. Each also has flags that would cause the check to fail.
CPU_ENDIAN="little-endian"
if [ $compiler = GNU ]; then
echo "int i[2] = {0x42494745,0}; double f[2] = {0x1.0656e6469616ep+102,0};" > conftest.c
$CC $CFLAGS conftest.c -c -o conftest.o 2>/dev/null || die "endian test failed"
if (${cross_prefix}strings -a conftest.o | grep -q BIGE) && (${cross_prefix}strings -a conftest.o | grep -q FPendian) ; then
define WORDS_BIGENDIAN
CPU_ENDIAN="big-endian"
elif !(${cross_prefix}strings -a conftest.o | grep -q EGIB && ${cross_prefix}strings -a conftest.o | grep -q naidnePF) ; then
die "endian test failed"
fi
......@@ -1403,6 +1405,7 @@ gpl_filters=""
cat > conftest.log <<EOF
platform: $ARCH
byte order: $CPU_ENDIAN
system: $SYS
cli: $cli
libx264: $cli_libx264
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment