Commit ca7da1ae authored by David Conrad's avatar David Conrad Committed by Fiona Glaser

GSOC merge part 2: ARM stack alignment

Neither GCC nor ARMCC supports 16-byte stack alignment, despite the fact that NEON loads require it.
These macros only work for arrays, but fortunately that covers almost all instances of stack alignment in x264.
parent 1a072a3a
......@@ -39,7 +39,7 @@ typedef struct
uint8_t *p_end;
/* aligned for memcpy_aligned starting here */
DECLARE_ALIGNED_16( int f8_bits_encoded ); // only if using x264_cabac_size_decision()
ALIGNED_16( int f8_bits_encoded ); // only if using x264_cabac_size_decision()
/* context */
uint8_t state[460];
......
......@@ -359,8 +359,8 @@ struct x264_t
const uint8_t *chroma_qp_table; /* includes both the nonlinear luma->chroma mapping and chroma_qp_offset */
DECLARE_ALIGNED_16( uint32_t nr_residual_sum[2][64] );
DECLARE_ALIGNED_16( uint16_t nr_offset[2][64] );
ALIGNED_16( uint32_t nr_residual_sum[2][64] );
ALIGNED_16( uint16_t nr_offset[2][64] );
uint32_t nr_count[2];
/* Slice header */
......@@ -413,11 +413,11 @@ struct x264_t
/* Current MB DCT coeffs */
struct
{
DECLARE_ALIGNED_16( int16_t luma16x16_dc[16] );
DECLARE_ALIGNED_16( int16_t chroma_dc[2][4] );
ALIGNED_16( int16_t luma16x16_dc[16] );
ALIGNED_16( int16_t chroma_dc[2][4] );
// FIXME share memory?
DECLARE_ALIGNED_16( int16_t luma8x8[4][64] );
DECLARE_ALIGNED_16( int16_t luma4x4[16+8][16] );
ALIGNED_16( int16_t luma8x8[4][64] );
ALIGNED_16( int16_t luma4x4[16+8][16] );
} dct;
/* MB table and cache for current frame/mb */
......@@ -494,7 +494,7 @@ struct x264_t
/* current value */
int i_type;
int i_partition;
DECLARE_ALIGNED_4( uint8_t i_sub_partition[4] );
ALIGNED_4( uint8_t i_sub_partition[4] );
int b_transform_8x8;
int i_cbp_luma;
......@@ -517,22 +517,22 @@ struct x264_t
/* space for p_fenc and p_fdec */
#define FENC_STRIDE 16
#define FDEC_STRIDE 32
DECLARE_ALIGNED_16( uint8_t fenc_buf[24*FENC_STRIDE] );
DECLARE_ALIGNED_16( uint8_t fdec_buf[27*FDEC_STRIDE] );
ALIGNED_16( uint8_t fenc_buf[24*FENC_STRIDE] );
ALIGNED_16( uint8_t fdec_buf[27*FDEC_STRIDE] );
/* i4x4 and i8x8 backup data, for skipping the encode stage when possible */
DECLARE_ALIGNED_16( uint8_t i4x4_fdec_buf[16*16] );
DECLARE_ALIGNED_16( uint8_t i8x8_fdec_buf[16*16] );
DECLARE_ALIGNED_16( int16_t i8x8_dct_buf[3][64] );
DECLARE_ALIGNED_16( int16_t i4x4_dct_buf[15][16] );
ALIGNED_16( uint8_t i4x4_fdec_buf[16*16] );
ALIGNED_16( uint8_t i8x8_fdec_buf[16*16] );
ALIGNED_16( int16_t i8x8_dct_buf[3][64] );
ALIGNED_16( int16_t i4x4_dct_buf[15][16] );
uint32_t i4x4_nnz_buf[4];
uint32_t i8x8_nnz_buf[4];
int i4x4_cbp;
int i8x8_cbp;
/* Psy trellis DCT data */
DECLARE_ALIGNED_16( int16_t fenc_dct8[4][64] );
DECLARE_ALIGNED_16( int16_t fenc_dct4[16][16] );
ALIGNED_16( int16_t fenc_dct8[4][64] );
ALIGNED_16( int16_t fenc_dct4[16][16] );
/* Psy RD SATD scores */
int fenc_satd[4][4];
......@@ -567,18 +567,18 @@ struct x264_t
uint8_t non_zero_count[X264_SCAN8_SIZE];
/* -1 if unused, -2 if unavailable */
DECLARE_ALIGNED_4( int8_t ref[2][X264_SCAN8_SIZE] );
ALIGNED_4( int8_t ref[2][X264_SCAN8_SIZE] );
/* 0 if not available */
DECLARE_ALIGNED_16( int16_t mv[2][X264_SCAN8_SIZE][2] );
DECLARE_ALIGNED_8( int16_t mvd[2][X264_SCAN8_SIZE][2] );
ALIGNED_16( int16_t mv[2][X264_SCAN8_SIZE][2] );
ALIGNED_8( int16_t mvd[2][X264_SCAN8_SIZE][2] );
/* 1 if SKIP or DIRECT. set only for B-frames + CABAC */
DECLARE_ALIGNED_4( int8_t skip[X264_SCAN8_SIZE] );
ALIGNED_4( int8_t skip[X264_SCAN8_SIZE] );
DECLARE_ALIGNED_16( int16_t direct_mv[2][X264_SCAN8_SIZE][2] );
DECLARE_ALIGNED_4( int8_t direct_ref[2][X264_SCAN8_SIZE] );
DECLARE_ALIGNED_4( int16_t pskip_mv[2] );
ALIGNED_16( int16_t direct_mv[2][X264_SCAN8_SIZE][2] );
ALIGNED_4( int8_t direct_ref[2][X264_SCAN8_SIZE] );
ALIGNED_4( int16_t pskip_mv[2] );
/* number of neighbors (top and left) that used 8x8 dct */
int i_neighbour_transform_size;
......
......@@ -764,7 +764,7 @@ void x264_frame_deblock_row( x264_t *h, int mb_y )
{\
int i_edge = (i_dir ? (mb_y <= b_interlaced) : (mb_x == 0));\
int i_qpn, i, mbn_xy, mbn_8x8, mbn_4x4;\
DECLARE_ALIGNED_4( uint8_t bS[4] ); /* filtering strength */\
ALIGNED_4( uint8_t bS[4] ); /* filtering strength */\
if( i_edge )\
i_edge+= b_8x8_transform;\
else\
......
......@@ -221,7 +221,7 @@ static int x264_mb_predict_mv_direct16x16_temporal( x264_t *h )
static int x264_mb_predict_mv_direct16x16_spatial( x264_t *h )
{
int ref[2];
DECLARE_ALIGNED_8( int16_t mv[2][2] );
ALIGNED_8( int16_t mv[2][2] );
int i_list;
int i8;
const int8_t *l1ref0 = &h->fref1[0]->ref[0][ h->mb.i_b8_xy ];
......@@ -520,8 +520,8 @@ static inline void x264_mb_mc_01xywh( x264_t *h, int x, int y, int width, int he
int mvy1 = x264_clip3( h->mb.cache.mv[1][i8][1], h->mb.mv_min[1], h->mb.mv_max[1] );
int i_mode = x264_size2pixel[height][width];
int i_stride0 = 16, i_stride1 = 16;
DECLARE_ALIGNED_16( uint8_t tmp0[16*16] );
DECLARE_ALIGNED_16( uint8_t tmp1[16*16] );
ALIGNED_ARRAY_16( uint8_t, tmp0,[16*16] );
ALIGNED_ARRAY_16( uint8_t, tmp1,[16*16] );
uint8_t *src0, *src1;
src0 = h->mc.get_ref( tmp0, &i_stride0, h->mb.pic.p_fref[0][i_ref0], h->mb.pic.i_stride[0],
......
......@@ -67,9 +67,20 @@
#else
#define DECLARE_ALIGNED( var, n ) var __attribute__((aligned(n)))
#endif
#define DECLARE_ALIGNED_16( var ) DECLARE_ALIGNED( var, 16 )
#define DECLARE_ALIGNED_8( var ) DECLARE_ALIGNED( var, 8 )
#define DECLARE_ALIGNED_4( var ) DECLARE_ALIGNED( var, 4 )
#define ALIGNED_16( var ) DECLARE_ALIGNED( var, 16 )
#define ALIGNED_8( var ) DECLARE_ALIGNED( var, 8 )
#define ALIGNED_4( var ) DECLARE_ALIGNED( var, 4 )
// current ARM compilers only maintain 8-byte stack alignment
// and cannot align stack variables to more than 8 bytes
#ifdef ARCH_ARM
#define ALIGNED_ARRAY_16( type, name, sub1, ... )\
ALIGNED_8( uint8_t name##_8 [sizeof(type sub1 __VA_ARGS__) + 8] );\
type (*name) __VA_ARGS__ = (void*)(name##_8 + ((intptr_t)name##_8 & 8))
#else
#define ALIGNED_ARRAY_16( type, name, sub1, ... )\
ALIGNED_16( type name sub1 __VA_ARGS__ )
#endif
#if defined(__GNUC__) && (__GNUC__ > 3 || __GNUC__ == 3 && __GNUC_MINOR__ > 0)
#define UNUSED __attribute__((unused))
......
......@@ -41,7 +41,7 @@
static inline void write16x4(uint8_t *dst, int dst_stride,
register vec_u8_t r0, register vec_u8_t r1,
register vec_u8_t r2, register vec_u8_t r3) {
DECLARE_ALIGNED_16(unsigned char result[64]);
ALIGNED_16(unsigned char result[64]);
uint32_t *src_int = (uint32_t *)result, *dst_int = (uint32_t *)dst;
int int_dst_stride = dst_stride/4;
......@@ -220,7 +220,7 @@ static inline vec_u8_t h264_deblock_q1(register vec_u8_t p0,
}
#define h264_loop_filter_luma_altivec(p2, p1, p0, q0, q1, q2, alpha, beta, tc0) { \
DECLARE_ALIGNED_16(unsigned char temp[16]); \
ALIGNED_16(unsigned char temp[16]); \
register vec_u8_t alphavec; \
register vec_u8_t betavec; \
register vec_u8_t mask; \
......
......@@ -303,7 +303,7 @@ static void mc_chroma_altivec_4xh( uint8_t *dst, int i_dst_stride,
int d8x = mvx & 0x07;
int d8y = mvy & 0x07;
DECLARE_ALIGNED_16( uint16_t coeff[4] );
ALIGNED_16( uint16_t coeff[4] );
coeff[0] = (8-d8x)*(8-d8y);
coeff[1] = d8x *(8-d8y);
coeff[2] = (8-d8x)*d8y;
......@@ -384,7 +384,7 @@ static void mc_chroma_altivec_8xh( uint8_t *dst, int i_dst_stride,
int d8x = mvx & 0x07;
int d8y = mvy & 0x07;
DECLARE_ALIGNED_16( uint16_t coeff[4] );
ALIGNED_16( uint16_t coeff[4] );
coeff[0] = (8-d8x)*(8-d8y);
coeff[1] = d8x *(8-d8y);
coeff[2] = (8-d8x)*d8y;
......
......@@ -33,7 +33,7 @@ static int name( uint8_t *pix1, int i_pix1, \
uint8_t *pix2, int i_pix2 ) \
{ \
int y; \
DECLARE_ALIGNED_16( int sum ); \
ALIGNED_16( int sum ); \
\
LOAD_ZERO; \
PREP_LOAD; \
......@@ -118,7 +118,7 @@ PIXEL_SAD_ALTIVEC( pixel_sad_8x8_altivec, 8, 8, 2s, 1 )
static int pixel_satd_4x4_altivec( uint8_t *pix1, int i_pix1,
uint8_t *pix2, int i_pix2 )
{
DECLARE_ALIGNED_16( int i_satd );
ALIGNED_16( int i_satd );
PREP_DIFF;
PREP_LOAD_SRC( pix1 );
......@@ -163,7 +163,7 @@ static int pixel_satd_4x4_altivec( uint8_t *pix1, int i_pix1,
static int pixel_satd_4x8_altivec( uint8_t *pix1, int i_pix1,
uint8_t *pix2, int i_pix2 )
{
DECLARE_ALIGNED_16( int i_satd );
ALIGNED_16( int i_satd );
PREP_DIFF;
vec_s16_t diff0v, diff1v, diff2v, diff3v;
......@@ -217,7 +217,7 @@ static int pixel_satd_4x8_altivec( uint8_t *pix1, int i_pix1,
static int pixel_satd_8x4_altivec( uint8_t *pix1, int i_pix1,
uint8_t *pix2, int i_pix2 )
{
DECLARE_ALIGNED_16( int i_satd );
ALIGNED_16( int i_satd );
PREP_DIFF;
vec_s16_t diff0v, diff1v, diff2v, diff3v,
......@@ -271,7 +271,7 @@ static int pixel_satd_8x4_altivec( uint8_t *pix1, int i_pix1,
static int pixel_satd_8x8_altivec( uint8_t *pix1, int i_pix1,
uint8_t *pix2, int i_pix2 )
{
DECLARE_ALIGNED_16( int i_satd );
ALIGNED_16( int i_satd );
PREP_DIFF;
vec_s16_t diff0v, diff1v, diff2v, diff3v,
......@@ -331,7 +331,7 @@ static int pixel_satd_8x8_altivec( uint8_t *pix1, int i_pix1,
static int pixel_satd_8x16_altivec( uint8_t *pix1, int i_pix1,
uint8_t *pix2, int i_pix2 )
{
DECLARE_ALIGNED_16( int i_satd );
ALIGNED_16( int i_satd );
PREP_DIFF;
vec_s16_t diff0v, diff1v, diff2v, diff3v,
......@@ -415,7 +415,7 @@ static int pixel_satd_8x16_altivec( uint8_t *pix1, int i_pix1,
static int pixel_satd_16x8_altivec( uint8_t *pix1, int i_pix1,
uint8_t *pix2, int i_pix2 )
{
DECLARE_ALIGNED_16( int i_satd );
ALIGNED_16( int i_satd );
LOAD_ZERO;
PREP_LOAD;
......@@ -499,7 +499,7 @@ static int pixel_satd_16x8_altivec( uint8_t *pix1, int i_pix1,
static int pixel_satd_16x16_altivec( uint8_t *pix1, int i_pix1,
uint8_t *pix2, int i_pix2 )
{
DECLARE_ALIGNED_16( int i_satd );
ALIGNED_16( int i_satd );
LOAD_ZERO;
PREP_LOAD;
......@@ -630,10 +630,10 @@ static void pixel_sad_x4_16x16_altivec( uint8_t *fenc,
uint8_t *pix2, uint8_t *pix3,
int i_stride, int scores[4] )
{
DECLARE_ALIGNED_16( int sum0 );
DECLARE_ALIGNED_16( int sum1 );
DECLARE_ALIGNED_16( int sum2 );
DECLARE_ALIGNED_16( int sum3 );
ALIGNED_16( int sum0 );
ALIGNED_16( int sum1 );
ALIGNED_16( int sum2 );
ALIGNED_16( int sum3 );
int y;
LOAD_ZERO;
......@@ -751,9 +751,9 @@ static void pixel_sad_x3_16x16_altivec( uint8_t *fenc, uint8_t *pix0,
uint8_t *pix1, uint8_t *pix2,
int i_stride, int scores[3] )
{
DECLARE_ALIGNED_16( int sum0 );
DECLARE_ALIGNED_16( int sum1 );
DECLARE_ALIGNED_16( int sum2 );
ALIGNED_16( int sum0 );
ALIGNED_16( int sum1 );
ALIGNED_16( int sum2 );
int y;
LOAD_ZERO;
......@@ -846,10 +846,10 @@ static void pixel_sad_x3_16x16_altivec( uint8_t *fenc, uint8_t *pix0,
static void pixel_sad_x4_16x8_altivec( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, uint8_t *pix2, uint8_t *pix3, int i_stride, int scores[4] )
{
DECLARE_ALIGNED_16( int sum0 );
DECLARE_ALIGNED_16( int sum1 );
DECLARE_ALIGNED_16( int sum2 );
DECLARE_ALIGNED_16( int sum3 );
ALIGNED_16( int sum0 );
ALIGNED_16( int sum1 );
ALIGNED_16( int sum2 );
ALIGNED_16( int sum3 );
int y;
LOAD_ZERO;
......@@ -964,9 +964,9 @@ static void pixel_sad_x3_16x8_altivec( uint8_t *fenc, uint8_t *pix0,
uint8_t *pix1, uint8_t *pix2,
int i_stride, int scores[3] )
{
DECLARE_ALIGNED_16( int sum0 );
DECLARE_ALIGNED_16( int sum1 );
DECLARE_ALIGNED_16( int sum2 );
ALIGNED_16( int sum0 );
ALIGNED_16( int sum1 );
ALIGNED_16( int sum2 );
int y;
LOAD_ZERO;
......@@ -1062,10 +1062,10 @@ static void pixel_sad_x4_8x16_altivec( uint8_t *fenc,
uint8_t *pix2, uint8_t *pix3,
int i_stride, int scores[4] )
{
DECLARE_ALIGNED_16( int sum0 );
DECLARE_ALIGNED_16( int sum1 );
DECLARE_ALIGNED_16( int sum2 );
DECLARE_ALIGNED_16( int sum3 );
ALIGNED_16( int sum0 );
ALIGNED_16( int sum1 );
ALIGNED_16( int sum2 );
ALIGNED_16( int sum3 );
int y;
LOAD_ZERO;
......@@ -1183,9 +1183,9 @@ static void pixel_sad_x3_8x16_altivec( uint8_t *fenc, uint8_t *pix0,
uint8_t *pix1, uint8_t *pix2,
int i_stride, int scores[3] )
{
DECLARE_ALIGNED_16( int sum0 );
DECLARE_ALIGNED_16( int sum1 );
DECLARE_ALIGNED_16( int sum2 );
ALIGNED_16( int sum0 );
ALIGNED_16( int sum1 );
ALIGNED_16( int sum2 );
int y;
LOAD_ZERO;
......@@ -1283,10 +1283,10 @@ static void pixel_sad_x4_8x8_altivec( uint8_t *fenc,
uint8_t *pix2, uint8_t *pix3,
int i_stride, int scores[4] )
{
DECLARE_ALIGNED_16( int sum0 );
DECLARE_ALIGNED_16( int sum1 );
DECLARE_ALIGNED_16( int sum2 );
DECLARE_ALIGNED_16( int sum3 );
ALIGNED_16( int sum0 );
ALIGNED_16( int sum1 );
ALIGNED_16( int sum2 );
ALIGNED_16( int sum3 );
int y;
LOAD_ZERO;
......@@ -1404,9 +1404,9 @@ static void pixel_sad_x3_8x8_altivec( uint8_t *fenc, uint8_t *pix0,
uint8_t *pix1, uint8_t *pix2,
int i_stride, int scores[3] )
{
DECLARE_ALIGNED_16( int sum0 );
DECLARE_ALIGNED_16( int sum1 );
DECLARE_ALIGNED_16( int sum2 );
ALIGNED_16( int sum0 );
ALIGNED_16( int sum1 );
ALIGNED_16( int sum2 );
int y;
LOAD_ZERO;
......@@ -1506,7 +1506,7 @@ static void pixel_sad_x3_8x8_altivec( uint8_t *fenc, uint8_t *pix0,
static int pixel_ssd_16x16_altivec ( uint8_t *pix1, int i_stride_pix1,
uint8_t *pix2, int i_stride_pix2)
{
DECLARE_ALIGNED_16( int sum );
ALIGNED_16( int sum );
int y;
LOAD_ZERO;
......@@ -1586,7 +1586,7 @@ static int pixel_ssd_16x16_altivec ( uint8_t *pix1, int i_stride_pix1,
static int pixel_ssd_8x8_altivec ( uint8_t *pix1, int i_stride_pix1,
uint8_t *pix2, int i_stride_pix2)
{
DECLARE_ALIGNED_16( int sum );
ALIGNED_16( int sum );
int y;
LOAD_ZERO;
......@@ -1638,8 +1638,8 @@ static int pixel_ssd_8x8_altivec ( uint8_t *pix1, int i_stride_pix1,
****************************************************************************/
static int x264_pixel_var_16x16_altivec( uint8_t *pix, int i_stride )
{
DECLARE_ALIGNED_16(uint32_t sum_tab[4]);
DECLARE_ALIGNED_16(uint32_t sqr_tab[4]);
ALIGNED_16(uint32_t sum_tab[4]);
ALIGNED_16(uint32_t sqr_tab[4]);
LOAD_ZERO;
vec_u32_t sqr_v = zero_u32v;
......@@ -1667,8 +1667,8 @@ static int x264_pixel_var_16x16_altivec( uint8_t *pix, int i_stride )
static int x264_pixel_var_8x8_altivec( uint8_t *pix, int i_stride )
{
DECLARE_ALIGNED_16(uint32_t sum_tab[4]);
DECLARE_ALIGNED_16(uint32_t sqr_tab[4]);
ALIGNED_16(uint32_t sum_tab[4]);
ALIGNED_16(uint32_t sqr_tab[4]);
LOAD_ZERO;
vec_u32_t sqr_v = zero_u32v;
......@@ -1870,8 +1870,8 @@ static int pixel_sa8d_16x16_altivec( uint8_t *pix1, int i_pix1,
static uint64_t pixel_hadamard_ac_altivec( uint8_t *pix, int stride, const vec_u8_t perm )
{
DECLARE_ALIGNED_16( int32_t sum4_tab[4] );
DECLARE_ALIGNED_16( int32_t sum8_tab[4] );
ALIGNED_16( int32_t sum4_tab[4] );
ALIGNED_16( int32_t sum8_tab[4] );
LOAD_ZERO;
VEC_LOAD_HIGH( pix, 0 );
......@@ -1937,7 +1937,7 @@ static uint64_t pixel_hadamard_ac_altivec( uint8_t *pix, int stride, const vec_u
int sum8 = sum8_tab[3];
DECLARE_ALIGNED_16( int16_t tmp0_4_tab[8] );
ALIGNED_16( int16_t tmp0_4_tab[8] );
vec_ste(vec_add(pix16_d0, pix16_d4), 0, tmp0_4_tab);
sum4 -= tmp0_4_tab[0];
......@@ -1997,7 +1997,7 @@ static void ssim_4x4x2_core_altivec( const uint8_t *pix1, int stride1,
const uint8_t *pix2, int stride2,
int sums[2][4] )
{
DECLARE_ALIGNED_16( int temp[4] );
ALIGNED_16( int temp[4] );
int y;
vec_u8_t pix1v, pix2v;
......
......@@ -75,9 +75,9 @@ extern void predict_16x16_dc_left_core_sse2( uint8_t *src, int i_dc_left );
extern void predict_16x16_v_sse2( uint8_t *src );
extern void predict_16x16_p_core_sse2( uint8_t *src, int i00, int b, int c );
DECLARE_ALIGNED_8( static const int8_t pb_12345678[8] ) = {1,2,3,4,5,6,7,8};
DECLARE_ALIGNED_8( static const int8_t pb_m87654321[8] ) = {-8,-7,-6,-5,-4,-3,-2,-1};
DECLARE_ALIGNED_8( static const int8_t pb_m32101234[8] ) = {-3,-2,-1,0,1,2,3,4};
ALIGNED_8( static const int8_t pb_12345678[8] ) = {1,2,3,4,5,6,7,8};
ALIGNED_8( static const int8_t pb_m87654321[8] ) = {-8,-7,-6,-5,-4,-3,-2,-1};
ALIGNED_8( static const int8_t pb_m32101234[8] ) = {-3,-2,-1,0,1,2,3,4};
#define PREDICT_P_SUM(j,i)\
H += i * ( src[j+i - FDEC_STRIDE ] - src[j-i - FDEC_STRIDE ] );\
......@@ -332,7 +332,7 @@ void x264_intra_sa8d_x3_8x8_##cpu( uint8_t *fenc, uint8_t edge[33], int res[3] )
PREDICT_8x8_LOAD_TOP\
PREDICT_8x8_LOAD_LEFT\
int t;\
DECLARE_ALIGNED_16( int16_t sa8d_1d[2][8] );\
ALIGNED_16( int16_t sa8d_1d[2][8] );\
SUMSUB(l0,l4,l1,l5,l2,l6,l3,l7);\
SUMSUB(l0,l2,l1,l3,l4,l6,l5,l7);\
SUMSUB(l0,l1,l2,l3,l4,l5,l6,l7);\
......
......@@ -47,7 +47,7 @@ typedef struct
/* 8x8 */
int i_cost8x8;
/* [ref][0] is 16x16 mv, [ref][1..4] are 8x8 mv from partition [0..3] */
DECLARE_ALIGNED_4( int16_t mvc[32][5][2] );
ALIGNED_4( int16_t mvc[32][5][2] );
x264_me_t me8x8[4];
/* Sub 4x4 */
......@@ -540,9 +540,9 @@ static void predict_4x4_mode_available( unsigned int i_neighbour,
/* For trellis=2, we need to do this for both sizes of DCT, for trellis=1 we only need to use it on the chosen mode. */
static void inline x264_psy_trellis_init( x264_t *h, int do_both_dct )
{
DECLARE_ALIGNED_16( int16_t dct8x8[4][8][8] );
DECLARE_ALIGNED_16( int16_t dct4x4[16][4][4] );
DECLARE_ALIGNED_16( static uint8_t zero[16*FDEC_STRIDE] ) = {0};
ALIGNED_ARRAY_16( int16_t, dct8x8,[4],[8][8] );
ALIGNED_ARRAY_16( int16_t, dct4x4,[16],[4][4] );
ALIGNED_16( static uint8_t zero[16*FDEC_STRIDE] ) = {0};
int i;
if( do_both_dct || h->mb.b_transform_8x8 )
......@@ -562,7 +562,7 @@ static void inline x264_psy_trellis_init( x264_t *h, int do_both_dct )
/* Pre-calculate fenc satd scores for psy RD, minus DC coefficients */
static inline void x264_mb_cache_fenc_satd( x264_t *h )
{
DECLARE_ALIGNED_16( static uint8_t zero[16] ) = {0};
ALIGNED_16( static uint8_t zero[16] ) = {0};
uint8_t *fenc;
int x, y, satd_sum = 0, sa8d_sum = 0;
if( h->param.analyse.i_trellis == 2 && h->mb.i_psy_trellis )
......@@ -719,7 +719,7 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_
/* 8x8 prediction selection */
if( flags & X264_ANALYSE_I8x8 )
{
DECLARE_ALIGNED_16( uint8_t edge[33] );
ALIGNED_ARRAY_16( uint8_t, edge,[33] );
x264_pixel_cmp_t sa8d = (h->pixf.mbcmp[0] == h->pixf.satd[0]) ? h->pixf.sa8d[PIXEL_8x8] : h->pixf.mbcmp[PIXEL_8x8];
int i_satd_thresh = a->i_mbrd ? COST_MAX : X264_MIN( i_satd_inter, a->i_satd_i16x16 );
int i_cost = 0;
......@@ -1044,7 +1044,7 @@ static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
}
else if( h->mb.i_type == I_8x8 )
{
DECLARE_ALIGNED_16( uint8_t edge[33] );
ALIGNED_ARRAY_16( uint8_t, edge,[33] );
for( idx = 0; idx < 4; idx++ )
{
uint64_t pels_h = 0;
......@@ -1125,7 +1125,7 @@ static void x264_mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a )
{
x264_me_t m;
int i_ref, i_mvc;
DECLARE_ALIGNED_4( int16_t mvc[8][2] );
ALIGNED_4( int16_t mvc[8][2] );
int i_halfpel_thresh = INT_MAX;
int *p_halfpel_thresh = h->mb.pic.i_fref[0]>1 ? &i_halfpel_thresh : NULL;
......@@ -1322,7 +1322,7 @@ static void x264_mb_analyse_inter_p16x8( x264_t *h, x264_mb_analysis_t *a )
{
x264_me_t m;
uint8_t **p_fenc = h->mb.pic.p_fenc;
DECLARE_ALIGNED_4( int16_t mvc[3][2] );
ALIGNED_4( int16_t mvc[3][2] );
int i, j;
/* XXX Needed for x264_mb_predict_mv */
......@@ -1372,7 +1372,7 @@ static void x264_mb_analyse_inter_p8x16( x264_t *h, x264_mb_analysis_t *a )
{
x264_me_t m;
uint8_t **p_fenc = h->mb.pic.p_fenc;
DECLARE_ALIGNED_4( int16_t mvc[3][2] );
ALIGNED_4( int16_t mvc[3][2] );
int i, j;
/* XXX Needed for x264_mb_predict_mv */
......@@ -1419,7 +1419,7 @@ static void x264_mb_analyse_inter_p8x16( x264_t *h, x264_mb_analysis_t *a )
static int x264_mb_analyse_inter_p4x4_chroma( x264_t *h, x264_mb_analysis_t *a, uint8_t **p_fref, int i8x8, int pixel )
{
DECLARE_ALIGNED_8( uint8_t pix1[16*8] );
ALIGNED_8( uint8_t pix1[16*8] );
uint8_t *pix2 = pix1+8;
const int i_stride = h->mb.pic.i_stride[1];
const int or = 4*(i8x8&1) + 2*(i8x8&2)*i_stride;
......@@ -1595,14 +1595,14 @@ static void x264_mb_analyse_inter_direct( x264_t *h, x264_mb_analysis_t *a )
static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
{
DECLARE_ALIGNED_16( uint8_t pix0[16*16] );
DECLARE_ALIGNED_16( uint8_t pix1[16*16] );
ALIGNED_ARRAY_16( uint8_t, pix0,[16*16] );
ALIGNED_ARRAY_16( uint8_t, pix1,[16*16] );
uint8_t *src0, *src1;
int stride0 = 16, stride1 = 16;
x264_me_t m;
int i_ref, i_mvc;
DECLARE_ALIGNED_4( int16_t mvc[9][2] );
ALIGNED_4( int16_t mvc[9][2] );
int i_halfpel_thresh = INT_MAX;
int *p_halfpel_thresh = h->mb.pic.i_fref[0]>1 ? &i_halfpel_thresh : NULL;
......@@ -1779,7 +1779,7 @@ static void x264_mb_analyse_inter_b8x8( x264_t *h, x264_mb_analysis_t *a )
uint8_t **p_fref[2] =
{ h->mb.pic.p_fref[0][a->l0.i_ref],
h->mb.pic.p_fref[1][a->l1.i_ref] };
DECLARE_ALIGNED_8( uint8_t pix[2][8*8] );
ALIGNED_8( uint8_t pix[2][8*8] );
int i, l;
/* XXX Needed for x264_mb_predict_mv */
......@@ -1844,8 +1844,8 @@ static void x264_mb_analyse_inter_b16x8( x264_t *h, x264_mb_analysis_t *a )
uint8_t **p_fref[2] =
{ h->mb.pic.p_fref[0][a->l0.i_ref],
h->mb.pic.p_fref[1][a->l1.i_ref] };
DECLARE_ALIGNED_16( uint8_t pix[2][16*8] );
DECLARE_ALIGNED_4( int16_t mvc[2][2] );
ALIGNED_ARRAY_16( uint8_t, pix,[2],[16*8] );
ALIGNED_4( int16_t mvc[2][2] );
int i, l;
h->mb.i_partition = D_16x8;
......@@ -1914,8 +1914,8 @@ static void x264_mb_analyse_inter_b8x16( x264_t *h, x264_mb_analysis_t *a )
uint8_t **p_fref[2] =
{ h->mb.pic.p_fref[0][a->l0.i_ref],
h->mb.pic.p_fref[1][a->l1.i_ref] };
DECLARE_ALIGNED_8( uint8_t pix[2][8*16] );
DECLARE_ALIGNED_4( int16_t mvc[2][2] );
ALIGNED_8( uint8_t pix[2][8*16] );
ALIGNED_4( int16_t mvc[2][2] );
int i, l;
h->mb.i_partition = D_8x16;
......
......@@ -412,7 +412,7 @@ static inline void x264_cabac_mb_mvd_cpn( x264_t *h, x264_cabac_t *cb, int i_lis
static NOINLINE uint32_t x264_cabac_mb_mvd( x264_t *h, x264_cabac_t *cb, int i_list, int idx, int width )
{
DECLARE_ALIGNED_4( int16_t mvp[2] );
ALIGNED_4( int16_t mvp[2] );
uint32_t amvd;
int mdx, mdy;
......
......@@ -227,7 +227,7 @@ static void cavlc_qp_delta( x264_t *h, bs_t *s )
static void cavlc_mb_mvd( x264_t *h, bs_t *s, int i_list, int idx, int width )
{
DECLARE_ALIGNED_4( int16_t mvp[2] );
ALIGNED_4( int16_t mvp[2] );
x264_mb_predict_mv( h, i_list, idx, width, mvp );
bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[idx]][0] - mvp[0] );
bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[idx]][1] - mvp[1] );
......
......@@ -130,7 +130,7 @@ void x264_mb_encode_i4x4( x264_t *h, int idx, int i_qp )
int nz;
uint8_t *p_src = &h->mb.pic.p_fenc[0][block_idx_xy_fenc[idx]];
uint8_t *p_dst = &h->mb.pic.p_fdec[0][block_idx_xy_fdec[idx]];
DECLARE_ALIGNED_16( int16_t dct4x4[4][4] );
ALIGNED_ARRAY_16( int16_t, dct4x4,[4],[4] );
if( h->mb.b_lossless )
{
......@@ -166,7 +166,7 @@ void x264_mb_encode_i8x8( x264_t *h, int idx, int i_qp )
int nz;
uint8_t *p_src = &h->mb.pic.p_fenc[0][x+y*FENC_STRIDE];
uint8_t *p_dst = &h->mb.pic.p_fdec[0][x+y*FDEC_STRIDE];
DECLARE_ALIGNED_16( int16_t dct8x8[8][8] );
ALIGNED_ARRAY_16( int16_t, dct8x8,[8],[8] );
if( h->mb.b_lossless )
{
......@@ -196,8 +196,8 @@ static void x264_mb_encode_i16x16( x264_t *h, int i_qp )
uint8_t *p_src = h->mb.pic.p_fenc[0];
uint8_t *p_dst = h->mb.pic.p_fdec[0];
DECLARE_ALIGNED_16( int16_t dct4x4[16][4][4] );
DECLARE_ALIGNED_16( int16_t dct_dc4x4[4][4] );
ALIGNED_ARRAY_16( int16_t, dct4x4,[16],[4][4] );
ALIGNED_ARRAY_16( int16_t, dct_dc4x4,[4],[4] );
int i, nz;
int b_decimate = h->sh.i_type == SLICE_TYPE_B || (h->param.analyse.b_dct_decimate && h->sh.i_type == SLICE_TYPE_P);
......@@ -280,7 +280,7 @@ void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qp )
{
int i, ch, nz, nz_dc;
int b_decimate = b_inter && (h->sh.i_type == SLICE_TYPE_B || h->param.analyse.b_dct_decimate);
DECLARE_ALIGNED_16( int16_t dct2x2[2][2] );
ALIGNED_ARRAY_16( int16_t, dct2x2,[2],[2] );
h->mb.i_cbp_chroma = 0;
/* Early termination: check variance of chroma residual before encoding.
......@@ -336,7 +336,7 @@ void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qp )
int i_decimate_score = 0;
int nz_ac = 0;
DECLARE_ALIGNED_16( int16_t dct4x4[4][4][4] );
ALIGNED_ARRAY_16( int16_t, dct4x4,[4],[4][4] );
if( h->mb.b_lossless )
{
......@@ -579,7 +579,7 @@ void x264_macroblock_encode( x264_t *h )
}
else if( h->mb.i_type == I_8x8 )
{
DECLARE_ALIGNED_16( uint8_t edge[33] );
ALIGNED_ARRAY_16( uint8_t, edge,[33] );
h->mb.b_transform_8x8 = 1;
/* If we already encoded 3 of the 4 i8x8 blocks, we don't have to do them again. */
if( h->mb.i_skip_intra )
......@@ -674,7 +674,7 @@ void x264_macroblock_encode( x264_t *h )
}
else if( h->mb.b_transform_8x8 )
{
DECLARE_ALIGNED_16( int16_t dct8x8[4][8][8] );
ALIGNED_ARRAY_16( int16_t, dct8x8,[4],[8][8] );
b_decimate &= !h->mb.b_trellis; // 8x8 trellis is inherently optimal decimation
h->dctf.sub16x16_dct8( dct8x8, h->mb.pic.p_fenc[0], h->mb.pic.p_fdec[0] );
h->nr_count[1] += h->mb.b_noise_reduction * 4;
......@@ -725,7 +725,7 @@ void x264_macroblock_encode( x264_t *h )
}
else
{
DECLARE_ALIGNED_16( int16_t dct4x4[16][4][4] );
ALIGNED_ARRAY_16( int16_t, dct4x4,[16],[4][4] );
h->dctf.sub16x16_dct( dct4x4, h->mb.pic.p_fenc[0], h->mb.pic.p_fdec[0] );
h->nr_count[0] += h->mb.b_noise_reduction * 16;
......@@ -844,9 +844,9 @@ void x264_macroblock_encode( x264_t *h )
*****************************************************************************/
int x264_macroblock_probe_skip( x264_t *h, int b_bidir )
{
DECLARE_ALIGNED_16( int16_t dct4x4[4][4][4] );
DECLARE_ALIGNED_16( int16_t dct2x2[2][2] );
DECLARE_ALIGNED_16( int16_t dctscan[16] );
ALIGNED_ARRAY_16( int16_t, dct4x4,[4],[4][4] );
ALIGNED_ARRAY_16( int16_t, dct2x2,[2],[2] );
ALIGNED_ARRAY_16( int16_t, dctscan,[16] );
int i_qp = h->mb.i_qp;
int mvp[2];
......@@ -1012,7 +1012,7 @@ void x264_macroblock_encode_p8x8( x264_t *h, int i8 )
{
if( h->mb.b_transform_8x8 )
{
DECLARE_ALIGNED_16( int16_t dct8x8[8][8] );
ALIGNED_ARRAY_16( int16_t, dct8x8,[8],[8] );
h->dctf.sub8x8_dct8( dct8x8, p_fenc, p_fdec );
nnz8x8 = x264_quant_8x8( h, dct8x8, i_qp, 0, i8 );
if( nnz8x8 )
......@@ -1038,7 +1038,7 @@ void x264_macroblock_encode_p8x8( x264_t *h, int i8 )
{
int i4;
int i_decimate_8x8 = 0;
DECLARE_ALIGNED_16( int16_t dct4x4[4][4][4] );
ALIGNED_ARRAY_16( int16_t, dct4x4,[4],[4][4] );
h->dctf.sub8x8_dct( dct4x4, p_fenc, p_fdec );
for( i4 = 0; i4 < 4; i4++ )
{
......@@ -1067,7 +1067,7 @@ void x264_macroblock_encode_p8x8( x264_t *h, int i8 )
for( ch = 0; ch < 2; ch++ )
{
DECLARE_ALIGNED_16( int16_t dct4x4[4][4] );
ALIGNED_ARRAY_16( int16_t, dct4x4,[4],[4] );
p_fenc = h->mb.pic.p_fenc[1+ch] + (i8&1)*4 + (i8>>1)*4*FENC_STRIDE;
p_fdec = h->mb.pic.p_fdec[1+ch] + (i8&1)*4 + (i8>>1)*4*FDEC_STRIDE;
......@@ -1115,7 +1115,7 @@ void x264_macroblock_encode_p4x4( x264_t *h, int i4 )
}
else