Commit d13b4c3a authored by Henrik Gramner

osdep: Rework alignment macros

Drop ALIGNED_N and ALIGNED_ARRAY_N in favor of using explicit alignment.

This will allow us to increase the native alignment without unnecessarily
increasing the alignment of everything that's currently 32-byte aligned.
parent 5840e200
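Editor's note: as a quick illustration of what "explicit alignment" means here, below is a minimal standalone C sketch built on the GCC-style DECLARE_ALIGNED branch visible in the osdep.h hunk further down. The struct name dct_bufs and the dctcoef typedef are simplifications for this sketch (dctcoef is int16_t only in 8-bit-depth builds); the real osdep.h also handles MSVC, ARM, and stack-alignment fallbacks, which are omitted here.

/* Editor's illustration (not part of the commit): explicit per-declaration
 * alignment instead of a shared "native" ALIGNED_N alias. */
#include <stdint.h>
#include <stdio.h>

#define DECLARE_ALIGNED( var, n ) var __attribute__((aligned(n)))
#define ALIGNED_16( var ) DECLARE_ALIGNED( var, 16 )
#define ALIGNED_32( var ) DECLARE_ALIGNED( var, 32 )

typedef int16_t dctcoef; /* as in 8-bit-depth builds; simplification */

struct dct_bufs /* hypothetical container; member names taken from the diff */
{
    ALIGNED_32( dctcoef luma8x8[12][64] );  /* explicitly 32-byte aligned */
    ALIGNED_16( dctcoef chroma_dc[2][8] );  /* stays 16-byte aligned even if
                                               the native alignment grows */
};

int main( void )
{
    static struct dct_bufs d;
    printf( "luma8x8 mod 32: %lu\n", (unsigned long)((uintptr_t)d.luma8x8 % 32) );
    printf( "chroma_dc mod 16: %lu\n", (unsigned long)((uintptr_t)d.chroma_dc % 16) );
    return 0;
}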
@@ -635,11 +635,11 @@ struct x264_t
 /* Current MB DCT coeffs */
 struct
 {
-ALIGNED_N( dctcoef luma16x16_dc[3][16] );
+ALIGNED_32( dctcoef luma16x16_dc[3][16] );
 ALIGNED_16( dctcoef chroma_dc[2][8] );
 // FIXME share memory?
-ALIGNED_N( dctcoef luma8x8[12][64] );
-ALIGNED_N( dctcoef luma4x4[16*3][16] );
+ALIGNED_32( dctcoef luma8x8[12][64] );
+ALIGNED_32( dctcoef luma4x4[16*3][16] );
 } dct;
 /* MB table and cache for current frame/mb */
@@ -778,8 +778,8 @@ struct x264_t
 /* space for p_fenc and p_fdec */
 #define FENC_STRIDE 16
 #define FDEC_STRIDE 32
-ALIGNED_N( pixel fenc_buf[48*FENC_STRIDE] );
-ALIGNED_N( pixel fdec_buf[52*FDEC_STRIDE] );
+ALIGNED_32( pixel fenc_buf[48*FENC_STRIDE] );
+ALIGNED_32( pixel fdec_buf[52*FDEC_STRIDE] );
 /* i4x4 and i8x8 backup data, for skipping the encode stage when possible */
 ALIGNED_16( pixel i4x4_fdec_buf[16*16] );
@@ -796,8 +796,8 @@ struct x264_t
 ALIGNED_16( dctcoef fenc_dct4[16][16] );
 /* Psy RD SATD/SA8D scores cache */
-ALIGNED_N( uint64_t fenc_hadamard_cache[9] );
-ALIGNED_N( uint32_t fenc_satd_cache[32] );
+ALIGNED_32( uint64_t fenc_hadamard_cache[9] );
+ALIGNED_32( uint32_t fenc_satd_cache[32] );
 /* pointer over mb of the frame to be compressed */
 pixel *p_fenc[3]; /* y,u,v */
@@ -930,8 +930,8 @@ struct x264_t
 uint32_t (*nr_residual_sum)[64];
 uint32_t *nr_count;
-ALIGNED_N( udctcoef nr_offset_denoise[4][64] );
-ALIGNED_N( uint32_t nr_residual_sum_buf[2][4][64] );
+ALIGNED_32( udctcoef nr_offset_denoise[4][64] );
+ALIGNED_32( uint32_t nr_residual_sum_buf[2][4][64] );
 uint32_t nr_count_buf[2][4];
 uint8_t luma2chroma_pixel[7]; /* Subsampled pixel size */
@@ -121,8 +121,8 @@ static NOINLINE void x264_mb_mc_01xywh( x264_t *h, int x, int y, int width, int
 int mvy1 = x264_clip3( h->mb.cache.mv[1][i8][1], h->mb.mv_min[1], h->mb.mv_max[1] ) + 4*4*y;
 int i_mode = x264_size2pixel[height][width];
 intptr_t i_stride0 = 16, i_stride1 = 16;
-ALIGNED_ARRAY_N( pixel, tmp0,[16*16] );
-ALIGNED_ARRAY_N( pixel, tmp1,[16*16] );
+ALIGNED_ARRAY_32( pixel, tmp0,[16*16] );
+ALIGNED_ARRAY_32( pixel, tmp1,[16*16] );
 pixel *src0, *src1;
 MC_LUMA_BI( 0 );
@@ -108,10 +108,10 @@ int x264_is_pipe( const char *path );
 #else
 #define DECLARE_ALIGNED( var, n ) var __attribute__((aligned(n)))
 #endif
-#define ALIGNED_32( var ) DECLARE_ALIGNED( var, 32 )
-#define ALIGNED_16( var ) DECLARE_ALIGNED( var, 16 )
-#define ALIGNED_8( var ) DECLARE_ALIGNED( var, 8 )
 #define ALIGNED_4( var ) DECLARE_ALIGNED( var, 4 )
+#define ALIGNED_8( var ) DECLARE_ALIGNED( var, 8 )
+#define ALIGNED_16( var ) DECLARE_ALIGNED( var, 16 )
 // ARM compiliers don't reliably align stack variables
 // - EABI requires only 8 byte stack alignment to be maintained
@@ -127,37 +127,31 @@ int x264_is_pipe( const char *path );
 #if ARCH_ARM && SYS_MACOSX
 #define ALIGNED_ARRAY_8( ... ) ALIGNED_ARRAY_EMU( 7, __VA_ARGS__ )
 #else
-#define ALIGNED_ARRAY_8( type, name, sub1, ... )\
-ALIGNED_8( type name sub1 __VA_ARGS__ )
+#define ALIGNED_ARRAY_8( type, name, sub1, ... ) ALIGNED_8( type name sub1 __VA_ARGS__ )
 #endif
 #if ARCH_ARM
 #define ALIGNED_ARRAY_16( ... ) ALIGNED_ARRAY_EMU( 15, __VA_ARGS__ )
 #else
-#define ALIGNED_ARRAY_16( type, name, sub1, ... )\
-ALIGNED_16( type name sub1 __VA_ARGS__ )
+#define ALIGNED_ARRAY_16( type, name, sub1, ... ) ALIGNED_16( type name sub1 __VA_ARGS__ )
 #endif
 #define EXPAND(x) x
+#if ARCH_X86 || ARCH_X86_64
+#define NATIVE_ALIGN 32
+#define ALIGNED_32( var ) DECLARE_ALIGNED( var, 32 )
 #if STACK_ALIGNMENT >= 32
-#define ALIGNED_ARRAY_32( type, name, sub1, ... )\
-ALIGNED_32( type name sub1 __VA_ARGS__ )
+#define ALIGNED_ARRAY_32( type, name, sub1, ... ) ALIGNED_32( type name sub1 __VA_ARGS__ )
 #else
 #define ALIGNED_ARRAY_32( ... ) EXPAND( ALIGNED_ARRAY_EMU( 31, __VA_ARGS__ ) )
 #endif
 #define ALIGNED_ARRAY_64( ... ) EXPAND( ALIGNED_ARRAY_EMU( 63, __VA_ARGS__ ) )
-/* For AVX2 */
-#if ARCH_X86 || ARCH_X86_64
-#define NATIVE_ALIGN 32
-#define ALIGNED_N ALIGNED_32
-#define ALIGNED_ARRAY_N ALIGNED_ARRAY_32
 #else
 #define NATIVE_ALIGN 16
-#define ALIGNED_N ALIGNED_16
-#define ALIGNED_ARRAY_N ALIGNED_ARRAY_16
+#define ALIGNED_32 ALIGNED_16
+#define ALIGNED_ARRAY_32 ALIGNED_ARRAY_16
+#define ALIGNED_ARRAY_64 ALIGNED_ARRAY_16
 #endif
 #if defined(__GNUC__) && (__GNUC__ > 3 || __GNUC__ == 3 && __GNUC_MINOR__ > 0)
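Editor's note: when STACK_ALIGNMENT is not high enough, the ALIGNED_ARRAY_* macros above fall back to ALIGNED_ARRAY_EMU, whose body lies outside this hunk. As a rough idea of the underlying technique only (an editor's sketch with hypothetical names, not x264's actual macro): over-allocate a raw buffer and round the pointer up to the requested boundary.

/* Editor's sketch of the over-allocate-and-round-up trick behind emulated
 * stack alignment; ALIGN_PTR_UP and the buffer sizes are hypothetical. */
#include <stdint.h>
#include <stdio.h>

#define ALIGN_PTR_UP( p, align ) \
    ((void *)(((uintptr_t)(p) + ((align) - 1)) & ~(uintptr_t)((align) - 1)))

int main( void )
{
    uint8_t raw[16 * 16 * sizeof(int16_t) + 31]; /* +31 leaves room to realign */
    int16_t *dct = ALIGN_PTR_UP( raw, 32 );      /* 32-byte-aligned view of raw */
    printf( "aligned to 32: %d\n", (uintptr_t)dct % 32 == 0 );
    return 0;
}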
@@ -1735,7 +1735,7 @@ static void x264_mb_analyse_inter_p8x16( x264_t *h, x264_mb_analysis_t *a, int i
 static ALWAYS_INLINE int x264_mb_analyse_inter_p4x4_chroma_internal( x264_t *h, x264_mb_analysis_t *a,
 pixel **p_fref, int i8x8, int size, int chroma )
 {
-ALIGNED_ARRAY_N( pixel, pix1,[16*16] );
+ALIGNED_ARRAY_32( pixel, pix1,[16*16] );
 pixel *pix2 = pix1+8;
 int i_stride = h->mb.pic.i_stride[1];
 int chroma_h_shift = chroma <= CHROMA_422;
@@ -1919,8 +1919,8 @@ static void x264_mb_analyse_inter_p4x8( x264_t *h, x264_mb_analysis_t *a, int i8
 static ALWAYS_INLINE int x264_analyse_bi_chroma( x264_t *h, x264_mb_analysis_t *a, int idx, int i_pixel )
 {
-ALIGNED_ARRAY_N( pixel, pix, [4],[16*16] );
-ALIGNED_ARRAY_N( pixel, bi, [2],[16*16] );
+ALIGNED_ARRAY_32( pixel, pix, [4],[16*16] );
+ALIGNED_ARRAY_32( pixel, bi, [2],[16*16] );
 int i_chroma_cost = 0;
 int chromapix = h->luma2chroma_pixel[i_pixel];
@@ -2013,8 +2013,8 @@ static void x264_mb_analyse_inter_direct( x264_t *h, x264_mb_analysis_t *a )
 static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
 {
-ALIGNED_ARRAY_N( pixel, pix0,[16*16] );
-ALIGNED_ARRAY_N( pixel, pix1,[16*16] );
+ALIGNED_ARRAY_32( pixel, pix0,[16*16] );
+ALIGNED_ARRAY_32( pixel, pix1,[16*16] );
 pixel *src0, *src1;
 intptr_t stride0 = 16, stride1 = 16;
 int i_ref, i_mvc;
@@ -2147,7 +2147,7 @@ static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
 }
 else
 {
-ALIGNED_ARRAY_N( pixel, pixuv, [2],[16*FENC_STRIDE] );
+ALIGNED_ARRAY_32( pixel, pixuv, [2],[16*FENC_STRIDE] );
 int chromapix = h->luma2chroma_pixel[PIXEL_16x16];
 int v_shift = CHROMA_V_SHIFT;
@@ -2483,7 +2483,7 @@ static void x264_mb_analyse_inter_b8x8( x264_t *h, x264_mb_analysis_t *a )
 static void x264_mb_analyse_inter_b16x8( x264_t *h, x264_mb_analysis_t *a, int i_best_satd )
 {
-ALIGNED_ARRAY_N( pixel, pix,[2],[16*8] );
+ALIGNED_ARRAY_32( pixel, pix,[2],[16*8] );
 ALIGNED_4( int16_t mvc[3][2] );
 h->mb.i_partition = D_16x8;
@@ -128,8 +128,8 @@ static void x264_mb_encode_i16x16( x264_t *h, int p, int i_qp )
 pixel *p_src = h->mb.pic.p_fenc[p];
 pixel *p_dst = h->mb.pic.p_fdec[p];
-ALIGNED_ARRAY_N( dctcoef, dct4x4,[16],[16] );
-ALIGNED_ARRAY_N( dctcoef, dct_dc4x4,[16] );
+ALIGNED_ARRAY_32( dctcoef, dct4x4,[16],[16] );
+ALIGNED_ARRAY_32( dctcoef, dct_dc4x4,[16] );
 int nz, block_cbp = 0;
 int decimate_score = h->mb.b_dct_decimate ? 0 : 9;
@@ -350,7 +350,7 @@ static ALWAYS_INLINE void x264_mb_encode_chroma_internal( x264_t *h, int b_inter
 int i_decimate_score = b_decimate ? 0 : 7;
 int nz_ac = 0;
-ALIGNED_ARRAY_N( dctcoef, dct4x4,[8],[16] );
+ALIGNED_ARRAY_32( dctcoef, dct4x4,[8],[16] );
 if( h->mb.b_lossless )
 {
@@ -780,7 +780,7 @@ static ALWAYS_INLINE void x264_macroblock_encode_internal( x264_t *h, int plane_
 }
 else if( h->mb.b_transform_8x8 )
 {
-ALIGNED_ARRAY_N( dctcoef, dct8x8,[4],[64] );
+ALIGNED_ARRAY_32( dctcoef, dct8x8,[4],[64] );
 b_decimate &= !h->mb.b_trellis || !h->param.b_cabac; // 8x8 trellis is inherently optimal decimation for CABAC
 for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp )
@@ -824,7 +824,7 @@ static ALWAYS_INLINE void x264_macroblock_encode_internal( x264_t *h, int plane_
 }
 else
 {
-ALIGNED_ARRAY_N( dctcoef, dct4x4,[16],[16] );
+ALIGNED_ARRAY_32( dctcoef, dct4x4,[16],[16] );
 for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp )
 {
 int quant_cat = p ? CQM_4PC : CQM_4PY;
@@ -965,7 +965,7 @@ void x264_macroblock_encode( x264_t *h )
 *****************************************************************************/
 static ALWAYS_INLINE int x264_macroblock_probe_skip_internal( x264_t *h, int b_bidir, int plane_count, int chroma )
 {
-ALIGNED_ARRAY_N( dctcoef, dct4x4,[8],[16] );
+ALIGNED_ARRAY_32( dctcoef, dct4x4,[8],[16] );
 ALIGNED_ARRAY_16( dctcoef, dctscan,[16] );
 ALIGNED_4( int16_t mvp[2] );
 int i_qp = h->mb.i_qp;
@@ -1219,7 +1219,7 @@ static ALWAYS_INLINE void x264_macroblock_encode_p8x8_internal( x264_t *h, int i
 int quant_cat = p ? CQM_8PC : CQM_8PY;
 pixel *p_fenc = h->mb.pic.p_fenc[p] + 8*x + 8*y*FENC_STRIDE;
 pixel *p_fdec = h->mb.pic.p_fdec[p] + 8*x + 8*y*FDEC_STRIDE;
-ALIGNED_ARRAY_N( dctcoef, dct8x8,[64] );
+ALIGNED_ARRAY_32( dctcoef, dct8x8,[64] );
 h->dctf.sub8x8_dct8( dct8x8, p_fenc, p_fdec );
 int nnz8x8 = x264_quant_8x8( h, dct8x8, i_qp, ctx_cat_plane[DCT_LUMA_8x8][p], 0, p, i8 );
@@ -1252,7 +1252,7 @@ static ALWAYS_INLINE void x264_macroblock_encode_p8x8_internal( x264_t *h, int i
 pixel *p_fenc = h->mb.pic.p_fenc[p] + 8*x + 8*y*FENC_STRIDE;
 pixel *p_fdec = h->mb.pic.p_fdec[p] + 8*x + 8*y*FDEC_STRIDE;
 int i_decimate_8x8 = b_decimate ? 0 : 4;
-ALIGNED_ARRAY_N( dctcoef, dct4x4,[4],[16] );
+ALIGNED_ARRAY_32( dctcoef, dct4x4,[4],[16] );
 int nnz8x8 = 0;
 h->dctf.sub8x8_dct( dct4x4, p_fenc, p_fdec );
@@ -1311,7 +1311,7 @@ static ALWAYS_INLINE void x264_macroblock_encode_p8x8_internal( x264_t *h, int i
 i_qp = h->mb.i_chroma_qp;
 for( int ch = 0; ch < 2; ch++ )
 {
-ALIGNED_ARRAY_N( dctcoef, dct4x4,[2],[16] );
+ALIGNED_ARRAY_32( dctcoef, dct4x4,[2],[16] );
 pixel *p_fenc = h->mb.pic.p_fenc[1+ch] + 4*x + (chroma422?8:4)*y*FENC_STRIDE;
 pixel *p_fdec = h->mb.pic.p_fdec[1+ch] + 4*x + (chroma422?8:4)*y*FDEC_STRIDE;
@@ -1376,7 +1376,7 @@ static ALWAYS_INLINE void x264_macroblock_encode_p4x4_internal( x264_t *h, int i
 }
 else
 {
-ALIGNED_ARRAY_N( dctcoef, dct4x4,[16] );
+ALIGNED_ARRAY_32( dctcoef, dct4x4,[16] );
 h->dctf.sub4x4_dct( dct4x4, p_fenc, p_fdec );
 nz = x264_quant_4x4( h, dct4x4, i_qp, ctx_cat_plane[DCT_LUMA_4x4][p], 0, p, i4 );
 h->mb.cache.non_zero_count[x264_scan8[p*16+i4]] = nz;
@@ -116,7 +116,7 @@ static ALWAYS_INLINE void x264_mb_encode_i4x4( x264_t *h, int p, int idx, int i_
 int nz;
 pixel *p_src = &h->mb.pic.p_fenc[p][block_idx_xy_fenc[idx]];
 pixel *p_dst = &h->mb.pic.p_fdec[p][block_idx_xy_fdec[idx]];
-ALIGNED_ARRAY_N( dctcoef, dct4x4,[16] );
+ALIGNED_ARRAY_32( dctcoef, dct4x4,[16] );
 if( b_predict )
 {
@@ -154,7 +154,7 @@ static ALWAYS_INLINE void x264_mb_encode_i8x8( x264_t *h, int p, int idx, int i_
 int nz;
 pixel *p_src = &h->mb.pic.p_fenc[p][8*x + 8*y*FENC_STRIDE];
 pixel *p_dst = &h->mb.pic.p_fdec[p][8*x + 8*y*FDEC_STRIDE];
-ALIGNED_ARRAY_N( dctcoef, dct8x8,[64] );
+ALIGNED_ARRAY_32( dctcoef, dct8x8,[64] );
 ALIGNED_ARRAY_32( pixel, edge_buf,[36] );
 if( b_predict )
@@ -191,7 +191,7 @@ void x264_me_search_ref( x264_t *h, x264_me_t *m, int16_t (*mvc)[2], int i_mvc,
 int omx, omy, pmx, pmy;
 pixel *p_fenc = m->p_fenc[0];
 pixel *p_fref_w = m->p_fref_w;
-ALIGNED_ARRAY_N( pixel, pix,[16*16] );
+ALIGNED_ARRAY_32( pixel, pix,[16*16] );
 ALIGNED_ARRAY_8( int16_t, mvc_temp,[16],[2] );
 ALIGNED_ARRAY_16( int, costs,[16] );
@@ -875,7 +875,7 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite
 int chroma_v_shift = CHROMA_V_SHIFT;
 int mvy_offset = chroma_v_shift & MB_INTERLACED & m->i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
-ALIGNED_ARRAY_N( pixel, pix,[64*18] ); // really 17x17x2, but round up for alignment
+ALIGNED_ARRAY_32( pixel, pix,[64*18] ); // really 17x17x2, but round up for alignment
 ALIGNED_ARRAY_16( int, costs,[4] );
 int bmx = m->mv[0];
@@ -1034,9 +1034,9 @@ static void ALWAYS_INLINE x264_me_refine_bidir( x264_t *h, x264_me_t *m0, x264_m
 const int i_pixel = m0->i_pixel;
 const int bw = x264_pixel_size[i_pixel].w;
 const int bh = x264_pixel_size[i_pixel].h;
-ALIGNED_ARRAY_N( pixel, pixy_buf,[2],[9][16*16] );
-ALIGNED_ARRAY_N( pixel, pixu_buf,[2],[9][16*16] );
-ALIGNED_ARRAY_N( pixel, pixv_buf,[2],[9][16*16] );
+ALIGNED_ARRAY_32( pixel, pixy_buf,[2],[9][16*16] );
+ALIGNED_ARRAY_32( pixel, pixu_buf,[2],[9][16*16] );
+ALIGNED_ARRAY_32( pixel, pixv_buf,[2],[9][16*16] );
 pixel *src[3][2][9];
 int chromapix = h->luma2chroma_pixel[i_pixel];
 int chroma_v_shift = CHROMA_V_SHIFT;
@@ -1059,7 +1059,7 @@ static void ALWAYS_INLINE x264_me_refine_bidir( x264_t *h, x264_me_t *m0, x264_m
 uint64_t bcostrd = COST_MAX64;
 uint16_t amvd;
 /* each byte of visited represents 8 possible m1y positions, so a 4D array isn't needed */
-ALIGNED_ARRAY_N( uint8_t, visited,[8],[8][8] );
+ALIGNED_ARRAY_32( uint8_t, visited,[8],[8][8] );
 /* all permutations of an offset in up to 2 of the dimensions */
 ALIGNED_4( static const int8_t dia4d[33][4] ) =
 {
@@ -243,7 +243,7 @@ static ALWAYS_INLINE uint32_t ac_energy_plane( x264_t *h, int mb_x, int mb_y, x2
 stride <<= b_field;
 if( b_chroma )
 {
-ALIGNED_ARRAY_N( pixel, pix,[FENC_STRIDE*16] );
+ALIGNED_ARRAY_32( pixel, pix,[FENC_STRIDE*16] );
 int chromapix = h->luma2chroma_pixel[PIXEL_16x16];
 int shift = 7 - CHROMA_V_SHIFT;
@@ -634,8 +634,8 @@ int quant_trellis_cabac( x264_t *h, dctcoef *dct,
 const uint8_t *zigzag, int ctx_block_cat, int lambda2, int b_ac,
 int b_chroma, int dc, int num_coefs, int idx )
 {
-ALIGNED_ARRAY_N( dctcoef, orig_coefs, [64] );
-ALIGNED_ARRAY_N( dctcoef, quant_coefs, [64] );
+ALIGNED_ARRAY_32( dctcoef, orig_coefs, [64] );
+ALIGNED_ARRAY_32( dctcoef, quant_coefs, [64] );
 const uint32_t *coef_weight1 = num_coefs == 64 ? x264_dct8_weight_tab : x264_dct4_weight_tab;
 const uint32_t *coef_weight2 = num_coefs == 64 ? x264_dct8_weight2_tab : x264_dct4_weight2_tab;
 const int b_interlaced = MB_INTERLACED;
@@ -827,10 +827,10 @@ static int check_dct( int cpu_ref, int cpu_new )
 x264_dct_function_t dct_asm;
 x264_quant_function_t qf;
 int ret = 0, ok, used_asm, interlace = 0;
-ALIGNED_ARRAY_N( dctcoef, dct1, [16],[16] );
-ALIGNED_ARRAY_N( dctcoef, dct2, [16],[16] );
-ALIGNED_ARRAY_N( dctcoef, dct4, [16],[16] );
-ALIGNED_ARRAY_N( dctcoef, dct8, [4],[64] );
+ALIGNED_ARRAY_32( dctcoef, dct1, [16],[16] );
+ALIGNED_ARRAY_32( dctcoef, dct2, [16],[16] );
+ALIGNED_ARRAY_32( dctcoef, dct4, [16],[16] );
+ALIGNED_ARRAY_32( dctcoef, dct8, [4],[64] );
 ALIGNED_16( dctcoef dctdc[2][8] );
 x264_t h_buf;
 x264_t *h = &h_buf;
@@ -1925,7 +1925,7 @@ static int check_deblock( int cpu_ref, int cpu_new )
 ALIGNED_ARRAY_16( uint8_t, nnz, [X264_SCAN8_SIZE] );
 ALIGNED_4( int8_t ref[2][X264_SCAN8_LUMA_SIZE] );
 ALIGNED_ARRAY_16( int16_t, mv, [2],[X264_SCAN8_LUMA_SIZE][2] );
-ALIGNED_ARRAY_N( uint8_t, bs, [2],[2][8][4] );
+ALIGNED_ARRAY_32( uint8_t, bs, [2],[2][8][4] );
 memset( bs, 99, sizeof(uint8_t)*2*4*8*2 );
 for( int j = 0; j < X264_SCAN8_SIZE; j++ )
 nnz[j] = ((rand()&7) == 7) * rand() & 0xf;
@@ -1969,11 +1969,11 @@ static int check_quant( int cpu_ref, int cpu_new )
 x264_quant_function_t qf_c;
 x264_quant_function_t qf_ref;
 x264_quant_function_t qf_a;
-ALIGNED_ARRAY_N( dctcoef, dct1,[64] );
-ALIGNED_ARRAY_N( dctcoef, dct2,[64] );
-ALIGNED_ARRAY_N( dctcoef, dct3,[8],[16] );
-ALIGNED_ARRAY_N( dctcoef, dct4,[8],[16] );
-ALIGNED_ARRAY_N( uint8_t, cqm_buf,[64] );
+ALIGNED_ARRAY_32( dctcoef, dct1,[64] );
+ALIGNED_ARRAY_32( dctcoef, dct2,[64] );
+ALIGNED_ARRAY_32( dctcoef, dct3,[8],[16] );
+ALIGNED_ARRAY_32( dctcoef, dct4,[8],[16] );
+ALIGNED_ARRAY_32( uint8_t, cqm_buf,[64] );
 int ret = 0, ok, used_asm;
 int oks[3] = {1,1,1}, used_asms[3] = {0,0,0};
 x264_t h_buf;
@@ -2587,7 +2587,7 @@ static int check_cabac( int cpu_ref, int cpu_new )
 {\
 for( int j = 0; j < 256; j++ )\
 {\
-ALIGNED_ARRAY_N( dctcoef, dct, [2],[64] );\
+ALIGNED_ARRAY_32( dctcoef, dct, [2],[64] );\
 uint8_t bitstream[2][1<<16];\
 static const uint8_t ctx_ac[14] = {0,1,0,0,1,0,0,1,0,0,0,1,0,0};\
 int ac = ctx_ac[ctx_block_cat];\