diff --git a/common/frame.c b/common/frame.c index b27efa05e1e9b1e5298958326886a0a3cc49819f..9d76b8cbd0edae3f48597c7ad5062474f9c8fd72 100644 --- a/common/frame.c +++ b/common/frame.c @@ -38,7 +38,7 @@ static int align_stride( int x, int align, int disalign ) static int align_plane_size( int x, int disalign ) { if( !(x&(disalign-1)) ) - x += 128; + x += X264_MAX( 128, NATIVE_ALIGN ) / SIZEOF_PIXEL; return x; } @@ -63,29 +63,28 @@ static x264_frame_t *frame_new( x264_t *h, int b_fdec ) int i_mb_count = h->mb.i_mb_count; int i_stride, i_width, i_lines, luma_plane_count; int i_padv = PADV << PARAM_INTERLACED; - int align = 16; + int align = NATIVE_ALIGN / SIZEOF_PIXEL; #if ARCH_X86 || ARCH_X86_64 if( h->param.cpu&X264_CPU_CACHELINE_64 || h->param.cpu&X264_CPU_AVX512 ) - align = 64; + align = 64 / SIZEOF_PIXEL; else if( h->param.cpu&X264_CPU_CACHELINE_32 || h->param.cpu&X264_CPU_AVX ) - align = 32; + align = 32 / SIZEOF_PIXEL; + else + align = 16 / SIZEOF_PIXEL; #endif #if ARCH_PPC - int disalign = 1<<9; + int disalign = (1<<9) / SIZEOF_PIXEL; #else - int disalign = 1<<10; + int disalign = (1<<10) / SIZEOF_PIXEL; #endif - /* ensure frame alignment after PADH is added */ - int padh_align = X264_MAX( align - PADH * SIZEOF_PIXEL, 0 ) / SIZEOF_PIXEL; - CHECKED_MALLOCZERO( frame, sizeof(x264_frame_t) ); PREALLOC_INIT /* allocate frame data (+64 for extra data for me) */ i_width = h->mb.i_mb_width*16; i_lines = h->mb.i_mb_height*16; - i_stride = align_stride( i_width + 2*PADH, align, disalign ); + i_stride = align_stride( i_width + PADH2, align, disalign ); if( i_csp == X264_CSP_NV12 || i_csp == X264_CSP_NV16 ) { @@ -123,7 +122,7 @@ static x264_frame_t *frame_new( x264_t *h, int b_fdec ) frame->i_csp = i_csp; frame->i_width_lowres = frame->i_width[0]/2; frame->i_lines_lowres = frame->i_lines[0]/2; - frame->i_stride_lowres = align_stride( frame->i_width_lowres + 2*PADH, align, disalign<<1 ); + frame->i_stride_lowres = align_stride( frame->i_width_lowres + PADH2, align, disalign<<1 ); for( int i = 0; i < h->param.i_bframe + 2; i++ ) for( int j = 0; j < h->param.i_bframe + 2; j++ ) @@ -152,9 +151,9 @@ static x264_frame_t *frame_new( x264_t *h, int b_fdec ) { int chroma_padv = i_padv >> (i_csp == X264_CSP_NV12); int chroma_plane_size = (frame->i_stride[1] * (frame->i_lines[1] + 2*chroma_padv)); - PREALLOC( frame->buffer[1], (chroma_plane_size + padh_align) * SIZEOF_PIXEL ); + PREALLOC( frame->buffer[1], chroma_plane_size * SIZEOF_PIXEL ); if( PARAM_INTERLACED ) - PREALLOC( frame->buffer_fld[1], (chroma_plane_size + padh_align) * SIZEOF_PIXEL ); + PREALLOC( frame->buffer_fld[1], chroma_plane_size * SIZEOF_PIXEL ); } /* all 4 luma planes allocated together, since the cacheline split code @@ -167,9 +166,9 @@ static x264_frame_t *frame_new( x264_t *h, int b_fdec ) luma_plane_size *= 4; /* FIXME: Don't allocate both buffers in non-adaptive MBAFF. */ - PREALLOC( frame->buffer[p], (luma_plane_size + padh_align) * SIZEOF_PIXEL ); + PREALLOC( frame->buffer[p], luma_plane_size * SIZEOF_PIXEL ); if( PARAM_INTERLACED ) - PREALLOC( frame->buffer_fld[p], (luma_plane_size + padh_align) * SIZEOF_PIXEL ); + PREALLOC( frame->buffer_fld[p], luma_plane_size * SIZEOF_PIXEL ); } frame->b_duplicate = 0; @@ -207,7 +206,7 @@ static x264_frame_t *frame_new( x264_t *h, int b_fdec ) { int64_t luma_plane_size = align_plane_size( frame->i_stride_lowres * (frame->i_lines[0]/2 + 2*PADV), disalign ); - PREALLOC( frame->buffer_lowres, (4 * luma_plane_size + padh_align) * SIZEOF_PIXEL ); + PREALLOC( frame->buffer_lowres, 4 * luma_plane_size * SIZEOF_PIXEL ); for( int j = 0; j <= !!h->param.i_bframe; j++ ) for( int i = 0; i <= h->param.i_bframe; i++ ) @@ -237,9 +236,9 @@ static x264_frame_t *frame_new( x264_t *h, int b_fdec ) if( i_csp == X264_CSP_NV12 || i_csp == X264_CSP_NV16 ) { int chroma_padv = i_padv >> (i_csp == X264_CSP_NV12); - frame->plane[1] = frame->buffer[1] + frame->i_stride[1] * chroma_padv + PADH + padh_align; + frame->plane[1] = frame->buffer[1] + frame->i_stride[1] * chroma_padv + PADH_ALIGN; if( PARAM_INTERLACED ) - frame->plane_fld[1] = frame->buffer_fld[1] + frame->i_stride[1] * chroma_padv + PADH + padh_align; + frame->plane_fld[1] = frame->buffer_fld[1] + frame->i_stride[1] * chroma_padv + PADH_ALIGN; } for( int p = 0; p < luma_plane_count; p++ ) @@ -249,18 +248,18 @@ static x264_frame_t *frame_new( x264_t *h, int b_fdec ) { for( int i = 0; i < 4; i++ ) { - frame->filtered[p][i] = frame->buffer[p] + i*luma_plane_size + frame->i_stride[p] * i_padv + PADH + padh_align; + frame->filtered[p][i] = frame->buffer[p] + i*luma_plane_size + frame->i_stride[p] * i_padv + PADH_ALIGN; if( PARAM_INTERLACED ) - frame->filtered_fld[p][i] = frame->buffer_fld[p] + i*luma_plane_size + frame->i_stride[p] * i_padv + PADH + padh_align; + frame->filtered_fld[p][i] = frame->buffer_fld[p] + i*luma_plane_size + frame->i_stride[p] * i_padv + PADH_ALIGN; } frame->plane[p] = frame->filtered[p][0]; frame->plane_fld[p] = frame->filtered_fld[p][0]; } else { - frame->filtered[p][0] = frame->plane[p] = frame->buffer[p] + frame->i_stride[p] * i_padv + PADH + padh_align; + frame->filtered[p][0] = frame->plane[p] = frame->buffer[p] + frame->i_stride[p] * i_padv + PADH_ALIGN; if( PARAM_INTERLACED ) - frame->filtered_fld[p][0] = frame->plane_fld[p] = frame->buffer_fld[p] + frame->i_stride[p] * i_padv + PADH + padh_align; + frame->filtered_fld[p][0] = frame->plane_fld[p] = frame->buffer_fld[p] + frame->i_stride[p] * i_padv + PADH_ALIGN; } } @@ -270,7 +269,7 @@ static x264_frame_t *frame_new( x264_t *h, int b_fdec ) frame->mv16x16++; if( h->param.analyse.i_me_method >= X264_ME_ESA ) - frame->integral = (uint16_t*)frame->buffer[3] + frame->i_stride[0] * i_padv + PADH; + frame->integral = (uint16_t*)frame->buffer[3] + frame->i_stride[0] * i_padv + PADH_ALIGN; } else { @@ -278,7 +277,7 @@ static x264_frame_t *frame_new( x264_t *h, int b_fdec ) { int64_t luma_plane_size = align_plane_size( frame->i_stride_lowres * (frame->i_lines[0]/2 + 2*PADV), disalign ); for( int i = 0; i < 4; i++ ) - frame->lowres[i] = frame->buffer_lowres + frame->i_stride_lowres * PADV + PADH + padh_align + i * luma_plane_size; + frame->lowres[i] = frame->buffer_lowres + frame->i_stride_lowres * PADV + PADH_ALIGN + i * luma_plane_size; for( int j = 0; j <= !!h->param.i_bframe; j++ ) for( int i = 0; i <= h->param.i_bframe; i++ ) diff --git a/common/frame.h b/common/frame.h index 10970eb16e5d08cd5f6326f03cbbefa9ac5e384c..ef20200810316471bf56bac1c1ee60d15ea9286f 100644 --- a/common/frame.h +++ b/common/frame.h @@ -31,6 +31,8 @@ /* number of pixels past the edge of the frame, for motion estimation/compensation */ #define PADH 32 #define PADV 32 +#define PADH_ALIGN X264_MAX( PADH, NATIVE_ALIGN / SIZEOF_PIXEL ) +#define PADH2 (PADH_ALIGN + PADH) typedef struct x264_frame { diff --git a/common/mc.c b/common/mc.c index 2d2aafa9114dc18bd04cb100e0b5f29acbcf803d..32f2793f6737be6ec0e133c3f5458f921337e656 100644 --- a/common/mc.c +++ b/common/mc.c @@ -749,15 +749,15 @@ void x264_frame_filter( x264_t *h, x264_frame_t *frame, int mb_y, int b_end ) int stride = frame->i_stride[0]; if( start < 0 ) { - memset( frame->integral - PADV * stride - PADH, 0, stride * sizeof(uint16_t) ); + memset( frame->integral - PADV * stride - PADH_ALIGN, 0, stride * sizeof(uint16_t) ); start = -PADV; } if( b_end ) height += PADV-9; for( int y = start; y < height; y++ ) { - pixel *pix = frame->plane[0] + y * stride - PADH; - uint16_t *sum8 = frame->integral + (y+1) * stride - PADH; + pixel *pix = frame->plane[0] + y * stride - PADH_ALIGN; + uint16_t *sum8 = frame->integral + (y+1) * stride - PADH_ALIGN; uint16_t *sum4; if( h->frames.b_have_sub8x8_esa ) { diff --git a/encoder/analyse.c b/encoder/analyse.c index 970bd305a218e173e940e87bca873c5dc15e03e8..efd12377d76cc5714cdf99a9bb35e522681ca5d2 100644 --- a/encoder/analyse.c +++ b/encoder/analyse.c @@ -223,10 +223,10 @@ void x264_analyse_weight_frame( x264_t *h, int end ) if( h->sh.weight[j][0].weightfn ) { x264_frame_t *frame = h->fref[0][j]; - int width = frame->i_width[0] + 2*PADH; + int width = frame->i_width[0] + PADH2; int i_padv = PADV << PARAM_INTERLACED; int offset, height; - pixel *src = frame->filtered[0][0] - frame->i_stride[0]*i_padv - PADH; + pixel *src = frame->filtered[0][0] - frame->i_stride[0]*i_padv - PADH_ALIGN; height = X264_MIN( 16 + end + i_padv, h->fref[0][j]->i_lines[0] + i_padv*2 ) - h->fenc->i_lines_weighted; offset = h->fenc->i_lines_weighted*frame->i_stride[0]; h->fenc->i_lines_weighted += height; @@ -234,7 +234,7 @@ void x264_analyse_weight_frame( x264_t *h, int end ) for( int k = j; k < h->i_ref[0]; k++ ) if( h->sh.weight[k][0].weightfn ) { - pixel *dst = h->fenc->weighted[k] - h->fenc->i_stride[0]*i_padv - PADH; + pixel *dst = h->fenc->weighted[k] - h->fenc->i_stride[0]*i_padv - PADH_ALIGN; x264_weight_scale_plane( h, dst + offset, frame->i_stride[0], src + offset, frame->i_stride[0], width, height, &h->sh.weight[k][0] ); diff --git a/encoder/encoder.c b/encoder/encoder.c index 04dcecec70d41b2dde2f9c3abde5a629255b36cc..18b04113eaabea70073b6af2168a2e9a8ebc9aaf 100644 --- a/encoder/encoder.c +++ b/encoder/encoder.c @@ -2185,14 +2185,14 @@ static void weighted_pred_init( x264_t *h ) assert( h->sh.weight[j][i].i_denom == denom ); if( !i ) { - h->fenc->weighted[j] = h->mb.p_weight_buf[buffer_next++] + h->fenc->i_stride[0] * i_padv + PADH; + h->fenc->weighted[j] = h->mb.p_weight_buf[buffer_next++] + h->fenc->i_stride[0] * i_padv + PADH_ALIGN; //scale full resolution frame if( h->param.i_threads == 1 ) { - pixel *src = h->fref[0][j]->filtered[0][0] - h->fref[0][j]->i_stride[0]*i_padv - PADH; - pixel *dst = h->fenc->weighted[j] - h->fenc->i_stride[0]*i_padv - PADH; + pixel *src = h->fref[0][j]->filtered[0][0] - h->fref[0][j]->i_stride[0]*i_padv - PADH_ALIGN; + pixel *dst = h->fenc->weighted[j] - h->fenc->i_stride[0]*i_padv - PADH_ALIGN; int stride = h->fenc->i_stride[0]; - int width = h->fenc->i_width[0] + PADH*2; + int width = h->fenc->i_width[0] + PADH2; int height = h->fenc->i_lines[0] + i_padv*2; x264_weight_scale_plane( h, dst, stride, src, stride, width, height, &h->sh.weight[j][0] ); h->fenc->i_lines_weighted = height; diff --git a/encoder/slicetype.c b/encoder/slicetype.c index c7984dd15e988c72b5032cff3f67441a2d2d55c8..0315ba6bf291679e3c7ddc3b7216b00a087896f6 100644 --- a/encoder/slicetype.c +++ b/encoder/slicetype.c @@ -492,11 +492,11 @@ void x264_weights_analyse( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, int //scale lowres in lookahead for slicetype_frame_cost pixel *src = ref->buffer_lowres; pixel *dst = h->mb.p_weight_buf[0]; - int width = ref->i_width_lowres + PADH*2; + int width = ref->i_width_lowres + PADH2; int height = ref->i_lines_lowres + PADV*2; x264_weight_scale_plane( h, dst, ref->i_stride_lowres, src, ref->i_stride_lowres, width, height, &weights[0] ); - fenc->weighted[0] = h->mb.p_weight_buf[0] + PADH + ref->i_stride_lowres * PADV; + fenc->weighted[0] = h->mb.p_weight_buf[0] + PADH_ALIGN + ref->i_stride_lowres * PADV; } }