Commit 387828ed authored by Loren Merritt's avatar Loren Merritt Committed by Fiona Glaser

Convert x264 to use NV12 pixel format internally

~1% faster overall on Conroe, mostly due to improved cache locality.
Also allows improved SIMD on some chroma functions (e.g. deblock).
This change also extends the API to allow direct NV12 input, which should be a bit faster than YV12.
This isn't currently used by the x264 CLI, as swscale does not have fast NV12 conversion routines, but it might be useful for other applications.

Note this patch disables the chroma SIMD code for PPC and ARM until new versions are written.
parent c58954cc
......@@ -234,7 +234,7 @@ void x264_mc_init_arm( int cpu, x264_mc_functions_t *pf )
pf->offsetsub = x264_mc_offsetsub_wtab_neon;
pf->weight_cache = x264_weight_cache_neon;
pf->mc_chroma = x264_mc_chroma_neon;
// pf->mc_chroma = x264_mc_chroma_neon;
pf->mc_luma = mc_luma_neon;
pf->get_ref = get_ref_neon;
pf->hpel_filter = hpel_filter_neon;
......
......@@ -1036,17 +1036,26 @@ void x264_picture_init( x264_picture_t *pic )
****************************************************************************/
int x264_picture_alloc( x264_picture_t *pic, int i_csp, int i_width, int i_height )
{
    /* Allocate a user-facing picture. All planes live in one buffer sized for
     * 4:2:0 (3/2 bytes per luma sample). NV12 packs U and V interleaved into a
     * single full-width chroma plane; the planar CSPs get two half-width ones. */
    int csp = i_csp & X264_CSP_MASK;
    if( csp <= X264_CSP_NONE || csp >= X264_CSP_MAX )
        return -1;
    x264_picture_init( pic );
    pic->img.i_csp = i_csp;
    pic->img.i_plane = csp == X264_CSP_NV12 ? 2 : 3;
    pic->img.plane[0] = x264_malloc( 3 * i_width * i_height / 2 );
    if( !pic->img.plane[0] )
        return -1;
    pic->img.plane[1] = pic->img.plane[0] + i_width * i_height;
    if( csp != X264_CSP_NV12 )
        pic->img.plane[2] = pic->img.plane[1] + i_width * i_height / 4;
    pic->img.i_stride[0] = i_width;
    if( csp == X264_CSP_NV12 )
        pic->img.i_stride[1] = i_width; /* interleaved UV: full width */
    else
    {
        pic->img.i_stride[1] = i_width / 2;
        pic->img.i_stride[2] = i_width / 2;
    }
    return 0;
}
......
......@@ -668,16 +668,16 @@ struct x264_t
ALIGNED_16( uint32_t fenc_satd_cache[32] );
/* pointer over mb of the frame to be compressed */
pixel *p_fenc[3];
pixel *p_fenc[3]; /* y,u,v */
/* pointer to the actual source frame, not a block copy */
pixel *p_fenc_plane[3];
pixel *p_fenc_plane[2]; /* y,uv */
/* pointer over mb of the frame to be reconstructed */
pixel *p_fdec[3];
/* pointer over mb of the references */
int i_fref[2];
pixel *p_fref[2][32][4+2]; /* last: lN, lH, lV, lHV, cU, cV */
pixel *p_fref[2][32][4+1]; /* last: yN, yH, yV, yHV, uv */
pixel *p_fref_w[32]; /* weighted fullpel luma */
uint16_t *p_integral[2][16];
......@@ -813,7 +813,7 @@ struct x264_t
/* Buffers that are allocated per-thread even in sliced threads. */
void *scratch_buffer; /* for any temporary storage that doesn't want repeated malloc */
pixel *intra_border_backup[2][3]; /* bottom pixels of the previous mb row, used for intra prediction after the framebuffer has been deblocked */
pixel *intra_border_backup[2][2]; /* bottom pixels of the previous mb row, used for intra prediction after the framebuffer has been deblocked */
uint8_t (*deblock_strength[2])[2][4][4];
/* CPU functions dependents */
......
......@@ -134,7 +134,8 @@ static inline void deblock_chroma_c( pixel *pix, int xstride, int ystride, int a
pix += 2*ystride;
continue;
}
for( int d = 0; d < 2; d++ )
for( int d = 0; d < 2; d++, pix += ystride-2 )
for( int e = 0; e < 2; e++, pix++ )
{
int p1 = pix[-2*xstride];
int p0 = pix[-1*xstride];
......@@ -147,17 +148,16 @@ static inline void deblock_chroma_c( pixel *pix, int xstride, int ystride, int a
pix[-1*xstride] = x264_clip_pixel( p0 + delta ); /* p0' */
pix[ 0*xstride] = x264_clip_pixel( q0 - delta ); /* q0' */
}
pix += ystride;
}
}
}
/* Vertical chroma deblock on an interleaved NV12 plane: samples of one
 * component are 2 apart horizontally, so ystride (the step along the edge)
 * is 2 while xstride (the step across the edge) is the row stride. */
static void deblock_v_chroma_c( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 )
{
    deblock_chroma_c( pix, stride, 2, alpha, beta, tc0 );
}
/* Horizontal chroma deblock on an interleaved NV12 plane: the step across
 * the edge is 2 (U/V interleaved), the step along the edge is the row stride. */
static void deblock_h_chroma_c( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 )
{
    deblock_chroma_c( pix, 2, stride, alpha, beta, tc0 );
}
static inline void deblock_luma_intra_c( pixel *pix, int xstride, int ystride, int alpha, int beta )
......@@ -212,9 +212,10 @@ static void deblock_h_luma_intra_c( pixel *pix, int stride, int alpha, int beta
deblock_luma_intra_c( pix, 1, stride, alpha, beta );
}
static inline void deblock_chroma_intra_c( pixel *pix, int xstride, int ystride, int alpha, int beta )
static inline void deblock_chroma_intra_c( pixel *pix, int xstride, int ystride, int alpha, int beta, int dir )
{
for( int d = 0; d < 8; d++ )
for( int d = 0; d < (dir?16:8); d++, pix += ystride-2 )
for( int e = 0; e < (dir?1:2); e++, pix++ )
{
int p1 = pix[-2*xstride];
int p0 = pix[-1*xstride];
......@@ -226,16 +227,15 @@ static inline void deblock_chroma_intra_c( pixel *pix, int xstride, int ystride,
pix[-1*xstride] = (2*p1 + p0 + q1 + 2) >> 2; /* p0' */
pix[ 0*xstride] = (2*q1 + q0 + p1 + 2) >> 2; /* q0' */
}
pix += ystride;
}
}
/* Intra vertical chroma deblock (NV12): ystride=2 steps through the
 * interleaved components; dir=1 selects the vertical-edge loop shape. */
static void deblock_v_chroma_intra_c( pixel *pix, int stride, int alpha, int beta )
{
    deblock_chroma_intra_c( pix, stride, 2, alpha, beta, 1 );
}
/* Intra horizontal chroma deblock (NV12): xstride=2 crosses the interleaved
 * components; dir=0 selects the horizontal-edge loop shape. */
static void deblock_h_chroma_intra_c( pixel *pix, int stride, int alpha, int beta )
{
    deblock_chroma_intra_c( pix, 2, stride, alpha, beta, 0 );
}
static void deblock_strength_c( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
......@@ -267,7 +267,7 @@ static void deblock_strength_c( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264
}
}
static inline void deblock_edge( x264_t *h, pixel *pix1, pixel *pix2, int i_stride, uint8_t bS[4], int i_qp, int b_chroma, x264_deblock_inter_t pf_inter )
static inline void deblock_edge( x264_t *h, pixel *pix, int i_stride, uint8_t bS[4], int i_qp, int b_chroma, x264_deblock_inter_t pf_inter )
{
int index_a = i_qp-QP_BD_OFFSET + h->sh.i_alpha_c0_offset;
int index_b = i_qp-QP_BD_OFFSET + h->sh.i_beta_offset;
......@@ -283,12 +283,10 @@ static inline void deblock_edge( x264_t *h, pixel *pix1, pixel *pix2, int i_stri
tc[2] = (tc0_table(index_a)[bS[2]] << (BIT_DEPTH-8)) + b_chroma;
tc[3] = (tc0_table(index_a)[bS[3]] << (BIT_DEPTH-8)) + b_chroma;
pf_inter( pix1, i_stride, alpha, beta, tc );
if( b_chroma )
pf_inter( pix2, i_stride, alpha, beta, tc );
pf_inter( pix, i_stride, alpha, beta, tc );
}
static inline void deblock_edge_intra( x264_t *h, pixel *pix1, pixel *pix2, int i_stride, uint8_t bS[4], int i_qp, int b_chroma, x264_deblock_intra_t pf_intra )
static inline void deblock_edge_intra( x264_t *h, pixel *pix, int i_stride, uint8_t bS[4], int i_qp, int b_chroma, x264_deblock_intra_t pf_intra )
{
int index_a = i_qp-QP_BD_OFFSET + h->sh.i_alpha_c0_offset;
int index_b = i_qp-QP_BD_OFFSET + h->sh.i_beta_offset;
......@@ -298,9 +296,7 @@ static inline void deblock_edge_intra( x264_t *h, pixel *pix1, pixel *pix2, int
if( !alpha || !beta )
return;
pf_intra( pix1, i_stride, alpha, beta );
if( b_chroma )
pf_intra( pix2, i_stride, alpha, beta );
pf_intra( pix, i_stride, alpha, beta );
}
void x264_frame_deblock_row( x264_t *h, int mb_y )
......@@ -323,13 +319,11 @@ void x264_frame_deblock_row( x264_t *h, int mb_y )
uint8_t (*bs)[4][4] = h->deblock_strength[mb_y&b_interlaced][mb_x];
pixel *pixy = h->fdec->plane[0] + 16*mb_y*stridey + 16*mb_x;
pixel *pixu = h->fdec->plane[1] + 8*mb_y*strideuv + 8*mb_x;
pixel *pixv = h->fdec->plane[2] + 8*mb_y*strideuv + 8*mb_x;
pixel *pixuv = h->fdec->plane[1] + 8*mb_y*strideuv + 16*mb_x;
if( mb_y & b_interlaced )
{
pixy -= 15*stridey;
pixu -= 7*strideuv;
pixv -= 7*strideuv;
pixuv -= 7*strideuv;
}
int qp = h->mb.qp[mb_xy];
......@@ -339,11 +333,11 @@ void x264_frame_deblock_row( x264_t *h, int mb_y )
#define FILTER( intra, dir, edge, qp, chroma_qp )\
do\
{\
deblock_edge##intra( h, pixy + 4*edge*(dir?stride2y:1), NULL,\
deblock_edge##intra( h, pixy + 4*edge*(dir?stride2y:1),\
stride2y, bs[dir][edge], qp, 0,\
h->loopf.deblock_luma##intra[dir] );\
if( !(edge & 1) )\
deblock_edge##intra( h, pixu + 2*edge*(dir?stride2uv:1), pixv + 2*edge*(dir?stride2uv:1),\
deblock_edge##intra( h, pixuv + 2*edge*(dir?stride2uv:2),\
stride2uv, bs[dir][edge], chroma_qp, 1,\
h->loopf.deblock_chroma##intra[dir] );\
} while(0)
......@@ -393,15 +387,14 @@ void x264_frame_deblock_row( x264_t *h, int mb_y )
}
#if HAVE_MMX
void x264_deblock_v_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_h_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_v_chroma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta );
void x264_deblock_h_chroma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta );
void x264_deblock_v_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_h_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_v_chroma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_h_chroma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_v_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta );
void x264_deblock_h_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta );
void x264_deblock_v_chroma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta );
void x264_deblock_h_chroma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta );
void x264_deblock_strength_mmxext( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][4][4],
int mvy_limit, int bframe );
......@@ -414,9 +407,14 @@ void x264_deblock_strength_ssse3 ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X
#if ARCH_X86
void x264_deblock_h_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_v8_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_v_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_h_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_h_luma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta );
void x264_deblock_v8_luma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta );
void x264_deblock_v_chroma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta );
void x264_deblock_h_chroma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta );
// FIXME this wrapper has a significant cpu cost
static void x264_deblock_v_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
{
x264_deblock_v8_luma_mmxext( pix, stride, alpha, beta, tc0 );
......@@ -458,15 +456,15 @@ void x264_deblock_init( int cpu, x264_deblock_function_t *pf )
if( cpu&X264_CPU_MMXEXT )
{
#if !X264_HIGH_BIT_DEPTH
pf->deblock_chroma[1] = x264_deblock_v_chroma_mmxext;
pf->deblock_chroma[0] = x264_deblock_h_chroma_mmxext;
pf->deblock_chroma_intra[1] = x264_deblock_v_chroma_intra_mmxext;
pf->deblock_chroma_intra[0] = x264_deblock_h_chroma_intra_mmxext;
#if ARCH_X86
pf->deblock_luma[1] = x264_deblock_v_luma_mmxext;
pf->deblock_luma[0] = x264_deblock_h_luma_mmxext;
pf->deblock_chroma[1] = x264_deblock_v_chroma_mmxext;
pf->deblock_chroma[0] = x264_deblock_h_chroma_mmxext;
pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_mmxext;
pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_mmxext;
pf->deblock_chroma_intra[1] = x264_deblock_v_chroma_intra_mmxext;
pf->deblock_chroma_intra[0] = x264_deblock_h_chroma_intra_mmxext;
#endif
#endif // !X264_HIGH_BIT_DEPTH
pf->deblock_strength = x264_deblock_strength_mmxext;
......@@ -478,8 +476,12 @@ void x264_deblock_init( int cpu, x264_deblock_function_t *pf )
{
pf->deblock_luma[1] = x264_deblock_v_luma_sse2;
pf->deblock_luma[0] = x264_deblock_h_luma_sse2;
pf->deblock_chroma[1] = x264_deblock_v_chroma_sse2;
pf->deblock_chroma[0] = x264_deblock_h_chroma_sse2;
pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_sse2;
pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_sse2;
pf->deblock_chroma_intra[1] = x264_deblock_v_chroma_intra_sse2;
pf->deblock_chroma_intra[0] = x264_deblock_h_chroma_intra_sse2;
}
#endif // !X264_HIGH_BIT_DEPTH
}
......@@ -502,8 +504,8 @@ void x264_deblock_init( int cpu, x264_deblock_function_t *pf )
{
pf->deblock_luma[1] = x264_deblock_v_luma_neon;
pf->deblock_luma[0] = x264_deblock_h_luma_neon;
pf->deblock_chroma[1] = x264_deblock_v_chroma_neon;
pf->deblock_chroma[0] = x264_deblock_h_chroma_neon;
// pf->deblock_chroma[1] = x264_deblock_v_chroma_neon;
// pf->deblock_chroma[0] = x264_deblock_h_chroma_neon;
}
#endif
#endif // !X264_HIGH_BIT_DEPTH
......
......@@ -42,20 +42,12 @@ x264_frame_t *x264_frame_new( x264_t *h, int b_fdec )
i_stride = ALIGN( i_width + 2*PADH, align );
i_lines = h->mb.i_mb_height*16;
frame->i_plane = 3;
for( int i = 0; i < 3; i++ )
frame->i_plane = 2;
for( int i = 0; i < 2; i++ )
{
frame->i_stride[i] = ALIGN( i_stride >> !!i, align );
frame->i_width[i] = i_width >> !!i;
frame->i_lines[i] = i_lines >> !!i;
}
luma_plane_size = (frame->i_stride[0] * (frame->i_lines[0] + 2*i_padv));
chroma_plane_size = (frame->i_stride[1] * (frame->i_lines[1] + 2*i_padv));
for( int i = 1; i < 3; i++ )
{
CHECKED_MALLOC( frame->buffer[i], chroma_plane_size * sizeof(pixel) );
frame->plane[i] = frame->buffer[i] + (frame->i_stride[i] * i_padv + PADH)/2;
frame->i_stride[i] = ALIGN( i_stride, align );
frame->i_width[i] = i_width >> i;
frame->i_lines[i] = i_lines >> i;
}
for( int i = 0; i < h->param.i_bframe + 2; i++ )
......@@ -81,6 +73,12 @@ x264_frame_t *x264_frame_new( x264_t *h, int b_fdec )
frame->orig = frame;
luma_plane_size = (frame->i_stride[0] * (frame->i_lines[0] + 2*i_padv));
chroma_plane_size = (frame->i_stride[1] * (frame->i_lines[1] + i_padv));
CHECKED_MALLOC( frame->buffer[1], chroma_plane_size * sizeof(pixel) );
frame->plane[1] = frame->buffer[1] + frame->i_stride[1] * i_padv/2 + PADH;
/* all 4 luma planes allocated together, since the cacheline split code
* requires them to be in-phase wrt cacheline alignment. */
if( h->param.analyse.i_subpel_refine && b_fdec )
......@@ -217,6 +215,27 @@ void x264_frame_delete( x264_frame_t *frame )
x264_free( frame );
}
/* Resolve the data pointer and stride for one input-picture plane.
 * xshift/yshift give the plane's horizontal/vertical subsampling.
 * If the CSP requests vertical flipping, point at the bottom row and
 * negate the stride so copies walk upward. Fails (-1) when the visible
 * width exceeds the plane's stride. */
static int get_plane_ptr( x264_t *h, x264_picture_t *src, uint8_t **pix, int *stride, int plane, int xshift, int yshift )
{
    int width  = h->param.i_width  >> xshift;
    int height = h->param.i_height >> yshift;
    *stride = src->img.i_stride[plane];
    *pix    = src->img.plane[plane];
    if( src->img.i_csp & X264_CSP_VFLIP )
    {
        /* bottom-up image: start at the last row, step backwards */
        *pix   += (height-1) * *stride;
        *stride = -*stride;
    }
    if( width > abs(*stride) )
    {
        x264_log( h, X264_LOG_ERROR, "Input picture width (%d) is greater than stride (%d)\n", width, *stride );
        return -1;
    }
    return 0;
}
#define get_plane_ptr(...) do{ if( get_plane_ptr(__VA_ARGS__) < 0 ) return -1; }while(0)
int x264_frame_copy_picture( x264_t *h, x264_frame_t *dst, x264_picture_t *src )
{
int i_csp = src->img.i_csp & X264_CSP_MASK;
......@@ -232,43 +251,53 @@ int x264_frame_copy_picture( x264_t *h, x264_frame_t *dst, x264_picture_t *src )
dst->param = src->param;
dst->i_pic_struct = src->i_pic_struct;
for( int i = 0; i < 3; i++ )
uint8_t *pix[3];
int stride[3];
get_plane_ptr( h, src, &pix[0], &stride[0], 0, 0, 0 );
h->mc.plane_copy( dst->plane[0], dst->i_stride[0], pix[0], stride[0],
h->param.i_width, h->param.i_height );
if( i_csp == X264_CSP_NV12 )
{
int s = (i_csp == X264_CSP_YV12 && i) ? i^3 : i;
uint8_t *plane = src->img.plane[s];
int stride = src->img.i_stride[s];
int width = h->param.i_width >> !!i;
int height = h->param.i_height >> !!i;
if( src->img.i_csp & X264_CSP_VFLIP )
{
plane += (height-1)*stride;
stride = -stride;
}
if( width > abs(stride) )
{
x264_log( h, X264_LOG_ERROR, "Input picture width is greater than stride\n" );
return -1;
}
h->mc.plane_copy( dst->plane[i], dst->i_stride[i], plane, stride, width, height );
get_plane_ptr( h, src, &pix[1], &stride[1], 1, 0, 1 );
h->mc.plane_copy( dst->plane[1], dst->i_stride[1], pix[1], stride[1],
h->param.i_width, h->param.i_height>>1 );
}
else
{
get_plane_ptr( h, src, &pix[1], &stride[1], i_csp==X264_CSP_I420 ? 1 : 2, 1, 1 );
get_plane_ptr( h, src, &pix[2], &stride[2], i_csp==X264_CSP_I420 ? 2 : 1, 1, 1 );
h->mc.plane_copy_interleave( dst->plane[1], dst->i_stride[1],
pix[1], stride[1], pix[2], stride[2],
h->param.i_width>>1, h->param.i_height>>1 );
}
return 0;
}
/* Fill len elements of dst by replicating the element at src.
 * size is the element size in bytes (1, 2 or 4); sizes >1 exist so that an
 * interleaved NV12 UV pair (or a 16-bit pixel) can be replicated as a unit.
 * Other sizes are silently ignored — callers only pass 1, 2 or 4. */
static void ALWAYS_INLINE pixel_memset( pixel *dst, pixel *src, int len, int size )
{
    uint8_t *dstp = (uint8_t*)dst;
    if( size == 1 )
        memset( dst, *src, len );
    else if( size == 2 )
    {
        int v = M16( src );
        for( int i = 0; i < len; i++ )
            M16( dstp+i*2 ) = v;
    }
    else if( size == 4 )
    {
        int v = M32( src );
        for( int i = 0; i < len; i++ )
            M32( dstp+i*4 ) = v;
    }
}
static void plane_expand_border( pixel *pix, int i_stride, int i_width, int i_height, int i_padh, int i_padv, int b_pad_top, int b_pad_bottom )
static void plane_expand_border( pixel *pix, int i_stride, int i_width, int i_height, int i_padh, int i_padv, int b_pad_top, int b_pad_bottom, int b_chroma )
{
#define PPIXEL(x, y) ( pix + (x) + (y)*i_stride )
for( int y = 0; y < i_height; y++ )
{
/* left band */
pixel_memset( PPIXEL(-i_padh, y), PPIXEL(0, y)[0], i_padh );
pixel_memset( PPIXEL(-i_padh, y), PPIXEL(0, y), i_padh>>b_chroma, sizeof(pixel)<<b_chroma );
/* right band */
pixel_memset( PPIXEL(i_width, y), PPIXEL(i_width-1, y)[0], i_padh );
pixel_memset( PPIXEL(i_width, y), PPIXEL(i_width-1-b_chroma, y), i_padh>>b_chroma, sizeof(pixel)<<b_chroma );
}
/* upper band */
if( b_pad_top )
......@@ -289,9 +318,9 @@ void x264_frame_expand_border( x264_t *h, x264_frame_t *frame, int mb_y, int b_e
for( int i = 0; i < frame->i_plane; i++ )
{
int stride = frame->i_stride[i];
int width = 16*h->mb.i_mb_width >> !!i;
int width = 16*h->sps->i_mb_width;
int height = (b_end ? 16*(h->mb.i_mb_height - mb_y) >> h->sh.b_mbaff : 16) >> !!i;
int padh = PADH >> !!i;
int padh = PADH;
int padv = PADV >> !!i;
// buffer: 2 chroma, 3 luma (rounded to 4) because deblocking goes beyond the top of the mb
pixel *pix = frame->plane[i] + X264_MAX(0, (16*mb_y-4)*stride >> !!i);
......@@ -299,12 +328,12 @@ void x264_frame_expand_border( x264_t *h, x264_frame_t *frame, int mb_y, int b_e
height += 4 >> (!!i + h->sh.b_mbaff);
if( h->sh.b_mbaff )
{
plane_expand_border( pix, stride*2, width, height, padh, padv, b_start, b_end );
plane_expand_border( pix+stride, stride*2, width, height, padh, padv, b_start, b_end );
plane_expand_border( pix, stride*2, width, height, padh, padv, b_start, b_end, i );
plane_expand_border( pix+stride, stride*2, width, height, padh, padv, b_start, b_end, i );
}
else
{
plane_expand_border( pix, stride, width, height, padh, padv, b_start, b_end );
plane_expand_border( pix, stride, width, height, padh, padv, b_start, b_end, i );
}
}
}
......@@ -326,37 +355,35 @@ void x264_frame_expand_border_filtered( x264_t *h, x264_frame_t *frame, int mb_y
pixel *pix = frame->filtered[i] + (16*mb_y - (8 << h->sh.b_mbaff)) * stride - 4;
if( h->sh.b_mbaff )
{
plane_expand_border( pix, stride*2, width, height, padh, padv, b_start, b_end );
plane_expand_border( pix+stride, stride*2, width, height, padh, padv, b_start, b_end );
plane_expand_border( pix, stride*2, width, height, padh, padv, b_start, b_end, 0 );
plane_expand_border( pix+stride, stride*2, width, height, padh, padv, b_start, b_end, 0 );
}
else
plane_expand_border( pix, stride, width, height, padh, padv, b_start, b_end );
plane_expand_border( pix, stride, width, height, padh, padv, b_start, b_end, 0 );
}
}
/* Pad the borders of all four half-resolution (lowres) planes.
 * These are luma-only, so b_chroma is 0. */
void x264_frame_expand_border_lowres( x264_frame_t *frame )
{
    for( int i = 0; i < 4; i++ )
        plane_expand_border( frame->lowres[i], frame->i_stride_lowres, frame->i_width_lowres, frame->i_lines_lowres, PADH, PADV, 1, 1, 0 );
}
void x264_frame_expand_border_mod16( x264_t *h, x264_frame_t *frame )
{
for( int i = 0; i < frame->i_plane; i++ )
{
int i_subsample = i ? 1 : 0;
int i_width = h->param.i_width >> i_subsample;
int i_height = h->param.i_height >> i_subsample;
int i_padx = (h->mb.i_mb_width * 16 - h->param.i_width) >> i_subsample;
int i_pady = (h->mb.i_mb_height * 16 - h->param.i_height) >> i_subsample;
int i_width = h->param.i_width;
int i_height = h->param.i_height >> !!i;
int i_padx = (h->mb.i_mb_width * 16 - h->param.i_width);
int i_pady = (h->mb.i_mb_height * 16 - h->param.i_height) >> !!i;
if( i_padx )
{
for( int y = 0; y < i_height; y++ )
{
pixel value = frame->plane[i][y*frame->i_stride[i] + i_width - 1];
pixel_memset( &frame->plane[i][y*frame->i_stride[i] + i_width], value, i_padx );
}
pixel_memset( &frame->plane[i][y*frame->i_stride[i] + i_width],
&frame->plane[i][y*frame->i_stride[i] + i_width - 1-i],
i_padx>>i, sizeof(pixel)<<i );
}
if( i_pady )
{
......
......@@ -59,13 +59,13 @@ typedef struct x264_frame
/* YUV buffer */
int i_plane;
int i_stride[3];
int i_width[3];
int i_lines[3];
int i_stride[2];
int i_width[2];
int i_lines[2];
int i_stride_lowres;
int i_width_lowres;
int i_lines_lowres;
pixel *plane[3];
pixel *plane[2];
pixel *filtered[4]; /* plane[0], H, V, HV */
pixel *lowres[4]; /* half-size copy of input frame: Orig, H, V, HV */
uint16_t *integral;
......
......@@ -40,7 +40,8 @@ static NOINLINE void x264_mb_mc_0xywh( x264_t *h, int x, int y, int width, int h
if( h->mb.b_interlaced & i_ref )
mvy += (h->mb.i_mb_y & 1)*4 - 2;
h->mc.mc_chroma( &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
h->mc.mc_chroma( &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x],
&h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
h->mb.pic.p_fref[0][i_ref][4], h->mb.pic.i_stride[1],
mvx, mvy, 2*width, 2*height );
......@@ -48,11 +49,6 @@ static NOINLINE void x264_mb_mc_0xywh( x264_t *h, int x, int y, int width, int h
h->sh.weight[i_ref][1].weightfn[width>>1]( &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
&h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
&h->sh.weight[i_ref][1], height*2 );
h->mc.mc_chroma( &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
h->mb.pic.p_fref[0][i_ref][5], h->mb.pic.i_stride[2],
mvx, mvy, 2*width, 2*height );
if( h->sh.weight[i_ref][2].weightfn )
h->sh.weight[i_ref][2].weightfn[width>>1]( &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
&h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
......@@ -73,13 +69,10 @@ static NOINLINE void x264_mb_mc_1xywh( x264_t *h, int x, int y, int width, int h
if( h->mb.b_interlaced & i_ref )
mvy += (h->mb.i_mb_y & 1)*4 - 2;
h->mc.mc_chroma( &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
h->mc.mc_chroma( &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x],
&h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
h->mb.pic.p_fref[1][i_ref][4], h->mb.pic.i_stride[1],
mvx, mvy, 2*width, 2*height );
h->mc.mc_chroma( &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
h->mb.pic.p_fref[1][i_ref][5], h->mb.pic.i_stride[2],
mvx, mvy, 2*width, 2*height );
}
static NOINLINE void x264_mb_mc_01xywh( x264_t *h, int x, int y, int width, int height )
......@@ -110,16 +103,12 @@ static NOINLINE void x264_mb_mc_01xywh( x264_t *h, int x, int y, int width, int
if( h->mb.b_interlaced & i_ref1 )
mvy1 += (h->mb.i_mb_y & 1)*4 - 2;
h->mc.mc_chroma( tmp0, 16, h->mb.pic.p_fref[0][i_ref0][4], h->mb.pic.i_stride[1],
h->mc.mc_chroma( tmp0, tmp0+8, 16, h->mb.pic.p_fref[0][i_ref0][4], h->mb.pic.i_stride[1],
mvx0, mvy0, 2*width, 2*height );
h->mc.mc_chroma( tmp1, 16, h->mb.pic.p_fref[1][i_ref1][4], h->mb.pic.i_stride[1],
h->mc.mc_chroma( tmp1, tmp1+8, 16, h->mb.pic.p_fref[1][i_ref1][4], h->mb.pic.i_stride[1],
mvx1, mvy1, 2*width, 2*height );
h->mc.avg[i_mode+3]( &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, tmp0, 16, tmp1, 16, weight );
h->mc.mc_chroma( tmp0, 16, h->mb.pic.p_fref[0][i_ref0][5], h->mb.pic.i_stride[2],
mvx0, mvy0, 2*width, 2*height );
h->mc.mc_chroma( tmp1, 16, h->mb.pic.p_fref[1][i_ref1][5], h->mb.pic.i_stride[2],
mvx1, mvy1, 2*width, 2*height );
h->mc.avg[i_mode+3]( &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, tmp0, 16, tmp1, 16, weight );
h->mc.avg[i_mode+3]( &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, tmp0+8, 16, tmp1+8, 16, weight );
}
void x264_mb_mc_8x8( x264_t *h, int i8 )
......@@ -329,11 +318,11 @@ int x264_macroblock_thread_allocate( x264_t *h, int b_lookahead )
if( !b_lookahead )
for( int i = 0; i <= h->param.b_interlaced; i++ )
{
for( int j = 0; j < 3; j++ )
for( int j = 0; j < 2; j++ )
{
/* shouldn't really be initialized, just silences a valgrind false-positive in predict_8x8_filter_mmx */
CHECKED_MALLOCZERO( h->intra_border_backup[i][j], ((h->mb.i_mb_width*16+32)>>!!j) * sizeof(pixel) );
h->intra_border_backup[i][j] += 8;
CHECKED_MALLOCZERO( h->intra_border_backup[i][j], (h->sps->i_mb_width*16+32) * sizeof(pixel) );
h->intra_border_backup[i][j] += 16;
}
CHECKED_MALLOC( h->deblock_strength[i], sizeof(**h->deblock_strength) * h->mb.i_mb_width );
}
......@@ -364,8 +353,8 @@ void x264_macroblock_thread_free( x264_t *h, int b_lookahead )
for( int i = 0; i <= h->param.b_interlaced; i++ )
{
x264_free( h->deblock_strength[i] );
for( int j = 0; j < 3; j++ )
x264_free( h->intra_border_backup[i][j] - 8 );
for( int j = 0; j < 2; j++ )
x264_free( h->intra_border_backup[i][j] - 16 );
}
x264_free( h->scratch_buffer );
}
......@@ -485,13 +474,13 @@ void x264_prefetch_fenc( x264_t *h, x264_frame_t *fenc, int i_mb_x, int i_mb_y )
{
int stride_y = fenc->i_stride[0];
int stride_uv = fenc->i_stride[1];
int off_y = 16 * (i_mb_x + i_mb_y * stride_y);
int off_uv = 8 * (i_mb_x + i_mb_y * stride_uv);
int off_y = 16 * i_mb_x + 16 * i_mb_y * stride_y;
int off_uv = 16 * i_mb_x + 8 * i_mb_y * stride_uv;
h->mc.prefetch_fenc( fenc->plane[0]+off_y, stride_y,
fenc->plane[1+(i_mb_x&1)]+off_uv, stride_uv, i_mb_x );
fenc->plane[1]+off_uv, stride_uv, i_mb_x );
}
static NOINLINE void copy_column8( pixel *dst, pixel *src )
NOINLINE void x264_copy_column8( pixel *dst, pixel *src )
{
// input pointers are offset by 4 rows because that's faster (smaller instruction size on x86)
for( int i = -4; i < 4; i++ )
......@@ -500,30 +489,46 @@ static NOINLINE void copy_column8( pixel *dst, pixel *src )
static void ALWAYS_INLINE x264_macroblock_load_pic_pointers( x264_t *h, int mb_x, int mb_y, int i, int b_interlaced )
{
const int w = (i == 0 ? 16 : 8);
const int i_stride = h->fdec->i_stride[!!i];
const int i_stride2 = i_stride << b_interlaced;
const int i_pix_offset = b_interlaced
? w * (mb_x + (mb_y&~1) * i_stride) + (mb_y&1) * i_stride
: w * (mb_x + mb_y * i_stride);
const pixel *plane_fdec = &h->fdec->plane[i][i_pix_offset];
const pixel *intra_fdec = &h->intra_border_backup[mb_y & h->sh.b_mbaff][i][mb_x*16>>!!i];
int w = (i ? 8 : 16);
int i_stride = h->fdec->i_stride[i];
int i_stride2 = i_stride << b_interlaced;
int i_pix_offset = b_interlaced
? 16 * mb_x + w * (mb_y&~1) * i_stride + (mb_y&1) * i_stride
: 16 * mb_x + w * mb_y * i_stride;
pixel *plane_fdec = &h->fdec->plane[i][i_pix_offset];
pixel *intra_fdec = &h->intra_border_backup[mb_y & h->sh.b_mbaff][i][mb_x*16];
int ref_pix_offset[2] = { i_pix_offset, i_pix_offset };
x264_frame_t **fref[2] = { h->fref0, h->fref1 };
if( b_interlaced )
ref_pix_offset[1] += (1-2*(mb_y&1)) * i_stride;
h->mb.pic.i_stride[i] = i_stride2;
h->mb.pic.p_fenc_plane[i] = &h->fenc->plane[i][i_pix_offset];
h->mc.copy[i?PIXEL_8x8:PIXEL_16x16]( h->mb.pic.p_fenc[i], FENC_STRIDE,
h->mb.pic.p_fenc_plane[i], i_stride2, w );
memcpy( &h->mb.pic.p_fdec[i][-1-FDEC_STRIDE], intra_fdec-1, (w*3/2+1) * sizeof(pixel) );
if( i )
{
h->mc.load_deinterleave_8x8x2_fenc( h->mb.pic.p_fenc[1], h->mb.pic.p_fenc_plane[1], i_stride2 );
memcpy( h->mb.pic.p_fdec[1]-FDEC_STRIDE, intra_fdec, 8*sizeof(pixel) );
memcpy( h->mb.pic.p_fdec[2]-FDEC_STRIDE, intra_fdec+8, 8*sizeof(pixel) );
}
else
{
h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, h->mb.pic.p_fenc_plane[0], i_stride2, 16 );
memcpy( h->mb.pic.p_fdec[0]-FDEC_STRIDE, intra_fdec, 24*sizeof(pixel) );
}
if( b_interlaced )
{
for( int j = 0; j < w; j++ )
h->mb.pic.p_fdec[i][-1+j*FDEC_STRIDE] = plane_fdec[-1+j*i_stride2];
if( i )
{
h->mb.pic.p_fdec[1][-1+j*FDEC_STRIDE] = plane_fdec[-2+j*i_stride2];
h->mb.pic.p_fdec[2][-1+j*FDEC_STRIDE] = plane_fdec[-1+j*i_stride2];
}
else
h->mb.pic.p_fdec[0][-1+j*FDEC_STRIDE] = plane_fdec[-1+j*i_stride2];
}
for( int j = 0; j < h->mb.pic.i_fref[0]; j++ )
{