Commit a155572e authored by Fiona Glaser

Sliced-threads: do hpel and deblock after returning

Lowers encoding latency by around 14% in sliced-threads mode with preset superfast.
Additionally, even if there is no waiting time between frames, this improves parallelism, because hpel and deblock are done during the (single-threaded) lookahead.
For ease of debugging, dump-yuv forces all of the threads to wait and finish instead of setting b_full_recon.
parent 90408eca
......@@ -470,9 +470,13 @@ struct x264_t
x264_t *thread[X264_THREAD_MAX+1];
int b_thread_active;
int i_thread_phase; /* which thread to use for the next frame */
int i_thread_idx; /* which thread this is */
int i_threadslice_start; /* first row in this thread slice */
int i_threadslice_end; /* row after the end of this thread slice */
int i_threadslice_pass; /* which pass of encoding we are on */
x264_threadpool_t *threadpool;
x264_pthread_mutex_t mutex;
x264_pthread_cond_t cv;
/* bitstream output */
struct
......@@ -823,6 +827,9 @@ struct x264_t
/* extra data required for mbaff in mv prediction */
int16_t topright_mv[2][3][2];
int8_t topright_ref[2][3];
/* current mb deblock strength */
uint8_t (*deblock_strength)[8][4];
} cache;
/* */
......
......@@ -395,7 +395,7 @@ void x264_frame_deblock_row( x264_t *h, int mb_y )
int mb_xy = h->mb.i_mb_xy;
int transform_8x8 = h->mb.mb_transform_size[h->mb.i_mb_xy];
int intra_cur = IS_INTRA( h->mb.type[mb_xy] );
uint8_t (*bs)[8][4] = h->deblock_strength[mb_y&1][mb_x];
uint8_t (*bs)[8][4] = h->deblock_strength[mb_y&1][h->param.b_sliced_threads?mb_xy:mb_x];
pixel *pixy = h->fdec->plane[0] + 16*mb_y*stridey + 16*mb_x;
pixel *pixuv = h->fdec->plane[1] + chroma_height*mb_y*strideuv + 16*mb_x;
......@@ -592,7 +592,7 @@ void x264_macroblock_deblock( x264_t *h )
if( (h->mb.i_partition == D_16x16 && !h->mb.i_cbp_luma && !intra_cur) || qp <= qp_thresh )
return;
uint8_t (*bs)[8][4] = h->deblock_strength[h->mb.i_mb_y&1][h->mb.i_mb_x];
uint8_t (*bs)[8][4] = h->mb.cache.deblock_strength;
if( intra_cur )
{
memset( &bs[0][1], 3, 3*4*sizeof(uint8_t) );
......
......@@ -480,9 +480,12 @@ static void ALWAYS_INLINE plane_expand_border( pixel *pix, int i_stride, int i_w
#undef PPIXEL
}
void x264_frame_expand_border( x264_t *h, x264_frame_t *frame, int mb_y, int b_end )
void x264_frame_expand_border( x264_t *h, x264_frame_t *frame, int mb_y )
{
int b_start = !mb_y;
int pad_top = mb_y == 0;
int pad_bot = mb_y == h->mb.i_mb_height - (1 << SLICE_MBAFF);
int b_start = mb_y == h->i_threadslice_start;
int b_end = mb_y == h->i_threadslice_end - (1 << SLICE_MBAFF);
if( mb_y & SLICE_MBAFF )
return;
for( int i = 0; i < frame->i_plane; i++ )
......@@ -491,30 +494,31 @@ void x264_frame_expand_border( x264_t *h, x264_frame_t *frame, int mb_y, int b_e
int v_shift = i && CHROMA_V_SHIFT;
int stride = frame->i_stride[i];
int width = 16*h->mb.i_mb_width;
int height = (b_end ? 16*(h->mb.i_mb_height - mb_y) >> SLICE_MBAFF : 16) >> v_shift;
int height = (pad_bot ? 16*(h->mb.i_mb_height - mb_y) >> SLICE_MBAFF : 16) >> v_shift;
int padh = PADH;
int padv = PADV >> v_shift;
// buffer: 2 chroma, 3 luma (rounded to 4) because deblocking goes beyond the top of the mb
if( b_end && !b_start )
height += 4 >> (v_shift + SLICE_MBAFF);
pixel *pix;
int starty = 16*mb_y - 4*!b_start;
if( SLICE_MBAFF )
{
// border samples for each field are extended separately
pix = frame->plane_fld[i] + X264_MAX(0, (16*mb_y-4)*stride >> v_shift);
plane_expand_border( pix, stride*2, width, height, padh, padv, b_start, b_end, h_shift );
plane_expand_border( pix+stride, stride*2, width, height, padh, padv, b_start, b_end, h_shift );
pix = frame->plane_fld[i] + (starty*stride >> v_shift);
plane_expand_border( pix, stride*2, width, height, padh, padv, pad_top, pad_bot, h_shift );
plane_expand_border( pix+stride, stride*2, width, height, padh, padv, pad_top, pad_bot, h_shift );
height = (b_end ? 16*(h->mb.i_mb_height - mb_y) : 32) >> v_shift;
height = (pad_bot ? 16*(h->mb.i_mb_height - mb_y) : 32) >> v_shift;
if( b_end && !b_start )
height += 4 >> v_shift;
pix = frame->plane[i] + X264_MAX(0, (16*mb_y-4)*stride >> v_shift);
plane_expand_border( pix, stride, width, height, padh, padv, b_start, b_end, h_shift );
pix = frame->plane[i] + (starty*stride >> v_shift);
plane_expand_border( pix, stride, width, height, padh, padv, pad_top, pad_bot, h_shift );
}
else
{
pix = frame->plane[i] + X264_MAX(0, (16*mb_y-4)*stride >> v_shift);
plane_expand_border( pix, stride, width, height, padh, padv, b_start, b_end, h_shift );
pix = frame->plane[i] + (starty*stride >> v_shift);
plane_expand_border( pix, stride, width, height, padh, padv, pad_top, pad_bot, h_shift );
}
}
}
......@@ -619,6 +623,23 @@ void x264_frame_cond_wait( x264_frame_t *frame, int i_lines_completed )
x264_pthread_mutex_unlock( &frame->mutex );
}
/* Record `pass` as the current slice pass of thread context `h` and notify
 * waiters.
 *
 * The store to h->i_threadslice_pass is made while holding h->mutex.
 * Waiters on h->cv are woken only when `pass` is positive; a pass of 0
 * updates the stored value without signalling anyone. */
void x264_threadslice_cond_broadcast( x264_t *h, int pass )
{
    x264_pthread_mutex_lock( &h->mutex );

    h->i_threadslice_pass = pass;

    /* Nothing is broadcast for pass values <= 0. */
    if( !(pass <= 0) )
    {
        x264_pthread_cond_broadcast( &h->cv );
    }

    x264_pthread_mutex_unlock( &h->mutex );
}
/* Block the caller until thread context `h` has reached at least `pass`.
 *
 * The predicate h->i_threadslice_pass >= pass is evaluated under h->mutex
 * and re-evaluated after every return from the condition wait, so a wakeup
 * that does not satisfy the predicate simply goes back to waiting.
 * Returns immediately if the pass has already been reached. */
void x264_threadslice_cond_wait( x264_t *h, int pass )
{
    x264_pthread_mutex_lock( &h->mutex );

    for( ;; )
    {
        if( h->i_threadslice_pass >= pass )
            break;
        x264_pthread_cond_wait( &h->cv, &h->mutex );
    }

    x264_pthread_mutex_unlock( &h->mutex );
}
/* list operators */
void x264_frame_push( x264_frame_t **list, x264_frame_t *frame )
......
......@@ -207,7 +207,7 @@ void x264_frame_delete( x264_frame_t *frame );
int x264_frame_copy_picture( x264_t *h, x264_frame_t *dst, x264_picture_t *src );
void x264_frame_expand_border( x264_t *h, x264_frame_t *frame, int mb_y, int b_end );
void x264_frame_expand_border( x264_t *h, x264_frame_t *frame, int mb_y );
void x264_frame_expand_border_filtered( x264_t *h, x264_frame_t *frame, int mb_y, int b_end );
void x264_frame_expand_border_lowres( x264_frame_t *frame );
void x264_frame_expand_border_chroma( x264_t *h, x264_frame_t *frame, int plane );
......@@ -225,6 +225,9 @@ void x264_deblock_init( int cpu, x264_deblock_function_t *pf, int b_mba
void x264_frame_cond_broadcast( x264_frame_t *frame, int i_lines_completed );
void x264_frame_cond_wait( x264_frame_t *frame, int i_lines_completed );
void x264_threadslice_cond_broadcast( x264_t *h, int pass );
void x264_threadslice_cond_wait( x264_t *h, int pass );
void x264_frame_push( x264_frame_t **list, x264_frame_t *frame );
x264_frame_t *x264_frame_pop( x264_frame_t **list );
void x264_frame_unshift( x264_frame_t **list, x264_frame_t *frame );
......
......@@ -368,7 +368,17 @@ int x264_macroblock_thread_allocate( x264_t *h, int b_lookahead )
}
for( int i = 0; i <= PARAM_INTERLACED; i++ )
{
CHECKED_MALLOC( h->deblock_strength[i], sizeof(**h->deblock_strength) * h->mb.i_mb_width );
if( h->param.b_sliced_threads )
{
/* Only allocate the first one, and allocate it for the whole frame, because we
* won't be deblocking until after the frame is fully encoded. */
if( h == h->thread[0] && !i )
CHECKED_MALLOC( h->deblock_strength[0], sizeof(**h->deblock_strength) * h->mb.i_mb_count );
else
h->deblock_strength[i] = h->thread[0]->deblock_strength[0];
}
else
CHECKED_MALLOC( h->deblock_strength[i], sizeof(**h->deblock_strength) * h->mb.i_mb_width );
h->deblock_strength[1] = h->deblock_strength[i];
}
}
......@@ -401,7 +411,8 @@ void x264_macroblock_thread_free( x264_t *h, int b_lookahead )
if( !b_lookahead )
{
for( int i = 0; i <= PARAM_INTERLACED; i++ )
x264_free( h->deblock_strength[i] );
if( !h->param.b_sliced_threads || (h == h->thread[0] && !i) )
x264_free( h->deblock_strength[i] );
for( int i = 0; i < (PARAM_INTERLACED ? 5 : 2); i++ )
for( int j = 0; j < (CHROMA444 ? 3 : 2); j++ )
x264_free( h->intra_border_backup[i][j] - 16 );
......@@ -858,6 +869,8 @@ static void ALWAYS_INLINE x264_macroblock_cache_load( x264_t *h, int mb_x, int m
const x264_left_table_t *left_index_table = h->mb.left_index_table;
h->mb.cache.deblock_strength = h->deblock_strength[mb_y&1][h->param.b_sliced_threads?h->mb.i_mb_xy:mb_x];
/* load cache */
if( h->mb.i_neighbour & MB_TOP )
{
......@@ -1432,7 +1445,7 @@ static void x264_macroblock_deblock_strength_mbaff( x264_t *h, uint8_t (*bs)[8][
void x264_macroblock_deblock_strength( x264_t *h )
{
uint8_t (*bs)[8][4] = h->deblock_strength[h->mb.i_mb_y&1][h->mb.i_mb_x];
uint8_t (*bs)[8][4] = h->mb.cache.deblock_strength;
if( IS_INTRA( h->mb.i_type ) )
{
memset( bs[0][1], 3, 3*4*sizeof(uint8_t) );
......
......@@ -68,12 +68,28 @@ static double x264_ssim( double ssim )
return -10.0 * log10( inv_ssim );
}
/* Wait on every thread context that is still marked active.
 *
 * Walks h->thread[0 .. h->param.i_threads-1]; for each entry whose
 * b_thread_active flag is set, the flag is cleared and then
 * x264_threadpool_wait() is called on it.
 *
 * Returns 0 when every wait reported a non-negative result, or -1 as soon
 * as one wait reports a negative result. On that early return the
 * remaining threads are not waited on and keep their b_thread_active flag. */
static int x264_threadpool_wait_all( x264_t *h )
{
    for( int idx = 0; idx < h->param.i_threads; idx++ )
    {
        x264_t *worker = h->thread[idx];

        /* Idle contexts have nothing to wait on. */
        if( !worker->b_thread_active )
            continue;

        /* Clear the flag before waiting, as the wait consumes the job. */
        worker->b_thread_active = 0;

        intptr_t status = (intptr_t)x264_threadpool_wait( h->threadpool, worker );
        if( status < 0 )
            return -1;
    }

    return 0;
}
static void x264_frame_dump( x264_t *h )
{
FILE *f = fopen( h->param.psz_dump_yuv, "r+b" );
if( !f )
return;
/* Wait for the threads to finish deblocking */
if( h->param.b_sliced_threads )
x264_threadpool_wait_all( h );
/* Write the frame in display order */
int frame_size = FRAME_SIZE( h->param.i_height * h->param.i_width * sizeof(pixel) );
fseek( f, (uint64_t)h->fdec->i_frame * frame_size, SEEK_SET );
......@@ -921,9 +937,6 @@ static int x264_validate_parameters( x264_t *h, int b_open )
h->param.i_nal_hrd = X264_NAL_HRD_VBR;
}
if( h->param.psz_dump_yuv )
h->param.b_full_recon = 1;
/* ensure the booleans are 0 or 1 so they can be used in math */
#define BOOLIFY(x) h->param.x = !!h->param.x
BOOLIFY( b_cabac );
......@@ -1258,8 +1271,18 @@ x264_t *x264_encoder_open( x264_param_t *param )
goto fail;
h->thread[0] = h;
for( int i = 1; i < h->param.i_threads + !!h->param.i_sync_lookahead; i++ )
CHECKED_MALLOC( h->thread[i], sizeof(x264_t) );
for( int i = 0; i < h->param.i_threads + !!h->param.i_sync_lookahead; i++ )
{
if( i )
CHECKED_MALLOC( h->thread[i], sizeof(x264_t) );
if( i < h->param.i_threads )
{
if( x264_pthread_mutex_init( &h->thread[i]->mutex, NULL ) )
goto fail;
if( x264_pthread_cond_init( &h->thread[i]->cv, NULL ) )
goto fail;
}
}
for( int i = 0; i < h->param.i_threads; i++ )
{
......@@ -1354,6 +1377,11 @@ fail:
****************************************************************************/
int x264_encoder_reconfig( x264_t *h, x264_param_t *param )
{
/* If the previous frame isn't done encoding, reconfiguring is probably dangerous. */
if( h->param.b_sliced_threads )
if( x264_threadpool_wait_all( h ) < 0 )
return -1;
int rc_reconfig = 0;
h = h->thread[h->thread[0]->i_thread_phase];
x264_set_aspect_ratio( h, param, 0 );
......@@ -1830,7 +1858,7 @@ static inline void x264_reference_build_list( x264_t *h, int i_poc )
h->mb.pic.i_fref[1] = h->i_ref[1];
}
static void x264_fdec_filter_row( x264_t *h, int mb_y, int b_inloop )
static void x264_fdec_filter_row( x264_t *h, int mb_y, int pass )
{
/* mb_y is the mb to be encoded next, not the mb to be filtered here */
int b_hpel = h->fdec->b_kept_as_ref;
......@@ -1843,11 +1871,30 @@ static void x264_fdec_filter_row( x264_t *h, int mb_y, int b_inloop )
* above each MB, as bS=4 doesn't happen for the top of interlaced mbpairs. */
int minpix_y = min_y*16 - 4 * !b_start;
int maxpix_y = mb_y*16 - 4 * !b_end;
b_deblock &= b_hpel || h->param.b_full_recon;
if( h->param.b_sliced_threads && b_start && min_y && !b_inloop )
b_deblock &= b_hpel || h->param.b_full_recon || h->param.psz_dump_yuv;
if( h->param.b_sliced_threads )
{
b_deblock = 0; /* We already deblocked on the inloop pass. */
b_measure_quality = 0; /* We already measured quality on the inloop pass. */
switch( pass )
{
/* During encode: only do deblock if asked for */
default:
case 0:
b_deblock &= h->param.b_full_recon;
b_hpel = 0;
break;
/* During post-encode pass: do deblock if not done yet, do hpel for all
* rows except those between slices. */
case 1:
b_deblock &= !h->param.b_full_recon;
b_hpel &= !(b_start && min_y > 0);
b_measure_quality = 0;
break;
/* Final pass: do the rows between slices in sequence. */
case 2:
b_deblock = 0;
b_measure_quality = 0;
break;
}
}
if( mb_y & SLICE_MBAFF )
return;
......@@ -1861,17 +1908,19 @@ static void x264_fdec_filter_row( x264_t *h, int mb_y, int b_inloop )
/* FIXME: Prediction requires different borders for interlaced/progressive mc,
* but the actual image data is equivalent. For now, maintain this
* consistency by copying deblocked pixels between planes. */
if( PARAM_INTERLACED )
if( PARAM_INTERLACED && (!h->param.b_sliced_threads || pass == 1) )
for( int p = 0; p < h->fdec->i_plane; p++ )
for( int i = minpix_y>>(CHROMA_V_SHIFT && p); i < maxpix_y>>(CHROMA_V_SHIFT && p); i++ )
memcpy( h->fdec->plane_fld[p] + i*h->fdec->i_stride[p],
h->fdec->plane[p] + i*h->fdec->i_stride[p],
h->mb.i_mb_width*16*sizeof(pixel) );
if( h->fdec->b_kept_as_ref && (!h->param.b_sliced_threads || pass == 1) )
x264_frame_expand_border( h, h->fdec, min_y );
if( b_hpel )
{
int end = mb_y == h->mb.i_mb_height;
x264_frame_expand_border( h, h->fdec, min_y, end );
/* Can't do hpel until the previous slice is done encoding. */
if( h->param.analyse.i_subpel_refine )
{
x264_frame_filter( h, h->fdec, min_y, end );
......@@ -1879,7 +1928,7 @@ static void x264_fdec_filter_row( x264_t *h, int mb_y, int b_inloop )
}
}
if( SLICE_MBAFF )
if( SLICE_MBAFF && pass == 0 )
for( int i = 0; i < 3; i++ )
{
XCHG( pixel *, h->intra_border_backup[0][i], h->intra_border_backup[3][i] );
......@@ -2148,7 +2197,7 @@ static int x264_slice_write( x264_t *h )
int orig_last_mb = h->sh.i_last_mb;
uint8_t *last_emu_check;
x264_bs_bak_t bs_bak[2];
b_deblock &= b_hpel || h->param.b_full_recon;
b_deblock &= b_hpel || h->param.b_full_recon || h->param.psz_dump_yuv;
bs_realign( &h->out.bs );
/* Slice */
......@@ -2200,7 +2249,7 @@ static int x264_slice_write( x264_t *h )
if( !(i_mb_y & SLICE_MBAFF) && h->param.rc.i_vbv_buffer_size )
x264_bitstream_backup( h, &bs_bak[1], i_skip, 1 );
if( !h->mb.b_reencode_mb )
x264_fdec_filter_row( h, i_mb_y, 1 );
x264_fdec_filter_row( h, i_mb_y, 0 );
}
if( !(i_mb_y & SLICE_MBAFF) && back_up_bitstream )
......@@ -2447,7 +2496,23 @@ reencode:
+ (h->out.i_nal*NALU_OVERHEAD * 8)
- h->stat.frame.i_tex_bits
- h->stat.frame.i_mv_bits;
x264_fdec_filter_row( h, h->i_threadslice_end, 1 );
x264_fdec_filter_row( h, h->i_threadslice_end, 0 );
if( h->param.b_sliced_threads )
{
/* Tell the main thread we're done. */
x264_threadslice_cond_broadcast( h, 1 );
/* Do hpel now */
for( int mb_y = h->i_threadslice_start; mb_y <= h->i_threadslice_end; mb_y++ )
x264_fdec_filter_row( h, mb_y, 1 );
x264_threadslice_cond_broadcast( h, 2 );
/* Do the first row of hpel, now that the previous slice is done */
if( h->i_thread_idx > 0 )
{
x264_threadslice_cond_wait( h->thread[h->i_thread_idx-1], 2 );
x264_fdec_filter_row( h, h->i_threadslice_start + (1 << SLICE_MBAFF), 2 );
}
}
}
return 0;
......@@ -2488,7 +2553,7 @@ static void *x264_slices_write( x264_t *h )
#if HAVE_VISUALIZE
if( h->param.b_visualize )
if( x264_visualize_init( h ) )
return (void *)-1;
goto fail;
#endif
/* init stats */
......@@ -2521,7 +2586,7 @@ static void *x264_slices_write( x264_t *h )
}
h->sh.i_last_mb = X264_MIN( h->sh.i_last_mb, last_thread_mb );
if( x264_stack_align( x264_slice_write, h ) )
return (void *)-1;
goto fail;
h->sh.i_first_mb = h->sh.i_last_mb + 1;
// if i_first_mb is not the last mb in a row then go to the next mb in MBAFF order
if( SLICE_MBAFF && h->sh.i_first_mb % h->mb.i_mb_width )
......@@ -2537,6 +2602,12 @@ static void *x264_slices_write( x264_t *h )
#endif
return (void *)0;
fail:
/* Tell the other threads we're done, so they don't wait for us */
if( h->param.b_sliced_threads )
x264_threadslice_cond_broadcast( h, 2 );
return (void *)-1;
}
static int x264_threaded_slices_write( x264_t *h )
......@@ -2561,26 +2632,19 @@ static int x264_threaded_slices_write( x264_t *h )
x264_threads_distribute_ratecontrol( h );
/* dispatch */
/* setup */
for( int i = 0; i < h->param.i_threads; i++ )
{
x264_threadpool_run( h->threadpool, (void*)x264_slices_write, h->thread[i] );
h->thread[i]->i_thread_idx = i;
h->thread[i]->b_thread_active = 1;
x264_threadslice_cond_broadcast( h->thread[i], 0 );
}
/* dispatch */
for( int i = 0; i < h->param.i_threads; i++ )
{
h->thread[i]->b_thread_active = 0;
if( (intptr_t)x264_threadpool_wait( h->threadpool, h->thread[i] ) )
return -1;
}
/* Go back and fix up the hpel on the borders between slices. */
for( int i = 1; i < h->param.i_threads; i++ )
{
x264_fdec_filter_row( h->thread[i], h->thread[i]->i_threadslice_start + 1, 0 );
if( SLICE_MBAFF )
x264_fdec_filter_row( h->thread[i], h->thread[i]->i_threadslice_start + 2, 0 );
}
x264_threadpool_run( h->threadpool, (void*)x264_slices_write, h->thread[i] );
/* wait */
for( int i = 0; i < h->param.i_threads; i++ )
x264_threadslice_cond_wait( h->thread[i], 1 );
x264_threads_merge_ratecontrol( h );
......@@ -2677,11 +2741,6 @@ int x264_encoder_encode( x264_t *h,
x264_cpu_mask_misalign_sse();
#endif
// ok to call this before encoding any frames, since the initial values of fdec have b_kept_as_ref=0
if( x264_reference_update( h ) )
return -1;
h->fdec->i_lines_completed = -1;
/* no data out */
*pi_nal = 0;
*pp_nal = NULL;
......@@ -2777,6 +2836,12 @@ int x264_encoder_encode( x264_t *h,
/* ------------------- Get frame to be encoded ------------------------- */
/* 4: get picture to encode */
h->fenc = x264_frame_shift( h->frames.current );
/* If applicable, wait for previous frame reconstruction to finish */
if( h->param.b_sliced_threads )
if( x264_threadpool_wait_all( h ) < 0 )
return -1;
if( h->i_frame == h->i_thread_frames - 1 )
h->i_reordered_pts_delay = h->fenc->i_reordered_pts;
if( h->fenc->param )
......@@ -2786,6 +2851,11 @@ int x264_encoder_encode( x264_t *h,
h->fenc->param->param_free( h->fenc->param );
}
// ok to call this before encoding any frames, since the initial values of fdec have b_kept_as_ref=0
if( x264_reference_update( h ) )
return -1;
h->fdec->i_lines_completed = -1;
if( !IS_X264_TYPE_I( h->fenc->i_type ) )
{
int valid_refs_left = 0;
......@@ -3117,7 +3187,7 @@ static int x264_encoder_frame_end( x264_t *h, x264_t *thread_current,
{
char psz_message[80];
if( h->b_thread_active )
if( !h->param.b_sliced_threads && h->b_thread_active )
{
h->b_thread_active = 0;
if( (intptr_t)x264_threadpool_wait( h->threadpool, h ) )
......@@ -3381,6 +3451,8 @@ void x264_encoder_close ( x264_t *h )
x264_lookahead_delete( h );
if( h->param.b_sliced_threads )
x264_threadpool_wait_all( h );
if( h->param.i_threads > 1 )
x264_threadpool_delete( h->threadpool );
if( h->i_thread_frames > 1 )
......@@ -3675,7 +3747,7 @@ void x264_encoder_close ( x264_t *h )
x264_free( h->nal_buffer );
x264_analyse_free_costs( h );
if( h->i_thread_frames > 1)
if( h->i_thread_frames > 1 )
h = h->thread[h->i_thread_phase];
/* frames */
......@@ -3717,7 +3789,9 @@ void x264_encoder_close ( x264_t *h )
}
x264_macroblock_thread_free( h->thread[i], 0 );
x264_free( h->thread[i]->out.p_bitstream );
x264_free( h->thread[i]->out.nal);
x264_free( h->thread[i]->out.nal );
x264_pthread_mutex_destroy( &h->thread[i]->mutex );
x264_pthread_cond_destroy( &h->thread[i]->cv );
x264_free( h->thread[i] );
}
}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment