Commit 6f221210 authored by Fiona Glaser

Bring back slice-based threading support

Enabled with --sliced-threads
Unlike normal threading, adds no encoding latency.
Less efficient than normal threading in terms of both performance and compression.
Useful for low-latency encoding environments where performance is still important, such as HD videoconferencing.
Add --tune zerolatency, which eliminates all x264 encoder-side latency (no delayed frames at all).
Some tweaks to VBV ratecontrol and lookahead (in addition to those required by sliced threading).
Commit sponsored by a media streaming company that wishes to remain anonymous.
parent a2380187
......@@ -281,6 +281,8 @@ int x264_param_parse( x264_param_t *p, const char *name, const char *value )
else
p->i_threads = atoi(value);
}
OPT("sliced-threads")
p->b_sliced_threads = atobool(value);
OPT("sync-lookahead")
{
if( !strcmp(value, "auto") )
......@@ -888,6 +890,7 @@ char *x264_param2string( x264_param_t *p, int b_res )
s += sprintf( s, " deadzone=%d,%d", p->analyse.i_luma_deadzone[0], p->analyse.i_luma_deadzone[1] );
s += sprintf( s, " chroma_qp_offset=%d", p->analyse.i_chroma_qp_offset );
s += sprintf( s, " threads=%d", p->i_threads );
s += sprintf( s, " sliced_threads=%d", p->b_sliced_threads );
if( p->i_slice_count )
s += sprintf( s, " slices=%d", p->i_slice_count );
if( p->i_slice_max_size )
......
......@@ -341,6 +341,8 @@ struct x264_t
x264_pthread_t thread_handle;
int b_thread_active;
int i_thread_phase; /* which thread to use for the next frame */
int i_threadslice_start; /* first row in this thread slice */
int i_threadslice_end; /* row after the end of this thread slice */
/* bitstream output */
struct
......
......@@ -768,42 +768,6 @@ int x264_macroblock_cache_init( x264_t *h )
memset( h->mb.cache.ref[0], -2, X264_SCAN8_SIZE * sizeof( int8_t ) );
memset( h->mb.cache.ref[1], -2, X264_SCAN8_SIZE * sizeof( int8_t ) );
/* fdec: fenc:
* yyyyyyy
* yYYYY YYYY
* yYYYY YYYY
* yYYYY YYYY
* yYYYY YYYY
* uuu vvv UUVV
* uUU vVV UUVV
* uUU vVV
*/
h->mb.pic.p_fenc[0] = h->mb.pic.fenc_buf;
h->mb.pic.p_fenc[1] = h->mb.pic.fenc_buf + 16*FENC_STRIDE;
h->mb.pic.p_fenc[2] = h->mb.pic.fenc_buf + 16*FENC_STRIDE + 8;
h->mb.pic.p_fdec[0] = h->mb.pic.fdec_buf + 2*FDEC_STRIDE;
h->mb.pic.p_fdec[1] = h->mb.pic.fdec_buf + 19*FDEC_STRIDE;
h->mb.pic.p_fdec[2] = h->mb.pic.fdec_buf + 19*FDEC_STRIDE + 16;
h->mb.i_neighbour4[6] =
h->mb.i_neighbour4[9] =
h->mb.i_neighbour4[12] =
h->mb.i_neighbour4[14] = MB_LEFT|MB_TOP|MB_TOPLEFT|MB_TOPRIGHT;
h->mb.i_neighbour4[3] =
h->mb.i_neighbour4[7] =
h->mb.i_neighbour4[11] =
h->mb.i_neighbour4[13] =
h->mb.i_neighbour4[15] =
h->mb.i_neighbour8[3] = MB_LEFT|MB_TOP|MB_TOPLEFT;
int buf_hpel = (h->param.i_width+48) * sizeof(int16_t);
int buf_ssim = h->param.analyse.b_ssim * 8 * (h->param.i_width/4+3) * sizeof(int);
int me_range = X264_MIN(h->param.analyse.i_me_range, h->param.analyse.i_mv_range);
int buf_tesa = (h->param.analyse.i_me_method >= X264_ME_ESA) *
((me_range*2+18) * sizeof(int16_t) + (me_range+4) * (me_range+1) * 4 * sizeof(mvsad_t));
int buf_mbtree = h->param.rc.b_mb_tree * ((h->sps->i_mb_width+3)&~3) * sizeof(int);
CHECKED_MALLOC( h->scratch_buffer, X264_MAX4( buf_hpel, buf_ssim, buf_tesa, buf_mbtree ) );
return 0;
fail: return -1;
}
......@@ -832,7 +796,6 @@ void x264_macroblock_cache_end( x264_t *h )
x264_free( h->mb.skipbp );
x264_free( h->mb.cbp );
x264_free( h->mb.qp );
x264_free( h->scratch_buffer );
}
void x264_macroblock_slice_init( x264_t *h )
{
......@@ -871,6 +834,34 @@ void x264_macroblock_slice_init( x264_t *h )
memset( h->mb.cache.skip, 0, X264_SCAN8_SIZE * sizeof( int8_t ) );
setup_inverse_delta_pocs( h );
/* fdec: fenc:
* yyyyyyy
* yYYYY YYYY
* yYYYY YYYY
* yYYYY YYYY
* yYYYY YYYY
* uuu vvv UUVV
* uUU vVV UUVV
* uUU vVV
*/
h->mb.pic.p_fenc[0] = h->mb.pic.fenc_buf;
h->mb.pic.p_fenc[1] = h->mb.pic.fenc_buf + 16*FENC_STRIDE;
h->mb.pic.p_fenc[2] = h->mb.pic.fenc_buf + 16*FENC_STRIDE + 8;
h->mb.pic.p_fdec[0] = h->mb.pic.fdec_buf + 2*FDEC_STRIDE;
h->mb.pic.p_fdec[1] = h->mb.pic.fdec_buf + 19*FDEC_STRIDE;
h->mb.pic.p_fdec[2] = h->mb.pic.fdec_buf + 19*FDEC_STRIDE + 16;
h->mb.i_neighbour4[6] =
h->mb.i_neighbour4[9] =
h->mb.i_neighbour4[12] =
h->mb.i_neighbour4[14] = MB_LEFT|MB_TOP|MB_TOPLEFT|MB_TOPRIGHT;
h->mb.i_neighbour4[3] =
h->mb.i_neighbour4[7] =
h->mb.i_neighbour4[11] =
h->mb.i_neighbour4[13] =
h->mb.i_neighbour4[15] =
h->mb.i_neighbour8[3] = MB_LEFT|MB_TOP|MB_TOPLEFT;
}
void x264_prefetch_fenc( x264_t *h, x264_frame_t *fenc, int i_mb_x, int i_mb_y )
......@@ -899,8 +890,10 @@ static void ALWAYS_INLINE x264_macroblock_load_pic_pointers( x264_t *h, int i_mb
const int i_pix_offset = h->mb.b_interlaced
? w * (i_mb_x + (i_mb_y&~1) * i_stride) + (i_mb_y&1) * i_stride
: w * (i_mb_x + i_mb_y * i_stride);
const uint8_t *plane_fdec = &h->fdec->plane[i][i_pix_offset];
const uint8_t *intra_fdec = h->param.b_sliced_threads ? plane_fdec-i_stride2 :
&h->mb.intra_border_backup[i_mb_y & h->sh.b_mbaff][i][i_mb_x*16>>!!i];
int ref_pix_offset[2] = { i_pix_offset, i_pix_offset };
const uint8_t *intra_fdec = &h->mb.intra_border_backup[i_mb_y & h->sh.b_mbaff][i][i_mb_x*16>>!!i];
x264_frame_t **fref[2] = { h->fref0, h->fref1 };
int j, k;
if( h->mb.b_interlaced )
......@@ -909,13 +902,13 @@ static void ALWAYS_INLINE x264_macroblock_load_pic_pointers( x264_t *h, int i_mb
h->mb.pic.p_fenc_plane[i] = &h->fenc->plane[i][i_pix_offset];
h->mc.copy[i?PIXEL_8x8:PIXEL_16x16]( h->mb.pic.p_fenc[i], FENC_STRIDE,
h->mb.pic.p_fenc_plane[i], i_stride2, w );
memcpy( &h->mb.pic.p_fdec[i][-1-FDEC_STRIDE], intra_fdec-1, w*3/2+1 );
if( i_mb_y > 0 )
memcpy( &h->mb.pic.p_fdec[i][-1-FDEC_STRIDE], intra_fdec-1, w*3/2+1 );
else
memset( &h->mb.pic.p_fdec[i][-1-FDEC_STRIDE], 0, w*3/2+1 );
if( h->mb.b_interlaced || h->mb.b_reencode_mb )
{
const uint8_t *plane_fdec = &h->fdec->plane[i][i_pix_offset];
for( j = 0; j < w; j++ )
h->mb.pic.p_fdec[i][-1+j*FDEC_STRIDE] = plane_fdec[-1+j*i_stride2];
}
for( j = 0; j < h->mb.pic.i_fref[0]; j++ )
{
h->mb.pic.p_fref[0][j][i==0 ? 0:i+3] = &fref[0][j >> h->mb.b_interlaced]->plane[i][ref_pix_offset[j&1]];
......
......@@ -285,6 +285,38 @@ void x264_analyse_free_costs( x264_t *h )
}
}
/* Extend the weighted copies of the list-0 reference planes down to (roughly)
 * luma row `end`.
 *
 * The first reference that has a luma weight function determines how many new
 * rows are needed; those rows are then weighted for that reference and for
 * every later reference that also has a weight function.  Only that first
 * weighted reference is used to compute the row range.
 *
 * h->fenc->i_lines_weighted records how many (padded) rows have already been
 * produced, so repeated calls only process the newly required rows. */
void x264_analyse_weight_frame( x264_t *h, int end )
{
    int ref;
    for( ref = 0; ref < h->i_ref0; ref++ )
    {
        /* look for the first reference with an active luma weight */
        if( !h->sh.weight[ref][0].weightfn )
            continue;

        x264_frame_t *ref_frame = h->fref0[ref];
        int padv = PADV << h->param.b_interlaced;
        int padded_width = ref_frame->i_width[0] + 2*PADH;
        /* top-left corner of the padded source plane */
        uint8_t *src_origin = ref_frame->filtered[0] - ref_frame->i_stride[0]*padv - PADH;
        /* rows wanted in total, capped at the full padded plane height */
        int rows_wanted = X264_MIN( 16 + end + padv, h->fref0[ref]->i_lines[0] + padv*2 );
        int new_rows = rows_wanted - h->fenc->i_lines_weighted;
        int start = h->fenc->i_lines_weighted*ref_frame->i_stride[0];
        int other;

        h->fenc->i_lines_weighted += new_rows;

        if( new_rows )
        {
            for( other = ref; other < h->i_ref0; other++ )
            {
                if( !h->sh.weight[other][0].weightfn )
                    continue;
                uint8_t *dst_origin = h->fenc->weighted[other] - h->fenc->i_stride[0]*padv - PADH;
                x264_weight_scale_plane( h, dst_origin + start, ref_frame->i_stride[0],
                                         src_origin + start, ref_frame->i_stride[0],
                                         padded_width, new_rows, &h->sh.weight[other][0] );
            }
        }
        /* everything was handled from the first weighted reference */
        return;
    }
}
/* initialize an array of lambda*nbits for all possible mvs */
static void x264_mb_analyse_load_costs( x264_t *h, x264_mb_analysis_t *a )
{
......@@ -361,13 +393,13 @@ static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int i_qp )
h->mb.mv_max_spel[0] = CLIP_FMV( h->mb.mv_max[0] );
h->mb.mv_min_fpel[0] = (h->mb.mv_min_spel[0]>>2) + i_fpel_border;
h->mb.mv_max_fpel[0] = (h->mb.mv_max_spel[0]>>2) - i_fpel_border;
if( h->mb.i_mb_x == 0)
if( h->mb.i_mb_x == 0 )
{
int mb_y = h->mb.i_mb_y >> h->sh.b_mbaff;
int mb_height = h->sps->i_mb_height >> h->sh.b_mbaff;
int thread_mvy_range = i_fmv_range;
if( h->param.i_threads > 1 )
if( h->param.i_threads > 1 && !h->param.b_sliced_threads )
{
int pix_y = (h->mb.i_mb_y | h->mb.b_interlaced) * 16;
int thresh = pix_y + h->param.analyse.i_mv_range_thread;
......@@ -387,33 +419,7 @@ static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int i_qp )
if( h->mb.b_interlaced )
thread_mvy_range >>= 1;
for( j=0; j<h->i_ref0; j++ )
{
if( h->sh.weight[j][0].weightfn )
{
x264_frame_t *frame = h->fref0[j];
int width = frame->i_width[0] + 2*PADH;
int i_padv = PADV << h->param.b_interlaced;
int offset, height;
uint8_t *src = frame->filtered[0] - frame->i_stride[0]*i_padv - PADH;
int k;
height = X264_MIN( 16 + thread_mvy_range + pix_y + i_padv, h->fref0[j]->i_lines[0] + i_padv*2 ) - h->fenc->i_lines_weighted;
offset = h->fenc->i_lines_weighted*frame->i_stride[0];
h->fenc->i_lines_weighted += height;
if( height )
{
for( k = j; k < h->i_ref0; k++ )
if( h->sh.weight[k][0].weightfn )
{
uint8_t *dst = h->fenc->weighted[k] - h->fenc->i_stride[0]*i_padv - PADH;
x264_weight_scale_plane( h, dst + offset, frame->i_stride[0],
src + offset, frame->i_stride[0],
width, height, &h->sh.weight[k][0] );
}
}
break;
}
}
x264_analyse_weight_frame( h, pix_y + thread_mvy_range );
}
h->mb.mv_min[1] = 4*( -16*mb_y - 24 );
......@@ -1247,7 +1253,7 @@ static void x264_mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a )
{
h->mb.i_type = P_SKIP;
x264_analyse_update_cache( h, a );
assert( h->mb.cache.pskip_mv[1] <= h->mb.mv_max_spel[1] || h->param.i_threads == 1 );
assert( h->mb.cache.pskip_mv[1] <= h->mb.mv_max_spel[1] || h->param.i_threads == 1 || h->param.b_sliced_threads );
return;
}
......@@ -1263,7 +1269,7 @@ static void x264_mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a )
}
x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref );
assert( a->l0.me16x16.mv[1] <= h->mb.mv_max_spel[1] || h->param.i_threads == 1 );
assert( a->l0.me16x16.mv[1] <= h->mb.mv_max_spel[1] || h->param.i_threads == 1 || h->param.b_sliced_threads );
h->mb.i_type = P_L0;
if( a->i_mbrd )
......@@ -2419,7 +2425,7 @@ void x264_macroblock_analyse( x264_t *h )
analysis.b_try_pskip = 0;
if( h->param.analyse.b_fast_pskip )
{
if( h->param.i_threads > 1 && h->mb.cache.pskip_mv[1] > h->mb.mv_max_spel[1] )
if( h->param.i_threads > 1 && !h->param.b_sliced_threads && h->mb.cache.pskip_mv[1] > h->mb.mv_max_spel[1] )
// FIXME don't need to check this if the reference frame is done
{}
else if( h->param.analyse.i_subpel_refine >= 3 )
......@@ -2437,7 +2443,7 @@ void x264_macroblock_analyse( x264_t *h )
{
h->mb.i_type = P_SKIP;
h->mb.i_partition = D_16x16;
assert( h->mb.cache.pskip_mv[1] <= h->mb.mv_max_spel[1] || h->param.i_threads == 1 );
assert( h->mb.cache.pskip_mv[1] <= h->mb.mv_max_spel[1] || h->param.i_threads == 1 || h->param.b_sliced_threads );
}
else
{
......@@ -3145,7 +3151,7 @@ static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a )
}
#ifndef NDEBUG
if( h->param.i_threads > 1 && !IS_INTRA(h->mb.i_type) )
if( h->param.i_threads > 1 && !h->param.b_sliced_threads && !IS_INTRA(h->mb.i_type) )
{
int l;
for( l=0; l <= (h->sh.i_type == SLICE_TYPE_B); l++ )
......
......@@ -26,6 +26,7 @@
int x264_analyse_init_costs( x264_t *h, int qp );
void x264_analyse_free_costs( x264_t *h );
void x264_analyse_weight_frame( x264_t *h, int end );
void x264_macroblock_analyse( x264_t *h );
void x264_slicetype_decide( x264_t *h );
......
......@@ -31,6 +31,7 @@
#include "analyse.h"
#include "ratecontrol.h"
#include "macroblock.h"
#include "me.h"
#if VISUALIZE
#include "common/visualize.h"
......@@ -409,7 +410,16 @@ static int x264_validate_parameters( x264_t *h )
x264_log( h, X264_LOG_WARNING, "not compiled with pthread support!\n");
h->param.i_threads = 1;
#endif
/* Avoid absurdly small thread slices as they can reduce performance
* and VBV compliance. Capped at an arbitrary 4 rows per thread. */
if( h->param.b_sliced_threads )
{
int max_threads = (h->param.i_height+15)/16 / 4;
h->param.i_threads = X264_MIN( h->param.i_threads, max_threads );
}
}
else
h->param.b_sliced_threads = 0;
if( h->param.b_interlaced )
{
......@@ -497,21 +507,26 @@ static int x264_validate_parameters( x264_t *h )
h->param.rc.i_qp_min = x264_clip3( h->param.rc.i_qp_min, 0, h->param.rc.i_qp_max );
int max_slices = (h->param.i_height+((16<<h->param.b_interlaced)-1))/(16<<h->param.b_interlaced);
h->param.i_slice_count = x264_clip3( h->param.i_slice_count, 0, max_slices );
h->param.i_slice_max_size = X264_MAX( h->param.i_slice_max_size, 0 );
h->param.i_slice_max_mbs = X264_MAX( h->param.i_slice_max_mbs, 0 );
if( h->param.b_interlaced && h->param.i_slice_max_size )
{
x264_log( h, X264_LOG_WARNING, "interlaced + slice-max-size is not implemented\n" );
h->param.i_slice_max_size = 0;
}
if( h->param.b_interlaced && h->param.i_slice_max_mbs )
if( h->param.b_sliced_threads )
h->param.i_slice_count = x264_clip3( h->param.i_threads, 0, max_slices );
else
{
x264_log( h, X264_LOG_WARNING, "interlaced + slice-max-mbs is not implemented\n" );
h->param.i_slice_max_mbs = 0;
h->param.i_slice_count = x264_clip3( h->param.i_slice_count, 0, max_slices );
h->param.i_slice_max_size = X264_MAX( h->param.i_slice_max_size, 0 );
h->param.i_slice_max_mbs = X264_MAX( h->param.i_slice_max_mbs, 0 );
if( h->param.b_interlaced && h->param.i_slice_max_size )
{
x264_log( h, X264_LOG_WARNING, "interlaced + slice-max-size is not implemented\n" );
h->param.i_slice_max_size = 0;
}
if( h->param.b_interlaced && h->param.i_slice_max_mbs )
{
x264_log( h, X264_LOG_WARNING, "interlaced + slice-max-mbs is not implemented\n" );
h->param.i_slice_max_mbs = 0;
}
if( h->param.i_slice_max_mbs || h->param.i_slice_max_size )
h->param.i_slice_count = 0;
}
if( h->param.i_slice_max_mbs || h->param.i_slice_max_size )
h->param.i_slice_count = 0;
h->param.i_frame_reference = x264_clip3( h->param.i_frame_reference, 1, 16 );
if( h->param.i_keyint_max <= 0 )
......@@ -553,7 +568,7 @@ static int x264_validate_parameters( x264_t *h )
#ifdef HAVE_PTHREAD
if( h->param.i_sync_lookahead )
h->param.i_sync_lookahead = x264_clip3( h->param.i_sync_lookahead, h->param.i_threads + h->param.i_bframe, X264_LOOKAHEAD_MAX );
if( h->param.rc.b_stat_read || h->param.i_threads == 1 )
if( h->param.rc.b_stat_read || h->param.i_threads == 1 || h->param.b_sliced_threads )
h->param.i_sync_lookahead = 0;
#else
h->param.i_sync_lookahead = 0;
......@@ -676,7 +691,7 @@ static int x264_validate_parameters( x264_t *h )
if( !h->param.analyse.i_weighted_pred && h->param.rc.b_mb_tree && h->param.analyse.b_psy && !h->param.b_interlaced )
h->param.analyse.i_weighted_pred = X264_WEIGHTP_FAKE;
if( h->param.i_threads > 1 )
if( h->param.i_threads > 1 && !h->param.b_sliced_threads )
{
int r = h->param.analyse.i_mv_range_thread;
int r2;
......@@ -851,7 +866,8 @@ x264_t *x264_encoder_open( x264_param_t *param )
if( h->param.rc.b_mb_tree || h->param.rc.i_vbv_buffer_size )
h->frames.i_delay = X264_MAX( h->frames.i_delay, h->param.rc.i_lookahead );
i_slicetype_length = h->frames.i_delay;
h->frames.i_delay += h->param.i_threads - 1;
if( !h->param.b_sliced_threads )
h->frames.i_delay += h->param.i_threads - 1;
h->frames.i_delay = X264_MIN( h->frames.i_delay, X264_LOOKAHEAD_MAX );
h->frames.i_delay += h->param.i_sync_lookahead;
......@@ -944,23 +960,45 @@ x264_t *x264_encoder_open( x264_param_t *param )
for( i = 1; i < h->param.i_threads + !!h->param.i_sync_lookahead; i++ )
CHECKED_MALLOC( h->thread[i], sizeof(x264_t) );
if( x264_lookahead_init( h, i_slicetype_length ) )
goto fail;
for( i = 0; i < h->param.i_threads; i++ )
{
int init_nal_count = h->param.i_slice_count + 3;
int allocate_threadlocal_data = !h->param.b_sliced_threads || !i;
if( i > 0 )
*h->thread[i] = *h;
h->thread[i]->fdec = x264_frame_pop_unused( h, 1 );
if( !h->thread[i]->fdec )
goto fail;
if( allocate_threadlocal_data )
{
h->thread[i]->fdec = x264_frame_pop_unused( h, 1 );
if( !h->thread[i]->fdec )
goto fail;
}
else
h->thread[i]->fdec = h->thread[0]->fdec;
CHECKED_MALLOC( h->thread[i]->out.p_bitstream, h->out.i_bitstream );
/* Start each thread with room for 8 NAL units; it'll realloc later if needed. */
CHECKED_MALLOC( h->thread[i]->out.nal, 8*sizeof(x264_nal_t) );
h->thread[i]->out.i_nals_allocated = 8;
if( x264_macroblock_cache_init( h->thread[i] ) < 0 )
/* Start each thread with room for init_nal_count NAL units; it'll realloc later if needed. */
CHECKED_MALLOC( h->thread[i]->out.nal, init_nal_count*sizeof(x264_nal_t) );
h->thread[i]->out.i_nals_allocated = init_nal_count;
if( allocate_threadlocal_data && x264_macroblock_cache_init( h->thread[i] ) < 0 )
goto fail;
}
if( x264_lookahead_init( h, i_slicetype_length ) )
goto fail;
/* Allocate scratch buffer */
for( i = 0; i < h->param.i_threads + !!h->param.i_sync_lookahead; i++ )
{
int buf_hpel = (h->param.i_width+48) * sizeof(int16_t);
int buf_ssim = h->param.analyse.b_ssim * 8 * (h->param.i_width/4+3) * sizeof(int);
int me_range = X264_MIN(h->param.analyse.i_me_range, h->param.analyse.i_mv_range);
int buf_tesa = (h->param.analyse.i_me_method >= X264_ME_ESA) *
((me_range*2+18) * sizeof(int16_t) + (me_range+4) * (me_range+1) * 4 * sizeof(mvsad_t));
int buf_mbtree = h->param.rc.b_mb_tree * ((h->sps->i_mb_width+3)&~3) * sizeof(int);
CHECKED_MALLOC( h->thread[i]->scratch_buffer, X264_MAX4( buf_hpel, buf_ssim, buf_tesa, buf_mbtree ) );
}
if( x264_ratecontrol_new( h ) < 0 )
goto fail;
......@@ -1009,8 +1047,8 @@ int x264_encoder_reconfig( x264_t *h, x264_param_t *param )
COPY( b_deblocking_filter );
COPY( i_deblocking_filter_alphac0 );
COPY( i_deblocking_filter_beta );
COPY( analyse.intra );
COPY( analyse.inter );
COPY( analyse.intra );
COPY( analyse.i_direct_mv_pred );
/* Scratch buffer prevents me_range from being increased for esa/tesa */
if( h->param.analyse.i_me_method < X264_ME_ESA || param->analyse.i_me_range < h->param.analyse.i_me_range )
......@@ -1056,13 +1094,9 @@ static void x264_nal_start( x264_t *h, int i_type, int i_ref_idc )
nal->i_payload= 0;
nal->p_payload= &h->out.p_bitstream[bs_pos( &h->out.bs ) / 8];
}
static int x264_nal_end( x264_t *h )
/* if number of allocated nals is not enough, re-allocate a larger one. */
static int x264_nal_check_buffer( x264_t *h )
{
x264_nal_t *nal = &h->out.nal[h->out.i_nal];
nal->i_payload = &h->out.p_bitstream[bs_pos( &h->out.bs ) / 8] - nal->p_payload;
h->out.i_nal++;
/* if number of allocated nals is not enough, re-allocate a larger one. */
if( h->out.i_nal >= h->out.i_nals_allocated )
{
x264_nal_t *new_out = x264_malloc( sizeof(x264_nal_t) * (h->out.i_nals_allocated*2) );
......@@ -1075,6 +1109,14 @@ static int x264_nal_end( x264_t *h )
}
return 0;
}
/* Close the NAL unit currently being written: record its payload size as the
 * distance from its start to the current byte position of the bitstream
 * writer, advance the NAL counter, and make sure the NAL array still has
 * room for another entry.
 * Returns whatever x264_nal_check_buffer() returns. */
static int x264_nal_end( x264_t *h )
{
    x264_nal_t *cur_nal = h->out.nal + h->out.i_nal;

    cur_nal->i_payload = &h->out.p_bitstream[bs_pos( &h->out.bs ) / 8] - cur_nal->p_payload;
    h->out.i_nal += 1;

    return x264_nal_check_buffer( h );
}
static int x264_encoder_encapsulate_nals( x264_t *h )
{
......@@ -1396,7 +1438,7 @@ static void x264_fdec_filter_row( x264_t *h, int mb_y )
if( min_y < 0 )
return;
if( !b_end )
if( !b_end && !h->param.b_sliced_threads )
{
int i, j;
for( j=0; j<=h->sh.b_mbaff; j++ )
......@@ -1425,10 +1467,8 @@ static void x264_fdec_filter_row( x264_t *h, int mb_y )
}
}
if( h->param.i_threads > 1 && h->fdec->b_kept_as_ref )
{
if( h->param.i_threads > 1 && h->fdec->b_kept_as_ref && !h->param.b_sliced_threads )
x264_frame_cond_broadcast( h->fdec, mb_y*16 + (b_end ? 10000 : -(X264_THREAD_HEIGHT << h->sh.b_mbaff)) );
}
min_y = X264_MAX( min_y*16-8, 0 );
max_y = b_end ? h->param.i_height : mb_y*16-8;
......@@ -1463,7 +1503,7 @@ static inline int x264_reference_update( x264_t *h )
int i, j;
if( !h->fdec->b_kept_as_ref )
{
if( h->param.i_threads > 1 )
if( h->param.i_threads > 1 && !h->param.b_sliced_threads )
{
x264_frame_push_unused( h, h->fdec );
h->fdec = x264_frame_pop_unused( h, 1 );
......@@ -1567,8 +1607,6 @@ static inline void x264_slice_init( x264_t *h, int i_nal_type, int i_global_qp )
{
/* Nothing to do ? */
}
x264_macroblock_slice_init( h );
}
static int x264_slice_write( x264_t *h )
......@@ -1587,6 +1625,7 @@ static int x264_slice_write( x264_t *h )
x264_nal_start( h, h->i_nal_type, h->i_nal_ref_idc );
/* Slice header */
x264_macroblock_slice_init( h );
x264_slice_header_write( &h->out.bs, &h->sh, h->i_nal_ref_idc );
if( h->param.b_cabac )
{
......@@ -1626,7 +1665,7 @@ static int x264_slice_write( x264_t *h )
}
}
if( i_mb_x == 0 && !h->mb.b_reencode_mb )
if( i_mb_x == 0 && !h->mb.b_reencode_mb && !h->param.b_sliced_threads )
x264_fdec_filter_row( h, i_mb_y );
/* load cache */
......@@ -1795,7 +1834,8 @@ static int x264_slice_write( x264_t *h )
+ (h->out.i_nal*NALU_OVERHEAD * 8)
- h->stat.frame.i_tex_bits
- h->stat.frame.i_mv_bits;
x264_fdec_filter_row( h, h->sps->i_mb_height );
if( !h->param.b_sliced_threads )
x264_fdec_filter_row( h, h->sps->i_mb_height );
}
return 0;
......@@ -1803,11 +1843,11 @@ static int x264_slice_write( x264_t *h )
static void x264_thread_sync_context( x264_t *dst, x264_t *src )
{
x264_frame_t **f;
if( dst == src )
return;
// reference counting
x264_frame_t **f;
for( f = src->frames.reference; *f; f++ )
(*f)->i_reference_count++;
for( f = dst->frames.reference; *f; f++ )
......@@ -1831,6 +1871,7 @@ static void x264_thread_sync_stat( x264_t *dst, x264_t *src )
static void *x264_slices_write( x264_t *h )
{
int i_slice_num = 0;
int last_thread_mb = h->sh.i_last_mb;
if( h->param.i_sync_lookahead )
x264_lower_thread_priority( 10 );
......@@ -1849,20 +1890,19 @@ static void *x264_slices_write( x264_t *h )
/* init stats */
memset( &h->stat.frame, 0, sizeof(h->stat.frame) );
h->mb.b_reencode_mb = 0;
while( h->sh.i_first_mb < h->mb.i_mb_count )
while( h->sh.i_first_mb <= last_thread_mb )
{
h->sh.i_last_mb = h->mb.i_mb_count - 1;
h->sh.i_last_mb = last_thread_mb;
if( h->param.i_slice_max_mbs )
h->sh.i_last_mb = h->sh.i_first_mb + h->param.i_slice_max_mbs - 1;
else if( h->param.i_slice_count )
else if( h->param.i_slice_count && !h->param.b_sliced_threads )
{
x264_emms();
i_slice_num++;
double height = h->sps->i_mb_height >> h->param.b_interlaced;
int height = h->sps->i_mb_height >> h->param.b_interlaced;
int width = h->sps->i_mb_width << h->param.b_interlaced;
h->sh.i_last_mb = (int)(height * i_slice_num / h->param.i_slice_count + 0.5) * width - 1;
i_slice_num++;
h->sh.i_last_mb = (height * i_slice_num + h->param.i_slice_count/2) / h->param.i_slice_count * width - 1;
}
h->sh.i_last_mb = X264_MIN( h->sh.i_last_mb, h->mb.i_mb_count - 1 );
h->sh.i_last_mb = X264_MIN( h->sh.i_last_mb, last_thread_mb );
if( x264_stack_align( x264_slice_write, h ) )
return (void *)-1;
h->sh.i_first_mb = h->sh.i_last_mb + 1;
......@@ -1879,6 +1919,65 @@ static void *x264_slices_write( x264_t *h )
return (void *)0;
}
/* Encode the current frame with slice-based threading.
 *
 * The macroblock rows of the frame are split between the h->param.i_threads
 * thread contexts, x264_slices_write() is run on each context in its own
 * thread, and after all workers have finished the frame is filtered and the
 * NAL units and integer frame statistics of threads 1..N-1 are merged into h.
 *
 * Returns 0 on success.  Returns -1 if a worker thread could not be created
 * or if the NAL array could not be grown; otherwise returns the first
 * non-zero value returned by a worker.
 *
 * Every thread that was successfully started is always joined before
 * returning, including on error, so no worker is left running on the shared
 * frame data after this function has reported failure. */
static int x264_threaded_slices_write( x264_t *h )
{
    int i, j;
    int i_started = 0;
    intptr_t i_ret = 0;
    void *ret = NULL;

    /* set first/last mb and sync contexts */
    for( i = 0; i < h->param.i_threads; i++ )
    {
        x264_t *t = h->thread[i];
        if( i )
        {
            t->param = h->param;
            /* copy the per-frame state laid out between i_frame and rc */
            memcpy( &t->i_frame, &h->i_frame, offsetof(x264_t, rc) - offsetof(x264_t, i_frame) );
        }
        /* split rows (row pairs when interlaced) between threads, with rounding */
        int height = h->sps->i_mb_height >> h->param.b_interlaced;
        t->i_threadslice_start = ((height *  i    + h->param.i_slice_count/2) / h->param.i_threads) << h->param.b_interlaced;
        t->i_threadslice_end   = ((height * (i+1) + h->param.i_slice_count/2) / h->param.i_threads) << h->param.b_interlaced;
        t->sh.i_first_mb = t->i_threadslice_start * h->sps->i_mb_width;
        t->sh.i_last_mb  = t->i_threadslice_end * h->sps->i_mb_width - 1;
    }

    /* weight the reference planes for the whole frame height before dispatching */
    x264_analyse_weight_frame( h, h->sps->i_mb_height*16 + 16 );

    x264_threads_distribute_ratecontrol( h );

    /* dispatch */
    for( i = 0; i < h->param.i_threads; i++ )
    {
        if( x264_pthread_create( &h->thread[i]->thread_handle, NULL, (void*)x264_slices_write, (void*)h->thread[i] ) )
        {
            i_ret = -1;
            break;
        }
        i_started++;
    }

    /* Join every started worker, even after a failure, and remember the first error. */
    for( i = 0; i < i_started; i++ )
    {
        ret = NULL;
        x264_pthread_join( h->thread[i]->thread_handle, &ret );
        if( (intptr_t)ret && !i_ret )
            i_ret = (intptr_t)ret;
    }
    if( i_ret )
        return (int)i_ret;

    /* deblocking and hpel filtering */
    for( i = 0; i <= h->sps->i_mb_height; i++ )
        x264_fdec_filter_row( h, i );

    /* Merge the output of the other threads into h.
     * NOTE(review): thread 0 is skipped; presumably h->thread[0] is h itself - confirm. */
    for( i = 1; i < h->param.i_threads; i++ )
    {
        x264_t *t = h->thread[i];
        for( j = 0; j < t->out.i_nal; j++ )
        {
            h->out.nal[h->out.i_nal] = t->out.nal[j];
            h->out.i_nal++;
            /* a failed reallocation would make the next store overflow h->out.nal */
            if( x264_nal_check_buffer( h ) )
                return -1;
        }
        /* All entries in stat.frame are ints except for ssd/ssim,
         * which are only calculated in the main thread. */
        for( j = 0; j < (offsetof(x264_t,stat.frame.i_ssd) - offsetof(x264_t,stat.frame.i_mv_bits)) / sizeof(int); j++ )
            ((int*)&h->stat.frame)[j] += ((int*)&t->stat.frame)[j];
    }

    x264_threads_merge_ratecontrol( h );

    return 0;
}
/****************************************************************************
* x264_encoder_encode:
* XXX: i_poc : is the poc of the current given picture
......@@ -1898,12 +1997,9 @@ int x264_encoder_encode( x264_t *h,
x264_picture_t *pic_out )
{
x264_t *thread_current, *thread_prev, *thread_oldest;
int i_nal_type;
int i_nal_ref_idc;
int i_nal_type, i_nal_ref_idc, i_global_qp, i;
int i_global_qp;
if( h->param.i_threads > 1)
if( h->param.i_threads > 1 && !h->param.b_sliced_threads )
{
thread_prev = h->thread[ h->i_thread_phase ];
h->i_thread_phase = (h->i_thread_phase + 1) % h->param.i_threads;
......@@ -1964,7 +2060,7 @@ int x264_encoder_encode( x264_t *h,
/* 2: Place the frame into the queue for its slice type decision */
x264_lookahead_put_frame( h, fenc );
if( h->frames.i_input <= h->frames.i_delay + 1 - h->param.i_threads )
if( h->frames.i_input <= h->frames.i_delay + (h->param.b_sliced_threads ? 0 : 1 - h->param.i_threads) )
{
/* Nothing yet to encode, waiting for filling of buffers */
pic_out->i_type = X264_TYPE_AUTO;
......@@ -2061,8 +2157,19 @@ int x264_encoder_encode( x264_t *h,
/* ---------------------- Write the bitstream -------------------------- */
/* Init bitstream context */
h->out.i_nal = 0;
bs_init( &h->out.bs, h->out.p_bitstream, h->out.i_bitstream );
if( h->param.b_sliced_threads )
{
for( i = 0; i < h->param.i_threads; i++ )
{
bs_init( &h->thread[i]->out.bs, h->thread[i]->out.p_bitstream, h->thread[i]->out.i_bitstream );
h->thread[i]->out.i_nal = 0;
}
}
else
{
bs_init( &h->out.bs, h->out.p_bitstream, h->out.i_bitstream );
h->out.i_nal = 0;
}
if( h->param.b_aud )
{
......@@ -2145,12 +2252,19 @@ int x264_encoder_encode( x264_t *h,
h->i_frame_num++;