Commit 6940dcae authored by Steven Walters, committed by Fiona Glaser

Threaded lookahead

Move lookahead into a separate thread, set to higher priority than the other threads, for optimal performance.
Reduces the amount that lookahead bottlenecks encoding, greatly increasing performance with lookahead-intensive settings (e.g. b-adapt 2) on many-core CPUs.
Buffer size can be controlled with --sync-lookahead, which defaults to auto (threads+bframes buffer size).
Note that this buffer is separate from the rc-lookahead value.
Note also that this does not split lookahead itself into multiple threads yet; this may be added in the future.
Additionally, split frames into "fdec" and "fenc" frame types and keep the two separate.
This split greatly reduces memory usage, which helps compensate for the larger lookahead size.
Extremely special thanks to Michael Kazmier and Alex Giladi of Avail Media, the original authors of this patch.
parent 7df6f5d6
......@@ -10,7 +10,7 @@ SRCS = common/mc.c common/predict.c common/pixel.c common/macroblock.c \
common/quant.c common/vlc.c \
encoder/analyse.c encoder/me.c encoder/ratecontrol.c \
encoder/set.c encoder/macroblock.c encoder/cabac.c \
encoder/cavlc.c encoder/encoder.c
encoder/cavlc.c encoder/encoder.c encoder/lookahead.c
SRCCLI = x264.c matroska.c muxers.c
......
......@@ -45,6 +45,7 @@ void x264_param_default( x264_param_t *param )
param->cpu = x264_cpu_detect();
param->i_threads = X264_THREADS_AUTO;
param->b_deterministic = 1;
param->i_sync_lookahead = X264_SYNC_LOOKAHEAD_AUTO;
/* Video properties */
param->i_csp = X264_CSP_I420;
......@@ -276,6 +277,13 @@ int x264_param_parse( x264_param_t *p, const char *name, const char *value )
else
p->i_threads = atoi(value);
}
OPT("sync-lookahead")
{
if( !strcmp(value, "auto") )
p->i_sync_lookahead = X264_SYNC_LOOKAHEAD_AUTO;
else
p->i_sync_lookahead = atoi(value);
}
OPT2("deterministic", "n-deterministic")
p->b_deterministic = atobool(value);
OPT2("level", "level-idc")
......
......@@ -239,6 +239,19 @@ typedef struct
} x264_slice_header_t;
/* State of the lookahead stage, reached through h->lookahead.
 * NOTE(review): most readers/writers of these fields live in
 * encoder/lookahead.c, which is not visible here; comments marked
 * "presumably" are assumptions to confirm against that file. */
typedef struct x264_lookahead_t
{
uint8_t b_thread_active; /* presumably nonzero while a dedicated lookahead thread is running -- TODO confirm */
uint8_t b_exit_thread; /* set to 1 by x264_encoder_encode() when it is called without an input picture, to tell the lookahead thread to exit */
uint8_t b_analyse_keyframe; /* NOTE(review): usage not visible in this chunk */
int i_last_idr; /* presumably the frame number of the last IDR as seen by the lookahead -- TODO confirm */
int i_slicetype_length; /* presumably the value passed as i_slicetype_length to x264_lookahead_init() -- TODO confirm */
x264_frame_t *last_nonb; /* presumably the last non-B frame, kept for the adaptive B decision -- TODO confirm */
/* Three synchronized frame lists. ifbuf.cv_fill is broadcast when
 * b_exit_thread is raised; presumably ifbuf is the input buffer, next holds
 * frames awaiting a type decision, and ofbuf is the output buffer -- TODO confirm. */
x264_synch_frame_list_t ifbuf;
x264_synch_frame_list_t next;
x264_synch_frame_list_t ofbuf;
} x264_lookahead_t;
/* From ffmpeg
*/
#define X264_SCAN8_SIZE (6*8)
......@@ -283,7 +296,7 @@ struct x264_t
/* encoder parameters */
x264_param_t param;
x264_t *thread[X264_THREAD_MAX];
x264_t *thread[X264_THREAD_MAX+1];
x264_pthread_t thread_handle;
int b_thread_active;
int i_thread_phase; /* which thread to use for the next frame */
......@@ -349,13 +362,9 @@ struct x264_t
struct
{
/* Frames to be encoded (whose types have been decided) */
x264_frame_t *current[X264_LOOKAHEAD_MAX+3];
/* Temporary buffer (frames types not yet decided) */
x264_frame_t *next[X264_LOOKAHEAD_MAX+3];
/* Unused frames */
x264_frame_t *unused[X264_LOOKAHEAD_MAX + X264_THREAD_MAX*2 + 16+4];
/* For adaptive B decision */
x264_frame_t *last_nonb;
x264_frame_t **current;
/* Unused frames: 0 = fenc, 1 = fdec */
x264_frame_t **unused[2];
/* frames used for reference + sentinels */
x264_frame_t *reference[16+2];
......@@ -667,6 +676,7 @@ struct x264_t
#if VISUALIZE
struct visualize_t *visualize;
#endif
x264_lookahead_t *lookahead;
};
// included at the end because it needs x264_t
......
......@@ -33,12 +33,12 @@ void x264_cpu_mask_misalign_sse( void );
* gcc 4.2 introduced __attribute__((force_align_arg_pointer)) to fix this
* problem, but I don't want to require such a new version.
* This applies only to x86_32, since other architectures that need alignment
* also have ABIs that ensure aligned stack. */
* either have ABIs that ensure aligned stack, or don't support it at all. */
#if defined(ARCH_X86) && defined(HAVE_MMX)
int x264_stack_align( void (*func)(x264_t*), x264_t *arg );
#define x264_stack_align(func,arg) x264_stack_align((void (*)(x264_t*))func,arg)
int x264_stack_align( void (*func)(), ... );
#define x264_stack_align(func,...) x264_stack_align((void (*)())func, __VA_ARGS__)
#else
#define x264_stack_align(func,arg) func(arg)
#define x264_stack_align(func,...) func(__VA_ARGS__)
#endif
typedef struct {
......
......@@ -26,7 +26,7 @@
#define ALIGN(x,a) (((x)+((a)-1))&~((a)-1))
x264_frame_t *x264_frame_new( x264_t *h )
x264_frame_t *x264_frame_new( x264_t *h, int b_fdec )
{
x264_frame_t *frame;
int i, j;
......@@ -60,9 +60,23 @@ x264_frame_t *x264_frame_new( x264_t *h )
CHECKED_MALLOC( frame->buffer[i], chroma_plane_size );
frame->plane[i] = frame->buffer[i] + (frame->i_stride[i] * i_padv + PADH)/2;
}
for( i = 0; i < h->param.i_bframe + 2; i++ )
for( j = 0; j < h->param.i_bframe + 2; j++ )
CHECKED_MALLOC( frame->i_row_satds[i][j], i_lines/16 * sizeof(int) );
frame->i_poc = -1;
frame->i_type = X264_TYPE_AUTO;
frame->i_qpplus1 = 0;
frame->i_pts = -1;
frame->i_frame = -1;
frame->i_frame_num = -1;
frame->i_lines_completed = -1;
frame->b_fdec = b_fdec;
/* all 4 luma planes allocated together, since the cacheline split code
* requires them to be in-phase wrt cacheline alignment. */
if( h->param.analyse.i_subpel_refine )
if( h->param.analyse.i_subpel_refine && b_fdec )
{
CHECKED_MALLOC( frame->buffer[0], 4*luma_plane_size);
for( i = 0; i < 4; i++ )
......@@ -75,77 +89,68 @@ x264_frame_t *x264_frame_new( x264_t *h )
frame->plane[0] = frame->buffer[0] + frame->i_stride[0] * i_padv + PADH;
}
if( h->frames.b_have_lowres )
if( b_fdec ) /* fdec frame */
{
frame->i_width_lowres = frame->i_width[0]/2;
frame->i_stride_lowres = ALIGN( frame->i_width_lowres + 2*PADH, align );
frame->i_lines_lowres = frame->i_lines[0]/2;
luma_plane_size = frame->i_stride_lowres * ( frame->i_lines[0]/2 + 2*i_padv );
CHECKED_MALLOC( frame->buffer_lowres[0], 4 * luma_plane_size );
for( i = 0; i < 4; i++ )
frame->lowres[i] = frame->buffer_lowres[0] + (frame->i_stride_lowres * i_padv + PADH) + i * luma_plane_size;
for( j = 0; j <= !!h->param.i_bframe; j++ )
for( i = 0; i <= h->param.i_bframe; i++ )
{
CHECKED_MALLOCZERO( frame->lowres_mvs[j][i], 2*h->mb.i_mb_count*sizeof(int16_t) );
CHECKED_MALLOC( frame->lowres_mv_costs[j][i], h->mb.i_mb_count*sizeof(int) );
}
CHECKED_MALLOC( frame->i_propagate_cost, (i_mb_count+3) * sizeof(uint16_t) );
for( j = 0; j <= h->param.i_bframe+1; j++ )
for( i = 0; i <= h->param.i_bframe+1; i++ )
{
CHECKED_MALLOC( frame->lowres_costs[j][i], (i_mb_count+3) * sizeof(uint16_t) );
CHECKED_MALLOC( frame->lowres_inter_types[j][i], (i_mb_count+3)/4 * sizeof(uint8_t) );
}
frame->i_intra_cost = frame->lowres_costs[0][0];
memset( frame->i_intra_cost, -1, (i_mb_count+3) * sizeof(uint16_t) );
CHECKED_MALLOC( frame->mb_type, i_mb_count * sizeof(int8_t));
CHECKED_MALLOC( frame->mv[0], 2*16 * i_mb_count * sizeof(int16_t) );
CHECKED_MALLOC( frame->ref[0], 4 * i_mb_count * sizeof(int8_t) );
if( h->param.i_bframe )
{
CHECKED_MALLOC( frame->mv[1], 2*16 * i_mb_count * sizeof(int16_t) );
CHECKED_MALLOC( frame->ref[1], 4 * i_mb_count * sizeof(int8_t) );
}
else
{
frame->mv[1] = NULL;
frame->ref[1] = NULL;
}
CHECKED_MALLOC( frame->i_row_bits, i_lines/16 * sizeof(int) );
CHECKED_MALLOC( frame->i_row_qp, i_lines/16 * sizeof(int) );
if( h->param.analyse.i_me_method >= X264_ME_ESA )
{
CHECKED_MALLOC( frame->buffer[3],
frame->i_stride[0] * (frame->i_lines[0] + 2*i_padv) * sizeof(uint16_t) << h->frames.b_have_sub8x8_esa );
frame->integral = (uint16_t*)frame->buffer[3] + frame->i_stride[0] * i_padv + PADH;
}
}
if( h->param.analyse.i_me_method >= X264_ME_ESA )
else /* fenc frame */
{
CHECKED_MALLOC( frame->buffer[3],
frame->i_stride[0] * (frame->i_lines[0] + 2*i_padv) * sizeof(uint16_t) << h->frames.b_have_sub8x8_esa );
frame->integral = (uint16_t*)frame->buffer[3] + frame->i_stride[0] * i_padv + PADH;
}
frame->i_poc = -1;
frame->i_type = X264_TYPE_AUTO;
frame->i_qpplus1 = 0;
frame->i_pts = -1;
frame->i_frame = -1;
frame->i_frame_num = -1;
frame->i_lines_completed = -1;
if( h->frames.b_have_lowres )
{
frame->i_width_lowres = frame->i_width[0]/2;
frame->i_stride_lowres = ALIGN( frame->i_width_lowres + 2*PADH, align );
frame->i_lines_lowres = frame->i_lines[0]/2;
CHECKED_MALLOC( frame->mb_type, i_mb_count * sizeof(int8_t));
CHECKED_MALLOC( frame->mv[0], 2*16 * i_mb_count * sizeof(int16_t) );
CHECKED_MALLOC( frame->ref[0], 4 * i_mb_count * sizeof(int8_t) );
if( h->param.i_bframe )
{
CHECKED_MALLOC( frame->mv[1], 2*16 * i_mb_count * sizeof(int16_t) );
CHECKED_MALLOC( frame->ref[1], 4 * i_mb_count * sizeof(int8_t) );
}
else
{
frame->mv[1] = NULL;
frame->ref[1] = NULL;
}
luma_plane_size = frame->i_stride_lowres * ( frame->i_lines[0]/2 + 2*i_padv );
CHECKED_MALLOC( frame->i_row_bits, i_lines/16 * sizeof(int) );
CHECKED_MALLOC( frame->i_row_qp, i_lines/16 * sizeof(int) );
for( i = 0; i < h->param.i_bframe + 2; i++ )
for( j = 0; j < h->param.i_bframe + 2; j++ )
CHECKED_MALLOC( frame->i_row_satds[i][j], i_lines/16 * sizeof(int) );
CHECKED_MALLOC( frame->buffer_lowres[0], 4 * luma_plane_size );
for( i = 0; i < 4; i++ )
frame->lowres[i] = frame->buffer_lowres[0] + (frame->i_stride_lowres * i_padv + PADH) + i * luma_plane_size;
if( h->param.rc.i_aq_mode )
{
CHECKED_MALLOC( frame->f_qp_offset, h->mb.i_mb_count * sizeof(float) );
CHECKED_MALLOC( frame->f_qp_offset_aq, h->mb.i_mb_count * sizeof(float) );
if( h->frames.b_have_lowres )
/* shouldn't really be initialized, just silences a valgrind false-positive in x264_mbtree_propagate_cost_sse2 */
CHECKED_MALLOCZERO( frame->i_inv_qscale_factor, (h->mb.i_mb_count+3) * sizeof(uint16_t) );
for( j = 0; j <= !!h->param.i_bframe; j++ )
for( i = 0; i <= h->param.i_bframe; i++ )
{
CHECKED_MALLOCZERO( frame->lowres_mvs[j][i], 2*h->mb.i_mb_count*sizeof(int16_t) );
CHECKED_MALLOC( frame->lowres_mv_costs[j][i], h->mb.i_mb_count*sizeof(int) );
}
CHECKED_MALLOC( frame->i_propagate_cost, (i_mb_count+3) * sizeof(uint16_t) );
for( j = 0; j <= h->param.i_bframe+1; j++ )
for( i = 0; i <= h->param.i_bframe+1; i++ )
{
CHECKED_MALLOC( frame->lowres_costs[j][i], (i_mb_count+3) * sizeof(uint16_t) );
CHECKED_MALLOC( frame->lowres_inter_types[j][i], (i_mb_count+3)/4 * sizeof(uint8_t) );
}
frame->i_intra_cost = frame->lowres_costs[0][0];
memset( frame->i_intra_cost, -1, (i_mb_count+3) * sizeof(uint16_t) );
}
if( h->param.rc.i_aq_mode )
{
CHECKED_MALLOC( frame->f_qp_offset, h->mb.i_mb_count * sizeof(float) );
CHECKED_MALLOC( frame->f_qp_offset_aq, h->mb.i_mb_count * sizeof(float) );
if( h->frames.b_have_lowres )
/* shouldn't really be initialized, just silences a valgrind false-positive in x264_mbtree_propagate_cost_sse2 */
CHECKED_MALLOCZERO( frame->i_inv_qscale_factor, (h->mb.i_mb_count+3) * sizeof(uint16_t) );
}
}
if( x264_pthread_mutex_init( &frame->mutex, NULL ) )
......@@ -971,19 +976,19 @@ void x264_frame_push_unused( x264_t *h, x264_frame_t *frame )
assert( frame->i_reference_count > 0 );
frame->i_reference_count--;
if( frame->i_reference_count == 0 )
x264_frame_push( h->frames.unused, frame );
assert( h->frames.unused[ sizeof(h->frames.unused) / sizeof(*h->frames.unused) - 1 ] == NULL );
x264_frame_push( h->frames.unused[frame->b_fdec], frame );
}
x264_frame_t *x264_frame_pop_unused( x264_t *h )
x264_frame_t *x264_frame_pop_unused( x264_t *h, int b_fdec )
{
x264_frame_t *frame;
if( h->frames.unused[0] )
frame = x264_frame_pop( h->frames.unused );
if( h->frames.unused[b_fdec][0] )
frame = x264_frame_pop( h->frames.unused[b_fdec] );
else
frame = x264_frame_new( h );
frame = x264_frame_new( h, b_fdec );
if( !frame )
return NULL;
frame->b_last_minigop_bframe = 0;
frame->i_reference_count = 1;
frame->b_intra_calculated = 0;
return frame;
......@@ -1008,3 +1013,54 @@ void x264_frame_sort( x264_frame_t **list, int b_dts )
}
} while( !b_ok );
}
/* Delete every frame in a NULL-terminated frame list, then free the list
 * array itself.
 *
 * list: heap-allocated array of frame pointers whose last entry is NULL.
 *       A NULL list is accepted and treated as a no-op, so teardown paths
 *       can call this on a list that was never (successfully) allocated. */
void x264_frame_delete_list( x264_frame_t **list )
{
    int i = 0;
    if( !list )
        return;
    while( list[i] )
        x264_frame_delete( list[i++] );
    x264_free( list );
}
/* Initialise a synchronized frame list able to hold up to max_size frames.
 *
 * slist:    list structure to set up.
 * max_size: capacity in frames; must be >= 0.
 *
 * Returns 0 on success, -1 if max_size is negative, if the array allocation
 * fails, or if the mutex or either condition variable cannot be created. */
int x264_synch_frame_list_init( x264_synch_frame_list_t *slist, int max_size )
{
    if( max_size < 0 )
        goto fail;

    slist->i_size = 0;
    slist->i_max_size = max_size;
    /* One zeroed slot more than the capacity, so the array still ends in a
     * NULL entry when it is full (x264_frame_delete_list walks it until NULL). */
    CHECKED_MALLOCZERO( slist->list, (max_size+1) * sizeof(x264_frame_t*) );

    /* Create the primitives in order; stop at the first one that fails. */
    if( x264_pthread_mutex_init( &slist->mutex, NULL ) )
        goto fail;
    if( x264_pthread_cond_init( &slist->cv_fill, NULL ) )
        goto fail;
    if( x264_pthread_cond_init( &slist->cv_empty, NULL ) )
        goto fail;

    return 0;

fail:
    return -1;
}
/* Tear down a synchronized frame list: destroy its mutex and both condition
 * variables, delete every frame still queued and free the pointer array.
 * The x264_synch_frame_list_t structure itself is not freed. */
void x264_synch_frame_list_delete( x264_synch_frame_list_t *slist )
{
    /* Synchronisation primitives first ... */
    x264_pthread_cond_destroy( &slist->cv_empty );
    x264_pthread_cond_destroy( &slist->cv_fill );
    x264_pthread_mutex_destroy( &slist->mutex );

    /* ... then the frames and the array that holds them. */
    x264_frame_delete_list( slist->list );
}
/* Append a frame to the end of a synchronized frame list.
 *
 * Blocks while the list already holds i_max_size frames, waiting on cv_empty
 * (presumably signalled by whichever consumer removes frames -- not visible
 * here). After the frame is stored, every waiter on cv_fill is woken. */
void x264_synch_frame_list_push( x264_synch_frame_list_t *slist, x264_frame_t *frame )
{
x264_pthread_mutex_lock( &slist->mutex );
/* The fullness test is repeated after every wakeup before the slot is used. */
while( slist->i_size == slist->i_max_size )
x264_pthread_cond_wait( &slist->cv_empty, &slist->mutex );
slist->list[ slist->i_size++ ] = frame;
x264_pthread_mutex_unlock( &slist->mutex );
/* Broadcast only after the mutex has been released. */
x264_pthread_cond_broadcast( &slist->cv_fill );
}
/* Return the number of frames currently stored in a synchronized frame list.
 * The count is read while holding the list mutex; it is a snapshot and may be
 * out of date as soon as the mutex is released. */
int x264_synch_frame_list_get_size( x264_synch_frame_list_t *slist )
{
    int n_frames;

    x264_pthread_mutex_lock( &slist->mutex );
    n_frames = slist->i_size;
    x264_pthread_mutex_unlock( &slist->mutex );

    return n_frames;
}
......@@ -40,6 +40,9 @@ typedef struct
int i_frame; /* Presentation frame number */
int i_frame_num; /* Coded frame number */
int b_kept_as_ref;
uint8_t b_fdec;
uint8_t b_last_minigop_bframe; /* this frame is the last b in a sequence of bframes */
uint8_t i_bframes; /* number of bframes following this nonb in coded order */
float f_qp_avg_rc; /* QPs as decided by ratecontrol */
float f_qp_avg_aq; /* QPs as decided by AQ in addition to ratecontrol */
......@@ -104,6 +107,17 @@ typedef struct
} x264_frame_t;
/* Synchronized frame list: a bounded array of frame pointers guarded by a
 * mutex, with two condition variables for producer/consumer hand-off.
 * Set up by x264_synch_frame_list_init(), torn down by
 * x264_synch_frame_list_delete(). */
typedef struct
{
x264_frame_t **list; /* array of i_max_size+1 entries, zeroed at init */
int i_max_size; /* capacity in frames; x264_synch_frame_list_push() blocks while i_size equals it */
int i_size; /* number of frames currently stored */
x264_pthread_mutex_t mutex; /* held while i_size and list are read or modified by the list functions */
x264_pthread_cond_t cv_fill; /* event signaling that the list became fuller */
x264_pthread_cond_t cv_empty; /* event signaling that the list became emptier */
} x264_synch_frame_list_t;
typedef void (*x264_deblock_inter_t)( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
typedef void (*x264_deblock_intra_t)( uint8_t *pix, int stride, int alpha, int beta );
typedef struct
......@@ -118,7 +132,7 @@ typedef struct
x264_deblock_intra_t deblock_h_chroma_intra;
} x264_deblock_function_t;
x264_frame_t *x264_frame_new( x264_t *h );
x264_frame_t *x264_frame_new( x264_t *h, int b_fdec );
void x264_frame_delete( x264_frame_t *frame );
int x264_frame_copy_picture( x264_t *h, x264_frame_t *dst, x264_picture_t *src );
......@@ -144,8 +158,15 @@ x264_frame_t *x264_frame_pop( x264_frame_t **list );
void x264_frame_unshift( x264_frame_t **list, x264_frame_t *frame );
x264_frame_t *x264_frame_shift( x264_frame_t **list );
void x264_frame_push_unused( x264_t *h, x264_frame_t *frame );
x264_frame_t *x264_frame_pop_unused( x264_t *h );
x264_frame_t *x264_frame_pop_unused( x264_t *h, int b_fdec );
void x264_frame_sort( x264_frame_t **list, int b_dts );
void x264_frame_delete_list( x264_frame_t **list );
int x264_synch_frame_list_init( x264_synch_frame_list_t *slist, int nelem );
void x264_synch_frame_list_delete( x264_synch_frame_list_t *slist );
void x264_synch_frame_list_push( x264_synch_frame_list_t *slist, x264_frame_t *frame );
int x264_synch_frame_list_get_size( x264_synch_frame_list_t *slist );
#define x264_frame_sort_dts(list) x264_frame_sort(list, 1)
#define x264_frame_sort_pts(list) x264_frame_sort(list, 0)
......
......@@ -703,7 +703,7 @@ int x264_macroblock_cache_init( x264_t *h )
for( j=0; j<3; j++ )
{
/* shouldn't really be initialized, just silences a valgrind false-positive in predict_8x8_filter_mmx */
CHECKED_MALLOCZERO( h->mb.intra_border_backup[i][j], h->fdec->i_stride[j] );
CHECKED_MALLOCZERO( h->mb.intra_border_backup[i][j], (h->sps->i_mb_width*16+32)>>!!j );
h->mb.intra_border_backup[i][j] += 8;
}
......
......@@ -137,6 +137,9 @@ static inline int x264_pthread_create( x264_pthread_t *t, void *a, void *(*f)(vo
#define x264_pthread_cond_destroy pthread_cond_destroy
#define x264_pthread_cond_broadcast pthread_cond_broadcast
#define x264_pthread_cond_wait pthread_cond_wait
#define x264_pthread_attr_t pthread_attr_t
#define x264_pthread_attr_init pthread_attr_init
#define x264_pthread_attr_destroy pthread_attr_destroy
#else
#define x264_pthread_mutex_t int
#define x264_pthread_mutex_init(m,f) 0
......@@ -148,6 +151,9 @@ static inline int x264_pthread_create( x264_pthread_t *t, void *a, void *(*f)(vo
#define x264_pthread_cond_destroy(c)
#define x264_pthread_cond_broadcast(c)
#define x264_pthread_cond_wait(c,m)
#define x264_pthread_attr_t int
#define x264_pthread_attr_init(a) 0
#define x264_pthread_attr_destroy(a)
#endif
#define WORD_SIZE sizeof(void*)
......@@ -216,4 +222,11 @@ static int ALWAYS_INLINE x264_clz( uint32_t x )
}
#endif
/* x264_lower_thread_priority(p): raise the nice value of the caller by p,
 * i.e. lower its scheduling priority. Only implemented when building for
 * Linux with pthreads; everywhere else it expands to nothing.
 * NOTE(review): this relies on Linux applying nice() to the calling thread
 * rather than to the whole process -- confirm against nice(2).
 * The return value of nice() is stored only so that it is not silently
 * discarded; no error handling is done.
 * The body is wrapped in do { } while( 0 ) so the macro behaves as a single
 * statement and is safe in an unbraced if/else. */
#if defined(SYS_LINUX) && defined(HAVE_PTHREAD)
#include <unistd.h>
#define x264_lower_thread_priority(p) do { UNUSED int nice_ret = nice(p); } while( 0 )
#else
#define x264_lower_thread_priority(p)
#endif
#endif /* X264_OSDEP_H */
......@@ -96,11 +96,13 @@ cglobal x264_cpu_cpuid, 0,6
cglobal x264_stack_align
push ebp
mov ebp, esp
sub esp, 4
sub esp, 8
and esp, ~15
mov ecx, [ebp+8]
mov edx, [ebp+12]
mov [esp], edx
mov edx, [ebp+16]
mov [esp+4], edx
call ecx
leave
ret
......
......@@ -28,4 +28,12 @@ int x264_macroblock_analyse( x264_t *h );
void x264_slicetype_decide( x264_t *h );
int x264_lowres_context_alloc( x264_t *h );
void x264_slicetype_analyse( x264_t *h, int keyframe );
int x264_lookahead_init( x264_t *h, int i_slicetype_length );
int x264_lookahead_is_empty( x264_t *h );
void x264_lookahead_put_frame( x264_t *h, x264_frame_t *frame );
void x264_lookahead_get_frames( x264_t *h );
void x264_lookahead_delete( x264_t *h );
#endif
......@@ -364,7 +364,7 @@ static int x264_validate_parameters( x264_t *h )
return -1;
}
if( h->param.i_threads == 0 )
if( h->param.i_threads == X264_THREADS_AUTO )
h->param.i_threads = x264_cpu_num_processors() * 3/2;
h->param.i_threads = x264_clip3( h->param.i_threads, 1, X264_THREAD_MAX );
if( h->param.i_threads > 1 )
......@@ -519,6 +519,14 @@ static int x264_validate_parameters( x264_t *h )
h->param.rc.b_mb_tree = 0;
if( h->param.rc.f_qcompress == 1 )
h->param.rc.b_mb_tree = 0;
#ifdef HAVE_PTHREAD
if( h->param.i_sync_lookahead )
h->param.i_sync_lookahead = x264_clip3( h->param.i_sync_lookahead, h->param.i_threads + h->param.i_bframe, X264_LOOKAHEAD_MAX );
if( h->param.rc.b_stat_read || h->param.i_threads == 1 )
h->param.i_sync_lookahead = 0;
#else
h->param.i_sync_lookahead = 0;
#endif
h->mb.b_direct_auto_write = h->param.analyse.i_direct_mv_pred == X264_DIRECT_PRED_AUTO
&& h->param.i_bframe
......@@ -740,7 +748,7 @@ x264_t *x264_encoder_open( x264_param_t *param )
{
x264_t *h;
char buf[1000], *p;
int i;
int i, i_slicetype_length;
CHECKED_MALLOCZERO( h, sizeof(x264_t) );
......@@ -793,8 +801,10 @@ x264_t *x264_encoder_open( x264_param_t *param )
h->frames.i_delay = h->param.i_bframe;
if( h->param.rc.b_mb_tree || h->param.rc.i_vbv_buffer_size )
h->frames.i_delay = X264_MAX( h->frames.i_delay, h->param.rc.i_lookahead );
i_slicetype_length = h->frames.i_delay;
h->frames.i_delay += h->param.i_threads - 1;
h->frames.i_delay = X264_MIN( h->frames.i_delay, X264_LOOKAHEAD_MAX );
h->frames.i_delay += h->param.i_sync_lookahead;
h->frames.i_max_ref0 = h->param.i_frame_reference;
h->frames.i_max_ref1 = h->sps->vui.i_num_reorder_frames;
......@@ -810,7 +820,12 @@ x264_t *x264_encoder_open( x264_param_t *param )
h->frames.i_last_idr = - h->param.i_keyint_max;
h->frames.i_input = 0;
h->frames.last_nonb = NULL;
CHECKED_MALLOCZERO( h->frames.unused[0], (h->frames.i_delay + 3) * sizeof(x264_frame_t *) );
/* Allocate room for max refs plus a few extra just in case. */
CHECKED_MALLOCZERO( h->frames.unused[1], (h->param.i_threads + 20) * sizeof(x264_frame_t *) );
CHECKED_MALLOCZERO( h->frames.current, (h->param.i_sync_lookahead + h->param.i_bframe
+ h->param.i_threads + 3) * sizeof(x264_frame_t *) );
h->i_ref0 = 0;
h->i_ref1 = 0;
......@@ -861,14 +876,14 @@ x264_t *x264_encoder_open( x264_param_t *param )
h->thread[0] = h;
h->i_thread_num = 0;
for( i = 1; i < h->param.i_threads; i++ )
for( i = 1; i < h->param.i_threads + !!h->param.i_sync_lookahead; i++ )
CHECKED_MALLOC( h->thread[i], sizeof(x264_t) );
for( i = 0; i < h->param.i_threads; i++ )
{
if( i > 0 )
*h->thread[i] = *h;
h->thread[i]->fdec = x264_frame_pop_unused( h );
h->thread[i]->fdec = x264_frame_pop_unused( h, 1 );
if( !h->thread[i]->fdec )
goto fail;
CHECKED_MALLOC( h->thread[i]->out.p_bitstream, h->out.i_bitstream );
......@@ -879,6 +894,9 @@ x264_t *x264_encoder_open( x264_param_t *param )
goto fail;
}
if( x264_lookahead_init( h, i_slicetype_length ) )
goto fail;
if( x264_ratecontrol_new( h ) < 0 )
goto fail;
......@@ -1181,8 +1199,6 @@ static void x264_fdec_filter_row( x264_t *h, int mb_y )
static inline int x264_reference_update( x264_t *h )
{
int i;
if( h->fdec->i_frame >= 0 )
h->i_frame++;
......@@ -1191,29 +1207,18 @@ static inline int x264_reference_update( x264_t *h )
if( h->param.i_threads > 1 )
{
x264_frame_push_unused( h, h->fdec );
h->fdec = x264_frame_pop_unused( h );
h->fdec = x264_frame_pop_unused( h, 1 );
if( !h->fdec )
return -1;
}
return 0;
}
/* move lowres copy of the image to the ref frame */
for( i = 0; i < 4; i++)
{
XCHG( uint8_t*, h->fdec->lowres[i], h->fenc->lowres[i] );
XCHG( uint8_t*, h->fdec->buffer_lowres[i], h->fenc->buffer_lowres[i] );
}
/* adaptive B decision needs a pointer, since it can't use the ref lists */
if( h->sh.i_type != SLICE_TYPE_B )
h->frames.last_nonb = h->fdec;
/* move frame in the buffer */
x264_frame_push( h->frames.reference, h->fdec );
if( h->frames.reference[h->frames.i_max_dpb] )
x264_frame_push_unused( h, x264_frame_shift( h->frames.reference ) );
h->fdec = x264_frame_pop_unused( h );
h->fdec = x264_frame_pop_unused( h, 1 );
if( !h->fdec )
return -1;
return 0;
......@@ -1516,6 +1521,8 @@ static void *x264_slices_write( x264_t *h )
{
int i_frame_size = 0;
int i_slice_num = 0;
if( h->param.i_sync_lookahead )
x264_lower_thread_priority( 10 );
#ifdef HAVE_MMX
/* Misalign mask has to be set separately for each thread. */
......@@ -1619,7 +1626,7 @@ int x264_encoder_encode( x264_t *h,
if( pic_in != NULL )
{
/* 1: Copy the picture to a frame and move it to a buffer */
x264_frame_t *fenc = x264_frame_pop_unused( h );
x264_frame_t *fenc = x264_frame_pop_unused( h, 0 );
if( !fenc )
return -1;
......@@ -1632,8 +1639,6 @@ int x264_encoder_encode( x264_t *h,
fenc->i_frame = h->frames.i_input++;
x264_frame_push( h->frames.next, fenc );
if( h->frames.b_have_lowres )
x264_frame_init_lowres( h, fenc );
......@@ -1645,55 +1650,33 @@ int x264_encoder_encode( x264_t *h,
else if( h->param.rc.i_aq_mode )
x264_adaptive_quant_frame( h, fenc );
/* 2: Place the frame into the queue for its slice type decision */
x264_lookahead_put_frame( h, fenc );
if( h->frames.i_input <= h->frames.i_delay + 1 - h->param.i_threads )
{
/* Nothing yet to encode */
/* waiting for filling bframe buffer */
/* Nothing yet to encode, waiting for filling of buffers */
pic_out->i_type = X264_TYPE_AUTO;
return 0;
}
}
if( h->frames.current[0] == NULL )
else
{
int bframes = 0;
/* 2: Select frame types */
if( h->frames.next[0] == NULL )
{
if( x264_encoder_frame_end( thread_oldest, thread_current, pp_nal, pi_nal, pic_out ) < 0 )
return -1;
return 0;
}
/* signal kills for lookahead thread */
h->lookahead->b_exit_thread = 1;
x264_pthread_cond_broadcast( &h->lookahead->ifbuf.cv_fill );
}
x264_stack_align( x264_slicetype_decide, h );
/* 3: The picture is analyzed in the lookahead */
if( !h->frames.current[0] )
x264_lookahead_get_frames( h );
/* 3: move some B-frames and 1 non-B to encode queue */
while( IS_X264_TYPE_B( h->frames.next[bframes]->i_type ) )
bframes++;
x264_frame_push( h->frames.current, x264_frame_shift( &h->frames.next[bframes] ) );
/* FIXME: when max B-frames > 3, BREF may no longer be centered after GOP closing */
if( h->param.b_bframe_pyramid && bframes > 1 )
{
x264_frame_t *mid = x264_frame_shift( &h->frames.next[bframes/2] );
mid->i_type = X264_TYPE_BREF;
x264_frame_push( h->frames.current, mid );
bframes--;
}
while( bframes-- )
x264_frame_push( h->frames.current, x264_frame_shift( h->frames.next ) );
}
if( !h->frames.current[0] && x264_lookahead_is_empty( h ) )
return x264_encoder_frame_end( thread_oldest, thread_current, pp_nal, pi_nal, pic_out );
/* ------------------- Get frame to be encoded ------------------------- */
/* 4: get picture to encode */
h->fenc = x264_frame_shift( h->frames.current );
if( h->fenc == NULL )
{
/* Nothing yet to encode (ex: waiting for I/P with B frames) */
/* waiting for filling bframe buffer */
pic_out->i_type = X264_TYPE_AUTO;
return 0;
}
if( h->fenc->param )
{
x264_encoder_reconfig( h, h->fenc->param );
......@@ -1704,6 +1687,7 @@ int x264_encoder_encode( x264_t *h,
if( h->fenc->i_type == X264_TYPE_IDR )
{
h->frames.i_last_idr = h->fenc->i_frame;
h->i_frame_num = 0;
}
/* ------------------- Setup frame context ----------------------------- */
......@@ -2029,6 +2013,8 @@ void x264_encoder_close ( x264_t *h )
|| h->stat.i_mb_count[SLICE_TYPE_P][I_PCM]
|| h->stat.i_mb_count[SLICE_TYPE_B][I_PCM];
x264_lookahead_delete( h );
for( i=0; i<h->param.i_threads; i++ )
{
// don't strictly have to wait for the other threads, but it's simpler than canceling them
......@@ -2248,21 +2234,9 @@ void x264_encoder_close ( x264_t *h )
h = h->thread[ h->i_thread_phase % h->param.i_threads ];
/* frames */
for( i = 0; h->frames.current[i]; i++ )
{
assert( h->frames.current[i]->i_reference_count == 1 );
x264_frame_delete( h->frames.current[i] );
}
for( i = 0; h->frames.next[i]; i++ )