Commit 6f221210 authored by Fiona Glaser

Bring back slice-based threading support

Enabled with --sliced-threads
Unlike normal threading, adds no encoding latency.
Less efficient than normal threading, both performance and compression-wise.
Useful for low-latency encoding environments where performance is still important, such as HD videoconferencing.
Add --tune zerolatency, which eliminates all x264 encoder-side latency (no delayed frames at all).
Some tweaks to VBV ratecontrol and lookahead (in addition to those required by sliced threading).
Commit sponsored by a media streaming company that wishes to remain anonymous.
parent a2380187
...@@ -281,6 +281,8 @@ int x264_param_parse( x264_param_t *p, const char *name, const char *value ) ...@@ -281,6 +281,8 @@ int x264_param_parse( x264_param_t *p, const char *name, const char *value )
else else
p->i_threads = atoi(value); p->i_threads = atoi(value);
} }
OPT("sliced-threads")
p->b_sliced_threads = atobool(value);
OPT("sync-lookahead") OPT("sync-lookahead")
{ {
if( !strcmp(value, "auto") ) if( !strcmp(value, "auto") )
...@@ -888,6 +890,7 @@ char *x264_param2string( x264_param_t *p, int b_res ) ...@@ -888,6 +890,7 @@ char *x264_param2string( x264_param_t *p, int b_res )
s += sprintf( s, " deadzone=%d,%d", p->analyse.i_luma_deadzone[0], p->analyse.i_luma_deadzone[1] ); s += sprintf( s, " deadzone=%d,%d", p->analyse.i_luma_deadzone[0], p->analyse.i_luma_deadzone[1] );
s += sprintf( s, " chroma_qp_offset=%d", p->analyse.i_chroma_qp_offset ); s += sprintf( s, " chroma_qp_offset=%d", p->analyse.i_chroma_qp_offset );
s += sprintf( s, " threads=%d", p->i_threads ); s += sprintf( s, " threads=%d", p->i_threads );
s += sprintf( s, " sliced_threads=%d", p->b_sliced_threads );
if( p->i_slice_count ) if( p->i_slice_count )
s += sprintf( s, " slices=%d", p->i_slice_count ); s += sprintf( s, " slices=%d", p->i_slice_count );
if( p->i_slice_max_size ) if( p->i_slice_max_size )
......
...@@ -341,6 +341,8 @@ struct x264_t ...@@ -341,6 +341,8 @@ struct x264_t
x264_pthread_t thread_handle; x264_pthread_t thread_handle;
int b_thread_active; int b_thread_active;
int i_thread_phase; /* which thread to use for the next frame */ int i_thread_phase; /* which thread to use for the next frame */
int i_threadslice_start; /* first row in this thread slice */
int i_threadslice_end; /* row after the end of this thread slice */
/* bitstream output */ /* bitstream output */
struct struct
......
...@@ -768,42 +768,6 @@ int x264_macroblock_cache_init( x264_t *h ) ...@@ -768,42 +768,6 @@ int x264_macroblock_cache_init( x264_t *h )
memset( h->mb.cache.ref[0], -2, X264_SCAN8_SIZE * sizeof( int8_t ) ); memset( h->mb.cache.ref[0], -2, X264_SCAN8_SIZE * sizeof( int8_t ) );
memset( h->mb.cache.ref[1], -2, X264_SCAN8_SIZE * sizeof( int8_t ) ); memset( h->mb.cache.ref[1], -2, X264_SCAN8_SIZE * sizeof( int8_t ) );
/* fdec: fenc:
* yyyyyyy
* yYYYY YYYY
* yYYYY YYYY
* yYYYY YYYY
* yYYYY YYYY
* uuu vvv UUVV
* uUU vVV UUVV
* uUU vVV
*/
h->mb.pic.p_fenc[0] = h->mb.pic.fenc_buf;
h->mb.pic.p_fenc[1] = h->mb.pic.fenc_buf + 16*FENC_STRIDE;
h->mb.pic.p_fenc[2] = h->mb.pic.fenc_buf + 16*FENC_STRIDE + 8;
h->mb.pic.p_fdec[0] = h->mb.pic.fdec_buf + 2*FDEC_STRIDE;
h->mb.pic.p_fdec[1] = h->mb.pic.fdec_buf + 19*FDEC_STRIDE;
h->mb.pic.p_fdec[2] = h->mb.pic.fdec_buf + 19*FDEC_STRIDE + 16;
h->mb.i_neighbour4[6] =
h->mb.i_neighbour4[9] =
h->mb.i_neighbour4[12] =
h->mb.i_neighbour4[14] = MB_LEFT|MB_TOP|MB_TOPLEFT|MB_TOPRIGHT;
h->mb.i_neighbour4[3] =
h->mb.i_neighbour4[7] =
h->mb.i_neighbour4[11] =
h->mb.i_neighbour4[13] =
h->mb.i_neighbour4[15] =
h->mb.i_neighbour8[3] = MB_LEFT|MB_TOP|MB_TOPLEFT;
int buf_hpel = (h->param.i_width+48) * sizeof(int16_t);
int buf_ssim = h->param.analyse.b_ssim * 8 * (h->param.i_width/4+3) * sizeof(int);
int me_range = X264_MIN(h->param.analyse.i_me_range, h->param.analyse.i_mv_range);
int buf_tesa = (h->param.analyse.i_me_method >= X264_ME_ESA) *
((me_range*2+18) * sizeof(int16_t) + (me_range+4) * (me_range+1) * 4 * sizeof(mvsad_t));
int buf_mbtree = h->param.rc.b_mb_tree * ((h->sps->i_mb_width+3)&~3) * sizeof(int);
CHECKED_MALLOC( h->scratch_buffer, X264_MAX4( buf_hpel, buf_ssim, buf_tesa, buf_mbtree ) );
return 0; return 0;
fail: return -1; fail: return -1;
} }
...@@ -832,7 +796,6 @@ void x264_macroblock_cache_end( x264_t *h ) ...@@ -832,7 +796,6 @@ void x264_macroblock_cache_end( x264_t *h )
x264_free( h->mb.skipbp ); x264_free( h->mb.skipbp );
x264_free( h->mb.cbp ); x264_free( h->mb.cbp );
x264_free( h->mb.qp ); x264_free( h->mb.qp );
x264_free( h->scratch_buffer );
} }
void x264_macroblock_slice_init( x264_t *h ) void x264_macroblock_slice_init( x264_t *h )
{ {
...@@ -871,6 +834,34 @@ void x264_macroblock_slice_init( x264_t *h ) ...@@ -871,6 +834,34 @@ void x264_macroblock_slice_init( x264_t *h )
memset( h->mb.cache.skip, 0, X264_SCAN8_SIZE * sizeof( int8_t ) ); memset( h->mb.cache.skip, 0, X264_SCAN8_SIZE * sizeof( int8_t ) );
setup_inverse_delta_pocs( h ); setup_inverse_delta_pocs( h );
/* fdec: fenc:
* yyyyyyy
* yYYYY YYYY
* yYYYY YYYY
* yYYYY YYYY
* yYYYY YYYY
* uuu vvv UUVV
* uUU vVV UUVV
* uUU vVV
*/
h->mb.pic.p_fenc[0] = h->mb.pic.fenc_buf;
h->mb.pic.p_fenc[1] = h->mb.pic.fenc_buf + 16*FENC_STRIDE;
h->mb.pic.p_fenc[2] = h->mb.pic.fenc_buf + 16*FENC_STRIDE + 8;
h->mb.pic.p_fdec[0] = h->mb.pic.fdec_buf + 2*FDEC_STRIDE;
h->mb.pic.p_fdec[1] = h->mb.pic.fdec_buf + 19*FDEC_STRIDE;
h->mb.pic.p_fdec[2] = h->mb.pic.fdec_buf + 19*FDEC_STRIDE + 16;
h->mb.i_neighbour4[6] =
h->mb.i_neighbour4[9] =
h->mb.i_neighbour4[12] =
h->mb.i_neighbour4[14] = MB_LEFT|MB_TOP|MB_TOPLEFT|MB_TOPRIGHT;
h->mb.i_neighbour4[3] =
h->mb.i_neighbour4[7] =
h->mb.i_neighbour4[11] =
h->mb.i_neighbour4[13] =
h->mb.i_neighbour4[15] =
h->mb.i_neighbour8[3] = MB_LEFT|MB_TOP|MB_TOPLEFT;
} }
void x264_prefetch_fenc( x264_t *h, x264_frame_t *fenc, int i_mb_x, int i_mb_y ) void x264_prefetch_fenc( x264_t *h, x264_frame_t *fenc, int i_mb_x, int i_mb_y )
...@@ -899,8 +890,10 @@ static void ALWAYS_INLINE x264_macroblock_load_pic_pointers( x264_t *h, int i_mb ...@@ -899,8 +890,10 @@ static void ALWAYS_INLINE x264_macroblock_load_pic_pointers( x264_t *h, int i_mb
const int i_pix_offset = h->mb.b_interlaced const int i_pix_offset = h->mb.b_interlaced
? w * (i_mb_x + (i_mb_y&~1) * i_stride) + (i_mb_y&1) * i_stride ? w * (i_mb_x + (i_mb_y&~1) * i_stride) + (i_mb_y&1) * i_stride
: w * (i_mb_x + i_mb_y * i_stride); : w * (i_mb_x + i_mb_y * i_stride);
const uint8_t *plane_fdec = &h->fdec->plane[i][i_pix_offset];
const uint8_t *intra_fdec = h->param.b_sliced_threads ? plane_fdec-i_stride2 :
&h->mb.intra_border_backup[i_mb_y & h->sh.b_mbaff][i][i_mb_x*16>>!!i];
int ref_pix_offset[2] = { i_pix_offset, i_pix_offset }; int ref_pix_offset[2] = { i_pix_offset, i_pix_offset };
const uint8_t *intra_fdec = &h->mb.intra_border_backup[i_mb_y & h->sh.b_mbaff][i][i_mb_x*16>>!!i];
x264_frame_t **fref[2] = { h->fref0, h->fref1 }; x264_frame_t **fref[2] = { h->fref0, h->fref1 };
int j, k; int j, k;
if( h->mb.b_interlaced ) if( h->mb.b_interlaced )
...@@ -909,13 +902,13 @@ static void ALWAYS_INLINE x264_macroblock_load_pic_pointers( x264_t *h, int i_mb ...@@ -909,13 +902,13 @@ static void ALWAYS_INLINE x264_macroblock_load_pic_pointers( x264_t *h, int i_mb
h->mb.pic.p_fenc_plane[i] = &h->fenc->plane[i][i_pix_offset]; h->mb.pic.p_fenc_plane[i] = &h->fenc->plane[i][i_pix_offset];
h->mc.copy[i?PIXEL_8x8:PIXEL_16x16]( h->mb.pic.p_fenc[i], FENC_STRIDE, h->mc.copy[i?PIXEL_8x8:PIXEL_16x16]( h->mb.pic.p_fenc[i], FENC_STRIDE,
h->mb.pic.p_fenc_plane[i], i_stride2, w ); h->mb.pic.p_fenc_plane[i], i_stride2, w );
memcpy( &h->mb.pic.p_fdec[i][-1-FDEC_STRIDE], intra_fdec-1, w*3/2+1 ); if( i_mb_y > 0 )
memcpy( &h->mb.pic.p_fdec[i][-1-FDEC_STRIDE], intra_fdec-1, w*3/2+1 );
else
memset( &h->mb.pic.p_fdec[i][-1-FDEC_STRIDE], 0, w*3/2+1 );
if( h->mb.b_interlaced || h->mb.b_reencode_mb ) if( h->mb.b_interlaced || h->mb.b_reencode_mb )
{
const uint8_t *plane_fdec = &h->fdec->plane[i][i_pix_offset];
for( j = 0; j < w; j++ ) for( j = 0; j < w; j++ )
h->mb.pic.p_fdec[i][-1+j*FDEC_STRIDE] = plane_fdec[-1+j*i_stride2]; h->mb.pic.p_fdec[i][-1+j*FDEC_STRIDE] = plane_fdec[-1+j*i_stride2];
}
for( j = 0; j < h->mb.pic.i_fref[0]; j++ ) for( j = 0; j < h->mb.pic.i_fref[0]; j++ )
{ {
h->mb.pic.p_fref[0][j][i==0 ? 0:i+3] = &fref[0][j >> h->mb.b_interlaced]->plane[i][ref_pix_offset[j&1]]; h->mb.pic.p_fref[0][j][i==0 ? 0:i+3] = &fref[0][j >> h->mb.b_interlaced]->plane[i][ref_pix_offset[j&1]];
......
...@@ -285,6 +285,38 @@ void x264_analyse_free_costs( x264_t *h ) ...@@ -285,6 +285,38 @@ void x264_analyse_free_costs( x264_t *h )
} }
} }
/* Extend the weighted copies of the list-0 references down to a given row.
 *
 * Finds the first list-0 reference that has a luma weight function, works out
 * how many additional padded luma rows must be weighted so that coverage
 * reaches 16 + end + vertical padding (clamped to the padded plane height),
 * advances h->fenc->i_lines_weighted by that amount, and then runs
 * x264_weight_scale_plane over those rows for that reference and every later
 * weighted reference.  The source rows are always taken from the first
 * weighted reference's filtered[0] plane.
 *
 * h:   encoder context; h->fenc->i_lines_weighted is updated.
 * end: row bound used in the height computation (presumably a luma pixel row
 *      relative to the top of the unpadded frame -- confirm against callers).
 *
 * Does nothing if no list-0 reference has a weight function. */
void x264_analyse_weight_frame( x264_t *h, int end )
{
    int first = 0;
    int idx;

    /* Skip ahead to the first reference that carries a luma weight function. */
    while( first < h->i_ref0 && !h->sh.weight[first][0].weightfn )
        first++;
    if( first >= h->i_ref0 )
        return;

    x264_frame_t *ref = h->fref0[first];
    int stride = ref->i_stride[0];
    int padv = PADV << h->param.b_interlaced;
    int plane_width = ref->i_width[0] + 2*PADH;
    /* Top-left corner of the padded source plane. */
    uint8_t *src = ref->filtered[0] - stride*padv - PADH;

    /* Rows already weighted, and the row count still needed to reach the target. */
    int done = h->fenc->i_lines_weighted;
    int rows = X264_MIN( 16 + end + padv, ref->i_lines[0] + padv*2 ) - done;
    int skip = done * stride;

    /* The bookkeeping is updated even when there is nothing to weight. */
    h->fenc->i_lines_weighted += rows;
    if( !rows )
        return;

    for( idx = first; idx < h->i_ref0; idx++ )
    {
        if( !h->sh.weight[idx][0].weightfn )
            continue;

        /* Top-left corner of the padded destination plane for this reference. */
        uint8_t *dst = h->fenc->weighted[idx] - h->fenc->i_stride[0]*padv - PADH;
        x264_weight_scale_plane( h, dst + skip, stride,
                                 src + skip, stride,
                                 plane_width, rows, &h->sh.weight[idx][0] );
    }
}
/* initialize an array of lambda*nbits for all possible mvs */ /* initialize an array of lambda*nbits for all possible mvs */
static void x264_mb_analyse_load_costs( x264_t *h, x264_mb_analysis_t *a ) static void x264_mb_analyse_load_costs( x264_t *h, x264_mb_analysis_t *a )
{ {
...@@ -361,13 +393,13 @@ static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int i_qp ) ...@@ -361,13 +393,13 @@ static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int i_qp )
h->mb.mv_max_spel[0] = CLIP_FMV( h->mb.mv_max[0] ); h->mb.mv_max_spel[0] = CLIP_FMV( h->mb.mv_max[0] );
h->mb.mv_min_fpel[0] = (h->mb.mv_min_spel[0]>>2) + i_fpel_border; h->mb.mv_min_fpel[0] = (h->mb.mv_min_spel[0]>>2) + i_fpel_border;
h->mb.mv_max_fpel[0] = (h->mb.mv_max_spel[0]>>2) - i_fpel_border; h->mb.mv_max_fpel[0] = (h->mb.mv_max_spel[0]>>2) - i_fpel_border;
if( h->mb.i_mb_x == 0) if( h->mb.i_mb_x == 0 )
{ {
int mb_y = h->mb.i_mb_y >> h->sh.b_mbaff; int mb_y = h->mb.i_mb_y >> h->sh.b_mbaff;
int mb_height = h->sps->i_mb_height >> h->sh.b_mbaff; int mb_height = h->sps->i_mb_height >> h->sh.b_mbaff;
int thread_mvy_range = i_fmv_range; int thread_mvy_range = i_fmv_range;
if( h->param.i_threads > 1 ) if( h->param.i_threads > 1 && !h->param.b_sliced_threads )
{ {
int pix_y = (h->mb.i_mb_y | h->mb.b_interlaced) * 16; int pix_y = (h->mb.i_mb_y | h->mb.b_interlaced) * 16;
int thresh = pix_y + h->param.analyse.i_mv_range_thread; int thresh = pix_y + h->param.analyse.i_mv_range_thread;
...@@ -387,33 +419,7 @@ static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int i_qp ) ...@@ -387,33 +419,7 @@ static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int i_qp )
if( h->mb.b_interlaced ) if( h->mb.b_interlaced )
thread_mvy_range >>= 1; thread_mvy_range >>= 1;
for( j=0; j<h->i_ref0; j++ ) x264_analyse_weight_frame( h, pix_y + thread_mvy_range );
{
if( h->sh.weight[j][0].weightfn )
{
x264_frame_t *frame = h->fref0[j];
int width = frame->i_width[0] + 2*PADH;
int i_padv = PADV << h->param.b_interlaced;
int offset, height;
uint8_t *src = frame->filtered[0] - frame->i_stride[0]*i_padv - PADH;
int k;
height = X264_MIN( 16 + thread_mvy_range + pix_y + i_padv, h->fref0[j]->i_lines[0] + i_padv*2 ) - h->fenc->i_lines_weighted;
offset = h->fenc->i_lines_weighted*frame->i_stride[0];
h->fenc->i_lines_weighted += height;
if( height )
{
for( k = j; k < h->i_ref0; k++ )
if( h->sh.weight[k][0].weightfn )
{
uint8_t *dst = h->fenc->weighted[k] - h->fenc->i_stride[0]*i_padv - PADH;
x264_weight_scale_plane( h, dst + offset, frame->i_stride[0],
src + offset, frame->i_stride[0],
width, height, &h->sh.weight[k][0] );
}
}
break;
}
}
} }
h->mb.mv_min[1] = 4*( -16*mb_y - 24 ); h->mb.mv_min[1] = 4*( -16*mb_y - 24 );
...@@ -1247,7 +1253,7 @@ static void x264_mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a ) ...@@ -1247,7 +1253,7 @@ static void x264_mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a )
{ {
h->mb.i_type = P_SKIP; h->mb.i_type = P_SKIP;
x264_analyse_update_cache( h, a ); x264_analyse_update_cache( h, a );
assert( h->mb.cache.pskip_mv[1] <= h->mb.mv_max_spel[1] || h->param.i_threads == 1 ); assert( h->mb.cache.pskip_mv[1] <= h->mb.mv_max_spel[1] || h->param.i_threads == 1 || h->param.b_sliced_threads );
return; return;
} }
...@@ -1263,7 +1269,7 @@ static void x264_mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a ) ...@@ -1263,7 +1269,7 @@ static void x264_mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a )
} }
x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref ); x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref );
assert( a->l0.me16x16.mv[1] <= h->mb.mv_max_spel[1] || h->param.i_threads == 1 ); assert( a->l0.me16x16.mv[1] <= h->mb.mv_max_spel[1] || h->param.i_threads == 1 || h->param.b_sliced_threads );
h->mb.i_type = P_L0; h->mb.i_type = P_L0;
if( a->i_mbrd ) if( a->i_mbrd )
...@@ -2419,7 +2425,7 @@ void x264_macroblock_analyse( x264_t *h ) ...@@ -2419,7 +2425,7 @@ void x264_macroblock_analyse( x264_t *h )
analysis.b_try_pskip = 0; analysis.b_try_pskip = 0;
if( h->param.analyse.b_fast_pskip ) if( h->param.analyse.b_fast_pskip )
{ {
if( h->param.i_threads > 1 && h->mb.cache.pskip_mv[1] > h->mb.mv_max_spel[1] ) if( h->param.i_threads > 1 && !h->param.b_sliced_threads && h->mb.cache.pskip_mv[1] > h->mb.mv_max_spel[1] )
// FIXME don't need to check this if the reference frame is done // FIXME don't need to check this if the reference frame is done
{} {}
else if( h->param.analyse.i_subpel_refine >= 3 ) else if( h->param.analyse.i_subpel_refine >= 3 )
...@@ -2437,7 +2443,7 @@ void x264_macroblock_analyse( x264_t *h ) ...@@ -2437,7 +2443,7 @@ void x264_macroblock_analyse( x264_t *h )
{ {
h->mb.i_type = P_SKIP; h->mb.i_type = P_SKIP;
h->mb.i_partition = D_16x16; h->mb.i_partition = D_16x16;
assert( h->mb.cache.pskip_mv[1] <= h->mb.mv_max_spel[1] || h->param.i_threads == 1 ); assert( h->mb.cache.pskip_mv[1] <= h->mb.mv_max_spel[1] || h->param.i_threads == 1 || h->param.b_sliced_threads );
} }
else else
{ {
...@@ -3145,7 +3151,7 @@ static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a ) ...@@ -3145,7 +3151,7 @@ static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a )
} }
#ifndef NDEBUG #ifndef NDEBUG
if( h->param.i_threads > 1 && !IS_INTRA(h->mb.i_type) ) if( h->param.i_threads > 1 && !h->param.b_sliced_threads && !IS_INTRA(h->mb.i_type) )
{ {
int l; int l;
for( l=0; l <= (h->sh.i_type == SLICE_TYPE_B); l++ ) for( l=0; l <= (h->sh.i_type == SLICE_TYPE_B); l++ )
......
...@@ -26,6 +26,7 @@ ...@@ -26,6 +26,7 @@
int x264_analyse_init_costs( x264_t *h, int qp ); int x264_analyse_init_costs( x264_t *h, int qp );
void x264_analyse_free_costs( x264_t *h ); void x264_analyse_free_costs( x264_t *h );
void x264_analyse_weight_frame( x264_t *h, int end );
void x264_macroblock_analyse( x264_t *h ); void x264_macroblock_analyse( x264_t *h );
void x264_slicetype_decide( x264_t *h ); void x264_slicetype_decide( x264_t *h );
......
...@@ -31,6 +31,7 @@ ...@@ -31,6 +31,7 @@
#include "analyse.h" #include "analyse.h"
#include "ratecontrol.h" #include "ratecontrol.h"
#include "macroblock.h" #include "macroblock.h"
#include "me.h"
#if VISUALIZE #if VISUALIZE
#include "common/visualize.h" #include "common/visualize.h"
...@@ -409,7 +410,16 @@ static int x264_validate_parameters( x264_t *h ) ...@@ -409,7 +410,16 @@ static int x264_validate_parameters( x264_t *h )
x264_log( h, X264_LOG_WARNING, "not compiled with pthread support!\n"); x264_log( h, X264_LOG_WARNING, "not compiled with pthread support!\n");
h->param.i_threads = 1; h->param.i_threads = 1;
#endif #endif
/* Avoid absurdly small thread slices as they can reduce performance
* and VBV compliance. Capped at an arbitrary 4 rows per thread. */
if( h->param.b_sliced_threads )
{
int max_threads = (h->param.i_height+15)/16 / 4;
h->param.i_threads = X264_MIN( h->param.i_threads, max_threads );
}
} }
else
h->param.b_sliced_threads = 0;
if( h->param.b_interlaced ) if( h->param.b_interlaced )
{ {
...@@ -497,21 +507,26 @@ static int x264_validate_parameters( x264_t *h ) ...@@ -497,21 +507,26 @@ static int x264_validate_parameters( x264_t *h )
h->param.rc.i_qp_min = x264_clip3( h->param.rc.i_qp_min, 0, h->param.rc.i_qp_max ); h->param.rc.i_qp_min = x264_clip3( h->param.rc.i_qp_min, 0, h->param.rc.i_qp_max );
int max_slices = (h->param.i_height+((16<<h->param.b_interlaced)-1))/(16<<h->param.b_interlaced); int max_slices = (h->param.i_height+((16<<h->param.b_interlaced)-1))/(16<<h->param.b_interlaced);
h->param.i_slice_count = x264_clip3( h->param.i_slice_count, 0, max_slices ); if( h->param.b_sliced_threads )
h->param.i_slice_max_size = X264_MAX( h->param.i_slice_max_size, 0 ); h->param.i_slice_count = x264_clip3( h->param.i_threads, 0, max_slices );
h->param.i_slice_max_mbs = X264_MAX( h->param.i_slice_max_mbs, 0 ); else
if( h->param.b_interlaced && h->param.i_slice_max_size )
{
x264_log( h, X264_LOG_WARNING, "interlaced + slice-max-size is not implemented\n" );
h->param.i_slice_max_size = 0;
}
if( h->param.b_interlaced && h->param.i_slice_max_mbs )
{ {
x264_log( h, X264_LOG_WARNING, "interlaced + slice-max-mbs is not implemented\n" ); h->param.i_slice_count = x264_clip3( h->param.i_slice_count, 0, max_slices );
h->param.i_slice_max_mbs = 0; h->param.i_slice_max_size = X264_MAX( h->param.i_slice_max_size, 0 );
h->param.i_slice_max_mbs = X264_MAX( h->param.i_slice_max_mbs, 0 );
if( h->param.b_interlaced && h->param.i_slice_max_size )
{
x264_log( h, X264_LOG_WARNING, "interlaced + slice-max-size is not implemented\n" );
h->param.i_slice_max_size = 0;
}
if( h->param.b_interlaced && h->param.i_slice_max_mbs )
{
x264_log( h, X264_LOG_WARNING, "interlaced + slice-max-mbs is not implemented\n" );
h->param.i_slice_max_mbs = 0;
}
if( h->param.i_slice_max_mbs || h->param.i_slice_max_size )
h->param.i_slice_count = 0;
} }
if( h->param.i_slice_max_mbs || h->param.i_slice_max_size )
h->param.i_slice_count = 0;
h->param.i_frame_reference = x264_clip3( h->param.i_frame_reference, 1, 16 ); h->param.i_frame_reference = x264_clip3( h->param.i_frame_reference, 1, 16 );
if( h->param.i_keyint_max <= 0 ) if( h->param.i_keyint_max <= 0 )
...@@ -553,7 +568,7 @@ static int x264_validate_parameters( x264_t *h ) ...@@ -553,7 +568,7 @@ static int x264_validate_parameters( x264_t *h )
#ifdef HAVE_PTHREAD #ifdef HAVE_PTHREAD
if( h->param.i_sync_lookahead ) if( h->param.i_sync_lookahead )
h->param.i_sync_lookahead = x264_clip3( h->param.i_sync_lookahead, h->param.i_threads + h->param.i_bframe, X264_LOOKAHEAD_MAX ); h->param.i_sync_lookahead = x264_clip3( h->param.i_sync_lookahead, h->param.i_threads + h->param.i_bframe, X264_LOOKAHEAD_MAX );
if( h->param.rc.b_stat_read || h->param.i_threads == 1 ) if( h->param.rc.b_stat_read || h->param.i_threads == 1 || h->param.b_sliced_threads )
h->param.i_sync_lookahead = 0; h->param.i_sync_lookahead = 0;
#else #else
h->param.i_sync_lookahead = 0; h->param.i_sync_lookahead = 0;
...@@ -676,7 +691,7 @@ static int x264_validate_parameters( x264_t *h ) ...@@ -676,7 +691,7 @@ static int x264_validate_parameters( x264_t *h )
if( !h->param.analyse.i_weighted_pred && h->param.rc.b_mb_tree && h->param.analyse.b_psy && !h->param.b_interlaced ) if( !h->param.analyse.i_weighted_pred && h->param.rc.b_mb_tree && h->param.analyse.b_psy && !h->param.b_interlaced )
h->param.analyse.i_weighted_pred = X264_WEIGHTP_FAKE; h->param.analyse.i_weighted_pred = X264_WEIGHTP_FAKE;
if( h->param.i_threads > 1 ) if( h->param.i_threads > 1 && !h->param.b_sliced_threads )
{ {
int r = h->param.analyse.i_mv_range_thread; int r = h->param.analyse.i_mv_range_thread;
int r2; int r2;
...@@ -851,7 +866,8 @@ x264_t *x264_encoder_open( x264_param_t *param ) ...@@ -851,7 +866,8 @@ x264_t *x264_encoder_open( x264_param_t *param )
if( h->param.rc.b_mb_tree || h->param.rc.i_vbv_buffer_size ) if( h->param.rc.b_mb_tree || h->param.rc.i_vbv_buffer_size )
h->frames.i_delay = X264_MAX( h->frames.i_delay, h->param.rc.i_lookahead ); h->frames.i_delay = X264_MAX( h->frames.i_delay, h->param.rc.i_lookahead );
i_slicetype_length = h->frames.i_delay; i_slicetype_length = h->frames.i_delay;
h->frames.i_delay += h->param.i_threads - 1; if( !h->param.b_sliced_threads )
h->frames.i_delay += h->param.i_threads - 1;
h->frames.i_delay = X264_MIN( h->frames.i_delay, X264_LOOKAHEAD_MAX ); h->frames.i_delay = X264_MIN( h->frames.i_delay, X264_LOOKAHEAD_MAX );
h->frames.i_delay += h->param.i_sync_lookahead; h->frames.i_delay += h->param.i_sync_lookahead;
...@@ -944,23 +960,45 @@ x264_t *x264_encoder_open( x264_param_t *param ) ...@@ -944,23 +960,45 @@ x264_t *x264_encoder_open( x264_param_t *param )
for( i = 1; i < h->param.i_threads + !!h->param.i_sync_lookahead; i++ ) for( i = 1; i < h->param.i_threads + !!h->param.i_sync_lookahead; i++ )
CHECKED_MALLOC( h->thread[i], sizeof(x264_t) ); CHECKED_MALLOC( h->thread[i], sizeof(x264_t) );
if( x264_lookahead_init( h, i_slicetype_length ) )
goto fail;
for( i = 0; i < h->param.i_threads; i++ ) for( i = 0; i < h->param.i_threads; i++ )
{ {
int init_nal_count = h->param.i_slice_count + 3;
int allocate_threadlocal_data = !h->param.b_sliced_threads || !i;
if( i > 0 ) if( i > 0 )
*h->thread[i] = *h; *h->thread[i] = *h;
h->thread[i]->fdec = x264_frame_pop_unused( h, 1 );
if( !h->thread[i]->fdec ) if( allocate_threadlocal_data )
goto fail; {
h->thread[i]->fdec = x264_frame_pop_unused( h, 1 );
if( !h->thread[i]->fdec )
goto fail;
}
else
h->thread[i]->fdec = h->thread[0]->fdec;
CHECKED_MALLOC( h->thread[i]->out.p_bitstream, h->out.i_bitstream ); CHECKED_MALLOC( h->thread[i]->out.p_bitstream, h->out.i_bitstream );
/* Start each thread with room for 8 NAL units; it'll realloc later if needed. */ /* Start each thread with room for init_nal_count NAL units; it'll realloc later if needed. */
CHECKED_MALLOC( h->thread[i]->out.nal, 8*sizeof(x264_nal_t) ); CHECKED_MALLOC( h->thread[i]->out.nal, init_nal_count*sizeof(x264_nal_t) );
h->thread[i]->out.i_nals_allocated = 8; h->thread[i]->out.i_nals_allocated = init_nal_count;
if( x264_macroblock_cache_init( h->thread[i] ) < 0 )
if( allocate_threadlocal_data && x264_macroblock_cache_init( h->thread[i] ) < 0 )
goto fail; goto fail;
} }
if( x264_lookahead_init( h, i_slicetype_length ) ) /* Allocate scratch buffer */
goto fail; for( i = 0; i < h->param.i_threads + !!h->param.i_sync_lookahead; i++ )
{
int buf_hpel = (h->param.i_width+48) * sizeof(int16_t);
int buf_ssim = h->param.analyse.b_ssim * 8 * (h->param.i_width/4+3) * sizeof(int);
int me_range = X264_MIN(h->param.analyse.i_me_range, h->param.analyse.i_mv_range);
int buf_tesa = (h->param.analyse.i_me_method >= X264_ME_ESA) *
((me_range*2+18) * sizeof(int16_t) + (me_range+4) * (me_range+1) * 4 * sizeof(mvsad_t));
int buf_mbtree = h->param.rc.b_mb_tree * ((h->sps->i_mb_width+3)&~3) * sizeof(int);
CHECKED_MALLOC( h->thread[i]->scratch_buffer, X264_MAX4( buf_hpel, buf_ssim, buf_tesa, buf_mbtree ) );
}
if( x264_ratecontrol_new( h ) < 0 ) if( x264_ratecontrol_new( h ) < 0 )
goto fail; goto fail;
...@@ -1009,8 +1047,8 @@ int x264_encoder_reconfig( x264_t *h, x264_param_t *param ) ...@@ -1009,8 +1047,8 @@ int x264_encoder_reconfig( x264_t *h, x264_param_t *param )
COPY( b_deblocking_filter ); COPY( b_deblocking_filter );
COPY( i_deblocking_filter_alphac0 ); COPY( i_deblocking_filter_alphac0 );
COPY( i_deblocking_filter_beta ); COPY( i_deblocking_filter_beta );
COPY( analyse.intra );
COPY( analyse.inter ); COPY( analyse.inter );
COPY( analyse.intra );
COPY( analyse.i_direct_mv_pred ); COPY( analyse.i_direct_mv_pred );
/* Scratch buffer prevents me_range from being increased for esa/tesa */ /* Scratch buffer prevents me_range from being increased for esa/tesa */
if( h->param.analyse.i_me_method < X264_ME_ESA || param->analyse.i_me_range < h->param.analyse.i_me_range ) if( h->param.analyse.i_me_method < X264_ME_ESA || param->analyse.i_me_range < h->param.analyse.i_me_range )
...@@ -1056,13 +1094,9 @@ static void x264_nal_start( x264_t *h, int i_type, int i_ref_idc ) ...@@ -1056,13 +1094,9 @@ static void x264_nal_start( x264_t *h, int i_type, int i_ref_idc )
nal->i_payload= 0; nal->i_payload= 0;
nal->p_payload= &h->out.p_bitstream[bs_pos( &h->out.bs ) / 8]; nal->p_payload= &h->out.p_bitstream[bs_pos( &h->out.bs ) / 8];
} }
static int x264_nal_end( x264_t *h )