Commit 7b4f6a1f authored by Loren Merritt

New threading method:

Encode multiple frames in parallel instead of dividing each frame into slices.
Improves speed, and reduces the bitrate penalty of threading.

Side effects:
It is no longer possible to re-encode a frame, so threaded scenecut detection
must run in the pre-me pass, which is faster but less precise. 
It is now useful to use more threads than you have cpus. --threads=auto has
been updated to use cpus*1.5.
Minor changes to ratecontrol.

New options: --pre-scenecut, --mvrange-thread, --non-deterministic


git-svn-id: svn://svn.videolan.org/x264/trunk@607 df754926-b1dd-0310-bc7b-ec298dee348c
parent fa2c1e54
......@@ -158,7 +158,7 @@ x264_hpel_filter_mmxext :
ALIGN 16
.vertical_filter:
prefetchnta [src + stride5 + 32]
prefetcht0 [src + stride5 + 32]
LOAD_ADD mm1, [src ], [src + stride5 ] ; a0
LOAD_ADD mm2, [src + stride ], [src + stride*4 ] ; b0
......
......@@ -45,6 +45,7 @@ void x264_param_default( x264_param_t *param )
/* CPU autodetect */
param->cpu = x264_cpu_detect();
param->i_threads = 1;
param->b_deterministic = 1;
/* Video properties */
param->i_csp = X264_CSP_I420;
......@@ -118,6 +119,7 @@ void x264_param_default( x264_param_t *param )
param->analyse.i_me_range = 16;
param->analyse.i_subpel_refine = 5;
param->analyse.b_chroma_me = 1;
param->analyse.i_mv_range_thread = -1;
param->analyse.i_mv_range = -1; // set from level_idc
param->analyse.i_direct_8x8_inference = -1; // set from level_idc
param->analyse.i_chroma_qp_offset = 0;
......@@ -245,6 +247,8 @@ int x264_param_parse( x264_param_t *p, const char *name, const char *value )
else
p->i_threads = atoi(value);
}
OPT2("deterministic", "n-deterministic")
p->b_deterministic = atobool(value);
OPT2("level", "level-idc")
{
if( atof(value) < 6 )
......@@ -301,6 +305,8 @@ int x264_param_parse( x264_param_t *p, const char *name, const char *value )
}
OPT("scenecut")
p->i_scenecut_threshold = atoi(value);
OPT("pre-scenecut")
p->b_pre_scenecut = atobool(value);
OPT("bframes")
p->i_bframe = atoi(value);
OPT("b-adapt")
......@@ -431,8 +437,10 @@ int x264_param_parse( x264_param_t *p, const char *name, const char *value )
b_error |= parse_enum( value, x264_motion_est_names, &p->analyse.i_me_method );
OPT2("merange", "me-range")
p->analyse.i_me_range = atoi(value);
OPT("mvrange")
OPT2("mvrange", "mv-range")
p->analyse.i_mv_range = atoi(value);
OPT2("mvrange-thread", "mv-range-thread")
p->analyse.i_mv_range_thread = atoi(value);
OPT2("subme", "subq")
p->analyse.i_subpel_refine = atoi(value);
OPT("bime")
......@@ -879,7 +887,7 @@ char *x264_param2string( x264_param_t *p, int b_res )
s += sprintf( s, " cqm=%d", p->i_cqm_preset );
s += sprintf( s, " deadzone=%d,%d", p->analyse.i_luma_deadzone[0], p->analyse.i_luma_deadzone[1] );
s += sprintf( s, " chroma_qp_offset=%d", p->analyse.i_chroma_qp_offset );
s += sprintf( s, " slices=%d", p->i_threads );
s += sprintf( s, " threads=%d", p->i_threads );
s += sprintf( s, " nr=%d", p->analyse.i_noise_reduction );
s += sprintf( s, " decimate=%d", p->analyse.b_dct_decimate );
s += sprintf( s, " mbaff=%d", p->b_interlaced );
......@@ -893,8 +901,9 @@ char *x264_param2string( x264_param_t *p, int b_res )
p->analyse.b_bidir_me );
}
s += sprintf( s, " keyint=%d keyint_min=%d scenecut=%d",
p->i_keyint_max, p->i_keyint_min, p->i_scenecut_threshold );
s += sprintf( s, " keyint=%d keyint_min=%d scenecut=%d%s",
p->i_keyint_max, p->i_keyint_min, p->i_scenecut_threshold,
p->b_pre_scenecut ? "(pre)" : "" );
s += sprintf( s, " rc=%s", p->rc.i_rc_method == X264_RC_ABR ?
( p->rc.b_stat_read ? "2pass" : p->rc.i_vbv_buffer_size ? "cbr" : "abr" )
......
......@@ -52,6 +52,7 @@
#define pthread_create(t,u,f,d) *(t)=CreateThread(NULL,0,f,d,0,NULL)
#define pthread_join(t,s) { WaitForSingleObject(t,INFINITE); \
CloseHandle(t); }
/* Win32 has no usleep(): emulate it with Sleep(), rounding the microsecond
 * count up to whole milliseconds. The argument is parenthesized and there is
 * no trailing ';', so the macro expands to a single expression and is safe in
 * if/else bodies and with compound arguments. */
#define usleep(t) Sleep(((t)+999)/1000)
#define HAVE_PTHREAD 1
#elif defined(SYS_BEOS)
......@@ -61,10 +62,17 @@
resume_thread(*(t)); }
#define pthread_join(t,s) { long tmp; \
wait_for_thread(t,(s)?(long*)(s):&tmp); }
#ifndef usleep
#define usleep(t) snooze(t)
#endif
#define HAVE_PTHREAD 1
#elif defined(HAVE_PTHREAD)
#include <pthread.h>
#else
#define pthread_t int
#define pthread_create(t,u,f,d)
#define pthread_join(t,s)
#endif
/****************************************************************************
......@@ -79,6 +87,10 @@
#define XCHG(type,a,b) { type t = a; a = b; b = t; }
#define FIX8(f) ((int)(f*(1<<8)+.5))
#ifndef offsetof
#define offsetof(T,F) ((unsigned int)((char *)&((T *)0)->F))
#endif
#if defined(__GNUC__) && (__GNUC__ > 3 || __GNUC__ == 3 && __GNUC_MINOR__ > 0)
#define UNUSED __attribute__((unused))
#else
......@@ -96,8 +108,10 @@
}
#define X264_BFRAME_MAX 16
#define X264_THREAD_MAX 16
#define X264_SLICE_MAX 4
#define X264_NAL_MAX (4 + X264_SLICE_MAX)
#define X264_THREAD_HEIGHT 24 // number of pixels (per thread) in progress at any given time. could theoretically be as low as 22
/****************************************************************************
* Includes
......@@ -272,7 +286,10 @@ struct x264_t
/* encoder parameters */
x264_param_t param;
x264_t *thread[X264_SLICE_MAX];
x264_t *thread[X264_THREAD_MAX];
pthread_t thread_handle;
int b_thread_active;
int i_thread_phase; /* which thread to use for the next frame */
/* bitstream output */
struct
......@@ -282,6 +299,7 @@ struct x264_t
int i_bitstream; /* size of p_bitstream */
uint8_t *p_bitstream; /* will hold data for all nal */
bs_t bs;
int i_frame_size;
} out;
/* frame number/poc */
......@@ -328,7 +346,7 @@ struct x264_t
/* Temporary buffer (frames types not yet decided) */
x264_frame_t *next[X264_BFRAME_MAX+3];
/* Unused frames */
x264_frame_t *unused[X264_BFRAME_MAX+3];
x264_frame_t *unused[X264_BFRAME_MAX + X264_THREAD_MAX*2 + 16+4];
/* For adaptive B decision */
x264_frame_t *last_nonb;
......@@ -439,6 +457,7 @@ struct x264_t
int16_t (*mvr[2][32])[2]; /* 16x16 mv for each possible ref */
int8_t *skipbp; /* block pattern for SKIP or DIRECT (sub)mbs. B-frames + cabac only */
int8_t *mb_transform_size; /* transform_size_8x8_flag of each mb */
uint8_t *intra_border_backup[2][3]; /* bottom pixels of the previous mb row, used for intra prediction after the framebuffer has been deblocked */
/* current value */
int i_type;
......@@ -550,6 +569,7 @@ struct x264_t
/* XXX: both omit the cost of MBs coded as P_SKIP */
int i_intra_cost;
int i_inter_cost;
int i_mbs_analysed;
/* Adaptive direct mv pred */
int i_direct_score[2];
} frame;
......
......@@ -109,6 +109,7 @@ x264_frame_t *x264_frame_new( x264_t *h )
frame->i_pts = -1;
frame->i_frame = -1;
frame->i_frame_num = -1;
frame->i_lines_completed = -1;
CHECKED_MALLOC( frame->mb_type, i_mb_count * sizeof(int8_t));
CHECKED_MALLOC( frame->mv[0], 2*16 * i_mb_count * sizeof(int16_t) );
......@@ -172,7 +173,7 @@ void x264_frame_copy_picture( x264_t *h, x264_frame_t *dst, x264_picture_t *src
static void plane_expand_border( uint8_t *pix, int i_stride, int i_width, int i_height, int i_padh, int i_padv )
static void plane_expand_border( uint8_t *pix, int i_stride, int i_width, int i_height, int i_padh, int i_padv, int b_pad_top, int b_pad_bottom )
{
#define PPIXEL(x, y) ( pix + (x) + (y)*i_stride )
int y;
......@@ -184,56 +185,68 @@ static void plane_expand_border( uint8_t *pix, int i_stride, int i_width, int i_
memset( PPIXEL(i_width, y), PPIXEL(i_width-1, y)[0], i_padh );
}
/* upper band */
if( b_pad_top )
for( y = 0; y < i_padv; y++ )
memcpy( PPIXEL(-i_padh, -y-1), PPIXEL(-i_padh, 0), i_width+2*i_padh );
/* lower band */
if( b_pad_bottom )
for( y = 0; y < i_padv; y++ )
memcpy( PPIXEL(-i_padh, i_height+y), PPIXEL(-i_padh, i_height-1), i_width+2*i_padh );
#undef PPIXEL
}
void x264_frame_expand_border( x264_t *h, x264_frame_t *frame )
void x264_frame_expand_border( x264_t *h, x264_frame_t *frame, int mb_y, int b_end )
{
int i;
int b_start = !mb_y;
if( mb_y & h->sh.b_mbaff )
return;
for( i = 0; i < frame->i_plane; i++ )
{
int stride = frame->i_stride[i];
int width = 16*h->sps->i_mb_width >> !!i;
int height = 16*h->sps->i_mb_height >> !!i;
int height = (b_end ? 16*(h->sps->i_mb_height - mb_y) >> h->sh.b_mbaff : 16) >> !!i;
int padh = PADH >> !!i;
int padv = PADV >> !!i;
if( h->param.b_interlaced )
// buffer: 2 chroma, 3 luma (rounded to 4) because deblocking goes beyond the top of the mb
uint8_t *pix = frame->plane[i] + X264_MAX(0, (16*mb_y-4)*stride >> !!i);
if( b_end && !b_start )
height += 4 >> (!!i + h->sh.b_mbaff);
if( h->sh.b_mbaff )
{
plane_expand_border( frame->plane[i], stride*2, width, height>>1, padh, padv );
plane_expand_border( frame->plane[i]+stride, stride*2, width, height>>1, padh, padv );
plane_expand_border( pix, stride*2, width, height, padh, padv, b_start, b_end );
plane_expand_border( pix+stride, stride*2, width, height, padh, padv, b_start, b_end );
}
else
{
plane_expand_border( frame->plane[i], stride, width, height, padh, padv );
plane_expand_border( pix, stride, width, height, padh, padv, b_start, b_end );
}
}
}
void x264_frame_expand_border_filtered( x264_t *h, x264_frame_t *frame )
void x264_frame_expand_border_filtered( x264_t *h, x264_frame_t *frame, int mb_y, int b_end )
{
/* during filtering, 8 extra pixels were filtered on each edge.
we want to expand border from the last filtered pixel */
int b_start = !mb_y;
int stride = frame->i_stride[0];
int width = 16*h->sps->i_mb_width;
int height = 16*h->sps->i_mb_height;
int width = 16*h->sps->i_mb_width + 16;
int height = b_end ? (16*(h->sps->i_mb_height - mb_y) >> h->sh.b_mbaff) + 16 : 16;
int padh = PADH - 8;
int padv = PADV - 8;
int i;
for( i = 1; i < 4; i++ )
{
if( h->param.b_interlaced )
// buffer: 8 luma, to match the hpel filter
uint8_t *pix = frame->filtered[i] + (16*mb_y - (8 << h->sh.b_mbaff)) * stride - 8;
if( h->sh.b_mbaff )
{
plane_expand_border( frame->filtered[i] - 16*stride - 8, stride*2, width+16, (height>>1)+16, padh, padv );
plane_expand_border( frame->filtered[i] - 15*stride - 8, stride*2, width+16, (height>>1)+16, padh, padv );
plane_expand_border( pix, stride*2, width, height, padh, padv, b_start, b_end );
plane_expand_border( pix+stride, stride*2, width, height, padh, padv, b_start, b_end );
}
else
{
plane_expand_border( frame->filtered[i] - 8*stride - 8, stride, width+16, height+16, padh, padv );
plane_expand_border( pix, stride, width, height, padh, padv, b_start, b_end );
}
}
}
......@@ -242,7 +255,7 @@ void x264_frame_expand_border_lowres( x264_frame_t *frame )
{
int i;
for( i = 0; i < 4; i++ )
plane_expand_border( frame->lowres[i], frame->i_stride_lowres, frame->i_stride_lowres - 2*PADH, frame->i_lines_lowres, PADH, PADV );
plane_expand_border( frame->lowres[i], frame->i_stride_lowres, frame->i_stride_lowres - 2*PADH, frame->i_lines_lowres, PADH, PADV, 1, 1 );
}
void x264_frame_expand_border_mod16( x264_t *h, x264_frame_t *frame )
......@@ -505,19 +518,19 @@ static inline void deblock_edge( x264_t *h, uint8_t *pix, int i_stride, int bS[4
}
}
void x264_frame_deblocking_filter( x264_t *h, int i_slice_type )
void x264_frame_deblock_row( x264_t *h, int mb_y )
{
const int s8x8 = 2 * h->mb.i_mb_stride;
const int s4x4 = 4 * h->mb.i_mb_stride;
const int b_interlaced = h->param.b_interlaced;
const int b_interlaced = h->sh.b_mbaff;
const int mvy_limit = 4 >> b_interlaced;
int mb_y, mb_x;
int mb_x;
int i_stride2[3] = { h->fdec->i_stride[0] << b_interlaced,
h->fdec->i_stride[1] << b_interlaced,
h->fdec->i_stride[2] << b_interlaced };
for( mb_y = 0, mb_x = 0; mb_y < h->sps->i_mb_height; )
for( mb_x = 0; mb_x < h->sps->i_mb_width; )
{
const int mb_xy = mb_y * h->mb.i_mb_stride + mb_x;
const int mb_8x8 = 2 * s8x8 * mb_y + 2 * mb_x;
......@@ -610,7 +623,7 @@ void x264_frame_deblocking_filter( x264_t *h, int i_slice_type )
bS[i] = 0;
for( l = 0; l < 1 + (i_slice_type == SLICE_TYPE_B); l++ )
for( l = 0; l < 1 + (h->sh.i_type == SLICE_TYPE_B); l++ )
{
if( h->mb.ref[l][i8p] != h->mb.ref[l][i8q] ||
abs( h->mb.mv[l][i4p][0] - h->mb.mv[l][i4q][0] ) >= 4 ||
......@@ -673,16 +686,17 @@ void x264_frame_deblocking_filter( x264_t *h, int i_slice_type )
/* next mb */
if( !b_interlaced || (mb_y&1) )
mb_x++;
if( mb_x >= h->sps->i_mb_width )
{
mb_x = 0;
mb_y++;
}
else
mb_y ^= b_interlaced;
mb_y ^= b_interlaced;
}
}
/* Deblock an entire frame by calling x264_frame_deblock_row() for each
 * macroblock row, advancing by (1 + h->sh.b_mbaff) rows per call. */
void x264_frame_deblock( x264_t *h )
{
    int row = 0;
    while( row < h->sps->i_mb_height )
    {
        x264_frame_deblock_row( h, row );
        row += 1 + h->sh.b_mbaff;
    }
}
#ifdef HAVE_MMXEXT
void x264_deblock_v_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_h_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
......
......@@ -70,6 +70,10 @@ typedef struct
int *i_row_bits;
int *i_row_qp;
/* threading */
int i_lines_completed; /* in pixels */
int i_reference_count; /* number of threads using this frame (not necessarily the number of pointers) */
} x264_frame_t;
typedef void (*x264_deblock_inter_t)( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
......@@ -91,14 +95,15 @@ void x264_frame_delete( x264_frame_t *frame );
void x264_frame_copy_picture( x264_t *h, x264_frame_t *dst, x264_picture_t *src );
void x264_frame_expand_border( x264_t *h, x264_frame_t *frame );
void x264_frame_expand_border_filtered( x264_t *h, x264_frame_t *frame );
void x264_frame_expand_border( x264_t *h, x264_frame_t *frame, int mb_y, int b_end );
void x264_frame_expand_border_filtered( x264_t *h, x264_frame_t *frame, int mb_y, int b_end );
void x264_frame_expand_border_lowres( x264_frame_t *frame );
void x264_frame_expand_border_mod16( x264_t *h, x264_frame_t *frame );
void x264_frame_deblocking_filter( x264_t *h, int i_slice_type );
void x264_frame_deblock( x264_t *h );
void x264_frame_deblock_row( x264_t *h, int mb_y );
void x264_frame_filter( int cpu, x264_frame_t *frame, int b_interlaced );
void x264_frame_filter( int cpu, x264_frame_t *frame, int b_interlaced, int mb_y, int b_end );
void x264_frame_init_lowres( int cpu, x264_frame_t *frame );
void x264_deblock_init( int cpu, x264_deblock_function_t *pf );
......
......@@ -157,7 +157,7 @@ x264_hpel_filter_mmxext :
ALIGN 16
.vertical_filter:
prefetchnta [src3 + stride*2 + 32]
prefetcht0 [src3 + stride*2 + 32]
LOAD_ADD mm1, [src ], [src3 + stride*2 ] ; a0
LOAD_ADD mm2, [src + stride ], [src3 + stride ] ; b0
......
......@@ -308,6 +308,27 @@ static int x264_mb_predict_mv_direct16x16_temporal( x264_t *h )
}
}
if( h->param.i_threads > 1 )
{
int di = b8x8 ? 4 : 1;
for( i4=0; i4<16; i4+=di )
{
if( h->mb.cache.mv[0][x264_scan8[i4]][1] > h->mb.mv_max_spel[1]
|| h->mb.cache.mv[1][x264_scan8[i4]][1] > h->mb.mv_max_spel[1] )
{
#if 0
fprintf(stderr, "direct_temporal: (%d,%d) (%d,%d) > %d \n",
h->mb.cache.mv[0][x264_scan8[i4]][0],
h->mb.cache.mv[0][x264_scan8[i4]][1],
h->mb.cache.mv[1][x264_scan8[i4]][0],
h->mb.cache.mv[1][x264_scan8[i4]][1],
h->mb.mv_max_spel[1]);
#endif
return 0;
}
}
}
return 1;
}
......@@ -368,6 +389,19 @@ static int x264_mb_predict_mv_direct16x16_spatial( x264_t *h )
if( IS_INTRA( type_col ) )
return 1;
if( h->param.i_threads > 1
&& ( mv[0][1] > h->mb.mv_max_spel[1]
|| mv[1][1] > h->mb.mv_max_spel[1] ) )
{
#if 0
fprintf(stderr, "direct_spatial: (%d,%d) (%d,%d) > %d \n",
mv[0][0], mv[0][1], mv[1][0], mv[1][1],
h->mb.mv_max_spel[1]);
#endif
return 0;
}
b8x8 = h->sps->b_direct8x8_inference ||
(type_col != P_8x8 && type_col != B_SKIP && type_col != B_DIRECT && type_col != B_8x8);
......@@ -861,6 +895,13 @@ int x264_macroblock_cache_init( x264_t *h )
CHECKED_MALLOC( h->mb.mvr[i][j], 2 * i_mb_count * sizeof(int16_t) );
}
for( i=0; i<=h->param.b_interlaced; i++ )
for( j=0; j<3; j++ )
{
CHECKED_MALLOC( h->mb.intra_border_backup[i][j], h->fdec->i_stride[j] );
h->mb.intra_border_backup[i][j] += 8;
}
/* init with not available (for top right idx=7,15) */
memset( h->mb.cache.ref[0], -2, X264_SCAN8_SIZE * sizeof( int8_t ) );
memset( h->mb.cache.ref[1], -2, X264_SCAN8_SIZE * sizeof( int8_t ) );
......@@ -871,6 +912,9 @@ fail: return -1;
void x264_macroblock_cache_end( x264_t *h )
{
int i, j;
for( i=0; i<=h->param.b_interlaced; i++ )
for( j=0; j<3; j++ )
x264_free( h->mb.intra_border_backup[i][j] - 8 );
for( i=0; i<2; i++ )
{
int i_refs = i ? 1 + h->param.b_bframe_pyramid : h->param.i_frame_reference;
......@@ -1117,6 +1161,7 @@ void x264_macroblock_cache_load( x264_t *h, int i_mb_x, int i_mb_y )
: w * (i_mb_x + i_mb_y * i_stride);
int ref_pix_offset[2] = { i_pix_offset, i_pix_offset };
const uint8_t *plane_fdec = &h->fdec->plane[i][i_pix_offset];
const uint8_t *intra_fdec = &h->mb.intra_border_backup[i_mb_y & h->sh.b_mbaff][i][i_mb_x*16>>!!i];
x264_frame_t **fref[2] = { h->fref0, h->fref1 };
int j, k, l;
......@@ -1127,7 +1172,7 @@ void x264_macroblock_cache_load( x264_t *h, int i_mb_x, int i_mb_y )
h->mc.copy[i?PIXEL_8x8:PIXEL_16x16]( h->mb.pic.p_fenc[i], FENC_STRIDE,
&h->fenc->plane[i][i_pix_offset], i_stride2, w );
memcpy( &h->mb.pic.p_fdec[i][-1-FDEC_STRIDE], &plane_fdec[-1-i_stride2], w*3/2+1 );
memcpy( &h->mb.pic.p_fdec[i][-1-FDEC_STRIDE], intra_fdec-1, w*3/2+1 );
for( j = 0; j < w; j++ )
h->mb.pic.p_fdec[i][-1+j*FDEC_STRIDE] = plane_fdec[-1+j*i_stride2];
......
......@@ -390,46 +390,48 @@ void x264_mc_init( int cpu, x264_mc_functions_t *pf )
extern void x264_hpel_filter_mmxext( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src,
int i_stride, int i_width, int i_height );
void x264_frame_filter( int cpu, x264_frame_t *frame, int b_interlaced )
void x264_frame_filter( int cpu, x264_frame_t *frame, int b_interlaced, int mb_y, int b_end )
{
const int x_inc = 16, y_inc = 16;
const int stride = frame->i_stride[0] << b_interlaced;
const int height = frame->i_lines[0] >> b_interlaced;
const int start = (mb_y*16 >> b_interlaced) - 8;
const int height = ((b_end ? frame->i_lines[0] : mb_y*16) >> b_interlaced) + 8;
int x, y;
pf_mc_t int_h = mc_hh;
pf_mc_t int_v = mc_hv;
pf_mc_t int_hv = mc_hc;
if( mb_y & b_interlaced )
return;
mb_y >>= b_interlaced;
#ifdef HAVE_MMXEXT
if ( cpu & X264_CPU_MMXEXT )
{
int offs = -8*stride - 8;
// buffer = 4 for deblock + 3 for 6tap, rounded to 8
int offs = start*stride - 8;
x264_hpel_filter_mmxext(
frame->filtered[1] + offs,
frame->filtered[2] + offs,
frame->filtered[3] + offs,
frame->plane[0] + offs,
stride, stride - 48, height + 16);
stride, stride - 48, height - start );
}
else
#endif
{
for( y = -8; y < height + 8; y += y_inc )
for( y = start; y < height; y += y_inc )
{
uint8_t *p_in = frame->plane[0] + y * stride - 8;
uint8_t *p_h = frame->filtered[1] + y * stride - 8;
uint8_t *p_v = frame->filtered[2] + y * stride - 8;
uint8_t *p_hv = frame->filtered[3] + y * stride - 8;
uint8_t *p_c = frame->filtered[3] + y * stride - 8;
for( x = -8; x < stride - 64 + 8; x += x_inc )
{
int_h( p_in, stride, p_h, stride, x_inc, y_inc );
int_v( p_in, stride, p_v, stride, x_inc, y_inc );
int_hv( p_in, stride, p_hv, stride, x_inc, y_inc );
mc_hh( p_in, stride, p_h, stride, x_inc, y_inc );
mc_hv( p_in, stride, p_v, stride, x_inc, y_inc );
mc_hc( p_in, stride, p_c, stride, x_inc, y_inc );
p_h += x_inc;
p_v += x_inc;
p_hv += x_inc;
p_c += x_inc;
p_in += x_inc;
}
}
......@@ -440,8 +442,9 @@ void x264_frame_filter( int cpu, x264_frame_t *frame, int b_interlaced )
* the sum of an 8x8 pixel region with top-left corner on that point.
* in the lower plane, 4x4 sums (needed only with --analyse p4x4). */
if( frame->integral )
if( frame->integral && b_end )
{
//FIXME slice
memset( frame->integral - 32 * stride - 32, 0, stride * sizeof(uint16_t) );
for( y = -32; y < frame->i_lines[0] + 31; y++ )
{
......
Old threading method: slice-based
application calls x264
x264 runs B-adapt and ratecontrol (serial)
split frame into several slices, and spawn a thread for each slice
wait until all threads are done
deblock and hpel filter (serial)
return to application
In x264cli, there is one additional thread to decode the input.
New threading method: frame-based
application calls x264
x264 runs B-adapt and ratecontrol (serial to the application, but parallel to the other x264 threads)
spawn a thread for this frame
thread runs encode in 1 slice, deblock, hpel filter
meanwhile x264 waits for the oldest thread to finish
return to application, but the rest of the threads continue running in the background
No additional threads are needed to decode the input, unless decoding+B-adapt is slower than slice+deblock+hpel, in which case an additional input thread would allow decoding in parallel to B-adapt.
Penalties for slice-based threading:
Each slice adds some bitrate (or equivalently reduces quality), for a variety of reasons: the slice header costs some bits, cabac contexts are reset, mvs and intra samples can't be predicted across the slice boundary.
In CBR mode, we have to allocate bits between slices before encoding them, which may lead to uneven quality.
Some parts of the encoder are serial, so it doesn't scale well with lots of cpus.
Penalties for frame-based threading:
To allow encoding of multiple frames in parallel, we have to ensure that any given macroblock uses motion vectors only from pieces of the reference frames that have been encoded already. This is usually not noticeable, but can matter for very fast upward motion.
We have to commit to one frame type before starting on the frame. Thus scenecut detection must run during the lowres pre-motion-estimation along with B-adapt, which makes it faster but less accurate than re-encoding the whole frame.
Ratecontrol gets delayed feedback, since it has to plan frame N before frame N-1 finishes.
Benchmarks:
cpu: 4x woodcrest 3GHz
content: 480p
x264 -B1000 -b2 -m1 -Anone
threads speed psnr
old new old new
1: 1.000x 1.000x 0.000 0.000
2: 1.168x 1.413x -0.038 -0.007
3: 1.208x 1.814x -0.064 -0.005
4: 1.293x 2.329x -0.095 -0.006
5: 2.526x -0.007
6: 2.658x -0.001
7: 2.723x -0.018
8: 2.712x -0.019
x264 -B1000 -b2 -m5
threads speed psnr
old new old new
1: 1.000x 1.000x 0.000 0.000
2: 1.319x 1.517x -0.036 -0.006
3: 1.466x 2.013x -0.068 -0.005
4: 1.578x 2.741x -0.101 -0.004
5: 3.022x -0.015
6: 3.221x -0.014
7: 3.331x -0.020
8: 3.425x -0.025
x264 -B1000 -b2 -m6 -r3 -8 --b-rdo
threads speed psnr
old new old new
1: 1.000x 1.000x 0.000 0.000
2: 1.531x 1.707x -0.032 -0.006
3: 1.866x 2.277x -0.061 -0.005
4: 2.097x 3.204x -0.088 -0.006
5: 3.468x -0.013
6: 3.629x -0.010
7: 3.716x -0.014
8: 3.745x -0.018
......@@ -26,6 +26,7 @@
#include <string.h>
#include <math.h>
#include <limits.h>
#include <unistd.h>
#include "common/common.h"
#include "macroblock.h"
......@@ -219,27 +220,54 @@ static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int i_qp )
/* II: Inter part P/B frame */
if( h->sh.i_type != SLICE_TYPE_I )
{
int i;
int i_fmv_range = h->param.analyse.i_mv_range - 16;
int i, j;
int i_fmv_range = 4 * h->param.analyse.i_mv_range;
int i_fpel_border = 5; // 3 for hex search, 2 for subpel, ignores subme7 & bime
/* Calculate max allowed MV range */
#define CLIP_FMV(mv) x264_clip3( mv, -i_fmv_range, i_fmv_range )
h->mb.mv_min[0] = 4*( -16*h->mb.i_mb_x - 24 );
h->mb.mv_max[0] = 4*( 16*( h->sps->i_mb_width - h->mb.i_mb_x - 1 ) + 24 );
h->mb.mv_min_fpel[0] = CLIP_FMV( -16*h->mb.i_mb_x - 8 );
h->mb.mv_max_fpel[0] = CLIP_FMV( 16*( h->sps->i_mb_width - h->mb.i_mb_x - 1 ) + 8 );
h->mb.mv_min_spel[0] = 4*( h->mb.mv_min_fpel[0] - 16 );
h->mb.mv_max_spel[0] = 4*( h->mb.mv_max_fpel[0] + 16 );
h->mb.mv_min_spel[0] = CLIP_FMV( h->mb.mv_min[0] );
h->mb.mv_max_spel[0] = CLIP_FMV( h->mb.mv_max[0] );
h->mb.mv_min_fpel[0] = (h->mb.mv_min_spel[0]>>2) + i_fpel_border;
h->mb.mv_max_fpel[0] = (h->mb.mv_max_spel[0]>>2) - i_fpel_border;
if( h->mb.i_mb_x == 0)
{
int mb_y = h->mb.i_mb_y >> h->sh.b_mbaff;
int mb_height = h->sps->i_mb_height >> h->sh.b_mbaff;
int thread_mvy_range = i_fmv_range;
if( h->param.i_threads > 1 )
{
int pix_y = (h->mb.i_mb_y | h->mb.b_interlaced) * 16;
int thresh = pix_y + h->param.analyse.i_mv_range_thread;
for( i = (h->sh.i_type == SLICE_TYPE_B); i >= 0; i-- )
{
x264_frame_t **fref = i ? h->fref1 : h->fref0;
int i_ref = i ? h->i_ref1 : h->i_ref0;
for( j=0; j<i_ref; j++ )
{
// could use a condition variable or the like, but
// this way is faster at least on LinuxThreads.
while( fref[j]->i_lines_completed < thresh )
usleep(100);
thread_mvy_range = X264_MIN( thread_mvy_range, fref[j]->i_lines_completed - pix_y );
}
}
if( h->param.b_deterministic )
thread_mvy_range = h->param.analyse.i_mv_range_thread;
if( h->mb.b_interlaced )
thread_mvy_range >>= 1;
}
h->mb.mv_min[1] = 4*( -16*mb_y - 24 );
h->mb.mv_max[1] = 4*( 16*( mb_height - mb_y - 1 ) + 24 );
h->mb.mv_min_fpel[1] = CLIP_FMV( -16*mb_y - 8 );
h->mb.mv_max_fpel[1] = CLIP_FMV( 16*( mb_height - mb_y - 1 ) + 8 );
h->mb.mv_min_spel[1] = 4*( h->mb.mv_min_fpel[1] - 16 );
h->mb.mv_max_spel[1] = 4*( h->mb.mv_max_fpel[1] + 16 );
h->mb.mv_min_spel[1] = CLIP_FMV( h->mb.mv_min[1] );
h->mb.mv_max_spel[1] = CLIP_FMV( h->mb.mv_max[1] );
h->mb.mv_max_spel[1] = X264_MIN( h->mb.mv_max_spel[1], thread_mvy_range*4 );
h->mb.mv_min_fpel[1] = (h->mb.mv_min_spel[1]>>2) + i_fpel_border;
h->mb.mv_max_fpel[1] = (h->mb.mv_max_spel[1]>>2) - i_fpel_border;
}
#undef CLIP_FMV
......@@ -943,6 +971,7 @@ static void x264_mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a )
{
h->mb.i_type = P_SKIP;
x264_analyse_update_cache( h, a );
assert( h->mb.cache.pskip_mv[1] <= h->mb.mv_max_spel[1] || h->param.i_threads == 1 );
return;
}
......@@ -960,6 +989,7 @@ static void x264_mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a )
}
x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref );
assert( a->l0.me16x16.mv[1] <= h->mb.mv_max_spel[1] || h->param.i_threads == 1 );
h->mb.i_type = P_L0;
if( a->b_mbrd && a->l0.i_ref == 0
......@@ -2043,7 +2073,10 @@ void x264_macroblock_analyse( x264_t *h )
analysis.b_try_pskip = 0;
if( h->param.analyse.b_fast_pskip )
{
if( h->param.analyse.i_subpel_refine >= 3 )
if( h->param.i_threads > 1 && h->mb.cache.pskip_mv[1] > h->mb.mv_max_spel[1] )
// FIXME don't need to check this if the reference frame is done
{}
else if( h->param.analyse.i_subpel_refine >= 3 )
analysis.b_try_pskip = 1;
else if( h->mb.i_mb_type_left == P_SKIP ||