Commit b1eac265 authored by Fiona Glaser
Browse files

Make MV costs global instead of static

Fixes some extremely rare threading race conditions and makes the code cleaner.
Downside: slightly higher memory usage when calling multiple encoders from the same application.
parent c8c06079
......@@ -52,6 +52,8 @@ do {\
#define X264_THREAD_MAX 128
#define X264_PCM_COST (386*8)
#define X264_LOOKAHEAD_MAX 250
// arbitrary, but low because SATD scores are 1/4 normal
#define X264_LOOKAHEAD_QP 12
// number of pixels (per thread) in progress at any given time.
// 16 for the macroblock in progress + 3 for deblocking + 3 for motion compensation filter + 2 for extra safety
......@@ -347,6 +349,12 @@ struct x264_t
uint16_t (*quant4_bias[4])[16]; /* [4][52][16] */
uint16_t (*quant8_bias[2])[64]; /* [2][52][64] */
/* mv/ref cost arrays. Indexed by lambda instead of
* qp because, due to rounding, some quantizers share
* lambdas. This saves memory. */
uint16_t *cost_mv[92];
uint16_t *cost_mv_fpel[92][4];
const uint8_t *chroma_qp_table; /* includes both the nonlinear luma->chroma mapping and chroma_qp_offset */
ALIGNED_16( uint32_t nr_residual_sum[2][64] );
......
......@@ -140,6 +140,7 @@ static inline int x264_pthread_create( x264_pthread_t *t, void *a, void *(*f)(vo
#define x264_pthread_attr_t pthread_attr_t
#define x264_pthread_attr_init pthread_attr_init
#define x264_pthread_attr_destroy pthread_attr_destroy
#define X264_PTHREAD_MUTEX_INITIALIZER PTHREAD_MUTEX_INITIALIZER
#else
#define x264_pthread_mutex_t int
#define x264_pthread_mutex_init(m,f) 0
......@@ -154,6 +155,7 @@ static inline int x264_pthread_create( x264_pthread_t *t, void *a, void *(*f)(vo
#define x264_pthread_attr_t int
#define x264_pthread_attr_init(a) 0
#define x264_pthread_attr_destroy(a)
#define X264_PTHREAD_MUTEX_INITIALIZER 0
#endif
#define WORD_SIZE sizeof(void*)
......
......@@ -77,7 +77,7 @@ typedef struct
int i_lambda;
int i_lambda2;
int i_qp;
int16_t *p_cost_mv;
uint16_t *p_cost_mv;
uint16_t *p_cost_ref0;
uint16_t *p_cost_ref1;
int i_mbrd;
......@@ -237,46 +237,36 @@ static const int i_sub_mb_p_cost_table[4] = {
static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a );
/* Indexed by lambda instead of qp because, due to rounding,
* some quantizers share lambdas. This saves memory. */
uint16_t *x264_cost_mv_fpel[92][4];
uint16_t x264_cost_ref[92][3][33];
static uint16_t x264_cost_ref[92][3][33];
static x264_pthread_mutex_t cost_ref_mutex = X264_PTHREAD_MUTEX_INITIALIZER;
/* initialize an array of lambda*nbits for all possible mvs */
static int x264_mb_analyse_load_costs( x264_t *h, x264_mb_analysis_t *a )
int x264_analyse_init_costs( x264_t *h, int qp )
{
static int16_t *p_cost_mv[92];
int i, j;
if( !p_cost_mv[a->i_lambda] )
{
x264_emms();
/* could be faster, but isn't called many times */
/* factor of 4 from qpel, 2 from sign, and 2 because mv can be opposite from mvp */
CHECKED_MALLOC( p_cost_mv[a->i_lambda], (4*4*2048 + 1) * sizeof(int16_t) );
p_cost_mv[a->i_lambda] += 2*4*2048;
for( i = 0; i <= 2*4*2048; i++ )
{
p_cost_mv[a->i_lambda][-i] =
p_cost_mv[a->i_lambda][i] = a->i_lambda * (log2f(i+1)*2 + 0.718f + !!i) + .5f;
}
for( i = 0; i < 3; i++ )
for( j = 0; j < 33; j++ )
x264_cost_ref[a->i_lambda][i][j] = i ? a->i_lambda * bs_size_te( i, j ) : 0;
}
a->p_cost_mv = p_cost_mv[a->i_lambda];
a->p_cost_ref0 = x264_cost_ref[a->i_lambda][x264_clip3(h->sh.i_num_ref_idx_l0_active-1,0,2)];
a->p_cost_ref1 = x264_cost_ref[a->i_lambda][x264_clip3(h->sh.i_num_ref_idx_l1_active-1,0,2)];
/* FIXME is this useful for all me methods? */
if( h->param.analyse.i_me_method >= X264_ME_ESA && !x264_cost_mv_fpel[a->i_lambda][0] )
int lambda = x264_lambda_tab[qp];
if( h->cost_mv[lambda] )
return 0;
/* factor of 4 from qpel, 2 from sign, and 2 because mv can be opposite from mvp */
CHECKED_MALLOC( h->cost_mv[lambda], (4*4*2048 + 1) * sizeof(uint16_t) );
h->cost_mv[lambda] += 2*4*2048;
for( i = 0; i <= 2*4*2048; i++ )
{
h->cost_mv[lambda][-i] =
h->cost_mv[lambda][i] = lambda * (log2f(i+1)*2 + 0.718f + !!i) + .5f;
}
x264_pthread_mutex_lock( &cost_ref_mutex );
for( i = 0; i < 3; i++ )
for( j = 0; j < 33; j++ )
x264_cost_ref[lambda][i][j] = i ? lambda * bs_size_te( i, j ) : 0;
x264_pthread_mutex_unlock( &cost_ref_mutex );
if( h->param.analyse.i_me_method >= X264_ME_ESA && !h->cost_mv_fpel[lambda][0] )
{
for( j=0; j<4; j++ )
{
CHECKED_MALLOC( x264_cost_mv_fpel[a->i_lambda][j], (4*2048 + 1) * sizeof(int16_t) );
x264_cost_mv_fpel[a->i_lambda][j] += 2*2048;
CHECKED_MALLOC( h->cost_mv_fpel[lambda][j], (4*2048 + 1) * sizeof(uint16_t) );
h->cost_mv_fpel[lambda][j] += 2*2048;
for( i = -2*2048; i < 2*2048; i++ )
x264_cost_mv_fpel[a->i_lambda][j][i] = p_cost_mv[a->i_lambda][i*4+j];
h->cost_mv_fpel[lambda][j][i] = h->cost_mv[lambda][i*4+j];
}
}
return 0;
......@@ -284,6 +274,27 @@ fail:
return -1;
}
/* Release every per-lambda mv cost table held by this encoder.
 *
 * The pointers stored in h->cost_mv[] and h->cost_mv_fpel[][] are offset
 * into the middle of their allocations (by 2*4*2048 and 2*2048 entries
 * respectively, see x264_analyse_init_costs) so they can be indexed with
 * negative values; that offset is removed again before freeing.
 * Slots that were never allocated are NULL and are skipped. */
void x264_analyse_free_costs( x264_t *h )
{
    int i, j;
    for( i = 0; i < 92; i++ )
    {
        if( h->cost_mv[i] )
            x264_free( h->cost_mv[i] - 2*4*2048 );
        /* Test each fpel table on its own rather than only entry 0:
         * if allocation failed part-way through the four tables, the
         * remaining entries are still NULL and must not be un-offset
         * and handed to x264_free. */
        for( j = 0; j < 4; j++ )
            if( h->cost_mv_fpel[i][j] )
                x264_free( h->cost_mv_fpel[i][j] - 2*2048 );
    }
}
/* Point the analysis context at the already-built cost tables for its
 * current lambda: the mv cost table owned by the encoder, and the ref
 * cost rows selected by the number of active references in each list
 * (clipped to the 3 rows available). Nothing is allocated here. */
static void x264_mb_analyse_load_costs( x264_t *h, x264_mb_analysis_t *a )
{
    const int lambda = a->i_lambda;
    const int row_l0 = x264_clip3( h->sh.i_num_ref_idx_l0_active-1, 0, 2 );
    const int row_l1 = x264_clip3( h->sh.i_num_ref_idx_l1_active-1, 0, 2 );

    a->p_cost_mv   = h->cost_mv[lambda];
    a->p_cost_ref0 = x264_cost_ref[lambda][row_l0];
    a->p_cost_ref1 = x264_cost_ref[lambda][row_l1];
}
static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int i_qp )
{
int i = h->param.analyse.i_subpel_refine - (h->sh.i_type == SLICE_TYPE_B);
......@@ -2317,7 +2328,7 @@ static inline void x264_mb_analyse_qp_rd( x264_t *h, x264_mb_analysis_t *a )
/*****************************************************************************
* x264_macroblock_analyse:
*****************************************************************************/
int x264_macroblock_analyse( x264_t *h )
void x264_macroblock_analyse( x264_t *h )
{
x264_mb_analysis_t analysis;
int i_cost = COST_MAX;
......@@ -2392,13 +2403,12 @@ int x264_macroblock_analyse( x264_t *h )
int i_thresh16x8;
int i_satd_inter, i_satd_intra;
if( x264_mb_analyse_load_costs( h, &analysis ) )
return -1;
x264_mb_analyse_load_costs( h, &analysis );
x264_mb_analyse_inter_p16x16( h, &analysis );
if( h->mb.i_type == P_SKIP )
return 0;
return;
if( flags & X264_ANALYSE_PSUB16x16 )
{
......@@ -2686,8 +2696,7 @@ int x264_macroblock_analyse( x264_t *h )
int i_satd_inter;
h->mb.b_skip_mc = 0;
if( x264_mb_analyse_load_costs( h, &analysis ) )
return -1;
x264_mb_analyse_load_costs( h, &analysis );
/* select best inter mode */
/* direct must be first */
......@@ -2713,7 +2722,7 @@ int x264_macroblock_analyse( x264_t *h )
{
h->mb.i_type = B_SKIP;
x264_analyse_update_cache( h, &analysis );
return 0;
return;
}
}
......@@ -2945,7 +2954,6 @@ int x264_macroblock_analyse( x264_t *h )
x264_psy_trellis_init( h, 0 );
if( h->mb.b_trellis == 1 || h->mb.b_noise_reduction )
h->mb.i_skip_intra = 0;
return 0;
}
/*-------------------- Update MB from the analysis ----------------------*/
......
......@@ -24,9 +24,10 @@
#ifndef X264_ANALYSE_H
#define X264_ANALYSE_H
int x264_macroblock_analyse( x264_t *h );
int x264_analyse_init_costs( x264_t *h, int qp );
void x264_analyse_free_costs( x264_t *h );
void x264_macroblock_analyse( x264_t *h );
void x264_slicetype_decide( x264_t *h );
int x264_lowres_context_alloc( x264_t *h );
void x264_slicetype_analyse( x264_t *h, int keyframe );
......
......@@ -748,7 +748,7 @@ x264_t *x264_encoder_open( x264_param_t *param )
{
x264_t *h;
char buf[1000], *p;
int i, i_slicetype_length;
int i, qp, i_slicetype_length;
CHECKED_MALLOCZERO( h, sizeof(x264_t) );
......@@ -869,6 +869,12 @@ x264_t *x264_encoder_open( x264_param_t *param )
p += sprintf( p, " none!" );
x264_log( h, X264_LOG_INFO, "%s\n", buf );
for( qp = h->param.rc.i_qp_min; qp <= h->param.rc.i_qp_max; qp++ )
if( x264_analyse_init_costs( h, qp ) )
goto fail;
if( x264_analyse_init_costs( h, X264_LOOKAHEAD_QP ) )
goto fail;
h->out.i_nal = 0;
h->out.i_bitstream = X264_MAX( 1000000, h->param.i_width * h->param.i_height * 4
* ( h->param.rc.i_rc_method == X264_RC_ABR ? pow( 0.95, h->param.rc.i_qp_min )
......@@ -900,9 +906,6 @@ x264_t *x264_encoder_open( x264_param_t *param )
if( x264_ratecontrol_new( h ) < 0 )
goto fail;
if( x264_lowres_context_alloc( h ) )
goto fail;
if( h->param.psz_dump_yuv )
{
/* create or truncate the reconstructed video file */
......@@ -1332,12 +1335,7 @@ static int x264_slice_write( x264_t *h )
/* load cache */
x264_macroblock_cache_load( h, i_mb_x, i_mb_y );
/* analyse parameters
* Slice I: choose I_4x4 or I_16x16 mode
* Slice P: choose between using P mode or intra (4x4 or 16x16)
* */
if( x264_macroblock_analyse( h ) )
return -1;
x264_macroblock_analyse( h );
/* encode this macroblock -> be careful it can change the mb type to P_SKIP if needed */
x264_macroblock_encode( h );
......@@ -2230,6 +2228,8 @@ void x264_encoder_close ( x264_t *h )
x264_cqm_delete( h );
x264_analyse_free_costs( h );
if( h->param.i_threads > 1)
h = h->thread[ h->i_thread_phase % h->param.i_threads ];
......
......@@ -195,8 +195,8 @@ void x264_me_search_ref( x264_t *h, x264_me_t *m, int16_t (*mvc)[2], int i_mvc,
#define CHECK_MVRANGE(mx,my) ( mx >= mv_x_min && mx <= mv_x_max && my >= mv_y_min && my <= mv_y_max )
const int16_t *p_cost_mvx = m->p_cost_mv - m->mvp[0];
const int16_t *p_cost_mvy = m->p_cost_mv - m->mvp[1];
const uint16_t *p_cost_mvx = m->p_cost_mv - m->mvp[0];
const uint16_t *p_cost_mvy = m->p_cost_mv - m->mvp[1];
bmx = x264_clip3( m->mvp[0], mv_x_min*4, mv_x_max*4 );
bmy = x264_clip3( m->mvp[1], mv_y_min*4, mv_y_max*4 );
......@@ -452,8 +452,8 @@ me_hex2:
/* hexagon grid */
omx = bmx; omy = bmy;
const int16_t *p_cost_omvx = p_cost_mvx + omx*4;
const int16_t *p_cost_omvy = p_cost_mvy + omy*4;
const uint16_t *p_cost_omvx = p_cost_mvx + omx*4;
const uint16_t *p_cost_omvy = p_cost_mvy + omy*4;
i = 1;
do
{
......@@ -569,7 +569,7 @@ me_hex2:
int delta = x264_pixel_size[sad_size].w;
int16_t *xs = h->scratch_buffer;
int xn;
uint16_t *cost_fpel_mvx = x264_cost_mv_fpel[x264_lambda_tab[h->mb.i_qp]][-m->mvp[0]&3] + (-m->mvp[0]>>2);
uint16_t *cost_fpel_mvx = h->cost_mv_fpel[x264_lambda_tab[h->mb.i_qp]][-m->mvp[0]&3] + (-m->mvp[0]>>2);
h->pixf.sad_x4[sad_size]( zero, p_fenc, p_fenc+delta,
p_fenc+delta*FENC_STRIDE, p_fenc+delta+delta*FENC_STRIDE,
......@@ -768,8 +768,8 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite
{
const int bw = x264_pixel_size[m->i_pixel].w;
const int bh = x264_pixel_size[m->i_pixel].h;
const int16_t *p_cost_mvx = m->p_cost_mv - m->mvp[0];
const int16_t *p_cost_mvy = m->p_cost_mv - m->mvp[1];
const uint16_t *p_cost_mvx = m->p_cost_mv - m->mvp[0];
const uint16_t *p_cost_mvy = m->p_cost_mv - m->mvp[1];
const int i_pixel = m->i_pixel;
const int b_chroma_me = h->mb.b_chroma_me && i_pixel <= PIXEL_8x8;
......@@ -942,10 +942,10 @@ static void ALWAYS_INLINE x264_me_refine_bidir( x264_t *h, x264_me_t *m0, x264_m
const int i_pixel = m0->i_pixel;
const int bw = x264_pixel_size[i_pixel].w;
const int bh = x264_pixel_size[i_pixel].h;
const int16_t *p_cost_m0x = m0->p_cost_mv - m0->mvp[0];
const int16_t *p_cost_m0y = m0->p_cost_mv - m0->mvp[1];
const int16_t *p_cost_m1x = m1->p_cost_mv - m1->mvp[0];
const int16_t *p_cost_m1y = m1->p_cost_mv - m1->mvp[1];
const uint16_t *p_cost_m0x = m0->p_cost_mv - m0->mvp[0];
const uint16_t *p_cost_m0y = m0->p_cost_mv - m0->mvp[1];
const uint16_t *p_cost_m1x = m1->p_cost_mv - m1->mvp[0];
const uint16_t *p_cost_m1y = m1->p_cost_mv - m1->mvp[1];
ALIGNED_ARRAY_16( uint8_t, pixy_buf,[2],[9][16*16] );
ALIGNED_8( uint8_t pixu_buf[2][9][8*8] );
ALIGNED_8( uint8_t pixv_buf[2][9][8*8] );
......@@ -1073,7 +1073,7 @@ void x264_me_refine_qpel_rd( x264_t *h, x264_me_t *m, int i_lambda2, int i4, int
static const int pixel_mv_offs[] = { 0, 4, 4*8, 0, 2, 2*8, 0 };
int16_t *cache_mv = h->mb.cache.mv[i_list][x264_scan8[i4]];
int16_t *cache_mv2 = cache_mv + pixel_mv_offs[m->i_pixel];
const int16_t *p_cost_mvx, *p_cost_mvy;
const uint16_t *p_cost_mvx, *p_cost_mvy;
const int bw = x264_pixel_size[m->i_pixel].w>>2;
const int bh = x264_pixel_size[m->i_pixel].h>>2;
const int i_pixel = m->i_pixel;
......
......@@ -31,7 +31,7 @@ typedef struct
{
/* input */
int i_pixel; /* PIXEL_WxH */
int16_t *p_cost_mv; /* lambda * nbits for each possible mv */
uint16_t *p_cost_mv; /* lambda * nbits for each possible mv */
int i_ref_cost;
int i_ref;
......
......@@ -953,6 +953,8 @@ void x264_ratecontrol_start( x264_t *h, int i_force_qp )
}
}
q = x264_clip3f( q, h->param.rc.i_qp_min, h->param.rc.i_qp_max );
rc->qpa_rc =
rc->qpa_aq = 0;
h->fdec->f_qp_avg_rc =
......
......@@ -29,22 +29,14 @@
#include "me.h"
static int x264_lowres_context_init( x264_t *h, x264_mb_analysis_t *a )
static void x264_lowres_context_init( x264_t *h, x264_mb_analysis_t *a )
{
a->i_qp = 12; // arbitrary, but low because SATD scores are 1/4 normal
a->i_qp = X264_LOOKAHEAD_QP;
a->i_lambda = x264_lambda_tab[ a->i_qp ];
if( x264_mb_analyse_load_costs( h, a ) )
return -1;
x264_mb_analyse_load_costs( h, a );
h->mb.i_me_method = X264_MIN( X264_ME_HEX, h->param.analyse.i_me_method ); // maybe dia?
h->mb.i_subpel_refine = 4; // 3 should be enough, but not tweaking for speed now
h->mb.b_chroma_me = 0;
return 0;
}
int x264_lowres_context_alloc( x264_t *h )
{
x264_mb_analysis_t a;
return x264_lowres_context_init( h, &a );
}
static int x264_slicetype_mb_cost( x264_t *h, x264_mb_analysis_t *a,
......
......@@ -262,9 +262,10 @@ static void Help( x264_param_t *defaults, int longhelp )
" where <option> is either\n"
" q=<integer> (force QP)\n"
" or b=<float> (bitrate multiplier)\n" );
H1( " --qpfile <string> Force frametypes and QPs for some or all frames\n"
H2( " --qpfile <string> Force frametypes and QPs for some or all frames\n"
" Format of each line: framenumber frametype QP\n"
" QP of -1 lets x264 choose. Frametypes: I,i,P,B,b.\n" );
" QP of -1 lets x264 choose. Frametypes: I,i,P,B,b.\n"
" QPs are restricted by qpmin/qpmax.\n" );
H1( "\n" );
H1( "Analysis:\n" );
H1( "\n" );
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment