Commit b1eac265 authored by Fiona Glaser

Make MV costs global instead of static

Fixes some extremely rare threading race conditions and makes the code cleaner.
Downside: slightly higher memory usage when calling multiple encoders from the same application.
parent c8c06079
......@@ -52,6 +52,8 @@ do {\
#define X264_THREAD_MAX 128
#define X264_PCM_COST (386*8)
#define X264_LOOKAHEAD_MAX 250
// arbitrary, but low because SATD scores are 1/4 normal
#define X264_LOOKAHEAD_QP 12
// number of pixels (per thread) in progress at any given time.
// 16 for the macroblock in progress + 3 for deblocking + 3 for motion compensation filter + 2 for extra safety
......@@ -347,6 +349,12 @@ struct x264_t
uint16_t (*quant4_bias[4])[16]; /* [4][52][16] */
uint16_t (*quant8_bias[2])[64]; /* [2][52][64] */
/* mv/ref cost arrays. Indexed by lambda instead of
* qp because, due to rounding, some quantizers share
* lambdas. This saves memory. */
uint16_t *cost_mv[92];
uint16_t *cost_mv_fpel[92][4];
const uint8_t *chroma_qp_table; /* includes both the nonlinear luma->chroma mapping and chroma_qp_offset */
ALIGNED_16( uint32_t nr_residual_sum[2][64] );
......
......@@ -140,6 +140,7 @@ static inline int x264_pthread_create( x264_pthread_t *t, void *a, void *(*f)(vo
#define x264_pthread_attr_t pthread_attr_t
#define x264_pthread_attr_init pthread_attr_init
#define x264_pthread_attr_destroy pthread_attr_destroy
#define X264_PTHREAD_MUTEX_INITIALIZER PTHREAD_MUTEX_INITIALIZER
#else
#define x264_pthread_mutex_t int
#define x264_pthread_mutex_init(m,f) 0
......@@ -154,6 +155,7 @@ static inline int x264_pthread_create( x264_pthread_t *t, void *a, void *(*f)(vo
#define x264_pthread_attr_t int
#define x264_pthread_attr_init(a) 0
#define x264_pthread_attr_destroy(a)
#define X264_PTHREAD_MUTEX_INITIALIZER 0
#endif
#define WORD_SIZE sizeof(void*)
......
......@@ -77,7 +77,7 @@ typedef struct
int i_lambda;
int i_lambda2;
int i_qp;
int16_t *p_cost_mv;
uint16_t *p_cost_mv;
uint16_t *p_cost_ref0;
uint16_t *p_cost_ref1;
int i_mbrd;
......@@ -237,46 +237,36 @@ static const int i_sub_mb_p_cost_table[4] = {
static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a );
/* Indexed by lambda instead of qp because, due to rounding,
* some quantizers share lambdas. This saves memory. */
uint16_t *x264_cost_mv_fpel[92][4];
uint16_t x264_cost_ref[92][3][33];
static uint16_t x264_cost_ref[92][3][33];
static x264_pthread_mutex_t cost_ref_mutex = X264_PTHREAD_MUTEX_INITIALIZER;
/* initialize an array of lambda*nbits for all possible mvs */
static int x264_mb_analyse_load_costs( x264_t *h, x264_mb_analysis_t *a )
int x264_analyse_init_costs( x264_t *h, int qp )
{
static int16_t *p_cost_mv[92];
int i, j;
if( !p_cost_mv[a->i_lambda] )
{
x264_emms();
/* could be faster, but isn't called many times */
/* factor of 4 from qpel, 2 from sign, and 2 because mv can be opposite from mvp */
CHECKED_MALLOC( p_cost_mv[a->i_lambda], (4*4*2048 + 1) * sizeof(int16_t) );
p_cost_mv[a->i_lambda] += 2*4*2048;
for( i = 0; i <= 2*4*2048; i++ )
{
p_cost_mv[a->i_lambda][-i] =
p_cost_mv[a->i_lambda][i] = a->i_lambda * (log2f(i+1)*2 + 0.718f + !!i) + .5f;
}
for( i = 0; i < 3; i++ )
for( j = 0; j < 33; j++ )
x264_cost_ref[a->i_lambda][i][j] = i ? a->i_lambda * bs_size_te( i, j ) : 0;
}
a->p_cost_mv = p_cost_mv[a->i_lambda];
a->p_cost_ref0 = x264_cost_ref[a->i_lambda][x264_clip3(h->sh.i_num_ref_idx_l0_active-1,0,2)];
a->p_cost_ref1 = x264_cost_ref[a->i_lambda][x264_clip3(h->sh.i_num_ref_idx_l1_active-1,0,2)];
/* FIXME is this useful for all me methods? */
if( h->param.analyse.i_me_method >= X264_ME_ESA && !x264_cost_mv_fpel[a->i_lambda][0] )
int lambda = x264_lambda_tab[qp];
if( h->cost_mv[lambda] )
return 0;
/* factor of 4 from qpel, 2 from sign, and 2 because mv can be opposite from mvp */
CHECKED_MALLOC( h->cost_mv[lambda], (4*4*2048 + 1) * sizeof(uint16_t) );
h->cost_mv[lambda] += 2*4*2048;
for( i = 0; i <= 2*4*2048; i++ )
{
h->cost_mv[lambda][-i] =
h->cost_mv[lambda][i] = lambda * (log2f(i+1)*2 + 0.718f + !!i) + .5f;
}
x264_pthread_mutex_lock( &cost_ref_mutex );
for( i = 0; i < 3; i++ )
for( j = 0; j < 33; j++ )
x264_cost_ref[lambda][i][j] = i ? lambda * bs_size_te( i, j ) : 0;
x264_pthread_mutex_unlock( &cost_ref_mutex );
if( h->param.analyse.i_me_method >= X264_ME_ESA && !h->cost_mv_fpel[lambda][0] )
{
for( j=0; j<4; j++ )
{
CHECKED_MALLOC( x264_cost_mv_fpel[a->i_lambda][j], (4*2048 + 1) * sizeof(int16_t) );
x264_cost_mv_fpel[a->i_lambda][j] += 2*2048;
CHECKED_MALLOC( h->cost_mv_fpel[lambda][j], (4*2048 + 1) * sizeof(uint16_t) );
h->cost_mv_fpel[lambda][j] += 2*2048;
for( i = -2*2048; i < 2*2048; i++ )
x264_cost_mv_fpel[a->i_lambda][j][i] = p_cost_mv[a->i_lambda][i*4+j];
h->cost_mv_fpel[lambda][j][i] = h->cost_mv[lambda][i*4+j];
}
}
return 0;
......@@ -284,6 +274,27 @@ fail:
return -1;
}
/* Release every MV cost table attached to this encoder handle.
 *
 * The stored pointers address the middle of each allocation so the tables
 * can be indexed with negative motion-vector deltas; the same offsets that
 * were added after allocation (2*4*2048 for the qpel tables, 2*2048 for the
 * fullpel tables) are subtracted again here to recover the base address
 * handed to x264_free().
 *
 * Each of the four fullpel tables per lambda is checked on its own: if
 * allocation stopped partway through the set, only the tables that actually
 * exist are released, and no offset is ever applied to a NULL pointer. */
void x264_analyse_free_costs( x264_t *h )
{
    int i, j;
    for( i = 0; i < 92; i++ )
    {
        if( h->cost_mv[i] )
            x264_free( h->cost_mv[i] - 2*4*2048 );
        for( j = 0; j < 4; j++ )
            if( h->cost_mv_fpel[i][j] )
                x264_free( h->cost_mv_fpel[i][j] - 2*2048 );
    }
}
/* Point the analysis context at the cost tables matching its lambda.
 * Nothing is allocated or computed here: the MV cost table is taken from the
 * encoder handle and the reference-index cost rows from x264_cost_ref, both
 * indexed by a->i_lambda. The ref cost row is picked from the number of
 * active references in each list, minus one, clipped to the range 0..2. */
static void x264_mb_analyse_load_costs( x264_t *h, x264_mb_analysis_t *a )
{
    const int lambda = a->i_lambda;
    const int l0_row = x264_clip3( h->sh.i_num_ref_idx_l0_active-1, 0, 2 );
    const int l1_row = x264_clip3( h->sh.i_num_ref_idx_l1_active-1, 0, 2 );

    a->p_cost_mv   = h->cost_mv[lambda];
    a->p_cost_ref0 = x264_cost_ref[lambda][l0_row];
    a->p_cost_ref1 = x264_cost_ref[lambda][l1_row];
}
static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int i_qp )
{
int i = h->param.analyse.i_subpel_refine - (h->sh.i_type == SLICE_TYPE_B);
......@@ -2317,7 +2328,7 @@ static inline void x264_mb_analyse_qp_rd( x264_t *h, x264_mb_analysis_t *a )
/*****************************************************************************
* x264_macroblock_analyse:
*****************************************************************************/
int x264_macroblock_analyse( x264_t *h )
void x264_macroblock_analyse( x264_t *h )
{
x264_mb_analysis_t analysis;
int i_cost = COST_MAX;
......@@ -2392,13 +2403,12 @@ int x264_macroblock_analyse( x264_t *h )
int i_thresh16x8;
int i_satd_inter, i_satd_intra;
if( x264_mb_analyse_load_costs( h, &analysis ) )
return -1;
x264_mb_analyse_load_costs( h, &analysis );
x264_mb_analyse_inter_p16x16( h, &analysis );
if( h->mb.i_type == P_SKIP )
return 0;
return;
if( flags & X264_ANALYSE_PSUB16x16 )
{
......@@ -2686,8 +2696,7 @@ int x264_macroblock_analyse( x264_t *h )
int i_satd_inter;
h->mb.b_skip_mc = 0;
if( x264_mb_analyse_load_costs( h, &analysis ) )
return -1;
x264_mb_analyse_load_costs( h, &analysis );
/* select best inter mode */
/* direct must be first */
......@@ -2713,7 +2722,7 @@ int x264_macroblock_analyse( x264_t *h )
{
h->mb.i_type = B_SKIP;
x264_analyse_update_cache( h, &analysis );
return 0;
return;
}
}
......@@ -2945,7 +2954,6 @@ int x264_macroblock_analyse( x264_t *h )
x264_psy_trellis_init( h, 0 );
if( h->mb.b_trellis == 1 || h->mb.b_noise_reduction )
h->mb.i_skip_intra = 0;
return 0;
}
/*-------------------- Update MB from the analysis ----------------------*/
......
......@@ -24,9 +24,10 @@
#ifndef X264_ANALYSE_H
#define X264_ANALYSE_H
int x264_macroblock_analyse( x264_t *h );
int x264_analyse_init_costs( x264_t *h, int qp );
void x264_analyse_free_costs( x264_t *h );
void x264_macroblock_analyse( x264_t *h );
void x264_slicetype_decide( x264_t *h );
int x264_lowres_context_alloc( x264_t *h );
void x264_slicetype_analyse( x264_t *h, int keyframe );
......
......@@ -748,7 +748,7 @@ x264_t *x264_encoder_open( x264_param_t *param )
{
x264_t *h;
char buf[1000], *p;
int i, i_slicetype_length;
int i, qp, i_slicetype_length;
CHECKED_MALLOCZERO( h, sizeof(x264_t) );
......@@ -869,6 +869,12 @@ x264_t *x264_encoder_open( x264_param_t *param )
p += sprintf( p, " none!" );
x264_log( h, X264_LOG_INFO, "%s\n", buf );
for( qp = h->param.rc.i_qp_min; qp <= h->param.rc.i_qp_max; qp++ )
if( x264_analyse_init_costs( h, qp ) )
goto fail;
if( x264_analyse_init_costs( h, X264_LOOKAHEAD_QP ) )
goto fail;
h->out.i_nal = 0;
h->out.i_bitstream = X264_MAX( 1000000, h->param.i_width * h->param.i_height * 4
* ( h->param.rc.i_rc_method == X264_RC_ABR ? pow( 0.95, h->param.rc.i_qp_min )
......@@ -900,9 +906,6 @@ x264_t *x264_encoder_open( x264_param_t *param )
if( x264_ratecontrol_new( h ) < 0 )
goto fail;
if( x264_lowres_context_alloc( h ) )
goto fail;
if( h->param.psz_dump_yuv )
{
/* create or truncate the reconstructed video file */
......@@ -1332,12 +1335,7 @@ static int x264_slice_write( x264_t *h )
/* load cache */
x264_macroblock_cache_load( h, i_mb_x, i_mb_y );
/* analyse parameters
* Slice I: choose I_4x4 or I_16x16 mode
* Slice P: choose between using P mode or intra (4x4 or 16x16)
* */
if( x264_macroblock_analyse( h ) )
return -1;
x264_macroblock_analyse( h );
/* encode this macroblock -> be careful it can change the mb type to P_SKIP if needed */
x264_macroblock_encode( h );
......@@ -2230,6 +2228,8 @@ void x264_encoder_close ( x264_t *h )
x264_cqm_delete( h );
x264_analyse_free_costs( h );
if( h->param.i_threads > 1)
h = h->thread[ h->i_thread_phase % h->param.i_threads ];
......
......@@ -195,8 +195,8 @@ void x264_me_search_ref( x264_t *h, x264_me_t *m, int16_t (*mvc)[2], int i_mvc,
#define CHECK_MVRANGE(mx,my) ( mx >= mv_x_min && mx <= mv_x_max && my >= mv_y_min && my <= mv_y_max )
const int16_t *p_cost_mvx = m->p_cost_mv - m->mvp[0];
const int16_t *p_cost_mvy = m->p_cost_mv - m->mvp[1];
const uint16_t *p_cost_mvx = m->p_cost_mv - m->mvp[0];
const uint16_t *p_cost_mvy = m->p_cost_mv - m->mvp[1];
bmx = x264_clip3( m->mvp[0], mv_x_min*4, mv_x_max*4 );
bmy = x264_clip3( m->mvp[1], mv_y_min*4, mv_y_max*4 );
......@@ -452,8 +452,8 @@ me_hex2:
/* hexagon grid */
omx = bmx; omy = bmy;
const int16_t *p_cost_omvx = p_cost_mvx + omx*4;
const int16_t *p_cost_omvy = p_cost_mvy + omy*4;
const uint16_t *p_cost_omvx = p_cost_mvx + omx*4;
const uint16_t *p_cost_omvy = p_cost_mvy + omy*4;
i = 1;
do
{
......@@ -569,7 +569,7 @@ me_hex2:
int delta = x264_pixel_size[sad_size].w;
int16_t *xs = h->scratch_buffer;
int xn;
uint16_t *cost_fpel_mvx = x264_cost_mv_fpel[x264_lambda_tab[h->mb.i_qp]][-m->mvp[0]&3] + (-m->mvp[0]>>2);
uint16_t *cost_fpel_mvx = h->cost_mv_fpel[x264_lambda_tab[h->mb.i_qp]][-m->mvp[0]&3] + (-m->mvp[0]>>2);
h->pixf.sad_x4[sad_size]( zero, p_fenc, p_fenc+delta,
p_fenc+delta*FENC_STRIDE, p_fenc+delta+delta*FENC_STRIDE,
......@@ -768,8 +768,8 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite
{
const int bw = x264_pixel_size[m->i_pixel].w;
const int bh = x264_pixel_size[m->i_pixel].h;
const int16_t *p_cost_mvx = m->p_cost_mv - m->mvp[0];
const int16_t *p_cost_mvy = m->p_cost_mv - m->mvp[1];
const uint16_t *p_cost_mvx = m->p_cost_mv - m->mvp[0];
const uint16_t *p_cost_mvy = m->p_cost_mv - m->mvp[1];
const int i_pixel = m->i_pixel;
const int b_chroma_me = h->mb.b_chroma_me && i_pixel <= PIXEL_8x8;
......@@ -942,10 +942,10 @@ static void ALWAYS_INLINE x264_me_refine_bidir( x264_t *h, x264_me_t *m0, x264_m
const int i_pixel = m0->i_pixel;
const int bw = x264_pixel_size[i_pixel].w;
const int bh = x264_pixel_size[i_pixel].h;
const int16_t *p_cost_m0x = m0->p_cost_mv - m0->mvp[0];
const int16_t *p_cost_m0y = m0->p_cost_mv - m0->mvp[1];
const int16_t *p_cost_m1x = m1->p_cost_mv - m1->mvp[0];
const int16_t *p_cost_m1y = m1->p_cost_mv - m1->mvp[1];
const uint16_t *p_cost_m0x = m0->p_cost_mv - m0->mvp[0];
const uint16_t *p_cost_m0y = m0->p_cost_mv - m0->mvp[1];
const uint16_t *p_cost_m1x = m1->p_cost_mv - m1->mvp[0];
const uint16_t *p_cost_m1y = m1->p_cost_mv - m1->mvp[1];
ALIGNED_ARRAY_16( uint8_t, pixy_buf,[2],[9][16*16] );
ALIGNED_8( uint8_t pixu_buf[2][9][8*8] );
ALIGNED_8( uint8_t pixv_buf[2][9][8*8] );
......@@ -1073,7 +1073,7 @@ void x264_me_refine_qpel_rd( x264_t *h, x264_me_t *m, int i_lambda2, int i4, int
static const int pixel_mv_offs[] = { 0, 4, 4*8, 0, 2, 2*8, 0 };
int16_t *cache_mv = h->mb.cache.mv[i_list][x264_scan8[i4]];
int16_t *cache_mv2 = cache_mv + pixel_mv_offs[m->i_pixel];
const int16_t *p_cost_mvx, *p_cost_mvy;
const uint16_t *p_cost_mvx, *p_cost_mvy;
const int bw = x264_pixel_size[m->i_pixel].w>>2;
const int bh = x264_pixel_size[m->i_pixel].h>>2;
const int i_pixel = m->i_pixel;
......
......@@ -31,7 +31,7 @@ typedef struct
{
/* input */
int i_pixel; /* PIXEL_WxH */
int16_t *p_cost_mv; /* lambda * nbits for each possible mv */
uint16_t *p_cost_mv; /* lambda * nbits for each possible mv */
int i_ref_cost;
int i_ref;
......
......@@ -953,6 +953,8 @@ void x264_ratecontrol_start( x264_t *h, int i_force_qp )
}
}
q = x264_clip3f( q, h->param.rc.i_qp_min, h->param.rc.i_qp_max );
rc->qpa_rc =
rc->qpa_aq = 0;
h->fdec->f_qp_avg_rc =
......
......@@ -29,22 +29,14 @@
#include "me.h"
static int x264_lowres_context_init( x264_t *h, x264_mb_analysis_t *a )
static void x264_lowres_context_init( x264_t *h, x264_mb_analysis_t *a )
{
a->i_qp = 12; // arbitrary, but low because SATD scores are 1/4 normal
a->i_qp = X264_LOOKAHEAD_QP;
a->i_lambda = x264_lambda_tab[ a->i_qp ];
if( x264_mb_analyse_load_costs( h, a ) )
return -1;
x264_mb_analyse_load_costs( h, a );
h->mb.i_me_method = X264_MIN( X264_ME_HEX, h->param.analyse.i_me_method ); // maybe dia?
h->mb.i_subpel_refine = 4; // 3 should be enough, but not tweaking for speed now
h->mb.b_chroma_me = 0;
return 0;
}
int x264_lowres_context_alloc( x264_t *h )
{
x264_mb_analysis_t a;
return x264_lowres_context_init( h, &a );
}
static int x264_slicetype_mb_cost( x264_t *h, x264_mb_analysis_t *a,
......
......@@ -262,9 +262,10 @@ static void Help( x264_param_t *defaults, int longhelp )
" where <option> is either\n"
" q=<integer> (force QP)\n"
" or b=<float> (bitrate multiplier)\n" );
H1( " --qpfile <string> Force frametypes and QPs for some or all frames\n"
H2( " --qpfile <string> Force frametypes and QPs for some or all frames\n"
" Format of each line: framenumber frametype QP\n"
" QP of -1 lets x264 choose. Frametypes: I,i,P,B,b.\n" );
" QP of -1 lets x264 choose. Frametypes: I,i,P,B,b.\n"
" QPs are restricted by qpmin/qpmax.\n" );
H1( "\n" );
H1( "Analysis:\n" );
H1( "\n" );
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment