Commit fd1cf294 authored by Fiona Glaser

Faster weightp motion search

For blind-weight dupes, copy the motion vector from the main search and qpel-refine instead of doing a full search.
Fix the p8x8 early termination, which had unexpected results when combined with blind weighting.
Overall, marginally reduces compression but should potentially improve speed by over 5%.
parent bc0ae2ef
...@@ -661,6 +661,7 @@ struct x264_t ...@@ -661,6 +661,7 @@ struct x264_t
/* maps fref1[0]'s ref indices into the current list0 */ /* maps fref1[0]'s ref indices into the current list0 */
#define map_col_to_list0(col) h->mb.map_col_to_list0[col+2] #define map_col_to_list0(col) h->mb.map_col_to_list0[col+2]
int8_t map_col_to_list0[18]; int8_t map_col_to_list0[18];
int ref_blind_dupe; /* The index of the blind reference frame duplicate. */
} mb; } mb;
/* rate control encoding only */ /* rate control encoding only */
......
...@@ -1263,7 +1263,14 @@ static void x264_mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a ) ...@@ -1263,7 +1263,14 @@ static void x264_mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a )
x264_mb_predict_mv_16x16( h, 0, i_ref, m.mvp ); x264_mb_predict_mv_16x16( h, 0, i_ref, m.mvp );
x264_mb_predict_mv_ref16x16( h, 0, i_ref, mvc, &i_mvc ); x264_mb_predict_mv_ref16x16( h, 0, i_ref, mvc, &i_mvc );
x264_me_search_ref( h, &m, mvc, i_mvc, p_halfpel_thresh );
if( h->mb.ref_blind_dupe == i_ref )
{
CP32( m.mv, a->l0.mvc[0][0] );
x264_me_refine_qpel_refdupe( h, &m, p_halfpel_thresh );
}
else
x264_me_search_ref( h, &m, mvc, i_mvc, p_halfpel_thresh );
/* early termination /* early termination
* SSD threshold would probably be better than SATD */ * SSD threshold would probably be better than SATD */
...@@ -1321,18 +1328,25 @@ static void x264_mb_analyse_inter_p8x8_mixed_ref( x264_t *h, x264_mb_analysis_t ...@@ -1321,18 +1328,25 @@ static void x264_mb_analyse_inter_p8x8_mixed_ref( x264_t *h, x264_mb_analysis_t
h->mb.i_partition = D_8x8; h->mb.i_partition = D_8x8;
/* Raise i_maxref to the reference index stored in
 * h->mb.cache.ref[0][X264_SCAN8_0 + (i)], unless that index is the blind
 * duplicate reference (h->mb.ref_blind_dupe), which is ignored.
 * Expects `h` and an assignable `i_maxref` in scope at the expansion site.
 * The argument is parenthesised so any offset expression can be passed, and
 * the body is do/while(0) so the macro behaves as one statement, even
 * directly before an `else`. Call sites must end with a semicolon. */
#define CHECK_NEIGHBOUR(i)\
do {\
    int ref = h->mb.cache.ref[0][X264_SCAN8_0+(i)];\
    if( ref > i_maxref && ref != h->mb.ref_blind_dupe )\
        i_maxref = ref;\
} while( 0 )
/* early termination: if 16x16 chose ref 0, then evaluate no refs older /* early termination: if 16x16 chose ref 0, then evaluate no refs older
* than those used by the neighbors */ * than those used by the neighbors */
if( i_maxref > 0 && a->l0.me16x16.i_ref == 0 && if( i_maxref > 0 && (a->l0.me16x16.i_ref == 0 || a->l0.me16x16.i_ref == h->mb.ref_blind_dupe) &&
h->mb.i_mb_type_top && h->mb.i_mb_type_left ) h->mb.i_mb_type_top && h->mb.i_mb_type_left )
{ {
i_maxref = 0; i_maxref = 0;
i_maxref = X264_MAX( i_maxref, h->mb.cache.ref[0][ X264_SCAN8_0 - 8 - 1 ] ); CHECK_NEIGHBOUR( -8 - 1 );
i_maxref = X264_MAX( i_maxref, h->mb.cache.ref[0][ X264_SCAN8_0 - 8 + 0 ] ); CHECK_NEIGHBOUR( -8 + 0 );
i_maxref = X264_MAX( i_maxref, h->mb.cache.ref[0][ X264_SCAN8_0 - 8 + 2 ] ); CHECK_NEIGHBOUR( -8 + 2 );
i_maxref = X264_MAX( i_maxref, h->mb.cache.ref[0][ X264_SCAN8_0 - 8 + 4 ] ); CHECK_NEIGHBOUR( -8 + 4 );
i_maxref = X264_MAX( i_maxref, h->mb.cache.ref[0][ X264_SCAN8_0 + 0 - 1 ] ); CHECK_NEIGHBOUR( 0 - 1 );
i_maxref = X264_MAX( i_maxref, h->mb.cache.ref[0][ X264_SCAN8_0 + 2*8 - 1 ] ); CHECK_NEIGHBOUR( 2*8 - 1 );
} }
for( i_ref = 0; i_ref <= i_maxref; i_ref++ ) for( i_ref = 0; i_ref <= i_maxref; i_ref++ )
...@@ -1348,7 +1362,7 @@ static void x264_mb_analyse_inter_p8x8_mixed_ref( x264_t *h, x264_mb_analysis_t ...@@ -1348,7 +1362,7 @@ static void x264_mb_analyse_inter_p8x8_mixed_ref( x264_t *h, x264_mb_analysis_t
LOAD_FENC( &m, p_fenc, 8*x8, 8*y8 ); LOAD_FENC( &m, p_fenc, 8*x8, 8*y8 );
l0m->cost = INT_MAX; l0m->cost = INT_MAX;
for( i_ref = 0; i_ref <= i_maxref; i_ref++ ) for( i_ref = 0; i_ref <= i_maxref || i_ref == h->mb.ref_blind_dupe; )
{ {
const int i_ref_cost = REF_COST( 0, i_ref ); const int i_ref_cost = REF_COST( 0, i_ref );
i_halfpel_thresh -= i_ref_cost; i_halfpel_thresh -= i_ref_cost;
...@@ -1359,7 +1373,13 @@ static void x264_mb_analyse_inter_p8x8_mixed_ref( x264_t *h, x264_mb_analysis_t ...@@ -1359,7 +1373,13 @@ static void x264_mb_analyse_inter_p8x8_mixed_ref( x264_t *h, x264_mb_analysis_t
x264_macroblock_cache_ref( h, 2*x8, 2*y8, 2, 2, 0, i_ref ); x264_macroblock_cache_ref( h, 2*x8, 2*y8, 2, 2, 0, i_ref );
x264_mb_predict_mv( h, 0, 4*i, 2, m.mvp ); x264_mb_predict_mv( h, 0, 4*i, 2, m.mvp );
x264_me_search_ref( h, &m, a->l0.mvc[i_ref], i+1, p_halfpel_thresh ); if( h->mb.ref_blind_dupe == i_ref )
{
CP32( m.mv, a->l0.mvc[0][i+1] );
x264_me_refine_qpel_refdupe( h, &m, p_halfpel_thresh );
}
else
x264_me_search_ref( h, &m, a->l0.mvc[i_ref], i+1, p_halfpel_thresh );
m.cost += i_ref_cost; m.cost += i_ref_cost;
i_halfpel_thresh += i_ref_cost; i_halfpel_thresh += i_ref_cost;
...@@ -1367,6 +1387,10 @@ static void x264_mb_analyse_inter_p8x8_mixed_ref( x264_t *h, x264_mb_analysis_t ...@@ -1367,6 +1387,10 @@ static void x264_mb_analyse_inter_p8x8_mixed_ref( x264_t *h, x264_mb_analysis_t
if( m.cost < l0m->cost ) if( m.cost < l0m->cost )
h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) ); h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
if( i_ref == i_maxref && i_maxref < h->mb.ref_blind_dupe )
i_ref = h->mb.ref_blind_dupe;
else
i_ref++;
} }
x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, 0, l0m->mv ); x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, 0, l0m->mv );
x264_macroblock_cache_ref( h, 2*x8, 2*y8, 2, 2, 0, l0m->i_ref ); x264_macroblock_cache_ref( h, 2*x8, 2*y8, 2, 2, 0, l0m->i_ref );
...@@ -1389,7 +1413,10 @@ static void x264_mb_analyse_inter_p8x8_mixed_ref( x264_t *h, x264_mb_analysis_t ...@@ -1389,7 +1413,10 @@ static void x264_mb_analyse_inter_p8x8_mixed_ref( x264_t *h, x264_mb_analysis_t
static void x264_mb_analyse_inter_p8x8( x264_t *h, x264_mb_analysis_t *a ) static void x264_mb_analyse_inter_p8x8( x264_t *h, x264_mb_analysis_t *a )
{ {
const int i_ref = a->l0.me16x16.i_ref; /* Duplicate refs are rarely useful in p8x8 due to the high cost of the
* reference frame flags. Thus, if we're not doing mixedrefs, just
* don't bother analysing the dupes. */
const int i_ref = h->mb.ref_blind_dupe == a->l0.me16x16.i_ref ? 0 : a->l0.me16x16.i_ref;
const int i_ref_cost = h->param.b_cabac || i_ref ? REF_COST( 0, i_ref ) : 0; const int i_ref_cost = h->param.b_cabac || i_ref ? REF_COST( 0, i_ref ) : 0;
uint8_t **p_fenc = h->mb.pic.p_fenc; uint8_t **p_fenc = h->mb.pic.p_fenc;
int i_mvc; int i_mvc;
...@@ -1452,7 +1479,9 @@ static void x264_mb_analyse_inter_p16x8( x264_t *h, x264_mb_analysis_t *a ) ...@@ -1452,7 +1479,9 @@ static void x264_mb_analyse_inter_p16x8( x264_t *h, x264_mb_analysis_t *a )
for( i = 0; i < 2; i++ ) for( i = 0; i < 2; i++ )
{ {
x264_me_t *l0m = &a->l0.me16x8[i]; x264_me_t *l0m = &a->l0.me16x8[i];
const int ref8[2] = { a->l0.me8x8[2*i].i_ref, a->l0.me8x8[2*i+1].i_ref }; const int minref = X264_MIN( a->l0.me8x8[2*i].i_ref, a->l0.me8x8[2*i+1].i_ref );
const int maxref = X264_MAX( a->l0.me8x8[2*i].i_ref, a->l0.me8x8[2*i+1].i_ref );
const int ref8[2] = { minref, maxref };
const int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2; const int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;
m.i_pixel = PIXEL_16x8; m.i_pixel = PIXEL_16x8;
...@@ -1475,7 +1504,14 @@ static void x264_mb_analyse_inter_p16x8( x264_t *h, x264_mb_analysis_t *a ) ...@@ -1475,7 +1504,14 @@ static void x264_mb_analyse_inter_p16x8( x264_t *h, x264_mb_analysis_t *a )
x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, 0, i_ref ); x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, 0, i_ref );
x264_mb_predict_mv( h, 0, 8*i, 4, m.mvp ); x264_mb_predict_mv( h, 0, 8*i, 4, m.mvp );
x264_me_search( h, &m, mvc, 3 ); /* We can only take this shortcut if the first search was performed on ref0. */
if( h->mb.ref_blind_dupe == i_ref && !ref8[0] )
{
/* We can just leave the MV from the previous ref search. */
x264_me_refine_qpel_refdupe( h, &m, NULL );
}
else
x264_me_search( h, &m, mvc, 3 );
m.cost += i_ref_cost; m.cost += i_ref_cost;
...@@ -1502,7 +1538,9 @@ static void x264_mb_analyse_inter_p8x16( x264_t *h, x264_mb_analysis_t *a ) ...@@ -1502,7 +1538,9 @@ static void x264_mb_analyse_inter_p8x16( x264_t *h, x264_mb_analysis_t *a )
for( i = 0; i < 2; i++ ) for( i = 0; i < 2; i++ )
{ {
x264_me_t *l0m = &a->l0.me8x16[i]; x264_me_t *l0m = &a->l0.me8x16[i];
const int ref8[2] = { a->l0.me8x8[i].i_ref, a->l0.me8x8[i+2].i_ref }; const int minref = X264_MIN( a->l0.me8x8[i].i_ref, a->l0.me8x8[i+2].i_ref );
const int maxref = X264_MAX( a->l0.me8x8[i].i_ref, a->l0.me8x8[i+2].i_ref );
const int ref8[2] = { minref, maxref };
const int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2; const int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;
m.i_pixel = PIXEL_8x16; m.i_pixel = PIXEL_8x16;
...@@ -1524,7 +1562,14 @@ static void x264_mb_analyse_inter_p8x16( x264_t *h, x264_mb_analysis_t *a ) ...@@ -1524,7 +1562,14 @@ static void x264_mb_analyse_inter_p8x16( x264_t *h, x264_mb_analysis_t *a )
x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, 0, i_ref ); x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, 0, i_ref );
x264_mb_predict_mv( h, 0, 4*i, 2, m.mvp ); x264_mb_predict_mv( h, 0, 4*i, 2, m.mvp );
x264_me_search( h, &m, mvc, 3 ); /* We can only take this shortcut if the first search was performed on ref0. */
if( h->mb.ref_blind_dupe == i_ref && !ref8[0] )
{
/* We can just leave the MV from the previous ref search. */
x264_me_refine_qpel_refdupe( h, &m, NULL );
}
else
x264_me_search( h, &m, mvc, 3 );
m.cost += i_ref_cost; m.cost += i_ref_cost;
......
...@@ -1335,8 +1335,7 @@ static void x264_weighted_pred_init( x264_t *h ) ...@@ -1335,8 +1335,7 @@ static void x264_weighted_pred_init( x264_t *h )
static inline void x264_reference_build_list( x264_t *h, int i_poc ) static inline void x264_reference_build_list( x264_t *h, int i_poc )
{ {
int i; int i, b_ok;
int b_ok;
/* build ref list 0/1 */ /* build ref list 0/1 */
h->mb.pic.i_fref[0] = h->i_ref0 = 0; h->mb.pic.i_fref[0] = h->i_ref0 = 0;
...@@ -1403,6 +1402,7 @@ static inline void x264_reference_build_list( x264_t *h, int i_poc ) ...@@ -1403,6 +1402,7 @@ static inline void x264_reference_build_list( x264_t *h, int i_poc )
/* add duplicates */ /* add duplicates */
if( h->fenc->i_type == X264_TYPE_P ) if( h->fenc->i_type == X264_TYPE_P )
{ {
int idx = -1;
if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART ) if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART )
{ {
x264_weight_t w[3]; x264_weight_t w[3];
...@@ -1414,7 +1414,7 @@ static inline void x264_reference_build_list( x264_t *h, int i_poc ) ...@@ -1414,7 +1414,7 @@ static inline void x264_reference_build_list( x264_t *h, int i_poc )
{ {
h->fenc->weight[0][0].i_denom = 0; h->fenc->weight[0][0].i_denom = 0;
SET_WEIGHT( w[0], 1, 1, 0, -1 ); SET_WEIGHT( w[0], 1, 1, 0, -1 );
x264_weighted_reference_duplicate( h, 0, w ); idx = x264_weighted_reference_duplicate( h, 0, w );
} }
else else
{ {
...@@ -1428,7 +1428,7 @@ static inline void x264_reference_build_list( x264_t *h, int i_poc ) ...@@ -1428,7 +1428,7 @@ static inline void x264_reference_build_list( x264_t *h, int i_poc )
w[0] = h->fenc->weight[0][0]; w[0] = h->fenc->weight[0][0];
w[0].i_offset--; w[0].i_offset--;
h->mc.weight_cache( h, &w[0] ); h->mc.weight_cache( h, &w[0] );
x264_weighted_reference_duplicate( h, 0, w ); idx = x264_weighted_reference_duplicate( h, 0, w );
} }
} }
} }
...@@ -1439,8 +1439,9 @@ static inline void x264_reference_build_list( x264_t *h, int i_poc ) ...@@ -1439,8 +1439,9 @@ static inline void x264_reference_build_list( x264_t *h, int i_poc )
SET_WEIGHT( w[0], 1, 1, 0, -1 ); SET_WEIGHT( w[0], 1, 1, 0, -1 );
h->fenc->weight[0][0].i_denom = 0; h->fenc->weight[0][0].i_denom = 0;
w[1].weightfn = w[2].weightfn = NULL; w[1].weightfn = w[2].weightfn = NULL;
x264_weighted_reference_duplicate( h, 0, w ); idx = x264_weighted_reference_duplicate( h, 0, w );
} }
h->mb.ref_blind_dupe = idx;
} }
assert( h->i_ref0 + h->i_ref1 <= 16 ); assert( h->i_ref0 + h->i_ref1 <= 16 );
......
...@@ -729,6 +729,11 @@ void x264_me_refine_qpel( x264_t *h, x264_me_t *m ) ...@@ -729,6 +729,11 @@ void x264_me_refine_qpel( x264_t *h, x264_me_t *m )
refine_subpel( h, m, hpel, qpel, NULL, 1 ); refine_subpel( h, m, hpel, qpel, NULL, 1 );
} }
/* Sub-pel refinement used for a duplicated reference frame.
 * Refines the motion vector already stored in m (callers seed m->mv from the
 * search on the original reference before calling). Requests zero half-pel
 * iterations and at most two quarter-pel iterations, the latter taken from
 * subpel_iterations[h->mb.i_subpel_refine][3].
 * p_halfpel_thresh is forwarded unchanged to refine_subpel and may be NULL. */
void x264_me_refine_qpel_refdupe( x264_t *h, x264_me_t *m, int *p_halfpel_thresh )
{
    int qpel_iters = subpel_iterations[h->mb.i_subpel_refine][3];

    /* Cap the quarter-pel pass count at two. */
    if( qpel_iters > 2 )
        qpel_iters = 2;

    refine_subpel( h, m, 0, qpel_iters, p_halfpel_thresh, 0 );
}
#define COST_MV_SAD( mx, my ) \ #define COST_MV_SAD( mx, my ) \
{ \ { \
int stride = 16; \ int stride = 16; \
......
...@@ -62,6 +62,7 @@ static inline void x264_me_search( x264_t *h, x264_me_t *m, int16_t (*mvc)[2], i ...@@ -62,6 +62,7 @@ static inline void x264_me_search( x264_t *h, x264_me_t *m, int16_t (*mvc)[2], i
{ x264_me_search_ref( h, m, mvc, i_mvc, NULL ); } { x264_me_search_ref( h, m, mvc, i_mvc, NULL ); }
void x264_me_refine_qpel( x264_t *h, x264_me_t *m ); void x264_me_refine_qpel( x264_t *h, x264_me_t *m );
void x264_me_refine_qpel_refdupe( x264_t *h, x264_me_t *m, int *p_halfpel_thresh );
void x264_me_refine_qpel_rd( x264_t *h, x264_me_t *m, int i_lambda2, int i4, int i_list ); void x264_me_refine_qpel_rd( x264_t *h, x264_me_t *m, int i_lambda2, int i4, int i_list );
void x264_me_refine_bidir_rd( x264_t *h, x264_me_t *m0, x264_me_t *m1, int i_weight, int i8, int i_lambda2 ); void x264_me_refine_bidir_rd( x264_t *h, x264_me_t *m0, x264_me_t *m1, int i_weight, int i8, int i_lambda2 );
void x264_me_refine_bidir_satd( x264_t *h, x264_me_t *m0, x264_me_t *m1, int i_weight ); void x264_me_refine_bidir_satd( x264_t *h, x264_me_t *m0, x264_me_t *m1, int i_weight );
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment