Commit ccac8546 authored by Dylan Yudaken, committed by Fiona Glaser

Weighted P-frame prediction

Merge Dylan's Google Summer of Code 2009 tree.
Detect fades and use weighted prediction to improve compression and quality.
"Blind" mode provides a small overall quality increase by using a -1 offset without doing any analysis, as described in JVT-AB033.
"Smart", the default mode, also performs fade detection and decides weights accordingly.
MB-tree takes into account the effects of "smart" analysis in lookahead, even further improving quality in fades.
If psy is on, mbtree is on, interlaced is off, and weightp is off, fade detection will still be performed.
However, it will be used to adjust quality instead of creating actual weights.
This will improve quality in fades when encoding in Baseline profile.

Doesn't add support for interlaced encoding with weightp yet.
Only adds support for luma weights, not chroma weights.
Internal code for chroma weights is in, but there's no analysis yet.
Baseline profile requires that weightp be off.
All weightp modes may cause minor breakage in non-compliant decoders that take shortcuts in deblocking reference frame checks.
"Smart" may cause serious breakage in non-compliant decoders that take shortcuts in handling of duplicate reference frames.

Thanks to Google for sponsoring our most successful Summer of Code yet!
parent b0673412
...@@ -136,6 +136,7 @@ void x264_param_default( x264_param_t *param ) ...@@ -136,6 +136,7 @@ void x264_param_default( x264_param_t *param )
param->analyse.i_chroma_qp_offset = 0; param->analyse.i_chroma_qp_offset = 0;
param->analyse.b_fast_pskip = 1; param->analyse.b_fast_pskip = 1;
param->analyse.b_weighted_bipred = 1; param->analyse.b_weighted_bipred = 1;
param->analyse.i_weighted_pred = X264_WEIGHTP_SMART;
param->analyse.b_dct_decimate = 1; param->analyse.b_dct_decimate = 1;
param->analyse.b_transform_8x8 = 1; param->analyse.b_transform_8x8 = 1;
param->analyse.i_trellis = 1; param->analyse.i_trellis = 1;
...@@ -489,6 +490,8 @@ int x264_param_parse( x264_param_t *p, const char *name, const char *value ) ...@@ -489,6 +490,8 @@ int x264_param_parse( x264_param_t *p, const char *name, const char *value )
p->analyse.b_transform_8x8 = atobool(value); p->analyse.b_transform_8x8 = atobool(value);
OPT2("weightb", "weight-b") OPT2("weightb", "weight-b")
p->analyse.b_weighted_bipred = atobool(value); p->analyse.b_weighted_bipred = atobool(value);
OPT("weightp")
p->analyse.i_weighted_pred = atoi(value);
OPT2("direct", "direct-pred") OPT2("direct", "direct-pred")
b_error |= parse_enum( value, x264_direct_pred_names, &p->analyse.i_direct_mv_pred ); b_error |= parse_enum( value, x264_direct_pred_names, &p->analyse.i_direct_mv_pred );
OPT("chroma-qp-offset") OPT("chroma-qp-offset")
...@@ -903,6 +906,7 @@ char *x264_param2string( x264_param_t *p, int b_res ) ...@@ -903,6 +906,7 @@ char *x264_param2string( x264_param_t *p, int b_res )
p->i_bframe_pyramid, p->i_bframe_adaptive, p->i_bframe_bias, p->i_bframe_pyramid, p->i_bframe_adaptive, p->i_bframe_bias,
p->analyse.i_direct_mv_pred, p->analyse.b_weighted_bipred ); p->analyse.i_direct_mv_pred, p->analyse.b_weighted_bipred );
} }
s += sprintf( s, " wpredp=%d", p->analyse.i_weighted_pred > 0 ? p->analyse.i_weighted_pred : 0 );
s += sprintf( s, " keyint=%d keyint_min=%d scenecut=%d", s += sprintf( s, " keyint=%d keyint_min=%d scenecut=%d",
p->i_keyint_max, p->i_keyint_min, p->i_scenecut_threshold ); p->i_keyint_max, p->i_keyint_min, p->i_scenecut_threshold );
......
...@@ -60,6 +60,13 @@ do {\ ...@@ -60,6 +60,13 @@ do {\
// 16 for the macroblock in progress + 3 for deblocking + 3 for motion compensation filter + 2 for extra safety // 16 for the macroblock in progress + 3 for deblocking + 3 for motion compensation filter + 2 for extra safety
#define X264_THREAD_HEIGHT 24 #define X264_THREAD_HEIGHT 24
/* WEIGHTP_FAKE is set when mb_tree & psy are enabled, but normal weightp is disabled
* (such as in baseline). It checks for fades in lookahead and adjusts qp accordingly
* to increase quality. Defined as (-1) so that if(i_weighted_pred > 0) is true only when
* real weights are being used. */
#define X264_WEIGHTP_FAKE (-1)
/**************************************************************************** /****************************************************************************
* Includes * Includes
****************************************************************************/ ****************************************************************************/
...@@ -233,6 +240,9 @@ typedef struct ...@@ -233,6 +240,9 @@ typedef struct
int arg; int arg;
} ref_pic_list_order[2][16]; } ref_pic_list_order[2][16];
/* P-frame weighting */
x264_weight_t weight[16][3];
int i_mmco_remove_from_end; int i_mmco_remove_from_end;
int i_mmco_command_count; int i_mmco_command_count;
struct /* struct for future expansion */ struct /* struct for future expansion */
...@@ -390,6 +400,9 @@ struct x264_t ...@@ -390,6 +400,9 @@ struct x264_t
/* Unused frames: 0 = fenc, 1 = fdec */ /* Unused frames: 0 = fenc, 1 = fdec */
x264_frame_t **unused[2]; x264_frame_t **unused[2];
/* Unused blank frames (for duplicates) */
x264_frame_t **blank_unused;
/* frames used for reference + sentinels */ /* frames used for reference + sentinels */
x264_frame_t *reference[16+2]; x264_frame_t *reference[16+2];
...@@ -502,6 +515,9 @@ struct x264_t ...@@ -502,6 +515,9 @@ struct x264_t
uint8_t *intra_border_backup[2][3]; /* bottom pixels of the previous mb row, used for intra prediction after the framebuffer has been deblocked */ uint8_t *intra_border_backup[2][3]; /* bottom pixels of the previous mb row, used for intra prediction after the framebuffer has been deblocked */
uint8_t (*nnz_backup)[16]; /* when using cavlc + 8x8dct, the deblocker uses a modified nnz */ uint8_t (*nnz_backup)[16]; /* when using cavlc + 8x8dct, the deblocker uses a modified nnz */
/* buffer for weighted versions of the reference frames */
uint8_t *p_weight_buf[16];
/* current value */ /* current value */
int i_type; int i_type;
int i_partition; int i_partition;
...@@ -564,6 +580,7 @@ struct x264_t ...@@ -564,6 +580,7 @@ struct x264_t
/* pointer over mb of the references */ /* pointer over mb of the references */
int i_fref[2]; int i_fref[2];
uint8_t *p_fref[2][32][4+2]; /* last: lN, lH, lV, lHV, cU, cV */ uint8_t *p_fref[2][32][4+2]; /* last: lN, lH, lV, lHV, cU, cV */
uint8_t *p_fref_w[32]; /* weighted fullpel luma */
uint16_t *p_integral[2][16]; uint16_t *p_integral[2][16];
/* fref stride */ /* fref stride */
...@@ -681,6 +698,8 @@ struct x264_t ...@@ -681,6 +698,8 @@ struct x264_t
/* */ /* */
int i_direct_score[2]; int i_direct_score[2];
int i_direct_frames[2]; int i_direct_frames[2];
/* num p-frames weighted */
int i_wpred[3];
} stat; } stat;
......
...@@ -73,6 +73,7 @@ x264_frame_t *x264_frame_new( x264_t *h, int b_fdec ) ...@@ -73,6 +73,7 @@ x264_frame_t *x264_frame_new( x264_t *h, int b_fdec )
frame->i_frame_num = -1; frame->i_frame_num = -1;
frame->i_lines_completed = -1; frame->i_lines_completed = -1;
frame->b_fdec = b_fdec; frame->b_fdec = b_fdec;
frame->orig = frame;
/* all 4 luma planes allocated together, since the cacheline split code /* all 4 luma planes allocated together, since the cacheline split code
* requires them to be in-phase wrt cacheline alignment. */ * requires them to be in-phase wrt cacheline alignment. */
...@@ -86,9 +87,11 @@ x264_frame_t *x264_frame_new( x264_t *h, int b_fdec ) ...@@ -86,9 +87,11 @@ x264_frame_t *x264_frame_new( x264_t *h, int b_fdec )
else else
{ {
CHECKED_MALLOC( frame->buffer[0], luma_plane_size); CHECKED_MALLOC( frame->buffer[0], luma_plane_size);
frame->plane[0] = frame->buffer[0] + frame->i_stride[0] * i_padv + PADH; frame->filtered[0] = frame->plane[0] = frame->buffer[0] + frame->i_stride[0] * i_padv + PADH;
} }
frame->b_duplicate = 0;
if( b_fdec ) /* fdec frame */ if( b_fdec ) /* fdec frame */
{ {
CHECKED_MALLOC( frame->mb_type, i_mb_count * sizeof(int8_t)); CHECKED_MALLOC( frame->mb_type, i_mb_count * sizeof(int8_t));
...@@ -168,38 +171,43 @@ fail: ...@@ -168,38 +171,43 @@ fail:
void x264_frame_delete( x264_frame_t *frame ) void x264_frame_delete( x264_frame_t *frame )
{ {
int i, j; int i, j;
for( i = 0; i < 4; i++ ) /* Duplicate frames are blank copies of real frames (including pointers),
x264_free( frame->buffer[i] ); * so freeing those pointers would cause a double free later. */
for( i = 0; i < 4; i++ ) if( !frame->b_duplicate )
x264_free( frame->buffer_lowres[i] ); {
for( i = 0; i < X264_BFRAME_MAX+2; i++ ) for( i = 0; i < 4; i++ )
for( j = 0; j < X264_BFRAME_MAX+2; j++ ) x264_free( frame->buffer[i] );
x264_free( frame->i_row_satds[i][j] ); for( i = 0; i < 4; i++ )
for( j = 0; j < 2; j++ ) x264_free( frame->buffer_lowres[i] );
for( i = 0; i <= X264_BFRAME_MAX; i++ ) for( i = 0; i < X264_BFRAME_MAX+2; i++ )
{ for( j = 0; j < X264_BFRAME_MAX+2; j++ )
x264_free( frame->lowres_mvs[j][i] ); x264_free( frame->i_row_satds[i][j] );
x264_free( frame->lowres_mv_costs[j][i] ); for( j = 0; j < 2; j++ )
} for( i = 0; i <= X264_BFRAME_MAX; i++ )
x264_free( frame->i_propagate_cost ); {
for( j = 0; j <= X264_BFRAME_MAX+1; j++ ) x264_free( frame->lowres_mvs[j][i] );
for( i = 0; i <= X264_BFRAME_MAX+1; i++ ) x264_free( frame->lowres_mv_costs[j][i] );
{ }
x264_free( frame->lowres_costs[j][i] ); x264_free( frame->i_propagate_cost );
x264_free( frame->lowres_inter_types[j][i] ); for( j = 0; j <= X264_BFRAME_MAX+1; j++ )
} for( i = 0; i <= X264_BFRAME_MAX+1; i++ )
x264_free( frame->f_qp_offset ); {
x264_free( frame->f_qp_offset_aq ); x264_free( frame->lowres_costs[j][i] );
x264_free( frame->i_inv_qscale_factor ); x264_free( frame->lowres_inter_types[j][i] );
x264_free( frame->i_row_bits ); }
x264_free( frame->i_row_qp ); x264_free( frame->f_qp_offset );
x264_free( frame->mb_type ); x264_free( frame->f_qp_offset_aq );
x264_free( frame->mv[0] ); x264_free( frame->i_inv_qscale_factor );
x264_free( frame->mv[1] ); x264_free( frame->i_row_bits );
x264_free( frame->ref[0] ); x264_free( frame->i_row_qp );
x264_free( frame->ref[1] ); x264_free( frame->mb_type );
x264_pthread_mutex_destroy( &frame->mutex ); x264_free( frame->mv[0] );
x264_pthread_cond_destroy( &frame->cv ); x264_free( frame->mv[1] );
x264_free( frame->ref[0] );
x264_free( frame->ref[1] );
x264_pthread_mutex_destroy( &frame->mutex );
x264_pthread_cond_destroy( &frame->cv );
}
x264_free( frame ); x264_free( frame );
} }
...@@ -747,7 +755,15 @@ void x264_frame_deblock_row( x264_t *h, int mb_y ) ...@@ -747,7 +755,15 @@ void x264_frame_deblock_row( x264_t *h, int mb_y )
int i8q= mbn_8x8+(xn>>1)+(yn>>1)*s8x8;\ int i8q= mbn_8x8+(xn>>1)+(yn>>1)*s8x8;\
int i4p= mb_4x4+x+y*s4x4;\ int i4p= mb_4x4+x+y*s4x4;\
int i4q= mbn_4x4+xn+yn*s4x4;\ int i4q= mbn_4x4+xn+yn*s4x4;\
if((h->mb.ref[0][i8p] != h->mb.ref[0][i8q] ||\ int refs_equal;\
if( h->mb.ref[0][i8p] < 0 || h->mb.ref[0][i8q] < 0 )\
refs_equal = h->mb.ref[0][i8p] == h->mb.ref[0][i8q];\
else if( !h->mb.b_interlaced )\
refs_equal = h->fref0[h->mb.ref[0][i8p]]->i_poc == h->fref0[h->mb.ref[0][i8q]]->i_poc;\
else\
refs_equal = ( h->fref0[h->mb.ref[0][i8p]>>1]->i_poc == h->fref0[h->mb.ref[0][i8q]>>1]->i_poc ) &&\
( (h->mb.ref[0][i8p]&1) == (h->mb.ref[0][i8q]&1) );\
if((!refs_equal ||\
abs( h->mb.mv[0][i4p][0] - h->mb.mv[0][i4q][0] ) >= 4 ||\ abs( h->mb.mv[0][i4p][0] - h->mb.mv[0][i4q][0] ) >= 4 ||\
abs( h->mb.mv[0][i4p][1] - h->mb.mv[0][i4q][1] ) >= mvy_limit ) ||\ abs( h->mb.mv[0][i4p][1] - h->mb.mv[0][i4q][1] ) >= mvy_limit ) ||\
(h->sh.i_type == SLICE_TYPE_B &&\ (h->sh.i_type == SLICE_TYPE_B &&\
...@@ -992,6 +1008,32 @@ x264_frame_t *x264_frame_pop_unused( x264_t *h, int b_fdec ) ...@@ -992,6 +1008,32 @@ x264_frame_t *x264_frame_pop_unused( x264_t *h, int b_fdec )
frame->i_reference_count = 1; frame->i_reference_count = 1;
frame->b_intra_calculated = 0; frame->b_intra_calculated = 0;
frame->b_scenecut = 1; frame->b_scenecut = 1;
memset( frame->weight, 0, sizeof(frame->weight) );
memset( frame->f_weighted_cost_delta, 0, sizeof(frame->f_weighted_cost_delta) );
return frame;
}
/* Release one reference to a blank (duplicate) frame.
 *
 * The frame's reference count is decremented; when it drops to zero the frame
 * is appended to h->frames.blank_unused so that it can be handed out again.
 * The count must be positive on entry (checked with assert). */
void x264_frame_push_blank_unused( x264_t *h, x264_frame_t *frame )
{
    assert( frame->i_reference_count > 0 );
    /* Still referenced elsewhere: nothing more to do. */
    if( --frame->i_reference_count != 0 )
        return;
    x264_frame_push( h->frames.blank_unused, frame );
}
/* Hand out a blank frame structure flagged as a duplicate.
 *
 * An entry is taken from h->frames.blank_unused when that list is non-empty;
 * otherwise a new x264_frame_t is allocated with x264_malloc.
 * Returns NULL if no frame could be obtained.
 *
 * Only b_duplicate and i_reference_count are written here; every other field
 * keeps whatever the recycled/allocated memory contained.
 * NOTE(review): presumably the caller fills the structure in from the real
 * frame it duplicates -- confirm against the call sites. */
x264_frame_t *x264_frame_pop_blank_unused( x264_t *h )
{
x264_frame_t *frame;
/* Prefer recycling a previously released blank frame. */
if( h->frames.blank_unused[0] )
frame = x264_frame_pop( h->frames.blank_unused );
else
frame = x264_malloc( sizeof(x264_frame_t) );
if( !frame )
return NULL;
/* Flag as duplicate; x264_frame_delete() skips freeing the member buffers of such frames. */
frame->b_duplicate = 1;
frame->i_reference_count = 1;
return frame; return frame;
} }
...@@ -1015,9 +1057,27 @@ void x264_frame_sort( x264_frame_t **list, int b_dts ) ...@@ -1015,9 +1057,27 @@ void x264_frame_sort( x264_frame_t **list, int b_dts )
} while( !b_ok ); } while( !b_ok );
} }
/* Apply the weight `w` to a whole plane, writing the result to `dst`.
 *
 * The plane is walked top to bottom in horizontal strips of at most 16 rows
 * (the strip height the original authors found best for cache behaviour), and
 * each strip is covered left to right in steps of 16 columns by calling
 * w->weightfn[16>>2].
 *
 *   dst, i_dst_stride  destination plane and its row stride
 *   src, i_src_stride  source plane and its row stride
 *   i_width, i_height  area to process; the last strip is shortened to the
 *                      remaining row count
 *   w                  weight parameters and function table
 *
 * `h` is not used by this function.
 * NOTE(review): columns advance by 16 regardless of i_width, so the last call
 * in a row presumably relies on padded buffers when i_width is not a multiple
 * of 16 -- confirm. */
void x264_weight_scale_plane( x264_t *h, uint8_t *dst, int i_dst_stride, uint8_t *src, int i_src_stride,
                              int i_width, int i_height, x264_weight_t *w )
{
    int rows_left, col;
    for( rows_left = i_height; rows_left > 0; rows_left -= 16 )
    {
        const int strip_rows = X264_MIN( rows_left, 16 );
        for( col = 0; col < i_width; col += 16 )
            w->weightfn[16>>2]( dst + col, i_dst_stride, src + col, i_src_stride, w, strip_rows );
        dst += 16 * i_dst_stride;
        src += 16 * i_src_stride;
    }
}
void x264_frame_delete_list( x264_frame_t **list ) void x264_frame_delete_list( x264_frame_t **list )
{ {
int i = 0; int i = 0;
if( !list )
return;
while( list[i] ) while( list[i] )
x264_frame_delete( list[i++] ); x264_frame_delete( list[i++] );
x264_free( list ); x264_free( list );
......
...@@ -28,7 +28,7 @@ ...@@ -28,7 +28,7 @@
#define PADH 32 #define PADH 32
#define PADV 32 #define PADV 32
typedef struct typedef struct x264_frame
{ {
/* */ /* */
int i_poc; int i_poc;
...@@ -65,6 +65,11 @@ typedef struct ...@@ -65,6 +65,11 @@ typedef struct
uint8_t *buffer[4]; uint8_t *buffer[4];
uint8_t *buffer_lowres[4]; uint8_t *buffer_lowres[4];
x264_weight_t weight[16][3]; /* the weights for the P frames used to encode this frame */
uint8_t *weighted[16]; /* plane[0] weighted of the reference frames */
int b_duplicate;
struct x264_frame *orig;
/* motion data */ /* motion data */
int8_t *mb_type; int8_t *mb_type;
int16_t (*mv[2])[2]; int16_t (*mv[2])[2];
...@@ -96,6 +101,7 @@ typedef struct ...@@ -96,6 +101,7 @@ typedef struct
uint16_t *i_propagate_cost; uint16_t *i_propagate_cost;
uint16_t *i_inv_qscale_factor; uint16_t *i_inv_qscale_factor;
int b_scenecut; /* Set to zero if the frame cannot possibly be part of a real scenecut. */ int b_scenecut; /* Set to zero if the frame cannot possibly be part of a real scenecut. */
float f_weighted_cost_delta[X264_BFRAME_MAX+2];
/* vbv */ /* vbv */
uint8_t i_planned_type[X264_LOOKAHEAD_MAX+1]; uint8_t i_planned_type[X264_LOOKAHEAD_MAX+1];
...@@ -103,6 +109,7 @@ typedef struct ...@@ -103,6 +109,7 @@ typedef struct
/* threading */ /* threading */
int i_lines_completed; /* in pixels */ int i_lines_completed; /* in pixels */
int i_lines_weighted; /* FIXME: this only supports weighting of one reference frame */
int i_reference_count; /* number of threads using this frame (not necessarily the number of pointers) */ int i_reference_count; /* number of threads using this frame (not necessarily the number of pointers) */
x264_pthread_mutex_t mutex; x264_pthread_mutex_t mutex;
x264_pthread_cond_t cv; x264_pthread_cond_t cv;
...@@ -160,6 +167,10 @@ x264_frame_t *x264_frame_pop( x264_frame_t **list ); ...@@ -160,6 +167,10 @@ x264_frame_t *x264_frame_pop( x264_frame_t **list );
void x264_frame_unshift( x264_frame_t **list, x264_frame_t *frame ); void x264_frame_unshift( x264_frame_t **list, x264_frame_t *frame );
x264_frame_t *x264_frame_shift( x264_frame_t **list ); x264_frame_t *x264_frame_shift( x264_frame_t **list );
void x264_frame_push_unused( x264_t *h, x264_frame_t *frame ); void x264_frame_push_unused( x264_t *h, x264_frame_t *frame );
void x264_frame_push_blank_unused( x264_t *h, x264_frame_t *frame );
x264_frame_t *x264_frame_pop_blank_unused( x264_t *h );
void x264_weight_scale_plane( x264_t *h, uint8_t *dst, int i_dst_stride, uint8_t *src, int i_src_stride,
int i_width, int i_height, x264_weight_t *w );
x264_frame_t *x264_frame_pop_unused( x264_t *h, int b_fdec ); x264_frame_t *x264_frame_pop_unused( x264_t *h, int b_fdec );
void x264_frame_sort( x264_frame_t **list, int b_dts ); void x264_frame_sort( x264_frame_t **list, int b_dts );
void x264_frame_delete_list( x264_frame_t **list ); void x264_frame_delete_list( x264_frame_t **list );
......
...@@ -477,7 +477,7 @@ static NOINLINE void x264_mb_mc_0xywh( x264_t *h, int x, int y, int width, int h ...@@ -477,7 +477,7 @@ static NOINLINE void x264_mb_mc_0xywh( x264_t *h, int x, int y, int width, int h
h->mc.mc_luma( &h->mb.pic.p_fdec[0][4*y*FDEC_STRIDE+4*x], FDEC_STRIDE, h->mc.mc_luma( &h->mb.pic.p_fdec[0][4*y*FDEC_STRIDE+4*x], FDEC_STRIDE,
h->mb.pic.p_fref[0][i_ref], h->mb.pic.i_stride[0], h->mb.pic.p_fref[0][i_ref], h->mb.pic.i_stride[0],
mvx, mvy, 4*width, 4*height ); mvx, mvy, 4*width, 4*height, &h->sh.weight[i_ref][0] );
// chroma is offset if MCing from a field of opposite parity // chroma is offset if MCing from a field of opposite parity
if( h->mb.b_interlaced & i_ref ) if( h->mb.b_interlaced & i_ref )
...@@ -487,9 +487,20 @@ static NOINLINE void x264_mb_mc_0xywh( x264_t *h, int x, int y, int width, int h ...@@ -487,9 +487,20 @@ static NOINLINE void x264_mb_mc_0xywh( x264_t *h, int x, int y, int width, int h
h->mb.pic.p_fref[0][i_ref][4], h->mb.pic.i_stride[1], h->mb.pic.p_fref[0][i_ref][4], h->mb.pic.i_stride[1],
mvx, mvy, 2*width, 2*height ); mvx, mvy, 2*width, 2*height );
if( h->sh.weight[i_ref][1].weightfn )
h->sh.weight[i_ref][1].weightfn[width>>1]( &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
&h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
&h->sh.weight[i_ref][1], height*2 );
h->mc.mc_chroma( &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, h->mc.mc_chroma( &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
h->mb.pic.p_fref[0][i_ref][5], h->mb.pic.i_stride[2], h->mb.pic.p_fref[0][i_ref][5], h->mb.pic.i_stride[2],
mvx, mvy, 2*width, 2*height ); mvx, mvy, 2*width, 2*height );
if( h->sh.weight[i_ref][2].weightfn )
h->sh.weight[i_ref][2].weightfn[width>>1]( &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
&h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
&h->sh.weight[i_ref][2],height*2 );
} }
static NOINLINE void x264_mb_mc_1xywh( x264_t *h, int x, int y, int width, int height ) static NOINLINE void x264_mb_mc_1xywh( x264_t *h, int x, int y, int width, int height )
{ {
...@@ -500,7 +511,7 @@ static NOINLINE void x264_mb_mc_1xywh( x264_t *h, int x, int y, int width, int h ...@@ -500,7 +511,7 @@ static NOINLINE void x264_mb_mc_1xywh( x264_t *h, int x, int y, int width, int h
h->mc.mc_luma( &h->mb.pic.p_fdec[0][4*y*FDEC_STRIDE+4*x], FDEC_STRIDE, h->mc.mc_luma( &h->mb.pic.p_fdec[0][4*y*FDEC_STRIDE+4*x], FDEC_STRIDE,
h->mb.pic.p_fref[1][i_ref], h->mb.pic.i_stride[0], h->mb.pic.p_fref[1][i_ref], h->mb.pic.i_stride[0],
mvx, mvy, 4*width, 4*height ); mvx, mvy, 4*width, 4*height, weight_none );
if( h->mb.b_interlaced & i_ref ) if( h->mb.b_interlaced & i_ref )
mvy += (h->mb.i_mb_y & 1)*4 - 2; mvy += (h->mb.i_mb_y & 1)*4 - 2;
...@@ -531,9 +542,9 @@ static NOINLINE void x264_mb_mc_01xywh( x264_t *h, int x, int y, int width, int ...@@ -531,9 +542,9 @@ static NOINLINE void x264_mb_mc_01xywh( x264_t *h, int x, int y, int width, int
uint8_t *src0, *src1; uint8_t *src0, *src1;
src0 = h->mc.get_ref( tmp0, &i_stride0, h->mb.pic.p_fref[0][i_ref0], h->mb.pic.i_stride[0], src0 = h->mc.get_ref( tmp0, &i_stride0, h->mb.pic.p_fref[0][i_ref0], h->mb.pic.i_stride[0],
mvx0, mvy0, 4*width, 4*height ); mvx0, mvy0, 4*width, 4*height, weight_none );
src1 = h->mc.get_ref( tmp1, &i_stride1, h->mb.pic.p_fref[1][i_ref1], h->mb.pic.i_stride[0], src1 = h->mc.get_ref( tmp1, &i_stride1, h->mb.pic.p_fref[1][i_ref1], h->mb.pic.i_stride[0],
mvx1, mvy1, 4*width, 4*height ); mvx1, mvy1, 4*width, 4*height, weight_none );
h->mc.avg[i_mode]( &h->mb.pic.p_fdec[0][4*y*FDEC_STRIDE+4*x], FDEC_STRIDE, h->mc.avg[i_mode]( &h->mb.pic.p_fdec[0][4*y*FDEC_STRIDE+4*x], FDEC_STRIDE,
src0, i_stride0, src1, i_stride1, weight ); src0, i_stride0, src1, i_stride1, weight );
...@@ -701,10 +712,55 @@ int x264_macroblock_cache_init( x264_t *h ) ...@@ -701,10 +712,55 @@ int x264_macroblock_cache_init( x264_t *h )
for( i=0; i<2; i++ ) for( i=0; i<2; i++ )
{ {
int i_refs = X264_MIN(16, (i ? 1 + !!h->param.i_bframe_pyramid : h->param.i_frame_reference) ) << h->param.b_interlaced; int i_refs = X264_MIN(16, (i ? 1 + !!h->param.i_bframe_pyramid : h->param.i_frame_reference) ) << h->param.b_interlaced;
if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART )
i_refs = X264_MIN(16, i_refs + 2); //smart weights add two duplicate frames
else if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_BLIND )
i_refs = X264_MIN(16, i_refs + 1); //blind weights add one duplicate frame
for( j=0; j < i_refs; j++ ) for( j=0; j < i_refs; j++ )
CHECKED_MALLOC( h->mb.mvr[i][j], 2 * i_mb_count * sizeof(int16_t) ); CHECKED_MALLOC( h->mb.mvr[i][j], 2 * i_mb_count * sizeof(int16_t) );
} }
if( h->param.analyse.i_weighted_pred )
{
int i_padv = PADV << h->param.b_interlaced;
#define ALIGN(x,a) (((x)+((a)-1))&~((a)-1))
int align = h->param.cpu&X264_CPU_CACHELINE_64 ? 64 : h->param.cpu&X264_CPU_CACHELINE_32 ? 32 : 16;
int i_stride, luma_plane_size;
int numweightbuf;
if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_FAKE )
{
// only need buffer for lookahead thread
if( !h->param.i_sync_lookahead || h == h->thread[h->param.i_threads] )
{
// Fake analysis only works on lowres
i_stride = ALIGN( h->sps->i_mb_width*8 + 2*PADH, align );
luma_plane_size = i_stride * (h->sps->i_mb_height*8+2*i_padv);
// Only need 1 buffer for analysis
numweightbuf = 1;
}
else
numweightbuf = 0;
}
else
{
i_stride = ALIGN( h->sps->i_mb_width*16 + 2*PADH, align );
luma_plane_size = i_stride * (h->sps->i_mb_height*16+2*i_padv);
if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART )
//SMART can weight one ref and one offset -1
numweightbuf = 2;
else
//blind only has one weighted copy (offset -1)
numweightbuf = 1;
}
for( i = 0; i < numweightbuf; i++ )
CHECKED_MALLOC( h->mb.p_weight_buf[i], luma_plane_size );
#undef ALIGN
}
for( i=0; i<=h->param.b_interlaced; i++ ) for( i=0; i<=h->param.b_interlaced; i++ )
for( j=0; j<3; j++ ) for( j=0; j<3; j++ )
{ {
...@@ -765,6 +821,9 @@ void x264_macroblock_cache_end( x264_t *h ) ...@@ -765,6 +821,9 @@ void x264_macroblock_cache_end( x264_t *h )
for( i=0; i<2; i++ ) for( i=0; i<2; i++ )
for( j=0; j<32; j++ ) for( j=0; j<32; j++ )
x264_free( h->mb.mvr[i][j] ); x264_free( h->mb.mvr[i][j] );
for( i=0; i<16; i++ )
x264_free( h->mb.p_weight_buf[i] );
if( h->param.b_cabac ) if( h->param.b_cabac )
{ {
x264_free( h->mb.chroma_pred_mode ); x264_free( h->mb.chroma_pred_mode );
...@@ -866,8 +925,14 @@ static void ALWAYS_INLINE x264_macroblock_load_pic_pointers( x264_t *h, int i_mb ...@@ -866,8 +925,14 @@ static void ALWAYS_INLINE x264_macroblock_load_pic_pointers( x264_t *h, int i_mb
{ {
h->mb.pic.p_fref[0][j][i==0 ? 0:i+3] = &fref[0][j >> h->mb.b_interlaced]->plane[i][ref_pix_offset[j&1]]; h->mb.pic.p_fref[0][j][i==0 ? 0:i+3] = &fref[0][j >> h->mb.b_interlaced]->plane[i][ref_pix_offset[j&1]];
if( i == 0 ) if( i == 0 )
{
for( k = 1; k < 4; k++ ) for( k = 1; k < 4; k++ )
h->mb.pic.p_fref[0][j][k] = &fref[0][j >> h->mb.b_interlaced]->filtered[k][ref_pix_offset[j&1]]; h->mb.pic.p_fref[0][j][k] = &fref[0][j >> h->mb.b_interlaced]->filtered[k][ref_pix_offset[j&1]];
if( h->sh.weight[j][0].weightfn )
h->mb.pic.p_fref_w[j] = &h->fenc->weighted[j >> h->mb.b_interlaced][ref_pix_offset[j&1]];
else
h->mb.pic.p_fref_w[j] = h->mb.pic.p_fref[0][j][0];
}
} }
if( h->sh.i_type == SLICE_TYPE_B ) if( h->sh.i_type == SLICE_TYPE_B )
for( j = 0; j < h->mb.pic.i_fref[1]; j++ ) for( j = 0; j < h->mb.pic.i_fref[1]; j++ )
......
...@@ -120,6 +120,67 @@ PIXEL_AVG_C( pixel_avg_4x2, 4, 2 ) ...@@ -120,6 +120,67 @@ PIXEL_AVG_C( pixel_avg_4x2, 4, 2 )
PIXEL_AVG_C( pixel_avg_2x4, 2, 4 ) PIXEL_AVG_C( pixel_avg_2x4, 2, 4 )
PIXEL_AVG_C( pixel_avg_2x2, 2, 2 ) PIXEL_AVG_C( pixel_avg_2x2, 2, 2 )
/* Attach the weighting function table to a weight descriptor.
 *
 * After this call w->weightfn refers to h->mc.weight, so code holding only
 * the x264_weight_t can index and invoke the weighting routines directly. */
static void x264_weight_cache( x264_t *h, x264_weight_t *w )
{
w->weightfn = h->mc.weight;
}
/* Weight a single sample at index x.
 *
 * Both macros expect `dst`, `src` and `weight` to be in scope at the point of
 * expansion and store the result of x264_clip_uint8() into dst[x].
 *
 * opscale:       (src * i_scale + 2^(i_denom-1)) >> i_denom, then + i_offset.
 *                Only valid for i_denom >= 1 because of the (i_denom - 1) shift.
 * opscale_noden: src * i_scale + i_offset, used when there is no denominator. */
#define opscale(x) \
    dst[(x)] = x264_clip_uint8( ( ( src[(x)] * weight->i_scale + ( 1 << ( weight->i_denom - 1 ) ) ) \
                                  >> weight->i_denom ) + weight->i_offset )
#define opscale_noden(x) \
    dst[(x)] = x264_clip_uint8( src[(x)] * weight->i_scale + weight->i_offset )
/* Weight an i_width x i_height block of samples from `src` into `dst`.
 *
 * Each output sample is produced by opscale() when weight->i_denom >= 1
 * (scaled, rounded, shifted, then offset) and by opscale_noden() otherwise
 * (scaled and offset only). The choice is made once, before any sample is
 * written.
 *
 *   dst, i_dst_stride  destination block and its row stride
 *   src, i_src_stride  source block and its row stride
 *   weight             scale / denominator / offset to apply */
static inline void mc_weight( uint8_t *dst, int i_dst_stride, uint8_t *src, int i_src_stride, const x264_weight_t *weight, int i_width, int i_height )
{
    const int b_round = weight->i_denom >= 1;
    int row, col;
    for( row = 0; row < i_height; row++ )
    {
        if( b_round )
        {
            for( col = 0; col < i_width; col++ )
                opscale( col );
        }
        else
        {
            for( col = 0; col < i_width; col++ )
                opscale_noden( col );
        }
        dst += i_dst_stride;
        src += i_src_stride;
    }
}
/* Generate a static weighting function called `name` that processes rows of
 * exactly `lx` samples.
 *
 * The generated function weights `height` rows from `src` into `dst`, using
 * opscale() when weight->i_denom >= 1 and opscale_noden() otherwise; the
 * choice is made once per call. */
#define MC_WEIGHT_C( name, lx ) \
static void name( uint8_t *dst, int i_dst_stride, uint8_t *src, int i_src_stride, const x264_weight_t *weight, int height ) \
{ \
    const int b_round = weight->i_denom >= 1; \
    int row, col; \
    for( row = 0; row < height; row++ ) \
    { \
        if( b_round ) \
        { \
            for( col = 0; col < lx; col++ ) \
                opscale( col ); \
        } \
        else \
        { \
            for( col = 0; col < lx; col++ ) \
                opscale_noden( col ); \
        } \
        dst += i_dst_stride; \
        src += i_src_stride; \
    } \
}
/* Fixed-width weighting routines generated by MC_WEIGHT_C: 20 and 16 samples per row. */
MC_WEIGHT_C( mc_weight_w20, 20 )
MC_WEIGHT_C( mc_weight_w16, 16 )