Commit ccac8546, authored by Dylan Yudaken, committed by Fiona Glaser

Weighted P-frame prediction

Merge Dylan's Google Summer of Code 2009 tree.
Detect fades and use weighted prediction to improve compression and quality.
"Blind" mode provides a small overall quality increase by using a -1 offset without doing any analysis, as described in JVT-AB033.
"Smart", the default mode, also performs fade detection and decides weights accordingly.
MB-tree takes into account the effects of "smart" analysis in lookahead, even further improving quality in fades.
If psy is on, mbtree is on, interlaced is off, and weightp is off, fade detection will still be performed.
However, it will be used to adjust quality instead of creating actual weights.
This will improve quality in fades when encoding in Baseline profile.

Doesn't add support for interlaced encoding with weightp yet.
Only adds support for luma weights, not chroma weights.
Internal code for chroma weights is in, but there's no analysis yet.
Baseline profile requires that weightp be off.
All weightp modes may cause minor breakage in non-compliant decoders that take shortcuts in deblocking reference frame checks.
"Smart" may cause serious breakage in non-compliant decoders that take shortcuts in handling of duplicate reference frames.

Thanks to Google for sponsoring our most successful Summer of Code yet!
parent b0673412
......@@ -136,6 +136,7 @@ void x264_param_default( x264_param_t *param )
param->analyse.i_chroma_qp_offset = 0;
param->analyse.b_fast_pskip = 1;
param->analyse.b_weighted_bipred = 1;
param->analyse.i_weighted_pred = X264_WEIGHTP_SMART;
param->analyse.b_dct_decimate = 1;
param->analyse.b_transform_8x8 = 1;
param->analyse.i_trellis = 1;
......@@ -489,6 +490,8 @@ int x264_param_parse( x264_param_t *p, const char *name, const char *value )
p->analyse.b_transform_8x8 = atobool(value);
OPT2("weightb", "weight-b")
p->analyse.b_weighted_bipred = atobool(value);
OPT("weightp")
p->analyse.i_weighted_pred = atoi(value);
OPT2("direct", "direct-pred")
b_error |= parse_enum( value, x264_direct_pred_names, &p->analyse.i_direct_mv_pred );
OPT("chroma-qp-offset")
......@@ -903,6 +906,7 @@ char *x264_param2string( x264_param_t *p, int b_res )
p->i_bframe_pyramid, p->i_bframe_adaptive, p->i_bframe_bias,
p->analyse.i_direct_mv_pred, p->analyse.b_weighted_bipred );
}
s += sprintf( s, " wpredp=%d", p->analyse.i_weighted_pred > 0 ? p->analyse.i_weighted_pred : 0 );
s += sprintf( s, " keyint=%d keyint_min=%d scenecut=%d",
p->i_keyint_max, p->i_keyint_min, p->i_scenecut_threshold );
......
......@@ -60,6 +60,13 @@ do {\
// 16 for the macroblock in progress + 3 for deblocking + 3 for motion compensation filter + 2 for extra safety
#define X264_THREAD_HEIGHT 24
/* WEIGHTP_FAKE is set when mb_tree & psy are enabled, but normal weightp is disabled
* (such as in baseline). It checks for fades in lookahead and adjusts qp accordingly
* to increase quality. Defined as (-1) so that if(i_weighted_pred > 0) is true only when
* real weights are being used. */
#define X264_WEIGHTP_FAKE (-1)
/****************************************************************************
* Includes
****************************************************************************/
......@@ -233,6 +240,9 @@ typedef struct
int arg;
} ref_pic_list_order[2][16];
/* P-frame weighting */
x264_weight_t weight[16][3];
int i_mmco_remove_from_end;
int i_mmco_command_count;
struct /* struct for future expansion */
......@@ -390,6 +400,9 @@ struct x264_t
/* Unused frames: 0 = fenc, 1 = fdec */
x264_frame_t **unused[2];
/* Unused blank frames (for duplicates) */
x264_frame_t **blank_unused;
/* frames used for reference + sentinels */
x264_frame_t *reference[16+2];
......@@ -502,6 +515,9 @@ struct x264_t
uint8_t *intra_border_backup[2][3]; /* bottom pixels of the previous mb row, used for intra prediction after the framebuffer has been deblocked */
uint8_t (*nnz_backup)[16]; /* when using cavlc + 8x8dct, the deblocker uses a modified nnz */
/* buffer for weighted versions of the reference frames */
uint8_t *p_weight_buf[16];
/* current value */
int i_type;
int i_partition;
......@@ -564,6 +580,7 @@ struct x264_t
/* pointer over mb of the references */
int i_fref[2];
uint8_t *p_fref[2][32][4+2]; /* last: lN, lH, lV, lHV, cU, cV */
uint8_t *p_fref_w[32]; /* weighted fullpel luma */
uint16_t *p_integral[2][16];
/* fref stride */
......@@ -681,6 +698,8 @@ struct x264_t
/* */
int i_direct_score[2];
int i_direct_frames[2];
/* num p-frames weighted */
int i_wpred[3];
} stat;
......
......@@ -73,6 +73,7 @@ x264_frame_t *x264_frame_new( x264_t *h, int b_fdec )
frame->i_frame_num = -1;
frame->i_lines_completed = -1;
frame->b_fdec = b_fdec;
frame->orig = frame;
/* all 4 luma planes allocated together, since the cacheline split code
* requires them to be in-phase wrt cacheline alignment. */
......@@ -86,9 +87,11 @@ x264_frame_t *x264_frame_new( x264_t *h, int b_fdec )
else
{
CHECKED_MALLOC( frame->buffer[0], luma_plane_size);
frame->plane[0] = frame->buffer[0] + frame->i_stride[0] * i_padv + PADH;
frame->filtered[0] = frame->plane[0] = frame->buffer[0] + frame->i_stride[0] * i_padv + PADH;
}
frame->b_duplicate = 0;
if( b_fdec ) /* fdec frame */
{
CHECKED_MALLOC( frame->mb_type, i_mb_count * sizeof(int8_t));
......@@ -168,38 +171,43 @@ fail:
/* Release a frame structure.
 *
 * frame: the frame to destroy; the pointer is invalid after the call.
 *
 * Duplicate frames are blank copies of real frames (including pointers),
 * so the buffers, per-frame tables and synchronisation objects they point
 * at still belong to the real frame.  They must therefore be released
 * exactly once, and only through the non-duplicate frame: doing it
 * unconditionally as well would free every buffer twice and would free
 * memory that a duplicate merely borrows. */
void x264_frame_delete( x264_frame_t *frame )
{
    int i, j;
    if( !frame->b_duplicate )
    {
        /* pixel planes (full resolution and lowres) */
        for( i = 0; i < 4; i++ )
            x264_free( frame->buffer[i] );
        for( i = 0; i < 4; i++ )
            x264_free( frame->buffer_lowres[i] );
        /* lookahead cost / motion tables */
        for( i = 0; i < X264_BFRAME_MAX+2; i++ )
            for( j = 0; j < X264_BFRAME_MAX+2; j++ )
                x264_free( frame->i_row_satds[i][j] );
        for( j = 0; j < 2; j++ )
            for( i = 0; i <= X264_BFRAME_MAX; i++ )
            {
                x264_free( frame->lowres_mvs[j][i] );
                x264_free( frame->lowres_mv_costs[j][i] );
            }
        x264_free( frame->i_propagate_cost );
        for( j = 0; j <= X264_BFRAME_MAX+1; j++ )
            for( i = 0; i <= X264_BFRAME_MAX+1; i++ )
            {
                x264_free( frame->lowres_costs[j][i] );
                x264_free( frame->lowres_inter_types[j][i] );
            }
        x264_free( frame->f_qp_offset );
        x264_free( frame->f_qp_offset_aq );
        x264_free( frame->i_inv_qscale_factor );
        x264_free( frame->i_row_bits );
        x264_free( frame->i_row_qp );
        /* motion data */
        x264_free( frame->mb_type );
        x264_free( frame->mv[0] );
        x264_free( frame->mv[1] );
        x264_free( frame->ref[0] );
        x264_free( frame->ref[1] );
        x264_pthread_mutex_destroy( &frame->mutex );
        x264_pthread_cond_destroy( &frame->cv );
    }
    /* the struct itself is always owned by this frame, duplicate or not */
    x264_free( frame );
}
......@@ -747,7 +755,15 @@ void x264_frame_deblock_row( x264_t *h, int mb_y )
int i8q= mbn_8x8+(xn>>1)+(yn>>1)*s8x8;\
int i4p= mb_4x4+x+y*s4x4;\
int i4q= mbn_4x4+xn+yn*s4x4;\
if((h->mb.ref[0][i8p] != h->mb.ref[0][i8q] ||\
int refs_equal;\
if( h->mb.ref[0][i8p] < 0 || h->mb.ref[0][i8q] < 0 )\
refs_equal = h->mb.ref[0][i8p] == h->mb.ref[0][i8q];\
else if( !h->mb.b_interlaced )\
refs_equal = h->fref0[h->mb.ref[0][i8p]]->i_poc == h->fref0[h->mb.ref[0][i8q]]->i_poc;\
else\
refs_equal = ( h->fref0[h->mb.ref[0][i8p]>>1]->i_poc == h->fref0[h->mb.ref[0][i8q]>>1]->i_poc ) &&\
( (h->mb.ref[0][i8p]&1) == (h->mb.ref[0][i8q]&1) );\
if((!refs_equal ||\
abs( h->mb.mv[0][i4p][0] - h->mb.mv[0][i4q][0] ) >= 4 ||\
abs( h->mb.mv[0][i4p][1] - h->mb.mv[0][i4q][1] ) >= mvy_limit ) ||\
(h->sh.i_type == SLICE_TYPE_B &&\
......@@ -992,6 +1008,32 @@ x264_frame_t *x264_frame_pop_unused( x264_t *h, int b_fdec )
frame->i_reference_count = 1;
frame->b_intra_calculated = 0;
frame->b_scenecut = 1;
memset( frame->weight, 0, sizeof(frame->weight) );
memset( frame->f_weighted_cost_delta, 0, sizeof(frame->f_weighted_cost_delta) );
return frame;
}
/* Drop one reference to a blank (duplicate) frame.  Once the last
 * reference is gone the frame is parked on h->frames.blank_unused so that
 * x264_frame_pop_blank_unused() can hand it out again. */
void x264_frame_push_blank_unused( x264_t *h, x264_frame_t *frame )
{
    assert( frame->i_reference_count > 0 );
    if( --frame->i_reference_count == 0 )
        x264_frame_push( h->frames.blank_unused, frame );
}
/* Obtain a blank frame to be used as a duplicate of a real frame.
 * Recycles an entry from h->frames.blank_unused when one is available,
 * otherwise allocates a bare x264_frame_t (the struct only, no buffers).
 * Only b_duplicate and i_reference_count are initialised here.
 * Returns NULL if no frame could be obtained. */
x264_frame_t *x264_frame_pop_blank_unused( x264_t *h )
{
    x264_frame_t *blank = h->frames.blank_unused[0]
                        ? x264_frame_pop( h->frames.blank_unused )
                        : x264_malloc( sizeof(x264_frame_t) );
    if( blank )
    {
        blank->b_duplicate = 1;
        blank->i_reference_count = 1;
    }
    return blank;
}
......@@ -1015,9 +1057,27 @@ void x264_frame_sort( x264_frame_t **list, int b_dts )
} while( !b_ok );
}
/* Apply weight w to a whole plane, writing the result to dst.
 * The plane is walked in horizontal strips 16 rows tall (the last strip
 * may be shorter); that height was found to be the best trade-off for
 * cache loads.  Each strip is covered left to right with the 16-pixel-wide
 * weight function (table slot 16>>2).  h is not used by this function. */
void x264_weight_scale_plane( x264_t *h, uint8_t *dst, int i_dst_stride, uint8_t *src, int i_src_stride,
                              int i_width, int i_height, x264_weight_t *w )
{
    int rows_left, col;
    for( rows_left = i_height; rows_left > 0; rows_left -= 16 )
    {
        int strip_rows = X264_MIN( rows_left, 16 );
        for( col = 0; col < i_width; col += 16 )
            w->weightfn[16>>2]( dst+col, i_dst_stride, src+col, i_src_stride, w, strip_rows );
        dst += 16 * i_dst_stride;
        src += 16 * i_src_stride;
    }
}
void x264_frame_delete_list( x264_frame_t **list )
{
int i = 0;
if( !list )
return;
while( list[i] )
x264_frame_delete( list[i++] );
x264_free( list );
......
......@@ -28,7 +28,7 @@
#define PADH 32
#define PADV 32
typedef struct
typedef struct x264_frame
{
/* */
int i_poc;
......@@ -65,6 +65,11 @@ typedef struct
uint8_t *buffer[4];
uint8_t *buffer_lowres[4];
x264_weight_t weight[16][3]; /* the weights for the P frames used to encode this frame */
uint8_t *weighted[16]; /* plane[0] weighted of the reference frames */
int b_duplicate;
struct x264_frame *orig;
/* motion data */
int8_t *mb_type;
int16_t (*mv[2])[2];
......@@ -96,6 +101,7 @@ typedef struct
uint16_t *i_propagate_cost;
uint16_t *i_inv_qscale_factor;
int b_scenecut; /* Set to zero if the frame cannot possibly be part of a real scenecut. */
float f_weighted_cost_delta[X264_BFRAME_MAX+2];
/* vbv */
uint8_t i_planned_type[X264_LOOKAHEAD_MAX+1];
......@@ -103,6 +109,7 @@ typedef struct
/* threading */
int i_lines_completed; /* in pixels */
int i_lines_weighted; /* FIXME: this only supports weighting of one reference frame */
int i_reference_count; /* number of threads using this frame (not necessarily the number of pointers) */
x264_pthread_mutex_t mutex;
x264_pthread_cond_t cv;
......@@ -160,6 +167,10 @@ x264_frame_t *x264_frame_pop( x264_frame_t **list );
void x264_frame_unshift( x264_frame_t **list, x264_frame_t *frame );
x264_frame_t *x264_frame_shift( x264_frame_t **list );
void x264_frame_push_unused( x264_t *h, x264_frame_t *frame );
void x264_frame_push_blank_unused( x264_t *h, x264_frame_t *frame );
x264_frame_t *x264_frame_pop_blank_unused( x264_t *h );
void x264_weight_scale_plane( x264_t *h, uint8_t *dst, int i_dst_stride, uint8_t *src, int i_src_stride,
int i_width, int i_height, x264_weight_t *w );
x264_frame_t *x264_frame_pop_unused( x264_t *h, int b_fdec );
void x264_frame_sort( x264_frame_t **list, int b_dts );
void x264_frame_delete_list( x264_frame_t **list );
......
......@@ -477,7 +477,7 @@ static NOINLINE void x264_mb_mc_0xywh( x264_t *h, int x, int y, int width, int h
h->mc.mc_luma( &h->mb.pic.p_fdec[0][4*y*FDEC_STRIDE+4*x], FDEC_STRIDE,
h->mb.pic.p_fref[0][i_ref], h->mb.pic.i_stride[0],
mvx, mvy, 4*width, 4*height );
mvx, mvy, 4*width, 4*height, &h->sh.weight[i_ref][0] );
// chroma is offset if MCing from a field of opposite parity
if( h->mb.b_interlaced & i_ref )
......@@ -487,9 +487,20 @@ static NOINLINE void x264_mb_mc_0xywh( x264_t *h, int x, int y, int width, int h
h->mb.pic.p_fref[0][i_ref][4], h->mb.pic.i_stride[1],
mvx, mvy, 2*width, 2*height );
if( h->sh.weight[i_ref][1].weightfn )
h->sh.weight[i_ref][1].weightfn[width>>1]( &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
&h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
&h->sh.weight[i_ref][1], height*2 );
h->mc.mc_chroma( &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
h->mb.pic.p_fref[0][i_ref][5], h->mb.pic.i_stride[2],
mvx, mvy, 2*width, 2*height );
if( h->sh.weight[i_ref][2].weightfn )
h->sh.weight[i_ref][2].weightfn[width>>1]( &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
&h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
&h->sh.weight[i_ref][2],height*2 );
}
static NOINLINE void x264_mb_mc_1xywh( x264_t *h, int x, int y, int width, int height )
{
......@@ -500,7 +511,7 @@ static NOINLINE void x264_mb_mc_1xywh( x264_t *h, int x, int y, int width, int h
h->mc.mc_luma( &h->mb.pic.p_fdec[0][4*y*FDEC_STRIDE+4*x], FDEC_STRIDE,
h->mb.pic.p_fref[1][i_ref], h->mb.pic.i_stride[0],
mvx, mvy, 4*width, 4*height );
mvx, mvy, 4*width, 4*height, weight_none );
if( h->mb.b_interlaced & i_ref )
mvy += (h->mb.i_mb_y & 1)*4 - 2;
......@@ -531,9 +542,9 @@ static NOINLINE void x264_mb_mc_01xywh( x264_t *h, int x, int y, int width, int
uint8_t *src0, *src1;
src0 = h->mc.get_ref( tmp0, &i_stride0, h->mb.pic.p_fref[0][i_ref0], h->mb.pic.i_stride[0],
mvx0, mvy0, 4*width, 4*height );
mvx0, mvy0, 4*width, 4*height, weight_none );
src1 = h->mc.get_ref( tmp1, &i_stride1, h->mb.pic.p_fref[1][i_ref1], h->mb.pic.i_stride[0],
mvx1, mvy1, 4*width, 4*height );
mvx1, mvy1, 4*width, 4*height, weight_none );
h->mc.avg[i_mode]( &h->mb.pic.p_fdec[0][4*y*FDEC_STRIDE+4*x], FDEC_STRIDE,
src0, i_stride0, src1, i_stride1, weight );
......@@ -701,10 +712,55 @@ int x264_macroblock_cache_init( x264_t *h )
for( i=0; i<2; i++ )
{
int i_refs = X264_MIN(16, (i ? 1 + !!h->param.i_bframe_pyramid : h->param.i_frame_reference) ) << h->param.b_interlaced;
if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART )
i_refs = X264_MIN(16, i_refs + 2); //smart weights add two duplicate frames
else if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_BLIND )
i_refs = X264_MIN(16, i_refs + 1); //blind weights add one duplicate frame
for( j=0; j < i_refs; j++ )
CHECKED_MALLOC( h->mb.mvr[i][j], 2 * i_mb_count * sizeof(int16_t) );
}
if( h->param.analyse.i_weighted_pred )
{
int i_padv = PADV << h->param.b_interlaced;
#define ALIGN(x,a) (((x)+((a)-1))&~((a)-1))
int align = h->param.cpu&X264_CPU_CACHELINE_64 ? 64 : h->param.cpu&X264_CPU_CACHELINE_32 ? 32 : 16;
int i_stride, luma_plane_size;
int numweightbuf;
if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_FAKE )
{
// only need buffer for lookahead thread
if( !h->param.i_sync_lookahead || h == h->thread[h->param.i_threads] )
{
// Fake analysis only works on lowres
i_stride = ALIGN( h->sps->i_mb_width*8 + 2*PADH, align );
luma_plane_size = i_stride * (h->sps->i_mb_height*8+2*i_padv);
// Only need 1 buffer for analysis
numweightbuf = 1;
}
else
numweightbuf = 0;
}
else
{
i_stride = ALIGN( h->sps->i_mb_width*16 + 2*PADH, align );
luma_plane_size = i_stride * (h->sps->i_mb_height*16+2*i_padv);
if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART )
//SMART can weight one ref and one offset -1
numweightbuf = 2;
else
//blind only has one weighted copy (offset -1)
numweightbuf = 1;
}
for( i = 0; i < numweightbuf; i++ )
CHECKED_MALLOC( h->mb.p_weight_buf[i], luma_plane_size );
#undef ALIGN
}
for( i=0; i<=h->param.b_interlaced; i++ )
for( j=0; j<3; j++ )
{
......@@ -765,6 +821,9 @@ void x264_macroblock_cache_end( x264_t *h )
for( i=0; i<2; i++ )
for( j=0; j<32; j++ )
x264_free( h->mb.mvr[i][j] );
for( i=0; i<16; i++ )
x264_free( h->mb.p_weight_buf[i] );
if( h->param.b_cabac )
{
x264_free( h->mb.chroma_pred_mode );
......@@ -866,8 +925,14 @@ static void ALWAYS_INLINE x264_macroblock_load_pic_pointers( x264_t *h, int i_mb
{
h->mb.pic.p_fref[0][j][i==0 ? 0:i+3] = &fref[0][j >> h->mb.b_interlaced]->plane[i][ref_pix_offset[j&1]];
if( i == 0 )
{
for( k = 1; k < 4; k++ )
h->mb.pic.p_fref[0][j][k] = &fref[0][j >> h->mb.b_interlaced]->filtered[k][ref_pix_offset[j&1]];
if( h->sh.weight[j][0].weightfn )
h->mb.pic.p_fref_w[j] = &h->fenc->weighted[j >> h->mb.b_interlaced][ref_pix_offset[j&1]];
else
h->mb.pic.p_fref_w[j] = h->mb.pic.p_fref[0][j][0];
}
}
if( h->sh.i_type == SLICE_TYPE_B )
for( j = 0; j < h->mb.pic.i_fref[1]; j++ )
......
......@@ -120,6 +120,67 @@ PIXEL_AVG_C( pixel_avg_4x2, 4, 2 )
PIXEL_AVG_C( pixel_avg_2x4, 2, 4 )
PIXEL_AVG_C( pixel_avg_2x2, 2, 2 )
/* C version of the weight_cache hook: points w->weightfn at the weight
 * function table stored in h->mc.weight.  It does not touch any other
 * field of w (cachea/cacheb are left as they are). */
static void x264_weight_cache( x264_t *h, x264_weight_t *w )
{
w->weightfn = h->mc.weight;
}
/* Per-pixel weighting helpers.  Both expect `dst`, `src` and `weight` to be
 * in scope at the point of use and store the result through
 * x264_clip_uint8().
 *   opscale:       ((src*scale + (1 << (denom-1))) >> denom) + offset
 *                  -- only valid for denom >= 1 because of the (denom-1) shift.
 *   opscale_noden: src*scale + offset, the form used when denom < 1. */
#define opscale(x) dst[x] = x264_clip_uint8( ( ( ( src[x] * weight->i_scale ) + (1<<(weight->i_denom - 1) ) )>> weight->i_denom ) + weight->i_offset )
#define opscale_noden(x) dst[x] = x264_clip_uint8( ( src[x] * weight->i_scale ) + weight->i_offset )
/* Weight an i_width x i_height block from src into dst (the two may be the
 * same buffer; each pixel is read before it is written).
 * The rounding form (opscale) is used when the denominator is at least 1,
 * otherwise the plain scale+offset form (opscale_noden). */
static inline void mc_weight( uint8_t *dst, int i_dst_stride, uint8_t *src, int i_src_stride, const x264_weight_t *weight, int i_width, int i_height )
{
    const int b_round = weight->i_denom >= 1;
    int row, col;
    for( row = 0; row < i_height; row++ )
    {
        if( b_round )
        {
            for( col = 0; col < i_width; col++ )
                opscale( col );
        }
        else
        {
            for( col = 0; col < i_width; col++ )
                opscale_noden( col );
        }
        dst += i_dst_stride;
        src += i_src_stride;
    }
}
/* Generate a fixed-width weight function `name` matching weight_fn_t.
 * The generated function weights `height` rows of `lx` pixels from src
 * into dst; the per-pixel arithmetic is exactly that of mc_weight(), to
 * which it forwards with the width baked in. */
#define MC_WEIGHT_C( name, lx ) \
    static void name( uint8_t *dst, int i_dst_stride, uint8_t *src, int i_src_stride, const x264_weight_t *weight, int height ) \
    { \
        mc_weight( dst, i_dst_stride, src, i_src_stride, weight, lx, height ); \
    }
/* One C weight function per supported block width. */
MC_WEIGHT_C( mc_weight_w20, 20 )
MC_WEIGHT_C( mc_weight_w16, 16 )
MC_WEIGHT_C( mc_weight_w12, 12 )
MC_WEIGHT_C( mc_weight_w8, 8 )
MC_WEIGHT_C( mc_weight_w4, 4 )
MC_WEIGHT_C( mc_weight_w2, 2 )
/* Weight function table, indexed by (pixel width >> 2):
 * 2 -> [0], 4 -> [1], 8 -> [2], 12 -> [3], 16 -> [4], 20 -> [5].
 * The order of the entries must match that mapping. */
static weight_fn_t x264_mc_weight_wtab[6] =
{
mc_weight_w2,
mc_weight_w4,
mc_weight_w8,
mc_weight_w12,
mc_weight_w16,
mc_weight_w20,
};
const x264_weight_t weight_none[3] = { {{0}} };
static void mc_copy( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_width, int i_height )
{
int y;
......@@ -163,7 +224,7 @@ static const int hpel_ref1[16] = {0,0,0,0,2,2,3,2,2,2,3,2,2,2,3,2};
static void mc_luma( uint8_t *dst, int i_dst_stride,
uint8_t *src[4], int i_src_stride,
int mvx, int mvy,
int i_width, int i_height )
int i_width, int i_height, const x264_weight_t *weight )
{
int qpel_idx = ((mvy&3)<<2) + (mvx&3);
int offset = (mvy>>2)*i_src_stride + (mvx>>2);
......@@ -174,17 +235,19 @@ static void mc_luma( uint8_t *dst, int i_dst_stride,
uint8_t *src2 = src[hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);
pixel_avg( dst, i_dst_stride, src1, i_src_stride,
src2, i_src_stride, i_width, i_height );
if( weight->weightfn )
mc_weight( dst, i_dst_stride, dst, i_dst_stride, weight, i_width, i_height );
}
else if( weight->weightfn )
mc_weight( dst, i_dst_stride, src1, i_src_stride, weight, i_width, i_height );
else
{
mc_copy( src1, i_src_stride, dst, i_dst_stride, i_width, i_height );
}
}
static uint8_t *get_ref( uint8_t *dst, int *i_dst_stride,
uint8_t *src[4], int i_src_stride,
int mvx, int mvy,
int i_width, int i_height )
int i_width, int i_height, const x264_weight_t *weight )
{
int qpel_idx = ((mvy&3)<<2) + (mvx&3);
int offset = (mvy>>2)*i_src_stride + (mvx>>2);
......@@ -195,6 +258,13 @@ static uint8_t *get_ref( uint8_t *dst, int *i_dst_stride,
uint8_t *src2 = src[hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);
pixel_avg( dst, *i_dst_stride, src1, i_src_stride,
src2, i_src_stride, i_width, i_height );
if( weight->weightfn )
mc_weight( dst, *i_dst_stride, dst, *i_dst_stride, weight, i_width, i_height );
return dst;
}
else if( weight->weightfn )
{
mc_weight( dst, *i_dst_stride, src1, i_src_stride, weight, i_width, i_height );
return dst;
}
else
......@@ -403,6 +473,11 @@ void x264_mc_init( int cpu, x264_mc_functions_t *pf )
pf->avg[PIXEL_2x4] = pixel_avg_2x4;
pf->avg[PIXEL_2x2] = pixel_avg_2x2;
pf->weight = x264_mc_weight_wtab;
pf->offsetadd = x264_mc_weight_wtab;
pf->offsetsub = x264_mc_weight_wtab;
pf->weight_cache = x264_weight_cache;
pf->copy_16x16_unaligned = mc_copy_w16;
pf->copy[PIXEL_16x16] = mc_copy_w16;
pf->copy[PIXEL_8x8] = mc_copy_w8;
......
......@@ -21,6 +21,33 @@
#ifndef X264_MC_H
#define X264_MC_H
struct x264_weight_t;
typedef void (* weight_fn_t)( uint8_t *, int, uint8_t *,int, const struct x264_weight_t *, int );
/* Explicit weighted-prediction parameters for one plane of one reference.
 * The C code computes ((pix * i_scale + round) >> i_denom) + i_offset,
 * or pix * i_scale + i_offset when i_denom < 1. */
typedef struct x264_weight_t
{
/* aligning the first member is a gcc hack to force the struct to be
* 16 byte aligned, as well as force sizeof(struct) to be a multiple of 16 */
/* cachea/cacheb: two 16-byte scratch vectors at offsets 0 and 16.
 * NOTE(review): presumably filled by an asm weight_cache implementation
 * with pre-splatted constants; the C weight_cache leaves them untouched. */
ALIGNED_16( int16_t cachea[8] );
int16_t cacheb[8];
int32_t i_denom;  /* right-shift applied after scaling */
int32_t i_scale;  /* multiplier applied to each pixel */
int32_t i_offset; /* value added after the shift */
/* table of width-specific weight functions; NULL means "no weighting" */
weight_fn_t *weightfn;
} ALIGNED_16( x264_weight_t );
extern const x264_weight_t weight_none[3];
/* Fill in a weight struct and select its function table.
 *   w: x264_weight_t lvalue; may be any lvalue expression (e.g. *ptr or
 *      arr[i]) -- every use of it is parenthesized
 *   b: nonzero to enable weighting (h->mc.weight_cache sets up the table),
 *      zero to disable it (weightfn = NULL)
 *   s, d, o: scale, denominator (shift) and offset
 * Requires an encoder handle named `h` to be in scope at the call site.
 * Still expands to a bare { } block, as before, so existing call sites keep
 * compiling; do not use it as the unbraced body of an if that has an else. */
#define SET_WEIGHT( w, b, s, d, o )\
{\
    (w).i_scale = (s);\
    (w).i_denom = (d);\
    (w).i_offset = (o);\
    if( b )\
        h->mc.weight_cache( h, &(w) );\
    else\
        (w).weightfn = NULL;\
}
/* Do the MC
* XXX: Only width = 4, 8 or 16 are valid
* width == 4 -> height == 4 or 8
......@@ -32,12 +59,12 @@ typedef struct
{
void (*mc_luma)(uint8_t *dst, int i_dst, uint8_t **src, int i_src,
int mvx, int mvy,
int i_width, int i_height );
int i_width, int i_height, const x264_weight_t *weight );
/* may round up the dimensions if they're not a power of 2 */
uint8_t* (*get_ref)(uint8_t *dst, int *i_dst, uint8_t **src, int i_src,
int mvx, int mvy,
int i_width, int i_height );
int i_width, int i_height, const x264_weight_t *weight );
/* mc_chroma may write up to 2 bytes of garbage to the right of dst,
* so it must be run from left to right. */
......@@ -74,6 +101,10 @@ typedef struct
void (*frame_init_lowres_core)( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
int src_stride, int dst_stride, int width, int height );
weight_fn_t *weight;
weight_fn_t *offsetadd;
weight_fn_t *offsetsub;
void (*weight_cache)( x264_t *, x264_weight_t * );
void (*mbtree_propagate_cost)( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
uint16_t *inter_costs, uint16_t *inv_qscales, int len );
......
......@@ -6,6 +6,7 @@
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;* Fiona Glaser <fiona@x264.com>
;* Laurent Aimar <fenrir@via.ecp.fr>
;* Dylan Yudaken <dyudaken@gmail.com>
;* Min Chen <chenm001.163.com>
;*
;* This program is free software; you can redistribute it and/or modify
......@@ -28,6 +29,7 @@
SECTION_RODATA 32
ch_shuffle: db 0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,0,0
pw_1: times 8 dw 1
pw_4: times 8 dw 4
pw_8: times 8 dw 8
pw_32: times 8 dw 32
......@@ -37,9 +39,8 @@ sw_64: dd 64
SECTION .text
;=============================================================================
; weighted prediction
; implicit weighted biprediction
;=============================================================================
; implicit bipred only:
; assumes log2_denom = 5, offset = 0, weight1 + weight2 = 64
%ifdef ARCH_X86_64
DECLARE_REG_TMP 0,1,2,3,4,5,10,11
......@@ -64,12 +65,12 @@ SECTION .text
%endmacro
%endif
; SPLATW dst, src [, idx]
; Broadcast word number idx (0-3, default 0) of src into every word of dst.
; idx*0x55 repeats the 2-bit index in all four selector fields of the
; shuffle immediate.  For 16-byte registers the low quadword is shuffled
; and then duplicated into the high quadword.
%macro SPLATW 2-3 0
%if mmsize==16
    pshuflw    %1, %2, %3*0x55
    punpcklqdq %1, %1
%else
    pshufw     %1, %2, %3*0x55
%endif
%endmacro
......@@ -175,6 +176,225 @@ INIT_XMM
AVG_WEIGHT ssse3, 8, 7
AVG_WEIGHT ssse3, 16, 7
;=============================================================================
; P frame explicit weighted prediction
;=============================================================================
; WEIGHT_START width
; Load the constants consumed by the WEIGHT macro:
;   m3 = 16 bytes at [r4]     (multiplied into the pixels as the scale)
;   m6 = 16 bytes at [r4+16]  (added: 1<<(denom-1) + (offset<<denom))
;   m5 = dword at [r4+32]     (shift count, i.e. the denominator)
;   m2 = 0                    (zero for byte->word unpacking)
; NOTE(review): assumes r4 holds the x264_weight_t pointer, so that these
; offsets are cachea, cacheb and i_denom -- confirm against the callers.
; For widths 20 and 12 with 16-byte registers the constants are also copied
; into the MMX registers, presumably for an 8-byte-wide tail.
; NOTE(review): mm4 is copied from xmm4, which this macro never loads.
%macro WEIGHT_START 1
mova m3, [r4]
mova m6, [r4+16]
movd m5, [r4+32]
pxor m2, m2
%if (%1 == 20 || %1 == 12) && mmsize == 16
movdq2q mm3, xmm3
movdq2q mm4, xmm4
movdq2q mm5, xmm5
movdq2q mm6, xmm6
pxor mm2, mm2
%endif
%endmacro
; WEIGHT_START_SSSE3 width
; Load the constants consumed by WEIGHT_SSSE3:
;   m3 = 16 bytes at [r4]     (pmulhrsw multiplier)
;   m4 = 16 bytes at [r4+16]  (added after the multiply)
;   m2 = 0                    (zero for byte->word unpacking)
; NOTE(review): assumes r4 holds the x264_weight_t pointer (cachea/cacheb)
; -- confirm against the callers.
; For widths 20 and 12 the constants are also copied into MMX registers.
%macro WEIGHT_START_SSSE3 1
mova m3, [r4]
mova m4, [r4+16]
pxor m2, m2
%if ( %1 == 20 || %1 == 12 )
movdq2q mm3, xmm3
movdq2q mm4, xmm4
pxor mm2, mm2
%endif
%endmacro
;; macro to weight mmsize bytes taking half from %1 and half from %2
;; Expects the registers set up by WEIGHT_START (m2 = 0, m3 = scale,
;; m6 = rounding + shifted offset, m5 = denom).
;; Per pixel: (pix * m3 + m6) >> m5, using a signed-saturating add and an
;; arithmetic shift.  Results are left as words in m0 (from %1) and
;; m1 (from %2); nothing is packed or stored here.
%macro WEIGHT 2 ; (src1,src2)
movh m0, [%1]
movh m1, [%2]
punpcklbw m0, m2 ;setup
punpcklbw m1, m2 ;setup
pmullw m0, m3 ;scale
pmullw m1, m3 ;scale
paddsw m0, m6 ;1<<(denom-1)+(offset<<denom)
paddsw m1, m6 ;1<<(denom-1)+(offset<<denom)
psraw m0, m5 ;denom
psraw m1, m5 ;denom
%endmacro
; WEIGHT_SSSE3 src1, src2
; SSSE3 variant of WEIGHT; expects the registers set up by
; WEIGHT_START_SSSE3 (m2 = 0, m3 = multiplier, m4 = addend).
; Pixels are unpacked to words and shifted left by 7, then pmulhrsw forms
; (a*b + 0x4000) >> 15 per word, so scale, rounding and shift happen in one
; instruction; m4 is added afterwards.
; Results are left as words in m0 (from %1) and m1 (from %2).
%macro WEIGHT_SSSE3 2
movh m0, [%1]
movh m1, [%2]
punpcklbw m0, m2
punpcklbw m1, m2
psllw m0, 7
psllw m1, 7
pmulhrsw m0, m3
pmulhrsw m1, m3
paddw m0, m4
paddw m1, m4
%endmacro
%macro WEIGHT_SAVE_ROW 3 ;(src,dst,width)