Commit 25b40141 authored by Loren Merritt

Successive elimination motion search: same as exhaustive search, but 2-3x faster.



git-svn-id: svn://svn.videolan.org/x264/trunk@388 df754926-b1dd-0310-bc7b-ec298dee348c
parent a9607af8
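For context, here is a minimal standalone sketch of the idea behind the patch (plain C, not x264 code; the helper names and the brute-force block_sum are illustrative only). Exhaustive search evaluates the SAD of every candidate in the range; successive elimination first compares block sums, because sum|a_i - b_i| >= |sum a_i - sum b_i|, so any candidate whose "DC" difference already reaches the current best cost cannot win and its full SAD is skipped:

    #include <limits.h>
    #include <stdlib.h>

    /* full SAD of a w x h block */
    static int sad( const unsigned char *a, const unsigned char *b,
                    int stride, int w, int h )
    {
        int s = 0, x, y;
        for( y = 0; y < h; y++ )
            for( x = 0; x < w; x++ )
                s += abs( a[x + y*stride] - b[x + y*stride] );
        return s;
    }

    /* sum of all samples in a w x h block (its "DC") */
    static int block_sum( const unsigned char *p, int stride, int w, int h )
    {
        int s = 0, x, y;
        for( y = 0; y < h; y++ )
            for( x = 0; x < w; x++ )
                s += p[x + y*stride];
        return s;
    }

    /* exhaustive search over [-range,range]^2 with successive elimination.
     * enc is the source block, ref the co-located block in the reference;
     * the caller must guarantee the whole window lies inside padded memory. */
    static int sea_search( const unsigned char *enc, const unsigned char *ref,
                           int stride, int w, int h, int range,
                           int *best_mx, int *best_my )
    {
        int bcost = INT_MAX;
        int enc_dc = block_sum( enc, stride, w, h );
        int mx, my;
        for( my = -range; my <= range; my++ )
            for( mx = -range; mx <= range; mx++ )
            {
                const unsigned char *cand = ref + mx + my*stride;
                /* |DC(enc) - DC(cand)| is a lower bound on SAD(enc, cand);
                 * if it already reaches bcost, the candidate cannot win. */
                if( abs( enc_dc - block_sum( cand, stride, w, h ) ) >= bcost )
                    continue;
                int cost = sad( enc, cand, stride, w, h );
                if( cost < bcost )
                {
                    bcost = cost;
                    *best_mx = mx;
                    *best_my = my;
                }
            }
        return bcost;
    }

Computed this way the bound costs as much as the SAD itself; the point of the real patch is that an integral image turns each candidate's block sum into a four-entry table lookup, and the threshold additionally accounts for the motion-vector rate (see the me.c hunk near the end of the diff).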
@@ -395,6 +395,7 @@ struct x264_t
             /* pointer over mb of the references */
             uint8_t *p_fref[2][16][4+2]; /* last: lN, lH, lV, lHV, cU, cV */
+            uint16_t *p_integral[2][16];

             /* common stride */
             int i_stride[3];
@@ -90,6 +90,12 @@ x264_frame_t *x264_frame_new( x264_t *h )
         }
     }

+    if( h->param.analyse.i_me_method == X264_ME_ESA )
+    {
+        frame->buffer[11] = x264_malloc( frame->i_stride[0] * (frame->i_lines[0] + 64) * sizeof(uint16_t) );
+        frame->integral = (uint16_t*)frame->buffer[11] + frame->i_stride[0] * 32 + 32;
+    }
+
     frame->i_poc = -1;
     frame->i_type = X264_TYPE_AUTO;
     frame->i_qpplus1 = 0;

@@ -121,7 +127,7 @@ void x264_frame_delete( x264_frame_t *frame )
     {
         x264_free( frame->buffer[i] );
     }
-    for( i = 4; i < 11; i++ ) /* filtered planes */
+    for( i = 4; i < 12; i++ ) /* filtered planes */
     {
         x264_free( frame->buffer[i] );
     }
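A note on the sizing in the x264_frame_new hunk (my reading, assuming the usual 32-sample padding x264 keeps around each reference plane): the plane is allocated as i_stride[0] * (i_lines[0] + 64) uint16_t entries, i.e. the full padded width times the picture height plus 32 extra rows above and 32 below, and offsetting the pointer by i_stride[0] * 32 + 32 places entry (0,0) at the picture's top-left sample while leaving a 32-row / 32-column border, so motion vectors that reach into the padded area still index allocated memory. The x264_frame_delete change simply extends the free loop from buffer[4..10] to buffer[4..11] so the new plane is released along with the filtered planes.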
@@ -46,10 +46,11 @@ typedef struct
     uint8_t *plane[4];
     uint8_t *filtered[4]; /* plane[0], H, V, HV */
     uint8_t *lowres[4]; /* half-size copy of input frame: Orig, H, V, HV */
+    uint16_t *integral;

    /* for unrestricted mv we allocate more data than needed
     * allocated data are stored in buffer */
-    void *buffer[11];
+    void *buffer[12];

    /* motion data */
    int8_t *mb_type;
@@ -907,6 +907,14 @@ void x264_macroblock_cache_load( x264_t *h, int i_mb_x, int i_mb_y )
         }
     }

+    if( h->fdec->integral )
+    {
+        for( i = 0; i < h->i_ref0; i++ )
+            h->mb.pic.p_integral[0][i] = &h->fref0[i]->integral[ 16 * ( i_mb_x + i_mb_y * h->fdec->i_stride[0] )];
+        for( i = 0; i < h->i_ref1; i++ )
+            h->mb.pic.p_integral[1][i] = &h->fref1[i]->integral[ 16 * ( i_mb_x + i_mb_y * h->fdec->i_stride[0] )];
+    }
+
     /* load cache */
     if( i_mb_xy >= h->sh.i_first_mb + h->mb.i_mb_stride )
     {
@@ -449,6 +449,30 @@ void x264_frame_filter( int cpu, x264_frame_t *frame )
             }
         }
     }

+    /* generate integral image:
+     * each entry in frame->integral is the sum of all luma samples above and
+     * to the left of its location (inclusive).
+     * this allows us to calculate the DC of any rectangle by looking only
+     * at the corner entries.
+     * individual entries will overflow 16 bits, but that's ok:
+     * we only need the differences between entries, and those will be correct
+     * as long as we don't try to evaluate a rectangle bigger than 16x16.
+     * likewise, we don't really have to init the edges to 0, leaving garbage
+     * there wouldn't affect the results. */
+    if( frame->integral )
+    {
+        memset( frame->integral - 32 * stride - 32, 0, stride * sizeof(uint16_t) );
+        for( y = -31; y < frame->i_lines[0] + 32; y++ )
+        {
+            uint8_t *ref = frame->plane[0] + y * stride - 32;
+            uint16_t *line = frame->integral + y * stride - 32;
+            uint16_t v = line[0] = 0;
+            for( x = 1; x < stride; x++ )
+                line[x] = v += ref[x] + line[x-stride] - line[x-stride-1];
+        }
+    }
 }

 void x264_frame_init_lowres( int cpu, x264_frame_t *frame )
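To make the corner trick concrete, here is a minimal standalone sketch (plain C, not the patch's buffers or padding; build_integral and rect_sum are hypothetical names) of building an inclusive integral image with an explicit one-entry zero border and reading a w x h block sum from four corners:

    #include <stdint.h>
    #include <string.h>

    /* integral[x + y*istride] = sum of src[0..x-1][0..y-1];
     * integral needs (h+1)*istride entries and istride >= w+1.
     * entries wrap mod 2^16 on purpose, mirroring the patch's uint16_t plane. */
    static void build_integral( const uint8_t *src, int w, int h, int stride,
                                uint16_t *integral, int istride )
    {
        int x, y;
        memset( integral, 0, istride * sizeof(uint16_t) ); /* top border row */
        for( y = 1; y <= h; y++ )
        {
            uint16_t *line = integral + y * istride;
            const uint8_t *ref = src + (y-1) * stride;
            line[0] = 0;                                   /* left border column */
            for( x = 1; x <= w; x++ )
                line[x] = ref[x-1] + line[x-1] + line[x-istride] - line[x-1-istride];
        }
    }

    /* sum of the w x h block whose top-left sample is (x0,y0) in src,
     * read from the four corner entries around the block */
    static int rect_sum( const uint16_t *integral, int istride,
                         int x0, int y0, int w, int h )
    {
        const uint16_t *tl = integral + x0 + y0 * istride;
        /* entries may have wrapped mod 2^16, but the combination is exact as
         * long as the true sum fits in 16 bits (16*16*255 = 65280 < 65536) */
        return (uint16_t)( tl[0] + tl[w + h*istride] - tl[w] - tl[h*istride] );
    }

The me.c hunk later in this diff reads the same four corners through integral_base = &m->integral[ -1 - 1*stride ], i.e. the exclusive corner one sample up and to the left of the candidate block, with dw and dh playing the roles of w and h*istride here.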
@@ -664,13 +664,14 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_cost_
     (m)->p_fenc[1] = &(src)[1][((xoff)>>1)+((yoff)>>1)*(m)->i_stride[1]]; \
     (m)->p_fenc[2] = &(src)[2][((xoff)>>1)+((yoff)>>1)*(m)->i_stride[1]];

-#define LOAD_HPELS(m, src, xoff, yoff) \
+#define LOAD_HPELS(m, src, list, ref, xoff, yoff) \
     (m)->p_fref[0] = &(src)[0][(xoff)+(yoff)*(m)->i_stride[0]]; \
     (m)->p_fref[1] = &(src)[1][(xoff)+(yoff)*(m)->i_stride[0]]; \
     (m)->p_fref[2] = &(src)[2][(xoff)+(yoff)*(m)->i_stride[0]]; \
     (m)->p_fref[3] = &(src)[3][(xoff)+(yoff)*(m)->i_stride[0]]; \
     (m)->p_fref[4] = &(src)[4][((xoff)>>1)+((yoff)>>1)*(m)->i_stride[1]]; \
-    (m)->p_fref[5] = &(src)[5][((xoff)>>1)+((yoff)>>1)*(m)->i_stride[1]];
+    (m)->p_fref[5] = &(src)[5][((xoff)>>1)+((yoff)>>1)*(m)->i_stride[1]]; \
+    (m)->integral = &h->mb.pic.p_integral[list][ref][(xoff)+(yoff)*(m)->i_stride[0]];

 #define REF_COST(list, ref) \
     (a->i_lambda * bs_size_te( h->sh.i_num_ref_idx_l##list##_active - 1, ref ))

@@ -697,7 +698,7 @@ static void x264_mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a )
         m.i_ref = i_ref;

         /* search with ref */
-        LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, 0 );
+        LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 0 );
         x264_mb_predict_mv_16x16( h, 0, i_ref, m.mvp );
         x264_mb_predict_mv_ref16x16( h, 0, i_ref, mvc, &i_mvc );
         x264_me_search_ref( h, &m, mvc, i_mvc, p_halfpel_thresh );

@@ -792,7 +793,7 @@ static void x264_mb_analyse_inter_p8x8_mixed_ref( x264_t *h, x264_mb_analysis_t
             m.i_ref_cost = i_ref_cost;
             m.i_ref = i_ref;

-            LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 8*x8, 8*y8 );
+            LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*x8, 8*y8 );
             x264_macroblock_cache_ref( h, 2*x8, 2*y8, 2, 2, 0, i_ref );
             x264_mb_predict_mv( h, 0, 4*i, 2, m.mvp );
             x264_me_search_ref( h, &m, a->l0.mvc[i_ref], i+1, p_halfpel_thresh );

@@ -852,7 +853,7 @@ static void x264_mb_analyse_inter_p8x8( x264_t *h, x264_mb_analysis_t *a )
         m->i_ref = i_ref;

         LOAD_FENC( m, p_fenc, 8*x8, 8*y8 );
-        LOAD_HPELS( m, p_fref, 8*x8, 8*y8 );
+        LOAD_HPELS( m, p_fref, 0, i_ref, 8*x8, 8*y8 );

         x264_mb_predict_mv( h, 0, 4*i, 2, m->mvp );
         x264_me_search( h, m, mvc, i_mvc );

@@ -915,7 +916,7 @@ static void x264_mb_analyse_inter_p16x8( x264_t *h, x264_mb_analysis_t *a )
         *(uint64_t*)mvc[1] = *(uint64_t*)a->l0.mvc[i_ref][2*i+1];
         *(uint64_t*)mvc[2] = *(uint64_t*)a->l0.mvc[i_ref][2*i+2];

-        LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, 8*i );
+        LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 8*i );
         x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, 0, i_ref );
         x264_mb_predict_mv( h, 0, 8*i, 4, m.mvp );
         x264_me_search( h, &m, mvc, 3 );

@@ -971,7 +972,7 @@ static void x264_mb_analyse_inter_p8x16( x264_t *h, x264_mb_analysis_t *a )
         *(uint64_t*)mvc[1] = *(uint64_t*)a->l0.mvc[i_ref][i+1];
         *(uint64_t*)mvc[2] = *(uint64_t*)a->l0.mvc[i_ref][i+3];

-        LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 8*i, 0 );
+        LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*i, 0 );
         x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, 0, i_ref );
         x264_mb_predict_mv( h, 0, 4*i, 2, m.mvp );
         x264_me_search( h, &m, mvc, 3 );

@@ -1031,7 +1032,7 @@ static void x264_mb_analyse_inter_p4x4( x264_t *h, x264_mb_analysis_t *a, int i8
 {
     uint8_t **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
     uint8_t **p_fenc = h->mb.pic.p_fenc;
+    const int i_ref = a->l0.me8x8[i8x8].i_ref;
     int i4x4;

     /* XXX Needed for x264_mb_predict_mv */

@@ -1050,7 +1051,7 @@ static void x264_mb_analyse_inter_p4x4( x264_t *h, x264_mb_analysis_t *a, int i8
         m->p_cost_mv = a->p_cost_mv;

         LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
-        LOAD_HPELS( m, p_fref, 4*x4, 4*y4 );
+        LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );

         x264_mb_predict_mv( h, 0, idx, 1, m->mvp );
         x264_me_search( h, m, &a->l0.me8x8[i8x8].mv, i_mvc );

@@ -1061,7 +1062,7 @@ static void x264_mb_analyse_inter_p4x4( x264_t *h, x264_mb_analysis_t *a, int i8
                             a->l0.me4x4[i8x8][1].cost +
                             a->l0.me4x4[i8x8][2].cost +
                             a->l0.me4x4[i8x8][3].cost +
-                            REF_COST( 0, a->l0.me8x8[i8x8].i_ref ) +
+                            REF_COST( 0, i_ref ) +
                             a->i_lambda * i_sub_mb_p_cost_table[D_L0_4x4];
     if( h->mb.b_chroma_me )
         a->l0.i_cost4x4[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_4x4 );

@@ -1071,7 +1072,7 @@ static void x264_mb_analyse_inter_p8x4( x264_t *h, x264_mb_analysis_t *a, int i8
 {
     uint8_t **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
     uint8_t **p_fenc = h->mb.pic.p_fenc;
+    const int i_ref = a->l0.me8x8[i8x8].i_ref;
     int i8x4;

     /* XXX Needed for x264_mb_predict_mv */

@@ -1090,7 +1091,7 @@ static void x264_mb_analyse_inter_p8x4( x264_t *h, x264_mb_analysis_t *a, int i8
         m->p_cost_mv = a->p_cost_mv;

         LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
-        LOAD_HPELS( m, p_fref, 4*x4, 4*y4 );
+        LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );

         x264_mb_predict_mv( h, 0, idx, 2, m->mvp );
         x264_me_search( h, m, &a->l0.me4x4[i8x8][0].mv, i_mvc );

@@ -1098,7 +1099,7 @@ static void x264_mb_analyse_inter_p8x4( x264_t *h, x264_mb_analysis_t *a, int i8
         x264_macroblock_cache_mv( h, x4, y4, 2, 1, 0, m->mv[0], m->mv[1] );
     }
     a->l0.i_cost8x4[i8x8] = a->l0.me8x4[i8x8][0].cost + a->l0.me8x4[i8x8][1].cost +
-                            REF_COST( 0, a->l0.me8x8[i8x8].i_ref ) +
+                            REF_COST( 0, i_ref ) +
                             a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x4];
     if( h->mb.b_chroma_me )
         a->l0.i_cost8x4[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_8x4 );

@@ -1108,7 +1109,7 @@ static void x264_mb_analyse_inter_p4x8( x264_t *h, x264_mb_analysis_t *a, int i8
 {
     uint8_t **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
     uint8_t **p_fenc = h->mb.pic.p_fenc;
+    const int i_ref = a->l0.me8x8[i8x8].i_ref;
     int i4x8;

     /* XXX Needed for x264_mb_predict_mv */

@@ -1127,7 +1128,7 @@ static void x264_mb_analyse_inter_p4x8( x264_t *h, x264_mb_analysis_t *a, int i8
         m->p_cost_mv = a->p_cost_mv;

         LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
-        LOAD_HPELS( m, p_fref, 4*x4, 4*y4 );
+        LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );

         x264_mb_predict_mv( h, 0, idx, 1, m->mvp );
         x264_me_search( h, m, &a->l0.me4x4[i8x8][0].mv, i_mvc );

@@ -1135,7 +1136,7 @@ static void x264_mb_analyse_inter_p4x8( x264_t *h, x264_mb_analysis_t *a, int i8
         x264_macroblock_cache_mv( h, x4, y4, 1, 2, 0, m->mv[0], m->mv[1] );
     }
     a->l0.i_cost4x8[i8x8] = a->l0.me4x8[i8x8][0].cost + a->l0.me4x8[i8x8][1].cost +
-                            REF_COST( 0, a->l0.me8x8[i8x8].i_ref ) +
+                            REF_COST( 0, i_ref ) +
                             a->i_lambda * i_sub_mb_p_cost_table[D_L0_4x8];
     if( h->mb.b_chroma_me )
         a->l0.i_cost4x8[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_4x8 );

@@ -1208,7 +1209,7 @@ static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
     for( i_ref = 0; i_ref < h->i_ref0; i_ref++ )
     {
         /* search with ref */
-        LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, 0 );
+        LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 0 );
         x264_mb_predict_mv_16x16( h, 0, i_ref, m.mvp );
         x264_mb_predict_mv_ref16x16( h, 0, i_ref, mvc, &i_mvc );
         x264_me_search_ref( h, &m, mvc, i_mvc, p_halfpel_thresh );

@@ -1236,7 +1237,7 @@ static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
     for( i_ref = 0; i_ref < h->i_ref1; i_ref++ )
     {
         /* search with ref */
-        LOAD_HPELS( &m, h->mb.pic.p_fref[1][i_ref], 0, 0 );
+        LOAD_HPELS( &m, h->mb.pic.p_fref[1][i_ref], 1, i_ref, 0, 0 );
         x264_mb_predict_mv_16x16( h, 1, i_ref, m.mvp );
         x264_mb_predict_mv_ref16x16( h, 1, i_ref, mvc, &i_mvc );
         x264_me_search_ref( h, &m, mvc, i_mvc, p_halfpel_thresh );

@@ -1467,7 +1468,7 @@ static void x264_mb_analyse_inter_b8x8( x264_t *h, x264_mb_analysis_t *a )
             m->p_cost_mv = a->p_cost_mv;

             LOAD_FENC( m, h->mb.pic.p_fenc, 8*x8, 8*y8 );
-            LOAD_HPELS( m, p_fref[l], 8*x8, 8*y8 );
+            LOAD_HPELS( m, p_fref[l], l, lX->i_ref, 8*x8, 8*y8 );

             x264_mb_predict_mv( h, l, 4*i, 2, m->mvp );
             x264_me_search( h, m, &lX->me16x16.mv, 1 );

@@ -1556,7 +1557,7 @@ static void x264_mb_analyse_inter_b16x8( x264_t *h, x264_mb_analysis_t *a )
             m->p_cost_mv = a->p_cost_mv;

             LOAD_FENC( m, h->mb.pic.p_fenc, 0, 8*i );
-            LOAD_HPELS( m, p_fref[l], 0, 8*i );
+            LOAD_HPELS( m, p_fref[l], l, lX->i_ref, 0, 8*i );

             mvc[0][0] = lX->me8x8[2*i].mv[0];
             mvc[0][1] = lX->me8x8[2*i].mv[1];

@@ -1640,7 +1641,7 @@ static void x264_mb_analyse_inter_b8x16( x264_t *h, x264_mb_analysis_t *a )
             m->p_cost_mv = a->p_cost_mv;

             LOAD_FENC( m, h->mb.pic.p_fenc, 8*i, 0 );
-            LOAD_HPELS( m, p_fref[l], 8*i, 0 );
+            LOAD_HPELS( m, p_fref[l], l, lX->i_ref, 8*i, 0 );

             mvc[0][0] = lX->me8x8[i].mv[0];
             mvc[0][1] = lX->me8x8[i].mv[1];
@@ -263,11 +263,32 @@ me_hex2:
             const int min_y = X264_MAX( bmy - i_me_range, mv_y_min);
             const int max_x = X264_MIN( bmx + i_me_range, mv_x_max);
             const int max_y = X264_MIN( bmy + i_me_range, mv_y_max);
-            for( omy = min_y; omy <= max_y; omy++ )
-                for( omx = min_x; omx <= max_x; omx++ )
+            int mx, my;
+#if 0
+            /* plain old exhaustive search */
+            for( my = min_y; my <= max_y; my++ )
+                for( mx = min_x; mx <= max_x; mx++ )
+                    COST_MV( mx, my );
+#else
+            /* successive elimination by comparing DC before a full SAD,
+             * because sum(abs(diff)) >= abs(diff(sum)). */
+            const int stride = m->i_stride[0];
+            const int dw = x264_pixel_size[i_pixel].w;
+            const int dh = x264_pixel_size[i_pixel].h * stride;
+            static uint8_t zero[16*16] = {0,};
+            const int enc_dc = h->pixf.sad[i_pixel]( m->p_fenc[0], stride, zero, 16 );
+            const uint16_t *integral_base = &m->integral[ -1 - 1*stride ];
+            for( my = min_y; my <= max_y; my++ )
+                for( mx = min_x; mx <= max_x; mx++ )
                 {
-                    COST_MV( omx, omy );
+                    const uint16_t *integral = &integral_base[ mx + my * stride ];
+                    const uint16_t ref_dc = integral[ 0 ] + integral[ dh + dw ]
+                                          - integral[ dw ] - integral[ dh ];
+                    if( abs( ref_dc - enc_dc ) < bcost - p_cost_mvx[ (mx)<<2 ] - p_cost_mvy[ (my)<<2 ] )
+                        COST_MV( mx, my );
                 }
+#endif
         }
         break;
     }
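A worked reading of the pruning test above (only the inequality is added here; everything else is taken straight from the hunk): enc_dc is the SAD of the source block against the static all-zero block, i.e. the block's sample sum, and ref_dc is the candidate block's sum reconstructed from four integral-image entries, with dw the block width and dh the block height scaled by the stride, so the offsets 0, dw, dh, dh+dw are the corners around the block one sample up and to the left (integral_base = &m->integral[ -1 - 1*stride ]). Since

    SAD( enc, ref(mx,my) )                 >= | enc_dc - ref_dc |
    SAD( enc, ref(mx,my) ) + mvcost(mx,my) >= | enc_dc - ref_dc | + mvcost(mx,my)

a candidate can only beat the current best total cost bcost when | ref_dc - enc_dc | < bcost - p_cost_mvx[ mx<<2 ] - p_cost_mvy[ my<<2 ], which is exactly the guard placed before COST_MV. The <<2 matches the quarterpel indexing the MV-cost terms inside bcost already use, and the 16x16 zero block with stride 16 is large enough for every partition size, so one enc_dc setup serves them all.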
@@ -36,6 +36,7 @@ typedef struct
     uint8_t *p_fref[6];
     uint8_t *p_fenc[3];
+    uint16_t *integral;
     int i_stride[2];

     int mvp[2];