Commit b2d78b5c authored by Loren Merritt's avatar Loren Merritt

Save some memcopies in halfpel ME.

Patch by Radek Czyz.


git-svn-id: svn://svn.videolan.org/x264/trunk@124 df754926-b1dd-0310-bc7b-ec298dee348c
parent 46141bf2
......@@ -794,6 +794,8 @@ static void x264_mb_analyse_inter_direct( x264_t *h, x264_mb_analysis_t *a )
static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
{
uint8_t pix1[16*16], pix2[16*16];
uint8_t *src2;
int stride2 = 16;
x264_me_t m;
int i_ref;
......@@ -865,15 +867,32 @@ static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, a->l1.i_ref );
/* get cost of BI mode */
h->mc.mc_luma( h->mb.pic.p_fref[0][a->l0.i_ref], h->mb.pic.i_stride[0],
pix1, 16,
a->l0.me16x16.mv[0], a->l0.me16x16.mv[1],
16, 16 );
h->mc.mc_luma( h->mb.pic.p_fref[1][a->l1.i_ref], h->mb.pic.i_stride[0],
pix2, 16,
a->l1.me16x16.mv[0], a->l1.me16x16.mv[1],
16, 16 );
h->pixf.avg[PIXEL_16x16]( pix1, 16, pix2, 16 );
if ( ((a->l0.me16x16.mv[0] | a->l0.me16x16.mv[1]) & 1) == 0 )
{
/* l0 reference is halfpel, so get_ref on it will make it faster */
src2 = h->mc.get_ref( h->mb.pic.p_fref[0][a->l0.i_ref], h->mb.pic.i_stride[0],
pix2, &stride2,
a->l0.me16x16.mv[0], a->l0.me16x16.mv[1],
16, 16 );
h->mc.mc_luma( h->mb.pic.p_fref[1][a->l1.i_ref], h->mb.pic.i_stride[0],
pix1, 16,
a->l1.me16x16.mv[0], a->l1.me16x16.mv[1],
16, 16 );
}
else
{
/* if l0 was qpel, we'll use get_ref on l1 instead */
h->mc.mc_luma( h->mb.pic.p_fref[0][a->l0.i_ref], h->mb.pic.i_stride[0],
pix1, 16,
a->l0.me16x16.mv[0], a->l0.me16x16.mv[1],
16, 16 );
src2 = h->mc.get_ref( h->mb.pic.p_fref[1][a->l1.i_ref], h->mb.pic.i_stride[0],
pix2, &stride2,
a->l1.me16x16.mv[0], a->l1.me16x16.mv[1],
16, 16 );
}
h->pixf.avg[PIXEL_16x16]( pix1, 16, src2, stride2 );
a->i_cost16x16bi = h->pixf.satd[PIXEL_16x16]( h->mb.pic.p_fenc[0], h->mb.pic.i_stride[0], pix1, 16 ) +
a->i_lambda * ( bs_size_te( h->sh.i_num_ref_idx_l0_active - 1, a->l0.i_ref ) +
......
......@@ -187,6 +187,8 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite
const int bh = x264_pixel_size[m->i_pixel].h;
DECLARE_ALIGNED( uint8_t, pix[4][16*16], 16 );
uint8_t * src[4];
int stride[4];
int cost[4];
int best;
int step, i;
......@@ -198,18 +200,19 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite
{
for( i = step>1 ? hpel_iters : qpel_iters; i > 0; i-- )
{
h->mc.mc_luma( m->p_fref, m->i_stride, pix[0], 16, bmx + 0, bmy - step, bw, bh );
h->mc.mc_luma( m->p_fref, m->i_stride, pix[1], 16, bmx + 0, bmy + step, bw, bh );
h->mc.mc_luma( m->p_fref, m->i_stride, pix[2], 16, bmx - step, bmy + 0, bw, bh );
h->mc.mc_luma( m->p_fref, m->i_stride, pix[3], 16, bmx + step, bmy + 0, bw, bh );
stride[0] = stride[1] = stride[2] = stride[3] = 16;
src[0] = h->mc.get_ref( m->p_fref, m->i_stride, pix[0], &stride[0], bmx + 0, bmy - step, bw, bh );
src[1] = h->mc.get_ref( m->p_fref, m->i_stride, pix[1], &stride[1], bmx + 0, bmy + step, bw, bh );
src[2] = h->mc.get_ref( m->p_fref, m->i_stride, pix[2], &stride[2], bmx - step, bmy + 0, bw, bh );
src[3] = h->mc.get_ref( m->p_fref, m->i_stride, pix[3], &stride[3], bmx + step, bmy + 0, bw, bh );
cost[0] = h->pixf.satd[m->i_pixel]( m->p_fenc, m->i_stride, pix[0], 16 ) +
cost[0] = h->pixf.satd[m->i_pixel]( m->p_fenc, m->i_stride, src[0], stride[0] ) +
m->lm * ( bs_size_se( bmx + 0 - m->mvp[0] ) + bs_size_se( bmy - step - m->mvp[1] ) );
cost[1] = h->pixf.satd[m->i_pixel]( m->p_fenc, m->i_stride, pix[1], 16 ) +
cost[1] = h->pixf.satd[m->i_pixel]( m->p_fenc, m->i_stride, src[1], stride[1] ) +
m->lm * ( bs_size_se( bmx + 0 - m->mvp[0] ) + bs_size_se( bmy + step - m->mvp[1] ) );
cost[2] = h->pixf.satd[m->i_pixel]( m->p_fenc, m->i_stride, pix[2], 16 ) +
cost[2] = h->pixf.satd[m->i_pixel]( m->p_fenc, m->i_stride, src[2], stride[2] ) +
m->lm * ( bs_size_se( bmx - step - m->mvp[0] ) + bs_size_se( bmy + 0 - m->mvp[1] ) );
cost[3] = h->pixf.satd[m->i_pixel]( m->p_fenc, m->i_stride, pix[3], 16 ) +
cost[3] = h->pixf.satd[m->i_pixel]( m->p_fenc, m->i_stride, src[3], stride[3] ) +
m->lm * ( bs_size_se( bmx + step - m->mvp[0] ) + bs_size_se( bmy + 0 - m->mvp[1] ) );
best = 0;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment