Commit 56108cb6 authored by Fiona Glaser's avatar Fiona Glaser

Use aligned memcpy for x264_me_t struct and cosmetics

parent dba0e5a2
......@@ -1010,7 +1010,7 @@ static void x264_mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a )
i_halfpel_thresh += i_ref_cost;
if( m.cost < a->l0.me16x16.cost )
a->l0.me16x16 = m;
h->mc.memcpy_aligned( &a->l0.me16x16, &m, sizeof(x264_me_t) );
/* save mv for predicting neighbors */
*(uint32_t*)a->l0.mvc[i_ref][0] =
......@@ -1072,22 +1072,22 @@ static void x264_mb_analyse_inter_p8x8_mixed_ref( x264_t *h, x264_mb_analysis_t
l0m->cost = INT_MAX;
for( i_ref = 0; i_ref <= i_maxref; i_ref++ )
{
const int i_ref_cost = REF_COST( 0, i_ref );
i_halfpel_thresh -= i_ref_cost;
m.i_ref_cost = i_ref_cost;
m.i_ref = i_ref;
LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*x8, 8*y8 );
x264_macroblock_cache_ref( h, 2*x8, 2*y8, 2, 2, 0, i_ref );
x264_mb_predict_mv( h, 0, 4*i, 2, m.mvp );
x264_me_search_ref( h, &m, a->l0.mvc[i_ref], i+1, p_halfpel_thresh );
m.cost += i_ref_cost;
i_halfpel_thresh += i_ref_cost;
*(uint32_t*)a->l0.mvc[i_ref][i+1] = *(uint32_t*)m.mv;
if( m.cost < l0m->cost )
*l0m = m;
const int i_ref_cost = REF_COST( 0, i_ref );
i_halfpel_thresh -= i_ref_cost;
m.i_ref_cost = i_ref_cost;
m.i_ref = i_ref;
LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*x8, 8*y8 );
x264_macroblock_cache_ref( h, 2*x8, 2*y8, 2, 2, 0, i_ref );
x264_mb_predict_mv( h, 0, 4*i, 2, m.mvp );
x264_me_search_ref( h, &m, a->l0.mvc[i_ref], i+1, p_halfpel_thresh );
m.cost += i_ref_cost;
i_halfpel_thresh += i_ref_cost;
*(uint32_t*)a->l0.mvc[i_ref][i+1] = *(uint32_t*)m.mv;
if( m.cost < l0m->cost )
h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
}
x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, 0, l0m->mv );
x264_macroblock_cache_ref( h, 2*x8, 2*y8, 2, 2, 0, l0m->i_ref );
......@@ -1176,25 +1176,25 @@ static void x264_mb_analyse_inter_p16x8( x264_t *h, x264_mb_analysis_t *a )
l0m->cost = INT_MAX;
for( j = 0; j < i_ref8s; j++ )
{
const int i_ref = ref8[j];
const int i_ref_cost = REF_COST( 0, i_ref );
m.i_ref_cost = i_ref_cost;
m.i_ref = i_ref;
const int i_ref = ref8[j];
const int i_ref_cost = REF_COST( 0, i_ref );
m.i_ref_cost = i_ref_cost;
m.i_ref = i_ref;
/* if we skipped the 16x16 predictor, we wouldn't have to copy anything... */
*(uint32_t*)mvc[0] = *(uint32_t*)a->l0.mvc[i_ref][0];
*(uint32_t*)mvc[1] = *(uint32_t*)a->l0.mvc[i_ref][2*i+1];
*(uint32_t*)mvc[2] = *(uint32_t*)a->l0.mvc[i_ref][2*i+2];
/* if we skipped the 16x16 predictor, we wouldn't have to copy anything... */
*(uint32_t*)mvc[0] = *(uint32_t*)a->l0.mvc[i_ref][0];
*(uint32_t*)mvc[1] = *(uint32_t*)a->l0.mvc[i_ref][2*i+1];
*(uint32_t*)mvc[2] = *(uint32_t*)a->l0.mvc[i_ref][2*i+2];
LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 8*i );
x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, 0, i_ref );
x264_mb_predict_mv( h, 0, 8*i, 4, m.mvp );
x264_me_search( h, &m, mvc, 3 );
LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 8*i );
x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, 0, i_ref );
x264_mb_predict_mv( h, 0, 8*i, 4, m.mvp );
x264_me_search( h, &m, mvc, 3 );
m.cost += i_ref_cost;
m.cost += i_ref_cost;
if( m.cost < l0m->cost )
*l0m = m;
if( m.cost < l0m->cost )
h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
}
x264_macroblock_cache_mv_ptr( h, 0, 2*i, 4, 2, 0, l0m->mv );
x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, 0, l0m->i_ref );
......@@ -1226,24 +1226,24 @@ static void x264_mb_analyse_inter_p8x16( x264_t *h, x264_mb_analysis_t *a )
l0m->cost = INT_MAX;
for( j = 0; j < i_ref8s; j++ )
{
const int i_ref = ref8[j];
const int i_ref_cost = REF_COST( 0, i_ref );
m.i_ref_cost = i_ref_cost;
m.i_ref = i_ref;
const int i_ref = ref8[j];
const int i_ref_cost = REF_COST( 0, i_ref );
m.i_ref_cost = i_ref_cost;
m.i_ref = i_ref;
*(uint32_t*)mvc[0] = *(uint32_t*)a->l0.mvc[i_ref][0];
*(uint32_t*)mvc[1] = *(uint32_t*)a->l0.mvc[i_ref][i+1];
*(uint32_t*)mvc[2] = *(uint32_t*)a->l0.mvc[i_ref][i+3];
*(uint32_t*)mvc[0] = *(uint32_t*)a->l0.mvc[i_ref][0];
*(uint32_t*)mvc[1] = *(uint32_t*)a->l0.mvc[i_ref][i+1];
*(uint32_t*)mvc[2] = *(uint32_t*)a->l0.mvc[i_ref][i+3];
LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*i, 0 );
x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, 0, i_ref );
x264_mb_predict_mv( h, 0, 4*i, 2, m.mvp );
x264_me_search( h, &m, mvc, 3 );
LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*i, 0 );
x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, 0, i_ref );
x264_mb_predict_mv( h, 0, 4*i, 2, m.mvp );
x264_me_search( h, &m, mvc, 3 );
m.cost += i_ref_cost;
m.cost += i_ref_cost;
if( m.cost < l0m->cost )
*l0m = m;
if( m.cost < l0m->cost )
h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
}
x264_macroblock_cache_mv_ptr( h, 2*i, 0, 2, 4, 0, l0m->mv );
x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, 0, l0m->i_ref );
......@@ -1467,7 +1467,7 @@ static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
if( m.cost < a->l0.me16x16.cost )
{
a->l0.i_ref = i_ref;
a->l0.me16x16 = m;
h->mc.memcpy_aligned( &a->l0.me16x16, &m, sizeof(x264_me_t) );
}
/* save mv for predicting neighbors */
......@@ -1494,7 +1494,7 @@ static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
if( m.cost < a->l1.me16x16.cost )
{
a->l1.i_ref = i_ref;
a->l1.me16x16 = m;
h->mc.memcpy_aligned( &a->l1.me16x16, &m, sizeof(x264_me_t) );
}
/* save mv for predicting neighbors */
......
......@@ -45,7 +45,7 @@ typedef struct
int cost_mv; /* lambda * nbits for the chosen mv */
int cost; /* satd + lambda * nbits */
DECLARE_ALIGNED_4( int16_t mv[2] );
} x264_me_t;
} DECLARE_ALIGNED_16( x264_me_t );
void x264_me_search_ref( x264_t *h, x264_me_t *m, int16_t (*mvc)[2], int i_mvc, int *p_fullpel_thresh );
static inline void x264_me_search( x264_t *h, x264_me_t *m, int16_t (*mvc)[2], int i_mvc )
......
......@@ -89,13 +89,9 @@ int x264_slicetype_mb_cost( x264_t *h, x264_mb_analysis_t *a,
}
#define SAVE_MVS( mv0, mv1 ) \
{ \
fenc->mv[0][i_mb_xy][0] = mv0[0]; \
fenc->mv[0][i_mb_xy][1] = mv0[1]; \
*(uint32_t*)fenc->mv[0][i_mb_xy] = *(uint32_t*)mv0; \
if( b_bidir ) \
{ \
fenc->mv[1][i_mb_xy][0] = mv1[0]; \
fenc->mv[1][i_mb_xy][1] = mv1[1]; \
} \
*(uint32_t*)fenc->mv[1][i_mb_xy] = *(uint32_t*)mv1; \
}
#define CLIP_MV( mv ) \
{ \
......@@ -133,7 +129,7 @@ int x264_slicetype_mb_cost( x264_t *h, x264_mb_analysis_t *a,
int dmv[2][2];
int mv0[2] = {0,0};
m[1] = m[0];
h->mc.memcpy_aligned( &m[1], &m[0], sizeof(x264_me_t) );
LOAD_HPELS_LUMA( m[1].p_fref, fref1->lowres );
dmv[0][0] = ( mvr[0] * dist_scale_factor + 128 ) >> 8;
......@@ -144,7 +140,7 @@ int x264_slicetype_mb_cost( x264_t *h, x264_mb_analysis_t *a,
CLIP_MV( dmv[1] );
TRY_BIDIR( dmv[0], dmv[1], 0 );
if( dmv[0][0] || dmv[0][1] || dmv[1][0] || dmv[1][1] )
if( dmv[0][0] | dmv[0][1] | dmv[1][0] | dmv[1][1] )
TRY_BIDIR( mv0, mv0, 0 );
// if( i_bcost < 60 ) // arbitrary threshold
// return i_bcost;
......@@ -153,10 +149,10 @@ int x264_slicetype_mb_cost( x264_t *h, x264_mb_analysis_t *a,
i_cost_bak = i_bcost;
for( l = 0; l < 1 + b_bidir; l++ )
{
int16_t mvc[4][2] = {{0}};
DECLARE_ALIGNED_4(int16_t mvc[4][2]) = {{0}};
int i_mvc = 0;
int16_t (*fenc_mv)[2] = &fenc->mv[l][i_mb_xy];
#define MVC(mv) { mvc[i_mvc][0] = mv[0]; mvc[i_mvc][1] = mv[1]; i_mvc++; }
#define MVC(mv) { *(uint32_t*)mvc[i_mvc] = *(uint32_t*)mv; i_mvc++; }
if( i_mb_x > 0 )
MVC(fenc_mv[-1]);
if( i_mb_y > 0 )
......@@ -172,12 +168,12 @@ int x264_slicetype_mb_cost( x264_t *h, x264_mb_analysis_t *a,
x264_me_search( h, &m[l], mvc, i_mvc );
m[l].cost -= 2; // remove mvcost from skip mbs
if( m[l].mv[0] || m[l].mv[1] )
if( *(uint32_t*)m[l].mv )
m[l].cost += 5;
i_bcost = X264_MIN( i_bcost, m[l].cost );
}
if( b_bidir && (m[0].mv[0] || m[0].mv[1] || m[1].mv[0] || m[1].mv[1]) )
if( b_bidir && ( *(uint32_t*)m[0].mv || *(uint32_t*)m[1].mv ) )
TRY_BIDIR( m[0].mv, m[1].mv, 5 );
if( i_bcost < i_cost_bak )
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment