Commit 56108cb6 authored by Fiona Glaser

Use aligned memcpy for x264_me_t struct and cosmetics

parent dba0e5a2
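The change itself is simple: x264_me_t becomes a 16-byte-aligned type (see the me.h hunk below), so the struct copies the analysis code used to do with plain assignment (a->l0.me16x16 = m, *l0m = m, m[1] = m[0]) can instead go through h->mc.memcpy_aligned, which may assume that source, destination and size are all multiples of 16. As a rough, hypothetical illustration of what such a copy can look like under that assumption (x264's real routine is selected at runtime and may be hand-written asm; this sketch is not it):

#include <emmintrin.h>   /* SSE2 */
#include <stddef.h>

/* Hypothetical sketch only: copy n bytes where dst and src are 16-byte
 * aligned and n is a multiple of 16, so whole aligned 128-bit loads and
 * stores can be used with no unaligned access or tail handling. */
static void memcpy_aligned_sketch( void *dst, const void *src, size_t n )
{
    __m128i       *d = (__m128i*)dst;
    const __m128i *s = (const __m128i*)src;
    size_t i;
    for( i = 0; i < n/16; i++ )
        _mm_store_si128( &d[i], _mm_load_si128( &s[i] ) );
}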
@@ -1010,7 +1010,7 @@ static void x264_mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a )
         i_halfpel_thresh += i_ref_cost;
 
         if( m.cost < a->l0.me16x16.cost )
-            a->l0.me16x16 = m;
+            h->mc.memcpy_aligned( &a->l0.me16x16, &m, sizeof(x264_me_t) );
 
         /* save mv for predicting neighbors */
         *(uint32_t*)a->l0.mvc[i_ref][0] =
@@ -1072,22 +1072,22 @@ static void x264_mb_analyse_inter_p8x8_mixed_ref( x264_t *h, x264_mb_analysis_t *a )
         l0m->cost = INT_MAX;
         for( i_ref = 0; i_ref <= i_maxref; i_ref++ )
         {
             const int i_ref_cost = REF_COST( 0, i_ref );
             i_halfpel_thresh -= i_ref_cost;
             m.i_ref_cost = i_ref_cost;
             m.i_ref = i_ref;
 
             LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*x8, 8*y8 );
             x264_macroblock_cache_ref( h, 2*x8, 2*y8, 2, 2, 0, i_ref );
             x264_mb_predict_mv( h, 0, 4*i, 2, m.mvp );
             x264_me_search_ref( h, &m, a->l0.mvc[i_ref], i+1, p_halfpel_thresh );
 
             m.cost += i_ref_cost;
             i_halfpel_thresh += i_ref_cost;
             *(uint32_t*)a->l0.mvc[i_ref][i+1] = *(uint32_t*)m.mv;
 
             if( m.cost < l0m->cost )
-                *l0m = m;
+                h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
         }
         x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, 0, l0m->mv );
         x264_macroblock_cache_ref( h, 2*x8, 2*y8, 2, 2, 0, l0m->i_ref );
@@ -1176,25 +1176,25 @@ static void x264_mb_analyse_inter_p16x8( x264_t *h, x264_mb_analysis_t *a )
         l0m->cost = INT_MAX;
         for( j = 0; j < i_ref8s; j++ )
         {
             const int i_ref = ref8[j];
             const int i_ref_cost = REF_COST( 0, i_ref );
             m.i_ref_cost = i_ref_cost;
             m.i_ref = i_ref;
 
             /* if we skipped the 16x16 predictor, we wouldn't have to copy anything... */
             *(uint32_t*)mvc[0] = *(uint32_t*)a->l0.mvc[i_ref][0];
             *(uint32_t*)mvc[1] = *(uint32_t*)a->l0.mvc[i_ref][2*i+1];
             *(uint32_t*)mvc[2] = *(uint32_t*)a->l0.mvc[i_ref][2*i+2];
 
             LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 8*i );
             x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, 0, i_ref );
             x264_mb_predict_mv( h, 0, 8*i, 4, m.mvp );
             x264_me_search( h, &m, mvc, 3 );
 
             m.cost += i_ref_cost;
 
             if( m.cost < l0m->cost )
-                *l0m = m;
+                h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
         }
         x264_macroblock_cache_mv_ptr( h, 0, 2*i, 4, 2, 0, l0m->mv );
         x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, 0, l0m->i_ref );
@@ -1226,24 +1226,24 @@ static void x264_mb_analyse_inter_p8x16( x264_t *h, x264_mb_analysis_t *a )
         l0m->cost = INT_MAX;
         for( j = 0; j < i_ref8s; j++ )
         {
             const int i_ref = ref8[j];
             const int i_ref_cost = REF_COST( 0, i_ref );
             m.i_ref_cost = i_ref_cost;
             m.i_ref = i_ref;
 
             *(uint32_t*)mvc[0] = *(uint32_t*)a->l0.mvc[i_ref][0];
             *(uint32_t*)mvc[1] = *(uint32_t*)a->l0.mvc[i_ref][i+1];
             *(uint32_t*)mvc[2] = *(uint32_t*)a->l0.mvc[i_ref][i+3];
 
             LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*i, 0 );
             x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, 0, i_ref );
             x264_mb_predict_mv( h, 0, 4*i, 2, m.mvp );
             x264_me_search( h, &m, mvc, 3 );
 
             m.cost += i_ref_cost;
 
             if( m.cost < l0m->cost )
-                *l0m = m;
+                h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
         }
         x264_macroblock_cache_mv_ptr( h, 2*i, 0, 2, 4, 0, l0m->mv );
         x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, 0, l0m->i_ref );
@@ -1467,7 +1467,7 @@ static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
         if( m.cost < a->l0.me16x16.cost )
         {
             a->l0.i_ref = i_ref;
-            a->l0.me16x16 = m;
+            h->mc.memcpy_aligned( &a->l0.me16x16, &m, sizeof(x264_me_t) );
         }
 
         /* save mv for predicting neighbors */
@@ -1494,7 +1494,7 @@ static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
         if( m.cost < a->l1.me16x16.cost )
         {
             a->l1.i_ref = i_ref;
-            a->l1.me16x16 = m;
+            h->mc.memcpy_aligned( &a->l1.me16x16, &m, sizeof(x264_me_t) );
         }
 
         /* save mv for predicting neighbors */
......
@@ -45,7 +45,7 @@ typedef struct
     int cost_mv;    /* lambda * nbits for the chosen mv */
     int cost;       /* satd + lambda * nbits */
     DECLARE_ALIGNED_4( int16_t mv[2] );
-} x264_me_t;
+} DECLARE_ALIGNED_16( x264_me_t );
 
 void x264_me_search_ref( x264_t *h, x264_me_t *m, int16_t (*mvc)[2], int i_mvc, int *p_fullpel_thresh );
 static inline void x264_me_search( x264_t *h, x264_me_t *m, int16_t (*mvc)[2], int i_mvc )
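DECLARE_ALIGNED_16 applied to the typedef name is what makes the memcpy_aligned calls above safe. A sketch of the kind of macro this is assumed to be (x264 defines the real ones in its own headers, with a separate MSVC form; only the GCC-style attribute is shown here):

/* Assumed GCC-style definitions, for illustration only: */
#define DECLARE_ALIGNED( var, n )  var __attribute__((aligned(n)))
#define DECLARE_ALIGNED_4( var )   DECLARE_ALIGNED( var, 4 )
#define DECLARE_ALIGNED_16( var )  DECLARE_ALIGNED( var, 16 )

/* The typedef above then expands to roughly
 *     typedef struct { ... } x264_me_t __attribute__((aligned(16)));
 * so every x264_me_t object is 16-byte aligned and sizeof(x264_me_t) is
 * rounded up to a multiple of 16, which is exactly what an aligned
 * whole-struct copy relies on. */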
......
@@ -89,13 +89,9 @@ int x264_slicetype_mb_cost( x264_t *h, x264_mb_analysis_t *a,
 }
 #define SAVE_MVS( mv0, mv1 ) \
 { \
-    fenc->mv[0][i_mb_xy][0] = mv0[0]; \
-    fenc->mv[0][i_mb_xy][1] = mv0[1]; \
+    *(uint32_t*)fenc->mv[0][i_mb_xy] = *(uint32_t*)mv0; \
     if( b_bidir ) \
-    { \
-        fenc->mv[1][i_mb_xy][0] = mv1[0]; \
-        fenc->mv[1][i_mb_xy][1] = mv1[1]; \
-    } \
+        *(uint32_t*)fenc->mv[1][i_mb_xy] = *(uint32_t*)mv1; \
 }
 #define CLIP_MV( mv ) \
 { \
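The SAVE_MVS rewrite uses the same trick that appears throughout this commit: a motion vector is a pair of int16_t, so once the pair is 4-byte aligned both components can be copied or tested with a single 32-bit access. A small self-contained illustration (the helper names are made up for this example):

#include <stdint.h>

/* Copy both MV components in one 32-bit store, assuming the pair is
 * 4-byte aligned; equivalent to dst[0] = src[0]; dst[1] = src[1]; */
static void copy_mv( int16_t dst[2], const int16_t src[2] )
{
    *(uint32_t*)dst = *(uint32_t*)src;
}

/* Nonzero iff mv[0] != 0 || mv[1] != 0, tested with one comparison. */
static int mv_is_nonzero( const int16_t mv[2] )
{
    return *(uint32_t*)mv != 0;
}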
@@ -133,7 +129,7 @@ int x264_slicetype_mb_cost( x264_t *h, x264_mb_analysis_t *a,
         int dmv[2][2];
         int mv0[2] = {0,0};
 
-        m[1] = m[0];
+        h->mc.memcpy_aligned( &m[1], &m[0], sizeof(x264_me_t) );
         LOAD_HPELS_LUMA( m[1].p_fref, fref1->lowres );
 
         dmv[0][0] = ( mvr[0] * dist_scale_factor + 128 ) >> 8;
@@ -144,7 +140,7 @@ int x264_slicetype_mb_cost( x264_t *h, x264_mb_analysis_t *a,
         CLIP_MV( dmv[1] );
 
         TRY_BIDIR( dmv[0], dmv[1], 0 );
-        if( dmv[0][0] || dmv[0][1] || dmv[1][0] || dmv[1][1] )
+        if( dmv[0][0] | dmv[0][1] | dmv[1][0] | dmv[1][1] )
             TRY_BIDIR( mv0, mv0, 0 );
 //      if( i_bcost < 60 ) // arbitrary threshold
 //          return i_bcost;
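The other change in this hunk swaps || for |. Since the four components are only being tested for "any nonzero", a bitwise OR gives the same condition while letting the compiler emit straight-line ORs instead of a chain of short-circuit branches. A toy check (not x264 code):

#include <stdio.h>

int main( void )
{
    int dmv[2][2] = { { 0, -3 }, { 0, 0 } };
    int with_branches = ( dmv[0][0] || dmv[0][1] || dmv[1][0] || dmv[1][1] );
    int branchless    = ( dmv[0][0] |  dmv[0][1] |  dmv[1][0] |  dmv[1][1] ) != 0;
    printf( "%d %d\n", with_branches, branchless );   /* prints: 1 1 */
    return 0;
}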
@@ -153,10 +149,10 @@ int x264_slicetype_mb_cost( x264_t *h, x264_mb_analysis_t *a,
     i_cost_bak = i_bcost;
     for( l = 0; l < 1 + b_bidir; l++ )
     {
-        int16_t mvc[4][2] = {{0}};
+        DECLARE_ALIGNED_4(int16_t mvc[4][2]) = {{0}};
         int i_mvc = 0;
         int16_t (*fenc_mv)[2] = &fenc->mv[l][i_mb_xy];
-#define MVC(mv) { mvc[i_mvc][0] = mv[0]; mvc[i_mvc][1] = mv[1]; i_mvc++; }
+#define MVC(mv) { *(uint32_t*)mvc[i_mvc] = *(uint32_t*)mv; i_mvc++; }
         if( i_mb_x > 0 )
             MVC(fenc_mv[-1]);
         if( i_mb_y > 0 )
@@ -172,12 +168,12 @@ int x264_slicetype_mb_cost( x264_t *h, x264_mb_analysis_t *a,
         x264_me_search( h, &m[l], mvc, i_mvc );
         m[l].cost -= 2; // remove mvcost from skip mbs
-        if( m[l].mv[0] || m[l].mv[1] )
+        if( *(uint32_t*)m[l].mv )
             m[l].cost += 5;
         i_bcost = X264_MIN( i_bcost, m[l].cost );
     }
 
-    if( b_bidir && (m[0].mv[0] || m[0].mv[1] || m[1].mv[0] || m[1].mv[1]) )
+    if( b_bidir && ( *(uint32_t*)m[0].mv || *(uint32_t*)m[1].mv ) )
         TRY_BIDIR( m[0].mv, m[1].mv, 5 );
 
     if( i_bcost < i_cost_bak )
......