Commit 80a3909c authored by Fiona Glaser

Motion compensation optimizations

Turning off inlining saves a whole boatload of code size for near-zero speed cost.
Simplify offset calculation.
Various other optimizations.
parent 9ef68adb
...@@ -462,62 +462,62 @@ static void setup_inverse_delta_pocs( x264_t *h ) ...@@ -462,62 +462,62 @@ static void setup_inverse_delta_pocs( x264_t *h )
} }
} }
static inline void x264_mb_mc_0xywh( x264_t *h, int x, int y, int width, int height ) static NOINLINE void x264_mb_mc_0xywh( x264_t *h, int x, int y, int width, int height )
{ {
const int i8 = x264_scan8[0]+x+8*y; const int i8 = x264_scan8[0]+x+8*y;
const int i_ref = h->mb.cache.ref[0][i8]; const int i_ref = h->mb.cache.ref[0][i8];
const int mvx = x264_clip3( h->mb.cache.mv[0][i8][0], h->mb.mv_min[0], h->mb.mv_max[0] ); const int mvx = x264_clip3( h->mb.cache.mv[0][i8][0], h->mb.mv_min[0], h->mb.mv_max[0] ) + 4*4*x;
int mvy = x264_clip3( h->mb.cache.mv[0][i8][1], h->mb.mv_min[1], h->mb.mv_max[1] ); int mvy = x264_clip3( h->mb.cache.mv[0][i8][1], h->mb.mv_min[1], h->mb.mv_max[1] ) + 4*4*y;
h->mc.mc_luma( &h->mb.pic.p_fdec[0][4*y*FDEC_STRIDE+4*x], FDEC_STRIDE, h->mc.mc_luma( &h->mb.pic.p_fdec[0][4*y*FDEC_STRIDE+4*x], FDEC_STRIDE,
h->mb.pic.p_fref[0][i_ref], h->mb.pic.i_stride[0], h->mb.pic.p_fref[0][i_ref], h->mb.pic.i_stride[0],
mvx + 4*4*x, mvy + 4*4*y, 4*width, 4*height ); mvx, mvy, 4*width, 4*height );
// chroma is offset if MCing from a field of opposite parity // chroma is offset if MCing from a field of opposite parity
if( h->mb.b_interlaced & i_ref ) if( h->mb.b_interlaced & i_ref )
mvy += (h->mb.i_mb_y & 1)*4 - 2; mvy += (h->mb.i_mb_y & 1)*4 - 2;
h->mc.mc_chroma( &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, h->mc.mc_chroma( &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
&h->mb.pic.p_fref[0][i_ref][4][2*y*h->mb.pic.i_stride[1]+2*x], h->mb.pic.i_stride[1], h->mb.pic.p_fref[0][i_ref][4], h->mb.pic.i_stride[1],
mvx, mvy, 2*width, 2*height ); mvx, mvy, 2*width, 2*height );
h->mc.mc_chroma( &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, h->mc.mc_chroma( &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
&h->mb.pic.p_fref[0][i_ref][5][2*y*h->mb.pic.i_stride[2]+2*x], h->mb.pic.i_stride[2], h->mb.pic.p_fref[0][i_ref][5], h->mb.pic.i_stride[2],
mvx, mvy, 2*width, 2*height ); mvx, mvy, 2*width, 2*height );
} }
static inline void x264_mb_mc_1xywh( x264_t *h, int x, int y, int width, int height ) static NOINLINE void x264_mb_mc_1xywh( x264_t *h, int x, int y, int width, int height )
{ {
const int i8 = x264_scan8[0]+x+8*y; const int i8 = x264_scan8[0]+x+8*y;
const int i_ref = h->mb.cache.ref[1][i8]; const int i_ref = h->mb.cache.ref[1][i8];
const int mvx = x264_clip3( h->mb.cache.mv[1][i8][0], h->mb.mv_min[0], h->mb.mv_max[0] ); const int mvx = x264_clip3( h->mb.cache.mv[1][i8][0], h->mb.mv_min[0], h->mb.mv_max[0] ) + 4*4*x;
int mvy = x264_clip3( h->mb.cache.mv[1][i8][1], h->mb.mv_min[1], h->mb.mv_max[1] ); int mvy = x264_clip3( h->mb.cache.mv[1][i8][1], h->mb.mv_min[1], h->mb.mv_max[1] ) + 4*4*y;
h->mc.mc_luma( &h->mb.pic.p_fdec[0][4*y*FDEC_STRIDE+4*x], FDEC_STRIDE, h->mc.mc_luma( &h->mb.pic.p_fdec[0][4*y*FDEC_STRIDE+4*x], FDEC_STRIDE,
h->mb.pic.p_fref[1][i_ref], h->mb.pic.i_stride[0], h->mb.pic.p_fref[1][i_ref], h->mb.pic.i_stride[0],
mvx + 4*4*x, mvy + 4*4*y, 4*width, 4*height ); mvx, mvy, 4*width, 4*height );
if( h->mb.b_interlaced & i_ref ) if( h->mb.b_interlaced & i_ref )
mvy += (h->mb.i_mb_y & 1)*4 - 2; mvy += (h->mb.i_mb_y & 1)*4 - 2;
h->mc.mc_chroma( &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, h->mc.mc_chroma( &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
&h->mb.pic.p_fref[1][i_ref][4][2*y*h->mb.pic.i_stride[1]+2*x], h->mb.pic.i_stride[1], h->mb.pic.p_fref[1][i_ref][4], h->mb.pic.i_stride[1],
mvx, mvy, 2*width, 2*height ); mvx, mvy, 2*width, 2*height );
h->mc.mc_chroma( &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, h->mc.mc_chroma( &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
&h->mb.pic.p_fref[1][i_ref][5][2*y*h->mb.pic.i_stride[2]+2*x], h->mb.pic.i_stride[2], h->mb.pic.p_fref[1][i_ref][5], h->mb.pic.i_stride[2],
mvx, mvy, 2*width, 2*height ); mvx, mvy, 2*width, 2*height );
} }
static inline void x264_mb_mc_01xywh( x264_t *h, int x, int y, int width, int height ) static NOINLINE void x264_mb_mc_01xywh( x264_t *h, int x, int y, int width, int height )
{ {
const int i8 = x264_scan8[0]+x+8*y; const int i8 = x264_scan8[0]+x+8*y;
const int i_ref0 = h->mb.cache.ref[0][i8]; const int i_ref0 = h->mb.cache.ref[0][i8];
const int i_ref1 = h->mb.cache.ref[1][i8]; const int i_ref1 = h->mb.cache.ref[1][i8];
const int weight = h->mb.bipred_weight[i_ref0][i_ref1]; const int weight = h->mb.bipred_weight[i_ref0][i_ref1];
const int mvx0 = x264_clip3( h->mb.cache.mv[0][i8][0], h->mb.mv_min[0], h->mb.mv_max[0] ); const int mvx0 = x264_clip3( h->mb.cache.mv[0][i8][0], h->mb.mv_min[0], h->mb.mv_max[0] ) + 4*4*x;
const int mvx1 = x264_clip3( h->mb.cache.mv[1][i8][0], h->mb.mv_min[0], h->mb.mv_max[0] ); const int mvx1 = x264_clip3( h->mb.cache.mv[1][i8][0], h->mb.mv_min[0], h->mb.mv_max[0] ) + 4*4*x;
int mvy0 = x264_clip3( h->mb.cache.mv[0][i8][1], h->mb.mv_min[1], h->mb.mv_max[1] ); int mvy0 = x264_clip3( h->mb.cache.mv[0][i8][1], h->mb.mv_min[1], h->mb.mv_max[1] ) + 4*4*y;
int mvy1 = x264_clip3( h->mb.cache.mv[1][i8][1], h->mb.mv_min[1], h->mb.mv_max[1] ); int mvy1 = x264_clip3( h->mb.cache.mv[1][i8][1], h->mb.mv_min[1], h->mb.mv_max[1] ) + 4*4*y;
int i_mode = x264_size2pixel[height][width]; int i_mode = x264_size2pixel[height][width];
int i_stride0 = 16, i_stride1 = 16; int i_stride0 = 16, i_stride1 = 16;
ALIGNED_ARRAY_16( uint8_t, tmp0,[16*16] ); ALIGNED_ARRAY_16( uint8_t, tmp0,[16*16] );
...@@ -525,9 +525,9 @@ static inline void x264_mb_mc_01xywh( x264_t *h, int x, int y, int width, int he ...@@ -525,9 +525,9 @@ static inline void x264_mb_mc_01xywh( x264_t *h, int x, int y, int width, int he
uint8_t *src0, *src1; uint8_t *src0, *src1;
src0 = h->mc.get_ref( tmp0, &i_stride0, h->mb.pic.p_fref[0][i_ref0], h->mb.pic.i_stride[0], src0 = h->mc.get_ref( tmp0, &i_stride0, h->mb.pic.p_fref[0][i_ref0], h->mb.pic.i_stride[0],
mvx0 + 4*4*x, mvy0 + 4*4*y, 4*width, 4*height ); mvx0, mvy0, 4*width, 4*height );
src1 = h->mc.get_ref( tmp1, &i_stride1, h->mb.pic.p_fref[1][i_ref1], h->mb.pic.i_stride[0], src1 = h->mc.get_ref( tmp1, &i_stride1, h->mb.pic.p_fref[1][i_ref1], h->mb.pic.i_stride[0],
mvx1 + 4*4*x, mvy1 + 4*4*y, 4*width, 4*height ); mvx1, mvy1, 4*width, 4*height );
h->mc.avg[i_mode]( &h->mb.pic.p_fdec[0][4*y*FDEC_STRIDE+4*x], FDEC_STRIDE, h->mc.avg[i_mode]( &h->mb.pic.p_fdec[0][4*y*FDEC_STRIDE+4*x], FDEC_STRIDE,
src0, i_stride0, src1, i_stride1, weight ); src0, i_stride0, src1, i_stride1, weight );
...@@ -536,14 +536,14 @@ static inline void x264_mb_mc_01xywh( x264_t *h, int x, int y, int width, int he ...@@ -536,14 +536,14 @@ static inline void x264_mb_mc_01xywh( x264_t *h, int x, int y, int width, int he
if( h->mb.b_interlaced & i_ref1 ) if( h->mb.b_interlaced & i_ref1 )
mvy1 += (h->mb.i_mb_y & 1)*4 - 2; mvy1 += (h->mb.i_mb_y & 1)*4 - 2;
h->mc.mc_chroma( tmp0, 16, &h->mb.pic.p_fref[0][i_ref0][4][2*y*h->mb.pic.i_stride[1]+2*x], h->mb.pic.i_stride[1], h->mc.mc_chroma( tmp0, 16, h->mb.pic.p_fref[0][i_ref0][4], h->mb.pic.i_stride[1],
mvx0, mvy0, 2*width, 2*height ); mvx0, mvy0, 2*width, 2*height );
h->mc.mc_chroma( tmp1, 16, &h->mb.pic.p_fref[1][i_ref1][4][2*y*h->mb.pic.i_stride[1]+2*x], h->mb.pic.i_stride[1], h->mc.mc_chroma( tmp1, 16, h->mb.pic.p_fref[1][i_ref1][4], h->mb.pic.i_stride[1],
mvx1, mvy1, 2*width, 2*height ); mvx1, mvy1, 2*width, 2*height );
h->mc.avg[i_mode+3]( &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, tmp0, 16, tmp1, 16, weight ); h->mc.avg[i_mode+3]( &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, tmp0, 16, tmp1, 16, weight );
h->mc.mc_chroma( tmp0, 16, &h->mb.pic.p_fref[0][i_ref0][5][2*y*h->mb.pic.i_stride[2]+2*x], h->mb.pic.i_stride[2], h->mc.mc_chroma( tmp0, 16, h->mb.pic.p_fref[0][i_ref0][5], h->mb.pic.i_stride[2],
mvx0, mvy0, 2*width, 2*height ); mvx0, mvy0, 2*width, 2*height );
h->mc.mc_chroma( tmp1, 16, &h->mb.pic.p_fref[1][i_ref1][5][2*y*h->mb.pic.i_stride[2]+2*x], h->mb.pic.i_stride[2], h->mc.mc_chroma( tmp1, 16, h->mb.pic.p_fref[1][i_ref1][5], h->mb.pic.i_stride[2],
mvx1, mvy1, 2*width, 2*height ); mvx1, mvy1, 2*width, 2*height );
h->mc.avg[i_mode+3]( &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, tmp0, 16, tmp1, 16, weight ); h->mc.avg[i_mode+3]( &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, tmp0, 16, tmp1, 16, weight );
} }
......
...@@ -1488,20 +1488,23 @@ static int x264_mb_analyse_inter_p4x4_chroma( x264_t *h, x264_mb_analysis_t *a, ...@@ -1488,20 +1488,23 @@ static int x264_mb_analyse_inter_p4x4_chroma( x264_t *h, x264_mb_analysis_t *a,
if( pixel == PIXEL_4x4 ) if( pixel == PIXEL_4x4 )
{ {
CHROMA4x4MC( 2,2, a->l0.me4x4[i8x8][0], 0,0 ); x264_me_t *m = a->l0.me4x4[i8x8];
CHROMA4x4MC( 2,2, a->l0.me4x4[i8x8][1], 2,0 ); CHROMA4x4MC( 2,2, m[0], 0,0 );
CHROMA4x4MC( 2,2, a->l0.me4x4[i8x8][2], 0,2 ); CHROMA4x4MC( 2,2, m[1], 2,0 );
CHROMA4x4MC( 2,2, a->l0.me4x4[i8x8][3], 2,2 ); CHROMA4x4MC( 2,2, m[2], 0,2 );
CHROMA4x4MC( 2,2, m[3], 2,2 );
} }
else if( pixel == PIXEL_8x4 ) else if( pixel == PIXEL_8x4 )
{ {
CHROMA4x4MC( 4,2, a->l0.me8x4[i8x8][0], 0,0 ); x264_me_t *m = a->l0.me8x4[i8x8];
CHROMA4x4MC( 4,2, a->l0.me8x4[i8x8][1], 0,2 ); CHROMA4x4MC( 4,2, m[0], 0,0 );
CHROMA4x4MC( 4,2, m[1], 0,2 );
} }
else else
{ {
CHROMA4x4MC( 2,4, a->l0.me4x8[i8x8][0], 0,0 ); x264_me_t *m = a->l0.me4x8[i8x8];
CHROMA4x4MC( 2,4, a->l0.me4x8[i8x8][1], 2,0 ); CHROMA4x4MC( 2,4, m[0], 0,0 );
CHROMA4x4MC( 2,4, m[1], 2,0 );
} }
return h->pixf.mbcmp[PIXEL_4x4]( &h->mb.pic.p_fenc[1][oe], FENC_STRIDE, pix1, 16 ) return h->pixf.mbcmp[PIXEL_4x4]( &h->mb.pic.p_fenc[1][oe], FENC_STRIDE, pix1, 16 )
...@@ -1645,11 +1648,6 @@ static void x264_mb_analyse_inter_direct( x264_t *h, x264_mb_analysis_t *a ) ...@@ -1645,11 +1648,6 @@ static void x264_mb_analyse_inter_direct( x264_t *h, x264_mb_analysis_t *a )
} }
} }
#define WEIGHTED_AVG( size, pix, stride, src1, stride1, src2, stride2 ) \
{ \
h->mc.avg[size]( pix, stride, src1, stride1, src2, stride2, h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] ); \
}
static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a ) static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
{ {
ALIGNED_ARRAY_16( uint8_t, pix0,[16*16] ); ALIGNED_ARRAY_16( uint8_t, pix0,[16*16] );
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment