Commit 5c767904 authored by Fiona Glaser's avatar Fiona Glaser

Much faster and more efficient MVD handling

Store MV deltas as clipped absolute values.
This means CABAC no longer has to calculate absolute values in MV context selection.
This also lets us cut the memory spent on MVDs by a factor of 2, speeding up cache_mvd and reducing memory usage by 32*threads*(num macroblocks) bytes.
On a Core i7 encoding 1080p, this is about 3 megabytes saved.
parent 1ec69bef
......@@ -171,13 +171,13 @@ static inline int x264_predictor_difference( int16_t (*mvc)[2], intptr_t i_mvc )
return sum;
}
/* Diff view: the first signature line below was removed by this commit,
 * the second was added. MVDs are now stored as clipped absolute uint8_t
 * values, so the two per-component context sums fit in one 16-bit word. */
static inline uint32_t x264_cabac_amvd_sum( int16_t *mvdleft, int16_t *mvdtop )
static inline uint16_t x264_cabac_mvd_sum( uint8_t *mvdleft, uint8_t *mvdtop )
{
/* Sum of left + top neighbor |MVD|, x and y components respectively. */
int amvd0 = abs(mvdleft[0]) + abs(mvdtop[0]);
int amvd1 = abs(mvdleft[1]) + abs(mvdtop[1]);
/* Map each sum to a CABAC context offset: 0 (<=2), 1 (3..32), 2 (>32). */
amvd0 = (amvd0 > 2) + (amvd0 > 32);
amvd1 = (amvd1 > 2) + (amvd1 > 32);
return amvd0 + (amvd1<<16); /* OLD (removed): y offset packed in the high 16 bits */
return amvd0 + (amvd1<<8); /* NEW (added): y offset packed in the high byte */
}
extern const uint8_t x264_exp2_lut[64];
......@@ -527,7 +527,7 @@ struct x264_t
uint8_t (*non_zero_count)[16+4+4]; /* nzc. for I_PCM set to 16 */
int8_t *chroma_pred_mode; /* chroma_pred_mode. cabac only. for non intra I_PRED_CHROMA_DC(0) */
int16_t (*mv[2])[2]; /* mb mv. set to 0 for intra mb */
int16_t (*mvd[2])[2]; /* mb mv difference with predict. set to 0 if intra. cabac only */
uint8_t (*mvd[2])[2]; /* absolute value of mb mv difference with predict, clipped to [0,33]. set to 0 if intra. cabac only */
int8_t *ref[2]; /* mb ref. set to -1 if non used (intra or Lx only) */
int16_t (*mvr[2][32])[2]; /* 16x16 mv for each possible ref */
int8_t *skipbp; /* block pattern for SKIP or DIRECT (sub)mbs. B-frames + cabac only */
......@@ -621,7 +621,7 @@ struct x264_t
/* 0 if not available */
ALIGNED_16( int16_t mv[2][X264_SCAN8_SIZE][2] );
ALIGNED_8( int16_t mvd[2][X264_SCAN8_SIZE][2] );
ALIGNED_8( uint8_t mvd[2][X264_SCAN8_SIZE][2] );
/* 1 if SKIP or DIRECT. set only for B-frames + CABAC */
ALIGNED_4( int8_t skip[X264_SCAN8_SIZE] );
......
......@@ -712,8 +712,8 @@ int x264_macroblock_cache_init( x264_t *h )
if( h->param.b_cabac )
{
CHECKED_MALLOC( h->mb.chroma_pred_mode, i_mb_count * sizeof(int8_t) );
CHECKED_MALLOC( h->mb.mvd[0], 2*16 * i_mb_count * sizeof(int16_t) );
CHECKED_MALLOC( h->mb.mvd[1], 2*16 * i_mb_count * sizeof(int16_t) );
CHECKED_MALLOC( h->mb.mvd[0], 2*16 * i_mb_count * sizeof(uint8_t) );
CHECKED_MALLOC( h->mb.mvd[1], 2*16 * i_mb_count * sizeof(uint8_t) );
}
for( i=0; i<2; i++ )
......@@ -1211,33 +1211,24 @@ void x264_macroblock_cache_load( x264_t *h, int i_mb_x, int i_mb_y )
if( h->param.b_cabac )
{
if( i_top_type >= 0 )
{
const int i8 = x264_scan8[0] - 8;
const int iv = i_top_4x4;
CP64( h->mb.cache.mvd[i_list][i8+0], h->mb.mvd[i_list][iv+0] );
CP64( h->mb.cache.mvd[i_list][i8+2], h->mb.mvd[i_list][iv+2] );
}
CP64( h->mb.cache.mvd[i_list][x264_scan8[0] - 8], h->mb.mvd[i_list][i_top_4x4] );
else
{
const int i8 = x264_scan8[0] - 8;
M64( h->mb.cache.mvd[i_list][i8+0] ) = 0;
M64( h->mb.cache.mvd[i_list][i8+2] ) = 0;
}
M64( h->mb.cache.mvd[i_list][x264_scan8[0] - 8] ) = 0;
if( i_left_type >= 0 )
{
const int i8 = x264_scan8[0] - 1;
const int iv = i_mb_4x4 - 1;
CP32( h->mb.cache.mvd[i_list][i8+0*8], h->mb.mvd[i_list][iv + 0*s4x4] );
CP32( h->mb.cache.mvd[i_list][i8+1*8], h->mb.mvd[i_list][iv + 1*s4x4] );
CP32( h->mb.cache.mvd[i_list][i8+2*8], h->mb.mvd[i_list][iv + 2*s4x4] );
CP32( h->mb.cache.mvd[i_list][i8+3*8], h->mb.mvd[i_list][iv + 3*s4x4] );
CP16( h->mb.cache.mvd[i_list][i8+0*8], h->mb.mvd[i_list][iv + 0*s4x4] );
CP16( h->mb.cache.mvd[i_list][i8+1*8], h->mb.mvd[i_list][iv + 1*s4x4] );
CP16( h->mb.cache.mvd[i_list][i8+2*8], h->mb.mvd[i_list][iv + 2*s4x4] );
CP16( h->mb.cache.mvd[i_list][i8+3*8], h->mb.mvd[i_list][iv + 3*s4x4] );
}
else
{
const int i8 = x264_scan8[0] - 1;
for( i = 0; i < 4; i++ )
M32( h->mb.cache.mvd[i_list][i8+i*8] ) = 0;
M16( h->mb.cache.mvd[i_list][i8+i*8] ) = 0;
}
}
}
......@@ -1416,30 +1407,18 @@ void x264_macroblock_cache_save( x264_t *h )
if( !IS_INTRA( i_mb_type ) && !IS_SKIP( i_mb_type ) && !IS_DIRECT( i_mb_type ) )
{
for( y = 0; y < 4; y++ )
{
CP64( h->mb.mvd[0][i_mb_4x4+y*s4x4+0], h->mb.cache.mvd[0][x264_scan8[0]+8*y+0] );
CP64( h->mb.mvd[0][i_mb_4x4+y*s4x4+2], h->mb.cache.mvd[0][x264_scan8[0]+8*y+2] );
}
CP64( h->mb.mvd[0][i_mb_4x4+y*s4x4], h->mb.cache.mvd[0][x264_scan8[0]+8*y] );
if( h->sh.i_type == SLICE_TYPE_B )
for( y = 0; y < 4; y++ )
{
CP64( h->mb.mvd[1][i_mb_4x4+y*s4x4+0], h->mb.cache.mvd[1][x264_scan8[0]+8*y+0] );
CP64( h->mb.mvd[1][i_mb_4x4+y*s4x4+2], h->mb.cache.mvd[1][x264_scan8[0]+8*y+2] );
}
CP64( h->mb.mvd[1][i_mb_4x4+y*s4x4], h->mb.cache.mvd[1][x264_scan8[0]+8*y] );
}
else
{
for( y = 0; y < 4; y++ )
{
M64( h->mb.mvd[0][i_mb_4x4+y*s4x4+0] ) = 0;
M64( h->mb.mvd[0][i_mb_4x4+y*s4x4+2] ) = 0;
}
M64( h->mb.mvd[0][i_mb_4x4+y*s4x4] ) = 0;
if( h->sh.i_type == SLICE_TYPE_B )
for( y = 0; y < 4; y++ )
{
M64( h->mb.mvd[1][i_mb_4x4+y*s4x4+0] ) = 0;
M64( h->mb.mvd[1][i_mb_4x4+y*s4x4+2] ) = 0;
}
M64( h->mb.mvd[1][i_mb_4x4+y*s4x4] ) = 0;
}
if( h->sh.i_type == SLICE_TYPE_B )
......
......@@ -353,6 +353,33 @@ static ALWAYS_INLINE void x264_macroblock_cache_rect1( void *dst, int width, int
if( height == 4 ) M16( d+6 ) = val2;
}
}
/* Fill a width x height rectangle of 16-bit entries in a scan8-layout cache
 * with the packed value 'val'. The cache row stride is 8 uint16_t entries,
 * hence the d+8 / d+16 / d+24 row offsets (rows 1, 2, 3).
 *
 * 'val' is replicated with unsigned multiplies: the original form
 * 'val + (val<<16)' promotes val to int, so for val >= 0x8000 the left
 * shift overflows signed int, which is undefined behavior in C; the
 * multiply by 0x10001U performs the same replication in unsigned
 * arithmetic and compiles to identical code. */
static ALWAYS_INLINE void x264_macroblock_cache_rect2( void *dst, int width, int height, uint16_t val )
{
    uint16_t *d = dst;
    uint32_t val32 = (uint32_t)val * 0x10001U;         /* val duplicated into both halves */
    uint64_t val64 = (uint64_t)val32 * 0x100000001ULL; /* val duplicated into all four words */
    if( width == 4 )
    {
        M64( d+ 0 ) = val64;
        if( height >= 2 ) M64( d+ 8 ) = val64;
        if( height == 4 ) M64( d+16 ) = val64;
        if( height == 4 ) M64( d+24 ) = val64;
    }
    else if( width == 2 )
    {
        M32( d+ 0 ) = val32;
        if( height >= 2 ) M32( d+ 8 ) = val32;
        if( height == 4 ) M32( d+16 ) = val32;
        if( height == 4 ) M32( d+24 ) = val32;
    }
    else //if( width == 1 )
    {
        M16( d+ 0 ) = val;
        if( height >= 2 ) M16( d+ 8 ) = val;
        if( height == 4 ) M16( d+16 ) = val;
        if( height == 4 ) M16( d+24 ) = val;
    }
}
static ALWAYS_INLINE void x264_macroblock_cache_rect4( void *dst, int width, int height, uint32_t val )
{
int dy;
......@@ -383,9 +410,9 @@ static ALWAYS_INLINE void x264_macroblock_cache_mv( x264_t *h, int x, int y, int
{
x264_macroblock_cache_rect4( &h->mb.cache.mv[i_list][X264_SCAN8_0+x+8*y], width, height, mv );
}
/* Diff view: first signature (uint32_t mv) removed, second (uint16_t mv)
 * added — each mvd cache entry shrank from a pair of int16_t to a pair of
 * uint8_t, so the rect fill switches from 32-bit to 16-bit entries. */
static ALWAYS_INLINE void x264_macroblock_cache_mvd( x264_t *h, int x, int y, int width, int height, int i_list, uint32_t mv )
static ALWAYS_INLINE void x264_macroblock_cache_mvd( x264_t *h, int x, int y, int width, int height, int i_list, uint16_t mv )
{
x264_macroblock_cache_rect4( &h->mb.cache.mvd[i_list][X264_SCAN8_0+x+8*y], width, height, mv );
x264_macroblock_cache_rect2( &h->mb.cache.mvd[i_list][X264_SCAN8_0+x+8*y], width, height, mv );
}
static ALWAYS_INLINE void x264_macroblock_cache_ref( x264_t *h, int x, int y, int width, int height, int i_list, uint8_t ref )
{
......
......@@ -77,32 +77,26 @@ static inline int x264_predictor_difference_mmxext( int16_t (*mvc)[2], intptr_t
);
return sum;
}
/* Diff view: the word-based (int16_t) implementation was removed and a
 * byte-based (uint8_t) one added. Lines belonging to each version are
 * marked OLD / NEW below. */
#define x264_cabac_amvd_sum x264_cabac_amvd_sum_mmxext
static ALWAYS_INLINE uint32_t x264_cabac_amvd_sum_mmxext(int16_t *mvdleft, int16_t *mvdtop)
#define x264_cabac_mvd_sum x264_cabac_mvd_sum_mmxext
static ALWAYS_INLINE uint16_t x264_cabac_mvd_sum_mmxext(uint8_t *mvdleft, uint8_t *mvdtop)
{
/* OLD constants: packed-word operands for the removed fix16 multiply/clamp trick. */
static const uint64_t pw_2 = 0x0002000200020002ULL;
static const uint64_t pw_28 = 0x001C001C001C001CULL;
static const uint64_t pw_2184 = 0x0888088808880888ULL;
/* MIN(((x+28)*2184)>>16,2) = (x>2) + (x>32) */
/* 2184 = fix16(1/30) */
uint32_t amvd;
/* NEW constants: packed-byte thresholds for the pcmpgtb comparisons. */
static const uint64_t pb_2 = 0x0202020202020202ULL;
static const uint64_t pb_32 = 0x2020202020202020ULL;
int amvd;
asm(
/* OLD asm body: abs via negate+pmaxsw, then (x>2)+(x>32) via the
 * fix16 multiply and pminsw clamp described by the comments above. */
"movd %1, %%mm0 \n"
"movd %2, %%mm1 \n"
"pxor %%mm2, %%mm2 \n"
"pxor %%mm3, %%mm3 \n"
"psubw %%mm0, %%mm2 \n"
"psubw %%mm1, %%mm3 \n"
"pmaxsw %%mm2, %%mm0 \n"
"pmaxsw %%mm3, %%mm1 \n"
"paddw %3, %%mm0 \n"
"paddw %%mm1, %%mm0 \n"
"pmulhuw %4, %%mm0 \n"
"pminsw %5, %%mm0 \n"
"movd %%mm0, %0 \n"
/* NEW asm body: inputs are already absolute values, so just add the
 * neighbor bytes and count how many thresholds (2, 32) each sum exceeds:
 * pcmpgtb yields 0/-1 per byte, and subtracting both masks from zero
 * accumulates (x>2) + (x>32). */
"movd %1, %%mm0 \n"
"movd %2, %%mm1 \n"
"paddb %%mm1, %%mm0 \n"
"pxor %%mm2, %%mm2 \n"
"movq %%mm0, %%mm1 \n"
"pcmpgtb %3, %%mm0 \n"
"pcmpgtb %4, %%mm1 \n"
"psubb %%mm0, %%mm2 \n"
"psubb %%mm1, %%mm2 \n"
"movd %%mm2, %0 \n"
:"=r"(amvd)
/* OLD operands: 32-bit loads of int16_t MVD pairs plus the pw_* constants. */
:"m"(M32( mvdleft )),"m"(M32( mvdtop )),
"m"(pw_28),"m"(pw_2184),"m"(pw_2)
/* NEW operands: 16-bit loads of uint8_t MVD pairs plus the pb_* thresholds. */
:"m"(M16( mvdleft )),"m"(M16( mvdtop )),
"m"(pb_2),"m"(pb_32)
);
return amvd;
}
......
......@@ -349,7 +349,7 @@ static void x264_cabac_mb_ref( x264_t *h, x264_cabac_t *cb, int i_list, int idx
x264_cabac_encode_decision( cb, 54 + ctx, 0 );
}
static inline void x264_cabac_mb_mvd_cpn( x264_t *h, x264_cabac_t *cb, int i_list, int idx, int l, int mvd, int ctx )
static inline int x264_cabac_mb_mvd_cpn( x264_t *h, x264_cabac_t *cb, int i_list, int idx, int l, int mvd, int ctx )
{
const int i_abs = abs( mvd );
const int ctxbase = l ? 47 : 40;
......@@ -408,32 +408,34 @@ static inline void x264_cabac_mb_mvd_cpn( x264_t *h, x264_cabac_t *cb, int i_lis
x264_cabac_encode_bypass( cb, mvd < 0 );
}
#endif
/* Since we don't need to keep track of MVDs larger than 33, just cap the value.
* This lets us store MVDs as 8-bit values instead of 16-bit. */
return X264_MIN( i_abs, 33 );
}
/* Diff view: first signature (returning uint32_t) removed, second
 * (returning uint16_t) added — the clipped |mvd| pair now packs into
 * two bytes instead of two 16-bit halves. */
static NOINLINE uint32_t x264_cabac_mb_mvd( x264_t *h, x264_cabac_t *cb, int i_list, int idx, int width )
static NOINLINE uint16_t x264_cabac_mb_mvd( x264_t *h, x264_cabac_t *cb, int i_list, int idx, int width )
{
ALIGNED_4( int16_t mvp[2] );
uint32_t amvd;
int mdx, mdy;
/* Calculate mvd */
x264_mb_predict_mv( h, i_list, idx, width, mvp );
mdx = h->mb.cache.mv[i_list][x264_scan8[idx]][0] - mvp[0];
mdy = h->mb.cache.mv[i_list][x264_scan8[idx]][1] - mvp[1];
/* OLD (removed): neighbor MVD sums packed as two 16-bit halves. */
amvd = x264_cabac_amvd_sum(h->mb.cache.mvd[i_list][x264_scan8[idx] - 1],
h->mb.cache.mvd[i_list][x264_scan8[idx] - 8]);
/* NEW (added): neighbor |MVD| context sums packed as two bytes. */
uint16_t amvd = x264_cabac_mvd_sum(h->mb.cache.mvd[i_list][x264_scan8[idx] - 1],
h->mb.cache.mvd[i_list][x264_scan8[idx] - 8]);
/* encode */
/* OLD (removed): cpn returned void; context sums unpacked from 16-bit halves. */
x264_cabac_mb_mvd_cpn( h, cb, i_list, idx, 0, mdx, amvd&0xFFFF );
x264_cabac_mb_mvd_cpn( h, cb, i_list, idx, 1, mdy, amvd>>16 );
/* NEW (added): cpn now returns the clipped |mvd| to store in the cache. */
mdx = x264_cabac_mb_mvd_cpn( h, cb, i_list, idx, 0, mdx, amvd&0xFF );
mdy = x264_cabac_mb_mvd_cpn( h, cb, i_list, idx, 1, mdy, amvd>>8 );
return pack16to32_mask(mdx,mdy);
return pack8to16(mdx,mdy);
}
/* Wrapper: encode the mvd for one partition, then store the returned packed
 * clipped |mvd| into the cache (old line took uint32_t, new takes uint16_t). */
#define x264_cabac_mb_mvd(h,cb,i_list,idx,width,height)\
do\
{\
uint32_t mvd = x264_cabac_mb_mvd(h,cb,i_list,idx,width);\
uint16_t mvd = x264_cabac_mb_mvd(h,cb,i_list,idx,width);\
x264_macroblock_cache_mvd( h, block_idx_x[idx], block_idx_y[idx], width, height, i_list, mvd );\
} while(0)
......
......@@ -1174,6 +1174,7 @@ void x264_me_refine_qpel_rd( x264_t *h, x264_me_t *m, int i_lambda2, int i4, int
m->mv[0] = bmx;
m->mv[1] = bmy;
x264_macroblock_cache_mv ( h, block_idx_x[i4], block_idx_y[i4], bw>>2, bh>>2, i_list, pack16to32_mask(bmx, bmy) );
x264_macroblock_cache_mvd( h, block_idx_x[i4], block_idx_y[i4], bw>>2, bh>>2, i_list, pack16to32_mask(bmx - m->mvp[0], bmy - m->mvp[1]) );
uint16_t amvd = pack8to16(X264_MIN(abs(bmx - m->mvp[0]),33), X264_MIN(abs(bmy - m->mvp[1]),33));
x264_macroblock_cache_mvd( h, block_idx_x[i4], block_idx_y[i4], bw>>2, bh>>2, i_list, amvd );
h->mb.b_skip_mc = 0;
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment