From 5c767904662ccb4703b421308d7270712f60b65b Mon Sep 17 00:00:00 2001 From: Fiona Glaser Date: Thu, 18 Feb 2010 17:01:38 -0800 Subject: [PATCH] Much faster and more efficient MVD handling Store MV deltas as clipped absolute values. This means CABAC no longer has to calculate absolute values in MV context selection. This also lets us cut the memory spent on MVDs by a factor of 2, speeding up cache_mvd and reducing memory usage by 32*threads*(num macroblocks) bytes. On a Core i7 encoding 1080p, this is about 3 megabytes saved. --- common/common.h | 8 ++++---- common/macroblock.c | 47 +++++++++++++-------------------------------- common/macroblock.h | 31 ++++++++++++++++++++++++++++-- common/x86/util.h | 40 ++++++++++++++++---------------------- encoder/cabac.c | 20 ++++++++++--------- encoder/me.c | 3 ++- 6 files changed, 76 insertions(+), 73 deletions(-) diff --git a/common/common.h b/common/common.h index 8562aed8..661eda6d 100644 --- a/common/common.h +++ b/common/common.h @@ -171,13 +171,13 @@ static inline int x264_predictor_difference( int16_t (*mvc)[2], intptr_t i_mvc ) return sum; } -static inline uint32_t x264_cabac_amvd_sum( int16_t *mvdleft, int16_t *mvdtop ) +static inline uint16_t x264_cabac_mvd_sum( uint8_t *mvdleft, uint8_t *mvdtop ) { int amvd0 = abs(mvdleft[0]) + abs(mvdtop[0]); int amvd1 = abs(mvdleft[1]) + abs(mvdtop[1]); amvd0 = (amvd0 > 2) + (amvd0 > 32); amvd1 = (amvd1 > 2) + (amvd1 > 32); - return amvd0 + (amvd1<<16); + return amvd0 + (amvd1<<8); } extern const uint8_t x264_exp2_lut[64]; @@ -527,7 +527,7 @@ struct x264_t uint8_t (*non_zero_count)[16+4+4]; /* nzc. for I_PCM set to 16 */ int8_t *chroma_pred_mode; /* chroma_pred_mode. cabac only. for non intra I_PRED_CHROMA_DC(0) */ int16_t (*mv[2])[2]; /* mb mv. set to 0 for intra mb */ - int16_t (*mvd[2])[2]; /* mb mv difference with predict. set to 0 if intra. cabac only */ + uint8_t (*mvd[2])[2]; /* absolute value of mb mv difference with predict, clipped to [0,33]. set to 0 if intra. cabac only */ int8_t *ref[2]; /* mb ref. set to -1 if non used (intra or Lx only) */ int16_t (*mvr[2][32])[2]; /* 16x16 mv for each possible ref */ int8_t *skipbp; /* block pattern for SKIP or DIRECT (sub)mbs. B-frames + cabac only */ @@ -621,7 +621,7 @@ struct x264_t /* 0 if not available */ ALIGNED_16( int16_t mv[2][X264_SCAN8_SIZE][2] ); - ALIGNED_8( int16_t mvd[2][X264_SCAN8_SIZE][2] ); + ALIGNED_8( uint8_t mvd[2][X264_SCAN8_SIZE][2] ); /* 1 if SKIP or DIRECT. 
set only for B-frames + CABAC */ ALIGNED_4( int8_t skip[X264_SCAN8_SIZE] ); diff --git a/common/macroblock.c b/common/macroblock.c index fc8c9c40..decc0319 100644 --- a/common/macroblock.c +++ b/common/macroblock.c @@ -712,8 +712,8 @@ int x264_macroblock_cache_init( x264_t *h ) if( h->param.b_cabac ) { CHECKED_MALLOC( h->mb.chroma_pred_mode, i_mb_count * sizeof(int8_t) ); - CHECKED_MALLOC( h->mb.mvd[0], 2*16 * i_mb_count * sizeof(int16_t) ); - CHECKED_MALLOC( h->mb.mvd[1], 2*16 * i_mb_count * sizeof(int16_t) ); + CHECKED_MALLOC( h->mb.mvd[0], 2*16 * i_mb_count * sizeof(uint8_t) ); + CHECKED_MALLOC( h->mb.mvd[1], 2*16 * i_mb_count * sizeof(uint8_t) ); } for( i=0; i<2; i++ ) @@ -1211,33 +1211,24 @@ void x264_macroblock_cache_load( x264_t *h, int i_mb_x, int i_mb_y ) if( h->param.b_cabac ) { if( i_top_type >= 0 ) - { - const int i8 = x264_scan8[0] - 8; - const int iv = i_top_4x4; - CP64( h->mb.cache.mvd[i_list][i8+0], h->mb.mvd[i_list][iv+0] ); - CP64( h->mb.cache.mvd[i_list][i8+2], h->mb.mvd[i_list][iv+2] ); - } + CP64( h->mb.cache.mvd[i_list][x264_scan8[0] - 8], h->mb.mvd[i_list][i_top_4x4] ); else - { - const int i8 = x264_scan8[0] - 8; - M64( h->mb.cache.mvd[i_list][i8+0] ) = 0; - M64( h->mb.cache.mvd[i_list][i8+2] ) = 0; - } + M64( h->mb.cache.mvd[i_list][x264_scan8[0] - 8] ) = 0; if( i_left_type >= 0 ) { const int i8 = x264_scan8[0] - 1; const int iv = i_mb_4x4 - 1; - CP32( h->mb.cache.mvd[i_list][i8+0*8], h->mb.mvd[i_list][iv + 0*s4x4] ); - CP32( h->mb.cache.mvd[i_list][i8+1*8], h->mb.mvd[i_list][iv + 1*s4x4] ); - CP32( h->mb.cache.mvd[i_list][i8+2*8], h->mb.mvd[i_list][iv + 2*s4x4] ); - CP32( h->mb.cache.mvd[i_list][i8+3*8], h->mb.mvd[i_list][iv + 3*s4x4] ); + CP16( h->mb.cache.mvd[i_list][i8+0*8], h->mb.mvd[i_list][iv + 0*s4x4] ); + CP16( h->mb.cache.mvd[i_list][i8+1*8], h->mb.mvd[i_list][iv + 1*s4x4] ); + CP16( h->mb.cache.mvd[i_list][i8+2*8], h->mb.mvd[i_list][iv + 2*s4x4] ); + CP16( h->mb.cache.mvd[i_list][i8+3*8], h->mb.mvd[i_list][iv + 3*s4x4] ); } else { const int i8 = x264_scan8[0] - 1; for( i = 0; i < 4; i++ ) - M32( h->mb.cache.mvd[i_list][i8+i*8] ) = 0; + M16( h->mb.cache.mvd[i_list][i8+i*8] ) = 0; } } } @@ -1416,30 +1407,18 @@ void x264_macroblock_cache_save( x264_t *h ) if( !IS_INTRA( i_mb_type ) && !IS_SKIP( i_mb_type ) && !IS_DIRECT( i_mb_type ) ) { for( y = 0; y < 4; y++ ) - { - CP64( h->mb.mvd[0][i_mb_4x4+y*s4x4+0], h->mb.cache.mvd[0][x264_scan8[0]+8*y+0] ); - CP64( h->mb.mvd[0][i_mb_4x4+y*s4x4+2], h->mb.cache.mvd[0][x264_scan8[0]+8*y+2] ); - } + CP64( h->mb.mvd[0][i_mb_4x4+y*s4x4], h->mb.cache.mvd[0][x264_scan8[0]+8*y] ); if( h->sh.i_type == SLICE_TYPE_B ) for( y = 0; y < 4; y++ ) - { - CP64( h->mb.mvd[1][i_mb_4x4+y*s4x4+0], h->mb.cache.mvd[1][x264_scan8[0]+8*y+0] ); - CP64( h->mb.mvd[1][i_mb_4x4+y*s4x4+2], h->mb.cache.mvd[1][x264_scan8[0]+8*y+2] ); - } + CP64( h->mb.mvd[1][i_mb_4x4+y*s4x4], h->mb.cache.mvd[1][x264_scan8[0]+8*y] ); } else { for( y = 0; y < 4; y++ ) - { - M64( h->mb.mvd[0][i_mb_4x4+y*s4x4+0] ) = 0; - M64( h->mb.mvd[0][i_mb_4x4+y*s4x4+2] ) = 0; - } + M64( h->mb.mvd[0][i_mb_4x4+y*s4x4] ) = 0; if( h->sh.i_type == SLICE_TYPE_B ) for( y = 0; y < 4; y++ ) - { - M64( h->mb.mvd[1][i_mb_4x4+y*s4x4+0] ) = 0; - M64( h->mb.mvd[1][i_mb_4x4+y*s4x4+2] ) = 0; - } + M64( h->mb.mvd[1][i_mb_4x4+y*s4x4] ) = 0; } if( h->sh.i_type == SLICE_TYPE_B ) diff --git a/common/macroblock.h b/common/macroblock.h index 086eec37..d173be4d 100644 --- a/common/macroblock.h +++ b/common/macroblock.h @@ -353,6 +353,33 @@ static ALWAYS_INLINE void x264_macroblock_cache_rect1( void *dst, 
int width, int if( height == 4 ) M16( d+6 ) = val2; } } +static ALWAYS_INLINE void x264_macroblock_cache_rect2( void *dst, int width, int height, uint16_t val ) +{ + uint16_t *d = dst; + uint32_t val32 = val + (val<<16); + uint64_t val64 = val32 + ((uint64_t)val32<<32); + if( width == 4 ) + { + M64( d+ 0 ) = val64; + if( height >= 2 ) M64( d+ 8 ) = val64; + if( height == 4 ) M64( d+16 ) = val64; + if( height == 4 ) M64( d+24 ) = val64; + } + else if( width == 2 ) + { + M32( d+ 0 ) = val32; + if( height >= 2 ) M32( d+ 8 ) = val32; + if( height == 4 ) M32( d+16 ) = val32; + if( height == 4 ) M32( d+24 ) = val32; + } + else //if( width == 1 ) + { + M16( d+ 0 ) = val; + if( height >= 2 ) M16( d+ 8 ) = val; + if( height == 4 ) M16( d+16 ) = val; + if( height == 4 ) M16( d+24 ) = val; + } +} static ALWAYS_INLINE void x264_macroblock_cache_rect4( void *dst, int width, int height, uint32_t val ) { int dy; @@ -383,9 +410,9 @@ static ALWAYS_INLINE void x264_macroblock_cache_mv( x264_t *h, int x, int y, int { x264_macroblock_cache_rect4( &h->mb.cache.mv[i_list][X264_SCAN8_0+x+8*y], width, height, mv ); } -static ALWAYS_INLINE void x264_macroblock_cache_mvd( x264_t *h, int x, int y, int width, int height, int i_list, uint32_t mv ) +static ALWAYS_INLINE void x264_macroblock_cache_mvd( x264_t *h, int x, int y, int width, int height, int i_list, uint16_t mv ) { - x264_macroblock_cache_rect4( &h->mb.cache.mvd[i_list][X264_SCAN8_0+x+8*y], width, height, mv ); + x264_macroblock_cache_rect2( &h->mb.cache.mvd[i_list][X264_SCAN8_0+x+8*y], width, height, mv ); } static ALWAYS_INLINE void x264_macroblock_cache_ref( x264_t *h, int x, int y, int width, int height, int i_list, uint8_t ref ) { diff --git a/common/x86/util.h b/common/x86/util.h index da8437a3..7672f09e 100644 --- a/common/x86/util.h +++ b/common/x86/util.h @@ -77,32 +77,26 @@ static inline int x264_predictor_difference_mmxext( int16_t (*mvc)[2], intptr_t ); return sum; } -#define x264_cabac_amvd_sum x264_cabac_amvd_sum_mmxext -static ALWAYS_INLINE uint32_t x264_cabac_amvd_sum_mmxext(int16_t *mvdleft, int16_t *mvdtop) +#define x264_cabac_mvd_sum x264_cabac_mvd_sum_mmxext +static ALWAYS_INLINE uint16_t x264_cabac_mvd_sum_mmxext(uint8_t *mvdleft, uint8_t *mvdtop) { - static const uint64_t pw_2 = 0x0002000200020002ULL; - static const uint64_t pw_28 = 0x001C001C001C001CULL; - static const uint64_t pw_2184 = 0x0888088808880888ULL; - /* MIN(((x+28)*2184)>>16,2) = (x>2) + (x>32) */ - /* 2184 = fix16(1/30) */ - uint32_t amvd; + static const uint64_t pb_2 = 0x0202020202020202ULL; + static const uint64_t pb_32 = 0x2020202020202020ULL; + int amvd; asm( - "movd %1, %%mm0 \n" - "movd %2, %%mm1 \n" - "pxor %%mm2, %%mm2 \n" - "pxor %%mm3, %%mm3 \n" - "psubw %%mm0, %%mm2 \n" - "psubw %%mm1, %%mm3 \n" - "pmaxsw %%mm2, %%mm0 \n" - "pmaxsw %%mm3, %%mm1 \n" - "paddw %3, %%mm0 \n" - "paddw %%mm1, %%mm0 \n" - "pmulhuw %4, %%mm0 \n" - "pminsw %5, %%mm0 \n" - "movd %%mm0, %0 \n" + "movd %1, %%mm0 \n" + "movd %2, %%mm1 \n" + "paddb %%mm1, %%mm0 \n" + "pxor %%mm2, %%mm2 \n" + "movq %%mm0, %%mm1 \n" + "pcmpgtb %3, %%mm0 \n" + "pcmpgtb %4, %%mm1 \n" + "psubb %%mm0, %%mm2 \n" + "psubb %%mm1, %%mm2 \n" + "movd %%mm2, %0 \n" :"=r"(amvd) - :"m"(M32( mvdleft )),"m"(M32( mvdtop )), - "m"(pw_28),"m"(pw_2184),"m"(pw_2) + :"m"(M16( mvdleft )),"m"(M16( mvdtop )), + "m"(pb_2),"m"(pb_32) ); return amvd; } diff --git a/encoder/cabac.c b/encoder/cabac.c index a2220c66..dc1d1b8a 100644 --- a/encoder/cabac.c +++ b/encoder/cabac.c @@ -349,7 +349,7 @@ static void x264_cabac_mb_ref( x264_t *h, 
x264_cabac_t *cb, int i_list, int idx x264_cabac_encode_decision( cb, 54 + ctx, 0 ); } -static inline void x264_cabac_mb_mvd_cpn( x264_t *h, x264_cabac_t *cb, int i_list, int idx, int l, int mvd, int ctx ) +static inline int x264_cabac_mb_mvd_cpn( x264_t *h, x264_cabac_t *cb, int i_list, int idx, int l, int mvd, int ctx ) { const int i_abs = abs( mvd ); const int ctxbase = l ? 47 : 40; @@ -408,32 +408,34 @@ static inline void x264_cabac_mb_mvd_cpn( x264_t *h, x264_cabac_t *cb, int i_lis x264_cabac_encode_bypass( cb, mvd < 0 ); } #endif + /* Since we don't need to keep track of MVDs larger than 33, just cap the value. + * This lets us store MVDs as 8-bit values instead of 16-bit. */ + return X264_MIN( i_abs, 33 ); } -static NOINLINE uint32_t x264_cabac_mb_mvd( x264_t *h, x264_cabac_t *cb, int i_list, int idx, int width ) +static NOINLINE uint16_t x264_cabac_mb_mvd( x264_t *h, x264_cabac_t *cb, int i_list, int idx, int width ) { ALIGNED_4( int16_t mvp[2] ); - uint32_t amvd; int mdx, mdy; /* Calculate mvd */ x264_mb_predict_mv( h, i_list, idx, width, mvp ); mdx = h->mb.cache.mv[i_list][x264_scan8[idx]][0] - mvp[0]; mdy = h->mb.cache.mv[i_list][x264_scan8[idx]][1] - mvp[1]; - amvd = x264_cabac_amvd_sum(h->mb.cache.mvd[i_list][x264_scan8[idx] - 1], - h->mb.cache.mvd[i_list][x264_scan8[idx] - 8]); + uint16_t amvd = x264_cabac_mvd_sum(h->mb.cache.mvd[i_list][x264_scan8[idx] - 1], + h->mb.cache.mvd[i_list][x264_scan8[idx] - 8]); /* encode */ - x264_cabac_mb_mvd_cpn( h, cb, i_list, idx, 0, mdx, amvd&0xFFFF ); - x264_cabac_mb_mvd_cpn( h, cb, i_list, idx, 1, mdy, amvd>>16 ); + mdx = x264_cabac_mb_mvd_cpn( h, cb, i_list, idx, 0, mdx, amvd&0xFF ); + mdy = x264_cabac_mb_mvd_cpn( h, cb, i_list, idx, 1, mdy, amvd>>8 ); - return pack16to32_mask(mdx,mdy); + return pack8to16(mdx,mdy); } #define x264_cabac_mb_mvd(h,cb,i_list,idx,width,height)\ do\ {\ - uint32_t mvd = x264_cabac_mb_mvd(h,cb,i_list,idx,width);\ + uint16_t mvd = x264_cabac_mb_mvd(h,cb,i_list,idx,width);\ x264_macroblock_cache_mvd( h, block_idx_x[idx], block_idx_y[idx], width, height, i_list, mvd );\ } while(0) diff --git a/encoder/me.c b/encoder/me.c index 8972d459..5f29a640 100644 --- a/encoder/me.c +++ b/encoder/me.c @@ -1174,6 +1174,7 @@ void x264_me_refine_qpel_rd( x264_t *h, x264_me_t *m, int i_lambda2, int i4, int m->mv[0] = bmx; m->mv[1] = bmy; x264_macroblock_cache_mv ( h, block_idx_x[i4], block_idx_y[i4], bw>>2, bh>>2, i_list, pack16to32_mask(bmx, bmy) ); - x264_macroblock_cache_mvd( h, block_idx_x[i4], block_idx_y[i4], bw>>2, bh>>2, i_list, pack16to32_mask(bmx - m->mvp[0], bmy - m->mvp[1]) ); + uint16_t amvd = pack8to16(X264_MIN(abs(bmx - m->mvp[0]),33), X264_MIN(abs(bmy - m->mvp[1]),33)); + x264_macroblock_cache_mvd( h, block_idx_x[i4], block_idx_y[i4], bw>>2, bh>>2, i_list, amvd ); h->mb.b_skip_mc = 0; } -- GitLab
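
Editor's note (not part of the patch): the change above works because the CABAC MV context only cares whether the sum of the two neighboring MVD magnitudes exceeds 2 or 32, so any stored magnitude of 33 or more behaves identically and can be capped at 33, which is why 8 bits per component suffice. Below is a standalone scalar sketch of that reasoning; the helper names loosely mirror the patch (clip_mvd, mvd_sum), but the functions, the main() harness, and the sample values are purely illustrative and are not code from x264.

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    /* Clip an MV delta component to [0,33] before storing it as a byte.
     * The CABAC context increment only tests the neighbor sum against the
     * thresholds >2 and >32, so every magnitude >= 33 produces the same
     * increment as 33 itself and can be capped without changing the result. */
    static uint8_t clip_mvd( int mvd )
    {
        int a = abs( mvd );
        return a > 33 ? 33 : a;
    }

    /* Scalar illustration of the byte-packed sum used for context selection:
     * the x increment (0, 1 or 2) lands in the low byte, the y increment in
     * the high byte, matching the uint16_t packing in the patch. */
    static uint16_t mvd_sum( const uint8_t *mvdleft, const uint8_t *mvdtop )
    {
        int amvd0 = mvdleft[0] + mvdtop[0];   /* at most 33 + 33 = 66 */
        int amvd1 = mvdleft[1] + mvdtop[1];
        amvd0 = (amvd0 > 2) + (amvd0 > 32);
        amvd1 = (amvd1 > 2) + (amvd1 > 32);
        return amvd0 + (amvd1 << 8);
    }

    int main( void )
    {
        /* Hypothetical neighboring MVDs: the large x delta (-90) is capped at
         * 33, yet the context increment is the same as for the uncapped value,
         * since 33 + 10 and 90 + 10 both exceed 32. */
        uint8_t left[2] = { clip_mvd( -90 ), clip_mvd( 1 ) };  /* { 33, 1 } */
        uint8_t top[2]  = { clip_mvd(  10 ), clip_mvd( 0 ) };  /* { 10, 0 } */
        uint16_t ctx = mvd_sum( left, top );
        printf( "x ctx inc = %d, y ctx inc = %d\n", ctx & 0xFF, ctx >> 8 );
        return 0;
    }

The capping bound is 33 rather than 32 because a stored 32 plus a zero neighbor would sum to exactly 32 and miss the ">32" threshold, whereas the true (larger) value would not; 33 is the smallest cap that preserves both comparisons for every possible neighbor.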