diff --git a/common/common.h b/common/common.h
index 8562aed87f8fbaa507e7100c19e8e13970daa4fc..661eda6d7a31dc277799285cda98e925d109d13d 100644
--- a/common/common.h
+++ b/common/common.h
@@ -171,13 +171,13 @@ static inline int x264_predictor_difference( int16_t (*mvc)[2], intptr_t i_mvc )
     return sum;
 }
 
-static inline uint32_t x264_cabac_amvd_sum( int16_t *mvdleft, int16_t *mvdtop )
+static inline uint16_t x264_cabac_mvd_sum( uint8_t *mvdleft, uint8_t *mvdtop )
 {
     int amvd0 = abs(mvdleft[0]) + abs(mvdtop[0]);
     int amvd1 = abs(mvdleft[1]) + abs(mvdtop[1]);
     amvd0 = (amvd0 > 2) + (amvd0 > 32);
     amvd1 = (amvd1 > 2) + (amvd1 > 32);
-    return amvd0 + (amvd1<<16);
+    return amvd0 + (amvd1<<8);
 }
 
 extern const uint8_t x264_exp2_lut[64];
@@ -527,7 +527,7 @@ struct x264_t
         uint8_t (*non_zero_count)[16+4+4];  /* nzc. for I_PCM set to 16 */
         int8_t  *chroma_pred_mode;          /* chroma_pred_mode. cabac only. for non intra I_PRED_CHROMA_DC(0) */
         int16_t (*mv[2])[2];                /* mb mv. set to 0 for intra mb */
-        int16_t (*mvd[2])[2];               /* mb mv difference with predict. set to 0 if intra. cabac only */
+        uint8_t (*mvd[2])[2];               /* absolute value of mb mv difference with predict, clipped to [0,33]. set to 0 if intra. cabac only */
         int8_t  *ref[2];                    /* mb ref. set to -1 if non used (intra or Lx only) */
         int16_t (*mvr[2][32])[2];           /* 16x16 mv for each possible ref */
         int8_t  *skipbp;                    /* block pattern for SKIP or DIRECT (sub)mbs. B-frames + cabac only */
@@ -621,7 +621,7 @@ struct x264_t
 
         /* 0 if not available */
         ALIGNED_16( int16_t mv[2][X264_SCAN8_SIZE][2] );
-        ALIGNED_8( int16_t mvd[2][X264_SCAN8_SIZE][2] );
+        ALIGNED_8( uint8_t mvd[2][X264_SCAN8_SIZE][2] );
 
         /* 1 if SKIP or DIRECT. set only for B-frames + CABAC */
         ALIGNED_4( int8_t skip[X264_SCAN8_SIZE] );
diff --git a/common/macroblock.c b/common/macroblock.c
index fc8c9c40fd994b0a4495e0ca31e8037e128f74ae..decc0319bea8f27f570a5f2f42f6333cf063967c 100644
--- a/common/macroblock.c
+++ b/common/macroblock.c
@@ -712,8 +712,8 @@ int x264_macroblock_cache_init( x264_t *h )
     if( h->param.b_cabac )
     {
         CHECKED_MALLOC( h->mb.chroma_pred_mode, i_mb_count * sizeof(int8_t) );
-        CHECKED_MALLOC( h->mb.mvd[0], 2*16 * i_mb_count * sizeof(int16_t) );
-        CHECKED_MALLOC( h->mb.mvd[1], 2*16 * i_mb_count * sizeof(int16_t) );
+        CHECKED_MALLOC( h->mb.mvd[0], 2*16 * i_mb_count * sizeof(uint8_t) );
+        CHECKED_MALLOC( h->mb.mvd[1], 2*16 * i_mb_count * sizeof(uint8_t) );
     }
 
     for( i=0; i<2; i++ )
@@ -1211,33 +1211,24 @@ void x264_macroblock_cache_load( x264_t *h, int i_mb_x, int i_mb_y )
         if( h->param.b_cabac )
         {
             if( i_top_type >= 0 )
-            {
-                const int i8 = x264_scan8[0] - 8;
-                const int iv = i_top_4x4;
-                CP64( h->mb.cache.mvd[i_list][i8+0], h->mb.mvd[i_list][iv+0] );
-                CP64( h->mb.cache.mvd[i_list][i8+2], h->mb.mvd[i_list][iv+2] );
-            }
+                CP64( h->mb.cache.mvd[i_list][x264_scan8[0] - 8], h->mb.mvd[i_list][i_top_4x4] );
             else
-            {
-                const int i8 = x264_scan8[0] - 8;
-                M64( h->mb.cache.mvd[i_list][i8+0] ) = 0;
-                M64( h->mb.cache.mvd[i_list][i8+2] ) = 0;
-            }
+                M64( h->mb.cache.mvd[i_list][x264_scan8[0] - 8] ) = 0;
 
             if( i_left_type >= 0 )
             {
                 const int i8 = x264_scan8[0] - 1;
                 const int iv = i_mb_4x4 - 1;
-                CP32( h->mb.cache.mvd[i_list][i8+0*8], h->mb.mvd[i_list][iv + 0*s4x4] );
-                CP32( h->mb.cache.mvd[i_list][i8+1*8], h->mb.mvd[i_list][iv + 1*s4x4] );
-                CP32( h->mb.cache.mvd[i_list][i8+2*8], h->mb.mvd[i_list][iv + 2*s4x4] );
-                CP32( h->mb.cache.mvd[i_list][i8+3*8], h->mb.mvd[i_list][iv + 3*s4x4] );
+                CP16( h->mb.cache.mvd[i_list][i8+0*8], h->mb.mvd[i_list][iv + 0*s4x4] );
+                CP16( h->mb.cache.mvd[i_list][i8+1*8], h->mb.mvd[i_list][iv + 1*s4x4] );
+                CP16( h->mb.cache.mvd[i_list][i8+2*8], h->mb.mvd[i_list][iv + 2*s4x4] );
+                CP16( h->mb.cache.mvd[i_list][i8+3*8], h->mb.mvd[i_list][iv + 3*s4x4] );
             }
             else
             {
                 const int i8 = x264_scan8[0] - 1;
                 for( i = 0; i < 4; i++ )
-                    M32( h->mb.cache.mvd[i_list][i8+i*8] ) = 0;
+                    M16( h->mb.cache.mvd[i_list][i8+i*8] ) = 0;
             }
         }
     }
@@ -1416,30 +1407,18 @@ void x264_macroblock_cache_save( x264_t *h )
         if( !IS_INTRA( i_mb_type ) && !IS_SKIP( i_mb_type ) && !IS_DIRECT( i_mb_type ) )
         {
             for( y = 0; y < 4; y++ )
-            {
-                CP64( h->mb.mvd[0][i_mb_4x4+y*s4x4+0], h->mb.cache.mvd[0][x264_scan8[0]+8*y+0] );
-                CP64( h->mb.mvd[0][i_mb_4x4+y*s4x4+2], h->mb.cache.mvd[0][x264_scan8[0]+8*y+2] );
-            }
+                CP64( h->mb.mvd[0][i_mb_4x4+y*s4x4], h->mb.cache.mvd[0][x264_scan8[0]+8*y] );
             if( h->sh.i_type == SLICE_TYPE_B )
                 for( y = 0; y < 4; y++ )
-                {
-                    CP64( h->mb.mvd[1][i_mb_4x4+y*s4x4+0], h->mb.cache.mvd[1][x264_scan8[0]+8*y+0] );
-                    CP64( h->mb.mvd[1][i_mb_4x4+y*s4x4+2], h->mb.cache.mvd[1][x264_scan8[0]+8*y+2] );
-                }
+                    CP64( h->mb.mvd[1][i_mb_4x4+y*s4x4], h->mb.cache.mvd[1][x264_scan8[0]+8*y] );
         }
         else
         {
             for( y = 0; y < 4; y++ )
-            {
-                M64( h->mb.mvd[0][i_mb_4x4+y*s4x4+0] ) = 0;
-                M64( h->mb.mvd[0][i_mb_4x4+y*s4x4+2] ) = 0;
-            }
+                M64( h->mb.mvd[0][i_mb_4x4+y*s4x4] ) = 0;
             if( h->sh.i_type == SLICE_TYPE_B )
                 for( y = 0; y < 4; y++ )
-                {
-                    M64( h->mb.mvd[1][i_mb_4x4+y*s4x4+0] ) = 0;
-                    M64( h->mb.mvd[1][i_mb_4x4+y*s4x4+2] ) = 0;
-                }
+                    M64( h->mb.mvd[1][i_mb_4x4+y*s4x4] ) = 0;
         }
 
         if( h->sh.i_type == SLICE_TYPE_B )
diff --git a/common/macroblock.h b/common/macroblock.h
index 086eec37c7c74fa9bcdd5d295bb65953fa8e213a..d173be4ded5a9fefc9c47c1dd1a8b7e94f162036 100644
--- a/common/macroblock.h
+++ b/common/macroblock.h
@@ -353,6 +353,33 @@ static ALWAYS_INLINE void x264_macroblock_cache_rect1( void *dst, int width, int
         if( height == 4 ) M16( d+6 ) = val2;
     }
 }
+static ALWAYS_INLINE void x264_macroblock_cache_rect2( void *dst, int width, int height, uint16_t val )
+{
+    uint16_t *d = dst;
+    uint32_t val32 = val + (val<<16);
+    uint64_t val64 = val32 + ((uint64_t)val32<<32);
+    if( width == 4 )
+    {
+        M64( d+ 0 ) = val64;
+        if( height >= 2 ) M64( d+ 8 ) = val64;
+        if( height == 4 ) M64( d+16 ) = val64;
+        if( height == 4 ) M64( d+24 ) = val64;
+    }
+    else if( width == 2 )
+    {
+        M32( d+ 0 ) = val32;
+        if( height >= 2 ) M32( d+ 8 ) = val32;
+        if( height == 4 ) M32( d+16 ) = val32;
+        if( height == 4 ) M32( d+24 ) = val32;
+    }
+    else //if( width == 1 )
+    {
+        M16( d+ 0 ) = val;
+        if( height >= 2 ) M16( d+ 8 ) = val;
+        if( height == 4 ) M16( d+16 ) = val;
+        if( height == 4 ) M16( d+24 ) = val;
+    }
+}
 static ALWAYS_INLINE void x264_macroblock_cache_rect4( void *dst, int width, int height, uint32_t val )
 {
     int dy;
@@ -383,9 +410,9 @@ static ALWAYS_INLINE void x264_macroblock_cache_mv( x264_t *h, int x, int y, int
 {
     x264_macroblock_cache_rect4( &h->mb.cache.mv[i_list][X264_SCAN8_0+x+8*y], width, height, mv );
 }
-static ALWAYS_INLINE void x264_macroblock_cache_mvd( x264_t *h, int x, int y, int width, int height, int i_list, uint32_t mv )
+static ALWAYS_INLINE void x264_macroblock_cache_mvd( x264_t *h, int x, int y, int width, int height, int i_list, uint16_t mv )
 {
-    x264_macroblock_cache_rect4( &h->mb.cache.mvd[i_list][X264_SCAN8_0+x+8*y], width, height, mv );
+    x264_macroblock_cache_rect2( &h->mb.cache.mvd[i_list][X264_SCAN8_0+x+8*y], width, height, mv );
 }
 static ALWAYS_INLINE void x264_macroblock_cache_ref( x264_t *h, int x, int y, int width, int height, int i_list, uint8_t ref )
 {
diff --git a/common/x86/util.h b/common/x86/util.h
index da8437a3d197d87d4682dbd7fffd5aa0bbd3ed10..7672f09e0711809c7ddf31a4287ccb8f5b7e8dca 100644
--- a/common/x86/util.h
+++ b/common/x86/util.h
@@ -77,32 +77,26 @@ static inline int x264_predictor_difference_mmxext( int16_t (*mvc)[2], intptr_t
     );
     return sum;
 }
-#define x264_cabac_amvd_sum x264_cabac_amvd_sum_mmxext
-static ALWAYS_INLINE uint32_t x264_cabac_amvd_sum_mmxext(int16_t *mvdleft, int16_t *mvdtop)
+#define x264_cabac_mvd_sum x264_cabac_mvd_sum_mmxext
+static ALWAYS_INLINE uint16_t x264_cabac_mvd_sum_mmxext(uint8_t *mvdleft, uint8_t *mvdtop)
 {
-    static const uint64_t pw_2    = 0x0002000200020002ULL;
-    static const uint64_t pw_28   = 0x001C001C001C001CULL;
-    static const uint64_t pw_2184 = 0x0888088808880888ULL;
-    /* MIN(((x+28)*2184)>>16,2) = (x>2) + (x>32) */
-    /* 2184 = fix16(1/30) */
-    uint32_t amvd;
+    static const uint64_t pb_2  = 0x0202020202020202ULL;
+    static const uint64_t pb_32 = 0x2020202020202020ULL;
+    int amvd;
     asm(
-        "movd         %1, %%mm0 \n"
-        "movd         %2, %%mm1 \n"
-        "pxor      %%mm2, %%mm2 \n"
-        "pxor      %%mm3, %%mm3 \n"
-        "psubw     %%mm0, %%mm2 \n"
-        "psubw     %%mm1, %%mm3 \n"
-        "pmaxsw    %%mm2, %%mm0 \n"
-        "pmaxsw    %%mm3, %%mm1 \n"
-        "paddw        %3, %%mm0 \n"
-        "paddw     %%mm1, %%mm0 \n"
-        "pmulhuw      %4, %%mm0 \n"
-        "pminsw       %5, %%mm0 \n"
-        "movd      %%mm0, %0    \n"
+        "movd         %1, %%mm0 \n"
+        "movd         %2, %%mm1 \n"
+        "paddb     %%mm1, %%mm0 \n"
+        "pxor      %%mm2, %%mm2 \n"
+        "movq      %%mm0, %%mm1 \n"
+        "pcmpgtb      %3, %%mm0 \n"
+        "pcmpgtb      %4, %%mm1 \n"
+        "psubb     %%mm0, %%mm2 \n"
+        "psubb     %%mm1, %%mm2 \n"
+        "movd      %%mm2, %0    \n"
        :"=r"(amvd)
-       :"m"(M32( mvdleft )),"m"(M32( mvdtop )),
-        "m"(pw_28),"m"(pw_2184),"m"(pw_2)
+       :"m"(M16( mvdleft )),"m"(M16( mvdtop )),
+        "m"(pb_2),"m"(pb_32)
     );
     return amvd;
 }
diff --git a/encoder/cabac.c b/encoder/cabac.c
index a2220c66f740cd3400d1256ed8c150a6b0a9c87d..dc1d1b8ac2a6f816cbea96c9ad5f6e1fe9359359 100644
--- a/encoder/cabac.c
+++ b/encoder/cabac.c
@@ -349,7 +349,7 @@ static void x264_cabac_mb_ref( x264_t *h, x264_cabac_t *cb, int i_list, int idx
     x264_cabac_encode_decision( cb, 54 + ctx, 0 );
 }
 
-static inline void x264_cabac_mb_mvd_cpn( x264_t *h, x264_cabac_t *cb, int i_list, int idx, int l, int mvd, int ctx )
+static inline int x264_cabac_mb_mvd_cpn( x264_t *h, x264_cabac_t *cb, int i_list, int idx, int l, int mvd, int ctx )
 {
     const int i_abs = abs( mvd );
     const int ctxbase = l ? 47 : 40;
@@ -408,32 +408,34 @@ static inline int x264_cabac_mb_mvd_cpn( x264_t *h, x264_cabac_t *cb, int i_list
         x264_cabac_encode_bypass( cb, mvd < 0 );
     }
 #endif
+    /* Since we don't need to keep track of MVDs larger than 33, just cap the value.
+     * This lets us store MVDs as 8-bit values instead of 16-bit. */
+    return X264_MIN( i_abs, 33 );
 }
 
-static NOINLINE uint32_t x264_cabac_mb_mvd( x264_t *h, x264_cabac_t *cb, int i_list, int idx, int width )
+static NOINLINE uint16_t x264_cabac_mb_mvd( x264_t *h, x264_cabac_t *cb, int i_list, int idx, int width )
 {
     ALIGNED_4( int16_t mvp[2] );
-    uint32_t amvd;
     int mdx, mdy;
 
     /* Calculate mvd */
    x264_mb_predict_mv( h, i_list, idx, width, mvp );
     mdx = h->mb.cache.mv[i_list][x264_scan8[idx]][0] - mvp[0];
     mdy = h->mb.cache.mv[i_list][x264_scan8[idx]][1] - mvp[1];
-    amvd = x264_cabac_amvd_sum(h->mb.cache.mvd[i_list][x264_scan8[idx] - 1],
-                               h->mb.cache.mvd[i_list][x264_scan8[idx] - 8]);
+    uint16_t amvd = x264_cabac_mvd_sum(h->mb.cache.mvd[i_list][x264_scan8[idx] - 1],
+                                       h->mb.cache.mvd[i_list][x264_scan8[idx] - 8]);
 
     /* encode */
-    x264_cabac_mb_mvd_cpn( h, cb, i_list, idx, 0, mdx, amvd&0xFFFF );
-    x264_cabac_mb_mvd_cpn( h, cb, i_list, idx, 1, mdy, amvd>>16 );
+    mdx = x264_cabac_mb_mvd_cpn( h, cb, i_list, idx, 0, mdx, amvd&0xFF );
+    mdy = x264_cabac_mb_mvd_cpn( h, cb, i_list, idx, 1, mdy, amvd>>8 );
 
-    return pack16to32_mask(mdx,mdy);
+    return pack8to16(mdx,mdy);
 }
 
 #define x264_cabac_mb_mvd(h,cb,i_list,idx,width,height)\
 do\
 {\
-    uint32_t mvd = x264_cabac_mb_mvd(h,cb,i_list,idx,width);\
+    uint16_t mvd = x264_cabac_mb_mvd(h,cb,i_list,idx,width);\
     x264_macroblock_cache_mvd( h, block_idx_x[idx], block_idx_y[idx], width, height, i_list, mvd );\
 } while(0)
 
diff --git a/encoder/me.c b/encoder/me.c
index 8972d459c59479e51c59ab409ffa9903e8cab8b1..5f29a640d5be513b65f0ed5a7f4484e5b842e105 100644
--- a/encoder/me.c
+++ b/encoder/me.c
@@ -1174,6 +1174,7 @@ void x264_me_refine_qpel_rd( x264_t *h, x264_me_t *m, int i_lambda2, int i4, int
     m->mv[0] = bmx;
     m->mv[1] = bmy;
     x264_macroblock_cache_mv ( h, block_idx_x[i4], block_idx_y[i4], bw>>2, bh>>2, i_list, pack16to32_mask(bmx, bmy) );
-    x264_macroblock_cache_mvd( h, block_idx_x[i4], block_idx_y[i4], bw>>2, bh>>2, i_list, pack16to32_mask(bmx - m->mvp[0], bmy - m->mvp[1]) );
+    uint16_t amvd = pack8to16(X264_MIN(abs(bmx - m->mvp[0]),33), X264_MIN(abs(bmy - m->mvp[1]),33));
+    x264_macroblock_cache_mvd( h, block_idx_x[i4], block_idx_y[i4], bw>>2, bh>>2, i_list, amvd );
     h->mb.b_skip_mc = 0;
 }