Commit 6371c3a5 authored by Fiona Glaser

x86: optimize and clean up predictor checking

Branchlessly handle elimination of candidates in MMX roundclip asm.
Add a new asm function, similar to roundclip, except without the round part.
Optimize and organize the C code, and make the subme>=3 and subme<3 paths consistent with each other.
Add lots of explanatory comments and try to make things a little more understandable.
~5-10% faster with subme>=3, ~15-20% faster with subme<3.
parent 00464065
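
In outline, the branchless elimination works by replacing the per-candidate branch with an unconditional store plus a data-dependent index increment: every candidate is written to the output, but the write position only advances when the candidate survives, so rejected entries are overwritten by the next store. A rough C sketch of the pattern (hypothetical helper name, not code from this commit):

```c
#include <stdint.h>

/* Branchless candidate compaction: each MV is stored unconditionally and
 * the output index advances by 0 or 1, so no branch depends on the data. */
static int compact_mvs( uint32_t *dst, const uint32_t *mvc, int i_mvc, uint32_t pmv )
{
    int cnt = 0;
    for( int i = 0; i < i_mvc; i++ )
    {
        uint32_t mv = mvc[i];
        int valid = (mv != 0) & (mv != pmv); /* 0 or 1 */
        dst[cnt] = mv;                       /* store regardless */
        cnt += valid;                        /* advance only if kept */
    }
    return cnt;
}
```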
@@ -291,17 +291,6 @@ static ALWAYS_INLINE uint16_t x264_cabac_mvd_sum( uint8_t *mvdleft, uint8_t *mvd
return amvd0 + (amvd1<<8);
}
static void ALWAYS_INLINE x264_predictor_roundclip( int16_t (*dst)[2], int16_t (*mvc)[2], int i_mvc, int mv_x_min, int mv_x_max, int mv_y_min, int mv_y_max )
{
for( int i = 0; i < i_mvc; i++ )
{
int mx = (mvc[i][0] + 2) >> 2;
int my = (mvc[i][1] + 2) >> 2;
dst[i][0] = x264_clip3( mx, mv_x_min, mv_x_max );
dst[i][1] = x264_clip3( my, mv_y_min, mv_y_max );
}
}
extern const uint8_t x264_exp2_lut[64];
extern const float x264_log2_lut[128];
extern const float x264_log2_lz_lut[32];
@@ -671,8 +660,7 @@ struct x264_t
int mv_miny_spel_row[3];
int mv_maxy_spel_row[3];
/* Fullpel MV range for motion search */
int mv_min_fpel[2];
int mv_max_fpel[2];
ALIGNED_8( int16_t mv_limit_fpel[2][2] ); /* min_x, min_y, max_x, max_y */
int mv_miny_fpel_row[3];
int mv_maxy_fpel_row[3];
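
Folding the separate mv_min_fpel/mv_max_fpel arrays into one 8-byte-aligned int16_t[2][2] is more than tidiness: all four bounds now sit in a single aligned qword, which the new MMX code below can fetch with one `movq (%2), %%mm5`. A small illustration of the layout (values invented):

```c
#include <stdio.h>
#include <stdint.h>
#include <string.h>

int main( void )
{
    /* Same layout as mv_limit_fpel: { {min_x, min_y}, {max_x, max_y} } */
    int16_t mv_limit_fpel[2][2] = { { -64, -32 }, { 64, 32 } };
    uint64_t all_four;
    memcpy( &all_four, mv_limit_fpel, sizeof( all_four ) ); /* one 8-byte load */
    printf( "all four limits in one qword: %#llx\n", (unsigned long long)all_four );
    return 0;
}
```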
@@ -952,6 +940,39 @@ struct x264_t
// included at the end because it needs x264_t
#include "macroblock.h"
static int ALWAYS_INLINE x264_predictor_roundclip( int16_t (*dst)[2], int16_t (*mvc)[2], int i_mvc, int16_t mv_limit[2][2], uint32_t pmv )
{
int cnt = 0;
for( int i = 0; i < i_mvc; i++ )
{
int mx = (mvc[i][0] + 2) >> 2;
int my = (mvc[i][1] + 2) >> 2;
uint32_t mv = pack16to32_mask(mx, my);
if( !mv || mv == pmv ) continue;
dst[cnt][0] = x264_clip3( mx, mv_limit[0][0], mv_limit[1][0] );
dst[cnt][1] = x264_clip3( my, mv_limit[0][1], mv_limit[1][1] );
cnt++;
}
return cnt;
}
static int ALWAYS_INLINE x264_predictor_clip( int16_t (*dst)[2], int16_t (*mvc)[2], int i_mvc, int16_t mv_limit[2][2], uint32_t pmv )
{
int cnt = 0;
int qpel_limit[4] = {mv_limit[0][0] << 2, mv_limit[0][1] << 2, mv_limit[1][0] << 2, mv_limit[1][1] << 2};
for( int i = 0; i < i_mvc; i++ )
{
uint32_t mv = M32( mvc[i] );
int mx = mvc[i][0];
int my = mvc[i][1];
if( !mv || mv == pmv ) continue;
dst[cnt][0] = x264_clip3( mx, qpel_limit[0], qpel_limit[2] );
dst[cnt][1] = x264_clip3( my, qpel_limit[1], qpel_limit[3] );
cnt++;
}
return cnt;
}
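
Both functions lean on packing an MV's two 16-bit components into one 32-bit word, so the zero-vector and pmv tests each cost a single integer compare. pack16to32_mask itself is defined elsewhere in x264's common headers; a standalone sketch of the test, with a local stand-in for that helper:

```c
#include <stdint.h>

/* Local stand-in for x264's pack16to32_mask, for illustration only. */
static uint32_t pack16to32_mask( int mx, int my )
{
    return ((uint32_t)mx << 16) | ((uint32_t)my & 0xFFFF);
}

/* One 32-bit compare tests both components at once. */
static int is_redundant( int mx, int my, uint32_t pmv )
{
    uint32_t mv = pack16to32_mask( mx, my );
    return !mv || mv == pmv; /* zero vector, or same as the MVP */
}
```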
#if ARCH_X86 || ARCH_X86_64
#include "x86/util.h"
#endif
@@ -121,42 +121,132 @@ static ALWAYS_INLINE uint16_t x264_cabac_mvd_sum_mmx2(uint8_t *mvdleft, uint8_t
return amvd;
}
#define x264_predictor_clip x264_predictor_clip_mmx2
static int ALWAYS_INLINE x264_predictor_clip_mmx2( int16_t (*dst)[2], int16_t (*mvc)[2], int i_mvc, int16_t mv_limit[2][2], uint32_t pmv )
{
static const uint32_t pd_32 = 0x20;
intptr_t tmp = (intptr_t)mv_limit, mvc_max = i_mvc, i = 0;
asm(
"movq (%2), %%mm5 \n"
"movd %6, %%mm3 \n"
"psllw $2, %%mm5 \n" // Convert to subpel
"pshufw $0xEE, %%mm5, %%mm6 \n"
"dec %k3 \n"
"jz 2f \n" // if( i_mvc == 1 ) {do the last iteration}
"punpckldq %%mm3, %%mm3 \n"
"punpckldq %%mm5, %%mm5 \n"
"movd %7, %%mm4 \n"
"lea (%0,%3,4), %3 \n"
"1: \n"
"movq (%0), %%mm0 \n"
"add $8, %0 \n"
"movq %%mm3, %%mm1 \n"
"pxor %%mm2, %%mm2 \n"
"pcmpeqd %%mm0, %%mm1 \n" // mv == pmv
"pcmpeqd %%mm0, %%mm2 \n" // mv == 0
"por %%mm1, %%mm2 \n" // (mv == pmv || mv == 0) * -1
"pmovmskb %%mm2, %k2 \n" // (mv == pmv || mv == 0) * 0xf
"pmaxsw %%mm5, %%mm0 \n"
"pminsw %%mm6, %%mm0 \n"
"pand %%mm4, %%mm2 \n" // (mv0 == pmv || mv0 == 0) * 32
"psrlq %%mm2, %%mm0 \n" // drop mv0 if it's skipped
"movq %%mm0, (%5,%4,4) \n"
"and $24, %k2 \n"
"add $2, %4 \n"
"add $8, %k2 \n"
"shr $4, %k2 \n" // (4-val)>>1
"sub %2, %4 \n" // +1 for each valid motion vector
"cmp %3, %0 \n"
"jl 1b \n"
"jg 3f \n" // if( i == i_mvc - 1 ) {do the last iteration}
/* Do the last iteration */
"2: \n"
"movd (%0), %%mm0 \n"
"pxor %%mm2, %%mm2 \n"
"pcmpeqd %%mm0, %%mm3 \n"
"pcmpeqd %%mm0, %%mm2 \n"
"por %%mm3, %%mm2 \n"
"pmovmskb %%mm2, %k2 \n"
"pmaxsw %%mm5, %%mm0 \n"
"pminsw %%mm6, %%mm0 \n"
"movd %%mm0, (%5,%4,4) \n"
"inc %4 \n"
"and $1, %k2 \n"
"sub %2, %4 \n" // output += !(mv == pmv || mv == 0)
"3: \n"
:"+r"(mvc), "=m"(M64( dst )), "+r"(tmp), "+r"(mvc_max), "+r"(i)
:"r"(dst), "g"(pmv), "m"(pd_32), "m"(M64( mvc ))
);
return i;
}
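
The count update is the subtle part of the loop above. pcmpeqd leaves each eliminated candidate as an all-ones dword, pmovmskb turns that into four mask bits per candidate, and `and $24` keeps exactly one bit per candidate (8 for the first, 16 for the second); `(k2 + 8) >> 4` then equals the number of eliminated vectors. Meanwhile pand/psrlq shift the second candidate down into the first slot when only the first was eliminated, so the surviving data is already compacted before the store. A scalar model of the counter arithmetic (hypothetical verification code, not part of the commit):

```c
#include <assert.h>

/* Reproduce the asm's "and $24; add $8; shr $4" skip counter and check
 * it matches the true number of eliminated candidates in the pair. */
int main( void )
{
    for( int skip0 = 0; skip0 <= 1; skip0++ )
        for( int skip1 = 0; skip1 <= 1; skip1++ )
        {
            int k2 = (skip0 ? 0x0F : 0) | (skip1 ? 0xF0 : 0); /* pmovmskb */
            k2 &= 24;           /* one bit per candidate: 8 and/or 16 */
            k2 = (k2 + 8) >> 4; /* 0, 1 or 2 eliminated vectors */
            assert( k2 == skip0 + skip1 );
        }
    return 0;
}
```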
/* Same as the above, except we do (mv + 2) >> 2 on the input. */
#define x264_predictor_roundclip x264_predictor_roundclip_mmx2
static void ALWAYS_INLINE x264_predictor_roundclip_mmx2( int16_t (*dst)[2], int16_t (*mvc)[2], int i_mvc, int mv_x_min, int mv_x_max, int mv_y_min, int mv_y_max )
static int ALWAYS_INLINE x264_predictor_roundclip_mmx2( int16_t (*dst)[2], int16_t (*mvc)[2], int i_mvc, int16_t mv_limit[2][2], uint32_t pmv )
{
uint32_t mv_min = pack16to32_mask( mv_x_min, mv_y_min );
uint32_t mv_max = pack16to32_mask( mv_x_max, mv_y_max );
static const uint64_t pw_2 = 0x0002000200020002ULL;
intptr_t i = i_mvc;
static const uint32_t pd_32 = 0x20;
intptr_t tmp = (intptr_t)mv_limit, mvc_max = i_mvc, i = 0;
asm(
"movd %2, %%mm5 \n"
"movd %3, %%mm6 \n"
"movq %4, %%mm7 \n"
"punpckldq %%mm5, %%mm5 \n"
"punpckldq %%mm6, %%mm6 \n"
"test $1, %0 \n"
"jz 1f \n"
"movd -4(%6,%0,4), %%mm0 \n"
"paddw %%mm7, %%mm0 \n"
"psraw $2, %%mm0 \n"
"pmaxsw %%mm5, %%mm0 \n"
"pminsw %%mm6, %%mm0 \n"
"movd %%mm0, -4(%5,%0,4) \n"
"dec %0 \n"
"jz 2f \n"
"1: \n"
"movq -8(%6,%0,4), %%mm0 \n"
"paddw %%mm7, %%mm0 \n"
"psraw $2, %%mm0 \n"
"pmaxsw %%mm5, %%mm0 \n"
"pminsw %%mm6, %%mm0 \n"
"movq %%mm0, -8(%5,%0,4) \n"
"sub $2, %0 \n"
"jnz 1b \n"
"2: \n"
:"+r"(i), "=m"(M64( dst ))
:"g"(mv_min), "g"(mv_max), "m"(pw_2), "r"(dst), "r"(mvc), "m"(M64( mvc ))
"movq (%2), %%mm5 \n"
"movq %6, %%mm7 \n"
"movd %7, %%mm3 \n"
"pshufw $0xEE, %%mm5, %%mm6 \n"
"dec %k3 \n"
"jz 2f \n"
"punpckldq %%mm3, %%mm3 \n"
"punpckldq %%mm5, %%mm5 \n"
"movd %8, %%mm4 \n"
"lea (%0,%3,4), %3 \n"
"1: \n"
"movq (%0), %%mm0 \n"
"add $8, %0 \n"
"paddw %%mm7, %%mm0 \n"
"psraw $2, %%mm0 \n"
"movq %%mm3, %%mm1 \n"
"pxor %%mm2, %%mm2 \n"
"pcmpeqd %%mm0, %%mm1 \n"
"pcmpeqd %%mm0, %%mm2 \n"
"por %%mm1, %%mm2 \n"
"pmovmskb %%mm2, %k2 \n"
"pmaxsw %%mm5, %%mm0 \n"
"pminsw %%mm6, %%mm0 \n"
"pand %%mm4, %%mm2 \n"
"psrlq %%mm2, %%mm0 \n"
"movq %%mm0, (%5,%4,4) \n"
"and $24, %k2 \n"
"add $2, %4 \n"
"add $8, %k2 \n"
"shr $4, %k2 \n"
"sub %2, %4 \n"
"cmp %3, %0 \n"
"jl 1b \n"
"jg 3f \n"
/* Do the last iteration */
"2: \n"
"movd (%0), %%mm0 \n"
"paddw %%mm7, %%mm0 \n"
"psraw $2, %%mm0 \n"
"pxor %%mm2, %%mm2 \n"
"pcmpeqd %%mm0, %%mm3 \n"
"pcmpeqd %%mm0, %%mm2 \n"
"por %%mm3, %%mm2 \n"
"pmovmskb %%mm2, %k2 \n"
"pmaxsw %%mm5, %%mm0 \n"
"pminsw %%mm6, %%mm0 \n"
"movd %%mm0, (%5,%4,4) \n"
"inc %4 \n"
"and $1, %k2 \n"
"sub %2, %4 \n"
"3: \n"
:"+r"(mvc), "=m"(M64( dst )), "+r"(tmp), "+r"(mvc_max), "+r"(i)
:"r"(dst), "m"(pw_2), "g"(pmv), "m"(pd_32), "m"(M64( mvc ))
);
return i;
}
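
Compared to x264_predictor_clip_mmx2, this version adds the paddw/psraw pair, which computes (mv + 2) >> 2 on four packed words at once, and drops the psllw that scaled the limits up to subpel, since here the inputs are rounded to fullpel first and the fullpel limits apply directly. An illustrative intrinsics equivalent of the rounding step (SSE2 shown for convenience; the actual code uses MMX registers):

```c
#include <emmintrin.h>

/* Round eight packed 16-bit subpel components to fullpel at once:
 * (mv + 2) >> 2 with an arithmetic shift, matching paddw + psraw. */
static __m128i round_to_fullpel( __m128i mv )
{
    return _mm_srai_epi16( _mm_add_epi16( mv, _mm_set1_epi16( 2 ) ), 2 );
}
```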
#endif
@@ -467,8 +467,8 @@ static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int qp )
if( max_mv > 0 && h->mb.i_mb_x < h->fdec->i_pir_start_col )
h->mb.mv_max_spel[0] = X264_MIN( h->mb.mv_max_spel[0], max_mv );
}
h->mb.mv_min_fpel[0] = (h->mb.mv_min_spel[0]>>2) + i_fpel_border;
h->mb.mv_max_fpel[0] = (h->mb.mv_max_spel[0]>>2) - i_fpel_border;
h->mb.mv_limit_fpel[0][0] = (h->mb.mv_min_spel[0]>>2) + i_fpel_border;
h->mb.mv_limit_fpel[1][0] = (h->mb.mv_max_spel[0]>>2) - i_fpel_border;
if( h->mb.i_mb_x == 0 && !(h->mb.i_mb_y & PARAM_INTERLACED) )
{
int mb_y = h->mb.i_mb_y >> SLICE_MBAFF;
@@ -516,8 +516,8 @@ static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int qp )
h->mb.mv_min_spel[1] = x264_clip3( h->mb.mv_min[1], -i_fmv_range, i_fmv_range );
h->mb.mv_max_spel[1] = CLIP_FMV( h->mb.mv_max[1] );
h->mb.mv_max_spel[1] = X264_MIN( h->mb.mv_max_spel[1], thread_mvy_range*4 );
h->mb.mv_min_fpel[1] = (h->mb.mv_min_spel[1]>>2) + i_fpel_border;
h->mb.mv_max_fpel[1] = (h->mb.mv_max_spel[1]>>2) - i_fpel_border;
h->mb.mv_limit_fpel[0][1] = (h->mb.mv_min_spel[1]>>2) + i_fpel_border;
h->mb.mv_limit_fpel[1][1] = (h->mb.mv_max_spel[1]>>2) - i_fpel_border;
}
}
if( PARAM_INTERLACED )
@@ -527,8 +527,8 @@ static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int qp )
h->mb.mv_max[1] = h->mb.mv_maxy_row[i];
h->mb.mv_min_spel[1] = h->mb.mv_miny_spel_row[i];
h->mb.mv_max_spel[1] = h->mb.mv_maxy_spel_row[i];
h->mb.mv_min_fpel[1] = h->mb.mv_miny_fpel_row[i];
h->mb.mv_max_fpel[1] = h->mb.mv_maxy_fpel_row[i];
h->mb.mv_limit_fpel[0][1] = h->mb.mv_miny_fpel_row[i];
h->mb.mv_limit_fpel[1][1] = h->mb.mv_maxy_fpel_row[i];
}
#undef CLIP_FMV
@@ -61,21 +61,22 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite
(p_cost_mvx[(mx)<<2] + p_cost_mvy[(my)<<2])
#define COST_MV( mx, my )\
do\
{\
int cost = h->pixf.fpelcmp[i_pixel]( p_fenc, FENC_STRIDE,\
&p_fref_w[(my)*stride+(mx)], stride )\
+ BITS_MVD(mx,my);\
COPY3_IF_LT( bcost, cost, bmx, mx, bmy, my );\
}
} while(0)
#define COST_MV_HPEL( mx, my ) \
{ \
intptr_t stride2 = 16; \
pixel *src = h->mc.get_ref( pix, &stride2, m->p_fref, stride, mx, my, bw, bh, &m->weight[0] ); \
int cost = h->pixf.fpelcmp[i_pixel]( p_fenc, FENC_STRIDE, src, stride2 ) \
+ p_cost_mvx[ mx ] + p_cost_mvy[ my ]; \
COPY3_IF_LT( bpred_cost, cost, bpred_mx, mx, bpred_my, my ); \
}
#define COST_MV_HPEL( mx, my, cost )\
do\
{\
intptr_t stride2 = 16;\
pixel *src = h->mc.get_ref( pix, &stride2, m->p_fref, stride, mx, my, bw, bh, &m->weight[0] );\
cost = h->pixf.fpelcmp[i_pixel]( p_fenc, FENC_STRIDE, src, stride2 )\
+ p_cost_mvx[ mx ] + p_cost_mvy[ my ];\
} while(0)
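
The switch from bare braces to do { ... } while(0) is the standard fix for multi-statement macros: it makes the expansion a single statement, so a trailing semicolon no longer breaks if/else nesting. A minimal demonstration (names invented):

```c
#include <stdio.h>

/* With bare braces, "if( x ) LOG_TWICE(...); else ..." would expand to
 * "{ ... };" -- the stray semicolon ends the if and orphans the else.
 * The do-while(0) form swallows the semicolon cleanly. */
#define LOG_TWICE( msg ) do { puts( msg ); puts( msg ); } while(0)

int main( void )
{
    int subpel = 1;
    if( subpel )
        LOG_TWICE( "subpel" );
    else
        LOG_TWICE( "fullpel" );
    return 0;
}
```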
#define COST_MV_X3_DIR( m0x, m0y, m1x, m1y, m2x, m2y, costs )\
{\
@@ -174,6 +175,9 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite
}\
}
#define FPEL(mv) (((mv)+2)>>2) /* Convert subpel MV to fullpel with rounding... */
#define SPEL(mv) ((mv)<<2) /* ... and the reverse. */
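
FPEL rounds to the nearest fullpel position and, thanks to the arithmetic right shift, behaves consistently for negative MVs too (assuming the usual two's-complement shift semantics, which x264's targets provide). A small self-check with a few quarter-pel values:

```c
#include <assert.h>

#define FPEL(mv) (((mv)+2)>>2) /* subpel -> fullpel with rounding */
#define SPEL(mv) ((mv)<<2)     /* fullpel -> subpel */

int main( void )
{
    assert( FPEL( 6 )  ==  2 ); /* 1.5 fullpel rounds up      */
    assert( FPEL( 5 )  ==  1 ); /* 1.25 rounds down           */
    assert( FPEL( -2 ) ==  0 ); /* -0.5 rounds toward zero/up */
    assert( FPEL( -3 ) == -1 ); /* -0.75 rounds to -1         */
    assert( SPEL( FPEL( 4 ) ) == 4 ); /* exact fullpel round-trips */
    return 0;
}
```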
void x264_me_search_ref( x264_t *h, x264_me_t *m, int16_t (*mvc)[2], int i_mvc, int *p_halfpel_thresh )
{
const int bw = x264_pixel_size[m->i_pixel].w;
@@ -181,95 +185,135 @@ void x264_me_search_ref( x264_t *h, x264_me_t *m, int16_t (*mvc)[2], int i_mvc,
const int i_pixel = m->i_pixel;
const int stride = m->i_stride[0];
int i_me_range = h->param.analyse.i_me_range;
int bmx, bmy, bcost;
int bmx, bmy, bcost = COST_MAX;
int bpred_mx = 0, bpred_my = 0, bpred_cost = COST_MAX;
int omx, omy, pmx, pmy;
pixel *p_fenc = m->p_fenc[0];
pixel *p_fref_w = m->p_fref_w;
ALIGNED_ARRAY_16( pixel, pix,[16*16] );
ALIGNED_ARRAY_8( int16_t, mvc_temp,[16],[2] );
int costs[16];
int mv_x_min = h->mb.mv_min_fpel[0];
int mv_y_min = h->mb.mv_min_fpel[1];
int mv_x_max = h->mb.mv_max_fpel[0];
int mv_y_max = h->mb.mv_max_fpel[1];
int mv_x_min_qpel = mv_x_min << 2;
int mv_y_min_qpel = mv_y_min << 2;
int mv_x_max_qpel = mv_x_max << 2;
int mv_y_max_qpel = mv_y_max << 2;
int mv_x_min = h->mb.mv_limit_fpel[0][0];
int mv_y_min = h->mb.mv_limit_fpel[0][1];
int mv_x_max = h->mb.mv_limit_fpel[1][0];
int mv_y_max = h->mb.mv_limit_fpel[1][1];
/* Special version of pack to allow shortcuts in CHECK_MVRANGE */
#define pack16to32_mask2(mx,my) ((mx<<16)|(my&0x7FFF))
uint32_t mv_min = pack16to32_mask2( -mv_x_min, -mv_y_min );
uint32_t mv_max = pack16to32_mask2( mv_x_max, mv_y_max )|0x8000;
uint32_t pmv;
#define CHECK_MVRANGE(mx,my) (!(((pack16to32_mask2(mx,my) + mv_min) | (mv_max - pack16to32_mask2(mx,my))) & 0x80004000))
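
CHECK_MVRANGE validates both coordinates against both bounds with two packed operations and one mask. Adding the negated minimums makes each in-range component non-negative, and subtracting from the maximums does the same for the upper bounds; any violation shows up as a set sign bit (bit 31 for the x field, bit 14 for the 15-bit y field), and the 0x8000 OR-ed into mv_max absorbs a y-field borrow before it can contaminate x. A brute-force self-check of the trick (hypothetical test harness, relying on two's-complement shifts just as x264 does):

```c
#include <assert.h>
#include <stdint.h>

#define pack16to32_mask2(mx,my) ((mx<<16)|(my&0x7FFF))

int main( void )
{
    int mv_x_min = -24, mv_x_max = 24, mv_y_min = -16, mv_y_max = 16;
    uint32_t mv_min = pack16to32_mask2( -mv_x_min, -mv_y_min );
    uint32_t mv_max = pack16to32_mask2( mv_x_max, mv_y_max )|0x8000;
#define CHECK_MVRANGE(mx,my) (!(((pack16to32_mask2(mx,my) + mv_min) | (mv_max - pack16to32_mask2(mx,my))) & 0x80004000))
    for( int mx = -64; mx <= 64; mx++ )
        for( int my = -64; my <= 64; my++ )
        {
            int in_range = mx >= mv_x_min && mx <= mv_x_max &&
                           my >= mv_y_min && my <= mv_y_max;
            assert( CHECK_MVRANGE( mx, my ) == in_range );
        }
    return 0;
}
```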
const uint16_t *p_cost_mvx = m->p_cost_mv - m->mvp[0];
const uint16_t *p_cost_mvy = m->p_cost_mv - m->mvp[1];
uint32_t pmv;
bmx = x264_clip3( m->mvp[0], mv_x_min_qpel, mv_x_max_qpel );
bmy = x264_clip3( m->mvp[1], mv_y_min_qpel, mv_y_max_qpel );
pmx = ( bmx + 2 ) >> 2;
pmy = ( bmy + 2 ) >> 2;
bcost = COST_MAX;
/* try extra predictors if provided */
/* Try extra predictors if provided. If subme >= 3, check subpel predictors,
* otherwise round them to fullpel. */
if( h->mb.i_subpel_refine >= 3 )
{
pmv = pack16to32_mask(bmx,bmy);
COST_MV_HPEL( bmx, bmy );
for( int i = 0; i < i_mvc; i++ )
/* Calculate and check the MVP first */
bpred_mx = x264_clip3( m->mvp[0], SPEL(mv_x_min), SPEL(mv_x_max) );
bpred_my = x264_clip3( m->mvp[1], SPEL(mv_y_min), SPEL(mv_y_max) );
pmv = pack16to32_mask( bpred_mx, bpred_my );
pmx = FPEL( bpred_mx );
pmy = FPEL( bpred_my );
COST_MV_HPEL( bpred_mx, bpred_my, bpred_cost );
int pmv_cost = bpred_cost;
if( i_mvc > 0 )
{
if( M32( mvc[i] ) && (pmv != M32( mvc[i] )) )
/* Clip MV candidates and eliminate those equal to zero and pmv. */
int valid_mvcs = x264_predictor_clip( mvc_temp+2, mvc, i_mvc, h->mb.mv_limit_fpel, pmv );
if( valid_mvcs > 0 )
{
int mx = x264_clip3( mvc[i][0], mv_x_min_qpel, mv_x_max_qpel );
int my = x264_clip3( mvc[i][1], mv_y_min_qpel, mv_y_max_qpel );
COST_MV_HPEL( mx, my );
int i = 1, cost;
/* We stuff pmv here to branchlessly pick between pmv and the various
* MV candidates. [0] gets skipped in order to maintain alignment for
* x264_predictor_clip. */
M32( mvc_temp[1] ) = pmv;
bpred_cost <<= 4;
do
{
int mx = mvc_temp[i+1][0];
int my = mvc_temp[i+1][1];
COST_MV_HPEL( mx, my, cost );
COPY1_IF_LT( bpred_cost, (cost << 4) + i );
} while( ++i <= valid_mvcs );
bpred_mx = mvc_temp[(bpred_cost&15)+1][0];
bpred_my = mvc_temp[(bpred_cost&15)+1][1];
bpred_cost >>= 4;
}
}
bmx = ( bpred_mx + 2 ) >> 2;
bmy = ( bpred_my + 2 ) >> 2;
COST_MV( bmx, bmy );
/* Round the best predictor back to fullpel and get the cost, since this is where
* we'll be starting the fullpel motion search. */
bmx = FPEL( bpred_mx );
bmy = FPEL( bpred_my );
if( (bpred_mx|bpred_my)&0x3 ) /* Only test if the tested predictor is actually subpel... */
COST_MV( bmx, bmy );
else /* Otherwise just copy the cost (we already know it) */
bcost = bpred_cost;
/* Test the zero vector if it hasn't been tested yet. */
if( pmv )
{
if( bmx|bmy ) COST_MV( 0, 0 );
}
/* If a subpel mv candidate was better than the zero vector, the previous
* fullpel check won't have gotten it even if the pmv was zero. So handle
* that possibility here. */
else
{
COPY3_IF_LT( bcost, pmv_cost, bmx, 0, bmy, 0 );
}
}
else
{
/* check the MVP */
bmx = pmx;
bmy = pmy;
/* Calculate and check the fullpel MVP first */
bmx = pmx = x264_clip3( FPEL(m->mvp[0]), mv_x_min, mv_x_max );
bmy = pmy = x264_clip3( FPEL(m->mvp[1]), mv_y_min, mv_y_max );
pmv = pack16to32_mask( bmx, bmy );
/* Because we are rounding the predicted motion vector to fullpel, there will be
* an extra MV cost in 15 out of 16 cases. However, when the predicted MV is
* chosen as the best predictor, it is often the case that the subpel search will
* result in a vector at or next to the predicted motion vector. Therefore, it is
* sensible to omit the cost of the MV from the rounded MVP to avoid unfairly
* biasing against use of the predicted motion vector. */
* result in a vector at or next to the predicted motion vector. Therefore, we omit
* the cost of the MV from the rounded MVP to avoid unfairly biasing against use of
* the predicted motion vector.
*
* Disclaimer: this is a post-hoc rationalization for why this hack works. */
bcost = h->pixf.fpelcmp[i_pixel]( p_fenc, FENC_STRIDE, &p_fref_w[bmy*stride+bmx], stride );
pmv = pack16to32_mask( bmx, bmy );
if( i_mvc > 0 )
{
ALIGNED_ARRAY_8( int16_t, mvc_fpel,[16],[2] );
x264_predictor_roundclip( mvc_fpel+2, mvc, i_mvc, mv_x_min, mv_x_max, mv_y_min, mv_y_max );
M32( mvc_fpel[1] ) = pmv;
bcost <<= 4;
for( int i = 1; i <= i_mvc; i++ )
/* Like in subme>=3, except we also round the candidates to fullpel. */
int valid_mvcs = x264_predictor_roundclip( mvc_temp+2, mvc, i_mvc, h->mb.mv_limit_fpel, pmv );
if( valid_mvcs > 0 )
{
if( M32( mvc_fpel[i+1] ) && (pmv != M32( mvc_fpel[i+1] )) )
int i = 1, cost;
M32( mvc_temp[1] ) = pmv;
bcost <<= 4;
do
{
int mx = mvc_fpel[i+1][0];
int my = mvc_fpel[i+1][1];
int cost = h->pixf.fpelcmp[i_pixel]( p_fenc, FENC_STRIDE, &p_fref_w[my*stride+mx], stride ) + BITS_MVD( mx, my );
cost = (cost << 4) + i;
COPY1_IF_LT( bcost, cost );
}
int mx = mvc_temp[i+1][0];
int my = mvc_temp[i+1][1];
cost = h->pixf.fpelcmp[i_pixel]( p_fenc, FENC_STRIDE, &p_fref_w[my*stride+mx], stride ) + BITS_MVD( mx, my );
COPY1_IF_LT( bcost, (cost << 4) + i );
} while( ++i <= valid_mvcs );
bmx = mvc_temp[(bcost&15)+1][0];
bmy = mvc_temp[(bcost&15)+1][1];
bcost >>= 4;
}
bmx = mvc_fpel[(bcost&15)+1][0];
bmy = mvc_fpel[(bcost&15)+1][1];
bcost >>= 4;
}
}
COST_MV( 0, 0 );
/* Same as above, except the condition is simpler. */
if( pmv )
COST_MV( 0, 0 );
}
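
Both branches above use the same packed-argmin idiom: the cost is shifted left by 4 and the candidate index is stuffed into the low bits, so one COPY1_IF_LT comparison tracks the best cost and its index simultaneously (which is also why at most 15 candidates plus the MVP fit). A minimal sketch with invented costs (COPY1_IF_LT is x264's conditional-copy macro; x86 builds also provide a cmov-based asm version):

```c
#include <stdio.h>

#define COPY1_IF_LT(x,y) if( (y) < (x) ) (x) = (y);

int main( void )
{
    /* Hypothetical candidate costs; index 0 is the MVP's cost. */
    int costs[5] = { 900, 850, 910, 700, 880 };
    int bcost = costs[0] << 4; /* cost in the high bits, index 0 low */
    for( int i = 1; i < 5; i++ )
        COPY1_IF_LT( bcost, (costs[i] << 4) + i );
    printf( "best index %d, best cost %d\n", bcost & 15, bcost >> 4 );
    /* prints: best index 3, best cost 700 */
    return 0;
}
```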
switch( h->mb.i_me_method )
{
@@ -733,8 +777,8 @@ void x264_me_search_ref( x264_t *h, x264_me_t *m, int16_t (*mvc)[2], int i_mvc,
}
else
{
m->mv[0] = bmx << 2;
m->mv[1] = bmy << 2;
m->mv[0] = SPEL(bmx);
m->mv[1] = SPEL(bmy);
m->cost = bcost;
}
@@ -472,16 +472,16 @@ static void x264_slicetype_mb_cost( x264_t *h, x264_mb_analysis_t *a,
goto lowres_intra_mb;
// no need for h->mb.mv_min[]
h->mb.mv_min_fpel[0] = -8*h->mb.i_mb_x - 4;
h->mb.mv_max_fpel[0] = 8*( h->mb.i_mb_width - h->mb.i_mb_x - 1 ) + 4;
h->mb.mv_min_spel[0] = 4*( h->mb.mv_min_fpel[0] - 8 );
h->mb.mv_max_spel[0] = 4*( h->mb.mv_max_fpel[0] + 8 );
h->mb.mv_limit_fpel[0][0] = -8*h->mb.i_mb_x - 4;
h->mb.mv_limit_fpel[1][0] = 8*( h->mb.i_mb_width - h->mb.i_mb_x - 1 ) + 4;
h->mb.mv_min_spel[0] = 4*( h->mb.mv_limit_fpel[0][0] - 8 );
h->mb.mv_max_spel[0] = 4*( h->mb.mv_limit_fpel[1][0] + 8 );
if( h->mb.i_mb_x >= h->mb.i_mb_width - 2 )
{
h->mb.mv_min_fpel[1] = -8*h->mb.i_mb_y - 4;
h->mb.mv_max_fpel[1] = 8*( h->mb.i_mb_height - h->mb.i_mb_y - 1 ) + 4;
h->mb.mv_min_spel[1] = 4*( h->mb.mv_min_fpel[1] - 8 );
h->mb.mv_max_spel[1] = 4*( h->mb.mv_max_fpel[1] + 8 );
h->mb.mv_limit_fpel[0][1] = -8*h->mb.i_mb_y - 4;
h->mb.mv_limit_fpel[1][1] = 8*( h->mb.i_mb_height - h->mb.i_mb_y - 1 ) + 4;
h->mb.mv_min_spel[1] = 4*( h->mb.mv_limit_fpel[0][1] - 8 );
h->mb.mv_max_spel[1] = 4*( h->mb.mv_limit_fpel[1][1] + 8 );
}
#define LOAD_HPELS_LUMA(dst, src) \