Commit 8d09ebe2 authored by Loren Merritt

satd exhaustive motion search (--me tesa)


git-svn-id: svn://svn.videolan.org/x264/trunk@728 df754926-b1dd-0310-bc7b-ec298dee348c
parent 12c833c8
......@@ -94,7 +94,7 @@ x264_frame_t *x264_frame_new( x264_t *h )
}
}
if( h->param.analyse.i_me_method == X264_ME_ESA )
if( h->param.analyse.i_me_method >= X264_ME_ESA )
{
CHECKED_MALLOC( frame->buffer[7],
2 * frame->i_stride[0] * (frame->i_lines[0] + 2*i_padv) * sizeof(uint16_t) );
......
......@@ -323,6 +323,45 @@ SAD_X( 8x16_vis )
SAD_X( 8x8_vis )
#endif
/****************************************************************************
 * pixel_satd_x3, pixel_satd_x4
 * Batched wrappers: score one encode block (read with FENC_STRIDE) against
 * 3 or 4 candidate blocks (read with i_stride) by calling the single-block
 * satd once per candidate and storing each result in scores[].
 * These are no faster than separate satd calls; they exist so that satd
 * provides the same x3/x4 entry points as sad and can replace it directly.
 ****************************************************************************/
#define SATD_X( dims, suffix ) \
static void x264_pixel_satd_x3_##dims##suffix( uint8_t *fenc,\
                                               uint8_t *pix0, uint8_t *pix1, uint8_t *pix2,\
                                               int i_stride, int scores[3] )\
{\
    scores[0] = x264_pixel_satd_##dims##suffix( fenc, FENC_STRIDE, pix0, i_stride );\
    scores[1] = x264_pixel_satd_##dims##suffix( fenc, FENC_STRIDE, pix1, i_stride );\
    scores[2] = x264_pixel_satd_##dims##suffix( fenc, FENC_STRIDE, pix2, i_stride );\
}\
static void x264_pixel_satd_x4_##dims##suffix( uint8_t *fenc,\
                                               uint8_t *pix0, uint8_t *pix1,\
                                               uint8_t *pix2, uint8_t *pix3,\
                                               int i_stride, int scores[4] )\
{\
    scores[0] = x264_pixel_satd_##dims##suffix( fenc, FENC_STRIDE, pix0, i_stride );\
    scores[1] = x264_pixel_satd_##dims##suffix( fenc, FENC_STRIDE, pix1, i_stride );\
    scores[2] = x264_pixel_satd_##dims##suffix( fenc, FENC_STRIDE, pix2, i_stride );\
    scores[3] = x264_pixel_satd_##dims##suffix( fenc, FENC_STRIDE, pix3, i_stride );\
}
/* Instantiate the satd x3/x4 wrappers for the five block sizes
 * 16x16, 16x8, 8x16, 8x8 and 8x4, using the given function-name suffix. */
#define SATD_X_DECL5( suffix ) \
    SATD_X( 16x16, suffix ) \
    SATD_X( 16x8,  suffix ) \
    SATD_X( 8x16,  suffix ) \
    SATD_X( 8x8,   suffix ) \
    SATD_X( 8x4,   suffix )
/* Instantiate the satd x3/x4 wrappers for all seven block sizes:
 * the five covered by SATD_X_DECL5 plus 4x8 and 4x4. */
#define SATD_X_DECL7( suffix ) \
    SATD_X_DECL5( suffix ) \
    SATD_X( 4x8, suffix ) \
    SATD_X( 4x4, suffix )
/* Empty suffix: wrappers around the unsuffixed satd functions
 * (presumably the plain C implementations), all seven sizes. */
SATD_X_DECL7()
#ifdef HAVE_MMX
/* mmxext wrappers cover all seven sizes; sse2 only the five sizes down to 8x4. */
SATD_X_DECL7( _mmxext )
SATD_X_DECL5( _sse2 )
#ifdef HAVE_SSE3
/* ssse3 wrappers (five sizes) are only generated when HAVE_SSE3 is defined. */
SATD_X_DECL5( _ssse3 )
#endif
#endif
/****************************************************************************
* structural similarity metric
****************************************************************************/
......@@ -487,6 +526,8 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
INIT7( sad_x4, );
INIT7( ssd, );
INIT7( satd, );
INIT7( satd_x3, );
INIT7( satd_x4, );
INIT4( sa8d, );
INIT_ADS( );
......@@ -505,6 +546,8 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
INIT7( sad_x3, _mmxext );
INIT7( sad_x4, _mmxext );
INIT7( satd, _mmxext );
INIT7( satd_x3, _mmxext );
INIT7( satd_x4, _mmxext );
INIT_ADS( _mmxext );
#ifdef ARCH_X86
......@@ -552,6 +595,8 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
INIT2( sad_x3, _sse2 );
INIT2( sad_x4, _sse2 );
INIT5( satd, _sse2 );
INIT5( satd_x3, _sse2 );
INIT5( satd_x4, _sse2 );
INIT_ADS( _sse2 );
#ifdef ARCH_X86
......@@ -588,6 +633,8 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
if( cpu&X264_CPU_SSSE3 )
{
INIT5( satd, _ssse3 );
INIT5( satd_x3, _ssse3 );
INIT5( satd_x4, _ssse3 );
INIT_ADS( _ssse3 );
#ifdef ARCH_X86_64
pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_ssse3;
......
......@@ -69,15 +69,19 @@ typedef struct
x264_pixel_cmp_t ssim[7];
x264_pixel_cmp_t sa8d[4];
x264_pixel_cmp_t mbcmp[7]; /* either satd or sad for subpel refine and mode decision */
x264_pixel_cmp_t rdcmp[7]; /* either ssd or ssim for rate-distortion */
x264_pixel_cmp_t fpelcmp[7]; /* either satd or sad for fullpel motion search */
x264_pixel_cmp_x3_t fpelcmp_x3[7];
x264_pixel_cmp_x4_t fpelcmp_x4[7];
void (*ssim_4x4x2_core)( const uint8_t *pix1, int stride1,
const uint8_t *pix2, int stride2, int sums[2][4] );
float (*ssim_end4)( int sum0[5][4], int sum1[5][4], int width );
/* multiple parallel calls to sad. */
/* multiple parallel calls to cmp. */
x264_pixel_cmp_x3_t sad_x3[7];
x264_pixel_cmp_x4_t sad_x4[7];
x264_pixel_cmp_x3_t satd_x3[7];
x264_pixel_cmp_x4_t satd_x4[7];
/* abs-diff-sum for successive elimination.
* may round width up to a multiple of 16. */
......
......@@ -190,7 +190,7 @@ static void x264_mb_analyse_load_costs( x264_t *h, x264_mb_analysis_t *a )
a->p_cost_mv = p_cost_mv[a->i_qp];
/* FIXME is this useful for all me methods? */
if( h->param.analyse.i_me_method == X264_ME_ESA && !x264_cost_mv_fpel[a->i_qp][0] )
if( h->param.analyse.i_me_method >= X264_ME_ESA && !x264_cost_mv_fpel[a->i_qp][0] )
{
for( j=0; j<4; j++ )
{
......
......@@ -363,7 +363,7 @@ static int x264_validate_parameters( x264_t *h )
if( h->param.b_interlaced )
{
if( h->param.analyse.i_me_method == X264_ME_ESA )
if( h->param.analyse.i_me_method >= X264_ME_ESA )
{
x264_log( h, X264_LOG_WARNING, "interlace + me=esa is not implemented\n" );
h->param.analyse.i_me_method = X264_ME_UMH;
......@@ -449,12 +449,15 @@ static int x264_validate_parameters( x264_t *h )
h->param.i_cqm_preset = X264_CQM_FLAT;
if( h->param.analyse.i_me_method < X264_ME_DIA ||
h->param.analyse.i_me_method > X264_ME_ESA )
h->param.analyse.i_me_method > X264_ME_TESA )
h->param.analyse.i_me_method = X264_ME_HEX;
if( h->param.analyse.i_me_range < 4 )
h->param.analyse.i_me_range = 4;
if( h->param.analyse.i_me_range > 16 && h->param.analyse.i_me_method <= X264_ME_HEX )
h->param.analyse.i_me_range = 16;
if( h->param.analyse.i_me_method == X264_ME_TESA &&
(h->mb.b_lossless || h->param.analyse.i_subpel_refine <= 1) )
h->param.analyse.i_me_method = X264_ME_ESA;
h->param.analyse.i_subpel_refine = x264_clip3( h->param.analyse.i_subpel_refine, 1, 7 );
h->param.analyse.b_bframe_rdo = h->param.analyse.b_bframe_rdo && h->param.analyse.i_subpel_refine >= 6;
h->param.analyse.b_mixed_references = h->param.analyse.b_mixed_references && h->param.i_frame_reference > 1;
......@@ -546,9 +549,12 @@ static int x264_validate_parameters( x264_t *h )
/* Fill the selectable comparison tables in h->pixf:
 *   mbcmp               - satd, unless lossless or subpel_refine <= 1, in which case sad
 *   fpelcmp, _x3, _x4   - satd only when the mbcmp condition holds AND the
 *                         motion estimation method is TESA; otherwise sad */
static void mbcmp_init( x264_t *h )
{
/* NOTE(review): this three-line copy selects the same table as the copy two
 * statements below and is overwritten by it; it looks like the pre-change
 * form of the statement shown alongside its replacement - confirm. */
memcpy( h->pixf.mbcmp,
( h->mb.b_lossless || h->param.analyse.i_subpel_refine <= 1 ) ? h->pixf.sad : h->pixf.satd,
sizeof(h->pixf.mbcmp) );
/* satd is 1 when the hadamard metric should be used for mbcmp */
int satd = !h->mb.b_lossless && h->param.analyse.i_subpel_refine > 1;
memcpy( h->pixf.mbcmp, satd ? h->pixf.satd : h->pixf.sad, sizeof(h->pixf.mbcmp) );
/* == binds tighter than &=, so this clears satd unless me_method is TESA */
satd &= h->param.analyse.i_me_method == X264_ME_TESA;
/* the single, x3 and x4 fullpel tables are all chosen by the same flag */
memcpy( h->pixf.fpelcmp, satd ? h->pixf.satd : h->pixf.sad, sizeof(h->pixf.fpelcmp) );
memcpy( h->pixf.fpelcmp_x3, satd ? h->pixf.satd_x3 : h->pixf.sad_x3, sizeof(h->pixf.fpelcmp_x3) );
memcpy( h->pixf.fpelcmp_x4, satd ? h->pixf.satd_x4 : h->pixf.sad_x4, sizeof(h->pixf.fpelcmp_x4) );
}
/****************************************************************************
......
......@@ -54,7 +54,7 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite
#define COST_MV( mx, my )\
{\
int cost = h->pixf.sad[i_pixel]( m->p_fenc[0], FENC_STRIDE,\
int cost = h->pixf.fpelcmp[i_pixel]( m->p_fenc[0], FENC_STRIDE,\
&p_fref[(my)*m->i_stride[0]+(mx)], m->i_stride[0] )\
+ BITS_MVD(mx,my);\
COPY3_IF_LT( bcost, cost, bmx, mx, bmy, my );\
......@@ -64,7 +64,7 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite
{ \
int stride = 16; \
uint8_t *src = h->mc.get_ref( pix, &stride, m->p_fref, m->i_stride[0], mx, my, bw, bh ); \
int cost = h->pixf.sad[i_pixel]( m->p_fenc[0], FENC_STRIDE, src, stride ) \
int cost = h->pixf.fpelcmp[i_pixel]( m->p_fenc[0], FENC_STRIDE, src, stride ) \
+ p_cost_mvx[ mx ] + p_cost_mvy[ my ]; \
COPY3_IF_LT( bpred_cost, cost, bpred_mx, mx, bpred_my, my ); \
}
......@@ -72,7 +72,7 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite
#define COST_MV_X3_DIR( m0x, m0y, m1x, m1y, m2x, m2y, costs )\
{\
uint8_t *pix_base = p_fref + bmx + bmy*m->i_stride[0];\
h->pixf.sad_x3[i_pixel]( m->p_fenc[0],\
h->pixf.fpelcmp_x3[i_pixel]( m->p_fenc[0],\
pix_base + (m0x) + (m0y)*m->i_stride[0],\
pix_base + (m1x) + (m1y)*m->i_stride[0],\
pix_base + (m2x) + (m2y)*m->i_stride[0],\
......@@ -85,7 +85,7 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite
#define COST_MV_X4( m0x, m0y, m1x, m1y, m2x, m2y, m3x, m3y )\
{\
uint8_t *pix_base = p_fref + omx + omy*m->i_stride[0];\
h->pixf.sad_x4[i_pixel]( m->p_fenc[0],\
h->pixf.fpelcmp_x4[i_pixel]( m->p_fenc[0],\
pix_base + (m0x) + (m0y)*m->i_stride[0],\
pix_base + (m1x) + (m1y)*m->i_stride[0],\
pix_base + (m2x) + (m2y)*m->i_stride[0],\
......@@ -103,7 +103,7 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite
#define COST_MV_X3_ABS( m0x, m0y, m1x, m1y, m2x, m2y )\
{\
h->pixf.sad_x3[i_pixel]( m->p_fenc[0],\
h->pixf.fpelcmp_x3[i_pixel]( m->p_fenc[0],\
p_fref + (m0x) + (m0y)*m->i_stride[0],\
p_fref + (m1x) + (m1y)*m->i_stride[0],\
p_fref + (m2x) + (m2y)*m->i_stride[0],\
......@@ -450,6 +450,7 @@ me_hex2:
}
case X264_ME_ESA:
case X264_ME_TESA:
{
const int min_x = X264_MAX( bmx - i_me_range, mv_x_min );
const int min_y = X264_MAX( bmy - i_me_range, mv_y_min );
......@@ -488,16 +489,101 @@ me_hex2:
if( i_pixel == PIXEL_8x16 || i_pixel == PIXEL_4x8 )
enc_dc[1] = enc_dc[2];
for( my = min_y; my <= max_y; my++ )
if( h->mb.i_me_method == X264_ME_TESA )
{
// ADS threshold, then SAD threshold, then keep the best few SADs, then SATD
typedef struct {
int sad;
int16_t mx, my;
} mvsad_t;
mvsad_t *mvsads = x264_malloc( width*(max_y-min_y+1)*sizeof(mvsad_t) );
int nmvsad = 0, limit;
int sad_thresh = i_me_range <= 16 ? 10 : i_me_range <= 24 ? 11 : 12;
int bsad = h->pixf.sad[i_pixel]( m->p_fenc[0], FENC_STRIDE, p_fref+bmy*stride+bmx, stride )
+ BITS_MVD( bmx, bmy );
for( my = min_y; my <= max_y; my++ )
{
int ycost = p_cost_mvy[my<<2];
bsad -= ycost;
xn = h->pixf.ads[i_pixel]( enc_dc, sums_base + min_x + my * stride, delta,
cost_fpel_mvx+min_x, xs, width, bsad*17/16 );
for( i=0; i<xn-2; i+=3 )
{
uint8_t *ref = p_fref+min_x+my*stride;
int sads[3];
h->pixf.sad_x3[i_pixel]( m->p_fenc[0], ref+xs[i], ref+xs[i+1], ref+xs[i+2], stride, sads );
for( j=0; j<3; j++ )
{
int sad = sads[j] + cost_fpel_mvx[xs[i+j]];
if( sad < bsad*sad_thresh>>3 )
{
COPY1_IF_LT( bsad, sad );
mvsads[nmvsad].sad = sad + ycost;
mvsads[nmvsad].mx = min_x+xs[i+j];
mvsads[nmvsad].my = my;
nmvsad++;
}
}
}
for( ; i<xn; i++ )
{
int mx = min_x+xs[i];
int sad = h->pixf.sad[i_pixel]( m->p_fenc[0], FENC_STRIDE, p_fref+mx+my*stride, stride )
+ cost_fpel_mvx[xs[i]];
if( sad < bsad*sad_thresh>>3 )
{
COPY1_IF_LT( bsad, sad );
mvsads[nmvsad].sad = sad + ycost;
mvsads[nmvsad].mx = mx;
mvsads[nmvsad].my = my;
nmvsad++;
}
}
bsad += ycost;
}
limit = i_me_range / 2;
if( nmvsad > limit*2 )
{
// halve the range if the domain is too large... eh, close enough
bsad = bsad*(sad_thresh+8)>>4;
for( i=0; i<nmvsad && mvsads[i].sad <= bsad; i++ );
for( j=i; j<nmvsad; j++ )
if( mvsads[j].sad <= bsad )
mvsads[i++] = mvsads[j];
nmvsad = i;
}
if( nmvsad > limit )
{
for( i=0; i<limit; i++ )
{
int bj = i;
int bsad = mvsads[bj].sad;
for( j=i+1; j<nmvsad; j++ )
COPY2_IF_LT( bsad, mvsads[j].sad, bj, j );
if( bj > i )
XCHG( mvsad_t, mvsads[i], mvsads[bj] );
}
nmvsad = limit;
}
for( i=0; i<nmvsad; i++ )
COST_MV( mvsads[i].mx, mvsads[i].my );
x264_free( mvsads );
}
else
{
bcost -= p_cost_mvy[my<<2];
xn = h->pixf.ads[i_pixel]( enc_dc, sums_base + min_x + my * stride, delta,
cost_fpel_mvx+min_x, xs, width, bcost );
for( i=0; i<xn-2; i+=3 )
COST_MV_X3_ABS( min_x+xs[i],my, min_x+xs[i+1],my, min_x+xs[i+2],my );
bcost += p_cost_mvy[my<<2];
for( ; i<xn; i++ )
COST_MV( min_x+xs[i], my );
// just ADS and SAD
for( my = min_y; my <= max_y; my++ )
{
bcost -= p_cost_mvy[my<<2];
xn = h->pixf.ads[i_pixel]( enc_dc, sums_base + min_x + my * stride, delta,
cost_fpel_mvx+min_x, xs, width, bcost );
for( i=0; i<xn-2; i+=3 )
COST_MV_X3_ABS( min_x+xs[i],my, min_x+xs[i+1],my, min_x+xs[i+2],my );
bcost += p_cost_mvy[my<<2];
for( ; i<xn; i++ )
COST_MV( min_x+xs[i], my );
}
}
if( xs != xs_buf )
......@@ -553,7 +639,7 @@ void x264_me_refine_qpel( x264_t *h, x264_me_t *m )
{ \
int stride = 16; \
uint8_t *src = h->mc.get_ref( pix[0], &stride, m->p_fref, m->i_stride[0], mx, my, bw, bh ); \
int cost = h->pixf.sad[i_pixel]( m->p_fenc[0], FENC_STRIDE, src, stride ) \
int cost = h->pixf.fpelcmp[i_pixel]( m->p_fenc[0], FENC_STRIDE, src, stride ) \
+ p_cost_mvx[ mx ] + p_cost_mvy[ my ]; \
COPY3_IF_LT( bcost, cost, bmx, mx, bmy, my ); \
}
......@@ -623,7 +709,7 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite
src2 = h->mc.get_ref( pix[1], &stride, m->p_fref, m->i_stride[0], omx-2, omy, bw+4, bh );
src1 = src0 + stride;
src3 = src2 + 1;
h->pixf.sad_x4[i_pixel]( m->p_fenc[0], src0, src1, src2, src3, stride, costs );
h->pixf.fpelcmp_x4[i_pixel]( m->p_fenc[0], src0, src1, src2, src3, stride, costs );
COPY2_IF_LT( bcost, costs[0] + p_cost_mvx[omx ] + p_cost_mvy[omy-2], bmy, omy-2 );
COPY2_IF_LT( bcost, costs[1] + p_cost_mvx[omx ] + p_cost_mvy[omy+2], bmy, omy+2 );
COPY3_IF_LT( bcost, costs[2] + p_cost_mvx[omx-2] + p_cost_mvy[omy ], bmx, omx-2, bmy, omy );
......
......@@ -225,7 +225,8 @@ static void Help( x264_param_t *defaults, int b_longhelp )
H1( " - dia: diamond search, radius 1 (fast)\n"
" - hex: hexagonal search, radius 2\n"
" - umh: uneven multi-hexagon search\n"
" - esa: exhaustive search (slow)\n" );
" - esa: exhaustive search\n"
" - tesa: hadamard exhaustive search (slow)\n" );
else H0( " - dia, hex, umh\n" );
H0( " --merange <integer> Maximum motion vector search range [%d]\n", defaults->analyse.i_me_range );
H1( " --mvrange <integer> Maximum motion vector length [-1 (auto)]\n" );
......
......@@ -74,6 +74,7 @@ typedef struct x264_t x264_t;
#define X264_ME_HEX 1
#define X264_ME_UMH 2
#define X264_ME_ESA 3
#define X264_ME_TESA 4
#define X264_CQM_FLAT 0
#define X264_CQM_JVT 1
#define X264_CQM_CUSTOM 2
......@@ -83,7 +84,7 @@ typedef struct x264_t x264_t;
#define X264_RC_ABR 2
static const char * const x264_direct_pred_names[] = { "none", "spatial", "temporal", "auto", 0 };
static const char * const x264_motion_est_names[] = { "dia", "hex", "umh", "esa", 0 };
static const char * const x264_motion_est_names[] = { "dia", "hex", "umh", "esa", "tesa", 0 };
static const char * const x264_overscan_names[] = { "undef", "show", "crop", 0 };
static const char * const x264_vidformat_names[] = { "component", "pal", "ntsc", "secam", "mac", "undef", 0 };
static const char * const x264_fullrange_names[] = { "off", "on", 0 };
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment