Commit 48e28864 authored by Laurent Aimar

* all: Patches by Loren Merritt:

"Improved patch. Now supports subpel ME on all candidate MB types,
not just on the winner.

subpel_refine: (completely different scale from before)
0 => halfpel only
1 => 1 iteration of qpel on the winner (same as x264 r46)
2 => 2 iterations of qpel (about the same as my earlier patch, but faster)
3 => halfpel on all MB types, qpel on the winner
4 => qpel on all
5 => more iterations

benchmarks:
mencoder dvd://1 -ovc x264 -x264encopts
qp_constant=19:fullinter:cabac:iframe=200:psnr

subpel_refine=1:  PSNR Global:46.82 kb/s:1048.1 fps:17.335
subpel_refine=2:  PSNR Global:46.83 kb/s:1034.4 fps:16.970
subpel_refine=3:  PSNR Global:46.84 kb/s:1023.3 fps:14.770
subpel_refine=4:  PSNR Global:46.87 kb/s:1010.8 fps:11.598
subpel_refine=5:  PSNR Global:46.88 kb/s:1006.9 fps:10.824"

 And

"The current code for calculating the cost of encoding which reference
frame a MB is predicted from introduces a bias towards ref0 and
against P16x16.
Removing this bias produces an improvement of .4% - 2% bitrate,
depending on content and number of reference frames."



git-svn-id: svn://svn.videolan.org/x264/trunk@47 df754926-b1dd-0310-bc7b-ec298dee348c
parent f9bd35a3
......@@ -96,6 +96,7 @@ void x264_param_default( x264_param_t *param )
/* */
param->analyse.intra = X264_ANALYSE_I4x4;
param->analyse.inter = X264_ANALYSE_I4x4 | X264_ANALYSE_PSUB16x16;
param->analyse.i_subpel_refine = 1;
param->analyse.b_psnr = 1;
}
......
......@@ -25,6 +25,7 @@
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <limits.h>
#include "../core/common.h"
#include "../core/macroblock.h"
......@@ -464,15 +465,8 @@ static void x264_mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a )
// m.mvc[0] = 0;
// m.mvc[1] = 0;
/* ME for ref 0 */
m.p_fref = h->mb.pic.p_fref[0][0][0];
x264_mb_predict_mv_16x16( h, 0, 0, m.mvp );
x264_me_search( h, &m );
a->l0.i_ref = 0;
a->l0.me16x16 = m;
for( i_ref = 1; i_ref < h->i_ref0; i_ref++ )
a->l0.me16x16.cost = INT_MAX;
for( i_ref = 0; i_ref < h->i_ref0; i_ref++ )
{
/* search with ref */
m.p_fref = h->mb.pic.p_fref[0][i_ref][0];
......@@ -489,6 +483,9 @@ static void x264_mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a )
}
}
/* subtract ref cost, so we don't have to add it for the other P types */
a->l0.me16x16.cost -= m.lm * bs_size_te( h->sh.i_num_ref_idx_l0_active - 1, a->l0.i_ref );
/* Set global ref, needed for all others modes */
x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.i_ref );
}
......@@ -765,15 +762,9 @@ static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
m.b_mvc = 0;
m.i_mv_range = a->i_mv_range;
/* ME for List 0 ref 0 */
m.p_fref = h->mb.pic.p_fref[0][0][0];
x264_mb_predict_mv_16x16( h, 0, 0, m.mvp );
x264_me_search( h, &m );
a->l0.i_ref = 0;
a->l0.me16x16 = m;
for( i_ref = 1; i_ref < h->i_ref0; i_ref++ )
/* ME for List 0 */
a->l0.me16x16.cost = INT_MAX;
for( i_ref = 0; i_ref < h->i_ref0; i_ref++ )
{
/* search with ref */
m.p_fref = h->mb.pic.p_fref[0][i_ref][0];
......@@ -790,15 +781,9 @@ static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
}
}
/* ME for list 1 ref 0 */
m.p_fref = h->mb.pic.p_fref[1][0][0];
x264_mb_predict_mv_16x16( h, 1, 0, m.mvp );
x264_me_search( h, &m );
a->l1.i_ref = 0;
a->l1.me16x16 = m;
for( i_ref = 1; i_ref < h->i_ref1; i_ref++ )
/* ME for list 1 */
a->l1.me16x16.cost = INT_MAX;
for( i_ref = 0; i_ref < h->i_ref1; i_ref++ )
{
/* search with ref */
m.p_fref = h->mb.pic.p_fref[1][i_ref][0];
......
......@@ -347,6 +347,11 @@ x264_t *x264_encoder_open ( x264_param_t *param )
h->param.i_cabac_init_idc = x264_clip3( h->param.i_cabac_init_idc, -1, 2 );
if( param->analyse.i_subpel_refine < 0 )
param->analyse.i_subpel_refine = 0;
if( param->analyse.i_subpel_refine > 5 )
param->analyse.i_subpel_refine = 5;
/* VUI */
if( h->param.vui.i_sar_width > 0 && h->param.vui.i_sar_height > 0 )
{
......
......@@ -28,6 +28,20 @@
#include "../core/common.h"
#include "me.h"
/* Presets selected from good points on the speed-vs-quality curve of several test videos.
 *
 * subpel_iterations[i_subpel_refine] = { refine_hpel, refine_qpel, me_hpel, me_qpel }
 *
 * me_*     : number of EPZS iterations run on all candidate block types
 *            (columns 2 and 3, read by x264_me_search).
 * refine_* : number of iterations run only on the winner
 *            (columns 0 and 1, read by x264_me_refine_qpel).
 *
 * The table has one row per subpel_refine level, so the only valid row
 * indices are 0..5; the index is used unchecked by its readers. */
static const int subpel_iterations[][4] =
{
    {1,0,0,0},  /* 0: halfpel refinement on the winner only */
    {1,1,0,0},  /* 1: + 1 qpel iteration on the winner */
    {1,2,0,0},  /* 2: + 2 qpel iterations on the winner */
    {0,2,1,0},  /* 3: halfpel on all candidates, qpel on the winner */
    {0,2,1,1},  /* 4: halfpel + 1 qpel iteration on all candidates */
    {0,2,1,2}   /* 5: halfpel + 2 qpel iterations on all candidates */
};
static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_iters );
void x264_me_search( x264_t *h, x264_me_t *m )
{
const int i_pixel = m->i_pixel;
......@@ -35,6 +49,7 @@ void x264_me_search( x264_t *h, x264_me_t *m )
int bmx, bmy;
uint8_t *p_fref = m->p_fref;
int i_iter;
int hpel, qpel;
/* init with mvp */
......@@ -118,9 +133,22 @@ void x264_me_search( x264_t *h, x264_me_t *m )
m->cost = h->pixf.satd[i_pixel]( m->p_fenc, m->i_stride, p_fref, m->i_stride ) +
m->lm * ( bs_size_se( m->mv[0] - m->mvp[0] ) +
bs_size_se( m->mv[1] - m->mvp[1] ) );
hpel = subpel_iterations[h->param.analyse.i_subpel_refine][2];
qpel = subpel_iterations[h->param.analyse.i_subpel_refine][3];
if( hpel || qpel )
refine_subpel( h, m, hpel, qpel );
}
/* Sub-pixel refinement of the winning candidate.
 *
 * Looks up the "winner only" iteration counts (columns 0 and 1 of
 * subpel_iterations) for the configured i_subpel_refine level and hands
 * them to refine_subpel.  Does nothing when both counts are zero.
 *
 * h: encoder context; only h->param.analyse.i_subpel_refine is read here.
 * m: motion-estimation state, passed through to refine_subpel. */
void x264_me_refine_qpel( x264_t *h, x264_me_t *m )
{
    const int *preset = subpel_iterations[h->param.analyse.i_subpel_refine];
    const int n_hpel = preset[0];
    const int n_qpel = preset[1];

    /* nothing to refine at this level */
    if( !n_hpel && !n_qpel )
        return;

    refine_subpel( h, m, n_hpel, n_qpel );
}
static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_iters )
{
const int bw = x264_pixel_size[m->i_pixel].w;
const int bh = x264_pixel_size[m->i_pixel].h;
......@@ -128,66 +156,47 @@ void x264_me_refine_qpel( x264_t *h, x264_me_t *m )
DECLARE_ALIGNED( uint8_t, pix[4][16*16], 16 );
int cost[4];
int best;
int step, i;
int bmx = m->mv[0];
int bmy = m->mv[1];
h->mc[MC_LUMA]( m->p_fref, m->i_stride, pix[0], 16, bmx + 0, bmy - 2, bw, bh );
h->mc[MC_LUMA]( m->p_fref, m->i_stride, pix[1], 16, bmx + 0, bmy + 2, bw, bh );
h->mc[MC_LUMA]( m->p_fref, m->i_stride, pix[2], 16, bmx - 2, bmy + 0, bw, bh );
h->mc[MC_LUMA]( m->p_fref, m->i_stride, pix[3], 16, bmx + 2, bmy + 0, bw, bh );
cost[0] = h->pixf.satd[m->i_pixel]( m->p_fenc, m->i_stride, pix[0], 16 ) +
m->lm * ( bs_size_se( bmx + 0 - m->mvp[0] ) + bs_size_se( bmy - 2 - m->mvp[1] ) );
cost[1] = h->pixf.satd[m->i_pixel]( m->p_fenc, m->i_stride, pix[1], 16 ) +
m->lm * ( bs_size_se( bmx + 0 - m->mvp[0] ) + bs_size_se( bmy + 2 - m->mvp[1] ) );
cost[2] = h->pixf.satd[m->i_pixel]( m->p_fenc, m->i_stride, pix[2], 16 ) +
m->lm * ( bs_size_se( bmx - 2 - m->mvp[0] ) + bs_size_se( bmy + 0 - m->mvp[1] ) );
cost[3] = h->pixf.satd[m->i_pixel]( m->p_fenc, m->i_stride, pix[3], 16 ) +
m->lm * ( bs_size_se( bmx + 2 - m->mvp[0] ) + bs_size_se( bmy + 0 - m->mvp[1] ) );
best = 0;
if( cost[1] < cost[0] ) best = 1;
if( cost[2] < cost[best] ) best = 2;
if( cost[3] < cost[best] ) best = 3;
if( cost[best] < m->cost )
for( step = 2; step >= 1; step-- )
{
m->cost = cost[best];
if( best == 0 ) bmy -= 2;
else if( best == 1 ) bmy += 2;
else if( best == 2 ) bmx -= 2;
else if( best == 3 ) bmx += 2;
}
h->mc[MC_LUMA]( m->p_fref, m->i_stride, pix[0], 16, bmx + 0, bmy - 1, bw, bh );
h->mc[MC_LUMA]( m->p_fref, m->i_stride, pix[1], 16, bmx + 0, bmy + 1, bw, bh );
h->mc[MC_LUMA]( m->p_fref, m->i_stride, pix[2], 16, bmx - 1, bmy + 0, bw, bh );
h->mc[MC_LUMA]( m->p_fref, m->i_stride, pix[3], 16, bmx + 1, bmy + 0, bw, bh );
cost[0] = h->pixf.satd[m->i_pixel]( m->p_fenc, m->i_stride, pix[0], 16 ) +
m->lm * ( bs_size_se( bmx + 0 - m->mvp[0] ) + bs_size_se( bmy - 1 - m->mvp[1] ) );
cost[1] = h->pixf.satd[m->i_pixel]( m->p_fenc, m->i_stride, pix[1], 16 ) +
m->lm * ( bs_size_se( bmx + 0 - m->mvp[0] ) + bs_size_se( bmy + 1 - m->mvp[1] ) );
cost[2] = h->pixf.satd[m->i_pixel]( m->p_fenc, m->i_stride, pix[2], 16 ) +
m->lm * ( bs_size_se( bmx - 1 - m->mvp[0] ) + bs_size_se( bmy + 0 - m->mvp[1] ) );
cost[3] = h->pixf.satd[m->i_pixel]( m->p_fenc, m->i_stride, pix[3], 16 ) +
m->lm * ( bs_size_se( bmx + 1 - m->mvp[0] ) + bs_size_se( bmy + 0 - m->mvp[1] ) );
best = 0;
if( cost[1] < cost[0] ) best = 1;
if( cost[2] < cost[best] ) best = 2;
if( cost[3] < cost[best] ) best = 3;
if( cost[best] < m->cost )
{
m->cost = cost[best];
if( best == 0 ) bmy--;
else if( best == 1 ) bmy++;
else if( best == 2 ) bmx--;
else if( best == 3 ) bmx++;
for( i = step>1 ? hpel_iters : qpel_iters; i > 0; i-- )
{
h->mc[MC_LUMA]( m->p_fref, m->i_stride, pix[0], 16, bmx + 0, bmy - step, bw, bh );
h->mc[MC_LUMA]( m->p_fref, m->i_stride, pix[1], 16, bmx + 0, bmy + step, bw, bh );
h->mc[MC_LUMA]( m->p_fref, m->i_stride, pix[2], 16, bmx - step, bmy + 0, bw, bh );
h->mc[MC_LUMA]( m->p_fref, m->i_stride, pix[3], 16, bmx + step, bmy + 0, bw, bh );
cost[0] = h->pixf.satd[m->i_pixel]( m->p_fenc, m->i_stride, pix[0], 16 ) +
m->lm * ( bs_size_se( bmx + 0 - m->mvp[0] ) + bs_size_se( bmy - step - m->mvp[1] ) );
cost[1] = h->pixf.satd[m->i_pixel]( m->p_fenc, m->i_stride, pix[1], 16 ) +
m->lm * ( bs_size_se( bmx + 0 - m->mvp[0] ) + bs_size_se( bmy + step - m->mvp[1] ) );
cost[2] = h->pixf.satd[m->i_pixel]( m->p_fenc, m->i_stride, pix[2], 16 ) +
m->lm * ( bs_size_se( bmx - step - m->mvp[0] ) + bs_size_se( bmy + 0 - m->mvp[1] ) );
cost[3] = h->pixf.satd[m->i_pixel]( m->p_fenc, m->i_stride, pix[3], 16 ) +
m->lm * ( bs_size_se( bmx + step - m->mvp[0] ) + bs_size_se( bmy + 0 - m->mvp[1] ) );
best = 0;
if( cost[1] < cost[0] ) best = 1;
if( cost[2] < cost[best] ) best = 2;
if( cost[3] < cost[best] ) best = 3;
if( cost[best] < m->cost )
{
m->cost = cost[best];
if( best == 0 ) bmy -= step;
else if( best == 1 ) bmy += step;
else if( best == 2 ) bmx -= step;
else if( best == 3 ) bmx += step;
}
else break;
}
}
m->mv[0] = bmx;
m->mv[1] = bmy;
}
......@@ -132,6 +132,7 @@ static void Help( void )
" - i4x4\n"
" - psub16x16,psub8x8\n"
" - none, all\n"
" --subme <integer> Subpixel motion estimation quality\n"
"\n"
" -s, --sar width:height Specify Sample Aspect Ratio\n"
" -o, --output Specify output file\n"
......@@ -176,6 +177,7 @@ static int Parse( int argc, char **argv,
#define OPT_QCOMP 266
#define OPT_NOPSNR 267
#define OPT_QUIET 268
#define OPT_SUBME 269
static struct option long_options[] =
{
......@@ -196,6 +198,7 @@ static int Parse( int argc, char **argv,
{ "sar", required_argument, NULL, 's' },
{ "output", required_argument, NULL, 'o' },
{ "analyse", required_argument, NULL, 'A' },
{ "subme", required_argument, NULL, OPT_SUBME },
{ "rcsens", required_argument, NULL, OPT_RCSENS },
{ "rcbuf", required_argument, NULL, OPT_RCBUF },
{ "rcinitbuf",required_argument, NULL, OPT_RCIBUF },
......@@ -304,6 +307,9 @@ static int Parse( int argc, char **argv,
if( strstr( optarg, "psub16x16" ) ) param->analyse.inter |= X264_ANALYSE_PSUB16x16;
if( strstr( optarg, "psub8x8" ) ) param->analyse.inter |= X264_ANALYSE_PSUB8x8;
break;
case OPT_SUBME:
param->analyse.i_subpel_refine = atoi(optarg);
break;
case OPT_RCBUF:
param->rc.i_rc_buffer_size = atoi(optarg);
break;
......
......@@ -124,6 +124,8 @@ typedef struct
unsigned int intra; /* intra flags */
unsigned int inter; /* inter flags */
int i_subpel_refine; /* subpixel motion estimation quality */
int b_psnr; /* Do we compute PSNR stats (save a few % of cpu) */
} analyse;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment