me.c 42.8 KB
Newer Older
Laurent Aimar's avatar
Laurent Aimar committed
1 2 3
/*****************************************************************************
 * me.c: h264 encoder library (Motion Estimation)
 *****************************************************************************
4
 * Copyright (C) 2003-2008 x264 project
Laurent Aimar's avatar
Laurent Aimar committed
5
 *
6 7 8
 * Authors: Loren Merritt <lorenm@u.washington.edu>
 *          Laurent Aimar <fenrir@via.ecp.fr>
 *          Fiona Glaser <fiona@x264.com>
Laurent Aimar's avatar
Laurent Aimar committed
9 10 11 12 13 14 15 16 17 18 19 20 21
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
22
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
Laurent Aimar's avatar
Laurent Aimar committed
23 24
 *****************************************************************************/

25
#include "common/common.h"
Fiona Glaser's avatar
Fiona Glaser committed
26
#include "macroblock.h"
Laurent Aimar's avatar
Laurent Aimar committed
27 28
#include "me.h"

29 30 31
/* presets selected from good points on the speed-vs-quality curve of several test videos
 * subpel_iters[i_subpel_refine] = { refine_hpel, refine_qpel, me_hpel, me_qpel }
 * where me_* are the number of EPZS iterations run on all candidate block types,
32
 * and refine_* are run only on the winner.
33 34
 * the subme=8,9 values are much higher because any amount of satd search makes
 * up its time by reducing the number of qpel-rd iterations. */
Loren Merritt's avatar
Loren Merritt committed
35
static const int subpel_iterations[][4] =
36
   {{0,0,0,0},
37
    {1,1,0,0},
38
    {0,1,1,0},
39 40
    {0,2,1,0},
    {0,2,1,1},
41
    {0,2,1,2},
42
    {0,0,2,2},
43 44
    {0,0,2,2},
    {0,0,4,10},
Fiona Glaser's avatar
Fiona Glaser committed
45
    {0,0,4,10},
46 47 48 49 50 51
    {0,0,4,10}};

/* (x-1)%6 */
static const int mod6m1[8] = {5,0,1,2,3,4,5,0};
/* radius 2 hexagon. repeated entries are to avoid having to compute mod6 every time. */
static const int hex2[8][2] = {{-1,-2}, {-2,0}, {-1,2}, {1,2}, {2,0}, {1,-2}, {-1,-2}, {-2,0}};
52
static const int square1[9][2] = {{0,0}, {0,-1}, {0,1}, {-1,0}, {1,0}, {-1,-1}, {-1,1}, {1,-1}, {1,1}};
53

54
static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_iters, int *p_halfpel_thresh, int b_refine_qpel );
55

Loren Merritt's avatar
Loren Merritt committed
56 57 58 59 60
#define BITS_MVD( mx, my )\
    (p_cost_mvx[(mx)<<2] + p_cost_mvy[(my)<<2])

#define COST_MV( mx, my )\
{\
61 62
    int cost = h->pixf.fpelcmp[i_pixel]( p_fenc, FENC_STRIDE,\
                   &p_fref[(my)*stride+(mx)], stride )\
Loren Merritt's avatar
Loren Merritt committed
63 64 65 66
             + BITS_MVD(mx,my);\
    COPY3_IF_LT( bcost, cost, bmx, mx, bmy, my );\
}

Loren Merritt's avatar
Loren Merritt committed
67
#define COST_MV_HPEL( mx, my ) \
68
{ \
69 70 71
    int stride2 = 16; \
    uint8_t *src = h->mc.get_ref( pix, &stride2, m->p_fref, stride, mx, my, bw, bh ); \
    int cost = h->pixf.fpelcmp[i_pixel]( p_fenc, FENC_STRIDE, src, stride2 ) \
72 73 74 75
             + p_cost_mvx[ mx ] + p_cost_mvy[ my ]; \
    COPY3_IF_LT( bpred_cost, cost, bpred_mx, mx, bpred_my, my ); \
}

76 77
#define COST_MV_X3_DIR( m0x, m0y, m1x, m1y, m2x, m2y, costs )\
{\
78 79 80 81 82 83
    uint8_t *pix_base = p_fref + bmx + bmy*stride;\
    h->pixf.fpelcmp_x3[i_pixel]( p_fenc,\
        pix_base + (m0x) + (m0y)*stride,\
        pix_base + (m1x) + (m1y)*stride,\
        pix_base + (m2x) + (m2y)*stride,\
        stride, costs );\
84 85 86 87 88
    (costs)[0] += BITS_MVD( bmx+(m0x), bmy+(m0y) );\
    (costs)[1] += BITS_MVD( bmx+(m1x), bmy+(m1y) );\
    (costs)[2] += BITS_MVD( bmx+(m2x), bmy+(m2y) );\
}

89 90 91 92 93 94 95 96 97 98 99 100 101 102 103
#define COST_MV_X4_DIR( m0x, m0y, m1x, m1y, m2x, m2y, m3x, m3y, costs )\
{\
    uint8_t *pix_base = p_fref + bmx + bmy*stride;\
    h->pixf.fpelcmp_x4[i_pixel]( p_fenc,\
        pix_base + (m0x) + (m0y)*stride,\
        pix_base + (m1x) + (m1y)*stride,\
        pix_base + (m2x) + (m2y)*stride,\
        pix_base + (m3x) + (m3y)*stride,\
        stride, costs );\
    (costs)[0] += BITS_MVD( bmx+(m0x), bmy+(m0y) );\
    (costs)[1] += BITS_MVD( bmx+(m1x), bmy+(m1y) );\
    (costs)[2] += BITS_MVD( bmx+(m2x), bmy+(m2y) );\
    (costs)[3] += BITS_MVD( bmx+(m3x), bmy+(m3y) );\
}

104 105
#define COST_MV_X4( m0x, m0y, m1x, m1y, m2x, m2y, m3x, m3y )\
{\
106 107 108 109 110 111 112
    uint8_t *pix_base = p_fref + omx + omy*stride;\
    h->pixf.fpelcmp_x4[i_pixel]( p_fenc,\
        pix_base + (m0x) + (m0y)*stride,\
        pix_base + (m1x) + (m1y)*stride,\
        pix_base + (m2x) + (m2y)*stride,\
        pix_base + (m3x) + (m3y)*stride,\
        stride, costs );\
113 114 115 116 117 118 119 120 121 122
    costs[0] += BITS_MVD( omx+(m0x), omy+(m0y) );\
    costs[1] += BITS_MVD( omx+(m1x), omy+(m1y) );\
    costs[2] += BITS_MVD( omx+(m2x), omy+(m2y) );\
    costs[3] += BITS_MVD( omx+(m3x), omy+(m3y) );\
    COPY3_IF_LT( bcost, costs[0], bmx, omx+(m0x), bmy, omy+(m0y) );\
    COPY3_IF_LT( bcost, costs[1], bmx, omx+(m1x), bmy, omy+(m1y) );\
    COPY3_IF_LT( bcost, costs[2], bmx, omx+(m2x), bmy, omy+(m2y) );\
    COPY3_IF_LT( bcost, costs[3], bmx, omx+(m3x), bmy, omy+(m3y) );\
}

123
#define COST_MV_X3_ABS( m0x, m0y, m1x, m1y, m2x, m2y )\
Loren Merritt's avatar
Loren Merritt committed
124
{\
125 126 127 128 129
    h->pixf.fpelcmp_x3[i_pixel]( p_fenc,\
        p_fref + (m0x) + (m0y)*stride,\
        p_fref + (m1x) + (m1y)*stride,\
        p_fref + (m2x) + (m2y)*stride,\
        stride, costs );\
130 131 132
    costs[0] += p_cost_mvx[(m0x)<<2]; /* no cost_mvy */\
    costs[1] += p_cost_mvx[(m1x)<<2];\
    costs[2] += p_cost_mvx[(m2x)<<2];\
Loren Merritt's avatar
Loren Merritt committed
133 134 135 136 137
    COPY3_IF_LT( bcost, costs[0], bmx, m0x, bmy, m0y );\
    COPY3_IF_LT( bcost, costs[1], bmx, m1x, bmy, m1y );\
    COPY3_IF_LT( bcost, costs[2], bmx, m2x, bmy, m2y );\
}

138 139 140 141 142 143 144 145
/*  1  */
/* 101 */
/*  1  */
#define DIA1_ITER( mx, my )\
{\
    omx = mx; omy = my;\
    COST_MV_X4( 0,-1, 0,1, -1,0, 1,0 );\
}
146

Loren Merritt's avatar
Loren Merritt committed
147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171
#define CROSS( start, x_max, y_max )\
{\
    i = start;\
    if( x_max <= X264_MIN(mv_x_max-omx, omx-mv_x_min) )\
        for( ; i < x_max-2; i+=4 )\
            COST_MV_X4( i,0, -i,0, i+2,0, -i-2,0 );\
    for( ; i < x_max; i+=2 )\
    {\
        if( omx+i <= mv_x_max )\
            COST_MV( omx+i, omy );\
        if( omx-i >= mv_x_min )\
            COST_MV( omx-i, omy );\
    }\
    i = start;\
    if( y_max <= X264_MIN(mv_y_max-omy, omy-mv_y_min) )\
        for( ; i < y_max-2; i+=4 )\
            COST_MV_X4( 0,i, 0,-i, 0,i+2, 0,-i-2 );\
    for( ; i < y_max; i+=2 )\
    {\
        if( omy+i <= mv_y_max )\
            COST_MV( omx, omy+i );\
        if( omy-i >= mv_y_min )\
            COST_MV( omx, omy-i );\
    }\
}
172

173
void x264_me_search_ref( x264_t *h, x264_me_t *m, int16_t (*mvc)[2], int i_mvc, int *p_halfpel_thresh )
Laurent Aimar's avatar
Laurent Aimar committed
174
{
175 176
    const int bw = x264_pixel_size[m->i_pixel].w;
    const int bh = x264_pixel_size[m->i_pixel].h;
Laurent Aimar's avatar
Laurent Aimar committed
177
    const int i_pixel = m->i_pixel;
178
    const int stride = m->i_stride[0];
179
    int i_me_range = h->param.analyse.i_me_range;
180
    int bmx, bmy, bcost;
181
    int bpred_mx = 0, bpred_my = 0, bpred_cost = COST_MAX;
182
    int omx, omy, pmx, pmy;
183
    uint8_t *p_fenc = m->p_fenc[0];
184
    uint8_t *p_fref = m->p_fref[0];
185
    ALIGNED_ARRAY_16( uint8_t, pix,[16*16] );
Loren Merritt's avatar
Loren Merritt committed
186

187
    int i, j;
188
    int dir;
189
    int costs[16];
Laurent Aimar's avatar
Laurent Aimar committed
190

191 192 193 194
    int mv_x_min = h->mb.mv_min_fpel[0];
    int mv_y_min = h->mb.mv_min_fpel[1];
    int mv_x_max = h->mb.mv_max_fpel[0];
    int mv_y_max = h->mb.mv_max_fpel[1];
Laurent Aimar's avatar
Laurent Aimar committed
195

Loren Merritt's avatar
Loren Merritt committed
196 197
#define CHECK_MVRANGE(mx,my) ( mx >= mv_x_min && mx <= mv_x_max && my >= mv_y_min && my <= mv_y_max )

198 199
    const uint16_t *p_cost_mvx = m->p_cost_mv - m->mvp[0];
    const uint16_t *p_cost_mvy = m->p_cost_mv - m->mvp[1];
200

201 202 203 204
    bmx = x264_clip3( m->mvp[0], mv_x_min*4, mv_x_max*4 );
    bmy = x264_clip3( m->mvp[1], mv_y_min*4, mv_y_max*4 );
    pmx = ( bmx + 2 ) >> 2;
    pmy = ( bmy + 2 ) >> 2;
205
    bcost = COST_MAX;
Laurent Aimar's avatar
Laurent Aimar committed
206

207
    /* try extra predictors if provided */
208
    if( h->mb.i_subpel_refine >= 3 )
Laurent Aimar's avatar
Laurent Aimar committed
209
    {
Fiona Glaser's avatar
Fiona Glaser committed
210
        uint32_t bmv = pack16to32_mask(bmx,bmy);
Fiona Glaser's avatar
Fiona Glaser committed
211
        COST_MV_HPEL( bmx, bmy );
212
        for( i = 0; i < i_mvc; i++ )
213
        {
Fiona Glaser's avatar
Fiona Glaser committed
214
            if( *(uint32_t*)mvc[i] && (bmv - *(uint32_t*)mvc[i]) )
215
            {
Fiona Glaser's avatar
Fiona Glaser committed
216 217
                int mx = x264_clip3( mvc[i][0], mv_x_min*4, mv_x_max*4 );
                int my = x264_clip3( mvc[i][1], mv_y_min*4, mv_y_max*4 );
218 219
                COST_MV_HPEL( mx, my );
            }
220
        }
221 222 223
        bmx = ( bpred_mx + 2 ) >> 2;
        bmy = ( bpred_my + 2 ) >> 2;
        COST_MV( bmx, bmy );
224
    }
225 226 227 228
    else
    {
        /* check the MVP */
        COST_MV( pmx, pmy );
Fiona Glaser's avatar
Fiona Glaser committed
229 230 231 232 233 234 235
        /* Because we are rounding the predicted motion vector to fullpel, there will be
         * an extra MV cost in 15 out of 16 cases.  However, when the predicted MV is
         * chosen as the best predictor, it is often the case that the subpel search will
         * result in a vector at or next to the predicted motion vector.  Therefore, it is
         * sensible to remove the cost of the MV from the rounded MVP to avoid unfairly
         * biasing against use of the predicted motion vector. */
        bcost -= BITS_MVD( pmx, pmy );
236
        for( i = 0; i < i_mvc; i++ )
237
        {
238 239 240 241 242 243 244 245
            int mx = (mvc[i][0] + 2) >> 2;
            int my = (mvc[i][1] + 2) >> 2;
            if( (mx | my) && ((mx-bmx) | (my-bmy)) )
            {
                mx = x264_clip3( mx, mv_x_min, mv_x_max );
                my = x264_clip3( my, mv_y_min, mv_y_max );
                COST_MV( mx, my );
            }
246
        }
247
    }
248 249
    COST_MV( 0, 0 );

Loren Merritt's avatar
Loren Merritt committed
250 251
    switch( h->mb.i_me_method )
    {
252
    case X264_ME_DIA:
253
        /* diamond search, radius 1 */
254
        i = 0;
255
        bcost <<= 4;
256
        do
257
        {
258 259 260 261 262 263
            COST_MV_X4_DIR( 0,-1, 0,1, -1,0, 1,0, costs );
            COPY1_IF_LT( bcost, (costs[0]<<4)+1 );
            COPY1_IF_LT( bcost, (costs[1]<<4)+3 );
            COPY1_IF_LT( bcost, (costs[2]<<4)+4 );
            COPY1_IF_LT( bcost, (costs[3]<<4)+12 );
            if( !(bcost&15) )
264
                break;
265 266 267
            bmx -= (bcost<<28)>>30;
            bmy -= (bcost<<30)>>30;
            bcost &= ~15;
Loren Merritt's avatar
Loren Merritt committed
268 269
            if( !CHECK_MVRANGE(bmx, bmy) )
                break;
270
        } while( ++i < i_me_range );
271
        bcost >>= 4;
272
        break;
273

274
    case X264_ME_HEX:
275
me_hex2:
276
        /* hexagon search, radius 2 */
277
#if 0
278 279
        for( i = 0; i < i_me_range/2; i++ )
        {
280 281 282 283 284 285 286
            omx = bmx; omy = bmy;
            COST_MV( omx-2, omy   );
            COST_MV( omx-1, omy+2 );
            COST_MV( omx+1, omy+2 );
            COST_MV( omx+2, omy   );
            COST_MV( omx+1, omy-2 );
            COST_MV( omx-1, omy-2 );
287 288
            if( bmx == omx && bmy == omy )
                break;
Loren Merritt's avatar
Loren Merritt committed
289 290
            if( !CHECK_MVRANGE(bmx, bmy) )
                break;
Laurent Aimar's avatar
Laurent Aimar committed
291
        }
292 293
#else
        /* equivalent to the above, but eliminates duplicate candidates */
294 295 296 297

        /* hexagon */
        COST_MV_X3_DIR( -2,0, -1, 2,  1, 2, costs   );
        COST_MV_X3_DIR(  2,0,  1,-2, -1,-2, costs+3 );
298 299 300 301 302 303 304 305 306
        bcost <<= 3;
        COPY1_IF_LT( bcost, (costs[0]<<3)+2 );
        COPY1_IF_LT( bcost, (costs[1]<<3)+3 );
        COPY1_IF_LT( bcost, (costs[2]<<3)+4 );
        COPY1_IF_LT( bcost, (costs[3]<<3)+5 );
        COPY1_IF_LT( bcost, (costs[4]<<3)+6 );
        COPY1_IF_LT( bcost, (costs[5]<<3)+7 );

        if( bcost&7 )
307
        {
308
            dir = (bcost&7)-2;
309 310 311
            bmx += hex2[dir+1][0];
            bmy += hex2[dir+1][1];
            /* half hexagon, not overlapping the previous iteration */
Loren Merritt's avatar
Loren Merritt committed
312
            for( i = 1; i < i_me_range/2 && CHECK_MVRANGE(bmx, bmy); i++ )
313
            {
314 315 316
                COST_MV_X3_DIR( hex2[dir+0][0], hex2[dir+0][1],
                                hex2[dir+1][0], hex2[dir+1][1],
                                hex2[dir+2][0], hex2[dir+2][1],
317
                                costs );
318 319 320 321 322
                bcost &= ~7;
                COPY1_IF_LT( bcost, (costs[0]<<3)+1 );
                COPY1_IF_LT( bcost, (costs[1]<<3)+2 );
                COPY1_IF_LT( bcost, (costs[2]<<3)+3 );
                if( !(bcost&7) )
323
                    break;
324 325
                dir += (bcost&7)-2;
                dir = mod6m1[dir+1];
326 327
                bmx += hex2[dir+1][0];
                bmy += hex2[dir+1][1];
328 329
            }
        }
330
        bcost >>= 3;
331
#endif
332
        /* square refine */
333 334 335 336 337 338 339 340 341 342 343 344 345
        dir = 0;
        COST_MV_X4_DIR(  0,-1,  0,1, -1,0, 1,0, costs );
        COPY2_IF_LT( bcost, costs[0], dir, 1 );
        COPY2_IF_LT( bcost, costs[1], dir, 2 );
        COPY2_IF_LT( bcost, costs[2], dir, 3 );
        COPY2_IF_LT( bcost, costs[3], dir, 4 );
        COST_MV_X4_DIR( -1,-1, -1,1, 1,-1, 1,1, costs );
        COPY2_IF_LT( bcost, costs[0], dir, 5 );
        COPY2_IF_LT( bcost, costs[1], dir, 6 );
        COPY2_IF_LT( bcost, costs[2], dir, 7 );
        COPY2_IF_LT( bcost, costs[3], dir, 8 );
        bmx += square1[dir][0];
        bmy += square1[dir][1];
346
        break;
347 348

    case X264_ME_UMH:
349 350
        {
            /* Uneven-cross Multi-Hexagon-grid Search
351
             * as in JM, except with different early termination */
352

353 354 355 356
            static const int x264_pixel_size_shift[7] = { 0, 1, 1, 2, 3, 3, 4 };

            int ucost1, ucost2;
            int cross_start = 1;
357

358
            /* refine predictors */
359
            ucost1 = bcost;
360
            DIA1_ITER( pmx, pmy );
361
            if( pmx | pmy )
362
                DIA1_ITER( 0, 0 );
363

364 365
            if(i_pixel == PIXEL_4x4)
                goto me_hex2;
366

367
            ucost2 = bcost;
368
            if( (bmx | bmy) && ((bmx-pmx) | (bmy-pmy)) )
369
                DIA1_ITER( bmx, bmy );
370 371
            if( bcost == ucost2 )
                cross_start = 3;
372
            omx = bmx; omy = bmy;
373 374 375 376

            /* early termination */
#define SAD_THRESH(v) ( bcost < ( v >> x264_pixel_size_shift[i_pixel] ) )
            if( bcost == ucost2 && SAD_THRESH(2000) )
377
            {
378 379
                COST_MV_X4( 0,-2, -1,-1, 1,-1, -2,0 );
                COST_MV_X4( 2, 0, -1, 1, 1, 1,  0,2 );
380 381 382 383 384 385
                if( bcost == ucost1 && SAD_THRESH(500) )
                    break;
                if( bcost == ucost2 )
                {
                    int range = (i_me_range>>1) | 1;
                    CROSS( 3, range, range );
386 387
                    COST_MV_X4( -1,-2, 1,-2, -2,-1, 2,-1 );
                    COST_MV_X4( -2, 1, 2, 1, -1, 2, 1, 2 );
388 389 390 391
                    if( bcost == ucost2 )
                        break;
                    cross_start = range + 2;
                }
392
            }
393 394

            /* adaptive search range */
395
            if( i_mvc )
396
            {
397 398 399 400 401 402 403 404 405 406 407 408
                /* range multipliers based on casual inspection of some statistics of
                 * average distance between current predictor and final mv found by ESA.
                 * these have not been tuned much by actual encoding. */
                static const int range_mul[4][4] =
                {
                    { 3, 3, 4, 4 },
                    { 3, 4, 4, 4 },
                    { 4, 4, 4, 5 },
                    { 4, 4, 5, 6 },
                };
                int mvd;
                int sad_ctx, mvd_ctx;
Loren Merritt's avatar
Loren Merritt committed
409
                int denom = 1;
410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425

                if( i_mvc == 1 )
                {
                    if( i_pixel == PIXEL_16x16 )
                        /* mvc is probably the same as mvp, so the difference isn't meaningful.
                         * but prediction usually isn't too bad, so just use medium range */
                        mvd = 25;
                    else
                        mvd = abs( m->mvp[0] - mvc[0][0] )
                            + abs( m->mvp[1] - mvc[0][1] );
                }
                else
                {
                    /* calculate the degree of agreement between predictors. */
                    /* in 16x16, mvc includes all the neighbors used to make mvp,
                     * so don't count mvp separately. */
Loren Merritt's avatar
Loren Merritt committed
426
                    denom = i_mvc - 1;
427 428 429 430 431
                    mvd = 0;
                    if( i_pixel != PIXEL_16x16 )
                    {
                        mvd = abs( m->mvp[0] - mvc[0][0] )
                            + abs( m->mvp[1] - mvc[0][1] );
Loren Merritt's avatar
Loren Merritt committed
432
                        denom++;
433
                    }
Fiona Glaser's avatar
Fiona Glaser committed
434
                    mvd += x264_predictor_difference( mvc, i_mvc );
435 436 437 438 439
                }

                sad_ctx = SAD_THRESH(1000) ? 0
                        : SAD_THRESH(2000) ? 1
                        : SAD_THRESH(4000) ? 2 : 3;
Loren Merritt's avatar
Loren Merritt committed
440 441 442
                mvd_ctx = mvd < 10*denom ? 0
                        : mvd < 20*denom ? 1
                        : mvd < 40*denom ? 2 : 3;
443 444

                i_me_range = i_me_range * range_mul[mvd_ctx][sad_ctx] / 4;
445 446
            }

447 448 449 450
            /* FIXME if the above DIA2/OCT2/CROSS found a new mv, it has not updated omx/omy.
             * we are still centered on the same place as the DIA2. is this desirable? */
            CROSS( cross_start, i_me_range, i_me_range/2 );

451
            COST_MV_X4( -2,-2, -2,2, 2,-2, 2,2 );
452

453 454
            /* hexagon grid */
            omx = bmx; omy = bmy;
455 456
            const uint16_t *p_cost_omvx = p_cost_mvx + omx*4;
            const uint16_t *p_cost_omvy = p_cost_mvy + omy*4;
457 458
            i = 1;
            do
459
            {
460
                static const int hex4[16][2] = {
461 462 463 464
                    { 0,-4}, { 0, 4}, {-2,-3}, { 2,-3},
                    {-4,-2}, { 4,-2}, {-4,-1}, { 4,-1},
                    {-4, 0}, { 4, 0}, {-4, 1}, { 4, 1},
                    {-4, 2}, { 4, 2}, {-2, 3}, { 2, 3},
465 466
                };

467 468
                if( 4*i > X264_MIN4( mv_x_max-omx, omx-mv_x_min,
                                     mv_y_max-omy, omy-mv_y_min ) )
469
                {
470 471 472 473
                    for( j = 0; j < 16; j++ )
                    {
                        int mx = omx + hex4[j][0]*i;
                        int my = omy + hex4[j][1]*i;
Loren Merritt's avatar
Loren Merritt committed
474
                        if( CHECK_MVRANGE(mx, my) )
475
                            COST_MV( mx, my );
476 477 478 479
                    }
                }
                else
                {
480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536
                    int dir = 0;
                    uint8_t *pix_base = p_fref + omx + (omy-4*i)*stride;
                    int dy = i*stride;
#define SADS(k,x0,y0,x1,y1,x2,y2,x3,y3)\
                    h->pixf.fpelcmp_x4[i_pixel]( p_fenc,\
                            pix_base x0*i+(y0-2*k+4)*dy,\
                            pix_base x1*i+(y1-2*k+4)*dy,\
                            pix_base x2*i+(y2-2*k+4)*dy,\
                            pix_base x3*i+(y3-2*k+4)*dy,\
                            stride, costs+4*k );\
                    pix_base += 2*dy;
#define ADD_MVCOST(k,x,y) costs[k] += p_cost_omvx[x*4*i] + p_cost_omvy[y*4*i]
#define MIN_MV(k,x,y)     COPY2_IF_LT( bcost, costs[k], dir, x*16+(y&15) )
                    SADS( 0, +0,-4, +0,+4, -2,-3, +2,-3 );
                    SADS( 1, -4,-2, +4,-2, -4,-1, +4,-1 );
                    SADS( 2, -4,+0, +4,+0, -4,+1, +4,+1 );
                    SADS( 3, -4,+2, +4,+2, -2,+3, +2,+3 );
                    ADD_MVCOST(  0, 0,-4 );
                    ADD_MVCOST(  1, 0, 4 );
                    ADD_MVCOST(  2,-2,-3 );
                    ADD_MVCOST(  3, 2,-3 );
                    ADD_MVCOST(  4,-4,-2 );
                    ADD_MVCOST(  5, 4,-2 );
                    ADD_MVCOST(  6,-4,-1 );
                    ADD_MVCOST(  7, 4,-1 );
                    ADD_MVCOST(  8,-4, 0 );
                    ADD_MVCOST(  9, 4, 0 );
                    ADD_MVCOST( 10,-4, 1 );
                    ADD_MVCOST( 11, 4, 1 );
                    ADD_MVCOST( 12,-4, 2 );
                    ADD_MVCOST( 13, 4, 2 );
                    ADD_MVCOST( 14,-2, 3 );
                    ADD_MVCOST( 15, 2, 3 );
                    MIN_MV(  0, 0,-4 );
                    MIN_MV(  1, 0, 4 );
                    MIN_MV(  2,-2,-3 );
                    MIN_MV(  3, 2,-3 );
                    MIN_MV(  4,-4,-2 );
                    MIN_MV(  5, 4,-2 );
                    MIN_MV(  6,-4,-1 );
                    MIN_MV(  7, 4,-1 );
                    MIN_MV(  8,-4, 0 );
                    MIN_MV(  9, 4, 0 );
                    MIN_MV( 10,-4, 1 );
                    MIN_MV( 11, 4, 1 );
                    MIN_MV( 12,-4, 2 );
                    MIN_MV( 13, 4, 2 );
                    MIN_MV( 14,-2, 3 );
                    MIN_MV( 15, 2, 3 );
#undef SADS
#undef ADD_MVCOST
#undef MIN_MV
                    if(dir)
                    {
                        bmx = omx + i*(dir>>4);
                        bmy = omy + i*((dir<<28)>>28);
                    }
537
                }
538
            } while( ++i <= i_me_range/4 );
539
            if( bmy <= mv_y_max && bmy >= mv_y_min )
540 541
                goto me_hex2;
            break;
542 543
        }

544
    case X264_ME_ESA:
545
    case X264_ME_TESA:
546
        {
547 548
            const int min_x = X264_MAX( bmx - i_me_range, mv_x_min );
            const int min_y = X264_MAX( bmy - i_me_range, mv_y_min );
549
            const int max_x = X264_MIN( bmx + i_me_range, mv_x_max );
550
            const int max_y = X264_MIN( bmy + i_me_range, mv_y_max );
551 552 553
            /* SEA is fastest in multiples of 4 */
            const int width = (max_x - min_x + 3) & ~3;
            int my;
554 555
#if 0
            /* plain old exhaustive search */
556
            int mx;
557 558 559 560 561 562
            for( my = min_y; my <= max_y; my++ )
                for( mx = min_x; mx <= max_x; mx++ )
                    COST_MV( mx, my );
#else
            /* successive elimination by comparing DC before a full SAD,
             * because sum(abs(diff)) >= abs(diff(sum)). */
Loren Merritt's avatar
Loren Merritt committed
563
            uint16_t *sums_base = m->integral;
564
            /* due to a GCC bug on some platforms (win32?), zero[] may not actually be aligned.
Anton Mitrofanov's avatar
Anton Mitrofanov committed
565
             * this is not a problem because it is not used for any SSE instructions. */
566 567
            ALIGNED_16( static uint8_t zero[8*FENC_STRIDE] );
            ALIGNED_ARRAY_16( int, enc_dc,[4] );
Loren Merritt's avatar
Loren Merritt committed
568
            int sad_size = i_pixel <= PIXEL_8x8 ? PIXEL_8x8 : PIXEL_4x4;
Loren Merritt's avatar
Loren Merritt committed
569
            int delta = x264_pixel_size[sad_size].w;
570
            int16_t *xs = h->scratch_buffer;
571
            int xn;
572
            uint16_t *cost_fpel_mvx = h->cost_mv_fpel[x264_lambda_tab[h->mb.i_qp]][-m->mvp[0]&3] + (-m->mvp[0]>>2);
Loren Merritt's avatar
Loren Merritt committed
573

574 575
            h->pixf.sad_x4[sad_size]( zero, p_fenc, p_fenc+delta,
                p_fenc+delta*FENC_STRIDE, p_fenc+delta+delta*FENC_STRIDE,
Loren Merritt's avatar
Loren Merritt committed
576
                FENC_STRIDE, enc_dc );
Loren Merritt's avatar
Loren Merritt committed
577
            if( delta == 4 )
578
                sums_base += stride * (h->fenc->i_lines[0] + PADV*2);
Loren Merritt's avatar
Loren Merritt committed
579 580 581 582
            if( i_pixel == PIXEL_16x16 || i_pixel == PIXEL_8x16 || i_pixel == PIXEL_4x8 )
                delta *= stride;
            if( i_pixel == PIXEL_8x16 || i_pixel == PIXEL_4x8 )
                enc_dc[1] = enc_dc[2];
583

584 585 586
            if( h->mb.i_me_method == X264_ME_TESA )
            {
                // ADS threshold, then SAD threshold, then keep the best few SADs, then SATD
587
                mvsad_t *mvsads = (mvsad_t *)(xs + ((width+15)&~15));
588 589
                int nmvsad = 0, limit;
                int sad_thresh = i_me_range <= 16 ? 10 : i_me_range <= 24 ? 11 : 12;
590
                int bsad = h->pixf.sad[i_pixel]( p_fenc, FENC_STRIDE, p_fref+bmy*stride+bmx, stride )
591 592 593 594
                         + BITS_MVD( bmx, bmy );
                for( my = min_y; my <= max_y; my++ )
                {
                    int ycost = p_cost_mvy[my<<2];
595 596
                    if( bsad <= ycost )
                        continue;
597 598
                    bsad -= ycost;
                    xn = h->pixf.ads[i_pixel]( enc_dc, sums_base + min_x + my * stride, delta,
599
                                               cost_fpel_mvx+min_x, xs, width, bsad*17/16 );
600 601 602 603
                    for( i=0; i<xn-2; i+=3 )
                    {
                        uint8_t *ref = p_fref+min_x+my*stride;
                        int sads[3];
604
                        h->pixf.sad_x3[i_pixel]( p_fenc, ref+xs[i], ref+xs[i+1], ref+xs[i+2], stride, sads );
605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620
                        for( j=0; j<3; j++ )
                        {
                            int sad = sads[j] + cost_fpel_mvx[xs[i+j]];
                            if( sad < bsad*sad_thresh>>3 )
                            {
                                COPY1_IF_LT( bsad, sad );
                                mvsads[nmvsad].sad = sad + ycost;
                                mvsads[nmvsad].mx = min_x+xs[i+j];
                                mvsads[nmvsad].my = my;
                                nmvsad++;
                            }
                        }
                    }
                    for( ; i<xn; i++ )
                    {
                        int mx = min_x+xs[i];
621
                        int sad = h->pixf.sad[i_pixel]( p_fenc, FENC_STRIDE, p_fref+mx+my*stride, stride )
622 623 624 625 626 627 628 629 630 631 632 633 634 635
                                + cost_fpel_mvx[xs[i]];
                        if( sad < bsad*sad_thresh>>3 )
                        {
                            COPY1_IF_LT( bsad, sad );
                            mvsads[nmvsad].sad = sad + ycost;
                            mvsads[nmvsad].mx = mx;
                            mvsads[nmvsad].my = my;
                            nmvsad++;
                        }
                    }
                    bsad += ycost;
                }

                limit = i_me_range / 2;
Loren Merritt's avatar
Loren Merritt committed
636 637
                sad_thresh = bsad*sad_thresh>>3;
                while( nmvsad > limit*2 && sad_thresh > bsad )
638 639
                {
                    // halve the range if the domain is too large... eh, close enough
Loren Merritt's avatar
Loren Merritt committed
640 641
                    sad_thresh = (sad_thresh + bsad) >> 1;
                    for( i=0; i<nmvsad && mvsads[i].sad <= sad_thresh; i++ );
642
                    for( j=i; j<nmvsad; j++ )
Loren Merritt's avatar
Loren Merritt committed
643 644 645 646 647 648 649 650
                    {
                        /* mvsad_t is not guaranteed to be 8 bytes on all archs, so check before using explicit write-combining */
                        if( sizeof( mvsad_t ) == sizeof( uint64_t ) )
                            *(uint64_t*)&mvsads[i] = *(uint64_t*)&mvsads[j];
                        else
                            mvsads[i] = mvsads[j];
                        i += mvsads[j].sad <= sad_thresh;
                    }
651 652
                    nmvsad = i;
                }
Loren Merritt's avatar
Loren Merritt committed
653
                while( nmvsad > limit )
654
                {
Loren Merritt's avatar
Loren Merritt committed
655 656 657 658 659 660 661 662 663 664
                    int bsad = mvsads[0].sad;
                    int bi = 0;
                    for( i=1; i<nmvsad; i++ )
                        COPY2_IF_GT( bsad, mvsads[i].sad, bi, i );
                    nmvsad--;
                    mvsads[bi] = mvsads[nmvsad];
                    if( sizeof( mvsad_t ) == sizeof( uint64_t ) )
                        *(uint64_t*)&mvsads[bi] = *(uint64_t*)&mvsads[nmvsad];
                    else
                        mvsads[bi] = mvsads[nmvsad];
665 666 667 668 669
                }
                for( i=0; i<nmvsad; i++ )
                    COST_MV( mvsads[i].mx, mvsads[i].my );
            }
            else
Loren Merritt's avatar
Loren Merritt committed
670
            {
671 672 673
                // just ADS and SAD
                for( my = min_y; my <= max_y; my++ )
                {
674 675 676 677
                    int ycost = p_cost_mvy[my<<2];
                    if( bcost <= ycost )
                        continue;
                    bcost -= ycost;
678
                    xn = h->pixf.ads[i_pixel]( enc_dc, sums_base + min_x + my * stride, delta,
679
                                               cost_fpel_mvx+min_x, xs, width, bcost );
680 681
                    for( i=0; i<xn-2; i+=3 )
                        COST_MV_X3_ABS( min_x+xs[i],my, min_x+xs[i+1],my, min_x+xs[i+2],my );
682
                    bcost += ycost;
683 684 685
                    for( ; i<xn; i++ )
                        COST_MV( min_x+xs[i], my );
                }
686
            }
687
#endif
Laurent Aimar's avatar
Laurent Aimar committed
688
        }
689
        break;
Laurent Aimar's avatar
Laurent Aimar committed
690 691 692
    }

    /* -> qpel mv */
693 694 695 696 697 698 699 700 701 702 703 704
    if( bpred_cost < bcost )
    {
        m->mv[0] = bpred_mx;
        m->mv[1] = bpred_my;
        m->cost = bpred_cost;
    }
    else
    {
        m->mv[0] = bmx << 2;
        m->mv[1] = bmy << 2;
        m->cost = bcost;
    }
Laurent Aimar's avatar
Laurent Aimar committed
705 706

    /* compute the real cost */
707
    m->cost_mv = p_cost_mvx[ m->mv[0] ] + p_cost_mvy[ m->mv[1] ];
708
    if( bmx == pmx && bmy == pmy && h->mb.i_subpel_refine < 3 )
709
        m->cost += m->cost_mv;
Loren Merritt's avatar
Loren Merritt committed
710

711
    /* subpel refine */
712
    if( h->mb.i_subpel_refine >= 2 )
713
    {
714 715 716
        int hpel = subpel_iterations[h->mb.i_subpel_refine][2];
        int qpel = subpel_iterations[h->mb.i_subpel_refine][3];
        refine_subpel( h, m, hpel, qpel, p_halfpel_thresh, 0 );
717
    }
Laurent Aimar's avatar
Laurent Aimar committed
718
}
719
#undef COST_MV
Laurent Aimar's avatar
Laurent Aimar committed
720 721

void x264_me_refine_qpel( x264_t *h, x264_me_t *m )
722
{
723 724
    int hpel = subpel_iterations[h->mb.i_subpel_refine][0];
    int qpel = subpel_iterations[h->mb.i_subpel_refine][1];
Loren Merritt's avatar
Loren Merritt committed
725 726 727 728

    if( m->i_pixel <= PIXEL_8x8 && h->sh.i_type == SLICE_TYPE_P )
        m->cost -= m->i_ref_cost;
	
729
    refine_subpel( h, m, hpel, qpel, NULL, 1 );
730 731
}

Loren Merritt's avatar
Loren Merritt committed
732
#define COST_MV_SAD( mx, my ) \
733 734
{ \
    int stride = 16; \
735
    uint8_t *src = h->mc.get_ref( pix[0], &stride, m->p_fref, m->i_stride[0], mx, my, bw, bh ); \
736
    int cost = h->pixf.fpelcmp[i_pixel]( m->p_fenc[0], FENC_STRIDE, src, stride ) \
737
             + p_cost_mvx[ mx ] + p_cost_mvy[ my ]; \
Loren Merritt's avatar
Loren Merritt committed
738
    COPY3_IF_LT( bcost, cost, bmx, mx, bmy, my ); \
739 740
}

741 742
#define COST_MV_SATD( mx, my, dir ) \
if( b_refine_qpel || (dir^1) != odir ) \
Loren Merritt's avatar
Loren Merritt committed
743 744
{ \
    int stride = 16; \
745
    uint8_t *src = h->mc.get_ref( pix[0], &stride, m->p_fref, m->i_stride[0], mx, my, bw, bh ); \
746
    int cost = h->pixf.mbcmp_unaligned[i_pixel]( m->p_fenc[0], FENC_STRIDE, src, stride ) \
Loren Merritt's avatar
Loren Merritt committed
747 748 749
             + p_cost_mvx[ mx ] + p_cost_mvy[ my ]; \
    if( b_chroma_me && cost < bcost ) \
    { \
750
        h->mc.mc_chroma( pix[0], 8, m->p_fref[4], m->i_stride[1], mx, my, bw/2, bh/2 ); \
Loren Merritt's avatar
Loren Merritt committed
751
        cost += h->pixf.mbcmp[i_pixel+3]( m->p_fenc[1], FENC_STRIDE, pix[0], 8 ); \
Loren Merritt's avatar
Loren Merritt committed
752 753
        if( cost < bcost ) \
        { \
754
            h->mc.mc_chroma( pix[0], 8, m->p_fref[5], m->i_stride[1], mx, my, bw/2, bh/2 ); \
Loren Merritt's avatar
Loren Merritt committed
755
            cost += h->pixf.mbcmp[i_pixel+3]( m->p_fenc[2], FENC_STRIDE, pix[0], 8 ); \
Loren Merritt's avatar
Loren Merritt committed
756 757 758 759 760
        } \
    } \
    if( cost < bcost ) \
    {                  \
        bcost = cost;  \
761 762
        bmx = mx;      \
        bmy = my;      \
763
        bdir = dir;    \
Loren Merritt's avatar
Loren Merritt committed
764 765 766
    } \
}

767
static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_iters, int *p_halfpel_thresh, int b_refine_qpel )
Laurent Aimar's avatar
Laurent Aimar committed
768 769 770
{
    const int bw = x264_pixel_size[m->i_pixel].w;
    const int bh = x264_pixel_size[m->i_pixel].h;
771 772
    const uint16_t *p_cost_mvx = m->p_cost_mv - m->mvp[0];
    const uint16_t *p_cost_mvy = m->p_cost_mv - m->mvp[1];
Loren Merritt's avatar
Loren Merritt committed
773 774
    const int i_pixel = m->i_pixel;
    const int b_chroma_me = h->mb.b_chroma_me && i_pixel <= PIXEL_8x8;
Laurent Aimar's avatar
Laurent Aimar committed
775

776
    ALIGNED_ARRAY_16( uint8_t, pix,[2],[32*18] );   // really 17x17, but round up for alignment
777 778
    int omx, omy;
    int i;
Laurent Aimar's avatar
Laurent Aimar committed
779 780 781

    int bmx = m->mv[0];
    int bmy = m->mv[1];
782
    int bcost = m->cost;
783
    int odir = -1, bdir;
784

Loren Merritt's avatar
Loren Merritt committed
785
    /* try the subpel component of the predicted mv */
786
    if( hpel_iters && h->mb.i_subpel_refine < 3 )
787
    {
788 789
        int mx = x264_clip3( m->mvp[0], h->mb.mv_min_spel[0]+2, h->mb.mv_max_spel[0]-2 );
        int my = x264_clip3( m->mvp[1], h->mb.mv_min_spel[1]+2, h->mb.mv_max_spel[1]-2 );
Fiona Glaser's avatar
Fiona Glaser committed
790
        if( (mx-bmx)|(my-bmy) )
Loren Merritt's avatar
Loren Merritt committed
791
            COST_MV_SAD( mx, my );
792
    }
Loren Merritt's avatar
Loren Merritt committed
793 794

    /* halfpel diamond search */
795 796
    for( i = hpel_iters; i > 0; i-- )
    {
Loren Merritt's avatar
Loren Merritt committed
797 798
        int omx = bmx, omy = bmy;
        int costs[4];
799
        int stride = 32; // candidates are either all hpel or all qpel, so one stride is enough
Loren Merritt's avatar
Loren Merritt committed
800
        uint8_t *src0, *src1, *src2, *src3;
801 802
        src0 = h->mc.get_ref( pix[0], &stride, m->p_fref, m->i_stride[0], omx, omy-2, bw, bh+1 );
        src2 = h->mc.get_ref( pix[1], &stride, m->p_fref, m->i_stride[0], omx-2, omy, bw+4, bh );
803 804
        src1 = src0 + stride;
        src3 = src2 + 1;
805
        h->pixf.fpelcmp_x4[i_pixel]( m->p_fenc[0], src0, src1, src2, src3, stride, costs );
Loren Merritt's avatar
Loren Merritt committed
806 807 808 809
        COPY2_IF_LT( bcost, costs[0] + p_cost_mvx[omx  ] + p_cost_mvy[omy-2], bmy, omy-2 );
        COPY2_IF_LT( bcost, costs[1] + p_cost_mvx[omx  ] + p_cost_mvy[omy+2], bmy, omy+2 );
        COPY3_IF_LT( bcost, costs[2] + p_cost_mvx[omx-2] + p_cost_mvy[omy  ], bmx, omx-2, bmy, omy );
        COPY3_IF_LT( bcost, costs[3] + p_cost_mvx[omx+2] + p_cost_mvy[omy  ], bmx, omx+2, bmy, omy );
Fiona Glaser's avatar
Fiona Glaser committed
810
        if( (bmx == omx) & (bmy == omy) )
811 812
            break;
    }
Loren Merritt's avatar
Loren Merritt committed
813

814
    if( !b_refine_qpel )
Laurent Aimar's avatar
Laurent Aimar committed
815
    {
816
        bcost = COST_MAX;
817
        COST_MV_SATD( bmx, bmy, -1 );
818
    }
Loren Merritt's avatar
Loren Merritt committed
819

820 821 822 823
    /* early termination when examining multiple reference frames */
    if( p_halfpel_thresh )
    {
        if( (bcost*7)>>3 > *p_halfpel_thresh )
824
        {
825 826 827 828 829 830 831 832 833 834
            m->cost = bcost;
            m->mv[0] = bmx;
            m->mv[1] = bmy;
            // don't need cost_mv
            return;
        }
        else if( bcost < *p_halfpel_thresh )
            *p_halfpel_thresh = bcost;
    }

Loren Merritt's avatar
Loren Merritt committed
835
    /* quarterpel diamond search */
836
    bdir = -1;
837 838
    for( i = qpel_iters; i > 0; i-- )
    {
839 840
        if( bmy <= h->mb.mv_min_spel[1] || bmy >= h->mb.mv_max_spel[1] )
            break;
841
        odir = bdir;
842 843
        omx = bmx;
        omy = bmy;
844 845 846 847
        COST_MV_SATD( omx, omy - 1, 0 );
        COST_MV_SATD( omx, omy + 1, 1 );
        COST_MV_SATD( omx - 1, omy, 2 );
        COST_MV_SATD( omx + 1, omy, 3 );
848 849
        if( bmx == omx && bmy == omy )
            break;
Laurent Aimar's avatar
Laurent Aimar committed
850 851
    }

852
    m->cost = bcost;
Laurent Aimar's avatar
Laurent Aimar committed
853 854
    m->mv[0] = bmx;
    m->mv[1] = bmy;
855
    m->cost_mv = p_cost_mvx[ bmx ] + p_cost_mvy[ bmy ];
Laurent Aimar's avatar
Laurent Aimar committed
856
}
857

858
#define BIME_CACHE( dx, dy, list ) \
859
{ \
860
    x264_me_t *m = m##list;\
861
    int i = 4 + 3*dx + dy; \
862 863 864 865
    int mvx = om##list##x+dx;\
    int mvy = om##list##y+dy;\
    stride##list[i] = bw;\
    src##list[i] = h->mc.get_ref( pixy_buf[list][i], &stride##list[i], m->p_fref, m->i_stride[0], mvx, mvy, bw, bh ); \
866 867
    if( rd )\
    {\
868 869 870 871
        if( h->mb.b_interlaced & ref##list )\
            mvy += (h->mb.i_mb_y & 1)*4 - 2;\
        h->mc.mc_chroma( pixu_buf[list][i], 8, m->p_fref[4], m->i_stride[1], mvx, mvy, bw>>1, bh>>1 );\
        h->mc.mc_chroma( pixv_buf[list][i], 8, m->p_fref[5], m->i_stride[1], mvx, mvy, bw>>1, bh>>1 );\
872
    }\
873 874
}

875 876 877 878 879 880 881 882 883 884 885
#define BIME_CACHE2(a,b,list) \
    BIME_CACHE(a,b,list) \
    BIME_CACHE(-(a),-(b),list)

#define BIME_CACHE8(list) \
{\
    BIME_CACHE2( 1, 0, list );\
    BIME_CACHE2( 0, 1, list );\