me.c 35.8 KB
Newer Older
Laurent Aimar's avatar
Laurent Aimar committed
1
2
3
4
5
6
7
/*****************************************************************************
 * me.c: h264 encoder library (Motion Estimation)
 *****************************************************************************
 * Copyright (C) 2003 Laurent Aimar
 * $Id: me.c,v 1.1 2004/06/03 19:27:08 fenrir Exp $
 *
 * Authors: Laurent Aimar <fenrir@via.ecp.fr>
 *          Loren Merritt <lorenm@u.washington.edu>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
 *****************************************************************************/

25
#include "common/common.h"
Laurent Aimar's avatar
Laurent Aimar committed
26
27
#include "me.h"

28
29
30
/* Presets selected from good points on the speed-vs-quality curve of several test videos.
 * subpel_iterations[i_subpel_refine] = { refine_hpel, refine_qpel, me_hpel, me_qpel }
 * where me_* are the number of EPZS iterations run on all candidate block types,
 * and refine_* are run only on the winner.
 * The subme=7 values are much higher because any amount of satd search makes
 * up its time by reducing the number of rd iterations. */
static const int subpel_iterations[][4] =
   {{1,0,0,0},
    {1,1,0,0},
    {0,1,1,0},
    {0,2,1,0},
    {0,2,1,1},
    {0,2,1,2},
    {0,0,2,2},
    {0,0,4,10}};

/* mod6m1[x] is (x-1) modulo 6, tabulated for x in 0..7. */
static const int mod6m1[8] = { 5, 0, 1, 2, 3, 4, 5, 0 };

/* {dx,dy} offsets of the six vertices of a radius-2 hexagon.
 * The first two vertices are repeated at the end so that callers can index
 * up to two entries past a vertex without computing a mod 6 every time. */
static const int hex2[8][2] =
{
    { -1, -2 }, { -2,  0 }, { -1,  2 },
    {  1,  2 }, {  2,  0 }, {  1, -2 },
    { -1, -2 }, { -2,  0 },
};

/* {dx,dy} offsets of the 8 neighbours at distance 1:
 * the four edge neighbours first, then the four diagonal ones. */
static const int square1[8][2] =
{
    {  0, -1 }, {  0,  1 }, { -1,  0 }, {  1,  0 },
    { -1, -1 }, {  1,  1 }, { -1,  1 }, {  1, -1 },
};
49

50
/* Forward declaration of the subpel refinement routine defined later in this file. */
static void refine_subpel( x264_t *h, x264_me_t *m,
                           int hpel_iters, int qpel_iters,
                           int *p_halfpel_thresh, int b_refine_qpel );
51

Loren Merritt's avatar
Loren Merritt committed
52
53
54
55
56
/* Cost of coding the full-pel motion vector (mx,my).
 * Expects p_cost_mvx / p_cost_mvy in the enclosing scope; they are indexed in
 * quarter-pel units, hence the factor of 4.  A multiplication is used rather
 * than a left shift because mx/my may be negative, and left-shifting a
 * negative value is undefined behaviour in C. */
#define BITS_MVD( mx, my )\
    (p_cost_mvx[(mx)*4] + p_cost_mvy[(my)*4])

/* Evaluate the full-pel candidate (mx,my): run the full-pel compare function
 * between the source block and the reference at that offset, add the mv cost,
 * and record the candidate in bcost/bmx/bmy if it is cheaper.
 * Relies on h, m, i_pixel, p_fref, p_cost_mvx/p_cost_mvy, bcost, bmx, bmy
 * from the enclosing scope. */
#define COST_MV( mx, my )\
{\
    int cost = h->pixf.fpelcmp[i_pixel]( m->p_fenc[0], FENC_STRIDE,\
                   &p_fref[(my)*m->i_stride[0]+(mx)], m->i_stride[0] )\
             + BITS_MVD(mx,my);\
    COPY3_IF_LT( bcost, cost, bmx, mx, bmy, my );\
}

Loren Merritt's avatar
Loren Merritt committed
63
/* Evaluate the candidate (mx,my) through h->mc.get_ref (which takes the mv and
 * block size and returns the reference block to compare against), add the mv
 * cost looked up directly at mx/my, and record the candidate in
 * bpred_cost/bpred_mx/bpred_my if it is cheaper.
 * Relies on h, m, i_pixel, pix, bw, bh, p_cost_mvx/p_cost_mvy and the bpred_*
 * variables from the enclosing scope. */
#define COST_MV_HPEL( mx, my ) \
{ \
    int stride = 16; \
    uint8_t *src = h->mc.get_ref( pix, &stride, m->p_fref, m->i_stride[0], mx, my, bw, bh ); \
    int cost = h->pixf.fpelcmp[i_pixel]( m->p_fenc[0], FENC_STRIDE, src, stride ) \
             + p_cost_mvx[ mx ] + p_cost_mvy[ my ]; \
    COPY3_IF_LT( bpred_cost, cost, bpred_mx, mx, bpred_my, my ); \
}

72
73
74
/* Evaluate three full-pel candidates given as offsets relative to the current
 * best (bmx,bmy) with a single fpelcmp_x3 call, then add each candidate's mv
 * cost.  Results are written to costs[0..2]; the best mv itself is NOT updated
 * here, the caller compares the costs.
 * Relies on h, m, i_pixel, p_fref, bmx, bmy and the mv cost tables from the
 * enclosing scope. */
#define COST_MV_X3_DIR( m0x, m0y, m1x, m1y, m2x, m2y, costs )\
{\
    uint8_t *pix_base = p_fref + bmx + bmy*m->i_stride[0];\
    h->pixf.fpelcmp_x3[i_pixel]( m->p_fenc[0],\
        pix_base + (m0x) + (m0y)*m->i_stride[0],\
        pix_base + (m1x) + (m1y)*m->i_stride[0],\
        pix_base + (m2x) + (m2y)*m->i_stride[0],\
        m->i_stride[0], costs );\
    (costs)[0] += BITS_MVD( bmx+(m0x), bmy+(m0y) );\
    (costs)[1] += BITS_MVD( bmx+(m1x), bmy+(m1y) );\
    (costs)[2] += BITS_MVD( bmx+(m2x), bmy+(m2y) );\
}

/* Evaluate four full-pel candidates given as offsets relative to (omx,omy)
 * with a single fpelcmp_x4 call, add each candidate's mv cost, and update
 * bcost/bmx/bmy with any candidate that is cheaper (checked in argument order).
 * Uses the `costs` array of the enclosing scope as scratch space, and relies on
 * h, m, i_pixel, p_fref, omx, omy and the mv cost tables from that scope. */
#define COST_MV_X4( m0x, m0y, m1x, m1y, m2x, m2y, m3x, m3y )\
{\
    uint8_t *pix_base = p_fref + omx + omy*m->i_stride[0];\
    h->pixf.fpelcmp_x4[i_pixel]( m->p_fenc[0],\
        pix_base + (m0x) + (m0y)*m->i_stride[0],\
        pix_base + (m1x) + (m1y)*m->i_stride[0],\
        pix_base + (m2x) + (m2y)*m->i_stride[0],\
        pix_base + (m3x) + (m3y)*m->i_stride[0],\
        m->i_stride[0], costs );\
    costs[0] += BITS_MVD( omx+(m0x), omy+(m0y) );\
    costs[1] += BITS_MVD( omx+(m1x), omy+(m1y) );\
    costs[2] += BITS_MVD( omx+(m2x), omy+(m2y) );\
    costs[3] += BITS_MVD( omx+(m3x), omy+(m3y) );\
    COPY3_IF_LT( bcost, costs[0], bmx, omx+(m0x), bmy, omy+(m0y) );\
    COPY3_IF_LT( bcost, costs[1], bmx, omx+(m1x), bmy, omy+(m1y) );\
    COPY3_IF_LT( bcost, costs[2], bmx, omx+(m2x), bmy, omy+(m2y) );\
    COPY3_IF_LT( bcost, costs[3], bmx, omx+(m3x), bmy, omy+(m3y) );\
}

104
/* Evaluate three full-pel candidates given as ABSOLUTE positions with a single
 * fpelcmp_x3 call and update bcost/bmx/bmy with any that is cheaper.
 * Only the horizontal mv cost is added here: the caller is expected to have
 * accounted for the vertical cost already (it subtracts it from bcost first).
 * The x index is scaled by multiplication rather than a left shift because the
 * position may be negative, and left-shifting a negative value is undefined.
 * Uses the `costs` array of the enclosing scope as scratch space. */
#define COST_MV_X3_ABS( m0x, m0y, m1x, m1y, m2x, m2y )\
{\
    h->pixf.fpelcmp_x3[i_pixel]( m->p_fenc[0],\
        p_fref + (m0x) + (m0y)*m->i_stride[0],\
        p_fref + (m1x) + (m1y)*m->i_stride[0],\
        p_fref + (m2x) + (m2y)*m->i_stride[0],\
        m->i_stride[0], costs );\
    costs[0] += p_cost_mvx[(m0x)*4]; /* no cost_mvy */\
    costs[1] += p_cost_mvx[(m1x)*4];\
    costs[2] += p_cost_mvx[(m2x)*4];\
    COPY3_IF_LT( bcost, costs[0], bmx, m0x, bmy, m0y );\
    COPY3_IF_LT( bcost, costs[1], bmx, m1x, bmy, m1y );\
    COPY3_IF_LT( bcost, costs[2], bmx, m2x, bmy, m2y );\
}

119
120
121
122
123
124
125
126
/* One step of a radius-1 diamond search: recentre on (mx,my) and test the
 * four direct neighbours (up, down, left, right).
 *
 *    1
 *   101
 *    1
 */
#define DIA1_ITER( mx, my )\
{\
    omx = (mx);\
    omy = (my);\
    COST_MV_X4( 0,-1, 0,1, -1,0, 1,0 );\
}
127

Loren Merritt's avatar
Loren Merritt committed
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
/* Cross search centred on (omx,omy): probe odd offsets start, start+2, ...
 * below x_max along the horizontal axis, then below y_max along the vertical
 * axis, in both directions.  When the whole arm is known to fit inside the
 * legal mv range the candidates are batched four at a time; any remaining
 * candidates are range-checked and evaluated individually.
 * Uses the loop variable `i` of the enclosing scope. */
#define CROSS( start, x_max, y_max )\
{\
    i = (start);\
    if( (x_max) <= X264_MIN( mv_x_max-omx, omx-mv_x_min ) )\
    {\
        while( i < (x_max)-2 )\
        {\
            COST_MV_X4( i,0, -i,0, i+2,0, -i-2,0 );\
            i += 4;\
        }\
    }\
    while( i < (x_max) )\
    {\
        if( omx+i <= mv_x_max )\
        {\
            COST_MV( omx+i, omy );\
        }\
        if( omx-i >= mv_x_min )\
        {\
            COST_MV( omx-i, omy );\
        }\
        i += 2;\
    }\
    i = (start);\
    if( (y_max) <= X264_MIN( mv_y_max-omy, omy-mv_y_min ) )\
    {\
        while( i < (y_max)-2 )\
        {\
            COST_MV_X4( 0,i, 0,-i, 0,i+2, 0,-i-2 );\
            i += 4;\
        }\
    }\
    while( i < (y_max) )\
    {\
        if( omy+i <= mv_y_max )\
        {\
            COST_MV( omx, omy+i );\
        }\
        if( omy-i >= mv_y_min )\
        {\
            COST_MV( omx, omy-i );\
        }\
        i += 2;\
    }\
}
153

154
/* Full-pel motion search for one block, optionally followed by subpel refinement.
 *
 *  h                 encoder context; supplies the search method (h->mb.i_me_method),
 *                    search range, legal mv range and the compare functions.
 *  m                 in:  block size (i_pixel), source and reference planes, strides,
 *                         predicted mv (mvp) and mv cost table (p_cost_mv);
 *                    out: mv (quarter-pel units), cost and cost_mv.
 *  mvc, i_mvc        i_mvc extra candidate predictors in quarter-pel units.
 *                    i_mvc may be 0, in which case mvc is not read.
 *  p_halfpel_thresh  forwarded unchanged to refine_subpel().
 */
void x264_me_search_ref( x264_t *h, x264_me_t *m, int16_t (*mvc)[2], int i_mvc, int *p_halfpel_thresh )
{
    const int bw = x264_pixel_size[m->i_pixel].w;
    const int bh = x264_pixel_size[m->i_pixel].h;
    const int i_pixel = m->i_pixel;
    int i_me_range = h->param.analyse.i_me_range;
    int bmx, bmy, bcost;
    int bpred_mx = 0, bpred_my = 0, bpred_cost = COST_MAX;
    int omx, omy, pmx, pmy;
    uint8_t *p_fref = m->p_fref[0];
    DECLARE_ALIGNED_16( uint8_t pix[16*16] );

    int i = 0, j;
    int dir;
    int costs[6];

    int mv_x_min = h->mb.mv_min_fpel[0];
    int mv_y_min = h->mb.mv_min_fpel[1];
    int mv_x_max = h->mb.mv_max_fpel[0];
    int mv_y_max = h->mb.mv_max_fpel[1];

#define CHECK_MVRANGE(mx,my) ( mx >= mv_x_min && mx <= mv_x_max && my >= mv_y_min && my <= mv_y_max )

    /* mv cost tables biased so that they can be indexed directly by a quarter-pel mv */
    const int16_t *p_cost_mvx = m->p_cost_mv - m->mvp[0];
    const int16_t *p_cost_mvy = m->p_cost_mv - m->mvp[1];

    /* bmx/bmy: predicted mv clipped to the legal range (quarter-pel);
     * pmx/pmy: the same, rounded to full-pel */
    bmx = x264_clip3( m->mvp[0], mv_x_min*4, mv_x_max*4 );
    bmy = x264_clip3( m->mvp[1], mv_y_min*4, mv_y_max*4 );
    pmx = ( bmx + 2 ) >> 2;
    pmy = ( bmy + 2 ) >> 2;
    bcost = COST_MAX;

    /* try extra predictors if provided */
    if( h->mb.i_subpel_refine >= 3 )
    {
        /* evaluate the predictors at their subpel positions, then start the
         * full-pel search from the rounded position of the best one */
        COST_MV_HPEL( bmx, bmy );
        uint32_t bmv = pack16to32_mask(bmx,bmy);
        /* a plain for loop, so that mvc is never read when i_mvc == 0 */
        for( i = 0; i < i_mvc; i++ )
        {
            /* skip zero candidates (checked below) and duplicates of the mvp */
            if( *(uint32_t*)mvc[i] && (bmv - *(uint32_t*)mvc[i]) )
            {
                int mx = x264_clip3( mvc[i][0], mv_x_min*4, mv_x_max*4 );
                int my = x264_clip3( mvc[i][1], mv_y_min*4, mv_y_max*4 );
                COST_MV_HPEL( mx, my );
            }
        }
        bmx = ( bpred_mx + 2 ) >> 2;
        bmy = ( bpred_my + 2 ) >> 2;
        COST_MV( bmx, bmy );
    }
    else
    {
        /* check the MVP */
        COST_MV( pmx, pmy );
        /* Because we are rounding the predicted motion vector to fullpel, there will be
         * an extra MV cost in 15 out of 16 cases.  However, when the predicted MV is
         * chosen as the best predictor, it is often the case that the subpel search will
         * result in a vector at or next to the predicted motion vector.  Therefore, it is
         * sensible to remove the cost of the MV from the rounded MVP to avoid unfairly
         * biasing against use of the predicted motion vector. */
        bcost -= BITS_MVD( pmx, pmy );
        /* a plain for loop, so that mvc is never read when i_mvc == 0 */
        for( i = 0; i < i_mvc; i++ )
        {
            int mx = (mvc[i][0] + 2) >> 2;
            int my = (mvc[i][1] + 2) >> 2;
            /* skip zero candidates (checked below) and duplicates of the current best */
            if( (mx | my) && ((mx-bmx) | (my-bmy)) )
            {
                mx = x264_clip3( mx, mv_x_min, mv_x_max );
                my = x264_clip3( my, mv_y_min, mv_y_max );
                COST_MV( mx, my );
            }
        }
    }
    COST_MV( 0, 0 );

    switch( h->mb.i_me_method )
    {
    case X264_ME_DIA:
        /* diamond search, radius 1 */
        for( i = 0; i < i_me_range; i++ )
        {
            DIA1_ITER( bmx, bmy );
            if( bmx == omx && bmy == omy )
                break;
            if( !CHECK_MVRANGE(bmx, bmy) )
                break;
        }
        break;

    case X264_ME_HEX:
me_hex2:
        /* hexagon search, radius 2.
         * Equivalent to testing all six hexagon vertices around the current best
         * on every iteration, but eliminates duplicate candidates: after the
         * first full hexagon only the three vertices not covered by the previous
         * iteration are tested. */
        dir = -2;

        /* hexagon */
        COST_MV_X3_DIR( -2,0, -1, 2,  1, 2, costs   );
        COST_MV_X3_DIR(  2,0,  1,-2, -1,-2, costs+3 );
        COPY2_IF_LT( bcost, costs[0], dir, 0 );
        COPY2_IF_LT( bcost, costs[1], dir, 1 );
        COPY2_IF_LT( bcost, costs[2], dir, 2 );
        COPY2_IF_LT( bcost, costs[3], dir, 3 );
        COPY2_IF_LT( bcost, costs[4], dir, 4 );
        COPY2_IF_LT( bcost, costs[5], dir, 5 );

        if( dir != -2 )
        {
            bmx += hex2[dir+1][0];
            bmy += hex2[dir+1][1];
            /* half hexagon, not overlapping the previous iteration */
            for( i = 1; i < i_me_range/2 && CHECK_MVRANGE(bmx, bmy); i++ )
            {
                const int odir = mod6m1[dir+1];
                COST_MV_X3_DIR( hex2[odir+0][0], hex2[odir+0][1],
                                hex2[odir+1][0], hex2[odir+1][1],
                                hex2[odir+2][0], hex2[odir+2][1],
                                costs );
                dir = -2;
                COPY2_IF_LT( bcost, costs[0], dir, odir-1 );
                COPY2_IF_LT( bcost, costs[1], dir, odir   );
                COPY2_IF_LT( bcost, costs[2], dir, odir+1 );
                if( dir == -2 )
                    break;
                bmx += hex2[dir+1][0];
                bmy += hex2[dir+1][1];
            }
        }
        /* square refine */
        omx = bmx; omy = bmy;
        COST_MV_X4(  0,-1,  0,1, -1,0, 1,0 );
        COST_MV_X4( -1,-1, -1,1, 1,-1, 1,1 );
        break;

    case X264_ME_UMH:
        {
            /* Uneven-cross Multi-Hexagon-grid Search
             * as in JM, except with different early termination */

            static const int x264_pixel_size_shift[7] = { 0, 1, 1, 2, 3, 3, 4 };

            int ucost1, ucost2;
            int cross_start = 1;

            /* refine predictors */
            ucost1 = bcost;
            DIA1_ITER( pmx, pmy );
            if( pmx | pmy )
                DIA1_ITER( 0, 0 );

            if(i_pixel == PIXEL_4x4)
                goto me_hex2;

            ucost2 = bcost;
            if( (bmx | bmy) && ((bmx-pmx) | (bmy-pmy)) )
                DIA1_ITER( bmx, bmy );
            if( bcost == ucost2 )
                cross_start = 3;
            omx = bmx; omy = bmy;

            /* early termination */
#define SAD_THRESH(v) ( bcost < ( v >> x264_pixel_size_shift[i_pixel] ) )
            if( bcost == ucost2 && SAD_THRESH(2000) )
            {
                COST_MV_X4( 0,-2, -1,-1, 1,-1, -2,0 );
                COST_MV_X4( 2, 0, -1, 1, 1, 1,  0,2 );
                if( bcost == ucost1 && SAD_THRESH(500) )
                    break;
                if( bcost == ucost2 )
                {
                    int range = (i_me_range>>1) | 1;
                    CROSS( 3, range, range );
                    COST_MV_X4( -1,-2, 1,-2, -2,-1, 2,-1 );
                    COST_MV_X4( -2, 1, 2, 1, -1, 2, 1, 2 );
                    if( bcost == ucost2 )
                        break;
                    cross_start = range + 2;
                }
            }

            /* adaptive search range */
            if( i_mvc )
            {
                /* range multipliers based on casual inspection of some statistics of
                 * average distance between current predictor and final mv found by ESA.
                 * these have not been tuned much by actual encoding. */
                static const int range_mul[4][4] =
                {
                    { 3, 3, 4, 4 },
                    { 3, 4, 4, 4 },
                    { 4, 4, 4, 5 },
                    { 4, 4, 5, 6 },
                };
                int mvd;
                int sad_ctx, mvd_ctx;
                int denom = 1;

                if( i_mvc == 1 )
                {
                    if( i_pixel == PIXEL_16x16 )
                        /* mvc is probably the same as mvp, so the difference isn't meaningful.
                         * but prediction usually isn't too bad, so just use medium range */
                        mvd = 25;
                    else
                        mvd = abs( m->mvp[0] - mvc[0][0] )
                            + abs( m->mvp[1] - mvc[0][1] );
                }
                else
                {
                    /* calculate the degree of agreement between predictors. */
                    /* in 16x16, mvc includes all the neighbors used to make mvp,
                     * so don't count mvp separately. */
                    denom = i_mvc - 1;
                    mvd = 0;
                    if( i_pixel != PIXEL_16x16 )
                    {
                        mvd = abs( m->mvp[0] - mvc[0][0] )
                            + abs( m->mvp[1] - mvc[0][1] );
                        denom++;
                    }
                    for( i = 0; i < i_mvc-1; i++ )
                        mvd += abs( mvc[i][0] - mvc[i+1][0] )
                             + abs( mvc[i][1] - mvc[i+1][1] );
                }

                sad_ctx = SAD_THRESH(1000) ? 0
                        : SAD_THRESH(2000) ? 1
                        : SAD_THRESH(4000) ? 2 : 3;
                mvd_ctx = mvd < 10*denom ? 0
                        : mvd < 20*denom ? 1
                        : mvd < 40*denom ? 2 : 3;

                i_me_range = i_me_range * range_mul[mvd_ctx][sad_ctx] / 4;
            }

            /* FIXME if the above DIA2/OCT2/CROSS found a new mv, it has not updated omx/omy.
             * we are still centered on the same place as the DIA2. is this desirable? */
            CROSS( cross_start, i_me_range, i_me_range/2 );

            COST_MV_X4( -2,-2, -2,2, 2,-2, 2,2 );

            /* hexagon grid */
            omx = bmx; omy = bmy;
            for( i = 1; i <= i_me_range/4; i++ )
            {
                static const int hex4[16][2] = {
                    {-4, 2}, {-4, 1}, {-4, 0}, {-4,-1}, {-4,-2},
                    { 4,-2}, { 4,-1}, { 4, 0}, { 4, 1}, { 4, 2},
                    { 2, 3}, { 0, 4}, {-2, 3},
                    {-2,-3}, { 0,-4}, { 2,-3},
                };

                if( 4*i > X264_MIN4( mv_x_max-omx, omx-mv_x_min,
                                     mv_y_max-omy, omy-mv_y_min ) )
                {
                    /* the scaled hexagon may leave the legal mv range:
                     * range-check every point individually */
                    for( j = 0; j < 16; j++ )
                    {
                        int mx = omx + hex4[j][0]*i;
                        int my = omy + hex4[j][1]*i;
                        if( CHECK_MVRANGE(mx, my) )
                            COST_MV( mx, my );
                    }
                }
                else
                {
                    /* same 16 points as hex4, batched four at a time */
                    COST_MV_X4( -4*i, 2*i, -4*i, 1*i, -4*i, 0*i, -4*i,-1*i );
                    COST_MV_X4( -4*i,-2*i,  4*i,-2*i,  4*i,-1*i,  4*i, 0*i );
                    COST_MV_X4(  4*i, 1*i,  4*i, 2*i,  2*i, 3*i,  0*i, 4*i );
                    COST_MV_X4( -2*i, 3*i, -2*i,-3*i,  0*i,-4*i,  2*i,-3*i );
                }
            }
            if( bmy <= mv_y_max )
                goto me_hex2;
            break;
        }

    case X264_ME_ESA:
    case X264_ME_TESA:
        {
            const int min_x = X264_MAX( bmx - i_me_range, mv_x_min );
            const int min_y = X264_MAX( bmy - i_me_range, mv_y_min );
            const int max_x = X264_MIN( bmx + i_me_range, mv_x_max );
            const int max_y = X264_MIN( bmy + i_me_range, mv_y_max );
            /* SEA is fastest in multiples of 4 */
            const int width = (max_x - min_x + 3) & ~3;
            int my;
            /* Instead of a plain exhaustive search (COST_MV on every position of
             * the window), use successive elimination by comparing DC before a
             * full SAD, because sum(abs(diff)) >= abs(diff(sum)). */
            const int stride = m->i_stride[0];
            uint16_t *sums_base = m->integral;
            DECLARE_ALIGNED_16( static uint8_t zero[16*16] );
            DECLARE_ALIGNED_16( int enc_dc[4] );
            int sad_size = i_pixel <= PIXEL_8x8 ? PIXEL_8x8 : PIXEL_4x4;
            int delta = x264_pixel_size[sad_size].w;
            int16_t xs_buf[64];
            int16_t *xs = width<=64 ? xs_buf : x264_malloc( (width+15)*sizeof(int16_t) );
            int xn;
            uint16_t *cost_fpel_mvx = x264_cost_mv_fpel[h->mb.i_qp][-m->mvp[0]&3] + (-m->mvp[0]>>2);

            /* NOTE(review): assumes x264_malloc reports failure by returning NULL - confirm.
             * If so, keep the best predictor found so far instead of writing through NULL. */
            if( !xs )
                break;

            /* SAD against an all-zero block yields the sum (DC) of each sub-block of the source */
            h->pixf.sad_x4[sad_size]( zero, m->p_fenc[0], m->p_fenc[0]+delta,
                m->p_fenc[0]+delta*FENC_STRIDE, m->p_fenc[0]+delta+delta*FENC_STRIDE,
                FENC_STRIDE, enc_dc );
            if( delta == 4 )
                sums_base += stride * (h->fenc->i_lines[0] + PADV*2);
            if( i_pixel == PIXEL_16x16 || i_pixel == PIXEL_8x16 || i_pixel == PIXEL_4x8 )
                delta *= stride;
            if( i_pixel == PIXEL_8x16 || i_pixel == PIXEL_4x8 )
                enc_dc[1] = enc_dc[2];

            if( h->mb.i_me_method == X264_ME_TESA )
            {
                // ADS threshold, then SAD threshold, then keep the best few SADs, then SATD
                typedef struct {
                    int sad;
                    int16_t mx, my;
                } mvsad_t;
                mvsad_t *mvsads = x264_malloc( width*(max_y-min_y+1)*sizeof(mvsad_t) );
                int nmvsad = 0, limit;
                int sad_thresh = i_me_range <= 16 ? 10 : i_me_range <= 24 ? 11 : 12;
                int bsad = h->pixf.sad[i_pixel]( m->p_fenc[0], FENC_STRIDE, p_fref+bmy*stride+bmx, stride )
                         + BITS_MVD( bmx, bmy );

                /* same allocation-failure policy as for xs above */
                if( !mvsads )
                {
                    if( xs != xs_buf )
                        x264_free( xs );
                    break;
                }

                for( my = min_y; my <= max_y; my++ )
                {
                    int ycost = p_cost_mvy[my*4];
                    if( bsad <= ycost )
                        continue;
                    /* temporarily remove the vertical mv cost, which is constant over the row */
                    bsad -= ycost;
                    xn = h->pixf.ads[i_pixel]( enc_dc, sums_base + min_x + my * stride, delta,
                                               cost_fpel_mvx+min_x, xs, width, bsad*17/16 );
                    for( i=0; i<xn-2; i+=3 )
                    {
                        uint8_t *ref = p_fref+min_x+my*stride;
                        int sads[3];
                        h->pixf.sad_x3[i_pixel]( m->p_fenc[0], ref+xs[i], ref+xs[i+1], ref+xs[i+2], stride, sads );
                        for( j=0; j<3; j++ )
                        {
                            int sad = sads[j] + cost_fpel_mvx[xs[i+j]];
                            if( sad < bsad*sad_thresh>>3 )
                            {
                                COPY1_IF_LT( bsad, sad );
                                mvsads[nmvsad].sad = sad + ycost;
                                mvsads[nmvsad].mx = min_x+xs[i+j];
                                mvsads[nmvsad].my = my;
                                nmvsad++;
                            }
                        }
                    }
                    for( ; i<xn; i++ )
                    {
                        int mx = min_x+xs[i];
                        int sad = h->pixf.sad[i_pixel]( m->p_fenc[0], FENC_STRIDE, p_fref+mx+my*stride, stride )
                                + cost_fpel_mvx[xs[i]];
                        if( sad < bsad*sad_thresh>>3 )
                        {
                            COPY1_IF_LT( bsad, sad );
                            mvsads[nmvsad].sad = sad + ycost;
                            mvsads[nmvsad].mx = mx;
                            mvsads[nmvsad].my = my;
                            nmvsad++;
                        }
                    }
                    bsad += ycost;
                }

                limit = i_me_range / 2;
                if( nmvsad > limit*2 )
                {
                    // halve the range if the domain is too large... eh, close enough
                    bsad = bsad*(sad_thresh+8)>>4;
                    /* compact the list in place, keeping only entries with sad <= bsad */
                    for( i=0; i<nmvsad && mvsads[i].sad <= bsad; i++ );
                    for( j=i; j<nmvsad; j++ )
                        if( mvsads[j].sad <= bsad )
                            mvsads[i++] = mvsads[j];
                    nmvsad = i;
                }
                if( nmvsad > limit )
                {
                    /* partial selection sort: move the `limit` smallest SADs to the front */
                    for( i=0; i<limit; i++ )
                    {
                        int bj = i;
                        int min_sad = mvsads[bj].sad;
                        for( j=i+1; j<nmvsad; j++ )
                            COPY2_IF_LT( min_sad, mvsads[j].sad, bj, j );
                        if( bj > i )
                            XCHG( mvsad_t, mvsads[i], mvsads[bj] );
                    }
                    nmvsad = limit;
                }
                for( i=0; i<nmvsad; i++ )
                    COST_MV( mvsads[i].mx, mvsads[i].my );
                x264_free( mvsads );
            }
            else
            {
                // just ADS and SAD
                for( my = min_y; my <= max_y; my++ )
                {
                    int ycost = p_cost_mvy[my*4];
                    if( bcost <= ycost )
                        continue;
                    /* COST_MV_X3_ABS adds no vertical mv cost, so compare against
                     * bcost with the (row-constant) vertical cost removed */
                    bcost -= ycost;
                    xn = h->pixf.ads[i_pixel]( enc_dc, sums_base + min_x + my * stride, delta,
                                               cost_fpel_mvx+min_x, xs, width, bcost );
                    for( i=0; i<xn-2; i+=3 )
                        COST_MV_X3_ABS( min_x+xs[i],my, min_x+xs[i+1],my, min_x+xs[i+2],my );
                    bcost += ycost;
                    for( ; i<xn; i++ )
                        COST_MV( min_x+xs[i], my );
                }
            }

            if( xs != xs_buf )
                x264_free( xs );
        }
        break;
    }

    /* -> qpel mv */
    if( bpred_cost < bcost )
    {
        m->mv[0] = bpred_mx;
        m->mv[1] = bpred_my;
        m->cost = bpred_cost;
    }
    else
    {
        /* multiplication instead of a left shift: bmx/bmy may be negative */
        m->mv[0] = bmx * 4;
        m->mv[1] = bmy * 4;
        m->cost = bcost;
    }

    /* compute the real cost */
    m->cost_mv = p_cost_mvx[ m->mv[0] ] + p_cost_mvy[ m->mv[1] ];
    /* add back the mv cost that was subtracted from the rounded MVP above */
    if( bmx == pmx && bmy == pmy && h->mb.i_subpel_refine < 3 )
        m->cost += m->cost_mv;

    /* subpel refine */
    if( h->mb.i_subpel_refine >= 2 )
    {
        int hpel = subpel_iterations[h->mb.i_subpel_refine][2];
        int qpel = subpel_iterations[h->mb.i_subpel_refine][3];
        refine_subpel( h, m, hpel, qpel, p_halfpel_thresh, 0 );
    }
    else if( m->mv[1] > h->mb.mv_max_spel[1] )
        m->mv[1] = h->mb.mv_max_spel[1];
}
#undef COST_MV
Laurent Aimar's avatar
Laurent Aimar committed
627
628

/* Subpel refinement of an already searched block, using the "refine" iteration
 * counts (columns 0 and 1 of subpel_iterations) for the current
 * h->mb.i_subpel_refine level.  The result is left in *m by refine_subpel(). */
void x264_me_refine_qpel( x264_t *h, x264_me_t *m )
{
    int hpel = subpel_iterations[h->mb.i_subpel_refine][0];
    int qpel = subpel_iterations[h->mb.i_subpel_refine][1];

    /* in P slices, blocks of 8x8 and larger have the reference cost removed before refinement */
    if( m->i_pixel <= PIXEL_8x8 && h->sh.i_type == SLICE_TYPE_P )
        m->cost -= m->i_ref_cost;

    refine_subpel( h, m, hpel, qpel, NULL, 1 );
}

Loren Merritt's avatar
Loren Merritt committed
639
/* Evaluate the candidate (mx,my) with the full-pel compare function on the
 * block returned by h->mc.get_ref, add the mv cost looked up directly at
 * mx/my, and record the candidate in bcost/bmx/bmy if it is cheaper.
 * Relies on h, m, i_pixel, pix, bw, bh, p_cost_mvx/p_cost_mvy, bcost, bmx, bmy
 * from the enclosing scope. */
#define COST_MV_SAD( mx, my ) \
{ \
    int stride = 16; \
    uint8_t *src = h->mc.get_ref( pix[0], &stride, m->p_fref, m->i_stride[0], mx, my, bw, bh ); \
    int cost = h->pixf.fpelcmp[i_pixel]( m->p_fenc[0], FENC_STRIDE, src, stride ) \
             + p_cost_mvx[ mx ] + p_cost_mvy[ my ]; \
    COPY3_IF_LT( bcost, cost, bmx, mx, bmy, my ); \
}

648
649
/* Evaluate the candidate (mx,my) with the mbcmp compare function, optionally
 * adding the chroma cost, and record it in bcost/bmx/bmy/bdir if it is cheaper.
 * Unless b_refine_qpel is set, the candidate is skipped when (dir^1) == odir.
 * The chroma planes are only evaluated while the running cost is still below
 * bcost, so a candidate that already lost costs no further work.
 * NOTE: expands to a bare `if` statement - do not follow a use with `else`.
 * Relies on h, m, i_pixel, pix, bw, bh, b_chroma_me, odir and the mv cost
 * tables from the enclosing scope. */
#define COST_MV_SATD( mx, my, dir ) \
if( b_refine_qpel || ((dir)^1) != odir ) \
{ \
    int stride = 16; \
    uint8_t *src = h->mc.get_ref( pix[0], &stride, m->p_fref, m->i_stride[0], mx, my, bw, bh ); \
    int cost = h->pixf.mbcmp[i_pixel]( m->p_fenc[0], FENC_STRIDE, src, stride ) \
             + p_cost_mvx[ mx ] + p_cost_mvy[ my ]; \
    if( b_chroma_me && cost < bcost ) \
    { \
        h->mc.mc_chroma( pix[0], 8, m->p_fref[4], m->i_stride[1], mx, my, bw/2, bh/2 ); \
        cost += h->pixf.mbcmp[i_pixel+3]( m->p_fenc[1], FENC_STRIDE, pix[0], 8 ); \
        if( cost < bcost ) \
        { \
            h->mc.mc_chroma( pix[0], 8, m->p_fref[5], m->i_stride[1], mx, my, bw/2, bh/2 ); \
            cost += h->pixf.mbcmp[i_pixel+3]( m->p_fenc[2], FENC_STRIDE, pix[0], 8 ); \
        } \
    } \
    if( cost < bcost ) \
    { \
        bcost = cost; \
        bmx = (mx); \
        bmy = (my); \
        bdir = (dir); \
    } \
}

674
/* Sub-pel refinement of the motion vector held in m.
 *
 * h                 encoder context (supplies mc/pixf function tables and the
 *                   mv range limits in h->mb).
 * m                 estimate to refine; m->mv and m->cost are read as the
 *                   starting point and overwritten with the result.
 * hpel_iters        maximum number of half-pel diamond iterations.
 * qpel_iters        maximum number of quarter-pel diamond iterations.
 * p_halfpel_thresh  optional (may be NULL). After the half-pel stage, if
 *                   7/8 of the best cost exceeds *p_halfpel_thresh the
 *                   function returns early without a quarter-pel search and
 *                   without updating m->cost_mv; otherwise the threshold is
 *                   lowered to the best cost when that is smaller.
 * b_refine_qpel     when zero, the half-pel winner is re-scored with
 *                   COST_MV_SATD before the quarter-pel stage, and the
 *                   quarter-pel search skips candidates whose direction tag
 *                   satisfies (dir^1) == odir. */
static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_iters, int *p_halfpel_thresh, int b_refine_qpel )
{
    const int bw = x264_pixel_size[m->i_pixel].w;
    const int bh = x264_pixel_size[m->i_pixel].h;
    /* cost tables biased by the predictor, so they can be indexed by absolute mv */
    const int16_t *p_cost_mvx = m->p_cost_mv - m->mvp[0];
    const int16_t *p_cost_mvy = m->p_cost_mv - m->mvp[1];
    const int i_pixel = m->i_pixel;
    const int b_chroma_me = h->mb.b_chroma_me && i_pixel <= PIXEL_8x8;

    DECLARE_ALIGNED_16( uint8_t pix[2][32*18] ); // really 17x17, but round up for alignment
    int omx, omy;
    int i;

    int bmx = m->mv[0];
    int bmy = m->mv[1];
    int bcost = m->cost;
    int odir = -1, bdir;

    /* try the subpel component of the predicted mv */
    if( hpel_iters && h->mb.i_subpel_refine < 3 )
    {
        int mx = x264_clip3( m->mvp[0], h->mb.mv_min_spel[0], h->mb.mv_max_spel[0] );
        int my = x264_clip3( m->mvp[1], h->mb.mv_min_spel[1], h->mb.mv_max_spel[1] );
        if( mx != bmx || my != bmy )
            COST_MV_SAD( mx, my );
    }

    /* halfpel diamond search */
    for( i = hpel_iters; i > 0; i-- )
    {
        int costs[4];
        int stride = 32; // candidates are either all hpel or all qpel, so one stride is enough
        uint8_t *src0, *src1, *src2, *src3;
        /* reuse the function-scope omx/omy instead of shadowing them */
        omx = bmx;
        omy = bmy;
        /* two fetches cover all four neighbours: src0 is one row taller and
         * src2 is wider, so src1/src3 are the same buffers shifted by one
         * row / one column. */
        src0 = h->mc.get_ref( pix[0], &stride, m->p_fref, m->i_stride[0], omx, omy-2, bw, bh+1 );
        src2 = h->mc.get_ref( pix[1], &stride, m->p_fref, m->i_stride[0], omx-2, omy, bw+4, bh );
        src1 = src0 + stride;
        src3 = src2 + 1;
        h->pixf.fpelcmp_x4[i_pixel]( m->p_fenc[0], src0, src1, src2, src3, stride, costs );
        COPY2_IF_LT( bcost, costs[0] + p_cost_mvx[omx  ] + p_cost_mvy[omy-2], bmy, omy-2 );
        COPY2_IF_LT( bcost, costs[1] + p_cost_mvx[omx  ] + p_cost_mvy[omy+2], bmy, omy+2 );
        COPY3_IF_LT( bcost, costs[2] + p_cost_mvx[omx-2] + p_cost_mvy[omy  ], bmx, omx-2, bmy, omy );
        COPY3_IF_LT( bcost, costs[3] + p_cost_mvx[omx+2] + p_cost_mvy[omy  ], bmx, omx+2, bmy, omy );
        if( bmx == omx && bmy == omy )
            break;
    }

    if( !b_refine_qpel )
    {
        /* check for mvrange */
        if( bmy > h->mb.mv_max_spel[1] )
            bmy = h->mb.mv_max_spel[1];
        /* re-score the half-pel winner with the mbcmp metric used below */
        bcost = COST_MAX;
        COST_MV_SATD( bmx, bmy, -1 );
    }

    /* early termination when examining multiple reference frames */
    if( p_halfpel_thresh )
    {
        if( (bcost*7)>>3 > *p_halfpel_thresh )
        {
            m->cost = bcost;
            m->mv[0] = bmx;
            m->mv[1] = bmy;
            // don't need cost_mv
            return;
        }
        else if( bcost < *p_halfpel_thresh )
            *p_halfpel_thresh = bcost;
    }

    /* quarterpel diamond search */
    bdir = -1;
    for( i = qpel_iters; i > 0; i-- )
    {
        odir = bdir;
        omx = bmx;
        omy = bmy;
        COST_MV_SATD( omx, omy - 1, 0 );
        COST_MV_SATD( omx, omy + 1, 1 );
        COST_MV_SATD( omx - 1, omy, 2 );
        COST_MV_SATD( omx + 1, omy, 3 );
        if( bmx == omx && bmy == omy )
            break;
    }

    /* check for mvrange */
    if( bmy > h->mb.mv_max_spel[1] )
    {
        bmy = h->mb.mv_max_spel[1];
        bcost = COST_MAX;
        COST_MV_SATD( bmx, bmy, -1 );
    }

    m->cost = bcost;
    m->mv[0] = bmx;
    m->mv[1] = bmy;
    m->cost_mv = p_cost_mvx[ bmx ] + p_cost_mvy[ bmy ];
}

/* Fill the prediction caches used by x264_me_refine_bidir().
 * For the offset (dx,dy) relative to the current centre mvs, renders the
 * luma prediction of both references with h->mc.mc_luma into slot
 * 4 + 3*dx + dy of pix0 (list 0) and pix1 (list 1). With dx,dy in {-1,0,1}
 * that slot index ranges over 0..8, matching the 9-entry caches.
 * Uses from the expanding scope: h, m0, m1, pix0, pix1, bw, bh,
 * om0x, om0y, om1x, om1y. */
#define BIME_CACHE( dx, dy ) \
{ \
    int i = 4 + 3*dx + dy; \
    h->mc.mc_luma( pix0[i], bw, m0->p_fref, m0->i_stride[0], om0x+dx, om0y+dy, bw, bh ); \
    h->mc.mc_luma( pix1[i], bw, m1->p_fref, m1->i_stride[0], om1x+dx, om1y+dy, bw, bh ); \
}

/* Cache the offset (a,b) and its mirror image (-a,-b). */
#define BIME_CACHE2(a,b) \
    BIME_CACHE(a,b) \
    BIME_CACHE(-(a),-(b))

/* Score one pair of motion vectors (m0x,m0y) / (m1x,m1y) in
 * x264_me_refine_bidir().
 * On passes after the first, pairs already marked in visited[] (indexed by
 * the low 3 bits of each component) are skipped. Otherwise the pair is
 * marked, the cached list-0 prediction is copied into pix and averaged with
 * the cached list-1 prediction (plain average when i_weight == 32, weighted
 * otherwise), and the mbcmp score plus the four mv costs is compared with
 * bcost; a cheaper pair replaces bm0x/bm0y/bm1x/bm1y.
 * The cache slots i0/i1 use the same 4 + 3*dx + dy layout as BIME_CACHE, so
 * each mv must be within one unit of the current centre (om0x, ...). */
#define COST_BIMV_SATD( m0x, m0y, m1x, m1y ) \
if( pass == 0 || !visited[(m0x)&7][(m0y)&7][(m1x)&7][(m1y)&7] ) \
{ \
    int cost; \
    int i0 = 4 + 3*(m0x-om0x) + (m0y-om0y); \
    int i1 = 4 + 3*(m1x-om1x) + (m1y-om1y); \
    visited[(m0x)&7][(m0y)&7][(m1x)&7][(m1y)&7] = 1; \
    h->mc.memcpy_aligned( pix, pix0[i0], bs ); \
    if( i_weight == 32 ) \
        h->mc.avg[i_pixel]( pix, bw, pix1[i1], bw ); \
    else \
        h->mc.avg_weight[i_pixel]( pix, bw, pix1[i1], bw, i_weight ); \
    cost = h->pixf.mbcmp[i_pixel]( m0->p_fenc[0], FENC_STRIDE, pix, bw ) \
         + p_cost_m0x[ m0x ] + p_cost_m0y[ m0y ] \
         + p_cost_m1x[ m1x ] + p_cost_m1y[ m1y ]; \
    if( cost < bcost ) \
    {                  \
        bcost = cost;  \
        bm0x = m0x;    \
        bm0y = m0y;    \
        bm1x = m1x;    \
        bm1y = m1y;    \
    } \
}

/* Score the mv pair obtained by adding (a,b) to the list-0 centre and (c,d)
 * to the list-1 centre. */
#define CHECK_BIDIR(a,b,c,d) \
    COST_BIMV_SATD(om0x+a, om0y+b, om1x+c, om1y+d)

/* Score an offset tuple and its negation. */
#define CHECK_BIDIR2(a,b,c,d) \
    CHECK_BIDIR(a,b,c,d) \
    CHECK_BIDIR(-(a),-(b),-(c),-(d))

/* Score all four cyclic rotations of the offset tuple, each together with
 * its negation (8 candidates). */
#define CHECK_BIDIR8(a,b,c,d) \
    CHECK_BIDIR2(a,b,c,d) \
    CHECK_BIDIR2(b,c,d,a) \
    CHECK_BIDIR2(c,d,a,b) \
    CHECK_BIDIR2(d,a,b,c)

/* Jointly refine the two motion vectors of a bidirectional prediction.
 *
 * h         encoder context.
 * m0, m1    list-0 / list-1 estimates; their mv[] fields are the starting
 *           point and receive the refined vectors (they are left untouched
 *           on the early return below).
 * i_weight  32 selects the plain average of the two predictions; any other
 *           value is passed to the weighted average.
 *
 * Returns the best cost found: mbcmp of the combined prediction against
 * m0->p_fenc[0] plus the mv costs of both vectors. */
int x264_me_refine_bidir( x264_t *h, x264_me_t *m0, x264_me_t *m1, int i_weight )
{
    const int i_pixel = m0->i_pixel;
    const int bw = x264_pixel_size[i_pixel].w;
    const int bh = x264_pixel_size[i_pixel].h;
    const int bs = bw*bh;
    /* cost tables biased by the clipped predictors. The vertical components
     * are clipped against the vertical limits (index 1); the original clipped
     * them against the horizontal limits. */
    const int16_t *p_cost_m0x = m0->p_cost_mv - x264_clip3( m0->mvp[0], h->mb.mv_min_spel[0], h->mb.mv_max_spel[0] );
    const int16_t *p_cost_m0y = m0->p_cost_mv - x264_clip3( m0->mvp[1], h->mb.mv_min_spel[1], h->mb.mv_max_spel[1] );
    const int16_t *p_cost_m1x = m1->p_cost_mv - x264_clip3( m1->mvp[0], h->mb.mv_min_spel[0], h->mb.mv_max_spel[0] );
    const int16_t *p_cost_m1y = m1->p_cost_mv - x264_clip3( m1->mvp[1], h->mb.mv_min_spel[1], h->mb.mv_max_spel[1] );
    /* 3x3 neighbourhood of cached predictions per list, see BIME_CACHE */
    DECLARE_ALIGNED_16( uint8_t pix0[9][16*16] );
    DECLARE_ALIGNED_16( uint8_t pix1[9][16*16] );
    DECLARE_ALIGNED_16( uint8_t pix[16*16] );
    int bm0x = m0->mv[0], om0x = bm0x;
    int bm0y = m0->mv[1], om0y = bm0y;
    int bm1x = m1->mv[0], om1x = bm1x;
    int bm1y = m1->mv[1], om1y = bm1y;
    int bcost = COST_MAX;
    int pass = 0;
    /* handed to memzero_aligned, so declare it aligned like the other
     * buffers (NOTE(review): presumably the routine needs 16-byte alignment) */
    DECLARE_ALIGNED_16( uint8_t visited[8][8][8][8] );
    h->mc.memzero_aligned( visited, sizeof(visited) );

    /* score the starting pair */
    BIME_CACHE( 0, 0 );
    CHECK_BIDIR( 0, 0, 0, 0 );

    /* no refinement when either vertical mv is within 8 of the upper limit */
    if( bm0y > h->mb.mv_max_spel[1] - 8 ||
        bm1y > h->mb.mv_max_spel[1] - 8 )
        return bcost;

    for( pass = 0; pass < 8; pass++ )
    {
        /* check all mv pairs that differ in at most 2 components from the current mvs. */
        /* doesn't do chroma ME. this probably doesn't matter, as the gains
         * from bidir ME are the same with and without chroma ME. */

        BIME_CACHE2( 1, 0 );
        BIME_CACHE2( 0, 1 );
        BIME_CACHE2( 1, 1 );
        BIME_CACHE2( 1,-1 );

        CHECK_BIDIR8( 0, 0, 0, 1 );
        CHECK_BIDIR8( 0, 0, 1, 1 );
        CHECK_BIDIR2( 0, 1, 0, 1 );
        CHECK_BIDIR2( 1, 0, 1, 0 );
        CHECK_BIDIR8( 0, 0,-1, 1 );
        CHECK_BIDIR2( 0,-1, 0, 1 );
        CHECK_BIDIR2(-1, 0, 1, 0 );

        /* converged: no candidate beat the current centre */
        if( om0x == bm0x && om0y == bm0y && om1x == bm1x && om1y == bm1y )
            break;

        /* recentre on the new best pair and refresh its cached prediction */
        om0x = bm0x;
        om0y = bm0y;
        om1x = bm1x;
        om1y = bm1y;
        BIME_CACHE( 0, 0 );
    }

    m0->mv[0] = bm0x;
    m0->mv[1] = bm0y;
    m1->mv[0] = bm1x;
    m1->mv[1] = bm1y;
    return bcost;
}

/* Redefinition of COST_MV_SATD for x264_me_refine_qpel_rd().
 * Stores the mbcmp score of candidate (mx,my) plus its mv cost into dst and
 * lets COPY1_IF_LT track the lowest such value in bsatd. Here bw/bh are the
 * block size divided by 4 in the expanding scope, hence the *4 when fetching
 * the reference block.
 * Uses from the expanding scope: h, m, pix, bw, bh, i_pixel, p_cost_mvx,
 * p_cost_mvy, bsatd. */
#undef COST_MV_SATD
#define COST_MV_SATD( mx, my, dst ) \
{ \
    int stride = 16; \
    uint8_t *src = h->mc.get_ref( pix, &stride, m->p_fref, m->i_stride[0], mx, my, bw*4, bh*4 ); \
    dst = h->pixf.mbcmp[i_pixel]( m->p_fenc[0], FENC_STRIDE, src, stride ) \
        + p_cost_mvx[mx] + p_cost_mvy[my]; \
    COPY1_IF_LT( bsatd, dst ); \
}

/* Run the full RD cost for candidate (mx,my), but only when its SATD-based
 * score is within SATD_THRESH of the best one seen so far (bsatd).
 * The candidate is written into the mv cache (cache_mv and cache_mv2) before
 * x264_rd_cost_part() is called. If the RD cost beats bcost, the candidate
 * becomes bmx/bmy and dir is set to mdir when do_dir is non-zero (otherwise
 * dir keeps its value).
 * Uses from the expanding scope: h, m, i_lambda2, i8, cache_mv, cache_mv2,
 * bsatd, bcost, bmx, bmy, dir. */
#define COST_MV_RD( mx, my, satd, do_dir, mdir ) \
{ \
    if( satd <= bsatd * SATD_THRESH )\
    { \
        int cost; \
        cache_mv[0] = cache_mv2[0] = mx; \
        cache_mv[1] = cache_mv2[1] = my; \
        cost = x264_rd_cost_part( h, i_lambda2, i8, m->i_pixel ); \
        COPY4_IF_LT( bcost, cost, bmx, mx, bmy, my, dir, do_dir?mdir:dir ); \
    } \
}

/* Deliberately NOT parenthesized: "bsatd * SATD_THRESH" must expand to
 * (bsatd * 17) / 16. Writing (17/16) would evaluate to 1 in integer math. */
#define SATD_THRESH 17/16

/* RD-based quarter-pel refinement of one partition.
 *
 * h          encoder context.
 * m          estimate to refine; m->mv and m->cost are overwritten with the
 *            result. For partitions other than 16x16 with i8 != 0 the
 *            predictor m->mvp is recomputed first.
 * i_lambda2  passed through to x264_rd_cost_part().
 * i8         index of the 8x8 block the partition starts at.
 *
 * Candidates are pre-screened by SATD (COST_MV_SATD) and only those close to
 * the best SATD get a full RD evaluation (COST_MV_RD). The search tries the
 * starting mv, the predicted mv, a hexagon, up to 9 half-hexagon steps, and a
 * final square. The winning mv and its mvd are written to the macroblock
 * caches. */
void x264_me_refine_qpel_rd( x264_t *h, x264_me_t *m, int i_lambda2, int i8 )
{
    // don't have to fill the whole mv cache rectangle
    static const int pixel_mv_offs[] = { 0, 4, 4*8, 0 };
    int16_t *cache_mv = h->mb.cache.mv[0][x264_scan8[i8*4]];
    int16_t *cache_mv2 = cache_mv + pixel_mv_offs[m->i_pixel];
    const int16_t *p_cost_mvx, *p_cost_mvy;
    /* block size in units of 4 pixels */
    const int bw = x264_pixel_size[m->i_pixel].w>>2;
    const int bh = x264_pixel_size[m->i_pixel].h>>2;
    const int i_pixel = m->i_pixel;

    DECLARE_ALIGNED_16( uint8_t pix[16*16] );
    int bcost = m->i_pixel == PIXEL_16x16 ? m->cost : COST_MAX;
    int bmx = m->mv[0];
    int bmy = m->mv[1];
    int omx = bmx;
    int omy = bmy;
    int pmx, pmy, i, j;
    unsigned bsatd;
    int satd = 0;
    int dir = -2;
    int satds[8];

    if( m->i_pixel != PIXEL_16x16 && i8 != 0 )
        x264_mb_predict_mv( h, 0, i8*4, bw, m->mvp );
    pmx = m->mvp[0];
    pmy = m->mvp[1];
    p_cost_mvx = m->p_cost_mv - pmx;
    p_cost_mvy = m->p_cost_mv - pmy;
    /* seed bsatd and bcost with the starting mv; satd 0 always passes the threshold */
    COST_MV_SATD( bmx, bmy, bsatd );
    COST_MV_RD( bmx, bmy, 0, 0, 0);

    /* check the predicted mv */
    if( (bmx != pmx || bmy != pmy)
        && pmx >= h->mb.mv_min_spel[0] && pmx <= h->mb.mv_max_spel[0]
        && pmy >= h->mb.mv_min_spel[1] && pmy <= h->mb.mv_max_spel[1] )
    {
        COST_MV_SATD( pmx, pmy, satd );
        COST_MV_RD( pmx, pmy, satd, 0,0 );
    }

    /* subpel hex search, same pattern as ME HEX. */
    dir = -2;
    omx = bmx;
    omy = bmy;
    for( j=0; j<6; j++ ) COST_MV_SATD( omx + hex2[j+1][0], omy + hex2[j+1][1], satds[j] );
    for( j=0; j<6; j++ ) COST_MV_RD  ( omx + hex2[j+1][0], omy + hex2[j+1][1], satds[j], 1,j );
    if( dir != -2 )
    {
        /* half hexagon, not overlapping the previous iteration */
        for( i = 1; i < 10; i++ )
        {
            const int odir = mod6m1[dir+1];
            /* stop when the centre is within 2 of either vertical limit.
             * The lower bound needs "+ 2"; the original "- 2" let the centre
             * sit below the minimum.
             * NOTE(review): the margin of 2 presumably matches the largest
             * vertical hex2 offset - confirm against the hex2 table. */
            if( bmy > h->mb.mv_max_spel[1] - 2 ||
                bmy < h->mb.mv_min_spel[1] + 2 )
                break;
            dir = -2;
            omx = bmx;
            omy = bmy;
            for( j=0; j<3; j++ ) COST_MV_SATD( omx + hex2[odir+j][0], omy + hex2[odir+j][1], satds[j] );
            for( j=0; j<3; j++ ) COST_MV_RD  ( omx + hex2[odir+j][0], omy + hex2[odir+j][1], satds[j], 1, odir-1+j );
            /* no candidate of this step won: converged */
            if( dir == -2 )
                break;
        }
    }

    /* square refine, same pattern as ME HEX. */
    omx = bmx;
    omy = bmy;
    for( i=0; i<8; i++ ) COST_MV_SATD( omx + square1[i][0], omy  + square1[i][1], satds[i] );
    for( i=0; i<8; i++ ) COST_MV_RD  ( omx + square1[i][0], omy  + square1[i][1], satds[i], 0,0 );

    bmy = x264_clip3( bmy, h->mb.mv_min_spel[1],  h->mb.mv_max_spel[1] );
    m->cost = bcost;
    m->mv[0] = bmx;
    m->mv[1] = bmy;

    /* publish the final mv and mvd; COST_MV_RD left trial values in the cache */
    x264_macroblock_cache_mv ( h, 2*(i8&1), i8&2, bw, bh, 0, pack16to32_mask(bmx, bmy) );
    x264_macroblock_cache_mvd( h, 2*(i8&1), i8&2, bw, bh, 0, pack16to32_mask(bmx - pmx, bmy - pmy) );
}