slicetype.c 81.9 KB
Newer Older
1
/*****************************************************************************
Fiona Glaser's avatar
Fiona Glaser committed
2
 * slicetype.c: lookahead analysis
3
 *****************************************************************************
Henrik Gramner's avatar
Henrik Gramner committed
4
 * Copyright (C) 2005-2018 x264 project
5
 *
Dylan Yudaken's avatar
Dylan Yudaken committed
6
7
8
 * Authors: Fiona Glaser <fiona@x264.com>
 *          Loren Merritt <lorenm@u.washington.edu>
 *          Dylan Yudaken <dyudaken@gmail.com>
9
10
11
12
13
14
15
16
17
18
19
20
21
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
22
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
Fiona Glaser's avatar
Fiona Glaser committed
23
24
25
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
26
27
28
29
30
31
 *****************************************************************************/

#include "common/common.h"
#include "macroblock.h"
#include "me.h"

32
33
34
// Indexed by pic_struct values
// NOTE(review): entry 0 is 0, so index 0 is presumably never used as a divisor — confirm at the use site.
static const uint8_t delta_tfi_divisor[10] = { 0, 2, 1, 1, 2, 2, 3, 3, 4, 6 };

35
static int slicetype_frame_cost( x264_t *h, x264_mb_analysis_t *a,
36
                                      x264_frame_t **frames, int p0, int p1, int b );
37

38
#define x264_weights_analyse x264_template(weights_analyse)
Steve Borho's avatar
Steve Borho committed
39
40
41
void x264_weights_analyse( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, int b_lookahead );

#if HAVE_OPENCL
42
#include "slicetype-cl.h"
Steve Borho's avatar
Steve Borho committed
43
44
#endif

45
/* Initialise an analysis context and the encoder's motion-estimation settings
 * at the fixed lookahead QP.
 *
 * h: encoder handle; h->mb.i_me_method, h->mb.i_subpel_refine and
 *    h->mb.b_chroma_me are overwritten.
 * a: analysis context; i_qp and i_lambda are derived from X264_LOOKAHEAD_QP
 *    and then mb_analyse_load_costs() is run on it. */
static void lowres_context_init( x264_t *h, x264_mb_analysis_t *a )
{
    a->i_qp = X264_LOOKAHEAD_QP;
    a->i_lambda = x264_lambda_tab[ a->i_qp ];
    mb_analyse_load_costs( h, a );
    /* The configured subpel level only selects between two fixed setups:
     * (at most HEX, subpel 4) or (DIA, subpel 2). */
    if( h->param.analyse.i_subpel_refine > 1 )
    {
        h->mb.i_me_method = X264_MIN( X264_ME_HEX, h->param.analyse.i_me_method );
        h->mb.i_subpel_refine = 4;
    }
    else
    {
        h->mb.i_me_method = X264_ME_DIA;
        h->mb.i_subpel_refine = 2;
    }
    h->mb.b_chroma_me = 0;
}
62

Loren Merritt's avatar
Loren Merritt committed
63
/* makes a non-h264 weight (i.e. fix7), into an h264 weight */
64
static void weight_get_h264( int weight_nonh264, int offset, x264_weight_t *w )
Dylan Yudaken's avatar
Dylan Yudaken committed
65
66
67
68
{
    w->i_offset = offset;
    w->i_denom = 7;
    w->i_scale = weight_nonh264;
69
    while( w->i_denom > 0 && (w->i_scale > 127) )
Dylan Yudaken's avatar
Dylan Yudaken committed
70
71
72
73
74
75
76
    {
        w->i_denom--;
        w->i_scale >>= 1;
    }
    w->i_scale = X264_MIN( w->i_scale, 127 );
}

77
/* Prepare the lowres luma reference used for weight cost evaluation.
 *
 * If the first stored list-0 lowres MV of fenc towards ref is not the 0x7FFF
 * marker, every 8x8 lowres block of ref is motion-compensated into dest using
 * fenc's stored MVs and dest is returned. Otherwise nothing is written and
 * ref->lowres[0] is returned directly.
 *
 * dest is written with fenc's lowres stride, one 8x8 block at a time. */
static NOINLINE pixel *weight_cost_init_luma( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, pixel *dest )
{
    int ref0_distance = fenc->i_frame - ref->i_frame - 1;
    /* Note: this will never run during lookahead as weights_analyse is only called if no
     * motion search has been done. */
    if( fenc->lowres_mvs[0][ref0_distance][0][0] != 0x7FFF )
    {
        int i_stride = fenc->i_stride_lowres;
        int i_lines = fenc->i_lines_lowres;
        int i_width = fenc->i_width_lowres;
        int i_mb_xy = 0;
        pixel *p = dest;

        /* p advances one block row (8 lines) per outer iteration; i_mb_xy
         * counts blocks in raster order. */
        for( int y = 0; y < i_lines; y += 8, p += i_stride*8 )
            for( int x = 0; x < i_width; x += 8, i_mb_xy++ )
            {
                int mvx = fenc->lowres_mvs[0][ref0_distance][i_mb_xy][0];
                int mvy = fenc->lowres_mvs[0][ref0_distance][i_mb_xy][1];
                /* The block position is folded into the MV (x<<2, y<<2) because
                 * ref->lowres is passed unoffset. */
                h->mc.mc_luma( p+x, i_stride, ref->lowres, i_stride,
                               mvx+(x<<2), mvy+(y<<2), 8, 8, x264_weight_none );
            }
        x264_emms();
        return dest;
    }
    x264_emms();
    return ref->lowres[0];
}

Henrik Gramner's avatar
Henrik Gramner committed
105
/* How data is organized for 4:2:0/4:2:2 chroma weightp:
Fiona Glaser's avatar
Fiona Glaser committed
106
107
108
 * [U: ref] [U: fenc]
 * [V: ref] [V: fenc]
 * fenc = ref + offset
Henrik Gramner's avatar
Henrik Gramner committed
109
 * v = u + stride * chroma height */
Fiona Glaser's avatar
Fiona Glaser committed
110

111
static NOINLINE void weight_cost_init_chroma( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, pixel *dstu, pixel *dstv )
Fiona Glaser's avatar
Fiona Glaser committed
112
113
114
115
116
117
{
    int ref0_distance = fenc->i_frame - ref->i_frame - 1;
    int i_stride = fenc->i_stride[1];
    int i_offset = i_stride / 2;
    int i_lines = fenc->i_lines[1];
    int i_width = fenc->i_width[1];
118
    int v_shift = CHROMA_V_SHIFT;
Henrik Gramner's avatar
Henrik Gramner committed
119
120
121
    int cw = 8*h->mb.i_mb_width;
    int ch = 16*h->mb.i_mb_height >> v_shift;
    int height = 16 >> v_shift;
Fiona Glaser's avatar
Fiona Glaser committed
122
123
124

    if( fenc->lowres_mvs[0][ref0_distance][0][0] != 0x7FFF )
    {
125
        x264_frame_expand_border_chroma( h, ref, 1 );
Henrik Gramner's avatar
Henrik Gramner committed
126
        for( int y = 0, mb_xy = 0, pel_offset_y = 0; y < i_lines; y += height, pel_offset_y = y*i_stride )
Fiona Glaser's avatar
Fiona Glaser committed
127
128
129
130
            for( int x = 0, pel_offset_x = 0; x < i_width; x += 8, mb_xy++, pel_offset_x += 8 )
            {
                pixel *pixu = dstu + pel_offset_y + pel_offset_x;
                pixel *pixv = dstv + pel_offset_y + pel_offset_x;
Henrik Gramner's avatar
Henrik Gramner committed
131
                pixel *src1 =  ref->plane[1] + pel_offset_y + pel_offset_x*2; /* NV12/NV16 */
Fiona Glaser's avatar
Fiona Glaser committed
132
133
                int mvx = fenc->lowres_mvs[0][ref0_distance][mb_xy][0];
                int mvy = fenc->lowres_mvs[0][ref0_distance][mb_xy][1];
Henrik Gramner's avatar
Henrik Gramner committed
134
                h->mc.mc_chroma( pixu, pixv, i_stride, src1, i_stride, mvx, 2*mvy>>v_shift, 8, height );
Fiona Glaser's avatar
Fiona Glaser committed
135
136
137
138
139
140
141
142
            }
    }
    else
        h->mc.plane_copy_deinterleave( dstu, i_stride, dstv, i_stride, ref->plane[1], i_stride, cw, ch );
    h->mc.plane_copy_deinterleave( dstu+i_offset, i_stride, dstv+i_offset, i_stride, fenc->plane[1], i_stride, cw, ch );
    x264_emms();
}

143
/* Prepare the reference for plane p (4:4:4 path) for weight cost evaluation.
 *
 * If the first stored list-0 lowres MV of fenc towards ref is not the 0x7FFF
 * marker, each 16x16 block of ref->plane[p] is copied into dst displaced by
 * the block's stored MV divided by 2, and dst is returned. Otherwise
 * ref->plane[p] is returned untouched. */
static NOINLINE pixel *weight_cost_init_chroma444( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, pixel *dst, int p )
{
    int ref0_distance = fenc->i_frame - ref->i_frame - 1;
    int i_stride = fenc->i_stride[p];
    int i_lines = fenc->i_lines[p];
    int i_width = fenc->i_width[p];

    if( fenc->lowres_mvs[0][ref0_distance][0][0] != 0x7FFF )
    {
        x264_frame_expand_border_chroma( h, ref, p );
        for( int y = 0, mb_xy = 0, pel_offset_y = 0; y < i_lines; y += 16, pel_offset_y = y*i_stride )
            for( int x = 0, pel_offset_x = 0; x < i_width; x += 16, mb_xy++, pel_offset_x += 16 )
            {
                pixel *pix = dst + pel_offset_y + pel_offset_x;
                pixel *src = ref->plane[p] + pel_offset_y + pel_offset_x;
                int mvx = fenc->lowres_mvs[0][ref0_distance][mb_xy][0] / 2;
                int mvy = fenc->lowres_mvs[0][ref0_distance][mb_xy][1] / 2;
                /* We don't want to calculate hpels for fenc frames, so we round the motion
                 * vectors to fullpel here.  It's not too bad, I guess? */
                h->mc.copy_16x16_unaligned( pix, i_stride, src+mvx+mvy*i_stride, i_stride, 16 );
            }
        x264_emms();
        return dst;
    }
    x264_emms();
    return ref->plane[p];
}

171
/* Estimate the cost of signalling weight w[0] in the slice headers.
 *
 * w:        weight whose i_denom, i_scale and i_offset are costed.
 * b_chroma: non-zero for a chroma plane; quadruples lambda and halves the
 *           denominator cost.
 *
 * Returns lambda * slice count * estimated header bits. */
static int weight_slice_header_cost( x264_t *h, x264_weight_t *w, int b_chroma )
{
    /* Add cost of weights in the slice header. */
    int lambda = x264_lambda_tab[X264_LOOKAHEAD_QP];
    /* 4 times higher, because chroma is analyzed at full resolution. */
    if( b_chroma )
        lambda *= 4;
    /* Slice count: explicit count first, else derived from the per-slice
     * macroblock limit (rounded up), else a single slice. */
    int numslices;
    if( h->param.i_slice_count )
        numslices = h->param.i_slice_count;
    else if( h->param.i_slice_max_mbs )
        numslices = (h->mb.i_mb_width * h->mb.i_mb_height + h->param.i_slice_max_mbs-1) / h->param.i_slice_max_mbs;
    else
        numslices = 1;
    /* FIXME: find a way to account for --slice-max-size?
     * Multiply by 2 as there will be a duplicate. 10 bits added as if there is a weighted frame, then an additional duplicate is used.
     * Cut denom cost in half if chroma, since it's shared between the two chroma planes. */
    int denom_cost = bs_size_ue( w[0].i_denom ) * (2 - b_chroma);
    return lambda * numslices * ( 10 + denom_cost + 2 * (bs_size_se( w[0].i_scale ) + bs_size_se( w[0].i_offset )) );
}

192
/* Sum the lowres luma prediction cost of src against fenc over all 8x8 blocks.
 *
 * src: reference samples laid out with fenc's lowres stride.
 * w:   weight to apply to src before comparison, or NULL for no weighting.
 *
 * Each block contributes min( mbcmp, fenc->i_intra_cost[block] ). When w is
 * non-NULL the slice header cost of the weight is added to the total. */
static NOINLINE unsigned int weight_cost_luma( x264_t *h, x264_frame_t *fenc, pixel *src, x264_weight_t *w )
{
    unsigned int cost = 0;
    int i_stride = fenc->i_stride_lowres;
    int i_lines = fenc->i_lines_lowres;
    int i_width = fenc->i_width_lowres;
    pixel *fenc_plane = fenc->lowres[0];
    ALIGNED_ARRAY_16( pixel, buf,[8*8] );
    int pixoff = 0;
    int i_mb = 0;

    if( w )
    {
        for( int y = 0; y < i_lines; y += 8, pixoff = y*i_stride )
            for( int x = 0; x < i_width; x += 8, i_mb++, pixoff += 8)
            {
                /* Weight the reference block into buf (stride 8), then compare. */
                w->weightfn[8>>2]( buf, 8, &src[pixoff], i_stride, w, 8 );
                int cmp = h->pixf.mbcmp[PIXEL_8x8]( buf, 8, &fenc_plane[pixoff], i_stride );
                cost += X264_MIN( cmp, fenc->i_intra_cost[i_mb] );
            }
        cost += weight_slice_header_cost( h, w, 0 );
    }
    else
        for( int y = 0; y < i_lines; y += 8, pixoff = y*i_stride )
            for( int x = 0; x < i_width; x += 8, i_mb++, pixoff += 8 )
            {
                int cmp = h->pixf.mbcmp[PIXEL_8x8]( &src[pixoff], i_stride, &fenc_plane[pixoff], i_stride );
                cost += X264_MIN( cmp, fenc->i_intra_cost[i_mb] );
            }
    x264_emms();
    return cost;
}

225
/* Sum the chroma prediction cost for one 4:2:0/4:2:2 plane.
 *
 * ref: buffer holding the reference samples; the samples to compare against
 *      are read from the same rows, i_stride/2 samples further right.
 * w:   weight to apply to the reference before comparison, or NULL.
 *
 * Blocks are 8 wide and (16 >> CHROMA_V_SHIFT) high. When w is non-NULL the
 * slice header cost of the weight is added to the total. */
static NOINLINE unsigned int weight_cost_chroma( x264_t *h, x264_frame_t *fenc, pixel *ref, x264_weight_t *w )
{
    unsigned int cost = 0;
    int i_stride = fenc->i_stride[1];
    int i_lines = fenc->i_lines[1];
    int i_width = fenc->i_width[1];
    pixel *src = ref + (i_stride >> 1);
    ALIGNED_ARRAY_16( pixel, buf, [8*16] );
    int pixoff = 0;
    int height = 16 >> CHROMA_V_SHIFT;
    if( w )
    {
        for( int y = 0; y < i_lines; y += height, pixoff = y*i_stride )
            for( int x = 0; x < i_width; x += 8, pixoff += 8 )
            {
                w->weightfn[8>>2]( buf, 8, &ref[pixoff], i_stride, w, height );
                /* The naive and seemingly sensible algorithm is to use mbcmp as in luma.
                 * But testing shows that for chroma the DC coefficient is by far the most
                 * important part of the coding cost.  Thus a more useful chroma weight is
                 * obtained by comparing each block's DC coefficient instead of the actual
                 * pixels. */
                cost += h->pixf.asd8( buf, 8, &src[pixoff], i_stride, height );
            }
        cost += weight_slice_header_cost( h, w, 1 );
    }
    else
        for( int y = 0; y < i_lines; y += height, pixoff = y*i_stride )
            for( int x = 0; x < i_width; x += 8, pixoff += 8 )
                cost += h->pixf.asd8( &ref[pixoff], i_stride, &src[pixoff], i_stride, height );
    x264_emms();
    return cost;
}

258
/* Sum the prediction cost of ref against fenc->plane[p] over 16x16 blocks
 * (4:4:4 path).
 *
 * ref: reference samples laid out with fenc->i_stride[p].
 * w:   weight to apply to ref before comparison, or NULL for no weighting.
 * p:   plane index into fenc's stride/lines/width/plane arrays.
 *
 * When w is non-NULL the slice header cost of the weight is added. */
static NOINLINE unsigned int weight_cost_chroma444( x264_t *h, x264_frame_t *fenc, pixel *ref, x264_weight_t *w, int p )
{
    unsigned int cost = 0;
    int i_stride = fenc->i_stride[p];
    int i_lines = fenc->i_lines[p];
    int i_width = fenc->i_width[p];
    pixel *src = fenc->plane[p];
    ALIGNED_ARRAY_64( pixel, buf, [16*16] );
    int pixoff = 0;
    if( w )
    {
        for( int y = 0; y < i_lines; y += 16, pixoff = y*i_stride )
            for( int x = 0; x < i_width; x += 16, pixoff += 16 )
            {
                /* Weight the reference block into buf (stride 16), then compare. */
                w->weightfn[16>>2]( buf, 16, &ref[pixoff], i_stride, w, 16 );
                cost += h->pixf.mbcmp[PIXEL_16x16]( buf, 16, &src[pixoff], i_stride );
            }
        cost += weight_slice_header_cost( h, w, 1 );
    }
    else
        for( int y = 0; y < i_lines; y += 16, pixoff = y*i_stride )
            for( int x = 0; x < i_width; x += 16, pixoff += 16 )
                cost += h->pixf.mbcmp[PIXEL_16x16]( &ref[pixoff], i_stride, &src[pixoff], i_stride );
    x264_emms();
    return cost;
}

Steve Borho's avatar
Steve Borho committed
285
/* Search for explicit weighted-prediction parameters for predicting fenc
 * from ref and store them in fenc->weight[0][0..2].
 *
 * h:           encoder handle (ME settings, weight buffer, function tables).
 * fenc:        frame being encoded; receives the weights and, in lookahead,
 *              fenc->weighted[0].
 * ref:         reference frame.
 * b_lookahead: non-zero when called from lookahead; only luma is analysed, no
 *              neighbouring scales/offsets are searched, and on success a
 *              weighted copy of ref's lowres plane is written to
 *              h->mb.p_weight_buf[0].
 *
 * For each plane an initial scale is guessed from the ratio of pixel SSDs and
 * an offset from the difference of means; nearby scale/offset pairs are scored
 * with the weight_cost_* helpers and the weight is kept only if it beats the
 * unweighted score by more than 0.2%. */
void x264_weights_analyse( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, int b_lookahead )
{
    int i_delta_index = fenc->i_frame - ref->i_frame - 1;
    /* epsilon is chosen to require at least a numerator of 127 (with denominator = 128) */
    const float epsilon = 1.f/128.f;
    x264_weight_t *weights = fenc->weight[0];
    SET_WEIGHT( weights[0], 0, 1, 0, 0 );
    SET_WEIGHT( weights[1], 0, 1, 0, 0 );
    SET_WEIGHT( weights[2], 0, 1, 0, 0 );
    int chroma_initted = 0;
    float guess_scale[3];
    float fenc_mean[3];
    float ref_mean[3];
    for( int plane = 0; plane <= 2*!b_lookahead; plane++ )
    {
        /* Adding !ref_ssd to both keeps the ratio finite when ref's SSD is 0. */
        float fenc_var = fenc->i_pixel_ssd[plane] + !ref->i_pixel_ssd[plane];
        float ref_var  =  ref->i_pixel_ssd[plane] + !ref->i_pixel_ssd[plane];
        guess_scale[plane] = sqrtf( fenc_var / ref_var );
        fenc_mean[plane] = (float)fenc->i_pixel_sum[plane] / (fenc->i_lines[!!plane] * fenc->i_width[!!plane]) / (1 << (BIT_DEPTH - 8));
        ref_mean[plane]  = (float) ref->i_pixel_sum[plane] / (fenc->i_lines[!!plane] * fenc->i_width[!!plane]) / (1 << (BIT_DEPTH - 8));
    }

    int chroma_denom = 7;
    if( !b_lookahead )
    {
        /* make sure both our scale factors fit */
        while( chroma_denom > 0 )
        {
            float thresh = 127.f / (1<<chroma_denom);
            if( guess_scale[1] < thresh && guess_scale[2] < thresh )
                break;
            chroma_denom--;
        }
    }

    /* Don't check chroma in lookahead, or if there wasn't a luma weight. */
    for( int plane = 0; plane <= 2 && !( plane && ( !weights[0].weightfn || b_lookahead ) ); plane++ )
    {
        int minoff, minscale, mindenom;
        unsigned int minscore, origscore;
        int found;

        //early termination
        if( fabsf( ref_mean[plane] - fenc_mean[plane] ) < 0.5f && fabsf( 1.f - guess_scale[plane] ) < epsilon )
        {
            SET_WEIGHT( weights[plane], 0, 1, 0, 0 );
            continue;
        }

        if( plane )
        {
            weights[plane].i_denom = chroma_denom;
            weights[plane].i_scale = x264_clip3( round( guess_scale[plane] * (1<<chroma_denom) ), 0, 255 );
            if( weights[plane].i_scale > 127 )
            {
                /* Scale does not fit even at the chosen denom: drop both chroma weights. */
                weights[1].weightfn = weights[2].weightfn = NULL;
                break;
            }
        }
        else
            weight_get_h264( round( guess_scale[plane] * 128 ), 0, &weights[plane] );

        found = 0;
        mindenom = weights[plane].i_denom;
        minscale = weights[plane].i_scale;
        minoff = 0;

        pixel *mcbuf;
        if( !plane )
        {
            if( !fenc->b_intra_calculated )
            {
                x264_mb_analysis_t a;
                lowres_context_init( h, &a );
                slicetype_frame_cost( h, &a, &fenc, 0, 0, 0 );
            }
            mcbuf = weight_cost_init_luma( h, fenc, ref, h->mb.p_weight_buf[0] );
            origscore = minscore = weight_cost_luma( h, fenc, mcbuf, NULL );
        }
        else
        {
            if( CHROMA444 )
            {
                mcbuf = weight_cost_init_chroma444( h, fenc, ref, h->mb.p_weight_buf[0], plane );
                origscore = minscore = weight_cost_chroma444( h, fenc, mcbuf, NULL, plane );
            }
            else
            {
                pixel *dstu = h->mb.p_weight_buf[0];
                pixel *dstv = h->mb.p_weight_buf[0]+fenc->i_stride[1]*fenc->i_lines[1];
                /* Both chroma planes are prepared in one go, on first use. */
                if( !chroma_initted++ )
                    weight_cost_init_chroma( h, fenc, ref, dstu, dstv );
                mcbuf = plane == 1 ? dstu : dstv;
                origscore = minscore = weight_cost_chroma( h, fenc, mcbuf, NULL );
            }
        }

        if( !minscore )
            continue;

        /* Picked somewhat arbitrarily */
        static const uint8_t weight_check_distance[][2] =
        {
            {0,0},{0,0},{0,1},{0,1},
            {0,1},{0,1},{0,1},{1,1},
            {1,1},{2,1},{2,1},{4,2}
        };
        int scale_dist =  b_lookahead ? 0 : weight_check_distance[h->param.analyse.i_subpel_refine][0];
        int offset_dist = b_lookahead ? 0 : weight_check_distance[h->param.analyse.i_subpel_refine][1];

        int start_scale  = x264_clip3( minscale - scale_dist, 0, 127 );
        int end_scale    = x264_clip3( minscale + scale_dist, 0, 127 );
        for( int i_scale = start_scale; i_scale <= end_scale; i_scale++ )
        {
            int cur_scale = i_scale;
            int cur_offset = fenc_mean[plane] - ref_mean[plane] * cur_scale / (1 << mindenom) + 0.5f * b_lookahead;
            if( cur_offset < - 128 || cur_offset > 127 )
            {
                /* Rescale considering the constraints on cur_offset. We do it in this order
                 * because scale has a much wider range than offset (because of denom), so
                 * it should almost never need to be clamped. */
                cur_offset = x264_clip3( cur_offset, -128, 127 );
                /* Clamp in the float domain before converting to int: if ref's pixel
                 * sum is 0 then ref_mean is 0 and the quotient is infinite, and
                 * converting an out-of-range float to int is undefined behaviour.
                 * For in-range values this yields exactly the truncated-then-clipped
                 * result. */
                float f_scale = (1 << mindenom) * (fenc_mean[plane] - cur_offset) / ref_mean[plane] + 0.5f;
                cur_scale = f_scale >= 127.f ? 127 : f_scale > 0.f ? (int)f_scale : 0;
            }
            int start_offset = x264_clip3( cur_offset - offset_dist, -128, 127 );
            int end_offset   = x264_clip3( cur_offset + offset_dist, -128, 127 );
            for( int i_off = start_offset; i_off <= end_offset; i_off++ )
            {
                SET_WEIGHT( weights[plane], 1, cur_scale, mindenom, i_off );
                unsigned int s;
                if( plane )
                {
                    if( CHROMA444 )
                        s = weight_cost_chroma444( h, fenc, mcbuf, &weights[plane], plane );
                    else
                        s = weight_cost_chroma( h, fenc, mcbuf, &weights[plane] );
                }
                else
                    s = weight_cost_luma( h, fenc, mcbuf, &weights[plane] );
                COPY4_IF_LT( minscore, s, minscale, cur_scale, minoff, i_off, found, 1 );

                // Don't check any more offsets if the previous one had a lower cost than the current one
                if( minoff == start_offset && i_off != start_offset )
                    break;
            }
        }
        x264_emms();

        /* Use a smaller denominator if possible */
        if( !plane )
        {
            while( mindenom > 0 && !(minscale&1) )
            {
                mindenom--;
                minscale >>= 1;
            }
        }

        /* FIXME: More analysis can be done here on SAD vs. SATD termination. */
        /* 0.2% termination derived experimentally to avoid weird weights in frames that are mostly intra. */
        if( !found || (minscale == 1 << mindenom && minoff == 0) || (float)minscore / origscore > 0.998f )
        {
            SET_WEIGHT( weights[plane], 0, 1, 0, 0 );
            continue;
        }
        else
            SET_WEIGHT( weights[plane], 1, minscale, mindenom, minoff );

        if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_FAKE && weights[0].weightfn && !plane )
            fenc->f_weighted_cost_delta[i_delta_index] = (float)minscore / origscore;
    }

    /* Optimize and unify denominator */
    if( weights[1].weightfn || weights[2].weightfn )
    {
        int denom = weights[1].weightfn ? weights[1].i_denom : weights[2].i_denom;
        int both_weighted = weights[1].weightfn && weights[2].weightfn;
        /* If only one plane is weighted, the other has an implicit scale of 1<<denom.
         * With denom==7, this comes out to 128, which is invalid, so don't allow that. */
        while( (!both_weighted && denom==7) ||
               (denom > 0 && !(weights[1].weightfn && (weights[1].i_scale&1))
                         && !(weights[2].weightfn && (weights[2].i_scale&1))) )
        {
            denom--;
            for( int i = 1; i <= 2; i++ )
                if( weights[i].weightfn )
                {
                    weights[i].i_scale >>= 1;
                    weights[i].i_denom = denom;
                }
        }
    }
    for( int i = 1; i <= 2; i++ )
        if( weights[i].weightfn )
            h->mc.weight_cache( h, &weights[i] );

    if( weights[0].weightfn && b_lookahead )
    {
        //scale lowres in lookahead for slicetype_frame_cost
        pixel *src = ref->buffer_lowres[0];
        pixel *dst = h->mb.p_weight_buf[0];
        int width = ref->i_width_lowres + PADH*2;
        int height = ref->i_lines_lowres + PADV*2;
        x264_weight_scale_plane( h, dst, ref->i_stride_lowres, src, ref->i_stride_lowres,
                                 width, height, &weights[0] );
        /* Point past the padding so weighted[0] addresses the first visible sample. */
        fenc->weighted[0] = h->mb.p_weight_buf[0] + PADH + ref->i_stride_lowres * PADV;
    }
}

Fiona Glaser's avatar
Fiona Glaser committed
495
496
497
498
499
500
501
502
503
504
505
/* Output buffers are separated by 128 bytes to avoid false sharing of cachelines
 * in multithreaded lookahead.
 * NOTE(review): PAD_SIZE is presumably counted in ints (32 * 4 bytes = 128) — confirm
 * against the buffer allocation. */
#define PAD_SIZE 32
/* cost_est, cost_est_aq, intra_mbs, num rows */
#define NUM_INTS 4
/* Indices of the fixed leading slots of an output buffer. */
#define COST_EST 0
#define COST_EST_AQ 1
#define INTRA_MBS 2
#define NUM_ROWS 3
/* Index of the current macroblock row's slot: rows follow the NUM_INTS fixed
 * slots and are counted from the start of the thread's slice. */
#define ROW_SATD (NUM_INTS + (h->mb.i_mb_y - h->i_threadslice_start))

506
507
508
509
static void slicetype_mb_cost( x264_t *h, x264_mb_analysis_t *a,
                               x264_frame_t **frames, int p0, int p1, int b,
                               int dist_scale_factor, int do_search[2], const x264_weight_t *w,
                               int *output_inter, int *output_intra )
510
511
512
513
514
515
516
{
    x264_frame_t *fref0 = frames[p0];
    x264_frame_t *fref1 = frames[p1];
    x264_frame_t *fenc  = frames[b];
    const int b_bidir = (b < p1);
    const int i_mb_x = h->mb.i_mb_x;
    const int i_mb_y = h->mb.i_mb_y;
517
    const int i_mb_stride = h->mb.i_mb_width;
518
519
    const int i_mb_xy = i_mb_x + i_mb_y * i_mb_stride;
    const int i_stride = fenc->i_stride_lowres;
Loren Merritt's avatar
Loren Merritt committed
520
    const int i_pel_offset = 8 * (i_mb_x + i_mb_y * i_stride);
521
    const int i_bipred_weight = h->param.analyse.b_weighted_bipred ? 64 - (dist_scale_factor>>2) : 32;
522
523
    int16_t (*fenc_mvs[2])[2] = { &fenc->lowres_mvs[0][b-p0-1][i_mb_xy], &fenc->lowres_mvs[1][p1-b-1][i_mb_xy] };
    int (*fenc_costs[2]) = { &fenc->lowres_mv_costs[0][b-p0-1][i_mb_xy], &fenc->lowres_mv_costs[1][p1-b-1][i_mb_xy] };
524
525
526
    int b_frame_score_mb = (i_mb_x > 0 && i_mb_x < h->mb.i_mb_width - 1 &&
                            i_mb_y > 0 && i_mb_y < h->mb.i_mb_height - 1) ||
                            h->mb.i_mb_width <= 2 || h->mb.i_mb_height <= 2;
527

528
    ALIGNED_ARRAY_16( pixel, pix1,[9*FDEC_STRIDE] );
529
    pixel *pix2 = pix1+8;
530
    x264_me_t m[2];
Loren Merritt's avatar
Loren Merritt committed
531
    int i_bcost = COST_MAX;
Fiona Glaser's avatar
Fiona Glaser committed
532
    int list_used = 0;
533
534
    /* A small, arbitrary bias to avoid VBV problems caused by zero-residual lookahead blocks. */
    int lowres_penalty = 4;
535

536
537
538
    h->mb.pic.p_fenc[0] = h->mb.pic.fenc_buf;
    h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fenc[0], FENC_STRIDE, &fenc->lowres[0][i_pel_offset], i_stride, 8 );

Fiona Glaser's avatar
Fiona Glaser committed
539
    if( p0 == p1 )
Loren Merritt's avatar
Loren Merritt committed
540
541
        goto lowres_intra_mb;

542
    int mv_range = 2 * h->param.analyse.i_mv_range;
543
    // no need for h->mb.mv_min[]
544
545
546
547
    h->mb.mv_min_spel[0] = X264_MAX( 4*(-8*h->mb.i_mb_x - 12), -mv_range );
    h->mb.mv_max_spel[0] = X264_MIN( 4*(8*(h->mb.i_mb_width - h->mb.i_mb_x - 1) + 12), mv_range-1 );
    h->mb.mv_limit_fpel[0][0] = h->mb.mv_min_spel[0] >> 2;
    h->mb.mv_limit_fpel[1][0] = h->mb.mv_max_spel[0] >> 2;
548
    if( h->mb.i_mb_x >= h->mb.i_mb_width - 2 )
549
    {
550
551
552
553
        h->mb.mv_min_spel[1] = X264_MAX( 4*(-8*h->mb.i_mb_y - 12), -mv_range );
        h->mb.mv_max_spel[1] = X264_MIN( 4*(8*( h->mb.i_mb_height - h->mb.i_mb_y - 1) + 12), mv_range-1 );
        h->mb.mv_limit_fpel[0][1] = h->mb.mv_min_spel[1] >> 2;
        h->mb.mv_limit_fpel[1][1] = h->mb.mv_max_spel[1] >> 2;
554
    }
555

Loren Merritt's avatar
Loren Merritt committed
556
557
558
559
560
561
562
#define LOAD_HPELS_LUMA(dst, src) \
    { \
        (dst)[0] = &(src)[0][i_pel_offset]; \
        (dst)[1] = &(src)[1][i_pel_offset]; \
        (dst)[2] = &(src)[2][i_pel_offset]; \
        (dst)[3] = &(src)[3][i_pel_offset]; \
    }
Dylan Yudaken's avatar
Dylan Yudaken committed
563
564
565
#define LOAD_WPELS_LUMA(dst,src) \
    (dst) = &(src)[i_pel_offset];

Loren Merritt's avatar
Loren Merritt committed
566
567
568
569
570
#define CLIP_MV( mv ) \
    { \
        mv[0] = x264_clip3( mv[0], h->mb.mv_min_spel[0], h->mb.mv_max_spel[0] ); \
        mv[1] = x264_clip3( mv[1], h->mb.mv_min_spel[1], h->mb.mv_max_spel[1] ); \
    }
571
572
573
#define TRY_BIDIR( mv0, mv1, penalty ) \
    { \
        int i_cost; \
Fiona Glaser's avatar
Fiona Glaser committed
574
575
576
577
        if( h->param.analyse.i_subpel_refine <= 1 ) \
        { \
            int hpel_idx1 = (((mv0)[0]&2)>>1) + ((mv0)[1]&2); \
            int hpel_idx2 = (((mv1)[0]&2)>>1) + ((mv1)[1]&2); \
578
579
            pixel *src1 = m[0].p_fref[hpel_idx1] + ((mv0)[0]>>2) + ((mv0)[1]>>2) * m[0].i_stride[0]; \
            pixel *src2 = m[1].p_fref[hpel_idx2] + ((mv1)[0]>>2) + ((mv1)[1]>>2) * m[1].i_stride[0]; \
Fiona Glaser's avatar
Fiona Glaser committed
580
581
582
583
            h->mc.avg[PIXEL_8x8]( pix1, 16, src1, m[0].i_stride[0], src2, m[1].i_stride[0], i_bipred_weight ); \
        } \
        else \
        { \
584
            intptr_t stride1 = 16, stride2 = 16; \
585
            pixel *src1, *src2; \
Fiona Glaser's avatar
Fiona Glaser committed
586
587
588
589
590
591
            src1 = h->mc.get_ref( pix1, &stride1, m[0].p_fref, m[0].i_stride[0], \
                                  (mv0)[0], (mv0)[1], 8, 8, w ); \
            src2 = h->mc.get_ref( pix2, &stride2, m[1].p_fref, m[1].i_stride[0], \
                                  (mv1)[0], (mv1)[1], 8, 8, w ); \
            h->mc.avg[PIXEL_8x8]( pix1, 16, src1, stride1, src2, stride2, i_bipred_weight ); \
        } \
592
        i_cost = penalty * a->i_lambda + h->pixf.mbcmp[PIXEL_8x8]( \
593
                           m[0].p_fenc[0], FENC_STRIDE, pix1, 16 ); \
Fiona Glaser's avatar
Fiona Glaser committed
594
        COPY2_IF_LT( i_bcost, i_cost, list_used, 3 ); \
595
596
597
598
    }

    m[0].i_pixel = PIXEL_8x8;
    m[0].p_cost_mv = a->p_cost_mv;
Loren Merritt's avatar
Loren Merritt committed
599
    m[0].i_stride[0] = i_stride;
600
    m[0].p_fenc[0] = h->mb.pic.p_fenc[0];
Dylan Yudaken's avatar
Dylan Yudaken committed
601
    m[0].weight = w;
Fiona Glaser's avatar
Fiona Glaser committed
602
    m[0].i_ref = 0;
Loren Merritt's avatar
Loren Merritt committed
603
    LOAD_HPELS_LUMA( m[0].p_fref, fref0->lowres );
Dylan Yudaken's avatar
Dylan Yudaken committed
604
605
606
    m[0].p_fref_w = m[0].p_fref[0];
    if( w[0].weightfn )
        LOAD_WPELS_LUMA( m[0].p_fref_w, fenc->weighted[0] );
607
608
609

    if( b_bidir )
    {
610
        ALIGNED_ARRAY_8( int16_t, dmv,[2],[2] );
611

Fiona Glaser's avatar
Fiona Glaser committed
612
613
614
615
616
        m[1].i_pixel = PIXEL_8x8;
        m[1].p_cost_mv = a->p_cost_mv;
        m[1].i_stride[0] = i_stride;
        m[1].p_fenc[0] = h->mb.pic.p_fenc[0];
        m[1].i_ref = 0;
Anton Mitrofanov's avatar
Anton Mitrofanov committed
617
        m[1].weight = x264_weight_none;
Loren Merritt's avatar
Loren Merritt committed
618
        LOAD_HPELS_LUMA( m[1].p_fref, fref1->lowres );
Dylan Yudaken's avatar
Dylan Yudaken committed
619
        m[1].p_fref_w = m[1].p_fref[0];
620

621
622
623
624
625
626
627
628
629
630
631
632
633
634
        if( fref1->lowres_mvs[0][p1-p0-1][0][0] != 0x7FFF )
        {
            int16_t *mvr = fref1->lowres_mvs[0][p1-p0-1][i_mb_xy];
            dmv[0][0] = ( mvr[0] * dist_scale_factor + 128 ) >> 8;
            dmv[0][1] = ( mvr[1] * dist_scale_factor + 128 ) >> 8;
            dmv[1][0] = dmv[0][0] - mvr[0];
            dmv[1][1] = dmv[0][1] - mvr[1];
            CLIP_MV( dmv[0] );
            CLIP_MV( dmv[1] );
            if( h->param.analyse.i_subpel_refine <= 1 )
                M64( dmv ) &= ~0x0001000100010001ULL; /* mv & ~1 */
        }
        else
            M64( dmv ) = 0;
635
636

        TRY_BIDIR( dmv[0], dmv[1], 0 );
Fiona Glaser's avatar
Fiona Glaser committed
637
        if( M64( dmv ) )
638
639
640
641
        {
            int i_cost;
            h->mc.avg[PIXEL_8x8]( pix1, 16, m[0].p_fref[0], m[0].i_stride[0], m[1].p_fref[0], m[1].i_stride[0], i_bipred_weight );
            i_cost = h->pixf.mbcmp[PIXEL_8x8]( m[0].p_fenc[0], FENC_STRIDE, pix1, 16 );
Fiona Glaser's avatar
Fiona Glaser committed
642
            COPY2_IF_LT( i_bcost, i_cost, list_used, 3 );
643
        }
644
645
    }

646
    for( int l = 0; l < 1 + b_bidir; l++ )
647
    {
648
        if( do_search[l] )
649
        {
650
651
            int i_mvc = 0;
            int16_t (*fenc_mv)[2] = fenc_mvs[l];
652
            ALIGNED_4( int16_t mvc[4][2] );
653

654
            /* Reverse-order MV prediction. */
Fiona Glaser's avatar
Fiona Glaser committed
655
656
657
            M32( mvc[0] ) = 0;
            M32( mvc[2] ) = 0;
#define MVC(mv) { CP32( mvc[i_mvc], mv ); i_mvc++; }
658
            if( i_mb_x < h->mb.i_mb_width - 1 )
659
                MVC( fenc_mv[1] );
Fiona Glaser's avatar
Fiona Glaser committed
660
            if( i_mb_y < h->i_threadslice_end - 1 )
661
            {
662
                MVC( fenc_mv[i_mb_stride] );
663
                if( i_mb_x > 0 )
664
                    MVC( fenc_mv[i_mb_stride-1] );
665
                if( i_mb_x < h->mb.i_mb_width - 1 )
666
                    MVC( fenc_mv[i_mb_stride+1] );
667
            }
Loren Merritt's avatar
Loren Merritt committed
668
#undef MVC
669
670
671
672
            if( i_mvc <= 1 )
                CP32( m[l].mvp, mvc[0] );
            else
                x264_median_mv( m[l].mvp, mvc[0], mvc[1], mvc[2] );
673

674
675
676
677
678
679
680
681
682
683
684
685
686
            /* Fast skip for cases of near-zero residual.  Shortcut: don't bother except in the mv0 case,
             * since anything else is likely to have enough residual to not trigger the skip. */
            if( !M32( m[l].mvp ) )
            {
                m[l].cost = h->pixf.mbcmp[PIXEL_8x8]( m[l].p_fenc[0], FENC_STRIDE, m[l].p_fref[0], m[l].i_stride[0] );
                if( m[l].cost < 64 )
                {
                    M32( m[l].mv ) = 0;
                    goto skip_motionest;
                }
            }

            x264_me_search( h, &m[l], mvc, i_mvc );
687
            m[l].cost -= a->p_cost_mv[0]; // remove mvcost from skip mbs
Fiona Glaser's avatar
Fiona Glaser committed
688
            if( M32( m[l].mv ) )
689
                m[l].cost += 5 * a->i_lambda;
690
691

skip_motionest:
Fiona Glaser's avatar
Fiona Glaser committed
692
            CP32( fenc_mvs[l], m[l].mv );
693
694
695
696
            *fenc_costs[l] = m[l].cost;
        }
        else
        {
Fiona Glaser's avatar
Fiona Glaser committed
697
            CP32( m[l].mv, fenc_mvs[l] );
698
699
            m[l].cost = *fenc_costs[l];
        }
Fiona Glaser's avatar
Fiona Glaser committed
700
        COPY2_IF_LT( i_bcost, m[l].cost, list_used, l+1 );
701
702
    }

Fiona Glaser's avatar
Fiona Glaser committed
703
    if( b_bidir && ( M32( m[0].mv ) || M32( m[1].mv ) ) )
704
705
        TRY_BIDIR( m[0].mv, m[1].mv, 5 );

Loren Merritt's avatar
Loren Merritt committed
706
lowres_intra_mb:
Fiona Glaser's avatar
Fiona Glaser committed
707
    if( !fenc->b_intra_calculated )
708
    {
709
        ALIGNED_ARRAY_16( pixel, edge,[36] );
710
711
        pixel *pix = &pix1[8+FDEC_STRIDE];
        pixel *src = &fenc->lowres[0][i_pel_offset];
712
        const int intra_penalty = 5 * a->i_lambda;
Fiona Glaser's avatar
Fiona Glaser committed
713
        int satds[3];
714
        int pixoff = 4 / sizeof(pixel);
Fiona Glaser's avatar
Fiona Glaser committed
715

716
717
718
719
        /* Avoid store forwarding stalls by writing larger chunks */
        memcpy( pix-FDEC_STRIDE, src-i_stride, 16 * sizeof(pixel) );
        for( int i = -1; i < 8; i++ )
            M32( &pix[i*FDEC_STRIDE-pixoff] ) = M32( &src[i*i_stride-pixoff] );
Fiona Glaser's avatar
Fiona Glaser committed
720

Fiona Glaser's avatar
Fiona Glaser committed
721
        h->pixf.intra_mbcmp_x3_8x8c( h->mb.pic.p_fenc[0], pix, satds );
Fiona Glaser's avatar
Fiona Glaser committed
722
        int i_icost = X264_MIN3( satds[0], satds[1], satds[2] );
Loren Merritt's avatar
Loren Merritt committed
723

Fiona Glaser's avatar
Fiona Glaser committed
724
725
726
        if( h->param.analyse.i_subpel_refine > 1 )
        {
            h->predict_8x8c[I_PRED_CHROMA_P]( pix );
727
            int satd = h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fenc[0], FENC_STRIDE, pix, FDEC_STRIDE );
Fiona Glaser's avatar
Fiona Glaser committed
728
729
730
            i_icost = X264_MIN( i_icost, satd );
            h->predict_8x8_filter( pix, edge, ALL_NEIGHBORS, ALL_NEIGHBORS );
            for( int i = 3; i < 9; i++ )
731
            {
Fiona Glaser's avatar
Fiona Glaser committed
732
                h->predict_8x8[i]( pix, edge );
733
                satd = h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fenc[0], FENC_STRIDE, pix, FDEC_STRIDE );
Loren Merritt's avatar
Loren Merritt committed
734
                i_icost = X264_MIN( i_icost, satd );
735
            }
Fiona Glaser's avatar
Fiona Glaser committed
736
        }
Loren Merritt's avatar
Loren Merritt committed
737

738
        i_icost = ((i_icost + intra_penalty) >> (BIT_DEPTH - 8)) + lowres_penalty;
Fiona Glaser's avatar
Fiona Glaser committed
739
        fenc->i_intra_cost[i_mb_xy] = i_icost;
740
741
742
        int i_icost_aq = i_icost;
        if( h->param.rc.i_aq_mode )
            i_icost_aq = (i_icost_aq * fenc->i_inv_qscale_factor[i_mb_xy] + 128) >> 8;
Fiona Glaser's avatar
Fiona Glaser committed
743
        output_intra[ROW_SATD] += i_icost_aq;
Fiona Glaser's avatar
Fiona Glaser committed
744
745
        if( b_frame_score_mb )
        {
Fiona Glaser's avatar
Fiona Glaser committed
746
747
            output_intra[COST_EST] += i_icost;
            output_intra[COST_EST_AQ] += i_icost_aq;
748
        }
Fiona Glaser's avatar
Fiona Glaser committed
749
    }
750
    i_bcost = (i_bcost >> (BIT_DEPTH - 8)) + lowres_penalty;
Fiona Glaser's avatar
Fiona Glaser committed
751
752
753
754
755
756
757
758

    /* forbid intra-mbs in B-frames, because it's rare and not worth checking */
    /* FIXME: Should we still forbid them now that we cache intra scores? */
    if( !b_bidir )
    {
        int i_icost = fenc->i_intra_cost[i_mb_xy];
        int b_intra = i_icost < i_bcost;
        if( b_intra )
759
        {
Fiona Glaser's avatar
Fiona Glaser committed
760
            i_bcost = i_icost;
761
762
            list_used = 0;
        }
Fiona Glaser's avatar
Fiona Glaser committed
763
        if( b_frame_score_mb )
Fiona Glaser's avatar
Fiona Glaser committed
764
            output_inter[INTRA_MBS] += b_intra;
Fiona Glaser's avatar
Fiona Glaser committed
765
766
767
768
769
770
771
    }

    /* In an I-frame, we've already added the results above in the intra section. */
    if( p0 != p1 )
    {
        int i_bcost_aq = i_bcost;
        if( h->param.rc.i_aq_mode )
772
            i_bcost_aq = (i_bcost_aq * fenc->i_inv_qscale_factor[i_mb_xy] + 128) >> 8;
Fiona Glaser's avatar
Fiona Glaser committed
773
        output_inter[ROW_SATD] += i_bcost_aq;
Fiona Glaser's avatar
Fiona Glaser committed
774
        if( b_frame_score_mb )
Loren Merritt's avatar
Loren Merritt committed
775
        {
Fiona Glaser's avatar
Fiona Glaser committed
776
            /* Don't use AQ-weighted costs for slicetype decision, only for ratecontrol. */
Fiona Glaser's avatar
Fiona Glaser committed
777
778
            output_inter[COST_EST] += i_bcost;
            output_inter[COST_EST_AQ] += i_bcost_aq;
779
780
781
        }
    }

782
    fenc->lowres_costs[b-p0][p1-b][i_mb_xy] = X264_MIN( i_bcost, LOWRES_COST_MASK ) + (list_used << LOWRES_COST_SHIFT);
783
784
785
}
#undef TRY_BIDIR

786
/* Macroblock count derived from the frame dimensions of the `h` in scope at
 * the point of use: when the frame is more than 2 macroblocks wide AND more
 * than 2 high, the one-macroblock border is excluded and only the interior
 * (width-2)*(height-2) is counted; otherwise every macroblock is counted. */
#define NUM_MBS\
   (h->mb.i_mb_width > 2 && h->mb.i_mb_height > 2 ?\
   (h->mb.i_mb_width - 2) * (h->mb.i_mb_height - 2) :\
    h->mb.i_mb_width * h->mb.i_mb_height)
790

Fiona Glaser's avatar
Fiona Glaser committed
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
/* Argument bundle for slicetype_slice_cost(): everything one slice of the
 * lowres analysis needs, packed behind a single pointer so the job can be
 * run directly or handed to the lookahead thread pool.
 * NOTE: field order is significant -- the struct is filled with positional
 * (non-designated) initializers. */
typedef struct
{
    x264_t *h;                 /* context the slice runs on (main or per-lookahead-thread) */
    x264_mb_analysis_t *a;     /* analysis state forwarded to slicetype_mb_cost() */
    x264_frame_t **frames;     /* frame array indexed by p0, p1 and b */
    int p0, p1, b;             /* frame indices; b is the frame being costed,
                                * p0/p1 presumably its past/future references -- TODO confirm */
    int dist_scale_factor;     /* 8.8 fixed-point (b-p0)/(p1-p0) ratio used to scale MVs; 128 when p0 == p1 */
    int *do_search;            /* two flags, one per list: nonzero if that list still needs a lowres motion search */
    const x264_weight_t *w;    /* weights attached to the list-0 reference */
    int *output_inter, *output_intra; /* per-slice accumulators for inter / intra cost totals */
} x264_slicetype_slice_t;

806
/* Run slicetype_mb_cost() over every macroblock of one lookahead slice.
 *
 * s: job description; s->h supplies the frame dimensions and the
 *    [i_threadslice_start, i_threadslice_end) row range this call covers,
 *    the remaining fields are forwarded unchanged to slicetype_mb_cost().
 *
 * Rows are visited bottom-to-top and, within a row, macroblocks right-to-left.
 * h->mb.i_mb_x / h->mb.i_mb_y are used as the loop counters, so they are
 * clobbered on return. */
static void slicetype_slice_cost( x264_slicetype_slice_t *s )
{
    x264_t *h = s->h;

    /* Lowres lookahead goes backwards because the MVs are used as predictors in the main encode.
     * This considerably improves MV prediction overall. */

    /* The edge mbs seem to reduce the predictive quality of the
     * whole frame's score, but are needed for a spatial distribution. */
    int do_edges = h->param.rc.b_mb_tree || h->param.rc.i_vbv_buffer_size || h->mb.i_mb_width <= 2 || h->mb.i_mb_height <= 2;

    /* With do_edges == 0 the outermost ring of macroblocks is skipped;
     * the row range is additionally clamped to this slice's rows. */
    int start_y = X264_MIN( h->i_threadslice_end - 1, h->mb.i_mb_height - 2 + do_edges );
    int end_y = X264_MAX( h->i_threadslice_start, 1 - do_edges );
    int start_x = h->mb.i_mb_width - 2 + do_edges;
    int end_x = 1 - do_edges;

    for( h->mb.i_mb_y = start_y; h->mb.i_mb_y >= end_y; h->mb.i_mb_y-- )
    {
        for( h->mb.i_mb_x = start_x; h->mb.i_mb_x >= end_x; h->mb.i_mb_x-- )
        {
            slicetype_mb_cost( h, s->a, s->frames, s->p0, s->p1, s->b, s->dist_scale_factor,
                               s->do_search, s->w, s->output_inter, s->output_intra );
        }
    }
}

828
829
static int slicetype_frame_cost( x264_t *h, x264_mb_analysis_t *a,
                                 x264_frame_t **frames, int p0, int p1, int b )
830
831
{
    int i_score = 0;
832
    int do_search[2];
Anton Mitrofanov's avatar
Anton Mitrofanov committed
833
    const x264_weight_t *w = x264_weight_none;
Fiona Glaser's avatar
Fiona Glaser committed
834
835
    x264_frame_t *fenc = frames[b];

836
837
838
    /* Check whether we already evaluated this frame
     * If we have tried this frame as P, then we have also tried
     * the preceding frames as B. (is this still true?) */
839
    /* Also check that we already calculated the row SATDs for the current frame. */
Fiona Glaser's avatar
Fiona Glaser committed
840
841
    if( fenc->i_cost_est[b-p0][p1-b] >= 0 && (!h->param.rc.i_vbv_buffer_size || fenc->i_row_satds[b-p0][p1-b][0] != -1) )
        i_score = fenc->i_cost_est[b-p0][p1-b];
Loren Merritt's avatar
Loren Merritt committed
842
843
844
845
    else
    {
        int dist_scale_factor = 128;

846
        /* For each list, check to see whether we have lowres motion-searched this reference frame before. */
Fiona Glaser's avatar
Fiona Glaser committed
847
848
        do_search[0] = b != p0 && fenc->lowres_mvs[0][b-p0-1][0][0] == 0x7FFF;
        do_search[1] = b != p1 && fenc->lowres_mvs[1][p1-b-1][0][0] == 0x7FFF;
Dylan Yudaken's avatar
Dylan Yudaken committed
849
850
        if( do_search[0] )
        {
851
            if( h->param.analyse.i_weighted_pred && b == p1 )
Dylan Yudaken's avatar
Dylan Yudaken committed
852
            {
Fiona Glaser's avatar
Fiona Glaser committed
853
                x264_emms();
Fiona Glaser's avatar
Fiona Glaser committed
854
855
                x264_weights_analyse( h, fenc, frames[p0], 1 );
                w = fenc->weight[0];
Dylan Yudaken's avatar
Dylan Yudaken committed
856
            }
Fiona Glaser's avatar
Fiona Glaser committed
857
            fenc->lowres_mvs[0][b-p0-1][0][0] = 0;
Dylan Yudaken's avatar
Dylan Yudaken committed
858
        }
Fiona Glaser's avatar
Fiona Glaser committed
859
        if( do_search[1] ) fenc->lowres_mvs[1][p1-b-1][0][0] = 0;
Loren Merritt's avatar
Loren Merritt committed
860
861
862
863

        if( p1 != p0 )
            dist_scale_factor = ( ((b-p0) << 8) + ((p1-p0) >> 1) ) / (p1-p0);

Fiona Glaser's avatar
Fiona Glaser committed
864
865
866
867
868
        int output_buf_size = h->mb.i_mb_height + (NUM_INTS + PAD_SIZE) * h->param.i_lookahead_threads;
        int *output_inter[X264_LOOKAHEAD_THREAD_MAX+1];
        int *output_intra[X264_LOOKAHEAD_THREAD_MAX+1];
        output_inter[0] = h->scratch_buffer2;
        output_intra[0] = output_inter[0] + output_buf_size;
869

Steve Borho's avatar
Steve Borho committed
870
871
#if HAVE_OPENCL
        if( h->param.b_opencl )
Loren Merritt's avatar
Loren Merritt committed
872
        {
Steve Borho's avatar
Steve Borho committed
873
874
875
876
877
878
879
880
881
882
883
884
885
886
            x264_opencl_lowres_init(h, fenc, a->i_lambda );
            if( do_search[0] )
            {
                x264_opencl_lowres_init( h, frames[p0], a->i_lambda );
                x264_opencl_motionsearch( h, frames, b, p0, 0, a->i_lambda, w );
            }
            if( do_search[1] )
            {
                x264_opencl_lowres_init( h, frames[p1], a->i_lambda );
                x264_opencl_motionsearch( h, frames, b, p1, 1, a->i_lambda, NULL );
            }
            if( b != p0 )
                x264_opencl_finalize_cost( h, a->i_lambda, frames, p0, p1, b, dist_scale_factor );
            x264_opencl_flush( h );
Fiona Glaser's avatar
Fiona Glaser committed
887

Steve Borho's avatar
Steve Borho committed
888
889
890
891
892
893
            i_score = fenc->i_cost_est[b-p0][p1-b];
        }
        else
#endif
        {
            if( h->param.i_lookahead_threads > 1 )
894
            {
Steve Borho's avatar
Steve Borho committed
895
                x264_slicetype_slice_t s[X264_LOOKAHEAD_THREAD_MAX];
Fiona Glaser's avatar
Fiona Glaser committed
896

Steve Borho's avatar
Steve Borho committed
897
898
899
                for( int i = 0; i < h->param.i_lookahead_threads; i++ )
                {
                    x264_t *t = h->lookahead_thread[i];
Fiona Glaser's avatar
Fiona Glaser committed
900

Steve Borho's avatar
Steve Borho committed
901
902
903
904
                    /* FIXME move this somewhere else */
                    t->mb.i_me_method = h->mb.i_me_method;
                    t->mb.i_subpel_refine = h->mb.i_subpel_refine;
                    t->mb.b_chroma_me = h->mb.b_chroma_me;
Fiona Glaser's avatar
Fiona Glaser committed
905

Steve Borho's avatar
Steve Borho committed
906
907
                    s[i] = (x264_slicetype_slice_t){ t, a, frames, p0, p1, b, dist_scale_factor, do_search, w,
                        output_inter[i], output_intra[i] };
Fiona Glaser's avatar
Fiona Glaser committed
908

Steve Borho's avatar
Steve Borho committed
909
910
                    t->i_threadslice_start = ((h->mb.i_mb_height *  i    + h->param.i_lookahead_threads/2) / h->param.i_lookahead_threads);
                    t->i_threadslice_end   = ((h->mb.i_mb_height * (i+1) + h->param.i_lookahead_threads/2) / h->param.i_lookahead_threads);
Fiona Glaser's avatar
Fiona Glaser committed
911

Steve Borho's avatar
Steve Borho committed
912
913
914
915
916
                    int thread_height = t->i_threadslice_end - t->i_threadslice_start;
                    int thread_output_size = thread_height + NUM_INTS;
                    memset( output_inter[i], 0, thread_output_size * sizeof(int) );
                    memset( output_intra[i], 0, thread_output_size * sizeof(int) );
                    output_inter[i][NUM_ROWS] = output_intra[i][NUM_ROWS] = thread_height;
Fiona Glaser's avatar
Fiona Glaser committed
917

Steve Borho's avatar
Steve Borho committed
918
919
                    output_inter[i+1] = output_inter[i] + thread_output_size + PAD_SIZE;
                    output_intra[i+1] = output_intra[i] + thread_output_size + PAD_SIZE;
Fiona Glaser's avatar
Fiona Glaser committed
920

921
                    x264_threadpool_run( h->lookaheadpool, (void*)slicetype_slice_cost, &s[i] );
Steve Borho's avatar
Steve Borho committed
922
923
924
925
926
927
928
929
930
931
932
933
934
                }
                for( int i = 0; i < h->param.i_lookahead_threads; i++ )
                    x264_threadpool_wait( h->lookaheadpool, &s[i] );
            }
            else
            {
                h->i_threadslice_start = 0;
                h->i_threadslice_end = h->mb.i_mb_height;
                memset( output_inter[0], 0, (output_buf_size - PAD_SIZE) * sizeof(int) );
                memset( output_intra[0], 0, (output_buf_size - PAD_SIZE) * sizeof(int) );
                output_inter[0][NUM_ROWS] = output_intra[0][NUM_ROWS] = h->mb.i_mb_height;
                x264_slicetype_slice_t s = (x264_slicetype_slice_t){ h, a, frames, p0, p1, b, dist_scale_factor, do_search, w,
                    output_inter[0], output_intra[0] };
935
                slicetype_slice_cost( &s );