/*****************************************************************************
 * macroblock.c: h264 encoder library
 *****************************************************************************
 * Copyright (C) 2003-2008 x264 project
 *
 * Authors: Laurent Aimar <fenrir@via.ecp.fr>
 *          Loren Merritt <lorenm@u.washington.edu>
 *          Fiona Glaser <fiona@x264.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *****************************************************************************/

#include "common/common.h"
#include "macroblock.h"

/* These chroma DC functions don't have assembly versions and are only used here. */

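/* Scan the four chroma DC coefficients in raster order. Note the transposed
 * indexing (dct[x*2+y]): the 2x2 DC transform stores its output column-major. */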
#define ZIG(i,y,x) level[i] = dct[x*2+y];
static inline void zigzag_scan_2x2_dc( int16_t level[4], int16_t dct[4] )
{
    ZIG(0,0,0)
    ZIG(1,0,1)
    ZIG(2,1,0)
    ZIG(3,1,1)
}
#undef ZIG

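/* Shared first butterfly stage of the 2x2 inverse Hadamard, plus the H.264
 * dequant scale for this QP (dequant_mf[qp%6] << qp/6). */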
#define IDCT_DEQUANT_START \
    int d0 = dct[0] + dct[1]; \
    int d1 = dct[2] + dct[3]; \
    int d2 = dct[0] - dct[1]; \
    int d3 = dct[2] - dct[3]; \
    int dmf = dequant_mf[i_qp%6][0] << i_qp/6;

static inline void idct_dequant_2x2_dc( int16_t dct[4], int16_t dct4x4[4][16], int dequant_mf[6][16], int i_qp )
{
    IDCT_DEQUANT_START
    dct4x4[0][0] = (d0 + d1) * dmf >> 5;
    dct4x4[1][0] = (d0 - d1) * dmf >> 5;
    dct4x4[2][0] = (d2 + d3) * dmf >> 5;
    dct4x4[3][0] = (d2 - d3) * dmf >> 5;
}

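/* Same inverse transform, but written to a separate output so the caller can
 * test a candidate dct without clobbering it. */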
static inline void idct_dequant_2x2_dconly( int16_t out[4], int16_t dct[4], int dequant_mf[6][16], int i_qp )
{
    IDCT_DEQUANT_START
    out[0] = (d0 + d1) * dmf >> 5;
    out[1] = (d0 - d1) * dmf >> 5;
    out[2] = (d2 + d3) * dmf >> 5;
    out[3] = (d2 - d3) * dmf >> 5;
}

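/* Forward 2x2 Hadamard over the DC coefficients of the four 4x4 blocks; the
 * DCs are then zeroed in the 4x4 blocks since they're coded separately. */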
static inline void dct2x2dc( int16_t d[4], int16_t dct4x4[4][16] )
{
    int d0 = dct4x4[0][0] + dct4x4[1][0];
    int d1 = dct4x4[2][0] + dct4x4[3][0];
    int d2 = dct4x4[0][0] - dct4x4[1][0];
    int d3 = dct4x4[2][0] - dct4x4[3][0];
    d[0] = d0 + d1;
    d[2] = d2 + d3;
    d[1] = d0 - d1;
    d[3] = d2 - d3;
    dct4x4[0][0] = 0;
    dct4x4[1][0] = 0;
    dct4x4[2][0] = 0;
    dct4x4[3][0] = 0;
}

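/* Quantization dispatch: trellis (RD-optimized) quantization when enabled,
 * otherwise plain deadzone quantization. */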
static ALWAYS_INLINE int x264_quant_4x4( x264_t *h, int16_t dct[16], int i_qp, int i_ctxBlockCat, int b_intra, int idx )
{
    int i_quant_cat = b_intra ? CQM_4IY : CQM_4PY;
    if( h->mb.b_trellis )
        return x264_quant_4x4_trellis( h, dct, i_quant_cat, i_qp, i_ctxBlockCat, b_intra, 0, idx );
    else
        return h->quantf.quant_4x4( dct, h->quant4_mf[i_quant_cat][i_qp], h->quant4_bias[i_quant_cat][i_qp] );
}

static ALWAYS_INLINE int x264_quant_8x8( x264_t *h, int16_t dct[64], int i_qp, int b_intra, int idx )
{
    int i_quant_cat = b_intra ? CQM_8IY : CQM_8PY;
    if( h->mb.b_trellis )
        return x264_quant_8x8_trellis( h, dct, i_quant_cat, i_qp, b_intra, idx );
    else
        return h->quantf.quant_8x8( dct, h->quant8_mf[i_quant_cat][i_qp], h->quant8_bias[i_quant_cat][i_qp] );
}

/* All encoding functions must output the correct CBP and NNZ values.
 * The entropy coding functions will check CBP first, then NNZ, before
 * actually reading the DCT coefficients.  NNZ still must be correct even
 * if CBP is zero because of the use of NNZ values for context selection.
 * "NNZ" need only be 0 or 1 rather than the exact coefficient count because
 * that is only needed in CAVLC, and will be calculated by CAVLC's residual
 * coding and stored as necessary. */

/* This means that decimation can be done merely by adjusting the CBP and NNZ
 * rather than memsetting the coefficients. */

void x264_mb_encode_i4x4( x264_t *h, int idx, int i_qp )
{
    int nz;
    uint8_t *p_src = &h->mb.pic.p_fenc[0][block_idx_xy_fenc[idx]];
    uint8_t *p_dst = &h->mb.pic.p_fdec[0][block_idx_xy_fdec[idx]];
    ALIGNED_ARRAY_16( int16_t, dct4x4,[16] );

    if( h->mb.b_lossless )
    {
        nz = h->zigzagf.sub_4x4( h->dct.luma4x4[idx], p_src, p_dst );
        h->mb.cache.non_zero_count[x264_scan8[idx]] = nz;
        h->mb.i_cbp_luma |= nz<<(idx>>2);
        return;
    }

    h->dctf.sub4x4_dct( dct4x4, p_src, p_dst );

    nz = x264_quant_4x4( h, dct4x4, i_qp, DCT_LUMA_4x4, 1, idx );
    h->mb.cache.non_zero_count[x264_scan8[idx]] = nz;
    if( nz )
    {
        h->mb.i_cbp_luma |= 1<<(idx>>2);
        h->zigzagf.scan_4x4( h->dct.luma4x4[idx], dct4x4 );
        h->quantf.dequant_4x4( dct4x4, h->dequant4_mf[CQM_4IY], i_qp );
        h->dctf.add4x4_idct( p_dst, dct4x4 );
    }
}

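/* Broadcast one 8x8 block's NNZ flag to its four 4x4 entries with two 16-bit
 * stores; likewise CLEAR_16x16_NNZ zeroes all 16 luma entries with four 32-bit
 * stores. The entries sit as horizontal pairs in the scan8-ordered cache. */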
#define STORE_8x8_NNZ(idx,nz)\
{\
    M16( &h->mb.cache.non_zero_count[x264_scan8[idx*4+0]] ) = nz * 0x0101;\
    M16( &h->mb.cache.non_zero_count[x264_scan8[idx*4+2]] ) = nz * 0x0101;\
}

#define CLEAR_16x16_NNZ \
{\
    M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] ) = 0;\
    M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] ) = 0;\
    M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] ) = 0;\
    M32( &h->mb.cache.non_zero_count[x264_scan8[10]] ) = 0;\
}

void x264_mb_encode_i8x8( x264_t *h, int idx, int i_qp )
{
    int x = 8 * (idx&1);
    int y = 8 * (idx>>1);
    int nz;
    uint8_t *p_src = &h->mb.pic.p_fenc[0][x+y*FENC_STRIDE];
    uint8_t *p_dst = &h->mb.pic.p_fdec[0][x+y*FDEC_STRIDE];
    ALIGNED_ARRAY_16( int16_t, dct8x8,[64] );

    if( h->mb.b_lossless )
    {
        nz = h->zigzagf.sub_8x8( h->dct.luma8x8[idx], p_src, p_dst );
        STORE_8x8_NNZ(idx,nz);
        h->mb.i_cbp_luma |= nz<<idx;
        return;
    }

    h->dctf.sub8x8_dct8( dct8x8, p_src, p_dst );

    nz = x264_quant_8x8( h, dct8x8, i_qp, 1, idx );
    if( nz )
    {
        h->mb.i_cbp_luma |= 1<<idx;
        h->zigzagf.scan_8x8( h->dct.luma8x8[idx], dct8x8 );
        h->quantf.dequant_8x8( dct8x8, h->dequant8_mf[CQM_8IY], i_qp );
        h->dctf.add8x8_idct8( p_dst, dct8x8 );
        STORE_8x8_NNZ(idx,1);
    }
    else
        STORE_8x8_NNZ(idx,0);
}

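/* In i16x16, the DC of each 4x4 block is pulled out, Hadamard-transformed as a
 * separate 4x4 block, and coded independently of the AC coefficients. */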
static void x264_mb_encode_i16x16( x264_t *h, int i_qp )
{
    uint8_t  *p_src = h->mb.pic.p_fenc[0];
    uint8_t  *p_dst = h->mb.pic.p_fdec[0];

    ALIGNED_ARRAY_16( int16_t, dct4x4,[16],[16] );
    ALIGNED_ARRAY_16( int16_t, dct_dc4x4,[16] );

    int nz;
    int decimate_score = h->mb.b_dct_decimate ? 0 : 9;

    if( h->mb.b_lossless )
    {
        for( int i = 0; i < 16; i++ )
        {
            int oe = block_idx_xy_fenc[i];
            int od = block_idx_xy_fdec[i];
            nz = h->zigzagf.sub_4x4ac( h->dct.luma4x4[i], p_src+oe, p_dst+od, &dct_dc4x4[block_idx_yx_1d[i]] );
            h->mb.cache.non_zero_count[x264_scan8[i]] = nz;
            h->mb.i_cbp_luma |= nz;
        }
        h->mb.i_cbp_luma *= 0xf;
        h->mb.cache.non_zero_count[x264_scan8[24]] = array_non_zero( dct_dc4x4 );
        h->zigzagf.scan_4x4( h->dct.luma16x16_dc, dct_dc4x4 );
        return;
    }

    h->dctf.sub16x16_dct( dct4x4, p_src, p_dst );

    for( int i = 0; i < 16; i++ )
    {
        /* copy dc coeff */
        dct_dc4x4[block_idx_xy_1d[i]] = dct4x4[i][0];
        dct4x4[i][0] = 0;

        /* quant/scan/dequant */
        nz = x264_quant_4x4( h, dct4x4[i], i_qp, DCT_LUMA_AC, 1, i );
        h->mb.cache.non_zero_count[x264_scan8[i]] = nz;
        if( nz )
        {
            h->zigzagf.scan_4x4( h->dct.luma4x4[i], dct4x4[i] );
            h->quantf.dequant_4x4( dct4x4[i], h->dequant4_mf[CQM_4IY], i_qp );
            if( decimate_score < 6 ) decimate_score += h->quantf.decimate_score15( h->dct.luma4x4[i] );
            h->mb.i_cbp_luma = 0xf;
        }
    }

    /* Writing the 16 CBFs in an i16x16 block is quite costly, so decimation can save many bits. */
    /* More useful with CAVLC, but still useful with CABAC. */
    if( decimate_score < 6 )
    {
        h->mb.i_cbp_luma = 0;
        CLEAR_16x16_NNZ
    }

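    /* Second-stage 4x4 Hadamard over the 16 DC coefficients. The DC path is
     * scaled differently from the AC path, hence the adjusted multiplier (>>1)
     * and deadzone (<<1) handed to the quantizer. */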
    h->dctf.dct4x4dc( dct_dc4x4 );
    if( h->mb.b_trellis )
        nz = x264_quant_dc_trellis( h, dct_dc4x4, CQM_4IY, i_qp, DCT_LUMA_DC, 1, 0 );
    else
        nz = h->quantf.quant_4x4_dc( dct_dc4x4, h->quant4_mf[CQM_4IY][i_qp][0]>>1, h->quant4_bias[CQM_4IY][i_qp][0]<<1 );

    h->mb.cache.non_zero_count[x264_scan8[24]] = nz;
    if( nz )
    {
        h->zigzagf.scan_4x4( h->dct.luma16x16_dc, dct_dc4x4 );

        /* output samples to fdec */
        h->dctf.idct4x4dc( dct_dc4x4 );
        h->quantf.dequant_4x4_dc( dct_dc4x4, h->dequant4_mf[CQM_4IY], i_qp );  /* XXX not inversed */
        if( h->mb.i_cbp_luma )
            for( int i = 0; i < 16; i++ )
                dct4x4[i][0] = dct_dc4x4[block_idx_xy_1d[i]];
    }

    /* put pixels to fdec */
    if( h->mb.i_cbp_luma )
        h->dctf.add16x16_idct( p_dst, dct4x4 );
    else if( nz )
        h->dctf.add16x16_idct_dc( p_dst, dct_dc4x4 );
}

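/* Nonzero iff any reconstructed DC of dct rounds (>>6) differently from the
 * reference values in ref. */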
static inline int idct_dequant_round_2x2_dc( int16_t ref[4], int16_t dct[4], int dequant_mf[6][16], int i_qp )
{
    int16_t out[4];
    idct_dequant_2x2_dconly( out, dct, dequant_mf, i_qp );
    return ((ref[0] ^ (out[0]+32))
          | (ref[1] ^ (out[1]+32))
          | (ref[2] ^ (out[2]+32))
          | (ref[3] ^ (out[3]+32))) >> 6;
}

/* Round down coefficients losslessly in DC-only chroma blocks.
 * Unlike luma blocks, this can't be done with a lookup table or
 * other shortcut technique because of the interdependencies
 * between the coefficients due to the chroma DC transform. */
static inline int x264_mb_optimize_chroma_dc( x264_t *h, int b_inter, int i_qp, int16_t dct2x2[4] )
{
    int16_t dct2x2_orig[4];
    int coeff, nz;

    /* If the QP is too high, there's no benefit to rounding optimization. */
    if( h->dequant4_mf[CQM_4IC + b_inter][i_qp%6][0] << (i_qp/6) > 32*64 )
        return 1;

    idct_dequant_2x2_dconly( dct2x2_orig, dct2x2, h->dequant4_mf[CQM_4IC + b_inter], i_qp );
    dct2x2_orig[0] += 32;
    dct2x2_orig[1] += 32;
    dct2x2_orig[2] += 32;
    dct2x2_orig[3] += 32;

    /* If the DC coefficients already round to zero, terminate early. */
    if( !((dct2x2_orig[0]|dct2x2_orig[1]|dct2x2_orig[2]|dct2x2_orig[3])>>6) )
        return 0;

    /* Start with the highest frequency coefficient... is this the best option? */
    for( nz = 0, coeff = h->quantf.coeff_last[DCT_CHROMA_DC]( dct2x2 ); coeff >= 0; coeff-- )
    {
        int level = dct2x2[coeff];
        int sign = level>>31 | 1; /* dct2x2[coeff] < 0 ? -1 : 1 */

        while( level )
        {
            dct2x2[coeff] = level - sign;
            if( idct_dequant_round_2x2_dc( dct2x2_orig, dct2x2, h->dequant4_mf[CQM_4IC + b_inter], i_qp ) )
            {
                nz = 1;
                dct2x2[coeff] = level;
                break;
            }
            level -= sign;
        }
    }

    return nz;
}

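/* Chroma encoding: one 2x2 DC Hadamard plus four 4x4 AC blocks per plane, with
 * variance-based early termination and lossless DC rounding optimization. */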
void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qp )
{
    int nz, nz_dc;
    int b_decimate = b_inter && h->mb.b_dct_decimate;
    ALIGNED_ARRAY_16( int16_t, dct2x2,[4] );
    h->mb.i_cbp_chroma = 0;

    /* Early termination: check variance of chroma residual before encoding.
     * Don't bother trying early termination at low QPs.
     * Values are experimentally derived. */
    if( b_decimate && i_qp >= (h->mb.b_trellis ? 12 : 18) )
    {
        int thresh = (x264_lambda2_tab[i_qp] + 32) >> 6;
        int ssd[2];
        int score = h->pixf.var2_8x8( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE, &ssd[0] );
        if( score < thresh*4 )
            score += h->pixf.var2_8x8( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE, &ssd[1] );
        if( score < thresh*4 )
        {
            h->mb.cache.non_zero_count[x264_scan8[16]] = 0;
            h->mb.cache.non_zero_count[x264_scan8[17]] = 0;
            h->mb.cache.non_zero_count[x264_scan8[18]] = 0;
            h->mb.cache.non_zero_count[x264_scan8[19]] = 0;
            h->mb.cache.non_zero_count[x264_scan8[20]] = 0;
            h->mb.cache.non_zero_count[x264_scan8[21]] = 0;
            h->mb.cache.non_zero_count[x264_scan8[22]] = 0;
            h->mb.cache.non_zero_count[x264_scan8[23]] = 0;
            M16( &h->mb.cache.non_zero_count[x264_scan8[25]] ) = 0;

            for( int ch = 0; ch < 2; ch++ )
            {
                if( ssd[ch] > thresh )
                {
                    h->dctf.sub8x8_dct_dc( dct2x2, h->mb.pic.p_fenc[1+ch], h->mb.pic.p_fdec[1+ch] );
                    if( h->mb.b_trellis )
                        nz_dc = x264_quant_dc_trellis( h, dct2x2, CQM_4IC+b_inter, i_qp, DCT_CHROMA_DC, !b_inter, 1 );
                    else
                        nz_dc = h->quantf.quant_2x2_dc( dct2x2, h->quant4_mf[CQM_4IC+b_inter][i_qp][0]>>1, h->quant4_bias[CQM_4IC+b_inter][i_qp][0]<<1 );

                    if( nz_dc )
                    {
                        if( !x264_mb_optimize_chroma_dc( h, b_inter, i_qp, dct2x2 ) )
                            continue;
                        h->mb.cache.non_zero_count[x264_scan8[25]+ch] = 1;
                        zigzag_scan_2x2_dc( h->dct.chroma_dc[ch], dct2x2 );
                        idct_dequant_2x2_dconly( dct2x2, dct2x2, h->dequant4_mf[CQM_4IC + b_inter], i_qp );
                        h->dctf.add8x8_idct_dc( h->mb.pic.p_fdec[1+ch], dct2x2 );
                        h->mb.i_cbp_chroma = 1;
                    }
                }
            }
            return;
        }
    }

    for( int ch = 0; ch < 2; ch++ )
    {
        uint8_t  *p_src = h->mb.pic.p_fenc[1+ch];
        uint8_t  *p_dst = h->mb.pic.p_fdec[1+ch];
        int i_decimate_score = 0;
        int nz_ac = 0;

        ALIGNED_ARRAY_16( int16_t, dct4x4,[4],[16] );

        if( h->mb.b_lossless )
        {
            for( int i = 0; i < 4; i++ )
            {
                int oe = block_idx_x[i]*4 + block_idx_y[i]*4*FENC_STRIDE;
                int od = block_idx_x[i]*4 + block_idx_y[i]*4*FDEC_STRIDE;
                nz = h->zigzagf.sub_4x4ac( h->dct.luma4x4[16+i+ch*4], p_src+oe, p_dst+od, &h->dct.chroma_dc[ch][i] );
                h->mb.cache.non_zero_count[x264_scan8[16+i+ch*4]] = nz;
                h->mb.i_cbp_chroma |= nz;
            }
            h->mb.cache.non_zero_count[x264_scan8[25]+ch] = array_non_zero( h->dct.chroma_dc[ch] );
            continue;
        }

        h->dctf.sub8x8_dct( dct4x4, p_src, p_dst );
        dct2x2dc( dct2x2, dct4x4 );
        /* calculate dct coeffs */
        for( int i = 0; i < 4; i++ )
        {
            if( h->mb.b_trellis )
                nz = x264_quant_4x4_trellis( h, dct4x4[i], CQM_4IC+b_inter, i_qp, DCT_CHROMA_AC, !b_inter, 1, 0 );
            else
                nz = h->quantf.quant_4x4( dct4x4[i], h->quant4_mf[CQM_4IC+b_inter][i_qp], h->quant4_bias[CQM_4IC+b_inter][i_qp] );
            h->mb.cache.non_zero_count[x264_scan8[16+i+ch*4]] = nz;
            if( nz )
            {
                nz_ac = 1;
                h->zigzagf.scan_4x4( h->dct.luma4x4[16+i+ch*4], dct4x4[i] );
                h->quantf.dequant_4x4( dct4x4[i], h->dequant4_mf[CQM_4IC + b_inter], i_qp );
                if( b_decimate )
                    i_decimate_score += h->quantf.decimate_score15( h->dct.luma4x4[16+i+ch*4] );
            }
        }

        if( h->mb.b_trellis )
            nz_dc = x264_quant_dc_trellis( h, dct2x2, CQM_4IC+b_inter, i_qp, DCT_CHROMA_DC, !b_inter, 1 );
        else
            nz_dc = h->quantf.quant_2x2_dc( dct2x2, h->quant4_mf[CQM_4IC+b_inter][i_qp][0]>>1, h->quant4_bias[CQM_4IC+b_inter][i_qp][0]<<1 );

        h->mb.cache.non_zero_count[x264_scan8[25]+ch] = nz_dc;

        if( (b_decimate && i_decimate_score < 7) || !nz_ac )
        {
            /* Decimate the block */
            h->mb.cache.non_zero_count[x264_scan8[16+0]+24*ch] = 0;
            h->mb.cache.non_zero_count[x264_scan8[16+1]+24*ch] = 0;
            h->mb.cache.non_zero_count[x264_scan8[16+2]+24*ch] = 0;
            h->mb.cache.non_zero_count[x264_scan8[16+3]+24*ch] = 0;
            if( !nz_dc ) /* Whole block is empty */
                continue;
            if( !x264_mb_optimize_chroma_dc( h, b_inter, i_qp, dct2x2 ) )
            {
                h->mb.cache.non_zero_count[x264_scan8[25]+ch] = 0;
                continue;
            }
            /* DC-only */
            zigzag_scan_2x2_dc( h->dct.chroma_dc[ch], dct2x2 );
            idct_dequant_2x2_dconly( dct2x2, dct2x2, h->dequant4_mf[CQM_4IC + b_inter], i_qp );
            h->dctf.add8x8_idct_dc( p_dst, dct2x2 );
        }
        else
        {
            h->mb.i_cbp_chroma = 1;
            if( nz_dc )
            {
                zigzag_scan_2x2_dc( h->dct.chroma_dc[ch], dct2x2 );
                idct_dequant_2x2_dc( dct2x2, dct4x4, h->dequant4_mf[CQM_4IC + b_inter], i_qp );
            }
            h->dctf.add8x8_idct( p_dst, dct4x4 );
        }
    }

    /* 0 = none, 1 = DC only, 2 = DC+AC */
    h->mb.i_cbp_chroma = ((!!M16( &h->mb.cache.non_zero_count[x264_scan8[25]] )) | h->mb.i_cbp_chroma) + h->mb.i_cbp_chroma;
}

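/* Zero all NNZ and CBP state for a macroblock coded as skip. */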
static void x264_macroblock_encode_skip( x264_t *h )
{
    M32( &h->mb.cache.non_zero_count[x264_scan8[0]+0*8] ) = 0;
    M32( &h->mb.cache.non_zero_count[x264_scan8[0]+1*8] ) = 0;
    M32( &h->mb.cache.non_zero_count[x264_scan8[0]+2*8] ) = 0;
    M32( &h->mb.cache.non_zero_count[x264_scan8[0]+3*8] ) = 0;
    for( int i = 16; i < 24; i++ )
        h->mb.cache.non_zero_count[x264_scan8[i]] = 0;
    h->mb.i_cbp_luma = 0;
    h->mb.i_cbp_chroma = 0;
    h->mb.cbp[h->mb.i_mb_xy] = 0;
}

/*****************************************************************************
 * x264_macroblock_encode_pskip:
 *  Encode an already marked skip block
 *****************************************************************************/
static void x264_macroblock_encode_pskip( x264_t *h )
{
    /* don't do pskip motion compensation if it was already done in macroblock_analyse */
    if( !h->mb.b_skip_mc )
    {
        int mvx = x264_clip3( h->mb.cache.mv[0][x264_scan8[0]][0],
                              h->mb.mv_min[0], h->mb.mv_max[0] );
        int mvy = x264_clip3( h->mb.cache.mv[0][x264_scan8[0]][1],
                              h->mb.mv_min[1], h->mb.mv_max[1] );

        h->mc.mc_luma( h->mb.pic.p_fdec[0],    FDEC_STRIDE,
                       h->mb.pic.p_fref[0][0], h->mb.pic.i_stride[0],
                       mvx, mvy, 16, 16, &h->sh.weight[0][0] );

        /* Special case for mv0, which is (of course) very common in P-skip mode. */
        if( mvx | mvy )
        {
            h->mc.mc_chroma( h->mb.pic.p_fdec[1],       FDEC_STRIDE,
                             h->mb.pic.p_fref[0][0][4], h->mb.pic.i_stride[1],
                             mvx, mvy, 8, 8 );
            h->mc.mc_chroma( h->mb.pic.p_fdec[2],       FDEC_STRIDE,
                             h->mb.pic.p_fref[0][0][5], h->mb.pic.i_stride[2],
                             mvx, mvy, 8, 8 );
        }
        else
        {
            h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fref[0][0][4], h->mb.pic.i_stride[1], 8 );
            h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fref[0][0][5], h->mb.pic.i_stride[2], 8 );
        }

        if( h->sh.weight[0][1].weightfn )
            h->sh.weight[0][1].weightfn[8>>2]( h->mb.pic.p_fdec[1], FDEC_STRIDE,
                                               h->mb.pic.p_fdec[1], FDEC_STRIDE,
                                               &h->sh.weight[0][1], 8 );

        if( h->sh.weight[0][2].weightfn )
            h->sh.weight[0][2].weightfn[8>>2]( h->mb.pic.p_fdec[2], FDEC_STRIDE,
                                               h->mb.pic.p_fdec[2], FDEC_STRIDE,
                                               &h->sh.weight[0][2], 8 );
    }

    x264_macroblock_encode_skip( h );
}

/*****************************************************************************
 * Intra prediction for predictive lossless mode.
 *****************************************************************************/

/* Note that these functions take a shortcut (mc.copy instead of actual pixel prediction) which assumes
 * that the edge pixels of the reconstructed frame are the same as those of the source frame.  This means
 * they will only work correctly if the neighboring blocks are losslessly coded.  In practice, this means
 * lossless mode cannot be mixed with lossy mode within a frame. */
/* This can be resolved by explicitly copying the edge pixels after doing the mc.copy, but this doesn't
 * need to be done unless we decide to allow mixing lossless and lossy compression. */

void x264_predict_lossless_8x8_chroma( x264_t *h, int i_mode )
{
    int stride = h->fenc->i_stride[1] << h->mb.b_interlaced;
    if( i_mode == I_PRED_CHROMA_V )
    {
        h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc_plane[1]-stride, stride, 8 );
        h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc_plane[2]-stride, stride, 8 );
    }
    else if( i_mode == I_PRED_CHROMA_H )
    {
        h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc_plane[1]-1, stride, 8 );
        h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc_plane[2]-1, stride, 8 );
    }
    else
    {
        h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[1] );
        h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[2] );
    }
}

void x264_predict_lossless_4x4( x264_t *h, uint8_t *p_dst, int idx, int i_mode )
{
    int stride = h->fenc->i_stride[0] << h->mb.b_interlaced;
    uint8_t *p_src = h->mb.pic.p_fenc_plane[0] + block_idx_x[idx]*4 + block_idx_y[idx]*4 * stride;

    if( i_mode == I_PRED_4x4_V )
        h->mc.copy[PIXEL_4x4]( p_dst, FDEC_STRIDE, p_src-stride, stride, 4 );
    else if( i_mode == I_PRED_4x4_H )
        h->mc.copy[PIXEL_4x4]( p_dst, FDEC_STRIDE, p_src-1, stride, 4 );
    else
        h->predict_4x4[i_mode]( p_dst );
}

void x264_predict_lossless_8x8( x264_t *h, uint8_t *p_dst, int idx, int i_mode, uint8_t edge[33] )
{
    int stride = h->fenc->i_stride[0] << h->mb.b_interlaced;
    uint8_t *p_src = h->mb.pic.p_fenc_plane[0] + (idx&1)*8 + (idx>>1)*8*stride;

    if( i_mode == I_PRED_8x8_V )
        h->mc.copy[PIXEL_8x8]( p_dst, FDEC_STRIDE, p_src-stride, stride, 8 );
    else if( i_mode == I_PRED_8x8_H )
        h->mc.copy[PIXEL_8x8]( p_dst, FDEC_STRIDE, p_src-1, stride, 8 );
    else
        h->predict_8x8[i_mode]( p_dst, edge );
}

void x264_predict_lossless_16x16( x264_t *h, int i_mode )
{
    int stride = h->fenc->i_stride[0] << h->mb.b_interlaced;
    if( i_mode == I_PRED_16x16_V )
        h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fdec[0], FDEC_STRIDE, h->mb.pic.p_fenc_plane[0]-stride, stride, 16 );
    else if( i_mode == I_PRED_16x16_H )
        h->mc.copy_16x16_unaligned( h->mb.pic.p_fdec[0], FDEC_STRIDE, h->mb.pic.p_fenc_plane[0]-1, stride, 16 );
    else
        h->predict_16x16[i_mode]( h->mb.pic.p_fdec[0] );
}

/*****************************************************************************
 * x264_macroblock_encode:
 *****************************************************************************/
void x264_macroblock_encode( x264_t *h )
{
    int i_qp = h->mb.i_qp;
    int b_decimate = h->mb.b_dct_decimate;
    int b_force_no_skip = 0;
    int nz;
    h->mb.i_cbp_luma = 0;
    h->mb.cache.non_zero_count[x264_scan8[24]] = 0;

    if( h->mb.i_type == I_PCM )
    {
        /* if PCM is chosen, we need to store reconstructed frame data */
        h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fdec[0], FDEC_STRIDE, h->mb.pic.p_fenc[0], FENC_STRIDE, 16 );
        h->mc.copy[PIXEL_8x8]  ( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE, 8 );
        h->mc.copy[PIXEL_8x8]  ( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE, 8 );
        return;
    }

    if( h->sh.b_mbaff
        && h->mb.i_mb_xy == h->sh.i_first_mb + h->mb.i_mb_stride
        && IS_SKIP(h->mb.type[h->sh.i_first_mb]) )
    {
        /* The first skip is predicted to be a frame mb pair.
         * We don't yet support the aff part of mbaff, so force it to non-skip
         * so that we can pick the aff flag. */
        b_force_no_skip = 1;
        if( IS_SKIP(h->mb.i_type) )
        {
            if( h->mb.i_type == P_SKIP )
                h->mb.i_type = P_L0;
            else if( h->mb.i_type == B_SKIP )
                h->mb.i_type = B_DIRECT;
        }
    }

    if( h->mb.i_type == P_SKIP )
    {
        /* A bit special: nothing to code, the macroblock is reconstructed by motion compensation alone */
        x264_macroblock_encode_pskip( h );
        return;
    }
    if( h->mb.i_type == B_SKIP )
    {
        /* don't do bskip motion compensation if it was already done in macroblock_analyse */
        if( !h->mb.b_skip_mc )
            x264_mb_mc( h );
        x264_macroblock_encode_skip( h );
        return;
    }

    if( h->mb.i_type == I_16x16 )
    {
        const int i_mode = h->mb.i_intra16x16_pred_mode;
        h->mb.b_transform_8x8 = 0;

        if( h->mb.b_lossless )
            x264_predict_lossless_16x16( h, i_mode );
        else
            h->predict_16x16[i_mode]( h->mb.pic.p_fdec[0] );

        /* encode the 16x16 macroblock */
        x264_mb_encode_i16x16( h, i_qp );
    }
    else if( h->mb.i_type == I_8x8 )
    {
        ALIGNED_ARRAY_16( uint8_t, edge,[33] );
        h->mb.b_transform_8x8 = 1;
        /* If we already encoded 3 of the 4 i8x8 blocks, we don't have to do them again. */
        if( h->mb.i_skip_intra )
        {
            h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fdec[0], FDEC_STRIDE, h->mb.pic.i8x8_fdec_buf, 16, 16 );
            M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] ) = h->mb.pic.i8x8_nnz_buf[0];
            M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] ) = h->mb.pic.i8x8_nnz_buf[1];
            M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] ) = h->mb.pic.i8x8_nnz_buf[2];
            M32( &h->mb.cache.non_zero_count[x264_scan8[10]] ) = h->mb.pic.i8x8_nnz_buf[3];
            h->mb.i_cbp_luma = h->mb.pic.i8x8_cbp;
            /* In RD mode, restore the now-overwritten DCT data. */
            if( h->mb.i_skip_intra == 2 )
                h->mc.memcpy_aligned( h->dct.luma8x8, h->mb.pic.i8x8_dct_buf, sizeof(h->mb.pic.i8x8_dct_buf) );
        }
        for( int i = h->mb.i_skip_intra ? 3 : 0 ; i < 4; i++ )
        {
            uint8_t  *p_dst = &h->mb.pic.p_fdec[0][8 * (i&1) + 8 * (i>>1) * FDEC_STRIDE];
            int      i_mode = h->mb.cache.intra4x4_pred_mode[x264_scan8[4*i]];
            h->predict_8x8_filter( p_dst, edge, h->mb.i_neighbour8[i], x264_pred_i4x4_neighbors[i_mode] );

            if( h->mb.b_lossless )
                x264_predict_lossless_8x8( h, p_dst, i, i_mode, edge );
            else
                h->predict_8x8[i_mode]( p_dst, edge );

            x264_mb_encode_i8x8( h, i, i_qp );
        }
    }
    else if( h->mb.i_type == I_4x4 )
    {
        h->mb.b_transform_8x8 = 0;
        /* If we already encoded 15 of the 16 i4x4 blocks, we don't have to do them again. */
        if( h->mb.i_skip_intra )
        {
            h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fdec[0], FDEC_STRIDE, h->mb.pic.i4x4_fdec_buf, 16, 16 );
            M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] ) = h->mb.pic.i4x4_nnz_buf[0];
            M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] ) = h->mb.pic.i4x4_nnz_buf[1];
            M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] ) = h->mb.pic.i4x4_nnz_buf[2];
            M32( &h->mb.cache.non_zero_count[x264_scan8[10]] ) = h->mb.pic.i4x4_nnz_buf[3];
            h->mb.i_cbp_luma = h->mb.pic.i4x4_cbp;
            /* In RD mode, restore the now-overwritten DCT data. */
            if( h->mb.i_skip_intra == 2 )
                h->mc.memcpy_aligned( h->dct.luma4x4, h->mb.pic.i4x4_dct_buf, sizeof(h->mb.pic.i4x4_dct_buf) );
        }
        for( int i = h->mb.i_skip_intra ? 15 : 0 ; i < 16; i++ )
        {
            uint8_t  *p_dst = &h->mb.pic.p_fdec[0][block_idx_xy_fdec[i]];
            int      i_mode = h->mb.cache.intra4x4_pred_mode[x264_scan8[i]];

            if( (h->mb.i_neighbour4[i] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
                /* emulate missing topright samples */
                M32( &p_dst[4-FDEC_STRIDE] ) = p_dst[3-FDEC_STRIDE] * 0x01010101U;

            if( h->mb.b_lossless )
                x264_predict_lossless_4x4( h, p_dst, i, i_mode );
            else
                h->predict_4x4[i_mode]( p_dst );
            x264_mb_encode_i4x4( h, i, i_qp );
        }
    }
    else    /* Inter MB */
    {
        int i_decimate_mb = 0;

        /* Don't repeat motion compensation if it was already done in non-RD transform analysis */
        if( !h->mb.b_skip_mc )
            x264_mb_mc( h );

        if( h->mb.b_lossless )
        {
            if( h->mb.b_transform_8x8 )
                for( int i8x8 = 0; i8x8 < 4; i8x8++ )
                {
                    int x = 8*(i8x8&1);
                    int y = 8*(i8x8>>1);
                    nz = h->zigzagf.sub_8x8( h->dct.luma8x8[i8x8],
                                        h->mb.pic.p_fenc[0]+x+y*FENC_STRIDE,
                                        h->mb.pic.p_fdec[0]+x+y*FDEC_STRIDE );
                    STORE_8x8_NNZ(i8x8,nz);
                    h->mb.i_cbp_luma |= nz << i8x8;
                }
            else
                for( int i4x4 = 0; i4x4 < 16; i4x4++ )
                {
                    nz = h->zigzagf.sub_4x4( h->dct.luma4x4[i4x4],
                                        h->mb.pic.p_fenc[0]+block_idx_xy_fenc[i4x4],
                                        h->mb.pic.p_fdec[0]+block_idx_xy_fdec[i4x4] );
                    h->mb.cache.non_zero_count[x264_scan8[i4x4]] = nz;
                    h->mb.i_cbp_luma |= nz << (i4x4>>2);
                }
        }
        else if( h->mb.b_transform_8x8 )
        {
            ALIGNED_ARRAY_16( int16_t, dct8x8,[4],[64] );
            b_decimate &= !h->mb.b_trellis; // 8x8 trellis is inherently optimal decimation
            h->dctf.sub16x16_dct8( dct8x8, h->mb.pic.p_fenc[0], h->mb.pic.p_fdec[0] );
            h->nr_count[1] += h->mb.b_noise_reduction * 4;

            for( int idx = 0; idx < 4; idx++ )
            {
                if( h->mb.b_noise_reduction )
                    h->quantf.denoise_dct( dct8x8[idx], h->nr_residual_sum[1], h->nr_offset[1], 64 );
                nz = x264_quant_8x8( h, dct8x8[idx], i_qp, 0, idx );

                if( nz )
                {
                    h->zigzagf.scan_8x8( h->dct.luma8x8[idx], dct8x8[idx] );
                    if( b_decimate )
                    {
                        int i_decimate_8x8 = h->quantf.decimate_score64( h->dct.luma8x8[idx] );
                        i_decimate_mb += i_decimate_8x8;
                        if( i_decimate_8x8 >= 4 )
                            h->mb.i_cbp_luma |= 1<<idx;
                    }
                    else
                        h->mb.i_cbp_luma |= 1<<idx;
                }
            }

            if( i_decimate_mb < 6 && b_decimate )
            {
                h->mb.i_cbp_luma = 0;
                CLEAR_16x16_NNZ
            }
            else
            {
                for( int idx = 0; idx < 4; idx++ )
                {
                    if( h->mb.i_cbp_luma&(1<<idx) )
                    {
                        h->quantf.dequant_8x8( dct8x8[idx], h->dequant8_mf[CQM_8PY], i_qp );
                        h->dctf.add8x8_idct8( &h->mb.pic.p_fdec[0][(idx&1)*8 + (idx>>1)*8*FDEC_STRIDE], dct8x8[idx] );
                        STORE_8x8_NNZ(idx,1);
                    }
                    else
                        STORE_8x8_NNZ(idx,0);
                }
            }
        }
        else
        {
            ALIGNED_ARRAY_16( int16_t, dct4x4,[16],[16] );
            h->dctf.sub16x16_dct( dct4x4, h->mb.pic.p_fenc[0], h->mb.pic.p_fdec[0] );
            h->nr_count[0] += h->mb.b_noise_reduction * 16;

            for( int i8x8 = 0; i8x8 < 4; i8x8++ )
            {
                int i_decimate_8x8 = 0;
                int cbp = 0;

                /* encode one 4x4 block */
                for( int i4x4 = 0; i4x4 < 4; i4x4++ )
                {
                    int idx = i8x8 * 4 + i4x4;

                    if( h->mb.b_noise_reduction )
                        h->quantf.denoise_dct( dct4x4[idx], h->nr_residual_sum[0], h->nr_offset[0], 16 );
                    nz = x264_quant_4x4( h, dct4x4[idx], i_qp, DCT_LUMA_4x4, 0, idx );
                    h->mb.cache.non_zero_count[x264_scan8[idx]] = nz;

                    if( nz )
                    {
                        h->zigzagf.scan_4x4( h->dct.luma4x4[idx], dct4x4[idx] );
                        h->quantf.dequant_4x4( dct4x4[idx], h->dequant4_mf[CQM_4PY], i_qp );
                        if( b_decimate && i_decimate_8x8 < 6 )
                            i_decimate_8x8 += h->quantf.decimate_score16( h->dct.luma4x4[idx] );
                        cbp = 1;
                    }
                }

                /* decimate this 8x8 block */
                i_decimate_mb += i_decimate_8x8;
                if( b_decimate )
                {
                    if( i_decimate_8x8 < 4 )
                        STORE_8x8_NNZ(i8x8,0)
                    else
                        h->mb.i_cbp_luma |= 1<<i8x8;
                }
                else if( cbp )
                {
                    h->dctf.add8x8_idct( &h->mb.pic.p_fdec[0][(i8x8&1)*8 + (i8x8>>1)*8*FDEC_STRIDE], &dct4x4[i8x8*4] );
                    h->mb.i_cbp_luma |= 1<<i8x8;
                }
            }

            if( b_decimate )
            {
                if( i_decimate_mb < 6 )
                {
                    h->mb.i_cbp_luma = 0;
                    CLEAR_16x16_NNZ
                }
                else
                {
                    for( int i8x8 = 0; i8x8 < 4; i8x8++ )
                        if( h->mb.i_cbp_luma&(1<<i8x8) )
                            h->dctf.add8x8_idct( &h->mb.pic.p_fdec[0][(i8x8&1)*8 + (i8x8>>1)*8*FDEC_STRIDE], &dct4x4[i8x8*4] );
                }
            }
        }
    }

    /* encode chroma */
    if( IS_INTRA( h->mb.i_type ) )
    {
        const int i_mode = h->mb.i_chroma_pred_mode;
        if( h->mb.b_lossless )
            x264_predict_lossless_8x8_chroma( h, i_mode );
        else
        {
            h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[1] );
            h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[2] );
        }
    }

    /* encode the 8x8 blocks */
    x264_mb_encode_8x8_chroma( h, !IS_INTRA( h->mb.i_type ), h->mb.i_chroma_qp );

    /* store cbp */
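    /* Layout: bits 0-3 = luma 8x8 CBFs, bits 4-5 = chroma CBP (0/1/2); with
     * CABAC, bits 8-10 also cache the luma DC and chroma DC NNZ flags so
     * neighboring macroblocks can read them for context selection. */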
    int cbp = h->mb.i_cbp_chroma << 4 | h->mb.i_cbp_luma;
    if( h->param.b_cabac )
        cbp |= h->mb.cache.non_zero_count[x264_scan8[24]] << 8
            |  h->mb.cache.non_zero_count[x264_scan8[25]] << 9
            |  h->mb.cache.non_zero_count[x264_scan8[26]] << 10;
    h->mb.cbp[h->mb.i_mb_xy] = cbp;

    /* Check for P_SKIP
     * XXX: in the ME, perhaps we should take x264_mb_predict_mv_pskip into account
     *      (if multiple MVs give the same result) */
    if( !b_force_no_skip )
    {
        if( h->mb.i_type == P_L0 && h->mb.i_partition == D_16x16 &&
            !(h->mb.i_cbp_luma | h->mb.i_cbp_chroma) &&
            M32( h->mb.cache.mv[0][x264_scan8[0]] ) == M32( h->mb.cache.pskip_mv )
            && h->mb.cache.ref[0][x264_scan8[0]] == 0 )
        {
            h->mb.i_type = P_SKIP;
        }

        /* Check for B_SKIP */
        if( h->mb.i_type == B_DIRECT && !(h->mb.i_cbp_luma | h->mb.i_cbp_chroma) )
        {
            h->mb.i_type = B_SKIP;
        }
    }
}

/*****************************************************************************
 * x264_macroblock_probe_skip:
 *  Check if the current MB could be encoded as a [PB]_SKIP
 *****************************************************************************/
int x264_macroblock_probe_skip( x264_t *h, int b_bidir )
{
    ALIGNED_ARRAY_16( int16_t, dct4x4,[4],[16] );
    ALIGNED_ARRAY_16( int16_t, dct2x2,[4] );
    ALIGNED_ARRAY_16( int16_t, dctscan,[16] );
    ALIGNED_4( int16_t mvp[2] );

    int i_qp = h->mb.i_qp;
    int thresh, ssd;

    if( !b_bidir )
    {
        /* Get the MV */
        mvp[0] = x264_clip3( h->mb.cache.pskip_mv[0], h->mb.mv_min[0], h->mb.mv_max[0] );
        mvp[1] = x264_clip3( h->mb.cache.pskip_mv[1], h->mb.mv_min[1], h->mb.mv_max[1] );

        /* Motion compensation */
        h->mc.mc_luma( h->mb.pic.p_fdec[0],    FDEC_STRIDE,
                       h->mb.pic.p_fref[0][0], h->mb.pic.i_stride[0],
                       mvp[0], mvp[1], 16, 16, &h->sh.weight[0][0] );
    }

    for( int i8x8 = 0, i_decimate_mb = 0; i8x8 < 4; i8x8++ )
    {
        int fenc_offset = (i8x8&1) * 8 + (i8x8>>1) * FENC_STRIDE * 8;
        int fdec_offset = (i8x8&1) * 8 + (i8x8>>1) * FDEC_STRIDE * 8;
        /* get luma diff */
        h->dctf.sub8x8_dct( dct4x4, h->mb.pic.p_fenc[0] + fenc_offset,
                                    h->mb.pic.p_fdec[0] + fdec_offset );
        /* encode one 4x4 block */
        for( int i4x4 = 0; i4x4 < 4; i4x4++ )
        {
            if( !h->quantf.quant_4x4( dct4x4[i4x4], h->quant4_mf[CQM_4PY][i_qp], h->quant4_bias[CQM_4PY][i_qp] ) )
                continue;
            h->zigzagf.scan_4x4( dctscan, dct4x4[i4x4] );
            i_decimate_mb += h->quantf.decimate_score16( dctscan );
            if( i_decimate_mb >= 6 )
                return 0;
        }
    }

    /* encode chroma */
    i_qp = h->mb.i_chroma_qp;
    thresh = (x264_lambda2_tab[i_qp] + 32) >> 6;

    for( int ch = 0; ch < 2; ch++ )
    {
        uint8_t  *p_src = h->mb.pic.p_fenc[1+ch];
        uint8_t  *p_dst = h->mb.pic.p_fdec[1+ch];

        if( !b_bidir )
        {
            /* Special case for mv0, which is (of course) very common in P-skip mode. */
            if( M32( mvp ) )
            {
                h->mc.mc_chroma( h->mb.pic.p_fdec[1+ch],       FDEC_STRIDE,
                                 h->mb.pic.p_fref[0][0][4+ch], h->mb.pic.i_stride[1+ch],
                                 mvp[0], mvp[1], 8, 8 );
            }
            else
                h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[1+ch], FDEC_STRIDE, h->mb.pic.p_fref[0][0][4+ch], h->mb.pic.i_stride[1+ch], 8 );

            if( h->sh.weight[0][1+ch].weightfn )
                h->sh.weight[0][1+ch].weightfn[8>>2]( h->mb.pic.p_fdec[1+ch], FDEC_STRIDE,
                                                      h->mb.pic.p_fdec[1+ch], FDEC_STRIDE,
                                                      &h->sh.weight[0][1+ch], 8 );
        }

        /* There is almost never a termination during chroma, but we can't avoid the check entirely,
         * so instead we check the SSD and skip the actual check if the score is low enough. */
        ssd = h->pixf.ssd[PIXEL_8x8]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE );
        if( ssd < thresh )
            continue;

        /* The vast majority of chroma checks will terminate during the DC check or the higher
         * threshold check, so we can save time by doing a DC-only DCT. */
        h->dctf.sub8x8_dct_dc( dct2x2, p_src, p_dst );

        if( h->quantf.quant_2x2_dc( dct2x2, h->quant4_mf[CQM_4PC][i_qp][0]>>1, h->quant4_bias[CQM_4PC][i_qp][0]<<1 ) )
            return 0;

        /* If there wasn't a termination in DC, we can check against a much higher threshold. */
        if( ssd < thresh*4 )
            continue;

        h->dctf.sub8x8_dct( dct4x4, p_src, p_dst );

        /* calculate dct coeffs */
        for( int i4x4 = 0, i_decimate_mb = 0; i4x4 < 4; i4x4++ )
        {
            dct4x4[i4x4][0] = 0;
            if( !h->quantf.quant_4x4( dct4x4[i4x4], h->quant4_mf[CQM_4PC][i_qp], h->quant4_bias[CQM_4PC][i_qp] ) )
                continue;
            h->zigzagf.scan_4x4( dctscan, dct4x4[i4x4] );
            i_decimate_mb += h->quantf.decimate_score15( dctscan );
            if( i_decimate_mb >= 7 )
                return 0;
        }
    }

    h->mb.b_skip_mc = 1;
    return 1;
}

/****************************************************************************
 * DCT-domain noise reduction / adaptive deadzone
 * from libavcodec
 ****************************************************************************/

void x264_noise_reduction_update( x264_t *h )
{
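    /* The per-coefficient offset is subtracted before quantization: it grows
     * with the configured strength and the number of blocks observed, and
     * shrinks with the running residual magnitude, so positions that are
     * usually near zero get damped the hardest. */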
    for( int cat = 0; cat < 2; cat++ )
    {
        int size = cat ? 64 : 16;
        const uint16_t *weight = cat ? x264_dct8_weight2_tab : x264_dct4_weight2_tab;

        if( h->nr_count[cat] > (cat ? (1<<16) : (1<<18)) )
        {
            for( int i = 0; i < size; i++ )
                h->nr_residual_sum[cat][i] >>= 1;
            h->nr_count[cat] >>= 1;
        }

        for( int i = 0; i < size; i++ )
            h->nr_offset[cat][i] =
                ((uint64_t)h->param.analyse.i_noise_reduction * h->nr_count[cat]
                 + h->nr_residual_sum[cat][i]/2)
              / ((uint64_t)h->nr_residual_sum[cat][i] * weight[i]/256 + 1);
    }
}

/*****************************************************************************
 * RD only; four calls to this do not add up to one macroblock_encode.
 * Doesn't transform the chroma DC.
 *****************************************************************************/
void x264_macroblock_encode_p8x8( x264_t *h, int i8 )
{
    int i_qp = h->mb.i_qp;
    uint8_t *p_fenc = h->mb.pic.p_fenc[0] + (i8&1)*8 + (i8>>1)*8*FENC_STRIDE;
    uint8_t *p_fdec = h->mb.pic.p_fdec[0] + (i8&1)*8 + (i8>>1)*8*FDEC_STRIDE;
    int b_decimate = h->mb.b_dct_decimate;
    int nnz8x8 = 0;
    int nz;

    if( !h->mb.b_skip_mc )
        x264_mb_mc_8x8( h, i8 );

    if( h->mb.b_lossless )
    {
        if( h->mb.b_transform_8x8 )
        {
            nnz8x8 = h->zigzagf.sub_8x8( h->dct.luma8x8[i8], p_fenc, p_fdec );
            STORE_8x8_NNZ(i8,nnz8x8);
        }
        else
        {
            for( int i4 = i8*4; i4 < i8*4+4; i4++ )
            {
                nz = h->zigzagf.sub_4x4( h->dct.luma4x4[i4],
                                    h->mb.pic.p_fenc[0]+block_idx_xy_fenc[i4],
                                    h->mb.pic.p_fdec[0]+block_idx_xy_fdec[i4] );
                h->mb.cache.non_zero_count[x264_scan8[i4]] = nz;
                nnz8x8 |= nz;
            }