rdo.c 24.3 KB
Newer Older
1 2 3
/*****************************************************************************
 * rdo.c: h264 encoder library (rate-distortion optimization)
 *****************************************************************************
4 5 6 7
 * Copyright (C) 2005-2008 x264 project
 *
 * Authors: Loren Merritt <lorenm@u.washington.edu>
 *          Fiona Glaser <fiona@x264.com>
8 9 10 11 12 13 14 15 16 17 18 19 20
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
21
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
22 23
 *****************************************************************************/

24 25 26 27
/* duplicate all the writer functions, just calculating bit cost
 * instead of writing the bitstream.
 * TODO: use these for fast 1st pass too. */

Fiona Glaser's avatar
Fiona Glaser committed
28
#define RDO_SKIP_BS 1
29

30 31 32 33 34 35 36 37
/* Transition and size tables for abs<9 MVD and residual coding */
/* Consist of i_prefix-2 1s, one zero, and a bypass sign bit */
static uint8_t cabac_transition_unary[15][128];
static uint16_t cabac_size_unary[15][128];
/* Transition and size tables for abs>9 MVD */
/* Consist of 5 1s and a bypass sign bit */
static uint8_t cabac_transition_5ones[128];
static uint16_t cabac_size_5ones[128];
38

39 40 41 42 43 44 45
/* CAVLC: produces exactly the same bit count as a normal encode */
/* this probably still leaves some unnecessary computations */
#define bs_write1(s,v)     ((s)->i_bits_encoded += 1)
#define bs_write(s,n,v)    ((s)->i_bits_encoded += (n))
#define bs_write_ue(s,v)   ((s)->i_bits_encoded += bs_size_ue(v))
#define bs_write_se(s,v)   ((s)->i_bits_encoded += bs_size_se(v))
#define bs_write_te(s,v,l) ((s)->i_bits_encoded += bs_size_te(v,l))
Loic Le Loarer's avatar
Loic Le Loarer committed
46
#define x264_macroblock_write_cavlc  static x264_macroblock_size_cavlc
47 48 49 50
#include "cavlc.c"

/* CABAC: not exactly the same. x264_cabac_size_decision() keeps track of
 * fractional bits, but only finite precision. */
Loren Merritt's avatar
Loren Merritt committed
51
#undef  x264_cabac_encode_decision
52
#undef  x264_cabac_encode_decision_noup
53
#define x264_cabac_encode_decision(c,x,v) x264_cabac_size_decision(c,x,v)
54
#define x264_cabac_encode_decision_noup(c,x,v) x264_cabac_size_decision_noup(c,x,v)
55
#define x264_cabac_encode_terminal(c)     ((c)->f8_bits_encoded += 7)
56
#define x264_cabac_encode_bypass(c,v)     ((c)->f8_bits_encoded += 256)
Fiona Glaser's avatar
Fiona Glaser committed
57
#define x264_cabac_encode_ue_bypass(c,e,v) ((c)->f8_bits_encoded += (bs_size_ue_big(v+(1<<e)-1)-e)<<8)
Loic Le Loarer's avatar
Loic Le Loarer committed
58
#define x264_macroblock_write_cabac  static x264_macroblock_size_cabac
59 60
#include "cabac.c"

61 62
#define COPY_CABAC h->mc.memcpy_aligned( &cabac_tmp.f8_bits_encoded, &h->cabac.f8_bits_encoded, \
        sizeof(x264_cabac_t) - offsetof(x264_cabac_t,f8_bits_encoded) )
Loren Merritt's avatar
Loren Merritt committed
63

64
static inline uint64_t cached_hadamard( x264_t *h, int pixel, int x, int y )
65
{
66 67 68 69 70 71 72 73 74 75 76 77 78 79 80
    static const uint8_t hadamard_shift_x[4] = {4,   4,   3,   3};
    static const uint8_t hadamard_shift_y[4] = {4-0, 3-0, 4-1, 3-1};
    static const uint8_t  hadamard_offset[4] = {0,   1,   3,   5};
    int cache_index = (x >> hadamard_shift_x[pixel]) + (y >> hadamard_shift_y[pixel])
                    + hadamard_offset[pixel];
    uint64_t res = h->mb.pic.fenc_hadamard_cache[cache_index];
    if( res )
        return res - 1;
    else
    {
        uint8_t *fenc = h->mb.pic.p_fenc[0] + x + y*FENC_STRIDE;
        res = h->pixf.hadamard_ac[pixel]( fenc, FENC_STRIDE );
        h->mb.pic.fenc_hadamard_cache[cache_index] = res + 1;
        return res;
    }
81 82
}

83
static inline int cached_satd( x264_t *h, int pixel, int x, int y )
84
{
85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101
    static const uint8_t satd_shift_x[3] = {3,   2,   2};
    static const uint8_t satd_shift_y[3] = {2-1, 3-2, 2-2};
    static const uint8_t  satd_offset[3] = {0,   8,   16};
    ALIGNED_16( static uint8_t zero[16] );
    int cache_index = (x >> satd_shift_x[pixel - PIXEL_8x4]) + (y >> satd_shift_y[pixel - PIXEL_8x4])
                    + satd_offset[pixel - PIXEL_8x4];
    int res = h->mb.pic.fenc_satd_cache[cache_index];
    if( res )
        return res - 1;
    else
    {
        uint8_t *fenc = h->mb.pic.p_fenc[0] + x + y*FENC_STRIDE;
        int dc = h->pixf.sad[pixel]( fenc, FENC_STRIDE, zero, 0 ) >> 1;
        res = h->pixf.satd[pixel]( fenc, FENC_STRIDE, zero, 0 ) - dc;
        h->mb.pic.fenc_satd_cache[cache_index] = res + 1;
        return res;
    }
102 103 104 105 106 107 108 109 110 111 112 113
}

/* Psy RD distortion metric: SSD plus "Absolute Difference of Complexities" */
/* SATD and SA8D are used to measure block complexity. */
/* The difference between SATD and SA8D scores are both used to avoid bias from the DCT size.  Using SATD */
/* only, for example, results in overusage of 8x8dct, while the opposite occurs when using SA8D. */

/* FIXME:  Is there a better metric than averaged SATD/SA8D difference for complexity difference? */
/* Hadamard transform is recursive, so a SATD+SA8D can be done faster by taking advantage of this fact. */
/* This optimization can also be used in non-RD transform decision. */

static inline int ssd_plane( x264_t *h, int size, int p, int x, int y )
Loren Merritt's avatar
Loren Merritt committed
114
{
115
    ALIGNED_16(static uint8_t zero[16]);
116 117 118 119 120 121
    int satd = 0;
    uint8_t *fdec = h->mb.pic.p_fdec[p] + x + y*FDEC_STRIDE;
    uint8_t *fenc = h->mb.pic.p_fenc[p] + x + y*FENC_STRIDE;
    if( p == 0 && h->mb.i_psy_rd )
    {
        /* If the plane is smaller than 8x8, we can't do an SA8D; this probably isn't a big problem. */
Loren Merritt's avatar
Loren Merritt committed
122
        if( size <= PIXEL_8x8 )
123
        {
124 125 126 127
            uint64_t fdec_acs = h->pixf.hadamard_ac[size]( fdec, FDEC_STRIDE );
            uint64_t fenc_acs = cached_hadamard( h, size, x, y );
            satd = abs((int32_t)fdec_acs - (int32_t)fenc_acs)
                 + abs((int32_t)(fdec_acs>>32) - (int32_t)(fenc_acs>>32));
128 129
            satd >>= 1;
        }
Loren Merritt's avatar
Loren Merritt committed
130 131 132
        else
        {
            int dc = h->pixf.sad[size]( fdec, FDEC_STRIDE, zero, 0 ) >> 1;
133
            satd = abs(h->pixf.satd[size]( fdec, FDEC_STRIDE, zero, 0 ) - dc - cached_satd( h, size, x, y ));
Loren Merritt's avatar
Loren Merritt committed
134
        }
Fiona Glaser's avatar
Fiona Glaser committed
135
        satd = (satd * h->mb.i_psy_rd * h->mb.i_psy_rd_lambda + 128) >> 8;
136 137
    }
    return h->pixf.ssd[size](fenc, FENC_STRIDE, fdec, FDEC_STRIDE) + satd;
Loren Merritt's avatar
Loren Merritt committed
138 139
}

140
static inline int ssd_mb( x264_t *h )
141
{
Fiona Glaser's avatar
Fiona Glaser committed
142
    int chromassd = ssd_plane(h, PIXEL_8x8, 1, 0, 0) + ssd_plane(h, PIXEL_8x8, 2, 0, 0);
143
    chromassd = ((uint64_t)chromassd * h->mb.i_chroma_lambda2_offset + 128) >> 8;
Fiona Glaser's avatar
Fiona Glaser committed
144
    return ssd_plane(h, PIXEL_16x16, 0, 0, 0) + chromassd;
145 146
}

147 148 149 150 151
static int x264_rd_cost_mb( x264_t *h, int i_lambda2 )
{
    int b_transform_bak = h->mb.b_transform_8x8;
    int i_ssd;
    int i_bits;
Fiona Glaser's avatar
Fiona Glaser committed
152
    int type_bak = h->mb.i_type;
153 154 155

    x264_macroblock_encode( h );

Loren Merritt's avatar
Loren Merritt committed
156
    i_ssd = ssd_mb( h );
157 158 159

    if( IS_SKIP( h->mb.i_type ) )
    {
Fiona Glaser's avatar
Fiona Glaser committed
160
        i_bits = (1 * i_lambda2 + 128) >> 8;
161 162 163
    }
    else if( h->param.b_cabac )
    {
Fiona Glaser's avatar
Fiona Glaser committed
164
        x264_cabac_t cabac_tmp;
165
        COPY_CABAC;
166
        x264_macroblock_size_cabac( h, &cabac_tmp );
Fiona Glaser's avatar
Fiona Glaser committed
167
        i_bits = ( (uint64_t)cabac_tmp.f8_bits_encoded * i_lambda2 + 32768 ) >> 16;
168 169 170
    }
    else
    {
171 172
        x264_macroblock_size_cavlc( h );
        i_bits = ( h->out.bs.i_bits_encoded * i_lambda2 + 128 ) >> 8;
173
    }
174

175
    h->mb.b_transform_8x8 = b_transform_bak;
Fiona Glaser's avatar
Fiona Glaser committed
176
    h->mb.i_type = type_bak;
177

178
    return i_ssd + i_bits;
179
}
Loren Merritt's avatar
Loren Merritt committed
180

Fiona Glaser's avatar
Fiona Glaser committed
181
/* partition RD functions use 8 bits more precision to avoid large rounding errors at low QPs */
182

Fiona Glaser's avatar
Fiona Glaser committed
183
static uint64_t x264_rd_cost_subpart( x264_t *h, int i_lambda2, int i4, int i_pixel )
184
{
185
    uint64_t i_ssd, i_bits;
186

Fiona Glaser's avatar
Fiona Glaser committed
187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213
    x264_macroblock_encode_p4x4( h, i4 );
    if( i_pixel == PIXEL_8x4 )
        x264_macroblock_encode_p4x4( h, i4+1 );
    if( i_pixel == PIXEL_4x8 )
        x264_macroblock_encode_p4x4( h, i4+2 );

    i_ssd = ssd_plane( h, i_pixel, 0, block_idx_x[i4]*4, block_idx_y[i4]*4 );

    if( h->param.b_cabac )
    {
        x264_cabac_t cabac_tmp;
        COPY_CABAC;
        x264_subpartition_size_cabac( h, &cabac_tmp, i4, i_pixel );
        i_bits = ( (uint64_t)cabac_tmp.f8_bits_encoded * i_lambda2 + 128 ) >> 8;
    }
    else
    {
        i_bits = x264_subpartition_size_cavlc( h, i4, i_pixel );
    }

    return (i_ssd<<8) + i_bits;
}

uint64_t x264_rd_cost_part( x264_t *h, int i_lambda2, int i4, int i_pixel )
{
    uint64_t i_ssd, i_bits;
    int i8 = i4 >> 2;
Fiona Glaser's avatar
Fiona Glaser committed
214
    int chromassd;
Fiona Glaser's avatar
Fiona Glaser committed
215

216 217 218 219 220 221
    if( i_pixel == PIXEL_16x16 )
    {
        int i_cost = x264_rd_cost_mb( h, i_lambda2 );
        return i_cost;
    }

Fiona Glaser's avatar
Fiona Glaser committed
222 223 224
    if( i_pixel > PIXEL_8x8 )
        return x264_rd_cost_subpart( h, i_lambda2, i4, i_pixel );

225 226
    h->mb.i_cbp_luma = 0;

227 228 229 230 231 232
    x264_macroblock_encode_p8x8( h, i8 );
    if( i_pixel == PIXEL_16x8 )
        x264_macroblock_encode_p8x8( h, i8+1 );
    if( i_pixel == PIXEL_8x16 )
        x264_macroblock_encode_p8x8( h, i8+2 );

Fiona Glaser's avatar
Fiona Glaser committed
233 234
    chromassd = ssd_plane( h, i_pixel+3, 1, (i8&1)*4, (i8>>1)*4 )
              + ssd_plane( h, i_pixel+3, 2, (i8&1)*4, (i8>>1)*4 );
235
    chromassd = ((uint64_t)chromassd * h->mb.i_chroma_lambda2_offset + 128) >> 8;
Fiona Glaser's avatar
Fiona Glaser committed
236
    i_ssd = ssd_plane( h, i_pixel,   0, (i8&1)*8, (i8>>1)*8 ) + chromassd;
237 238 239

    if( h->param.b_cabac )
    {
Fiona Glaser's avatar
Fiona Glaser committed
240
        x264_cabac_t cabac_tmp;
241
        COPY_CABAC;
242
        x264_partition_size_cabac( h, &cabac_tmp, i8, i_pixel );
243
        i_bits = ( (uint64_t)cabac_tmp.f8_bits_encoded * i_lambda2 + 128 ) >> 8;
244 245 246
    }
    else
    {
247
        i_bits = x264_partition_size_cavlc( h, i8, i_pixel ) * i_lambda2;
248 249
    }

250
    return (i_ssd<<8) + i_bits;
251 252
}

Loic Le Loarer's avatar
Loic Le Loarer committed
253
static uint64_t x264_rd_cost_i8x8( x264_t *h, int i_lambda2, int i8, int i_mode )
254
{
255
    uint64_t i_ssd, i_bits;
256
    h->mb.i_cbp_luma &= ~(1<<i8);
257
    h->mb.b_transform_8x8 = 1;
258 259 260 261 262 263

    x264_mb_encode_i8x8( h, i8, h->mb.i_qp );
    i_ssd = ssd_plane( h, PIXEL_8x8, 0, (i8&1)*8, (i8>>1)*8 );

    if( h->param.b_cabac )
    {
Fiona Glaser's avatar
Fiona Glaser committed
264
        x264_cabac_t cabac_tmp;
265
        COPY_CABAC;
266
        x264_partition_i8x8_size_cabac( h, &cabac_tmp, i8, i_mode );
267
        i_bits = ( (uint64_t)cabac_tmp.f8_bits_encoded * i_lambda2 + 128 ) >> 8;
268 269 270
    }
    else
    {
271
        i_bits = x264_partition_i8x8_size_cavlc( h, i8, i_mode ) * i_lambda2;
272 273
    }

274
    return (i_ssd<<8) + i_bits;
275 276
}

Loic Le Loarer's avatar
Loic Le Loarer committed
277
static uint64_t x264_rd_cost_i4x4( x264_t *h, int i_lambda2, int i4, int i_mode )
278
{
279
    uint64_t i_ssd, i_bits;
280 281 282 283 284 285

    x264_mb_encode_i4x4( h, i4, h->mb.i_qp );
    i_ssd = ssd_plane( h, PIXEL_4x4, 0, block_idx_x[i4]*4, block_idx_y[i4]*4 );

    if( h->param.b_cabac )
    {
Fiona Glaser's avatar
Fiona Glaser committed
286
        x264_cabac_t cabac_tmp;
287
        COPY_CABAC;
288
        x264_partition_i4x4_size_cabac( h, &cabac_tmp, i4, i_mode );
289
        i_bits = ( (uint64_t)cabac_tmp.f8_bits_encoded * i_lambda2 + 128 ) >> 8;
290 291 292
    }
    else
    {
293
        i_bits = x264_partition_i4x4_size_cavlc( h, i4, i_mode ) * i_lambda2;
294 295
    }

296
    return (i_ssd<<8) + i_bits;
297
}
Loren Merritt's avatar
Loren Merritt committed
298

Loic Le Loarer's avatar
Loic Le Loarer committed
299
static uint64_t x264_rd_cost_i8x8_chroma( x264_t *h, int i_lambda2, int i_mode, int b_dct )
300
{
301
    uint64_t i_ssd, i_bits;
302 303

    if( b_dct )
304
        x264_mb_encode_8x8_chroma( h, 0, h->mb.i_chroma_qp );
305 306 307 308 309 310 311
    i_ssd = ssd_plane( h, PIXEL_8x8, 1, 0, 0 ) +
            ssd_plane( h, PIXEL_8x8, 2, 0, 0 );

    h->mb.i_chroma_pred_mode = i_mode;

    if( h->param.b_cabac )
    {
Fiona Glaser's avatar
Fiona Glaser committed
312
        x264_cabac_t cabac_tmp;
313
        COPY_CABAC;
314
        x264_i8x8_chroma_size_cabac( h, &cabac_tmp );
315
        i_bits = ( (uint64_t)cabac_tmp.f8_bits_encoded * i_lambda2 + 128 ) >> 8;
316 317 318
    }
    else
    {
319
        i_bits = x264_i8x8_chroma_size_cavlc( h ) * i_lambda2;
320 321
    }

322
    return (i_ssd<<8) + i_bits;
323
}
Loren Merritt's avatar
Loren Merritt committed
324 325 326 327
/****************************************************************************
 * Trellis RD quantization
 ****************************************************************************/

Loren Merritt's avatar
Loren Merritt committed
328
#define TRELLIS_SCORE_MAX ((uint64_t)1<<50)
Loren Merritt's avatar
Loren Merritt committed
329 330 331 332
#define CABAC_SIZE_BITS 8
#define SSD_WEIGHT_BITS 5
#define LAMBDA_BITS 4

333
/* precalculate the cost of coding various combinations of bits in a single context */
Loic Le Loarer's avatar
Loic Le Loarer committed
334
void x264_rdo_init( void )
Loren Merritt's avatar
Loren Merritt committed
335
{
336
    int i_prefix, i_ctx, i;
Loren Merritt's avatar
Loren Merritt committed
337 338 339 340 341 342 343 344 345 346 347 348 349
    for( i_prefix = 0; i_prefix < 15; i_prefix++ )
    {
        for( i_ctx = 0; i_ctx < 128; i_ctx++ )
        {
            int f8_bits = 0;
            uint8_t ctx = i_ctx;

            for( i = 1; i < i_prefix; i++ )
                f8_bits += x264_cabac_size_decision2( &ctx, 1 );
            if( i_prefix > 0 && i_prefix < 14 )
                f8_bits += x264_cabac_size_decision2( &ctx, 0 );
            f8_bits += 1 << CABAC_SIZE_BITS; //sign

350 351
            cabac_size_unary[i_prefix][i_ctx] = f8_bits;
            cabac_transition_unary[i_prefix][i_ctx] = ctx;
Loren Merritt's avatar
Loren Merritt committed
352 353
        }
    }
354 355 356 357 358 359 360 361 362 363 364 365
    for( i_ctx = 0; i_ctx < 128; i_ctx++ )
    {
        int f8_bits = 0;
        uint8_t ctx = i_ctx;

        for( i = 0; i < 5; i++ )
            f8_bits += x264_cabac_size_decision2( &ctx, 1 );
        f8_bits += 1 << CABAC_SIZE_BITS; //sign

        cabac_size_5ones[i_ctx] = f8_bits;
        cabac_transition_5ones[i_ctx] = ctx;
    }
Loren Merritt's avatar
Loren Merritt committed
366 367 368
}

typedef struct {
369
    int64_t score;
Loren Merritt's avatar
Loren Merritt committed
370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393
    int level_idx; // index into level_tree[]
    uint8_t cabac_state[10]; //just the contexts relevant to coding abs_level_m1
} trellis_node_t;

// TODO:
// save cabac state between blocks?
// use trellis' RD score instead of x264_mb_decimate_score?
// code 8x8 sig/last flags forwards with deadzone and save the contexts at
//   each position?
// change weights when using CQMs?

// possible optimizations:
// make scores fit in 32bit
// save quantized coefs during rd, to avoid a duplicate trellis in the final encode
// if trellissing all MBRD modes, finish SSD calculation so we can skip all of
//   the normal dequant/idct/ssd/cabac

// the unquant_mf here is not the same as dequant_mf:
// in normal operation (dct->quant->dequant->idct) the dct and idct are not
// normalized. quant/dequant absorb those scaling factors.
// in this function, we just do (quant->unquant) and want the output to be
// comparable to the input. so unquant is the direct inverse of quant,
// and uses the dct scaling factors, not the idct ones.

394
static ALWAYS_INLINE int quant_trellis_cabac( x264_t *h, int16_t *dct,
Loren Merritt's avatar
Loren Merritt committed
395
                                 const uint16_t *quant_mf, const int *unquant_mf,
396
                                 const int *coef_weight, const uint8_t *zigzag,
397
                                 int i_ctxBlockCat, int i_lambda2, int b_ac, int dc, int i_coefs, int idx )
Loren Merritt's avatar
Loren Merritt committed
398 399 400 401 402 403
{
    int abs_coefs[64], signs[64];
    trellis_node_t nodes[2][8];
    trellis_node_t *nodes_cur = nodes[0];
    trellis_node_t *nodes_prev = nodes[1];
    trellis_node_t *bnode;
404
    const int b_interlaced = h->mb.b_interlaced;
405 406
    uint8_t *cabac_state_sig = &h->cabac.state[ significant_coeff_flag_offset[b_interlaced][i_ctxBlockCat] ];
    uint8_t *cabac_state_last = &h->cabac.state[ last_coeff_flag_offset[b_interlaced][i_ctxBlockCat] ];
Loren Merritt's avatar
Loren Merritt committed
407
    const int f = 1 << 15; // no deadzone
408
    int i_last_nnz;
409
    int i, j;
Loren Merritt's avatar
Loren Merritt committed
410 411 412 413 414 415 416 417 418 419 420

    // (# of coefs) * (# of ctx) * (# of levels tried) = 1024
    // we don't need to keep all of those: (# of coefs) * (# of ctx) would be enough,
    // but it takes more time to remove dead states than you gain in reduced memory.
    struct {
        uint16_t abs_level;
        uint16_t next;
    } level_tree[64*8*2];
    int i_levels_used = 1;

    /* init coefs */
421
    for( i = i_coefs-1; i >= b_ac; i-- )
422
        if( (unsigned)(dct[zigzag[i]] * (dc?quant_mf[0]>>1:quant_mf[zigzag[i]]) + f-1) >= 2*f )
423
            break;
Loren Merritt's avatar
Loren Merritt committed
424

425
    if( i < b_ac )
Loren Merritt's avatar
Loren Merritt committed
426
    {
427 428 429 430
        /* We only need to memset an empty 4x4 block.  8x8 can be
           implicitly emptied via zero nnz, as can dc. */
        if( i_coefs == 16 && !dc )
            memset( dct, 0, 16 * sizeof(int16_t) );
431
        return 0;
Loren Merritt's avatar
Loren Merritt committed
432 433
    }

434 435 436 437 438 439 440 441 442
    i_last_nnz = i;

    for( ; i >= b_ac; i-- )
    {
        int coef = dct[zigzag[i]];
        abs_coefs[i] = abs(coef);
        signs[i] = coef < 0 ? -1 : 1;
    }

Loren Merritt's avatar
Loren Merritt committed
443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463
    /* init trellis */
    for( i = 1; i < 8; i++ )
        nodes_cur[i].score = TRELLIS_SCORE_MAX;
    nodes_cur[0].score = 0;
    nodes_cur[0].level_idx = 0;
    level_tree[0].abs_level = 0;
    level_tree[0].next = 0;

    // coefs are processed in reverse order, because that's how the abs value is coded.
    // last_coef and significant_coef flags are normally coded in forward order, but
    // we have to reverse them to match the levels.
    // in 4x4 blocks, last_coef and significant_coef use a separate context for each
    // position, so the order doesn't matter, and we don't even have to update their contexts.
    // in 8x8 blocks, some positions share contexts, so we'll just have to hope that
    // cabac isn't too sensitive.

    memcpy( nodes_cur[0].cabac_state, &h->cabac.state[ coeff_abs_level_m1_offset[i_ctxBlockCat] ], 10 );

    for( i = i_last_nnz; i >= b_ac; i-- )
    {
        int i_coef = abs_coefs[i];
464
        int q = ( f + i_coef * (dc?quant_mf[0]>>1:quant_mf[zigzag[i]]) ) >> 16;
Loren Merritt's avatar
Loren Merritt committed
465 466 467 468 469 470 471 472 473
        int abs_level;
        int cost_sig[2], cost_last[2];
        trellis_node_t n;

        // skip 0s: this doesn't affect the output, but saves some unnecessary computation.
        if( q == 0 )
        {
            // no need to calculate ssd of 0s: it's the same in all nodes.
            // no need to modify level_tree for ctx=0: it starts with an infinite loop of 0s.
474 475
            int sigindex = i_coefs == 64 ? significant_coeff_flag_offset_8x8[b_interlaced][i] : i;
            const uint32_t cost_sig0 = x264_cabac_size_decision_noup2( &cabac_state_sig[sigindex], 0 )
476
                                     * (uint64_t)i_lambda2 >> ( CABAC_SIZE_BITS - LAMBDA_BITS );
Loren Merritt's avatar
Loren Merritt committed
477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500
            for( j = 1; j < 8; j++ )
            {
                if( nodes_cur[j].score != TRELLIS_SCORE_MAX )
                {
#define SET_LEVEL(n,l) \
                    level_tree[i_levels_used].abs_level = l; \
                    level_tree[i_levels_used].next = n.level_idx; \
                    n.level_idx = i_levels_used; \
                    i_levels_used++;

                    SET_LEVEL( nodes_cur[j], 0 );
                    nodes_cur[j].score += cost_sig0;
                }
            }
            continue;
        }

        XCHG( trellis_node_t*, nodes_cur, nodes_prev );

        for( j = 0; j < 8; j++ )
            nodes_cur[j].score = TRELLIS_SCORE_MAX;

        if( i < i_coefs-1 )
        {
501 502 503 504 505 506
            int sigindex = i_coefs == 64 ? significant_coeff_flag_offset_8x8[b_interlaced][i] : i;
            int lastindex = i_coefs == 64 ? last_coeff_flag_offset_8x8[i] : i;
            cost_sig[0] = x264_cabac_size_decision_noup2( &cabac_state_sig[sigindex], 0 );
            cost_sig[1] = x264_cabac_size_decision_noup2( &cabac_state_sig[sigindex], 1 );
            cost_last[0] = x264_cabac_size_decision_noup2( &cabac_state_last[lastindex], 0 );
            cost_last[1] = x264_cabac_size_decision_noup2( &cabac_state_last[lastindex], 1 );
Loren Merritt's avatar
Loren Merritt committed
507 508 509 510 511 512 513 514 515 516 517 518 519
        }
        else
        {
            cost_sig[0] = cost_sig[1] = 0;
            cost_last[0] = cost_last[1] = 0;
        }

        // there are a few cases where increasing the coeff magnitude helps,
        // but it's only around .003 dB, and skipping them ~doubles the speed of trellis.
        // could also try q-2: that sometimes helps, but also sometimes decimates blocks
        // that are better left coded, especially at QP > 40.
        for( abs_level = q; abs_level >= q-1; abs_level-- )
        {
520
            int unquant_abs_level = (((dc?unquant_mf[0]<<1:unquant_mf[zigzag[i]]) * abs_level + 128) >> 8);
521 522 523
            int d = i_coef - unquant_abs_level;
            int64_t ssd;
            /* Psy trellis: bias in favor of higher AC coefficients in the reconstructed frame. */
524
            if( h->mb.i_psy_trellis && i && !dc && i_ctxBlockCat != DCT_CHROMA_AC )
525
            {
Fiona Glaser's avatar
Fiona Glaser committed
526
                int orig_coef = (i_coefs == 64) ? h->mb.pic.fenc_dct8[idx][zigzag[i]] : h->mb.pic.fenc_dct4[idx][zigzag[i]];
527 528 529 530 531 532
                int predicted_coef = orig_coef - i_coef * signs[i];
                int psy_value = h->mb.i_psy_trellis * abs(predicted_coef + unquant_abs_level * signs[i]);
                int psy_weight = (i_coefs == 64) ? x264_dct8_weight_tab[zigzag[i]] : x264_dct4_weight_tab[zigzag[i]];
                ssd = (int64_t)d*d * coef_weight[i] - psy_weight * psy_value;
            }
            else
533 534
            /* FIXME: for i16x16 dc is this weight optimal? */
                ssd = (int64_t)d*d * (dc?256:coef_weight[i]);
Loren Merritt's avatar
Loren Merritt committed
535 536 537 538 539 540 541 542 543 544 545

            for( j = 0; j < 8; j++ )
            {
                int node_ctx = j;
                if( nodes_prev[j].score == TRELLIS_SCORE_MAX )
                    continue;
                n = nodes_prev[j];

                /* code the proposed level, and count how much entropy it would take */
                if( abs_level || node_ctx )
                {
546
                    unsigned f8_bits = cost_sig[ abs_level != 0 ];
Loren Merritt's avatar
Loren Merritt committed
547 548 549 550 551 552 553 554
                    if( abs_level )
                    {
                        const int i_prefix = X264_MIN( abs_level - 1, 14 );
                        f8_bits += cost_last[ node_ctx == 0 ];
                        f8_bits += x264_cabac_size_decision2( &n.cabac_state[coeff_abs_level1_ctx[node_ctx]], i_prefix > 0 );
                        if( i_prefix > 0 )
                        {
                            uint8_t *ctx = &n.cabac_state[coeff_abs_levelgt1_ctx[node_ctx]];
555 556
                            f8_bits += cabac_size_unary[i_prefix][*ctx];
                            *ctx = cabac_transition_unary[i_prefix][*ctx];
Loren Merritt's avatar
Loren Merritt committed
557
                            if( abs_level >= 15 )
558
                                f8_bits += bs_size_ue_big( abs_level - 15 ) << CABAC_SIZE_BITS;
Loren Merritt's avatar
Loren Merritt committed
559 560 561 562 563 564 565 566
                            node_ctx = coeff_abs_level_transition[1][node_ctx];
                        }
                        else
                        {
                            f8_bits += 1 << CABAC_SIZE_BITS;
                            node_ctx = coeff_abs_level_transition[0][node_ctx];
                        }
                    }
567
                    n.score += (uint64_t)f8_bits * i_lambda2 >> ( CABAC_SIZE_BITS - LAMBDA_BITS );
Loren Merritt's avatar
Loren Merritt committed
568 569
                }

570 571 572 573 574 575 576 577
                if( j || i || dc )
                    n.score += ssd;
                /* Optimize rounding for DC coefficients in DC-only luma 4x4/8x8 blocks. */
                else
                {
                    d = i_coef * signs[0] - ((unquant_abs_level * signs[0] + 8)&~15);
                    n.score += (int64_t)d*d * coef_weight[i];
                }
Loren Merritt's avatar
Loren Merritt committed
578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594

                /* save the node if it's better than any existing node with the same cabac ctx */
                if( n.score < nodes_cur[node_ctx].score )
                {
                    SET_LEVEL( n, abs_level );
                    nodes_cur[node_ctx] = n;
                }
            }
        }
    }

    /* output levels from the best path through the trellis */
    bnode = &nodes_cur[0];
    for( j = 1; j < 8; j++ )
        if( nodes_cur[j].score < bnode->score )
            bnode = &nodes_cur[j];

595 596 597 598 599 600 601
    if( bnode == &nodes_cur[0] )
    {
        if( i_coefs == 16 && !dc )
            memset( dct, 0, 16 * sizeof(int16_t) );
        return 0;
    }

Loren Merritt's avatar
Loren Merritt committed
602
    j = bnode->level_idx;
603
    for( i = b_ac; j; i++ )
Loren Merritt's avatar
Loren Merritt committed
604 605 606 607
    {
        dct[zigzag[i]] = level_tree[j].abs_level * signs[i];
        j = level_tree[j].next;
    }
608 609
    for( ; i < i_coefs; i++ )
        dct[zigzag[i]] = 0;
610 611

    return 1;
Loren Merritt's avatar
Loren Merritt committed
612 613
}

614 615
const static uint8_t x264_zigzag_scan2[4] = {0,1,2,3};

616
int x264_quant_dc_trellis( x264_t *h, int16_t *dct, int i_quant_cat,
Fiona Glaser's avatar
Fiona Glaser committed
617
                            int i_qp, int i_ctxBlockCat, int b_intra, int b_chroma )
618
{
Loren Merritt's avatar
Loren Merritt committed
619
    return quant_trellis_cabac( h, dct,
620 621
        h->quant4_mf[i_quant_cat][i_qp], h->unquant4_mf[i_quant_cat][i_qp],
        NULL, i_ctxBlockCat==DCT_CHROMA_DC ? x264_zigzag_scan2 : x264_zigzag_scan4[h->mb.b_interlaced],
Fiona Glaser's avatar
Fiona Glaser committed
622
        i_ctxBlockCat, h->mb.i_trellis_lambda2[b_chroma][b_intra], 0, 1, i_ctxBlockCat==DCT_CHROMA_DC ? 4 : 16, 0 );
623
}
Loren Merritt's avatar
Loren Merritt committed
624

Loren Merritt's avatar
Loren Merritt committed
625
int x264_quant_4x4_trellis( x264_t *h, int16_t *dct, int i_quant_cat,
Fiona Glaser's avatar
Fiona Glaser committed
626
                             int i_qp, int i_ctxBlockCat, int b_intra, int b_chroma, int idx )
Loren Merritt's avatar
Loren Merritt committed
627
{
628
    int b_ac = (i_ctxBlockCat == DCT_LUMA_AC || i_ctxBlockCat == DCT_CHROMA_AC);
Loren Merritt's avatar
Loren Merritt committed
629
    return quant_trellis_cabac( h, dct,
Loren Merritt's avatar
Loren Merritt committed
630
        h->quant4_mf[i_quant_cat][i_qp], h->unquant4_mf[i_quant_cat][i_qp],
631 632
        x264_dct4_weight2_zigzag[h->mb.b_interlaced],
        x264_zigzag_scan4[h->mb.b_interlaced],
Fiona Glaser's avatar
Fiona Glaser committed
633
        i_ctxBlockCat, h->mb.i_trellis_lambda2[b_chroma][b_intra], b_ac, 0, 16, idx );
Loren Merritt's avatar
Loren Merritt committed
634 635
}

Loren Merritt's avatar
Loren Merritt committed
636
int x264_quant_8x8_trellis( x264_t *h, int16_t *dct, int i_quant_cat,
637
                             int i_qp, int b_intra, int idx )
Loren Merritt's avatar
Loren Merritt committed
638
{
Loren Merritt's avatar
Loren Merritt committed
639
    return quant_trellis_cabac( h, dct,
Loren Merritt's avatar
Loren Merritt committed
640
        h->quant8_mf[i_quant_cat][i_qp], h->unquant8_mf[i_quant_cat][i_qp],
641 642
        x264_dct8_weight2_zigzag[h->mb.b_interlaced],
        x264_zigzag_scan8[h->mb.b_interlaced],
Fiona Glaser's avatar
Fiona Glaser committed
643
        DCT_LUMA_8x8, h->mb.i_trellis_lambda2[0][b_intra], 0, 0, 64, idx );
Loren Merritt's avatar
Loren Merritt committed
644 645
}