Commit dd354db4, authored by Loren Merritt, committed by Fiona Glaser

CABAC trellis opts part 3: make some arrays non-static

parent 4abcf60a
......@@ -36,6 +36,70 @@
# include "arm/dct.h"
#endif
/* Per-coefficient weights that undo the scaling introduced by the 8x8 forward DCT.
 * Entries are uint32_t because the asm implementation of trellis wants that width;
 * the values themselves all fit in a uint16_t. */
#define DCT8_W0 FIX8(1.0000)
#define DCT8_W1 FIX8(0.8859)
#define DCT8_W2 FIX8(1.6000)
#define DCT8_W3 FIX8(0.9415)
#define DCT8_W4 FIX8(1.2651)
#define DCT8_W5 FIX8(1.1910)
/* Only three distinct rows occur; they repeat down the block as A B C B. */
#define DCT8_ROW_A DCT8_W0, DCT8_W3, DCT8_W4, DCT8_W3, DCT8_W0, DCT8_W3, DCT8_W4, DCT8_W3
#define DCT8_ROW_B DCT8_W3, DCT8_W1, DCT8_W5, DCT8_W1, DCT8_W3, DCT8_W1, DCT8_W5, DCT8_W1
#define DCT8_ROW_C DCT8_W4, DCT8_W5, DCT8_W2, DCT8_W5, DCT8_W4, DCT8_W5, DCT8_W2, DCT8_W5
const uint32_t x264_dct8_weight_tab[64] = {
    DCT8_ROW_A,
    DCT8_ROW_B,
    DCT8_ROW_C,
    DCT8_ROW_B,
    DCT8_ROW_A,
    DCT8_ROW_B,
    DCT8_ROW_C,
    DCT8_ROW_B
};
#undef DCT8_ROW_C
#undef DCT8_ROW_B
#undef DCT8_ROW_A
#undef DCT8_W5
#undef DCT8_W4
#undef DCT8_W3
#undef DCT8_W2
#undef DCT8_W1
#undef DCT8_W0
/* Per-coefficient weights for 4x4 blocks (presumably the 4x4 counterpart of the
 * 8x8 weight table -- confirm against the 4x4 fdct scaling). */
#define DCT4_W0 FIX8(1.76777)
#define DCT4_W1 FIX8(1.11803)
#define DCT4_W2 FIX8(0.70711)
/* Two distinct rows, alternating down the block. */
#define DCT4_ROW_EVEN DCT4_W0, DCT4_W1, DCT4_W0, DCT4_W1
#define DCT4_ROW_ODD  DCT4_W1, DCT4_W2, DCT4_W1, DCT4_W2
const uint32_t x264_dct4_weight_tab[16] = {
    DCT4_ROW_EVEN,
    DCT4_ROW_ODD,
    DCT4_ROW_EVEN,
    DCT4_ROW_ODD
};
#undef DCT4_ROW_ODD
#undef DCT4_ROW_EVEN
#undef DCT4_W2
#undef DCT4_W1
#undef DCT4_W0
/* Squared form of the 4x4 inverse scaling weights
 * (3.125, 1.25 and 0.5 are the squares of 1.76777, 1.11803 and 0.70711). */
#define DCT4_SQ0 FIX8(3.125)
#define DCT4_SQ1 FIX8(1.25)
#define DCT4_SQ2 FIX8(0.5)
/* Two distinct rows, alternating down the block. */
#define DCT4_SQ_ROW_EVEN DCT4_SQ0, DCT4_SQ1, DCT4_SQ0, DCT4_SQ1
#define DCT4_SQ_ROW_ODD  DCT4_SQ1, DCT4_SQ2, DCT4_SQ1, DCT4_SQ2
const uint32_t x264_dct4_weight2_tab[16] = {
    DCT4_SQ_ROW_EVEN,
    DCT4_SQ_ROW_ODD,
    DCT4_SQ_ROW_EVEN,
    DCT4_SQ_ROW_ODD
};
#undef DCT4_SQ_ROW_ODD
#undef DCT4_SQ_ROW_EVEN
#undef DCT4_SQ2
#undef DCT4_SQ1
#undef DCT4_SQ0
/* Squared form of the 8x8 inverse scaling weights; each factor is approximately
 * the square of the matching x264_dct8_weight_tab factor. */
#define DCT8_SQ0 FIX8(1.00000)
#define DCT8_SQ1 FIX8(0.78487)
#define DCT8_SQ2 FIX8(2.56132)
#define DCT8_SQ3 FIX8(0.88637)
#define DCT8_SQ4 FIX8(1.60040)
#define DCT8_SQ5 FIX8(1.41850)
/* Only three distinct rows occur; they repeat down the block as A B C B. */
#define DCT8_SQ_ROW_A DCT8_SQ0, DCT8_SQ3, DCT8_SQ4, DCT8_SQ3, DCT8_SQ0, DCT8_SQ3, DCT8_SQ4, DCT8_SQ3
#define DCT8_SQ_ROW_B DCT8_SQ3, DCT8_SQ1, DCT8_SQ5, DCT8_SQ1, DCT8_SQ3, DCT8_SQ1, DCT8_SQ5, DCT8_SQ1
#define DCT8_SQ_ROW_C DCT8_SQ4, DCT8_SQ5, DCT8_SQ2, DCT8_SQ5, DCT8_SQ4, DCT8_SQ5, DCT8_SQ2, DCT8_SQ5
const uint32_t x264_dct8_weight2_tab[64] = {
    DCT8_SQ_ROW_A,
    DCT8_SQ_ROW_B,
    DCT8_SQ_ROW_C,
    DCT8_SQ_ROW_B,
    DCT8_SQ_ROW_A,
    DCT8_SQ_ROW_B,
    DCT8_SQ_ROW_C,
    DCT8_SQ_ROW_B
};
#undef DCT8_SQ_ROW_C
#undef DCT8_SQ_ROW_B
#undef DCT8_SQ_ROW_A
#undef DCT8_SQ5
#undef DCT8_SQ4
#undef DCT8_SQ3
#undef DCT8_SQ2
#undef DCT8_SQ1
#undef DCT8_SQ0
static void dct4x4dc( dctcoef d[16] )
{
dctcoef tmp[16];
......
......@@ -26,67 +26,10 @@
#ifndef X264_DCT_H
#define X264_DCT_H
/* Per-coefficient weights that undo the scaling introduced by the 8x8 forward DCT. */
#define DCT8_W0 FIX8(1.0000)
#define DCT8_W1 FIX8(0.8859)
#define DCT8_W2 FIX8(1.6000)
#define DCT8_W3 FIX8(0.9415)
#define DCT8_W4 FIX8(1.2651)
#define DCT8_W5 FIX8(1.1910)
/* Only three distinct rows occur; they repeat down the block as A B C B. */
#define DCT8_ROW_A DCT8_W0, DCT8_W3, DCT8_W4, DCT8_W3, DCT8_W0, DCT8_W3, DCT8_W4, DCT8_W3
#define DCT8_ROW_B DCT8_W3, DCT8_W1, DCT8_W5, DCT8_W1, DCT8_W3, DCT8_W1, DCT8_W5, DCT8_W1
#define DCT8_ROW_C DCT8_W4, DCT8_W5, DCT8_W2, DCT8_W5, DCT8_W4, DCT8_W5, DCT8_W2, DCT8_W5
static const uint16_t x264_dct8_weight_tab[64] = {
    DCT8_ROW_A,
    DCT8_ROW_B,
    DCT8_ROW_C,
    DCT8_ROW_B,
    DCT8_ROW_A,
    DCT8_ROW_B,
    DCT8_ROW_C,
    DCT8_ROW_B
};
#undef DCT8_ROW_C
#undef DCT8_ROW_B
#undef DCT8_ROW_A
#undef DCT8_W5
#undef DCT8_W4
#undef DCT8_W3
#undef DCT8_W2
#undef DCT8_W1
#undef DCT8_W0
/* Per-coefficient weights for 4x4 blocks (presumably the 4x4 counterpart of the
 * 8x8 weight table -- confirm against the 4x4 fdct scaling). */
#define DCT4_W0 FIX8(1.76777)
#define DCT4_W1 FIX8(1.11803)
#define DCT4_W2 FIX8(0.70711)
/* Two distinct rows, alternating down the block. */
#define DCT4_ROW_EVEN DCT4_W0, DCT4_W1, DCT4_W0, DCT4_W1
#define DCT4_ROW_ODD  DCT4_W1, DCT4_W2, DCT4_W1, DCT4_W2
static const uint16_t x264_dct4_weight_tab[16] = {
    DCT4_ROW_EVEN,
    DCT4_ROW_ODD,
    DCT4_ROW_EVEN,
    DCT4_ROW_ODD
};
#undef DCT4_ROW_ODD
#undef DCT4_ROW_EVEN
#undef DCT4_W2
#undef DCT4_W1
#undef DCT4_W0
/* Squared form of the 4x4 inverse scaling weights
 * (3.125, 1.25 and 0.5 are the squares of 1.76777, 1.11803 and 0.70711). */
#define DCT4_SQ0 FIX8(3.125)
#define DCT4_SQ1 FIX8(1.25)
#define DCT4_SQ2 FIX8(0.5)
/* Two distinct rows, alternating down the block. */
#define DCT4_SQ_ROW_EVEN DCT4_SQ0, DCT4_SQ1, DCT4_SQ0, DCT4_SQ1
#define DCT4_SQ_ROW_ODD  DCT4_SQ1, DCT4_SQ2, DCT4_SQ1, DCT4_SQ2
static const uint16_t x264_dct4_weight2_tab[16] = {
    DCT4_SQ_ROW_EVEN,
    DCT4_SQ_ROW_ODD,
    DCT4_SQ_ROW_EVEN,
    DCT4_SQ_ROW_ODD
};
#undef DCT4_SQ_ROW_ODD
#undef DCT4_SQ_ROW_EVEN
#undef DCT4_SQ2
#undef DCT4_SQ1
#undef DCT4_SQ0
/* Squared form of the 8x8 inverse scaling weights; each factor is approximately
 * the square of the matching x264_dct8_weight_tab factor. */
#define DCT8_SQ0 FIX8(1.00000)
#define DCT8_SQ1 FIX8(0.78487)
#define DCT8_SQ2 FIX8(2.56132)
#define DCT8_SQ3 FIX8(0.88637)
#define DCT8_SQ4 FIX8(1.60040)
#define DCT8_SQ5 FIX8(1.41850)
/* Only three distinct rows occur; they repeat down the block as A B C B. */
#define DCT8_SQ_ROW_A DCT8_SQ0, DCT8_SQ3, DCT8_SQ4, DCT8_SQ3, DCT8_SQ0, DCT8_SQ3, DCT8_SQ4, DCT8_SQ3
#define DCT8_SQ_ROW_B DCT8_SQ3, DCT8_SQ1, DCT8_SQ5, DCT8_SQ1, DCT8_SQ3, DCT8_SQ1, DCT8_SQ5, DCT8_SQ1
#define DCT8_SQ_ROW_C DCT8_SQ4, DCT8_SQ5, DCT8_SQ2, DCT8_SQ5, DCT8_SQ4, DCT8_SQ5, DCT8_SQ2, DCT8_SQ5
static const uint16_t x264_dct8_weight2_tab[64] = {
    DCT8_SQ_ROW_A,
    DCT8_SQ_ROW_B,
    DCT8_SQ_ROW_C,
    DCT8_SQ_ROW_B,
    DCT8_SQ_ROW_A,
    DCT8_SQ_ROW_B,
    DCT8_SQ_ROW_C,
    DCT8_SQ_ROW_B
};
#undef DCT8_SQ_ROW_C
#undef DCT8_SQ_ROW_B
#undef DCT8_SQ_ROW_A
#undef DCT8_SQ5
#undef DCT8_SQ4
#undef DCT8_SQ3
#undef DCT8_SQ2
#undef DCT8_SQ1
#undef DCT8_SQ0
/* DCT weight tables shared across translation units; this header only declares
 * them. The *_weight_tab arrays hold the inverse fdct scaling factors and the
 * *_weight2_tab arrays their squared form. Elements are uint32_t for the asm
 * implementation of trellis, although the values fit in uint16_t. */
extern const uint32_t x264_dct4_weight_tab[16];
extern const uint32_t x264_dct8_weight_tab[64];
extern const uint32_t x264_dct4_weight2_tab[16];
extern const uint32_t x264_dct8_weight2_tab[64];
typedef struct
{
......
......@@ -294,8 +294,8 @@ static ALWAYS_INLINE int x264_cabac_mvd_cpn( x264_t *h, x264_cabac_t *cb, int i_
x264_cabac_encode_decision( cb, ctxbase + 5, 1 );
if( i_abs < 9 )
{
cb->f8_bits_encoded += cabac_size_unary[i_abs - 3][cb->state[ctxbase+6]];
cb->state[ctxbase+6] = cabac_transition_unary[i_abs - 3][cb->state[ctxbase+6]];
cb->f8_bits_encoded += x264_cabac_size_unary[i_abs - 3][cb->state[ctxbase+6]];
cb->state[ctxbase+6] = x264_cabac_transition_unary[i_abs - 3][cb->state[ctxbase+6]];
}
else
{
......@@ -658,7 +658,12 @@ static const uint16_t coeff_abs_level_m1_offset[14] =
{
227+0, 227+10, 227+20, 227+30, 227+39, 426, 952+0, 952+10, 952+20, 708, 982+0, 982+10, 982+20, 766
};
static const uint8_t significant_coeff_flag_offset_8x8[2][63] =
#if RDO_SKIP_BS
extern const uint8_t x264_significant_coeff_flag_offset_8x8[2][63];
extern const uint8_t x264_last_coeff_flag_offset_8x8[63];
extern const uint8_t x264_coeff_flag_offset_chroma_422_dc[7];
#else
const uint8_t x264_significant_coeff_flag_offset_8x8[2][63] =
{{
0, 1, 2, 3, 4, 5, 5, 4, 4, 3, 3, 4, 4, 4, 5, 5,
4, 4, 4, 4, 3, 3, 6, 7, 7, 7, 8, 9,10, 9, 8, 7,
......@@ -670,14 +675,15 @@ static const uint8_t significant_coeff_flag_offset_8x8[2][63] =
9, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,13,13, 9,
9,10,10, 8,13,13, 9, 9,10,10,14,14,14,14,14
}};
static const uint8_t last_coeff_flag_offset_8x8[63] =
/* Context offset for the "last coefficient" flag of 8x8 blocks: indexed by the
 * coefficient index (0..62) and added to the last-flag context base by the
 * residual coder. Defined only when RDO_SKIP_BS is not set; the RDO_SKIP_BS
 * branch declares it extern instead. */
const uint8_t x264_last_coeff_flag_offset_8x8[63] =
{
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8
};
static const uint8_t coeff_flag_offset_chroma_422_dc[7] = { 0, 0, 1, 1, 2, 2, 2 }; /* MIN( i/2, 2 ) */
/* Context offset used when chroma422dc is set; the same table serves both the
 * significant-coefficient and the last-coefficient flag. Indexed by coefficient
 * index 0..6. */
const uint8_t x264_coeff_flag_offset_chroma_422_dc[7] = { 0, 0, 1, 1, 2, 2, 2 }; /* MIN( i/2, 2 ) */
#endif
// node ctx: 0..3: abslevel1 (with abslevelgt1 == 0).
// 4..7: abslevelgt1 + 3 (and abslevel1 doesn't matter).
......@@ -737,15 +743,15 @@ static ALWAYS_INLINE void x264_cabac_block_residual_internal( x264_t *h, x264_ca
if( chroma422dc )
{
int count_m1 = 7;
WRITE_SIGMAP( coeff_flag_offset_chroma_422_dc[i], coeff_flag_offset_chroma_422_dc[i] )
WRITE_SIGMAP( x264_coeff_flag_offset_chroma_422_dc[i], x264_coeff_flag_offset_chroma_422_dc[i] )
}
else
{
int count_m1 = count_cat_m1[ctx_block_cat];
if( count_m1 == 63 )
{
const uint8_t *sig_offset = significant_coeff_flag_offset_8x8[MB_INTERLACED];
WRITE_SIGMAP( sig_offset[i], last_coeff_flag_offset_8x8[i] )
const uint8_t *sig_offset = x264_significant_coeff_flag_offset_8x8[MB_INTERLACED];
WRITE_SIGMAP( sig_offset[i], x264_last_coeff_flag_offset_8x8[i] )
}
else
WRITE_SIGMAP( i, i )
......@@ -799,7 +805,7 @@ static void x264_cabac_block_residual_422_dc( x264_t *h, x264_cabac_t *cb, int c
* is nearly no quality penalty for this (~0.001db) and the speed boost (~30%) is worth it. */
static void ALWAYS_INLINE x264_cabac_block_residual_internal( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l, int b_8x8, int chroma422dc )
{
const uint8_t *sig_offset = significant_coeff_flag_offset_8x8[MB_INTERLACED];
const uint8_t *sig_offset = x264_significant_coeff_flag_offset_8x8[MB_INTERLACED];
int ctx_sig = significant_coeff_flag_offset[MB_INTERLACED][ctx_block_cat];
int ctx_last = last_coeff_flag_offset[MB_INTERLACED][ctx_block_cat];
int ctx_level = coeff_abs_level_m1_offset[ctx_block_cat];
......@@ -812,9 +818,9 @@ static void ALWAYS_INLINE x264_cabac_block_residual_internal( x264_t *h, x264_ca
if( last != (b_8x8 ? 63 : chroma422dc ? 7 : count_cat_m1[ctx_block_cat]) )
{
x264_cabac_encode_decision( cb, ctx_sig + (b_8x8 ? sig_offset[last] :
chroma422dc ? coeff_flag_offset_chroma_422_dc[last] : last), 1 );
x264_cabac_encode_decision( cb, ctx_last + (b_8x8 ? last_coeff_flag_offset_8x8[last] :
chroma422dc ? coeff_flag_offset_chroma_422_dc[last] : last), 1 );
chroma422dc ? x264_coeff_flag_offset_chroma_422_dc[last] : last), 1 );
x264_cabac_encode_decision( cb, ctx_last + (b_8x8 ? x264_last_coeff_flag_offset_8x8[last] :
chroma422dc ? x264_coeff_flag_offset_chroma_422_dc[last] : last), 1 );
}
if( coeff_abs > 1 )
......@@ -823,13 +829,13 @@ static void ALWAYS_INLINE x264_cabac_block_residual_internal( x264_t *h, x264_ca
ctx = levelgt1_ctx[0] + ctx_level;
if( coeff_abs < 15 )
{
cb->f8_bits_encoded += cabac_size_unary[coeff_abs-1][cb->state[ctx]];
cb->state[ctx] = cabac_transition_unary[coeff_abs-1][cb->state[ctx]];
cb->f8_bits_encoded += x264_cabac_size_unary[coeff_abs-1][cb->state[ctx]];
cb->state[ctx] = x264_cabac_transition_unary[coeff_abs-1][cb->state[ctx]];
}
else
{
cb->f8_bits_encoded += cabac_size_unary[14][cb->state[ctx]];
cb->state[ctx] = cabac_transition_unary[14][cb->state[ctx]];
cb->f8_bits_encoded += x264_cabac_size_unary[14][cb->state[ctx]];
cb->state[ctx] = x264_cabac_transition_unary[14][cb->state[ctx]];
x264_cabac_encode_ue_bypass( cb, 0, coeff_abs - 15 );
}
node_ctx = coeff_abs_level_transition[1][0];
......@@ -847,9 +853,9 @@ static void ALWAYS_INLINE x264_cabac_block_residual_internal( x264_t *h, x264_ca
{
coeff_abs = abs(l[i]);
x264_cabac_encode_decision( cb, ctx_sig + (b_8x8 ? sig_offset[i] :
chroma422dc ? coeff_flag_offset_chroma_422_dc[i] : i), 1 );
x264_cabac_encode_decision( cb, ctx_last + (b_8x8 ? last_coeff_flag_offset_8x8[i] :
chroma422dc ? coeff_flag_offset_chroma_422_dc[i] : i), 0 );
chroma422dc ? x264_coeff_flag_offset_chroma_422_dc[i] : i), 1 );
x264_cabac_encode_decision( cb, ctx_last + (b_8x8 ? x264_last_coeff_flag_offset_8x8[i] :
chroma422dc ? x264_coeff_flag_offset_chroma_422_dc[i] : i), 0 );
ctx = coeff_abs_level1_ctx[node_ctx] + ctx_level;
if( coeff_abs > 1 )
......@@ -858,13 +864,13 @@ static void ALWAYS_INLINE x264_cabac_block_residual_internal( x264_t *h, x264_ca
ctx = levelgt1_ctx[node_ctx] + ctx_level;
if( coeff_abs < 15 )
{
cb->f8_bits_encoded += cabac_size_unary[coeff_abs-1][cb->state[ctx]];
cb->state[ctx] = cabac_transition_unary[coeff_abs-1][cb->state[ctx]];
cb->f8_bits_encoded += x264_cabac_size_unary[coeff_abs-1][cb->state[ctx]];
cb->state[ctx] = x264_cabac_transition_unary[coeff_abs-1][cb->state[ctx]];
}
else
{
cb->f8_bits_encoded += cabac_size_unary[14][cb->state[ctx]];
cb->state[ctx] = cabac_transition_unary[14][cb->state[ctx]];
cb->f8_bits_encoded += x264_cabac_size_unary[14][cb->state[ctx]];
cb->state[ctx] = x264_cabac_transition_unary[14][cb->state[ctx]];
x264_cabac_encode_ue_bypass( cb, 0, coeff_abs - 15 );
}
node_ctx = coeff_abs_level_transition[1][node_ctx];
......@@ -878,7 +884,7 @@ static void ALWAYS_INLINE x264_cabac_block_residual_internal( x264_t *h, x264_ca
}
else
x264_cabac_encode_decision( cb, ctx_sig + (b_8x8 ? sig_offset[i] :
chroma422dc ? coeff_flag_offset_chroma_422_dc[i] : i), 0 );
chroma422dc ? x264_coeff_flag_offset_chroma_422_dc[i] : i), 0 );
}
}
......
......@@ -1090,7 +1090,7 @@ void x264_noise_reduction_update( x264_t *h )
{
int dct8x8 = cat&1;
int size = dct8x8 ? 64 : 16;
const uint16_t *weight = dct8x8 ? x264_dct8_weight2_tab : x264_dct4_weight2_tab;
const uint32_t *weight = dct8x8 ? x264_dct8_weight2_tab : x264_dct4_weight2_tab;
if( h->nr_count[cat] > (dct8x8 ? (1<<16) : (1<<18)) )
{
......
......@@ -32,8 +32,8 @@
/* Transition and size tables for abs<9 MVD and residual coding */
/* Consist of i_prefix-2 1s, one zero, and a bypass sign bit */
static uint8_t cabac_transition_unary[15][128];
static uint16_t cabac_size_unary[15][128];
uint8_t x264_cabac_transition_unary[15][128];
uint16_t x264_cabac_size_unary[15][128];
/* Transition and size tables for abs>9 MVD */
/* Consist of 5 1s and a bypass sign bit */
static uint8_t cabac_transition_5ones[128];
......@@ -386,8 +386,8 @@ void x264_rdo_init( void )
f8_bits += x264_cabac_size_decision2( &ctx, 0 );
f8_bits += 1 << CABAC_SIZE_BITS; //sign
cabac_size_unary[i_prefix][i_ctx] = f8_bits;
cabac_transition_unary[i_prefix][i_ctx] = ctx;
x264_cabac_size_unary[i_prefix][i_ctx] = f8_bits;
x264_cabac_transition_unary[i_prefix][i_ctx] = ctx;
}
}
for( int i_ctx = 0; i_ctx < 128; i_ctx++ )
......@@ -469,7 +469,7 @@ int trellis_dc_shortcut( int sign_coef, int quant_coef, int unquant_mf, int coef
unsigned f8_bits = cost_sig;
int prefix = X264_MIN( abs_level - 1, 14 );
f8_bits += x264_cabac_size_decision_noup2( cabac_state+1, prefix > 0 );
f8_bits += cabac_size_unary[prefix][cabac_state[5]];
f8_bits += x264_cabac_size_unary[prefix][cabac_state[5]];
if( abs_level >= 15 )
f8_bits += bs_size_ue_big( abs_level - 15 ) << CABAC_SIZE_BITS;
score += (uint64_t)f8_bits * lambda2 >> ( CABAC_SIZE_BITS - LAMBDA_BITS );
......@@ -496,7 +496,7 @@ int trellis_coef( int j, int const_level, int abs_level, int prefix, int suffix_
if( const_level > 1 )
{
levelgt1_state = j >= 6 ? nodes_prev[j].cabac_state[levelgt1_ctx-6] : level_state[levelgt1_ctx];
f8_bits += cabac_size_unary[prefix][levelgt1_state] + suffix_cost;
f8_bits += x264_cabac_size_unary[prefix][levelgt1_state] + suffix_cost;
}
else
f8_bits += 1 << CABAC_SIZE_BITS;
......@@ -513,7 +513,7 @@ int trellis_coef( int j, int const_level, int abs_level, int prefix, int suffix_
if( j >= 3 ) // skip the transition if we're not going to reuse the context
nodes_cur[node_ctx].cabac_state[level1_ctx>>2] = x264_cabac_transition[level1_state][const_level > 1];
if( const_level > 1 && node_ctx == 7 )
nodes_cur[node_ctx].cabac_state[levelgt1_ctx-6] = cabac_transition_unary[prefix][levelgt1_state];
nodes_cur[node_ctx].cabac_state[levelgt1_ctx-6] = x264_cabac_transition_unary[prefix][levelgt1_state];
nodes_cur[node_ctx].level_idx = nodes_prev[j].level_idx;
SET_LEVEL( nodes_cur[node_ctx], nodes_prev[j], abs_level );
}
......@@ -636,8 +636,8 @@ int quant_trellis_cabac( x264_t *h, dctcoef *dct,
{
ALIGNED_ARRAY_16( dctcoef, orig_coefs, [64] );
ALIGNED_ARRAY_16( dctcoef, quant_coefs, [64] );
const uint16_t *coef_weight1 = num_coefs == 64 ? x264_dct8_weight_tab : x264_dct4_weight_tab;
const uint16_t *coef_weight2 = num_coefs == 64 ? x264_dct8_weight2_tab : x264_dct4_weight2_tab;
const uint32_t *coef_weight1 = num_coefs == 64 ? x264_dct8_weight_tab : x264_dct4_weight_tab;
const uint32_t *coef_weight2 = num_coefs == 64 ? x264_dct8_weight2_tab : x264_dct4_weight2_tab;
const int b_interlaced = MB_INTERLACED;
uint8_t *cabac_state_sig = &h->cabac.state[ significant_coeff_flag_offset[b_interlaced][ctx_block_cat] ];
uint8_t *cabac_state_last = &h->cabac.state[ last_coeff_flag_offset[b_interlaced][ctx_block_cat] ];
......@@ -738,8 +738,8 @@ int quant_trellis_cabac( x264_t *h, dctcoef *dct,
* subtracting from one score is equivalent to adding to the rest. */\
if( !ctx_hi )\
{\
int sigindex = !dc && num_coefs == 64 ? significant_coeff_flag_offset_8x8[b_interlaced][i] :\
b_chroma && dc && num_coefs == 8 ? coeff_flag_offset_chroma_422_dc[i] : i;\
int sigindex = !dc && num_coefs == 64 ? x264_significant_coeff_flag_offset_8x8[b_interlaced][i] :\
b_chroma && dc && num_coefs == 8 ? x264_coeff_flag_offset_chroma_422_dc[i] : i;\
uint64_t cost_sig0 = x264_cabac_size_decision_noup2( &cabac_state_sig[sigindex], 0 )\
* (uint64_t)lambda2 >> ( CABAC_SIZE_BITS - LAMBDA_BITS );\
nodes_cur[0].score -= cost_sig0;\
......@@ -759,10 +759,10 @@ int quant_trellis_cabac( x264_t *h, dctcoef *dct,
\
if( i < num_coefs-1 || ctx_hi )\
{\
int sigindex = !dc && num_coefs == 64 ? significant_coeff_flag_offset_8x8[b_interlaced][i] :\
b_chroma && dc && num_coefs == 8 ? coeff_flag_offset_chroma_422_dc[i] : i;\
int lastindex = !dc && num_coefs == 64 ? last_coeff_flag_offset_8x8[i] :\
b_chroma && dc && num_coefs == 8 ? coeff_flag_offset_chroma_422_dc[i] : i;\
int sigindex = !dc && num_coefs == 64 ? x264_significant_coeff_flag_offset_8x8[b_interlaced][i] :\
b_chroma && dc && num_coefs == 8 ? x264_coeff_flag_offset_chroma_422_dc[i] : i;\
int lastindex = !dc && num_coefs == 64 ? x264_last_coeff_flag_offset_8x8[i] :\
b_chroma && dc && num_coefs == 8 ? x264_coeff_flag_offset_chroma_422_dc[i] : i;\
cost_siglast[0] = x264_cabac_size_decision_noup2( &cabac_state_sig[sigindex], 0 );\
int cost_sig1 = x264_cabac_size_decision_noup2( &cabac_state_sig[sigindex], 1 );\
cost_siglast[1] = x264_cabac_size_decision_noup2( &cabac_state_last[lastindex], 0 ) + cost_sig1;\
......@@ -890,8 +890,8 @@ int quant_trellis_cavlc( x264_t *h, dctcoef *dct,
{
ALIGNED_16( dctcoef quant_coefs[2][16] );
ALIGNED_16( dctcoef coefs[16] ) = {0};
const uint16_t *coef_weight1 = b_8x8 ? x264_dct8_weight_tab : x264_dct4_weight_tab;
const uint16_t *coef_weight2 = b_8x8 ? x264_dct8_weight2_tab : x264_dct4_weight2_tab;
const uint32_t *coef_weight1 = b_8x8 ? x264_dct8_weight_tab : x264_dct4_weight_tab;
const uint32_t *coef_weight2 = b_8x8 ? x264_dct8_weight2_tab : x264_dct4_weight2_tab;
int delta_distortion[16];
int64_t score = 1ULL<<62;
int i, j;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment