Commit 99448f6c authored by Fiona Glaser

Much faster CAVLC residual coding

Use a VLC table for common levelcodes instead of constructing them on-the-spot
Branchless version of i_trailing calculation (2x faster on Nehalem)
Completely remove array_non_zero_count and instead use the count calculated in level/run coding.  Note: this slightly changes output with subme > 7 due to different nonzero counts being stored during qpel RD.
parent 89a893a0
......@@ -31,6 +31,14 @@ typedef struct
uint8_t i_size;
} vlc_t;
/* VLC table entry that, besides the code itself, records which level table
 * comes next.  x264_level_token is an array of these. */
typedef struct
{
/* Code value; bs_write_vlc passes it as the bits argument of bs_write() */
uint16_t i_bits;
/* Code size; bs_write_vlc passes it as the size argument of bs_write() */
uint8_t i_size;
/* Next level table to use (first index into x264_level_token for the
 * following level, i.e. the updated suffix length) */
uint8_t i_next;
} vlc_large_t;
typedef struct bs_s
{
uint8_t *p_start;
......@@ -47,6 +55,14 @@ extern const vlc_t x264_total_zeros[15][16];
extern const vlc_t x264_total_zeros_dc[3][4];
extern const vlc_t x264_run_before[7][15];
/* A larger level table size theoretically could help a bit at extremely
* high bitrates, but the cost in cache is usually too high for it to be
* useful.
* This size appears to be optimal for QP18 encoding on a Nehalem CPU.
* FIXME: Do further testing? */
#define LEVEL_TABLE_SIZE 128
extern vlc_large_t x264_level_token[7][LEVEL_TABLE_SIZE];
static inline void bs_init( bs_t *s, void *p_data, int i_data )
{
int offset = ((intptr_t)p_data & (WORD_SIZE-1));
......
......@@ -99,6 +99,7 @@ char *x264_param2string( x264_param_t *p, int b_res );
void x264_log( x264_t *h, int i_level, const char *psz_fmt, ... );
void x264_reduce_fraction( int *n, int *d );
void x264_init_vlc_tables();
static inline uint8_t x264_clip_uint8( int x )
{
......
......@@ -424,18 +424,6 @@ static ALWAYS_INLINE int array_non_zero_int_c( void *v, int i_count )
return 0;
}
}
/* Returns how many of the 16 coefficients at v are nonzero.
 * This function and its MMX version only work on arrays of size 16 */
static ALWAYS_INLINE int array_non_zero_count( int16_t *v )
{
    int i_nz = 0;
    int i;
    /* A comparison yields 0 or 1, so it can be accumulated directly. */
    for( i = 0; i < 16; i++ )
        i_nz += v[i] != 0;
    return i_nz;
}
static inline int x264_mb_predict_intra4x4_mode( x264_t *h, int idx )
{
const int ma = h->mb.cache.intra4x4_pred_mode[x264_scan8[idx] - 1];
......
......@@ -884,3 +884,49 @@ const vlc_t x264_run_before[7][15] =
MKVLC( 0x1, 11 ), /* str=00000000001 */
},
};
vlc_large_t x264_level_token[7][LEVEL_TABLE_SIZE];
/* Fills x264_level_token: for every suffix length (0..6) and every level in
 * [-LEVEL_TABLE_SIZE/2, LEVEL_TABLE_SIZE/2) it precomputes the code bits, the
 * code size and the suffix length to use for the following level. */
void x264_init_vlc_tables()
{
    int i_suffix;
    for( i_suffix = 0; i_suffix < 7; i_suffix++ )
    {
        /* Selects the low i_suffix bits of a level code. */
        const int suffix_mask = (1<<i_suffix)-1;
        int16_t level;
        for( level = -LEVEL_TABLE_SIZE/2; level < LEVEL_TABLE_SIZE/2; level++ )
        {
            /* Table is indexed by level biased into [0, LEVEL_TABLE_SIZE). */
            vlc_large_t *entry = &x264_level_token[i_suffix][level+LEVEL_TABLE_SIZE/2];
            /* sign is -1 for negative levels and 0 otherwise. */
            const int sign = level >> 15;
            const int magnitude = (level^sign)-sign;
            int code = magnitude*2-sign-2;
            const int prefix = code >> i_suffix;
            int next = i_suffix;

            if( prefix < 14 || ( i_suffix > 0 && prefix == 14 ) )
            {
                /* Plain prefix + i_suffix-bit suffix.  With a nonzero suffix
                 * length, prefix 14 follows the same layout (15 + i_suffix bits). */
                entry->i_size = prefix + 1 + i_suffix;
                entry->i_bits = (1<<i_suffix) + (code & suffix_mask);
            }
            else if( i_suffix == 0 && code < 30 )
            {
                /* Suffix length 0, codes 14..29: 19 bits in total. */
                entry->i_size = 19;
                entry->i_bits = (1<<4) + (code - 14);
            }
            else
            {
                /* Everything else: fixed 28-bit code. */
                code -= 15 << i_suffix;
                if( i_suffix == 0 )
                    code -= 15;
                entry->i_size = 28;
                entry->i_bits = (1<<12) + code;
            }

            /* Suffix length for the following level: never stays at 0, and
             * grows by one (up to 6) when the magnitude exceeds the threshold. */
            if( next == 0 )
                next = 1;
            if( next < 6 && magnitude > (3 << (next-1)) )
                next++;
            entry->i_next = next;
        }
    }
}
......@@ -74,26 +74,6 @@ static inline int x264_predictor_difference_mmxext( int16_t (*mvc)[2], intptr_t
sum += output[0] + output[1] + output[2] + output[3];
return sum;
}
/* Route array_non_zero_count to the MMX implementation below. */
#define array_non_zero_count array_non_zero_count_mmx
/* Returns the number of nonzero entries among the 16 int16_t values at v
 * (the "m" input operand tells the compiler that all 16 are read).
 *
 * How the count is derived:
 *  - packsswb narrows the 16 words to 16 bytes with signed saturation, so a
 *    nonzero word stays a nonzero byte;
 *  - pcmpeqb against the zeroed mm7 turns every zero byte into 0xFF and every
 *    nonzero byte into 0x00;
 *  - paddb adds the two mask registers bytewise and psadbw against zero sums
 *    the eight resulting bytes.
 * Modulo 256 each zero entry contributes -1 to that sum, so adding 16 and
 * keeping the low 8 bits yields the nonzero count.
 *
 * NOTE(review): mm0, mm1 and mm7 are overwritten but the asm statement has no
 * clobber list -- confirm this is acceptable for the compilers in use. */
static inline int array_non_zero_count_mmx( int16_t *v )
{
int count;
asm(
"pxor %%mm7, %%mm7 \n"
"movq (%1), %%mm0 \n"
"movq 8(%1), %%mm1 \n"
"packsswb 16(%1), %%mm0 \n"
"packsswb 24(%1), %%mm1 \n"
"pcmpeqb %%mm7, %%mm0 \n"
"pcmpeqb %%mm7, %%mm1 \n"
"paddb %%mm0, %%mm1 \n"
"psadbw %%mm7, %%mm1 \n"
"movd %%mm1, %0 \n"
:"=r"(count)
:"r"(v), "m"(*(struct {int16_t x[16];} *)v)
);
/* count holds -(number of zero entries) mod 256; bias by 16 entries. */
return (count+0x10)&0xff;
}
#undef array_non_zero_int
#define array_non_zero_int array_non_zero_int_mmx
static ALWAYS_INLINE int array_non_zero_int_mmx( void *v, int i_count )
......
......@@ -1073,29 +1073,15 @@ static void x264_partition_size_cabac( x264_t *h, x264_cabac_t *cb, int i8, int
if( h->mb.i_cbp_luma & (1 << i8) )
{
if( h->mb.b_transform_8x8 )
{
*(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[i8*4+0]] = 0x0101;
*(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[i8*4+2]] = 0x0101;
block_residual_write_cabac( h, cb, DCT_LUMA_8x8, i8, h->dct.luma8x8[i8], 64 );
}
else
{
int i4;
for( i4 = 0; i4 < 4; i4++ )
{
h->mb.cache.non_zero_count[x264_scan8[i4+i8*4]] = array_non_zero( h->dct.luma4x4[i4+i8*4] );
block_residual_write_cabac( h, cb, DCT_LUMA_4x4, i4+i8*4, h->dct.luma4x4[i4+i8*4], 16 );
}
}
}
else
{
*(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[i8*4+0]] = 0;
*(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[i8*4+2]] = 0;
}
h->mb.cache.non_zero_count[x264_scan8[16+i8]] = array_non_zero( h->dct.luma4x4[16+i8] );
h->mb.cache.non_zero_count[x264_scan8[20+i8]] = array_non_zero( h->dct.luma4x4[20+i8] );
block_residual_write_cabac( h, cb, DCT_CHROMA_AC, 16+i8, h->dct.luma4x4[16+i8]+1, 15 );
block_residual_write_cabac( h, cb, DCT_CHROMA_AC, 20+i8, h->dct.luma4x4[20+i8]+1, 15 );
......@@ -1106,14 +1092,12 @@ static void x264_partition_size_cabac( x264_t *h, x264_cabac_t *cb, int i8, int
/* Passes the motion vector difference and the luma 4x4 residual(s) of the
 * sub-partition starting at block i4 through the CABAC coder cb.
 * i_pixel selects the shape: PIXEL_4x4 covers the single block i4; any other
 * value also covers a second block (i4+1 for PIXEL_8x4, i4+2 otherwise). */
static void x264_subpartition_size_cabac( x264_t *h, x264_cabac_t *cb, int i4, int i_pixel )
{
int b_8x4 = i_pixel == PIXEL_8x4;
/* Refresh the cached nonzero flag for this block before coding its residual. */
h->mb.cache.non_zero_count[x264_scan8[i4]] = array_non_zero( h->dct.luma4x4[i4] );
block_residual_write_cabac( h, cb, DCT_LUMA_4x4, i4, h->dct.luma4x4[i4], 16 );
if( i_pixel == PIXEL_4x4 )
x264_cabac_mb_mvd( h, cb, 0, i4, 1, 1 );
else
{
/* The last two arguments are (2,1) for 8x4 and (1,2) otherwise --
 * presumably width and height in 4x4 blocks; verify against x264_cabac_mb_mvd. */
x264_cabac_mb_mvd( h, cb, 0, i4, 1+b_8x4, 2-b_8x4 );
/* Second 4x4 block of the sub-partition. */
h->mb.cache.non_zero_count[x264_scan8[i4+2-b_8x4]] = array_non_zero( h->dct.luma4x4[i4+2-b_8x4] );
block_residual_write_cabac( h, cb, DCT_LUMA_4x4, i4+2-b_8x4, h->dct.luma4x4[i4+2-b_8x4], 16 );
}
}
......
......@@ -56,25 +56,70 @@ static const uint8_t sub_mb_type_b_to_golomb[13]=
10, 4, 5, 1, 11, 6, 7, 2, 12, 8, 9, 3, 0
};
/* Writes one VLC entry to s by forwarding v.i_size and v.i_bits to bs_write(). */
static inline void bs_write_vlc( bs_t *s, vlc_t v )
{
bs_write( s, v.i_size, v.i_bits );
}
/* Macro form of the same call.  It only names the i_size and i_bits members,
 * so it works with any struct providing them (vlc_t as well as vlc_large_t).
 * Note that v is evaluated twice. */
#define bs_write_vlc(s,v) bs_write( s, (v).i_size, (v).i_bits )
/****************************************************************************
* block_residual_write_cavlc:
****************************************************************************/
/* Table-free CAVLC level writer.  block_residual_write_cavlc falls back to it
 * for levels that are not covered by x264_level_token.
 *
 *  h               encoder context; h->sps->i_profile_idc decides whether
 *                  prefixes longer than 15 may be used
 *  s               bitstream to write to
 *  i_suffix_length suffix length in effect for this level
 *  level           signed level to code (assumed to fit in 16 bits, see mask)
 *
 * Returns the suffix length to use for the next level. */
static inline int block_residual_write_cavlc_escape( x264_t *h, bs_t *s, int i_suffix_length, int level )
{
    /* Indexed by suffix length: the suffix length grows when abs_level exceeds
     * the entry.  0xffff keeps suffix length 6 from ever growing. */
    static const uint16_t next_suffix[7] = { 0, 3, 6, 12, 24, 48, 0xffff };
    int i_level_prefix = 15;
    /* mask is -1 for negative levels and 0 otherwise. */
    int mask = level >> 15;
    int abs_level = (level^mask)-mask;
    int i_level_code = abs_level*2-mask-2;
    if( ( i_level_code >> i_suffix_length ) < 15 )
    {
        /* Regular code: prefix followed by an i_suffix_length-bit suffix. */
        bs_write( s, (i_level_code >> i_suffix_length) + 1 + i_suffix_length,
                  (1<<i_suffix_length) + (i_level_code & ((1<<i_suffix_length)-1)) );
    }
    else
    {
        /* Escape code: remove the part of the code implied by prefix 15. */
        i_level_code -= 15 << i_suffix_length;
        if( i_suffix_length == 0 )
            i_level_code -= 15;

        /* If the prefix size exceeds 15, High Profile is required. */
        if( i_level_code >= 1<<12 )
        {
            if( h->sps->i_profile_idc >= PROFILE_HIGH )
            {
                /* A prefix p carries a (p-3)-bit suffix, i.e. 1<<(p-3) codes.
                 * A code sitting exactly on that boundary already belongs to
                 * the next prefix, hence >=: with > the boundary value would
                 * keep the shorter prefix and its masked suffix would wrap
                 * around to 0. */
                while( i_level_code >= 1<<(i_level_prefix-3) )
                {
                    i_level_code -= 1<<(i_level_prefix-3);
                    i_level_prefix++;
                }
            }
            else
            {
#if RDO_SKIP_BS
                /* Weight highly against overflows. */
                s->i_bits_encoded += 1000000;
#else
                x264_log(h, X264_LOG_WARNING, "OVERFLOW levelcode=%d is only allowed in High Profile", i_level_code );
                /* clip level, preserving sign */
                i_level_code = (1<<12) - 2 + (i_level_code & 1);
#endif
            }
        }
        /* Prefix: i_level_prefix zeros followed by a one, then the suffix. */
        bs_write( s, i_level_prefix + 1, 1 );
        bs_write( s, i_level_prefix - 3, i_level_code & ((1<<(i_level_prefix-3))-1) );
    }
    /* Suffix length for the next level: never stays at 0, and grows by one
     * when this level's magnitude exceeds the current threshold. */
    if( i_suffix_length == 0 )
        i_suffix_length++;
    if( abs_level > next_suffix[i_suffix_length] )
        i_suffix_length++;
    return i_suffix_length;
}
static void block_residual_write_cavlc( x264_t *h, bs_t *s, int i_ctxBlockCat, int i_idx, int16_t *l, int i_count )
{
static const int ct_index[17] = {0,0,1,1,2,2,2,2,3,3,3,3,3,3,3,3,3};
static const uint8_t ct_index[17] = {0,0,1,1,2,2,2,2,3,3,3,3,3,3,3,3,3};
static const uint8_t ctz_index[8] = {3,0,1,0,2,0,1,0};
int level[16], run[16];
int i_total, i_trailing;
int i_total_zero;
int i_last;
int i_trailing, i_total_zero, i_last, i_suffix_length, i;
int i_total = 0;
unsigned int i_sign;
int i;
int idx = 0;
int i_suffix_length;
/* x264_mb_predict_non_zero_code return 0 <-> (16+16+1)>>1 = 16 */
int nC = i_idx >= 25 ? 4 : ct_index[x264_mb_predict_non_zero_code( h, i_idx == 24 ? 0 : i_idx )];
......@@ -85,97 +130,66 @@ static void block_residual_write_cavlc( x264_t *h, bs_t *s, int i_ctxBlockCat, i
}
i_last = h->quantf.coeff_last[i_ctxBlockCat](l);
i_sign = 0;
i_total = 0;
i_trailing = 0;
i_total_zero = i_last + 1;
/* level and run and total */
while( i_last >= 0 )
/* set these to 2 to allow branchless i_trailing calculation */
level[1] = 2;
level[2] = 2;
do
{
int r = 0;
level[idx] = l[i_last];
level[i_total] = l[i_last];
while( --i_last >= 0 && l[i_last] == 0 )
r++;
run[idx++] = r;
}
run[i_total++] = r;
} while( i_last >= 0 );
i_total = idx;
i_total_zero -= idx;
h->mb.cache.non_zero_count[x264_scan8[i_idx]] = i_total;
i_trailing = X264_MIN(3, idx);
for( idx = 0; idx < i_trailing; idx++ )
{
if( (unsigned)(level[idx]+1) > 2 )
{
i_trailing = idx;
break;
}
i_sign <<= 1;
i_sign |= level[idx] < 0;
}
i_total_zero -= i_total;
i_trailing = ((((level[0]+1) | (1-level[0])) >> 31) & 1) // abs(level[0])>1
| ((((level[1]+1) | (1-level[1])) >> 31) & 2)
| ((((level[2]+1) | (1-level[2])) >> 31) & 4);
i_trailing = ctz_index[i_trailing];
i_sign = ((level[2] >> 31) & 1)
| ((level[1] >> 31) & 2)
| ((level[0] >> 31) & 4);
i_sign >>= 3-i_trailing;
/* total/trailing */
bs_write_vlc( s, x264_coeff_token[nC][i_total*4+i_trailing] );
i_suffix_length = i_total > 10 && i_trailing < 3 ? 1 : 0;
if( i_trailing > 0 )
i_suffix_length = i_total > 10 && i_trailing < 3;
if( i_trailing > 0 || RDO_SKIP_BS )
bs_write( s, i_trailing, i_sign );
for( i = i_trailing; i < i_total; i++ )
if( i_trailing < i_total )
{
int mask = level[i] >> 15;
int abs_level = (level[i]^mask)-mask;
int i_level_code = abs_level*2-mask-2;
if( i == i_trailing && i_trailing < 3 )
i_level_code -= 2; /* as level[i] can't be 1 for the first one if i_trailing < 3 */
if( ( i_level_code >> i_suffix_length ) < 14 )
bs_write( s, (i_level_code >> i_suffix_length) + 1 + i_suffix_length,
(1<<i_suffix_length) + (i_level_code & ((1<<i_suffix_length)-1)) );
else if( i_suffix_length == 0 && i_level_code < 30 )
bs_write( s, 19, (1<<4) + (i_level_code - 14) );
else if( i_suffix_length > 0 && ( i_level_code >> i_suffix_length ) == 14 )
bs_write( s, 15 + i_suffix_length,
(1<<i_suffix_length) + (i_level_code & ((1<<i_suffix_length)-1)) );
int16_t val = level[i_trailing];
int16_t val_original = level[i_trailing]+LEVEL_TABLE_SIZE/2;
if( i_trailing < 3 )
val -= (val>>15)|1; /* as level[i] can't be 1 for the first one if i_trailing < 3 */
val += LEVEL_TABLE_SIZE/2;
if( (unsigned)val_original < LEVEL_TABLE_SIZE )
{
bs_write_vlc( s, x264_level_token[i_suffix_length][val] );
i_suffix_length = x264_level_token[i_suffix_length][val_original].i_next;
}
else
i_suffix_length = block_residual_write_cavlc_escape( h, s, i_suffix_length, val-LEVEL_TABLE_SIZE/2 );
for( i = i_trailing+1; i < i_total; i++ )
{
int i_level_prefix = 15;
i_level_code -= 15 << i_suffix_length;
if( i_suffix_length == 0 )
i_level_code -= 15;
/* If the prefix size exceeds 15, High Profile is required. */
if( i_level_code >= 1<<12 )
val = level[i] + LEVEL_TABLE_SIZE/2;
if( (unsigned)val < LEVEL_TABLE_SIZE )
{
if( h->sps->i_profile_idc >= PROFILE_HIGH )
{
while( i_level_code > 1<<(i_level_prefix-3) )
{
i_level_code -= 1<<(i_level_prefix-3);
i_level_prefix++;
}
}
else
{
#if RDO_SKIP_BS
/* Weight highly against overflows. */
s->i_bits_encoded += 1000000;
#else
x264_log(h, X264_LOG_WARNING, "OVERFLOW levelcode=%d is only allowed in High Profile", i_level_code );
/* clip level, preserving sign */
i_level_code = (1<<12) - 2 + (i_level_code & 1);
#endif
}
bs_write_vlc( s, x264_level_token[i_suffix_length][val] );
i_suffix_length = x264_level_token[i_suffix_length][val].i_next;
}
bs_write( s, i_level_prefix + 1, 1 );
bs_write( s, i_level_prefix - 3, i_level_code & ((1<<(i_level_prefix-3))-1) );
else
i_suffix_length = block_residual_write_cavlc_escape( h, s, i_suffix_length, val-LEVEL_TABLE_SIZE/2 );
}
if( i_suffix_length == 0 )
i_suffix_length++;
if( abs_level > (3 << (i_suffix_length-1)) && i_suffix_length < 6 )
i_suffix_length++;
}
if( i_total < i_count )
......@@ -269,16 +283,17 @@ static inline void x264_macroblock_luma_write_cavlc( x264_t *h, bs_t *s, int i8s
/* shuffle 8x8 dct coeffs into 4x4 lists */
for( i8 = i8start; i8 <= i8end; i8++ )
if( h->mb.i_cbp_luma & (1 << i8) )
{
h->zigzagf.interleave_8x8_cavlc( h->dct.luma4x4[i8*4], h->dct.luma8x8[i8] );
for( i4 = 0; i4 < 4; i4++ )
h->mb.cache.non_zero_count[x264_scan8[i4+i8*4]] = array_non_zero( h->dct.luma4x4[i4+i8*4] );
}
}
for( i8 = i8start; i8 <= i8end; i8++ )
if( h->mb.i_cbp_luma & (1 << i8) )
for( i4 = 0; i4 < 4; i4++ )
{
h->mb.cache.non_zero_count[x264_scan8[i4+i8*4]] = array_non_zero_count( h->dct.luma4x4[i4+i8*4] );
block_residual_write_cavlc( h, s, DCT_LUMA_4x4, i4+i8*4, h->dct.luma4x4[i4+i8*4], 16 );
}
}
/*****************************************************************************
......@@ -595,10 +610,7 @@ void x264_macroblock_write_cavlc( x264_t *h, bs_t *s )
/* AC Luma */
if( h->mb.i_cbp_luma )
for( i = 0; i < 16; i++ )
{
h->mb.cache.non_zero_count[x264_scan8[i]] = array_non_zero_count( h->dct.luma4x4[i] );
block_residual_write_cavlc( h, s, DCT_LUMA_AC, i, h->dct.luma4x4[i]+1, 15 );
}
}
else if( h->mb.i_cbp_luma | h->mb.i_cbp_chroma )
{
......@@ -612,10 +624,7 @@ void x264_macroblock_write_cavlc( x264_t *h, bs_t *s )
block_residual_write_cavlc( h, s, DCT_CHROMA_DC, 26, h->dct.chroma_dc[1], 4 );
if( h->mb.i_cbp_chroma&0x02 ) /* Chroma AC residual present */
for( i = 16; i < 24; i++ )
{
h->mb.cache.non_zero_count[x264_scan8[i]] = array_non_zero_count( h->dct.luma4x4[i] );
block_residual_write_cavlc( h, s, DCT_CHROMA_AC, i, h->dct.luma4x4[i]+1, 15 );
}
}
#if !RDO_SKIP_BS
......@@ -663,9 +672,7 @@ static int x264_partition_size_cavlc( x264_t *h, int i8, int i_pixel )
for( j = (i_pixel < PIXEL_8x8); j >= 0; j-- )
{
x264_macroblock_luma_write_cavlc( h, &s, i8, i8 );
h->mb.cache.non_zero_count[x264_scan8[16+i8]] = array_non_zero_count( h->dct.luma4x4[16+i8] );
block_residual_write_cavlc( h, &s, DCT_CHROMA_AC, 16+i8, h->dct.luma4x4[16+i8]+1, 15 );
h->mb.cache.non_zero_count[x264_scan8[20+i8]] = array_non_zero_count( h->dct.luma4x4[20+i8] );
block_residual_write_cavlc( h, &s, DCT_CHROMA_AC, 20+i8, h->dct.luma4x4[20+i8]+1, 15 );
i8 += x264_pixel_size[i_pixel].h >> 3;
}
......@@ -679,12 +686,10 @@ static int x264_subpartition_size_cavlc( x264_t *h, int i4, int i_pixel )
int b_8x4 = i_pixel == PIXEL_8x4;
s.i_bits_encoded = 0;
cavlc_mb_mvd( h, &s, 0, i4, 1+b_8x4 );
h->mb.cache.non_zero_count[x264_scan8[i4]] = array_non_zero_count( h->dct.luma4x4[i4] );
block_residual_write_cavlc( h, &s, DCT_LUMA_4x4, i4, h->dct.luma4x4[i4], 16 );
if( i_pixel != PIXEL_4x4 )
{
i4 += 2-b_8x4;
h->mb.cache.non_zero_count[x264_scan8[i4]] = array_non_zero_count( h->dct.luma4x4[i4] );
block_residual_write_cavlc( h, &s, DCT_LUMA_4x4, i4, h->dct.luma4x4[i4], 16 );
}
......@@ -706,7 +711,7 @@ static int x264_partition_i8x8_size_cavlc( x264_t *h, int i8, int i_mode )
h->zigzagf.interleave_8x8_cavlc( h->dct.luma4x4[i8*4], h->dct.luma8x8[i8] );
for( i4 = 0; i4 < 4; i4++ )
{
h->mb.cache.non_zero_count[x264_scan8[i4+i8*4]] = array_non_zero_count( h->dct.luma4x4[i4+i8*4] );
h->mb.cache.non_zero_count[x264_scan8[i4+i8*4]] = array_non_zero( h->dct.luma4x4[i4+i8*4] );
block_residual_write_cavlc( h, &h->out.bs, DCT_LUMA_4x4, i4+i8*4, h->dct.luma4x4[i4+i8*4], 16 );
}
return h->out.bs.i_bits_encoded;
......@@ -715,7 +720,7 @@ static int x264_partition_i8x8_size_cavlc( x264_t *h, int i8, int i_mode )
static int x264_partition_i4x4_size_cavlc( x264_t *h, int i4, int i_mode )
{
h->out.bs.i_bits_encoded = cavlc_intra4x4_pred_size( h, i4, i_mode );
h->mb.cache.non_zero_count[x264_scan8[i4]] = array_non_zero_count( h->dct.luma4x4[i4] );
h->mb.cache.non_zero_count[x264_scan8[i4]] = array_non_zero( h->dct.luma4x4[i4] );
block_residual_write_cavlc( h, &h->out.bs, DCT_LUMA_4x4, i4, h->dct.luma4x4[i4], 16 );
return h->out.bs.i_bits_encoded;
}
......@@ -732,10 +737,7 @@ static int x264_i8x8_chroma_size_cavlc( x264_t *h )
{
int i;
for( i = 16; i < 24; i++ )
{
h->mb.cache.non_zero_count[x264_scan8[i]] = array_non_zero_count( h->dct.luma4x4[i] );
block_residual_write_cavlc( h, &h->out.bs, DCT_CHROMA_AC, i, h->dct.luma4x4[i]+1, 15 );
}
}
}
return h->out.bs.i_bits_encoded;
......
......@@ -730,7 +730,8 @@ x264_t *x264_encoder_open ( x264_param_t *param )
x264_predict_8x8c_init( h->param.cpu, h->predict_8x8c );
x264_predict_8x8_init( h->param.cpu, h->predict_8x8 );
x264_predict_4x4_init( h->param.cpu, h->predict_4x4 );
/* Build the CAVLC level tables only when CABAC is disabled.  The stray ';'
 * that used to follow the condition turned the if into a no-op, so the
 * tables were built unconditionally. */
if( !h->param.b_cabac )
    x264_init_vlc_tables();
x264_pixel_init( h->param.cpu, &h->pixf );
x264_dct_init( h->param.cpu, &h->dctf );
x264_zigzag_init( h->param.cpu, &h->zigzagf, h->param.b_interlaced );
......
......@@ -855,15 +855,20 @@ void x264_macroblock_encode_p8x8( x264_t *h, int i8 )
{
h->zigzagf.sub_8x8( h->dct.luma8x8[i8], p_fenc, p_fdec );
nnz8x8 = array_non_zero( h->dct.luma8x8[i8] );
*(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[i8*4+0]] = 0x0101 * nnz8x8;
*(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[i8*4+2]] = 0x0101 * nnz8x8;
}
else
{
for( i4 = i8*4; i4 < i8*4+4; i4++ )
{
int nz;
h->zigzagf.sub_4x4( h->dct.luma4x4[i4],
h->mb.pic.p_fenc[0]+block_idx_xy_fenc[i4],
h->mb.pic.p_fdec[0]+block_idx_xy_fdec[i4] );
nnz8x8 |= array_non_zero( h->dct.luma4x4[i4] );
nz = array_non_zero( h->dct.luma4x4[i4] );
h->mb.cache.non_zero_count[x264_scan8[i8*4+i4]] = nz;
nnz8x8 |= nz;
}
}
for( ch = 0; ch < 2; ch++ )
......@@ -872,6 +877,7 @@ void x264_macroblock_encode_p8x8( x264_t *h, int i8 )
p_fdec = h->mb.pic.p_fdec[1+ch] + (i8&1)*4 + (i8>>1)*4*FDEC_STRIDE;
h->zigzagf.sub_4x4( h->dct.luma4x4[16+i8+ch*4], p_fenc, p_fdec );
h->dct.luma4x4[16+i8+ch*4][0] = 0;
h->mb.cache.non_zero_count[x264_scan8[16+i8+ch*4]] = array_non_zero( h->dct.luma4x4[16+i8+ch*4] );
}
}
else
......@@ -892,6 +898,13 @@ void x264_macroblock_encode_p8x8( x264_t *h, int i8 )
{
h->quantf.dequant_8x8( dct8x8, h->dequant8_mf[CQM_8PY], i_qp );
h->dctf.add8x8_idct8( p_fdec, dct8x8 );
*(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[i8*4+0]] = 0x0101;
*(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[i8*4+2]] = 0x0101;
}
else
{
*(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[i8*4+0]] = 0;
*(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[i8*4+2]] = 0;
}
}
else
......@@ -918,9 +931,17 @@ void x264_macroblock_encode_p8x8( x264_t *h, int i8 )
if( nnz8x8 )
{
for( i4 = 0; i4 < 4; i4++ )
{
h->quantf.dequant_4x4( dct4x4[i4], h->dequant4_mf[CQM_4PY], i_qp );
h->mb.cache.non_zero_count[x264_scan8[i8*4+i4]] = array_non_zero( dct4x4[i4] );
}
h->dctf.add8x8_idct( p_fdec, dct4x4 );
}
else
{
*(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[i8*4+0]] = 0;
*(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[i8*4+2]] = 0;
}
}
i_qp = h->mb.i_chroma_qp;
......@@ -944,7 +965,10 @@ void x264_macroblock_encode_p8x8( x264_t *h, int i8 )
{
h->quantf.dequant_4x4( dct4x4, h->dequant4_mf[CQM_4PC], i_qp );
h->dctf.add4x4_idct( p_fdec, dct4x4 );
h->mb.cache.non_zero_count[x264_scan8[16+i8+ch*4]] = 1;
}
else
h->mb.cache.non_zero_count[x264_scan8[16+i8+ch*4]] = 0;
}
}
h->mb.i_cbp_luma &= ~(1 << i8);
......@@ -967,7 +991,10 @@ void x264_macroblock_encode_p4x4( x264_t *h, int i4 )
h->mc.mc_luma( p_fdec, FDEC_STRIDE, h->mb.pic.p_fref[0][i_ref], h->mb.pic.i_stride[0], mvx + 4*4*block_idx_x[i4], mvy + 4*4*block_idx_y[i4], 4, 4 );
if( h->mb.b_lossless )
{
h->zigzagf.sub_4x4( h->dct.luma4x4[i4], p_fenc, p_fdec );
h->mb.cache.non_zero_count[x264_scan8[i4]] = array_non_zero( h->dct.luma4x4[i4] );
}
else
{
DECLARE_ALIGNED_16( int16_t dct4x4[4][4] );
......@@ -978,6 +1005,9 @@ void x264_macroblock_encode_p4x4( x264_t *h, int i4 )
{
h->quantf.dequant_4x4( dct4x4, h->dequant4_mf[CQM_4PY], i_qp );
h->dctf.add4x4_idct( p_fdec, dct4x4 );
h->mb.cache.non_zero_count[x264_scan8[i4]] = 1;
}
else
h->mb.cache.non_zero_count[x264_scan8[i4]] = 0;
}
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment