Commit c1d73389 authored by Fiona Glaser

Significantly faster CABAC and CAVLC residual coding and bit cost calculation

Early-terminate in residual writing using stored nnz counts
To allow the above, store nnz counts for luma and chroma DC
Add assembly functions to find the last nonzero coefficient in a block
Overall ~1.9% faster at subme9+8x8dct+qp25 with CAVLC, ~0.7% faster with CABAC
Note: this changes output slightly with CABAC RDO, because it now requires always storing correct nnz values during RDO, which previously wasn't done in cases where it wasn't useful.
CAVLC output should be equivalent.
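
In rough C terms, the early-termination strategy looks like this (a simplified sketch of the idea, not the literal code added below; write_residual, write_empty_block and write_coeff are hypothetical stand-ins for the CABAC/CAVLC writers):

    static void write_residual( x264_t *h, int i_ctxBlockCat, int i_idx, int16_t *l )
    {
        int i, i_last;
        /* cached nnz says the block is all zero: emit the empty-block signal and stop */
        if( !h->mb.cache.non_zero_count[x264_scan8[i_idx]] )
        {
            write_empty_block( h, i_ctxBlockCat, i_idx ); /* hypothetical helper */
            return;
        }
        /* otherwise locate the last nonzero coefficient once and never scan past it */
        i_last = h->quantf.coeff_last[i_ctxBlockCat]( l );
        for( i = 0; i <= i_last; i++ )
            write_coeff( h, l[i], i == i_last ); /* hypothetical helper */
    }

The per-category coeff_last functions (C fallbacks plus the new MMX/SSE2 versions below) are what make the i_last lookup cheap.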
parent ecb04a3b
......@@ -213,7 +213,7 @@ typedef struct
#define X264_SCAN8_SIZE (6*8)
#define X264_SCAN8_0 (4+1*8)
static const int x264_scan8[16+2*4] =
static const int x264_scan8[16+2*4+3] =
{
/* Luma */
4+1*8, 5+1*8, 4+2*8, 5+2*8,
......@@ -228,6 +228,12 @@ static const int x264_scan8[16+2*4] =
/* Cr */
1+4*8, 2+4*8,
1+5*8, 2+5*8,
/* Luma DC */
4+5*8,
/* Chroma DC */
5+5*8, 6+5*8
};
/*
0 1 2 3 4 5 6 7
......@@ -236,7 +242,7 @@ static const int x264_scan8[16+2*4] =
2 B B L L L L
3 L L L L
4 R R L L L L
5 R R
5 R R DyDuDv
*/
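With the three new entries (Dy/Du/Dv above), the luma DC block (index 24) and the two chroma DC blocks (indices 25 and 26) get nnz cache slots just like the regular 4x4 blocks; the encoder fills them as shown further down in this patch:

    h->mb.cache.non_zero_count[x264_scan8[24]] = array_non_zero( h->dct.luma16x16_dc );
    h->mb.cache.non_zero_count[x264_scan8[25]] = array_non_zero( h->dct.chroma_dc[0] );
    h->mb.cache.non_zero_count[x264_scan8[26]] = array_non_zero( h->dct.chroma_dc[1] );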
typedef struct x264_ratecontrol_t x264_ratecontrol_t;
......
......@@ -245,6 +245,34 @@ static int x264_decimate_score64( int16_t *dct )
return x264_decimate_score_internal( dct, 64 );
}
static int ALWAYS_INLINE x264_coeff_last_internal( int16_t *l, int i_count )
{
int i_last;
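/* scan backwards in 64-bit (4-coefficient) chunks, then refine to the exact last nonzero coefficient */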
for( i_last = i_count-1; i_last >= 3; i_last -= 4 )
if( *(uint64_t*)(l+i_last-3) )
break;
while( i_last >= 0 && l[i_last] == 0 )
i_last--;
return i_last;
}
static int x264_coeff_last4( int16_t *l )
{
return x264_coeff_last_internal( l, 4 );
}
static int x264_coeff_last15( int16_t *l )
{
return x264_coeff_last_internal( l, 15 );
}
static int x264_coeff_last16( int16_t *l )
{
return x264_coeff_last_internal( l, 16 );
}
static int x264_coeff_last64( int16_t *l )
{
return x264_coeff_last_internal( l, 64 );
}
void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
{
pf->quant_8x8 = quant_8x8;
......@@ -261,6 +289,11 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
pf->decimate_score16 = x264_decimate_score16;
pf->decimate_score64 = x264_decimate_score64;
pf->coeff_last[DCT_CHROMA_DC] = x264_coeff_last4;
pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15;
pf->coeff_last[ DCT_LUMA_4x4] = x264_coeff_last16;
pf->coeff_last[ DCT_LUMA_8x8] = x264_coeff_last64;
#ifdef HAVE_MMX
if( cpu&X264_CPU_MMX )
{
......@@ -287,7 +320,11 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
pf->decimate_score15 = x264_decimate_score15_mmxext;
pf->decimate_score16 = x264_decimate_score16_mmxext;
pf->decimate_score64 = x264_decimate_score64_mmxext;
pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_mmxext;
pf->coeff_last[ DCT_LUMA_4x4] = x264_coeff_last16_mmxext;
pf->coeff_last[ DCT_LUMA_8x8] = x264_coeff_last64_mmxext;
#endif
pf->coeff_last[DCT_CHROMA_DC] = x264_coeff_last4_mmxext;
}
if( cpu&X264_CPU_SSE2 )
......@@ -307,6 +344,9 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
pf->decimate_score15 = x264_decimate_score15_sse2;
pf->decimate_score16 = x264_decimate_score16_sse2;
pf->decimate_score64 = x264_decimate_score64_sse2;
pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_sse2;
pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_sse2;
pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_sse2;
}
if( cpu&X264_CPU_SSSE3 )
......@@ -333,4 +373,6 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
pf->dequant_8x8 = x264_dequant_8x8_altivec;
}
#endif
pf->coeff_last[ DCT_LUMA_DC] = pf->coeff_last[DCT_LUMA_4x4];
pf->coeff_last[DCT_CHROMA_AC] = pf->coeff_last[ DCT_LUMA_AC];
}
......@@ -39,6 +39,7 @@ typedef struct
int (*decimate_score15)( int16_t *dct );
int (*decimate_score16)( int16_t *dct );
int (*decimate_score64)( int16_t *dct );
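/* returns the index of the last nonzero coefficient, indexed by DCT block category */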
int (*coeff_last[6])( int16_t *dct );
} x264_quant_function_t;
void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf );
......
......@@ -671,3 +671,107 @@ INIT_XMM
DECIMATE8x8 sse2
DECIMATE8x8 ssse3
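; LAST_MASK_*: build a 16-bit mask over the 16 coefficients at %2, one bit per
; coefficient, set where that coefficient is zero (packsswb keeps nonzero values
; nonzero). Callers invert the mask and bsr it to find the last nonzero index.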
%macro LAST_MASK_SSE2 2-3
movdqa xmm0, [%2+ 0]
pxor xmm2, xmm2
packsswb xmm0, [%2+16]
pcmpeqb xmm0, xmm2
pmovmskb %1, xmm0
%endmacro
%macro LAST_MASK_MMX 3
movq mm0, [%2+ 0]
movq mm1, [%2+16]
pxor mm2, mm2
packsswb mm0, [%2+ 8]
packsswb mm1, [%2+24]
pcmpeqb mm0, mm2
pcmpeqb mm1, mm2
pmovmskb %1, mm0
pmovmskb %3, mm1
shl %3, 8
or %1, %3
%endmacro
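; coeff_last4: the four chroma DC coefficients fit in a single 64-bit word, so
; bsr finds the highest set bit and bit_index>>4 is the index of the last
; nonzero coefficient (the 32-bit version does the same over two dword halves).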
%ifdef ARCH_X86_64
cglobal x264_coeff_last4_mmxext, 1,1
bsr rax, [r0]
shr eax, 4
RET
%else
cglobal x264_coeff_last4_mmxext, 0,3
mov edx, r0m
mov eax, [edx+4]
xor ecx, ecx
test eax, eax
cmovz eax, [edx]
setnz cl
bsr eax, eax
shr eax, 4
lea eax, [eax+ecx*2]
RET
%endif
%macro COEFF_LAST 1
cglobal x264_coeff_last15_%1, 1,3
LAST_MASK r1d, r0-2, r2d
xor r1d, 0xffff
bsr eax, r1d
dec eax
RET
cglobal x264_coeff_last16_%1, 1,3
LAST_MASK r1d, r0, r2d
xor r1d, 0xffff
bsr eax, r1d
RET
%ifndef ARCH_X86_64
%ifidn %1, mmxext
cglobal x264_coeff_last64_%1, 1,5
%else
cglobal x264_coeff_last64_%1, 1,4
%endif
LAST_MASK r1d, r0, r4d
LAST_MASK r2d, r0+32, r4d
shl r2d, 16
or r1d, r2d
LAST_MASK r2d, r0+64, r4d
LAST_MASK r3d, r0+96, r4d
shl r3d, 16
or r2d, r3d
not r1d
xor r2d, -1
jne .secondhalf
bsr eax, r1d
RET
.secondhalf:
bsr eax, r2d
add eax, 32
RET
%endif
%endmacro
%ifdef ARCH_X86_64
cglobal x264_coeff_last64_sse2, 1,4
LAST_MASK_SSE2 r1d, r0
LAST_MASK_SSE2 r2d, r0+32
LAST_MASK_SSE2 r3d, r0+64
LAST_MASK_SSE2 r0d, r0+96
shl r2d, 16
shl r0d, 16
or r1d, r2d
or r3d, r0d
shl r3, 32
or r1, r3
not r1
bsr rax, r1
RET
%endif
%ifndef ARCH_X86_64
%define LAST_MASK LAST_MASK_MMX
COEFF_LAST mmxext
%endif
%define LAST_MASK LAST_MASK_SSE2
COEFF_LAST sse2
......@@ -57,5 +57,12 @@ int x264_decimate_score16_ssse3 ( int16_t *dct );
int x264_decimate_score64_mmxext( int16_t *dct );
int x264_decimate_score64_sse2 ( int16_t *dct );
int x264_decimate_score64_ssse3 ( int16_t *dct );
int x264_coeff_last4_mmxext( int16_t *dct );
int x264_coeff_last15_mmxext( int16_t *dct );
int x264_coeff_last16_mmxext( int16_t *dct );
int x264_coeff_last64_mmxext( int16_t *dct );
int x264_coeff_last15_sse2( int16_t *dct );
int x264_coeff_last16_sse2( int16_t *dct );
int x264_coeff_last64_sse2( int16_t *dct );
#endif
......@@ -595,6 +595,7 @@ static int x264_cabac_mb_cbf_ctxidxinc( x264_t *h, int i_cat, int i_idx )
break;
case DCT_CHROMA_DC:
/* no need to test skip/pcm */
i_idx -= 25;
if( h->mb.i_neighbour & MB_LEFT )
{
i_mba_xy = h->mb.i_mb_xy - 1;
......@@ -684,23 +685,18 @@ static void block_residual_write_cabac( x264_t *h, x264_cabac_t *cb, int i_ctxBl
const uint8_t *significant_coeff_flag_offset = significant_coeff_flag_offset_8x8[h->mb.b_interlaced];
int i_coeff_abs_m1[64];
int UNUSED i_coeff_sign[64];
int i_coeff_sign[64];
int i_coeff = 0;
int i_last = 0;
int i_last;
int i_sigmap_size;
int node_ctx = 0;
int i, j;
/* yes this is always aligned, and l[-1] exists in the cases where it's used (ac) */
for( j = i_count - 4; j >= -1; j -= 4 )
if( *(uint64_t*)(l+j) )
break;
int i;
if( i_count != 64 )
{
/* coded block flag */
int ctx = 85 + x264_cabac_mb_cbf_ctxidxinc( h, i_ctxBlockCat, i_idx );
if( j >= -1 )
if( h->mb.cache.non_zero_count[x264_scan8[i_idx]] )
x264_cabac_encode_decision( cb, ctx, 1 );
else
{
......@@ -709,9 +705,7 @@ static void block_residual_write_cabac( x264_t *h, x264_cabac_t *cb, int i_ctxBl
}
}
for( i = j; i < j+4; i++)
if( l[i] )
i_last = i;
i_last = h->quantf.coeff_last[i_ctxBlockCat](l);
i_sigmap_size = X264_MIN( i_last+1, i_count-1 );
......@@ -722,7 +716,7 @@ static void block_residual_write_cabac( x264_t *h, x264_cabac_t *cb, int i_ctxBl
{\
i_coeff_abs_m1[i_coeff] = abs(l[i]) - 1;\
if( !RDO_SKIP_BS )\
i_coeff_sign[i_coeff] = l[i] < 0;\
i_coeff_sign[i_coeff] = l[i] < 0;\
i_coeff++;\
x264_cabac_encode_decision( cb, i_ctx_sig + (l8x8 ? significant_coeff_flag_offset[i] : i), 1 );\
x264_cabac_encode_decision( cb, i_ctx_last + (l8x8 ? last_coeff_flag_offset_8x8[i] : i), i == i_last );\
......@@ -762,7 +756,7 @@ static void block_residual_write_cabac( x264_t *h, x264_cabac_t *cb, int i_ctxBl
cb->f8_bits_encoded += cabac_size_unary[i_prefix][cb->state[ctx]];
cb->state[ctx] = cabac_transition_unary[i_prefix][cb->state[ctx]];
#else
for( j = 0; j < i_prefix - 1; j++ )
for( i = 0; i < i_prefix - 1; i++ )
x264_cabac_encode_decision( cb, ctx, 1 );
if( i_prefix < 14 )
x264_cabac_encode_decision( cb, ctx, 0 );
......@@ -1002,7 +996,7 @@ void x264_macroblock_write_cabac( x264_t *h, x264_cabac_t *cb )
if( i_mb_type == I_16x16 )
{
/* DC Luma */
block_residual_write_cabac( h, cb, DCT_LUMA_DC, 0, h->dct.luma16x16_dc, 16 );
block_residual_write_cabac( h, cb, DCT_LUMA_DC, 24, h->dct.luma16x16_dc, 16 );
/* AC Luma */
if( h->mb.i_cbp_luma != 0 )
......@@ -1024,8 +1018,8 @@ void x264_macroblock_write_cabac( x264_t *h, x264_cabac_t *cb )
if( h->mb.i_cbp_chroma &0x03 ) /* Chroma DC residual present */
{
block_residual_write_cabac( h, cb, DCT_CHROMA_DC, 0, h->dct.chroma_dc[0], 4 );
block_residual_write_cabac( h, cb, DCT_CHROMA_DC, 1, h->dct.chroma_dc[1], 4 );
block_residual_write_cabac( h, cb, DCT_CHROMA_DC, 25, h->dct.chroma_dc[0], 4 );
block_residual_write_cabac( h, cb, DCT_CHROMA_DC, 26, h->dct.chroma_dc[1], 4 );
}
if( h->mb.i_cbp_chroma&0x02 ) /* Chroma AC residual present */
{
......@@ -1078,15 +1072,29 @@ static void x264_partition_size_cabac( x264_t *h, x264_cabac_t *cb, int i8, int
if( h->mb.i_cbp_luma & (1 << i8) )
{
if( h->mb.b_transform_8x8 )
{
*(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[i8*4+0]] = 0x0101;
*(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[i8*4+2]] = 0x0101;
block_residual_write_cabac( h, cb, DCT_LUMA_8x8, i8, h->dct.luma8x8[i8], 64 );
}
else
{
int i4;
for( i4 = 0; i4 < 4; i4++ )
{
h->mb.cache.non_zero_count[x264_scan8[i4+i8*4]] = array_non_zero( h->dct.luma4x4[i4+i8*4] );
block_residual_write_cabac( h, cb, DCT_LUMA_4x4, i4+i8*4, h->dct.luma4x4[i4+i8*4], 16 );
}
}
}
else
{
*(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[i8*4+0]] = 0;
*(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[i8*4+2]] = 0;
}
h->mb.cache.non_zero_count[x264_scan8[16+i8]] = array_non_zero( h->dct.luma4x4[16+i8] );
h->mb.cache.non_zero_count[x264_scan8[20+i8]] = array_non_zero( h->dct.luma4x4[20+i8] );
block_residual_write_cabac( h, cb, DCT_CHROMA_AC, 16+i8, h->dct.luma4x4[16+i8]+1, 15 );
block_residual_write_cabac( h, cb, DCT_CHROMA_AC, 20+i8, h->dct.luma4x4[20+i8]+1, 15 );
......@@ -1097,12 +1105,14 @@ static void x264_partition_size_cabac( x264_t *h, x264_cabac_t *cb, int i8, int
static void x264_subpartition_size_cabac( x264_t *h, x264_cabac_t *cb, int i4, int i_pixel )
{
int b_8x4 = i_pixel == PIXEL_8x4;
h->mb.cache.non_zero_count[x264_scan8[i4]] = array_non_zero( h->dct.luma4x4[i4] );
block_residual_write_cabac( h, cb, DCT_LUMA_4x4, i4, h->dct.luma4x4[i4], 16 );
if( i_pixel == PIXEL_4x4 )
x264_cabac_mb_mvd( h, cb, 0, i4, 1, 1 );
else
{
x264_cabac_mb_mvd( h, cb, 0, i4, 1+b_8x4, 2-b_8x4 );
h->mb.cache.non_zero_count[x264_scan8[i4+2-b_8x4]] = array_non_zero( h->dct.luma4x4[i4+2-b_8x4] );
block_residual_write_cabac( h, cb, DCT_LUMA_4x4, i4+2-b_8x4, h->dct.luma4x4[i4+2-b_8x4], 16 );
}
}
......@@ -1115,9 +1125,9 @@ static void x264_partition_i8x8_size_cabac( x264_t *h, x264_cabac_t *cb, int i8,
x264_cabac_mb_intra4x4_pred_mode( cb, i_pred, i_mode );
if( nnz )
{
block_residual_write_cabac( h, cb, DCT_LUMA_8x8, 4*i8, h->dct.luma8x8[i8], 64 );
*(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[i8*4]] = 0x0101;
*(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[i8*4+2]] = 0x0101;
block_residual_write_cabac( h, cb, DCT_LUMA_8x8, 4*i8, h->dct.luma8x8[i8], 64 );
}
else
{
......@@ -1131,8 +1141,8 @@ static void x264_partition_i4x4_size_cabac( x264_t *h, x264_cabac_t *cb, int i4,
const int i_pred = x264_mb_predict_intra4x4_mode( h, i4 );
i_mode = x264_mb_pred_mode4x4_fix( i_mode );
x264_cabac_mb_intra4x4_pred_mode( cb, i_pred, i_mode );
block_residual_write_cabac( h, cb, DCT_LUMA_4x4, i4, h->dct.luma4x4[i4], 16 );
h->mb.cache.non_zero_count[x264_scan8[i4]] = array_non_zero( h->dct.luma4x4[i4] );
block_residual_write_cabac( h, cb, DCT_LUMA_4x4, i4, h->dct.luma4x4[i4], 16 );
}
static void x264_i8x8_chroma_size_cabac( x264_t *h, x264_cabac_t *cb )
......@@ -1141,8 +1151,8 @@ static void x264_i8x8_chroma_size_cabac( x264_t *h, x264_cabac_t *cb )
x264_cabac_mb_cbp_chroma( h, cb );
if( h->mb.i_cbp_chroma > 0 )
{
block_residual_write_cabac( h, cb, DCT_CHROMA_DC, 0, h->dct.chroma_dc[0], 4 );
block_residual_write_cabac( h, cb, DCT_CHROMA_DC, 1, h->dct.chroma_dc[1], 4 );
block_residual_write_cabac( h, cb, DCT_CHROMA_DC, 25, h->dct.chroma_dc[0], 4 );
block_residual_write_cabac( h, cb, DCT_CHROMA_DC, 26, h->dct.chroma_dc[1], 4 );
if( h->mb.i_cbp_chroma == 2 )
{
......
......@@ -55,9 +55,6 @@ static const uint8_t sub_mb_type_b_to_golomb[13]=
10, 4, 5, 1, 11, 6, 7, 2, 12, 8, 9, 3, 0
};
#define BLOCK_INDEX_CHROMA_DC (-1)
#define BLOCK_INDEX_LUMA_DC (-2)
static inline void bs_write_vlc( bs_t *s, vlc_t v )
{
bs_write( s, v.i_size, v.i_bits );
......@@ -66,71 +63,59 @@ static inline void bs_write_vlc( bs_t *s, vlc_t v )
/****************************************************************************
* block_residual_write_cavlc:
****************************************************************************/
static void block_residual_write_cavlc( x264_t *h, bs_t *s, int i_idx, int16_t *l, int i_count )
static void block_residual_write_cavlc( x264_t *h, bs_t *s, int i_ctxBlockCat, int i_idx, int16_t *l, int i_count )
{
static const int ct_index[17] = {0,0,1,1,2,2,2,2,3,3,3,3,3,3,3,3,3};
int level[16], run[16];
int i_total, i_trailing;
int i_total_zero;
int i_last;
unsigned int i_sign;
int i;
int idx = 0;
int i_suffix_length;
/* x264_mb_predict_non_zero_code return 0 <-> (16+16+1)>>1 = 16 */
int nC = i_idx >= 25 ? 4 : ct_index[x264_mb_predict_non_zero_code( h, i_idx == 24 ? 0 : i_idx )];
/* first find i_last */
for( i_last = i_count-1; i_last >= 3; i_last -= 4 )
if( *(uint64_t*)(l+i_last-3) )
break;
while( i_last >= 0 && l[i_last] == 0 )
i_last--;
if( !h->mb.cache.non_zero_count[x264_scan8[i_idx]] )
{
bs_write_vlc( s, x264_coeff_token[nC][0] );
return;
}
i_last = h->quantf.coeff_last[i_ctxBlockCat](l);
i_sign = 0;
i_total = 0;
i_trailing = 0;
i_total_zero = i_last + 1;
if( i_last >= 0 )
/* level and run and total */
while( i_last >= 0 )
{
int idx = 0;
/* level and run and total */
while( i_last >= 0 )
{
int r = 0;
level[idx] = l[i_last];
while( --i_last >= 0 && l[i_last] == 0 )
r++;
run[idx++] = r;
}
int r = 0;
level[idx] = l[i_last];
while( --i_last >= 0 && l[i_last] == 0 )
r++;
run[idx++] = r;
}
i_total = idx;
i_total_zero -= idx;
i_total = idx;
i_total_zero -= idx;
i_trailing = X264_MIN(3, idx);
for( idx = 0; idx < i_trailing; idx++ )
i_trailing = X264_MIN(3, idx);
for( idx = 0; idx < i_trailing; idx++ )
{
if( (unsigned)(level[idx]+1) > 2 )
{
if( (unsigned)(level[idx]+1) > 2 )
{
i_trailing = idx;
break;
}
i_sign <<= 1;
i_sign |= level[idx] < 0;
i_trailing = idx;
break;
}
i_sign <<= 1;
i_sign |= level[idx] < 0;
}
/* total/trailing */
if( i_idx == BLOCK_INDEX_CHROMA_DC )
bs_write_vlc( s, x264_coeff_token[4][i_total*4+i_trailing] );
else
{
/* x264_mb_predict_non_zero_code return 0 <-> (16+16+1)>>1 = 16 */
static const int ct_index[17] = {0,0,1,1,2,2,2,2,3,3,3,3,3,3,3,3,3 };
int nC = x264_mb_predict_non_zero_code( h, i_idx == BLOCK_INDEX_LUMA_DC ? 0 : i_idx );
bs_write_vlc( s, x264_coeff_token[ct_index[nC]][i_total*4+i_trailing] );
}
if( i_total <= 0 )
return;
bs_write_vlc( s, x264_coeff_token[nC][i_total*4+i_trailing] );
i_suffix_length = i_total > 10 && i_trailing < 3 ? 1 : 0;
if( i_trailing > 0 )
......@@ -194,7 +179,7 @@ static void block_residual_write_cavlc( x264_t *h, bs_t *s, int i_idx, int16_t *
if( i_total < i_count )
{
if( i_idx == BLOCK_INDEX_CHROMA_DC )
if( i_idx >= 25 )
bs_write_vlc( s, x264_total_zeros_dc[i_total-1][i_total_zero] );
else
bs_write_vlc( s, x264_total_zeros[i_total-1][i_total_zero] );
......@@ -214,7 +199,7 @@ static void cavlc_qp_delta( x264_t *h, bs_t *s )
/* Avoid writing a delta quant if we have an empty i16x16 block, e.g. in a completely flat background area */
if( h->mb.i_type == I_16x16 && !(h->mb.i_cbp_luma | h->mb.i_cbp_chroma)
&& !array_non_zero(h->dct.luma16x16_dc) )
&& !h->mb.cache.non_zero_count[x264_scan8[24]] )
{
#if !RDO_SKIP_BS
h->mb.i_qp = h->mb.i_last_qp;
......@@ -291,7 +276,7 @@ static inline void x264_macroblock_luma_write_cavlc( x264_t *h, bs_t *s, int i8s
for( i4 = 0; i4 < 4; i4++ )
{
h->mb.cache.non_zero_count[x264_scan8[i4+i8*4]] = array_non_zero_count( h->dct.luma4x4[i4+i8*4] );
block_residual_write_cavlc( h, s, i4+i8*4, h->dct.luma4x4[i4+i8*4], 16 );
block_residual_write_cavlc( h, s, DCT_LUMA_4x4, i4+i8*4, h->dct.luma4x4[i4+i8*4], 16 );
}
}
......@@ -604,14 +589,14 @@ void x264_macroblock_write_cavlc( x264_t *h, bs_t *s )
cavlc_qp_delta( h, s );
/* DC Luma */
block_residual_write_cavlc( h, s, BLOCK_INDEX_LUMA_DC , h->dct.luma16x16_dc, 16 );
block_residual_write_cavlc( h, s, DCT_LUMA_DC, 24 , h->dct.luma16x16_dc, 16 );
/* AC Luma */
if( h->mb.i_cbp_luma )
for( i = 0; i < 16; i++ )
{
h->mb.cache.non_zero_count[x264_scan8[i]] = array_non_zero_count( h->dct.luma4x4[i] );
block_residual_write_cavlc( h, s, i, h->dct.luma4x4[i]+1, 15 );
block_residual_write_cavlc( h, s, DCT_LUMA_AC, i, h->dct.luma4x4[i]+1, 15 );
}
}
else if( h->mb.i_cbp_luma | h->mb.i_cbp_chroma )
......@@ -622,13 +607,13 @@ void x264_macroblock_write_cavlc( x264_t *h, bs_t *s )
if( h->mb.i_cbp_chroma )
{
/* Chroma DC residual present */
block_residual_write_cavlc( h, s, BLOCK_INDEX_CHROMA_DC, h->dct.chroma_dc[0], 4 );
block_residual_write_cavlc( h, s, BLOCK_INDEX_CHROMA_DC, h->dct.chroma_dc[1], 4 );
block_residual_write_cavlc( h, s, DCT_CHROMA_DC, 25, h->dct.chroma_dc[0], 4 );
block_residual_write_cavlc( h, s, DCT_CHROMA_DC, 26, h->dct.chroma_dc[1], 4 );
if( h->mb.i_cbp_chroma&0x02 ) /* Chroma AC residual present */
for( i = 16; i < 24; i++ )
{
h->mb.cache.non_zero_count[x264_scan8[i]] = array_non_zero_count( h->dct.luma4x4[i] );
block_residual_write_cavlc( h, s, i, h->dct.luma4x4[i]+1, 15 );
block_residual_write_cavlc( h, s, DCT_CHROMA_AC, i, h->dct.luma4x4[i]+1, 15 );
}
}
......@@ -678,9 +663,9 @@ static int x264_partition_size_cavlc( x264_t *h, int i8, int i_pixel )
{
x264_macroblock_luma_write_cavlc( h, &s, i8, i8 );
h->mb.cache.non_zero_count[x264_scan8[16+i8]] = array_non_zero_count( h->dct.luma4x4[16+i8] );
block_residual_write_cavlc( h, &s, 16+i8, h->dct.luma4x4[16+i8]+1, 15 );
block_residual_write_cavlc( h, &s, DCT_CHROMA_AC, 16+i8, h->dct.luma4x4[16+i8]+1, 15 );
h->mb.cache.non_zero_count[x264_scan8[20+i8]] = array_non_zero_count( h->dct.luma4x4[20+i8] );
block_residual_write_cavlc( h, &s, 20+i8, h->dct.luma4x4[20+i8]+1, 15 );
block_residual_write_cavlc( h, &s, DCT_CHROMA_AC, 20+i8, h->dct.luma4x4[20+i8]+1, 15 );
i8 += x264_pixel_size[i_pixel].h >> 3;
}
......@@ -694,12 +679,12 @@ static int x264_subpartition_size_cavlc( x264_t *h, int i4, int i_pixel )
s.i_bits_encoded = 0;
cavlc_mb_mvd( h, &s, 0, i4, 1+b_8x4 );
h->mb.cache.non_zero_count[x264_scan8[i4]] = array_non_zero_count( h->dct.luma4x4[i4] );
block_residual_write_cavlc( h, &s, i4, h->dct.luma4x4[i4], 16 );
block_residual_write_cavlc( h, &s, DCT_LUMA_4x4, i4, h->dct.luma4x4[i4], 16 );
if( i_pixel != PIXEL_4x4 )
{
i4 += 2-b_8x4;
h->mb.cache.non_zero_count[x264_scan8[i4]] = array_non_zero_count( h->dct.luma4x4[i4] );
block_residual_write_cavlc( h, &s, i4, h->dct.luma4x4[i4], 16 );
block_residual_write_cavlc( h, &s, DCT_LUMA_4x4, i4, h->dct.luma4x4[i4], 16 );
}
return s.i_bits_encoded;
......@@ -715,14 +700,13 @@ static int cavlc_intra4x4_pred_size( x264_t *h, int i4, int i_mode )
static int x264_partition_i8x8_size_cavlc( x264_t *h, int i8, int i_mode )
{
int i4, i;
int i4;
h->out.bs.i_bits_encoded = cavlc_intra4x4_pred_size( h, 4*i8, i_mode );
h->zigzagf.interleave_8x8_cavlc( h->dct.luma4x4[i8*4], h->dct.luma8x8[i8] );
for( i4 = 0; i4 < 4; i4++ )
{
for( i = 0; i < 16; i++ )
h->dct.luma4x4[i4+i8*4][i] = h->dct.luma8x8[i8][i4+i*4];
h->mb.cache.non_zero_count[x264_scan8[i4+i8*4]] = array_non_zero_count( h->dct.luma4x4[i4+i8*4] );
block_residual_write_cavlc( h, &h->out.bs, i4+i8*4, h->dct.luma4x4[i4+i8*4], 16 );
block_residual_write_cavlc( h, &h->out.bs, DCT_LUMA_4x4, i4+i8*4, h->dct.luma4x4[i4+i8*4], 16 );
}
return h->out.bs.i_bits_encoded;
}
......@@ -731,7 +715,7 @@ static int x264_partition_i4x4_size_cavlc( x264_t *h, int i4, int i_mode )
{
h->out.bs.i_bits_encoded = cavlc_intra4x4_pred_size( h, i4, i_mode );
h->mb.cache.non_zero_count[x264_scan8[i4]] = array_non_zero_count( h->dct.luma4x4[i4] );
block_residual_write_cavlc( h, &h->out.bs, i4, h->dct.luma4x4[i4], 16 );
block_residual_write_cavlc( h, &h->out.bs, DCT_LUMA_4x4, i4, h->dct.luma4x4[i4], 16 );
return h->out.bs.i_bits_encoded;
}
......@@ -740,8 +724,8 @@ static int x264_i8x8_chroma_size_cavlc( x264_t *h )
h->out.bs.i_bits_encoded = bs_size_ue( x264_mb_pred_mode8x8c_fix[ h->mb.i_chroma_pred_mode ] );
if( h->mb.i_cbp_chroma )
{
block_residual_write_cavlc( h, &h->out.bs, BLOCK_INDEX_CHROMA_DC, h->dct.chroma_dc[0], 4 );
block_residual_write_cavlc( h, &h->out.bs, BLOCK_INDEX_CHROMA_DC, h->dct.chroma_dc[1], 4 );
block_residual_write_cavlc( h, &h->out.bs, DCT_CHROMA_DC, 25, h->dct.chroma_dc[0], 4 );
block_residual_write_cavlc( h, &h->out.bs, DCT_CHROMA_DC, 26, h->dct.chroma_dc[1], 4 );
if( h->mb.i_cbp_chroma == 2 )
{
......@@ -749,7 +733,7 @@ static int x264_i8x8_chroma_size_cavlc( x264_t *h )
for( i = 16; i < 24; i++ )
{
h->mb.cache.non_zero_count[x264_scan8[i]] = array_non_zero_count( h->dct.luma4x4[i] );
block_residual_write_cavlc( h, &h->out.bs, i, h->dct.luma4x4[i]+1, 15 );
block_residual_write_cavlc( h, &h->out.bs, DCT_CHROMA_AC, i, h->dct.luma4x4[i]+1, 15 );
}
}
}
......
......@@ -277,9 +277,12 @@ void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qp )
h->mb.cache.non_zero_count[x264_scan8[16+i]] = nz;
h->mb.i_cbp_chroma |= nz;
}
h->mb.cache.non_zero_count[x264_scan8[25]] = array_non_zero( h->dct.chroma_dc[0] );
h->mb.cache.non_zero_count[x264_scan8[26]] = array_non_zero( h->dct.chroma_dc[1] );
if( h->mb.i_cbp_chroma )
h->mb.i_cbp_chroma = 2; /* dc+ac (we can't do only ac) */
else if( array_non_zero( h->dct.chroma_dc ) )
else if( h->mb.cache.non_zero_count[x264_scan8[25]] |
h->mb.cache.non_zero_count[x264_scan8[26]] )
h->mb.i_cbp_chroma = 1; /* dc only */
}
......@@ -643,6 +646,7 @@ void x264_macroblock_encode( x264_t *h )
h->mb.i_cbp_luma |= nz;
}
h->mb.i_cbp_luma *= 0xf;
h->mb.cache.non_zero_count[x264_scan8[24]] = array_non_zero( h->dct.luma16x16_dc );
}
else
{
......@@ -671,13 +675,14 @@ void x264_macroblock_encode( x264_t *h )
h->mb.i_cbp_luma |= cbp << i;
}
}
h->mb.cache.non_zero_count[x264_scan8[24]] = 0;
}
if( h->param.b_cabac )
{
i_cbp_dc = ( h->mb.i_type == I_16x16 && array_non_zero( h->dct.luma16x16_dc ) )
| array_non_zero( h->dct.chroma_dc[0] ) << 1
| array_non_zero( h->dct.chroma_dc[1] ) << 2;
i_cbp_dc = h->mb.cache.non_zero_count[x264_scan8[24]]
| h->mb.cache.non_zero_count[x264_scan8[25]] << 1
| h->mb.cache.non_zero_count[x264_scan8[26]] << 2;
}
/* store cbp */
......
......@@ -1125,7 +1125,7 @@ static int check_quant( int cpu_ref, int cpu_new )
}
report( "denoise dct :" );
#define TEST_DECIMATE( qname, decname, block, w, ac, thresh ) \
#define TEST_DECIMATE( decname, w, ac, thresh ) \
if( qf_a.decname != qf_ref.decname ) \
{ \
set_func_name( #decname ); \
......@@ -1152,11 +1152,46 @@ static int check_quant( int cpu_ref, int cpu_new )
}
ok = 1;
TEST_DECIMATE( quant_8x8, decimate_score64, CQM_8IY, 8, 0, 6 );
TEST_DECIMATE( quant_4x4, decimate_score16, CQM_4IY, 4, 0, 6 );
TEST_DECIMATE( quant_4x4, decimate_score15, CQM_4IY, 4, 1, 7 );
TEST_DECIMATE( decimate_score64, 8, 0, 6 );
TEST_DECIMATE( decimate_score16, 4, 0, 6 );
TEST_DECIMATE( decimate_score15, 4, 1, 7 );
report( "decimate_score :" );
#define TEST_LAST( last, lastname, w, ac ) \
if( qf_a.last != qf_ref.last ) \
{ \
set_func_name( #lastname ); \
used_asm = 1; \
for( i = 0; i < 100; i++ ) \
{ \
int result_c, result_a, idx, nnz=0; \
int max = rand() & (w*w-1); \
memset( dct1, 0, w*w*2 ); \
for( idx = ac; idx < max; idx++ ) \
nnz |= dct1[idx] = !(rand()&3) + (!(rand()&15))*rand(); \
if( !nnz ) \
dct1[ac] = 1; \
memcpy( dct2, dct1, w*w*2 ); \