Commit d1fbc652 authored by Fiona Glaser

Add assembly versions of decimate_score

3-7x faster decimation, 1-3% faster overall
parent 8d6b262d
...@@ -208,6 +208,66 @@ static void x264_denoise_dct( int16_t *dct, uint32_t *sum, uint16_t *offset, int ...@@ -208,6 +208,66 @@ static void x264_denoise_dct( int16_t *dct, uint32_t *sum, uint16_t *offset, int
} }
} }
/* (ref: JVT-B118)
 * x264_decimate_score{15,16,64}: given dct coeffs, return a score used to decide whether
 * those coeffs can be set to 0 (a low score means the block can be nulled)
 * Used in inter macroblocks (luma and chroma)
 * luma: for an 8x8 block: if score < 4 -> null
 *       for the complete mb: if score < 6 -> null
 * chroma: for the complete mb: if score < 7 -> null
 */
/* Score added per surviving coefficient of a 4x4 block (15 or 16 coefficients),
 * indexed by the number of zero coefficients found directly below it.
 * Has external linkage because the assembly code imports it (cextern). */
const uint8_t x264_decimate_table4[16] =
{
    3, 2, 2, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
/* Same lookup as the 4x4 table, for a 64-coefficient (8x8) block: the score
 * added per surviving coefficient, indexed by the zero run directly below it.
 * Has external linkage because the assembly code imports it (cextern). */
const uint8_t x264_decimate_table8[64] =
{
    3, 3, 3, 3, 2, 2, 2, 2,
    2, 2, 2, 2, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
/* Compute the decimation score of the i_max coefficients dct[0..i_max-1].
 *
 * dct   - coefficients; only indices 0..i_max-1 are read
 * i_max - number of coefficients; 64 selects x264_decimate_table8, any other
 *         value selects x264_decimate_table4
 *
 * Returns 9 as soon as a coefficient outside {-1, 0, +1} is found (9 exceeds
 * every threshold listed in the comment above: 4, 6 and 7). Otherwise returns
 * the sum, over every nonzero coefficient, of the table entry for the run of
 * zeros immediately below it. An all-zero block scores 0.
 */
static int ALWAYS_INLINE x264_decimate_score_internal( int16_t *dct, int i_max )
{
    const uint8_t *ds_table = (i_max == 64) ? x264_decimate_table8 : x264_decimate_table4;
    int i_score = 0;
    int idx = i_max - 1;

    /* Skip the trailing zeros one coefficient at a time. Testing two
     * coefficients at once through a uint32_t* cast is deliberately avoided:
     * it violates strict aliasing, depends on the caller's alignment, and for
     * the 15-coefficient case reads dct[-1], outside the range handed to us. */
    while( idx >= 0 && dct[idx] == 0 )
        idx--;

    while( idx >= 0 )
    {
        int i_run;

        /* Maps -1,0,+1 to 0,1,2; anything else wraps above 2 once unsigned. */
        if( (unsigned)(dct[idx--] + 1) > 2 )
            return 9;

        /* Count the zeros between this coefficient and the next lower one. */
        i_run = 0;
        while( idx >= 0 && dct[idx] == 0 )
        {
            idx--;
            i_run++;
        }
        i_score += ds_table[i_run];
    }

    return i_score;
}
/* Decimation score of a 4x4 block that ignores dct[0]: scores the 15
 * coefficients dct[1]..dct[15]. */
static int x264_decimate_score15( int16_t *dct )
{
    int16_t *ac_coefs = &dct[1];
    return x264_decimate_score_internal( ac_coefs, 15 );
}
/* Decimation score of all 16 coefficients of a 4x4 block. */
static int x264_decimate_score16( int16_t *dct )
{
    const int i_score = x264_decimate_score_internal( dct, 16 );
    return i_score;
}
/* Decimation score of all 64 coefficients of an 8x8 block. */
static int x264_decimate_score64( int16_t *dct )
{
    const int i_score = x264_decimate_score_internal( dct, 64 );
    return i_score;
}
void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf ) void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
{ {
pf->quant_8x8 = quant_8x8; pf->quant_8x8 = quant_8x8;
...@@ -219,6 +279,9 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf ) ...@@ -219,6 +279,9 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
pf->dequant_8x8 = dequant_8x8; pf->dequant_8x8 = dequant_8x8;
pf->denoise_dct = x264_denoise_dct; pf->denoise_dct = x264_denoise_dct;
pf->decimate_score15 = x264_decimate_score15;
pf->decimate_score16 = x264_decimate_score16;
pf->decimate_score64 = x264_decimate_score64;
#ifdef HAVE_MMX #ifdef HAVE_MMX
if( cpu&X264_CPU_MMX ) if( cpu&X264_CPU_MMX )
...@@ -242,6 +305,9 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf ) ...@@ -242,6 +305,9 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
pf->quant_2x2_dc = x264_quant_2x2_dc_mmxext; pf->quant_2x2_dc = x264_quant_2x2_dc_mmxext;
#ifdef ARCH_X86 #ifdef ARCH_X86
pf->quant_4x4_dc = x264_quant_4x4_dc_mmxext; pf->quant_4x4_dc = x264_quant_4x4_dc_mmxext;
pf->decimate_score15 = x264_decimate_score15_mmxext;
pf->decimate_score16 = x264_decimate_score16_mmxext;
pf->decimate_score64 = x264_decimate_score64_mmxext;
#endif #endif
} }
...@@ -258,6 +324,9 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf ) ...@@ -258,6 +324,9 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
pf->dequant_8x8 = x264_dequant_8x8_flat16_sse2; pf->dequant_8x8 = x264_dequant_8x8_flat16_sse2;
} }
pf->denoise_dct = x264_denoise_dct_sse2; pf->denoise_dct = x264_denoise_dct_sse2;
pf->decimate_score15 = x264_decimate_score15_sse2;
pf->decimate_score16 = x264_decimate_score16_sse2;
pf->decimate_score64 = x264_decimate_score64_sse2;
} }
if( cpu&X264_CPU_SSSE3 ) if( cpu&X264_CPU_SSSE3 )
...@@ -267,6 +336,9 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf ) ...@@ -267,6 +336,9 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
pf->quant_4x4 = x264_quant_4x4_ssse3; pf->quant_4x4 = x264_quant_4x4_ssse3;
pf->quant_8x8 = x264_quant_8x8_ssse3; pf->quant_8x8 = x264_quant_8x8_ssse3;
pf->denoise_dct = x264_denoise_dct_ssse3; pf->denoise_dct = x264_denoise_dct_ssse3;
pf->decimate_score15 = x264_decimate_score15_ssse3;
pf->decimate_score16 = x264_decimate_score16_ssse3;
pf->decimate_score64 = x264_decimate_score64_ssse3;
} }
#endif // HAVE_MMX #endif // HAVE_MMX
......
...@@ -34,6 +34,10 @@ typedef struct ...@@ -34,6 +34,10 @@ typedef struct
void (*dequant_8x8)( int16_t dct[8][8], int dequant_mf[6][8][8], int i_qp ); void (*dequant_8x8)( int16_t dct[8][8], int dequant_mf[6][8][8], int i_qp );
void (*denoise_dct)( int16_t *dct, uint32_t *sum, uint16_t *offset, int size ); void (*denoise_dct)( int16_t *dct, uint32_t *sum, uint16_t *offset, int size );
int (*decimate_score15)( int16_t *dct );
int (*decimate_score16)( int16_t *dct );
int (*decimate_score64)( int16_t *dct );
} x264_quant_function_t; } x264_quant_function_t;
void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf ); void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf );
......
...@@ -22,8 +22,10 @@ ...@@ -22,8 +22,10 @@
;***************************************************************************** ;*****************************************************************************
%include "x86inc.asm" %include "x86inc.asm"
%include "x86util.asm"
SECTION_RODATA SECTION_RODATA
pb_1: times 16 db 1
pw_1: times 8 dw 1 pw_1: times 8 dw 1
pd_1: times 4 dd 1 pd_1: times 4 dd 1
...@@ -54,6 +56,17 @@ dequant8_scale: ...@@ -54,6 +56,17 @@ dequant8_scale:
DQM8 32, 28, 51, 30, 40, 38 DQM8 32, 28, 51, 30, 40, 38
DQM8 36, 32, 58, 34, 46, 43 DQM8 36, 32, 58, 34, 46, 43
; decimate_mask_table4: byte lookup table used by the 4x4 decimate_score code,
; indexed by a zero-extended byte (0-255) in which bit i is set when
; coefficient i is nonzero.
; Each entry appears to be the summed x264_decimate_table4 score of that
; pattern, with the zero run of the lowest coefficient counted from bit 0
; (entry 0 = 0, entry 1 = 3, entry 2 = 2, entry 3 = 6, last entry = 24).
decimate_mask_table4:
db 0,3,2,6,2,5,5,9,1,5,4,8,5,8,8,12,1,4,4,8,4,7,7,11,4,8,7,11,8,11,11,15,1,4
db 3,7,4,7,7,11,3,7,6,10,7,10,10,14,4,7,7,11,7,10,10,14,7,11,10,14,11,14,14
db 18,0,4,3,7,3,6,6,10,3,7,6,10,7,10,10,14,3,6,6,10,6,9,9,13,6,10,9,13,10,13
db 13,17,4,7,6,10,7,10,10,14,6,10,9,13,10,13,13,17,7,10,10,14,10,13,13,17,10
db 14,13,17,14,17,17,21,0,3,3,7,3,6,6,10,2,6,5,9,6,9,9,13,3,6,6,10,6,9,9,13
db 6,10,9,13,10,13,13,17,3,6,5,9,6,9,9,13,5,9,8,12,9,12,12,16,6,9,9,13,9,12
db 12,16,9,13,12,16,13,16,16,20,3,7,6,10,6,9,9,13,6,10,9,13,10,13,13,17,6,9
db 9,13,9,12,12,16,9,13,12,16,13,16,16,20,7,10,9,13,10,13,13,17,9,13,12,16
db 13,16,16,20,10,13,13,17,13,16,16,20,13,17,16,20,17,20,20,24
SECTION .text SECTION .text
%macro QUANT_DC_START 0 %macro QUANT_DC_START 0
...@@ -379,3 +392,215 @@ DENOISE_DCT sse2 ...@@ -379,3 +392,215 @@ DENOISE_DCT sse2
%define PABSW PABSW_SSSE3 %define PABSW PABSW_SSSE3
%define PSIGNW PSIGNW_SSSE3 %define PSIGNW PSIGNW_SSSE3
DENOISE_DCT ssse3 DENOISE_DCT ssse3
;-----------------------------------------------------------------------------
; int x264_decimate_score( int16_t *dct )
;-----------------------------------------------------------------------------
; DECIMATE_MASK_SSE2 zeromask, gtmask, src, threshold, cpu, tmp
;   %1 = out: 16-bit mask, bit i set when coefficient i at [%3] is zero
;   %2 = out: 16-bit mask, bit i set when byte-packed |coefficient i| is
;        greater (signed byte compare) than the matching byte of %4
;   %3 = address of 16 int16 coefficients (32 bytes, read as two 16-byte loads)
;   %4 = per-byte threshold operand for pcmpgtb (callers in this file pass pb_1)
;   %5 = cpu name; "ssse3" selects pabsw, anything else uses ABS2_MMX
;   %6 = not referenced by this variant
; Both outputs are written only after all loads from [%3] are done.
; Uses xmm0-xmm2, plus xmm3/xmm4 as ABS2_MMX scratch on the non-ssse3 path.
%macro DECIMATE_MASK_SSE2 6
%ifidn %5, ssse3
pabsw xmm0, [%3+ 0]
pabsw xmm1, [%3+16]
%else
movdqa xmm0, [%3+ 0]
movdqa xmm1, [%3+16]
; ABS2_MMX is defined outside this file; presumably it leaves the absolute
; values in its first two operands -- confirm against x86util.asm
ABS2_MMX xmm0, xmm1, xmm3, xmm4
%endif
; narrow the 16 words to 16 signed-saturated bytes, keeping coefficient order
packsswb xmm0, xmm1
pxor xmm2, xmm2
; xmm2 = 0xff in every byte whose coefficient is zero
pcmpeqb xmm2, xmm0
; xmm0 = 0xff in every byte greater than the threshold
pcmpgtb xmm0, %4
pmovmskb %1, xmm2
pmovmskb %2, xmm0
%endmacro
; DECIMATE_MASK_MMX zeromask, gtmask, src, threshold, cpu, tmp
; MMX counterpart of DECIMATE_MASK_SSE2, processing the 16 coefficients as
; four 8-byte loads.
;   %1 = out: 16-bit mask, bit i set when coefficient i at [%3] is zero
;   %2 = out: 16-bit mask, bit i set when byte-packed |coefficient i| is
;        greater (signed byte compare) than the matching byte of %4
;   %3 = address of 16 int16 coefficients (32 bytes)
;   %4 = per-byte threshold operand for pcmpgtb
;   %5 = not referenced by this variant
;   %6 = scratch general-purpose register
; Both outputs are written only after all loads from [%3] are done.
; Uses mm0-mm5.
%macro DECIMATE_MASK_MMX 6
movq mm0, [%3+ 0]
movq mm1, [%3+ 8]
movq mm2, [%3+16]
movq mm3, [%3+24]
; ABS2_MMX is defined outside this file; presumably it leaves the absolute
; values in its first two operands -- confirm against x86util.asm
ABS2_MMX mm0, mm1, mm4, mm5
ABS2_MMX mm2, mm3, mm4, mm5
; mm0 = coefficients 0-7 as bytes, mm2 = coefficients 8-15 as bytes
packsswb mm0, mm1
packsswb mm2, mm3
pxor mm4, mm4
pxor mm5, mm5
pcmpeqb mm4, mm0
pcmpeqb mm5, mm2
pcmpgtb mm0, %4
pcmpgtb mm2, %4
; zero mask: low byte from coefficients 0-7, high byte from 8-15
pmovmskb %6, mm4
pmovmskb %1, mm5
shl %1, 8
or %1, %6
; greater-than mask, assembled the same way
pmovmskb %6, mm0
pmovmskb %2, mm2
shl %2, 8
or %2, %6
%endmacro
cextern x264_decimate_table4
cextern x264_decimate_table8
; DECIMATE4x4 size, cpu
;   %1 = 15 or 16 (number of coefficients scored), %2 = cpu suffix
; Emits x264_decimate_score<size>_<cpu>( int16_t *dct ), returning the score
; in eax (9 when any coefficient tested exceeds the pb_1 threshold).
; NOTE(review): for size 15 the masks are still built from all 16 coefficients
; at r0 and the ">1" test runs before bit 0 is dropped, so a dct[0] with
; magnitude above 1 yields 9. The callers/test visible in this commit zero
; dct[0] first -- confirm that holds for every caller.
%macro DECIMATE4x4 2
;A LUT is faster than bsf on AMD processors, and no slower on Intel
;This is not true for score64.
cglobal x264_decimate_score%1_%2, 1,3
%ifdef PIC
lea r10, [x264_decimate_table4 GLOBAL]
lea r11, [decimate_mask_table4 GLOBAL]
%define table r10
%define mask_table r11
%else
%define table x264_decimate_table4
%define mask_table decimate_mask_table4
%endif
; edx = zero mask, eax = mask of coefficients above the threshold
DECIMATE_MASK edx, eax, r0, [pb_1 GLOBAL], %2, ecx
; invert the low 16 bits: edx = mask of nonzero coefficients
xor edx, 0xffff
; all zero: eax is necessarily 0 here as well, so the score returned is 0
je .ret
test eax, eax
jne .ret9
%if %1==15
; drop the bit belonging to dct[0]
shr edx, 1
%endif
; score the low 8 coefficients with one table lookup
movzx ecx, dl
movzx eax, byte [mask_table + rcx]
; nothing set above bit 7: done
cmp edx, ecx
je .ret
; shift out everything up to and including the highest coefficient of the
; low byte
; NOTE(review): when the low byte is 0 this is bsr with a zero source, whose
; destination is architecturally undefined; the code appears to rely on ecx
; staying 0 -- confirm this is acceptable on all targeted CPUs
bsr ecx, ecx
shr edx, 1
shr edx, cl
; ecx = zero run below the next coefficient; score it, then shift it out
bsf ecx, edx
shr edx, 1
shr edx, cl
add al, byte [table + rcx]
; the remaining bits are scored with a second mask-table lookup
add al, byte [mask_table + rdx]
.ret:
REP_RET
.ret9:
mov eax, 9
RET
%endmacro
; Instantiate the 4x4 scorers. The mmxext versions are only assembled when
; ARCH_X86_64 is not defined; DECIMATE_MASK is re-pointed at the matching mask
; macro before each group.
%ifndef ARCH_X86_64
%define DECIMATE_MASK DECIMATE_MASK_MMX
DECIMATE4x4 15, mmxext
DECIMATE4x4 16, mmxext
%endif
%define DECIMATE_MASK DECIMATE_MASK_SSE2
DECIMATE4x4 15, sse2
DECIMATE4x4 15, ssse3
DECIMATE4x4 16, sse2
DECIMATE4x4 16, ssse3
; DECIMATE8x8 cpu
;   %1 = cpu suffix
; Emits x264_decimate_score64_<cpu>( int16_t *dct ), returning the score in eax
; (9 when any coefficient exceeds the pb_1 threshold). The 64 coefficients are
; turned into a 64-bit mask of nonzero positions, built 16 bits at a time with
; DECIMATE_MASK, then scanned from bit 0 upward with bsf. Each coefficient
; adds x264_decimate_table8[zero run below it].
%macro DECIMATE8x8 1
%ifdef ARCH_X86_64
cglobal x264_decimate_score64_%1, 1,4
%ifdef PIC
lea r10, [x264_decimate_table8 GLOBAL]
%define table r10
%else
%define table x264_decimate_table8
%endif
; m7 = per-byte threshold shared by all four mask computations
mova m7, [pb_1 GLOBAL]
; coefficients 0-15: zero mask -> r1d, above-threshold mask -> eax
DECIMATE_MASK r1d, eax, r0, m7, %1, null
test eax, eax
jne .ret9
; coefficients 16-31 (eax is rewritten; it was just checked to be 0)
DECIMATE_MASK r2d, eax, r0+32, m7, %1, null
shl r2d, 16
or r1d, r2d
; coefficients 32-47
DECIMATE_MASK r2d, r3d, r0+64, m7, %1, null
shl r2, 32
or eax, r3d
or r1, r2
; coefficients 48-63
DECIMATE_MASK r2d, r3d, r0+96, m7, %1, null
shl r2, 48
or r1, r2
; r1 = mask of nonzero coefficients
not r1
test r1, r1
; every coefficient is zero, so eax is 0 as well: return 0
je .ret
; fold in the last above-threshold mask; from here eax is the score, starting at 0
or eax, r3d
jne .ret9
.loop:
; rcx = zero run below the next coefficient
bsf rcx, r1
shr r1, cl
movzx ecx, byte [table + rcx]
add eax, ecx
; shift the coefficient itself out; finished once no bits remain
shr r1, 1
jne .loop
.ret:
REP_RET
.ret9:
mov eax, 9
RET
%else ; ARCH
%ifidn %1, mmxext
cglobal x264_decimate_score64_%1, 1,6
%else
cglobal x264_decimate_score64_%1, 1,5
%endif
; m7 = per-byte threshold shared by all four mask computations
mova m7, [pb_1 GLOBAL]
; coefficients 0-15: zero mask -> r3, above-threshold mask -> r2
DECIMATE_MASK r3, r2, r0, m7, %1, r5
test r2, r2
jne .ret9
; coefficients 16-31 (r2 is rewritten; it was just checked to be 0)
DECIMATE_MASK r4, r2, r0+32, m7, %1, r5
shl r4, 16
or r3, r4
; coefficients 32-47
DECIMATE_MASK r4, r1, r0+64, m7, %1, r5
or r2, r1
; coefficients 48-63: r0 is both the base pointer and the second output,
; which works because DECIMATE_MASK finishes its loads before writing %2
DECIMATE_MASK r1, r0, r0+96, m7, %1, r5
shl r1, 16
or r4, r1
; r3 = nonzero mask of coefficients 0-31, r4 = nonzero mask of 32-63
not r3
not r4
mov r1, r3
or r1, r4
je .ret
; any above-threshold coefficient -> 9; otherwise r0 and r2 are both zero,
; and r0 serves as the score accumulator from here on
or r0, r2
jne .ret9 ;r0 (and r2) are zero at this point, so the accumulator r0 needs no clearing
.loop:
bsf ecx, r3
; low half empty: the next coefficient lives in r4
test r3, r3
je .largerun
; shift the 64-bit pair r4:r3 right by the run length
shrd r3, r4, cl
shr r4, cl
movzx ecx, byte [x264_decimate_table8 + ecx]
add r0, ecx
; shift the coefficient itself out of the pair
shrd r3, r4, 1
shr r4, 1
mov r2, r3
or r2, r4
jne .loop
.ret:
REP_RET
.ret9:
mov eax, 9
RET
.largerun:
; the run is at least 32 zeros; x264_decimate_table8 is 0 for runs that long,
; so the coefficient is skipped without adding anything
mov r3, r4
xor r4, r4
bsf ecx, r3
shr r3, cl
shr r3, 1
jne .loop
REP_RET
%endif ; ARCH
%endmacro
; Instantiate the 8x8 scorers. The mmxext version is only assembled when
; ARCH_X86_64 is not defined. INIT_MMX / INIT_XMM choose the register set that
; the m7 alias inside DECIMATE8x8 refers to (both macros are defined outside
; this file).
%ifndef ARCH_X86_64
INIT_MMX
%define DECIMATE_MASK DECIMATE_MASK_MMX
DECIMATE8x8 mmxext
%endif
INIT_XMM
%define DECIMATE_MASK DECIMATE_MASK_SSE2
DECIMATE8x8 sse2
DECIMATE8x8 ssse3
...@@ -46,5 +46,14 @@ void x264_dequant_8x8_flat16_sse2( int16_t dct[8][8], int dequant_mf[6][8][8], i ...@@ -46,5 +46,14 @@ void x264_dequant_8x8_flat16_sse2( int16_t dct[8][8], int dequant_mf[6][8][8], i
void x264_denoise_dct_mmx( int16_t *dct, uint32_t *sum, uint16_t *offset, int size ); void x264_denoise_dct_mmx( int16_t *dct, uint32_t *sum, uint16_t *offset, int size );
void x264_denoise_dct_sse2( int16_t *dct, uint32_t *sum, uint16_t *offset, int size ); void x264_denoise_dct_sse2( int16_t *dct, uint32_t *sum, uint16_t *offset, int size );
void x264_denoise_dct_ssse3( int16_t *dct, uint32_t *sum, uint16_t *offset, int size ); void x264_denoise_dct_ssse3( int16_t *dct, uint32_t *sum, uint16_t *offset, int size );
/* Assembly implementations of the decimate_score functions.
 * The 15/16 variants score a 4x4 block and the 64 variant an 8x8 block.
 * The mmxext versions are only assembled when ARCH_X86_64 is not defined. */
int x264_decimate_score15_mmxext( int16_t *dct );
int x264_decimate_score15_sse2 ( int16_t *dct );
int x264_decimate_score15_ssse3 ( int16_t *dct );
int x264_decimate_score16_mmxext( int16_t *dct );
int x264_decimate_score16_sse2 ( int16_t *dct );
int x264_decimate_score16_ssse3 ( int16_t *dct );
int x264_decimate_score64_mmxext( int16_t *dct );
int x264_decimate_score64_sse2 ( int16_t *dct );
int x264_decimate_score64_ssse3 ( int16_t *dct );
#endif #endif
...@@ -35,50 +35,6 @@ static inline void zigzag_scan_2x2_dc( int16_t level[4], int16_t dct[2][2] ) ...@@ -35,50 +35,6 @@ static inline void zigzag_scan_2x2_dc( int16_t level[4], int16_t dct[2][2] )
} }
#undef ZIG #undef ZIG
/* (ref: JVT-B118)
* x264_mb_decimate_score: given dct coeffs it returns a score to see if we could empty this dct coeffs
* to 0 (low score means set it to null)
* Used in inter macroblock (luma and chroma)
* luma: for a 8x8 block: if score < 4 -> null
* for the complete mb: if score < 6 -> null
* chroma: for the complete mb: if score < 7 -> null
*/
static int x264_mb_decimate_score( int16_t *dct, int i_max )
{
    /* Score added per surviving coefficient, indexed by the zero run below it */
    static const int run_score_4x4[16] = {
        3,2,2,1,1,1,0,0,0,0,0,0,0,0,0,0 };
    static const int run_score_8x8[64] = {
        3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,
        1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 };

    const int *run_score = ( i_max == 64 ) ? run_score_8x8 : run_score_4x4;
    int pos = i_max - 1;
    int total = 0;

    /* Ignore the zeros at the top end of the block */
    for( ; pos >= 0 && dct[pos] == 0; pos-- )
        ;

    while( pos >= 0 )
    {
        const int level = dct[pos--];
        int zeros = 0;

        /* Any level other than -1, 0 or +1 rules decimation out */
        if( (unsigned)(level + 1) > 2 )
            return 9;

        /* Length of the zero run leading down to the next coefficient */
        for( ; pos >= 0 && dct[pos] == 0; pos-- )
            zeros++;

        total += run_score[zeros];
    }

    return total;
}
static ALWAYS_INLINE void x264_quant_4x4( x264_t *h, int16_t dct[4][4], int i_qp, int i_ctxBlockCat, int b_intra, int idx ) static ALWAYS_INLINE void x264_quant_4x4( x264_t *h, int16_t dct[4][4], int i_qp, int i_ctxBlockCat, int b_intra, int idx )
{ {
int i_quant_cat = b_intra ? CQM_4IY : CQM_4PY; int i_quant_cat = b_intra ? CQM_4IY : CQM_4PY;
...@@ -249,7 +205,7 @@ void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qp ) ...@@ -249,7 +205,7 @@ void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qp )
h->zigzagf.scan_4x4( h->dct.luma4x4[16+i+ch*4], dct4x4[i] ); h->zigzagf.scan_4x4( h->dct.luma4x4[16+i+ch*4], dct4x4[i] );
if( b_decimate ) if( b_decimate )
i_decimate_score += x264_mb_decimate_score( h->dct.luma4x4[16+i+ch*4]+1, 15 ); i_decimate_score += h->quantf.decimate_score15( h->dct.luma4x4[16+i+ch*4] );
} }
h->dctf.dct2x2dc( dct2x2 ); h->dctf.dct2x2dc( dct2x2 );
...@@ -562,7 +518,7 @@ void x264_macroblock_encode( x264_t *h ) ...@@ -562,7 +518,7 @@ void x264_macroblock_encode( x264_t *h )
if( b_decimate ) if( b_decimate )
{ {
int i_decimate_8x8 = x264_mb_decimate_score( h->dct.luma8x8[idx], 64 ); int i_decimate_8x8 = h->quantf.decimate_score64( h->dct.luma8x8[idx] );
i_decimate_mb += i_decimate_8x8; i_decimate_mb += i_decimate_8x8;
if( i_decimate_8x8 < 4 ) if( i_decimate_8x8 < 4 )
nnz8x8[idx] = 0; nnz8x8[idx] = 0;
...@@ -606,7 +562,7 @@ void x264_macroblock_encode( x264_t *h ) ...@@ -606,7 +562,7 @@ void x264_macroblock_encode( x264_t *h )
h->zigzagf.scan_4x4( h->dct.luma4x4[idx], dct4x4[idx] ); h->zigzagf.scan_4x4( h->dct.luma4x4[idx], dct4x4[idx] );
if( b_decimate && i_decimate_8x8 <= 6 ) if( b_decimate && i_decimate_8x8 <= 6 )
i_decimate_8x8 += x264_mb_decimate_score( h->dct.luma4x4[idx], 16 ); i_decimate_8x8 += h->quantf.decimate_score16( h->dct.luma4x4[idx] );
} }
/* decimate this 8x8 block */ /* decimate this 8x8 block */
...@@ -762,7 +718,7 @@ int x264_macroblock_probe_skip( x264_t *h, const int b_bidir ) ...@@ -762,7 +718,7 @@ int x264_macroblock_probe_skip( x264_t *h, const int b_bidir )
if( !array_non_zero(dct4x4[i4x4]) ) if( !array_non_zero(dct4x4[i4x4]) )
continue; continue;
h->zigzagf.scan_4x4( dctscan, dct4x4[i4x4] ); h->zigzagf.scan_4x4( dctscan, dct4x4[i4x4] );
i_decimate_mb += x264_mb_decimate_score( dctscan, 16 ); i_decimate_mb += h->quantf.decimate_score16( dctscan );
if( i_decimate_mb >= 6 ) if( i_decimate_mb >= 6 )
return 0; return 0;
} }
...@@ -804,11 +760,12 @@ int x264_macroblock_probe_skip( x264_t *h, const int b_bidir ) ...@@ -804,11 +760,12 @@ int x264_macroblock_probe_skip( x264_t *h, const int b_bidir )
/* calculate dct coeffs */ /* calculate dct coeffs */
for( i4x4 = 0, i_decimate_mb = 0; i4x4 < 4; i4x4++ ) for( i4x4 = 0, i_decimate_mb = 0; i4x4 < 4; i4x4++ )
{ {
dct4x4[i4x4][0][0] = 0;
h->quantf.quant_4x4( dct4x4[i4x4], h->quant4_mf[CQM_4PC][i_qp], h->quant4_bias[CQM_4PC][i_qp] ); h->quantf.quant_4x4( dct4x4[i4x4], h->quant4_mf[CQM_4PC][i_qp], h->quant4_bias[CQM_4PC][i_qp] );
if( !array_non_zero(dct4x4[i4x4]) ) if( !array_non_zero(dct4x4[i4x4]) )
continue; continue;
h->zigzagf.scan_4x4( dctscan, dct4x4[i4x4] ); h->zigzagf.scan_4x4( dctscan, dct4x4[i4x4] );
i_decimate_mb += x264_mb_decimate_score( dctscan+1, 15 ); i_decimate_mb += h->quantf.decimate_score15( dctscan );
if( i_decimate_mb >= 7 ) if( i_decimate_mb >= 7 )
return 0; return 0;
} }
...@@ -897,7 +854,7 @@ void x264_macroblock_encode_p8x8( x264_t *h, int i8 ) ...@@ -897,7 +854,7 @@ void x264_macroblock_encode_p8x8( x264_t *h, int i8 )
h->zigzagf.scan_8x8( h->dct.luma8x8[i8], dct8x8 ); h->zigzagf.scan_8x8( h->dct.luma8x8[i8], dct8x8 );
if( b_decimate && !h->mb.b_trellis ) if( b_decimate && !h->mb.b_trellis )
nnz8x8 = 4 <= x264_mb_decimate_score( h->dct.luma8x8[i8], 64 ); nnz8x8 = 4 <= h->quantf.decimate_score64( h->dct.luma8x8[i8] );
else else
nnz8x8 = array_non_zero( dct8x8 ); nnz8x8 = array_non_zero( dct8x8 );
...@@ -922,7 +879,7 @@ void x264_macroblock_encode_p8x8( x264_t *h, int i8 ) ...@@ -922,7 +879,7 @@ void x264_macroblock_encode_p8x8( x264_t *h, int i8 )
{ {
int i_decimate_8x8 = 0; int i_decimate_8x8 = 0;
for( i4 = 0; i4 < 4 && i_decimate_8x8 < 4; i4++ ) for( i4 = 0; i4 < 4 && i_decimate_8x8 < 4; i4++ )
i_decimate_8x8 += x264_mb_decimate_score( h->dct.luma4x4[i8*4+i4], 16 ); i_decimate_8x8 += h->quantf.decimate_score16( h->dct.luma4x4[i8*4+i4] );
nnz8x8 = 4 <= i_decimate_8x8; nnz8x8 = 4 <= i_decimate_8x8;
} }
else else
......
...@@ -1108,6 +1108,37 @@ static int check_quant( int cpu_ref, int cpu_new ) ...@@ -1108,6 +1108,37 @@ static int check_quant( int cpu_ref, int cpu_new )
} }
report( "denoise dct :" ); report( "denoise dct :" );
/* TEST_DECIMATE( qname, decname, block, w, ac )
 * Checks the decimate function `decname` of the table under test (qf_a)
 * against the C table (qf_c). Does nothing when qf_a and qf_ref hold the same
 * function pointer.
 * Runs 100 iterations: fills dct1 with w*w random coefficients, forces
 * dct1[0] to 0 when `ac` is nonzero, copies them to dct2 and compares the two
 * return values. A mismatch clears `ok`, prints "[FAILED]" and stops.
 * call_c2/call_a2 are defined elsewhere -- presumably the timing runs.
 * `qname` and `block` are not referenced by the expansion.
 * Relies on i, ok, used_asm, dct1, dct2 and the qf_* tables being in scope.
 * NOTE(review): the generated coefficients lie in 0..4, so negative values
 * (in particular -1) are never exercised by this test. */
#define TEST_DECIMATE( qname, decname, block, w, ac ) \
if( qf_a.decname != qf_ref.decname ) \
{ \
set_func_name( #decname ); \
used_asm = 1; \
for( i = 0; i < 100; i++ ) \
{ \
int result_c, result_a, idx; \
for( idx = 0; idx < w*w; idx++ ) \
dct1[idx] = !(rand()&3) + (!(rand()&15))*(rand()&3); \
if( ac ) \
dct1[0] = 0; \
memcpy( dct2, dct1, w*w*2 ); \
result_c = call_c1( qf_c.decname, (void*)dct2 ); \
result_a = call_a1( qf_a.decname, (void*)dct2 ); \
if( result_c != result_a ) \
{ \
ok = 0; \
fprintf( stderr, #decname ": [FAILED]\n" ); \
break; \
} \
call_c2( qf_c.decname, (void*)dct2 ); \
call_a2( qf_a.decname, (void*)dct2 ); \
} \
}