diff --git a/common/quant.c b/common/quant.c index f5ec3efc98f348d5ea7f9db134d5fe335e2ff678..d7c8c30f2f03c9ae2cca1e8b5a10e05497fadfcd 100644 --- a/common/quant.c +++ b/common/quant.c @@ -208,6 +208,66 @@ static void x264_denoise_dct( int16_t *dct, uint32_t *sum, uint16_t *offset, int } } +/* (ref: JVT-B118) + * x264_mb_decimate_score: given dct coeffs it returns a score to see if we could empty this dct coeffs + * to 0 (low score means set it to null) + * Used in inter macroblock (luma and chroma) + * luma: for a 8x8 block: if score < 4 -> null + * for the complete mb: if score < 6 -> null + * chroma: for the complete mb: if score < 7 -> null + */ + +const uint8_t x264_decimate_table4[16] = { + 3,2,2,1,1,1,0,0,0,0,0,0,0,0,0,0 }; +const uint8_t x264_decimate_table8[64] = { + 3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1, + 1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 }; + +static int ALWAYS_INLINE x264_decimate_score_internal( int16_t *dct, int i_max ) +{ + const uint8_t *ds_table = (i_max == 64) ? x264_decimate_table8 : x264_decimate_table4; + int i_score = 0; + int idx = i_max - 1; + + /* Yes, dct[idx-1] is guaranteed to be 32-bit aligned. idx>=0 instead of 1 works correctly for the same reason */ + while( idx >= 0 && *(uint32_t*)&dct[idx-1] == 0 ) + idx -= 2; + if( idx >= 0 && dct[idx] == 0 ) + idx--; + while( idx >= 0 ) + { + int i_run; + + if( (unsigned)(dct[idx--] + 1) > 2 ) + return 9; + + i_run = 0; + while( idx >= 0 && dct[idx] == 0 ) + { + idx--; + i_run++; + } + i_score += ds_table[i_run]; + } + + return i_score; +} + +static int x264_decimate_score15( int16_t *dct ) +{ + return x264_decimate_score_internal( dct+1, 15 ); +} +static int x264_decimate_score16( int16_t *dct ) +{ + return x264_decimate_score_internal( dct, 16 ); +} +static int x264_decimate_score64( int16_t *dct ) +{ + return x264_decimate_score_internal( dct, 64 ); +} + void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf ) { pf->quant_8x8 = quant_8x8; @@ -219,6 +279,9 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf ) pf->dequant_8x8 = dequant_8x8; pf->denoise_dct = x264_denoise_dct; + pf->decimate_score15 = x264_decimate_score15; + pf->decimate_score16 = x264_decimate_score16; + pf->decimate_score64 = x264_decimate_score64; #ifdef HAVE_MMX if( cpu&X264_CPU_MMX ) @@ -242,6 +305,9 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf ) pf->quant_2x2_dc = x264_quant_2x2_dc_mmxext; #ifdef ARCH_X86 pf->quant_4x4_dc = x264_quant_4x4_dc_mmxext; + pf->decimate_score15 = x264_decimate_score15_mmxext; + pf->decimate_score16 = x264_decimate_score16_mmxext; + pf->decimate_score64 = x264_decimate_score64_mmxext; #endif } @@ -258,6 +324,9 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf ) pf->dequant_8x8 = x264_dequant_8x8_flat16_sse2; } pf->denoise_dct = x264_denoise_dct_sse2; + pf->decimate_score15 = x264_decimate_score15_sse2; + pf->decimate_score16 = x264_decimate_score16_sse2; + pf->decimate_score64 = x264_decimate_score64_sse2; } if( cpu&X264_CPU_SSSE3 ) @@ -267,6 +336,9 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf ) pf->quant_4x4 = x264_quant_4x4_ssse3; pf->quant_8x8 = x264_quant_8x8_ssse3; pf->denoise_dct = x264_denoise_dct_ssse3; + pf->decimate_score15 = x264_decimate_score15_ssse3; + pf->decimate_score16 = x264_decimate_score16_ssse3; + pf->decimate_score64 = x264_decimate_score64_ssse3; } #endif // HAVE_MMX diff --git a/common/quant.h b/common/quant.h index 0a9741b04ec73ffcb828e05cc8a12c4619648085..986d9f5dcc300e9791b88810efd17941a77b138f 100644 --- a/common/quant.h +++ b/common/quant.h @@ -34,6 +34,10 @@ typedef struct void (*dequant_8x8)( int16_t dct[8][8], int dequant_mf[6][8][8], int i_qp ); void (*denoise_dct)( int16_t *dct, uint32_t *sum, uint16_t *offset, int size ); + + int (*decimate_score15)( int16_t *dct ); + int (*decimate_score16)( int16_t *dct ); + int (*decimate_score64)( int16_t *dct ); } x264_quant_function_t; void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf ); diff --git a/common/x86/quant-a.asm b/common/x86/quant-a.asm index eb692b4f71013206a7b410f4872beb8ea45225bb..f89eaf69bf7e869c4688bd13ff4d0fe799628312 100644 --- a/common/x86/quant-a.asm +++ b/common/x86/quant-a.asm @@ -22,8 +22,10 @@ ;***************************************************************************** %include "x86inc.asm" +%include "x86util.asm" SECTION_RODATA +pb_1: times 16 db 1 pw_1: times 8 dw 1 pd_1: times 4 dd 1 @@ -54,6 +56,17 @@ dequant8_scale: DQM8 32, 28, 51, 30, 40, 38 DQM8 36, 32, 58, 34, 46, 43 +decimate_mask_table4: + db 0,3,2,6,2,5,5,9,1,5,4,8,5,8,8,12,1,4,4,8,4,7,7,11,4,8,7,11,8,11,11,15,1,4 + db 3,7,4,7,7,11,3,7,6,10,7,10,10,14,4,7,7,11,7,10,10,14,7,11,10,14,11,14,14 + db 18,0,4,3,7,3,6,6,10,3,7,6,10,7,10,10,14,3,6,6,10,6,9,9,13,6,10,9,13,10,13 + db 13,17,4,7,6,10,7,10,10,14,6,10,9,13,10,13,13,17,7,10,10,14,10,13,13,17,10 + db 14,13,17,14,17,17,21,0,3,3,7,3,6,6,10,2,6,5,9,6,9,9,13,3,6,6,10,6,9,9,13 + db 6,10,9,13,10,13,13,17,3,6,5,9,6,9,9,13,5,9,8,12,9,12,12,16,6,9,9,13,9,12 + db 12,16,9,13,12,16,13,16,16,20,3,7,6,10,6,9,9,13,6,10,9,13,10,13,13,17,6,9 + db 9,13,9,12,12,16,9,13,12,16,13,16,16,20,7,10,9,13,10,13,13,17,9,13,12,16 + db 13,16,16,20,10,13,13,17,13,16,16,20,13,17,16,20,17,20,20,24 + SECTION .text %macro QUANT_DC_START 0 @@ -379,3 +392,215 @@ DENOISE_DCT sse2 %define PABSW PABSW_SSSE3 %define PSIGNW PSIGNW_SSSE3 DENOISE_DCT ssse3 + + + +;----------------------------------------------------------------------------- +; int x264_decimate_score( int16_t *dct ) +;----------------------------------------------------------------------------- + +%macro DECIMATE_MASK_SSE2 6 +%ifidn %5, ssse3 + pabsw xmm0, [%3+ 0] + pabsw xmm1, [%3+16] +%else + movdqa xmm0, [%3+ 0] + movdqa xmm1, [%3+16] + ABS2_MMX xmm0, xmm1, xmm3, xmm4 +%endif + packsswb xmm0, xmm1 + pxor xmm2, xmm2 + pcmpeqb xmm2, xmm0 + pcmpgtb xmm0, %4 + pmovmskb %1, xmm2 + pmovmskb %2, xmm0 +%endmacro + +%macro DECIMATE_MASK_MMX 6 + movq mm0, [%3+ 0] + movq mm1, [%3+ 8] + movq mm2, [%3+16] + movq mm3, [%3+24] + ABS2_MMX mm0, mm1, mm4, mm5 + ABS2_MMX mm2, mm3, mm4, mm5 + packsswb mm0, mm1 + packsswb mm2, mm3 + pxor mm4, mm4 + pxor mm5, mm5 + pcmpeqb mm4, mm0 + pcmpeqb mm5, mm2 + pcmpgtb mm0, %4 + pcmpgtb mm2, %4 + pmovmskb %6, mm4 + pmovmskb %1, mm5 + shl %1, 8 + or %1, %6 + pmovmskb %6, mm0 + pmovmskb %2, mm2 + shl %2, 8 + or %2, %6 +%endmacro + +cextern x264_decimate_table4 +cextern x264_decimate_table8 + +%macro DECIMATE4x4 2 + +;A LUT is faster than bsf on AMD processors, and no slower on Intel +;This is not true for score64. +cglobal x264_decimate_score%1_%2, 1,3 +%ifdef PIC + lea r10, [x264_decimate_table4 GLOBAL] + lea r11, [decimate_mask_table4 GLOBAL] + %define table r10 + %define mask_table r11 +%else + %define table x264_decimate_table4 + %define mask_table decimate_mask_table4 +%endif + DECIMATE_MASK edx, eax, r0, [pb_1 GLOBAL], %2, ecx + xor edx, 0xffff + je .ret + test eax, eax + jne .ret9 +%if %1==15 + shr edx, 1 +%endif + movzx ecx, dl + movzx eax, byte [mask_table + rcx] + cmp edx, ecx + je .ret + bsr ecx, ecx + shr edx, 1 + shr edx, cl + bsf ecx, edx + shr edx, 1 + shr edx, cl + add al, byte [table + rcx] + add al, byte [mask_table + rdx] +.ret: + REP_RET +.ret9: + mov eax, 9 + RET + +%endmacro + +%ifndef ARCH_X86_64 +%define DECIMATE_MASK DECIMATE_MASK_MMX +DECIMATE4x4 15, mmxext +DECIMATE4x4 16, mmxext +%endif +%define DECIMATE_MASK DECIMATE_MASK_SSE2 +DECIMATE4x4 15, sse2 +DECIMATE4x4 15, ssse3 +DECIMATE4x4 16, sse2 +DECIMATE4x4 16, ssse3 + +%macro DECIMATE8x8 1 + +%ifdef ARCH_X86_64 +cglobal x264_decimate_score64_%1, 1,4 +%ifdef PIC + lea r10, [x264_decimate_table8 GLOBAL] + %define table r10 +%else + %define table x264_decimate_table8 +%endif + mova m7, [pb_1 GLOBAL] + DECIMATE_MASK r1d, eax, r0, m7, %1, null + test eax, eax + jne .ret9 + DECIMATE_MASK r2d, eax, r0+32, m7, %1, null + shl r2d, 16 + or r1d, r2d + DECIMATE_MASK r2d, r3d, r0+64, m7, %1, null + shl r2, 32 + or eax, r3d + or r1, r2 + DECIMATE_MASK r2d, r3d, r0+96, m7, %1, null + shl r2, 48 + or r1, r2 + not r1 + test r1, r1 + je .ret + or eax, r3d + jne .ret9 +.loop: + bsf rcx, r1 + shr r1, cl + movzx ecx, byte [table + rcx] + add eax, ecx + shr r1, 1 + jne .loop +.ret: + REP_RET +.ret9: + mov eax, 9 + RET + +%else ; ARCH +%ifidn %1, mmxext +cglobal x264_decimate_score64_%1, 1,6 +%else +cglobal x264_decimate_score64_%1, 1,5 +%endif + mova m7, [pb_1 GLOBAL] + DECIMATE_MASK r3, r2, r0, m7, %1, r5 + test r2, r2 + jne .ret9 + DECIMATE_MASK r4, r2, r0+32, m7, %1, r5 + shl r4, 16 + or r3, r4 + DECIMATE_MASK r4, r1, r0+64, m7, %1, r5 + or r2, r1 + DECIMATE_MASK r1, r0, r0+96, m7, %1, r5 + shl r1, 16 + or r4, r1 + not r3 + not r4 + mov r1, r3 + or r1, r4 + je .ret + or r0, r2 + jne .ret9 ;r2 is zero at this point, so we don't need to zero it +.loop: + bsf ecx, r3 + test r3, r3 + je .largerun + shrd r3, r4, cl + shr r4, cl + movzx ecx, byte [x264_decimate_table8 + ecx] + add r0, ecx + shrd r3, r4, 1 + shr r4, 1 + mov r2, r3 + or r2, r4 + jne .loop +.ret: + REP_RET +.ret9: + mov eax, 9 + RET +.largerun: + mov r3, r4 + xor r4, r4 + bsf ecx, r3 + shr r3, cl + shr r3, 1 + jne .loop + REP_RET +%endif ; ARCH + +%endmacro + +%ifndef ARCH_X86_64 +INIT_MMX +%define DECIMATE_MASK DECIMATE_MASK_MMX +DECIMATE8x8 mmxext +%endif +INIT_XMM +%define DECIMATE_MASK DECIMATE_MASK_SSE2 +DECIMATE8x8 sse2 +DECIMATE8x8 ssse3 + diff --git a/common/x86/quant.h b/common/x86/quant.h index 51da6cdfcdc95fc58f8486fcfb62de922d9d6a81..ed64d60a7e3e4c39cc08ba8b4c090f5eaba566d6 100644 --- a/common/x86/quant.h +++ b/common/x86/quant.h @@ -46,5 +46,14 @@ void x264_dequant_8x8_flat16_sse2( int16_t dct[8][8], int dequant_mf[6][8][8], i void x264_denoise_dct_mmx( int16_t *dct, uint32_t *sum, uint16_t *offset, int size ); void x264_denoise_dct_sse2( int16_t *dct, uint32_t *sum, uint16_t *offset, int size ); void x264_denoise_dct_ssse3( int16_t *dct, uint32_t *sum, uint16_t *offset, int size ); +int x264_decimate_score15_mmxext( int16_t *dct ); +int x264_decimate_score15_sse2 ( int16_t *dct ); +int x264_decimate_score15_ssse3 ( int16_t *dct ); +int x264_decimate_score16_mmxext( int16_t *dct ); +int x264_decimate_score16_sse2 ( int16_t *dct ); +int x264_decimate_score16_ssse3 ( int16_t *dct ); +int x264_decimate_score64_mmxext( int16_t *dct ); +int x264_decimate_score64_sse2 ( int16_t *dct ); +int x264_decimate_score64_ssse3 ( int16_t *dct ); #endif diff --git a/encoder/macroblock.c b/encoder/macroblock.c index c647961bcae6afa6a2f08d43ab028f3c7134f388..3bb4e0dd008c9502c8c0f44b423d00fa5c3ae604 100644 --- a/encoder/macroblock.c +++ b/encoder/macroblock.c @@ -35,50 +35,6 @@ static inline void zigzag_scan_2x2_dc( int16_t level[4], int16_t dct[2][2] ) } #undef ZIG -/* (ref: JVT-B118) - * x264_mb_decimate_score: given dct coeffs it returns a score to see if we could empty this dct coeffs - * to 0 (low score means set it to null) - * Used in inter macroblock (luma and chroma) - * luma: for a 8x8 block: if score < 4 -> null - * for the complete mb: if score < 6 -> null - * chroma: for the complete mb: if score < 7 -> null - */ -static int x264_mb_decimate_score( int16_t *dct, int i_max ) -{ - static const int i_ds_table4[16] = { - 3,2,2,1,1,1,0,0,0,0,0,0,0,0,0,0 }; - static const int i_ds_table8[64] = { - 3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1, - 1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 }; - - const int *ds_table = (i_max == 64) ? i_ds_table8 : i_ds_table4; - int i_score = 0; - int idx = i_max - 1; - - while( idx >= 0 && dct[idx] == 0 ) - idx--; - - while( idx >= 0 ) - { - int i_run; - - if( (unsigned)(dct[idx--] + 1) > 2 ) - return 9; - - i_run = 0; - while( idx >= 0 && dct[idx] == 0 ) - { - idx--; - i_run++; - } - i_score += ds_table[i_run]; - } - - return i_score; -} - static ALWAYS_INLINE void x264_quant_4x4( x264_t *h, int16_t dct[4][4], int i_qp, int i_ctxBlockCat, int b_intra, int idx ) { int i_quant_cat = b_intra ? CQM_4IY : CQM_4PY; @@ -249,7 +205,7 @@ void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qp ) h->zigzagf.scan_4x4( h->dct.luma4x4[16+i+ch*4], dct4x4[i] ); if( b_decimate ) - i_decimate_score += x264_mb_decimate_score( h->dct.luma4x4[16+i+ch*4]+1, 15 ); + i_decimate_score += h->quantf.decimate_score15( h->dct.luma4x4[16+i+ch*4] ); } h->dctf.dct2x2dc( dct2x2 ); @@ -562,7 +518,7 @@ void x264_macroblock_encode( x264_t *h ) if( b_decimate ) { - int i_decimate_8x8 = x264_mb_decimate_score( h->dct.luma8x8[idx], 64 ); + int i_decimate_8x8 = h->quantf.decimate_score64( h->dct.luma8x8[idx] ); i_decimate_mb += i_decimate_8x8; if( i_decimate_8x8 < 4 ) nnz8x8[idx] = 0; @@ -606,7 +562,7 @@ void x264_macroblock_encode( x264_t *h ) h->zigzagf.scan_4x4( h->dct.luma4x4[idx], dct4x4[idx] ); if( b_decimate && i_decimate_8x8 <= 6 ) - i_decimate_8x8 += x264_mb_decimate_score( h->dct.luma4x4[idx], 16 ); + i_decimate_8x8 += h->quantf.decimate_score16( h->dct.luma4x4[idx] ); } /* decimate this 8x8 block */ @@ -762,7 +718,7 @@ int x264_macroblock_probe_skip( x264_t *h, const int b_bidir ) if( !array_non_zero(dct4x4[i4x4]) ) continue; h->zigzagf.scan_4x4( dctscan, dct4x4[i4x4] ); - i_decimate_mb += x264_mb_decimate_score( dctscan, 16 ); + i_decimate_mb += h->quantf.decimate_score16( dctscan ); if( i_decimate_mb >= 6 ) return 0; } @@ -804,11 +760,12 @@ int x264_macroblock_probe_skip( x264_t *h, const int b_bidir ) /* calculate dct coeffs */ for( i4x4 = 0, i_decimate_mb = 0; i4x4 < 4; i4x4++ ) { + dct4x4[i4x4][0][0] = 0; h->quantf.quant_4x4( dct4x4[i4x4], h->quant4_mf[CQM_4PC][i_qp], h->quant4_bias[CQM_4PC][i_qp] ); if( !array_non_zero(dct4x4[i4x4]) ) continue; h->zigzagf.scan_4x4( dctscan, dct4x4[i4x4] ); - i_decimate_mb += x264_mb_decimate_score( dctscan+1, 15 ); + i_decimate_mb += h->quantf.decimate_score15( dctscan ); if( i_decimate_mb >= 7 ) return 0; } @@ -897,7 +854,7 @@ void x264_macroblock_encode_p8x8( x264_t *h, int i8 ) h->zigzagf.scan_8x8( h->dct.luma8x8[i8], dct8x8 ); if( b_decimate && !h->mb.b_trellis ) - nnz8x8 = 4 <= x264_mb_decimate_score( h->dct.luma8x8[i8], 64 ); + nnz8x8 = 4 <= h->quantf.decimate_score64( h->dct.luma8x8[i8] ); else nnz8x8 = array_non_zero( dct8x8 ); @@ -922,7 +879,7 @@ void x264_macroblock_encode_p8x8( x264_t *h, int i8 ) { int i_decimate_8x8 = 0; for( i4 = 0; i4 < 4 && i_decimate_8x8 < 4; i4++ ) - i_decimate_8x8 += x264_mb_decimate_score( h->dct.luma4x4[i8*4+i4], 16 ); + i_decimate_8x8 += h->quantf.decimate_score16( h->dct.luma4x4[i8*4+i4] ); nnz8x8 = 4 <= i_decimate_8x8; } else diff --git a/tools/checkasm.c b/tools/checkasm.c index 5f5004a51643058e9eeb17182811c07debe53fa4..89489e6436b1ea3f04568643613bf02ed5153eec 100644 --- a/tools/checkasm.c +++ b/tools/checkasm.c @@ -1108,6 +1108,37 @@ static int check_quant( int cpu_ref, int cpu_new ) } report( "denoise dct :" ); +#define TEST_DECIMATE( qname, decname, block, w, ac ) \ + if( qf_a.decname != qf_ref.decname ) \ + { \ + set_func_name( #decname ); \ + used_asm = 1; \ + for( i = 0; i < 100; i++ ) \ + { \ + int result_c, result_a, idx; \ + for( idx = 0; idx < w*w; idx++ ) \ + dct1[idx] = !(rand()&3) + (!(rand()&15))*(rand()&3); \ + if( ac ) \ + dct1[0] = 0; \ + memcpy( dct2, dct1, w*w*2 ); \ + result_c = call_c1( qf_c.decname, (void*)dct2 ); \ + result_a = call_a1( qf_a.decname, (void*)dct2 ); \ + if( result_c != result_a ) \ + { \ + ok = 0; \ + fprintf( stderr, #decname ": [FAILED]\n" ); \ + break; \ + } \ + call_c2( qf_c.decname, (void*)dct2 ); \ + call_a2( qf_a.decname, (void*)dct2 ); \ + } \ + } + + TEST_DECIMATE( quant_8x8, decimate_score64, CQM_8IY, 8, 0 ); + TEST_DECIMATE( quant_4x4, decimate_score16, CQM_4IY, 4, 0 ); + TEST_DECIMATE( quant_4x4, decimate_score15, CQM_4IY, 4, 1 ); + report( "decimate_score :" ); + return ret; }