Commit 993c81e9 authored by Fiona Glaser

quant_4x4x4: quant one 8x8 block at a time

This reduces overhead and lets us use less branchy code for zigzag, dequant,
decimate, and so on.
Reorganize and optimize a lot of macroblock_encode using this new function.
~1-2% faster overall.

Includes NEON and x86 versions of the new function.
Using larger merged functions like this will also make wider SIMD, like
AVX2, more effective.
parent 5ee1d03a
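
For reference, the new entry point quantizes four consecutive 4x4 blocks (one 8x8 region's worth) in a single call and returns a 4-bit mask, bit j set if sub-block j kept any nonzero coefficient; callers then scan/dequant only the surviving sub-blocks. Below is a minimal, self-contained C sketch of that contract. It mirrors the C fallback quant_4x4x4() added in this commit; the typedefs, the sample mf/bias values, and the standalone main() are illustrative only, and __builtin_ctz stands in for the x264_ctz_4bit() helper also added here.

#include <stdint.h>
#include <stdio.h>

typedef int16_t  dctcoef;   /* stand-ins for x264's 8-bit-depth typedefs */
typedef uint16_t udctcoef;

/* Mirrors the C fallback quant_4x4x4() in this commit: quantize four
 * consecutive 4x4 blocks and return a 4-bit mask of which blocks kept any
 * nonzero coefficient. */
static int quant_4x4x4_ref( dctcoef dct[4][16], udctcoef mf[16], udctcoef bias[16] )
{
    int nza = 0;
    for( int j = 0; j < 4; j++ )
    {
        int nz = 0;
        for( int i = 0; i < 16; i++ )
        {
            /* Same arithmetic as QUANT_ONE: add the deadzone bias to the
             * magnitude, scale by mf, shift down, restore the sign. */
            if( dct[j][i] > 0 )
                dct[j][i] =  ( (bias[i] + dct[j][i]) * mf[i] ) >> 16;
            else
                dct[j][i] = -( ( (bias[i] - dct[j][i]) * mf[i] ) >> 16 );
            nz |= dct[j][i];
        }
        nza |= (!!nz) << j;
    }
    return nza;
}

int main( void )
{
    /* Illustrative inputs only: blocks 0 and 2 carry one coefficient each.
     * Values chosen so that bias*mf >> 16 == 0, i.e. zeros stay zero. */
    dctcoef  dct[4][16] = { { 300 }, { 0 }, { -500 }, { 0 } };
    udctcoef mf[16], bias[16];
    for( int i = 0; i < 16; i++ ) { mf[i] = 4096; bias[i] = 10; }

    int nz = quant_4x4x4_ref( dct, mf, bias );      /* expect mask 0x5 */
    /* Visit only the surviving sub-blocks, the same pattern the encoder
     * now uses via FOREACH_BIT over the returned mask. */
    for( int mask = nz; mask; mask &= mask - 1 )
        printf( "sub-block %d has nonzero coefficients\n", __builtin_ctz( mask ) );
    return 0;
}
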
......@@ -35,7 +35,7 @@ pmovmskb_byte:
.text
.macro QUANT_TWO bias0 bias1 mf0 mf1 mf2 mf3 load_mf=no
.macro QUANT_TWO bias0 bias1 mf0 mf1 mf2 mf3 mask load_mf=no
vadd.u16 q8, q8, \bias0
vadd.u16 q9, q9, \bias1
.ifc \load_mf, yes
......@@ -55,7 +55,7 @@ pmovmskb_byte:
veor q9, q9, q15
vsub.s16 q8, q8, q14
vsub.s16 q9, q9, q15
vorr \bias0, q8, q9
vorr \mask, q8, q9
vst1.64 {d16-d19}, [r0,:128]!
.endm
......@@ -89,7 +89,7 @@ function x264_quant_4x4_dc_neon
vabs.s16 q9, q15
vdup.16 q0, r2
vdup.16 q2, r1
QUANT_TWO q0, q0, d4, d5, d4, d5
QUANT_TWO q0, q0, d4, d5, d4, d5, q0
vorr d0, d0, d1
QUANT_END d0
.endfunc
......@@ -101,11 +101,50 @@ function x264_quant_4x4_neon
vabs.s16 q9, q15
vld1.64 {d0-d3}, [r2,:128]
vld1.64 {d4-d7}, [r1,:128]
QUANT_TWO q0, q1, d4, d5, d6, d7
QUANT_TWO q0, q1, d4, d5, d6, d7, q0
vorr d0, d0, d1
QUANT_END d0
.endfunc
// quant_4x4x4( int16_t dct[4][16], uint16_t mf[16], uint16_t bias[16] )
function x264_quant_4x4x4_neon
vld1.64 {d28-d31}, [r0,:128]
vabs.s16 q8, q14
vabs.s16 q9, q15
vld1.64 {d0-d3}, [r2,:128]
vld1.64 {d4-d7}, [r1,:128]
QUANT_TWO q0, q1, d4, d5, d6, d7, q4
vld1.64 {d28-d31}, [r0,:128]
vabs.s16 q8, q14
vabs.s16 q9, q15
QUANT_TWO q0, q1, d4, d5, d6, d7, q5
vld1.64 {d28-d31}, [r0,:128]
vabs.s16 q8, q14
vabs.s16 q9, q15
QUANT_TWO q0, q1, d4, d5, d6, d7, q6
vld1.64 {d28-d31}, [r0,:128]
vabs.s16 q8, q14
vabs.s16 q9, q15
QUANT_TWO q0, q1, d4, d5, d6, d7, q7
vorr d8, d8, d9
vorr d10, d10, d11
vorr d12, d12, d13
vorr d14, d14, d15
vmov r0, r1, d8
vmov r2, r3, d10
orrs r0, r1
movne r0, #1
orrs r2, r3
orrne r0, #2
vmov r1, r2, d12
vmov r3, ip, d14
orrs r1, r2
orrne r0, #4
orrs r3, ip
orrne r0, #8
bx lr
.endfunc
// quant_8x8( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] )
function x264_quant_8x8_neon
vld1.64 {d28-d31}, [r0,:128]
......@@ -113,13 +152,13 @@ function x264_quant_8x8_neon
vabs.s16 q9, q15
vld1.64 {d0-d3}, [r2,:128]!
vld1.64 {d4-d7}, [r1,:128]!
QUANT_TWO q0, q1, d4, d5, d6, d7
QUANT_TWO q0, q1, d4, d5, d6, d7, q0
.rept 3
vld1.64 {d28-d31}, [r0,:128]
vabs.s16 q8, q14
vabs.s16 q9, q15
vld1.64 {d2-d5}, [r2,:128]!
QUANT_TWO q1, q2, d4, d5, d6, d7, yes
QUANT_TWO q1, q2, d4, d5, d6, d7, q1, yes
vorr q0, q0, q1
.endr
vorr d0, d0, d1
......
......@@ -31,6 +31,7 @@ int x264_quant_2x2_dc_armv6( int16_t dct[4], int mf, int bias );
int x264_quant_2x2_dc_neon( int16_t dct[4], int mf, int bias );
int x264_quant_4x4_dc_neon( int16_t dct[16], int mf, int bias );
int x264_quant_4x4_neon( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] );
int x264_quant_4x4x4_neon( int16_t dct[4][16], uint16_t mf[16], uint16_t bias[16] );
int x264_quant_8x8_neon( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] );
void x264_dequant_4x4_dc_neon( int16_t dct[16], int dequant_mf[6][16], int i_qp );
......
......@@ -254,6 +254,13 @@ static ALWAYS_INLINE uint16_t endian_fix16( uint16_t x )
}
#endif
/* For values with 4 bits or less. */
static int ALWAYS_INLINE x264_ctz_4bit( uint32_t x )
{
static uint8_t lut[16] = {4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0};
return lut[x];
}
#if defined(__GNUC__) && (__GNUC__ > 3 || __GNUC__ == 3 && __GNUC_MINOR__ > 3)
#define x264_clz(x) __builtin_clz(x)
#define x264_ctz(x) __builtin_ctz(x)
......
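
As an aside, the x264_ctz_4bit() helper added above is just a 16-entry trailing-zero-count table for values that fit in 4 bits, such as the masks returned by quant_4x4x4(), with ctz(0) defined as 4. A standalone sanity check of the table, assuming a GCC-style __builtin_ctz for comparison:

#include <stdint.h>
#include <assert.h>

/* Same table as x264_ctz_4bit() above: trailing zero count of a 4-bit value. */
static const uint8_t lut[16] = {4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0};

int main( void )
{
    for( uint32_t x = 1; x < 16; x++ )
        assert( lut[x] == __builtin_ctz( x ) );   /* matches the generic ctz */
    assert( lut[0] == 4 );                        /* ctz(0) is undefined; the table returns 4 */
    return 0;
}
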
......@@ -63,6 +63,19 @@ static int quant_4x4( dctcoef dct[16], udctcoef mf[16], udctcoef bias[16] )
return !!nz;
}
static int quant_4x4x4( dctcoef dct[4][16], udctcoef mf[16], udctcoef bias[16] )
{
int nza = 0;
for( int j = 0; j < 4; j++ )
{
int nz = 0;
for( int i = 0; i < 16; i++ )
QUANT_ONE( dct[j][i], mf[i], bias[i] );
nza |= (!!nz)<<j;
}
return nza;
}
static int quant_4x4_dc( dctcoef dct[16], int mf, int bias )
{
int nz = 0;
......@@ -405,6 +418,7 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
{
pf->quant_8x8 = quant_8x8;
pf->quant_4x4 = quant_4x4;
pf->quant_4x4x4 = quant_4x4x4;
pf->quant_4x4_dc = quant_4x4_dc;
pf->quant_2x2_dc = quant_2x2_dc;
......@@ -464,6 +478,7 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
if( cpu&X264_CPU_SSE2 )
{
pf->quant_4x4 = x264_quant_4x4_sse2;
pf->quant_4x4x4 = x264_quant_4x4x4_sse2;
pf->quant_8x8 = x264_quant_8x8_sse2;
pf->quant_2x2_dc = x264_quant_2x2_dc_sse2;
pf->quant_4x4_dc = x264_quant_4x4_dc_sse2;
......@@ -501,6 +516,7 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
if( cpu&X264_CPU_SSSE3 )
{
pf->quant_4x4 = x264_quant_4x4_ssse3;
pf->quant_4x4x4 = x264_quant_4x4x4_ssse3;
pf->quant_8x8 = x264_quant_8x8_ssse3;
pf->quant_2x2_dc = x264_quant_2x2_dc_ssse3;
pf->quant_4x4_dc = x264_quant_4x4_dc_ssse3;
......@@ -520,6 +536,7 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
pf->quant_2x2_dc = x264_quant_2x2_dc_sse4;
pf->quant_4x4_dc = x264_quant_4x4_dc_sse4;
pf->quant_4x4 = x264_quant_4x4_sse4;
pf->quant_4x4x4 = x264_quant_4x4x4_sse4;
pf->quant_8x8 = x264_quant_8x8_sse4;
}
if( cpu&X264_CPU_AVX )
......@@ -543,6 +560,7 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
{
#if ARCH_X86
pf->quant_4x4 = x264_quant_4x4_mmx;
pf->quant_4x4x4 = x264_quant_4x4x4_mmx;
pf->quant_8x8 = x264_quant_8x8_mmx;
pf->dequant_4x4 = x264_dequant_4x4_mmx;
pf->dequant_4x4_dc = x264_dequant_4x4dc_mmx2;
......@@ -592,6 +610,7 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
{
pf->quant_4x4_dc = x264_quant_4x4_dc_sse2;
pf->quant_4x4 = x264_quant_4x4_sse2;
pf->quant_4x4x4 = x264_quant_4x4x4_sse2;
pf->quant_8x8 = x264_quant_8x8_sse2;
pf->dequant_4x4 = x264_dequant_4x4_sse2;
pf->dequant_4x4_dc = x264_dequant_4x4dc_sse2;
......@@ -631,6 +650,7 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
pf->quant_2x2_dc = x264_quant_2x2_dc_ssse3;
pf->quant_4x4_dc = x264_quant_4x4_dc_ssse3;
pf->quant_4x4 = x264_quant_4x4_ssse3;
pf->quant_4x4x4 = x264_quant_4x4x4_ssse3;
pf->quant_8x8 = x264_quant_8x8_ssse3;
pf->optimize_chroma_2x2_dc = x264_optimize_chroma_2x2_dc_ssse3;
pf->denoise_dct = x264_denoise_dct_ssse3;
......@@ -696,6 +716,7 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
pf->quant_2x2_dc = x264_quant_2x2_dc_neon;
pf->quant_4x4 = x264_quant_4x4_neon;
pf->quant_4x4_dc = x264_quant_4x4_dc_neon;
pf->quant_4x4x4 = x264_quant_4x4x4_neon;
pf->quant_8x8 = x264_quant_8x8_neon;
pf->dequant_4x4 = x264_dequant_4x4_neon;
pf->dequant_4x4_dc = x264_dequant_4x4_dc_neon;
......
......@@ -29,8 +29,9 @@
typedef struct
{
int (*quant_8x8)( dctcoef dct[64], udctcoef mf[64], udctcoef bias[64] );
int (*quant_4x4)( dctcoef dct[16], udctcoef mf[16], udctcoef bias[16] );
int (*quant_8x8) ( dctcoef dct[64], udctcoef mf[64], udctcoef bias[64] );
int (*quant_4x4) ( dctcoef dct[16], udctcoef mf[16], udctcoef bias[16] );
int (*quant_4x4x4)( dctcoef dct[4][16], udctcoef mf[16], udctcoef bias[16] );
int (*quant_4x4_dc)( dctcoef dct[16], int mf, int bias );
int (*quant_2x2_dc)( dctcoef dct[4], int mf, int bias );
......
......@@ -175,7 +175,7 @@ cextern pd_1024
%endif ; cpuflag
%endmacro
%macro QUANT_ONE_AC_MMX 4
%macro QUANT_ONE_AC_MMX 5
mova m0, [%1]
mova m2, [%2]
ABSD m1, m0
......@@ -191,10 +191,10 @@ cextern pd_1024
psrad m1, 16
PSIGND m1, m0
mova [%1], m1
ACCUM por, 5, 1, %4
ACCUM por, %5, 1, %4
%endmacro
%macro QUANT_TWO_AC 4
%macro QUANT_TWO_AC 5
%if cpuflag(sse4)
mova m0, [%1 ]
mova m1, [%1+mmsize]
......@@ -210,11 +210,11 @@ cextern pd_1024
PSIGND m3, m1
mova [%1 ], m2
mova [%1+mmsize], m3
ACCUM por, 5, 2, %4
por m5, m3
ACCUM por, %5, 2, %4
ACCUM por, %5, 3, %4+mmsize
%else ; !sse4
QUANT_ONE_AC_MMX %1, %2, %3, %4
QUANT_ONE_AC_MMX %1+mmsize, %2+mmsize, %3+mmsize, %4+mmsize
QUANT_ONE_AC_MMX %1, %2, %3, %4, %5
QUANT_ONE_AC_MMX %1+mmsize, %2+mmsize, %3+mmsize, %4+mmsize, %5
%endif ; cpuflag
%endmacro
......@@ -244,30 +244,58 @@ cglobal quant_%1x%2_dc, 3,3,8
cglobal quant_%1x%2, 3,3,8
%assign x 0
%rep %1*%2/(mmsize/2)
QUANT_TWO_AC r0+x, r1+x, r2+x, x
QUANT_TWO_AC r0+x, r1+x, r2+x, x, 5
%assign x x+mmsize*2
%endrep
QUANT_END
RET
%endmacro
%macro QUANT_4x4 2
QUANT_TWO_AC r0+%1+mmsize*0, r1+mmsize*0, r2+mmsize*0, mmsize*0, %2
QUANT_TWO_AC r0+%1+mmsize*2, r1+mmsize*2, r2+mmsize*2, mmsize*2, %2
%endmacro
%macro QUANT_4x4x4 0
cglobal quant_4x4x4, 3,3,8
QUANT_4x4 0, 5
QUANT_4x4 64, 6
add r0, 128
packssdw m5, m6
QUANT_4x4 0, 6
QUANT_4x4 64, 7
packssdw m6, m7
packssdw m5, m6
packssdw m5, m5 ; AA BB CC DD
packsswb m5, m5 ; A B C D
pxor m4, m4
pcmpeqb m5, m4
pmovmskb eax, m5
not eax
and eax, 0xf
RET
%endmacro
INIT_XMM sse2
QUANT_DC 2, 2
QUANT_DC 4, 4
QUANT_AC 4, 4
QUANT_AC 8, 8
QUANT_4x4x4
INIT_XMM ssse3
QUANT_DC 2, 2
QUANT_DC 4, 4
QUANT_AC 4, 4
QUANT_AC 8, 8
QUANT_4x4x4
INIT_XMM sse4
QUANT_DC 2, 2
QUANT_DC 4, 4
QUANT_AC 4, 4
QUANT_AC 8, 8
QUANT_4x4x4
%endif ; HIGH_BIT_DEPTH
......@@ -285,7 +313,7 @@ QUANT_AC 8, 8
ACCUM por, 5, 0, %4
%endmacro
%macro QUANT_TWO 7
%macro QUANT_TWO 8
mova m1, %1
mova m3, %2
ABSW m0, m1, sign
......@@ -298,8 +326,8 @@ QUANT_AC 8, 8
PSIGNW m2, m3
mova %1, m0
mova %2, m2
ACCUM por, 5, 0, %7
por m5, m2
ACCUM por, %8, 0, %7
ACCUM por, %8, 2, %7+mmsize
%endmacro
;-----------------------------------------------------------------------------
......@@ -313,7 +341,7 @@ cglobal %1, 1,1,%3
%else
%assign x 0
%rep %2/2
QUANT_TWO [r0+x], [r0+x+mmsize], m6, m6, m7, m7, x
QUANT_TWO [r0+x], [r0+x+mmsize], m6, m6, m7, m7, x, 5
%assign x x+mmsize*2
%endrep
%endif
......@@ -328,13 +356,51 @@ cglobal %1, 1,1,%3
cglobal %1, 3,3
%assign x 0
%rep %2/2
QUANT_TWO [r0+x], [r0+x+mmsize], [r1+x], [r1+x+mmsize], [r2+x], [r2+x+mmsize], x
QUANT_TWO [r0+x], [r0+x+mmsize], [r1+x], [r1+x+mmsize], [r2+x], [r2+x+mmsize], x, 5
%assign x x+mmsize*2
%endrep
QUANT_END
RET
%endmacro
%macro QUANT_4x4 2
%if UNIX64
QUANT_TWO [r0+%1+mmsize*0], [r0+%1+mmsize*1], m8, m9, m10, m11, mmsize*0, %2
%else
QUANT_TWO [r0+%1+mmsize*0], [r0+%1+mmsize*1], [r1+mmsize*0], [r1+mmsize*1], [r2+mmsize*0], [r2+mmsize*1], mmsize*0, %2
%if mmsize==8
QUANT_TWO [r0+%1+mmsize*2], [r0+%1+mmsize*3], [r1+mmsize*2], [r1+mmsize*3], [r2+mmsize*2], [r2+mmsize*3], mmsize*2, %2
%endif
%endif
%endmacro
%macro QUANT_4x4x4 0
cglobal quant_4x4x4, 3,3,7
%if UNIX64
mova m8, [r1+mmsize*0]
mova m9, [r1+mmsize*1]
mova m10, [r2+mmsize*0]
mova m11, [r2+mmsize*1]
%endif
QUANT_4x4 0, 4
QUANT_4x4 32, 5
packssdw m4, m5
QUANT_4x4 64, 5
QUANT_4x4 96, 6
packssdw m5, m6
packssdw m4, m5
%if mmsize == 16
packssdw m4, m4 ; AA BB CC DD
%endif
packsswb m4, m4 ; A B C D
pxor m3, m3
pcmpeqb m4, m3
pmovmskb eax, m4
not eax
and eax, 0xf
RET
%endmacro
INIT_MMX mmx2
QUANT_DC quant_2x2_dc, 1
%if ARCH_X86_64 == 0 ; not needed because sse2 is faster
......@@ -342,17 +408,20 @@ QUANT_DC quant_4x4_dc, 4
INIT_MMX mmx
QUANT_AC quant_4x4, 4
QUANT_AC quant_8x8, 16
QUANT_4x4x4
%endif
INIT_XMM sse2
QUANT_DC quant_4x4_dc, 2, 8
QUANT_AC quant_4x4, 2
QUANT_AC quant_8x8, 8
QUANT_4x4x4
INIT_XMM ssse3
QUANT_DC quant_4x4_dc, 2, 8
QUANT_AC quant_4x4, 2
QUANT_AC quant_8x8, 8
QUANT_4x4x4
INIT_MMX ssse3
QUANT_DC quant_2x2_dc, 1
......
......@@ -31,18 +31,22 @@
int x264_quant_2x2_dc_mmx2( dctcoef dct[4], int mf, int bias );
int x264_quant_4x4_dc_mmx2( dctcoef dct[16], int mf, int bias );
int x264_quant_4x4_mmx( dctcoef dct[16], udctcoef mf[16], udctcoef bias[16] );
int x264_quant_4x4x4_mmx( dctcoef dct[4][16], udctcoef mf[16], udctcoef bias[16] );
int x264_quant_8x8_mmx( dctcoef dct[64], udctcoef mf[64], udctcoef bias[64] );
int x264_quant_2x2_dc_sse2( dctcoef dct[16], int mf, int bias );
int x264_quant_4x4_dc_sse2( dctcoef dct[16], int mf, int bias );
int x264_quant_4x4_sse2( dctcoef dct[16], udctcoef mf[16], udctcoef bias[16] );
int x264_quant_4x4x4_sse2( dctcoef dct[4][16], udctcoef mf[16], udctcoef bias[16] );
int x264_quant_8x8_sse2( dctcoef dct[64], udctcoef mf[64], udctcoef bias[64] );
int x264_quant_2x2_dc_ssse3( dctcoef dct[4], int mf, int bias );
int x264_quant_4x4_dc_ssse3( dctcoef dct[16], int mf, int bias );
int x264_quant_4x4_ssse3( dctcoef dct[16], udctcoef mf[16], udctcoef bias[16] );
int x264_quant_4x4x4_ssse3( dctcoef dct[4][16], udctcoef mf[16], udctcoef bias[16] );
int x264_quant_8x8_ssse3( dctcoef dct[64], udctcoef mf[64], udctcoef bias[64] );
int x264_quant_2x2_dc_sse4( dctcoef dct[16], int mf, int bias );
int x264_quant_4x4_dc_sse4( dctcoef dct[16], int mf, int bias );
int x264_quant_4x4_sse4( dctcoef dct[16], udctcoef mf[16], udctcoef bias[16] );
int x264_quant_4x4x4_sse4( dctcoef dct[4][16], udctcoef mf[16], udctcoef bias[16] );
int x264_quant_8x8_sse4( dctcoef dct[64], udctcoef mf[64], udctcoef bias[64] );
void x264_dequant_4x4_mmx( int16_t dct[16], int dequant_mf[6][16], int i_qp );
void x264_dequant_4x4dc_mmx2( int16_t dct[16], int dequant_mf[6][16], int i_qp );
......
......@@ -157,28 +157,51 @@ static void x264_mb_encode_i16x16( x264_t *h, int p, int i_qp )
return;
}
M32( &h->mb.cache.non_zero_count[x264_scan8[ 0+p*16]] ) = 0;
M32( &h->mb.cache.non_zero_count[x264_scan8[ 2+p*16]] ) = 0;
M32( &h->mb.cache.non_zero_count[x264_scan8[ 8+p*16]] ) = 0;
M32( &h->mb.cache.non_zero_count[x264_scan8[10+p*16]] ) = 0;
h->dctf.sub16x16_dct( dct4x4, p_src, p_dst );
for( int i = 0; i < 16; i++ )
if( h->mb.b_noise_reduction )
for( int idx = 0; idx < 16; idx++ )
h->quantf.denoise_dct( dct4x4[idx], h->nr_residual_sum[0], h->nr_offset[0], 16 );
for( int idx = 0; idx < 16; idx++ )
{
/* copy dc coeff */
if( h->mb.b_noise_reduction )
h->quantf.denoise_dct( dct4x4[i], h->nr_residual_sum[0], h->nr_offset[0], 16 );
dct_dc4x4[block_idx_xy_1d[i]] = dct4x4[i][0];
dct4x4[i][0] = 0;
dct_dc4x4[block_idx_xy_1d[idx]] = dct4x4[idx][0];
dct4x4[idx][0] = 0;
}
/* quant/scan/dequant */
if( h->mb.b_trellis )
nz = x264_quant_4x4_trellis( h, dct4x4[i], i_quant_cat, i_qp, ctx_cat_plane[DCT_LUMA_AC][p], 1, !!p, i );
else
nz = h->quantf.quant_4x4( dct4x4[i], h->quant4_mf[i_quant_cat][i_qp], h->quant4_bias[i_quant_cat][i_qp] );
h->mb.cache.non_zero_count[x264_scan8[16*p+i]] = nz;
if( nz )
if( h->mb.b_trellis )
{
for( int idx = 0; idx < 16; idx++ )
if( x264_quant_4x4_trellis( h, dct4x4[idx], i_quant_cat, i_qp, ctx_cat_plane[DCT_LUMA_AC][p], 1, !!p, idx ) )
{
block_cbp = 0xf;
h->zigzagf.scan_4x4( h->dct.luma4x4[16*p+idx], dct4x4[idx] );
h->quantf.dequant_4x4( dct4x4[idx], h->dequant4_mf[i_quant_cat], i_qp );
if( decimate_score < 6 ) decimate_score += h->quantf.decimate_score15( h->dct.luma4x4[16*p+idx] );
h->mb.cache.non_zero_count[x264_scan8[16*p+idx]] = 1;
}
}
else
{
for( int i8x8 = 0; i8x8 < 4; i8x8++ )
{
h->zigzagf.scan_4x4( h->dct.luma4x4[16*p+i], dct4x4[i] );
h->quantf.dequant_4x4( dct4x4[i], h->dequant4_mf[i_quant_cat], i_qp );
if( decimate_score < 6 ) decimate_score += h->quantf.decimate_score15( h->dct.luma4x4[16*p+i] );
block_cbp = 0xf;
nz = h->quantf.quant_4x4x4( &dct4x4[i8x8*4], h->quant4_mf[i_quant_cat][i_qp], h->quant4_bias[i_quant_cat][i_qp] );
if( nz )
{
block_cbp = 0xf;
FOREACH_BIT( idx, i8x8*4, nz )
{
h->zigzagf.scan_4x4( h->dct.luma4x4[16*p+idx], dct4x4[idx] );
h->quantf.dequant_4x4( dct4x4[idx], h->dequant4_mf[i_quant_cat], i_qp );
if( decimate_score < 6 ) decimate_score += h->quantf.decimate_score15( h->dct.luma4x4[16*p+idx] );
h->mb.cache.non_zero_count[x264_scan8[16*p+idx]] = 1;
}
}
}
}
......@@ -245,6 +268,18 @@ static ALWAYS_INLINE void x264_mb_encode_chroma_internal( x264_t *h, int b_inter
h->mb.i_cbp_chroma = 0;
h->nr_count[2] += h->mb.b_noise_reduction * 4;
M16( &h->mb.cache.non_zero_count[x264_scan8[16]] ) = 0;
M16( &h->mb.cache.non_zero_count[x264_scan8[18]] ) = 0;
M16( &h->mb.cache.non_zero_count[x264_scan8[32]] ) = 0;
M16( &h->mb.cache.non_zero_count[x264_scan8[34]] ) = 0;
if( chroma422 )
{
M16( &h->mb.cache.non_zero_count[x264_scan8[24]] ) = 0;
M16( &h->mb.cache.non_zero_count[x264_scan8[26]] ) = 0;
M16( &h->mb.cache.non_zero_count[x264_scan8[40]] ) = 0;
M16( &h->mb.cache.non_zero_count[x264_scan8[42]] ) = 0;
}
/* Early termination: check variance of chroma residual before encoding.
* Don't bother trying early termination at low QPs.
* Values are experimentally derived. */
......@@ -259,17 +294,6 @@ static ALWAYS_INLINE void x264_mb_encode_chroma_internal( x264_t *h, int b_inter
score += h->pixf.var2[chromapix]( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE, &ssd[1] );
if( score < thresh*4 )
{
M16( &h->mb.cache.non_zero_count[x264_scan8[16]] ) = 0;
M16( &h->mb.cache.non_zero_count[x264_scan8[18]] ) = 0;
M16( &h->mb.cache.non_zero_count[x264_scan8[32]] ) = 0;
M16( &h->mb.cache.non_zero_count[x264_scan8[34]] ) = 0;
if( chroma422 )
{
M16( &h->mb.cache.non_zero_count[x264_scan8[24]] ) = 0;
M16( &h->mb.cache.non_zero_count[x264_scan8[26]] ) = 0;
M16( &h->mb.cache.non_zero_count[x264_scan8[40]] ) = 0;
M16( &h->mb.cache.non_zero_count[x264_scan8[42]] ) = 0;
}
h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+0]] = 0;
h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+1]] = 0;
......@@ -326,7 +350,7 @@ static ALWAYS_INLINE void x264_mb_encode_chroma_internal( x264_t *h, int b_inter
{
pixel *p_src = h->mb.pic.p_fenc[1+ch];
pixel *p_dst = h->mb.pic.p_fdec[1+ch];
int i_decimate_score = 0;
int i_decimate_score = b_decimate ? 0 : 7;
int nz_ac = 0;
ALIGNED_ARRAY_16( dctcoef, dct4x4,[8],[16] );
......@@ -361,20 +385,40 @@ static ALWAYS_INLINE void x264_mb_encode_chroma_internal( x264_t *h, int b_inter
dct2x2dc( dct_dc, dct4x4 );
/* calculate dct coeffs */
for( int i = 0; i < (chroma422?8:4); i++ )
for( int i8x8 = 0; i8x8 < (chroma422?2:1); i8x8++ )
{
if( h->mb.b_trellis )
nz = x264_quant_4x4_trellis( h, dct4x4[i], CQM_4IC+b_inter, i_qp, DCT_CHROMA_AC, !b_inter, 1, 0 );
{
for( int i4x4 = 0; i4x4 < 4; i4x4++ )
{
if( x264_quant_4x4_trellis( h, dct4x4[i8x8*4+i4x4], CQM_4IC+b_inter, i_qp, DCT_CHROMA_AC, !b_inter, 1, 0 ) )
{
int idx = 16+ch*16+i8x8*8+i4x4;
h->zigzagf.scan_4x4( h->dct.luma4x4[idx], dct4x4[i8x8*4+i4x4] );
h->quantf.dequant_4x4( dct4x4[i8x8*4+i4x4], dequant_mf, i_qp );
if( i_decimate_score < 7 )
i_decimate_score += h->quantf.decimate_score15( h->dct.luma4x4[idx] );
h->mb.cache.non_zero_count[x264_scan8[idx]] = 1;
nz_ac = 1;
}
}
}
else
nz = h->quantf.quant_4x4( dct4x4[i], h->quant4_mf[CQM_4IC+b_inter][i_qp], h->quant4_bias[CQM_4IC+b_inter][i_qp] );
h->mb.cache.non_zero_count[x264_scan8[16+i+(chroma422?i&4:0)+ch*16]] = nz;
if( nz )
{
nz_ac = 1;
h->zigzagf.scan_4x4( h->dct.luma4x4[16+i+(chroma422?i&4:0)+ch*16], dct4x4[i] );
h->quantf.dequant_4x4( dct4x4[i], dequant_mf, i_qp );
if( b_decimate )
i_decimate_score += h->quantf.decimate_score15( h->dct.luma4x4[16+i+(chroma422?i&4:0)+ch*16] );
nz = h->quantf.quant_4x4x4( &dct4x4[i8x8*4], h->quant4_mf[CQM_4IC+b_inter][i_qp],
h->quant4_bias[CQM_4IC+b_inter][i_qp] );
nz_ac |= nz;
FOREACH_BIT( i4x4, 0, nz )
{
int idx = 16+ch*16+i8x8*8+i4x4;
h->zigzagf.scan_4x4( h->dct.luma4x4[idx], dct4x4[i8x8*4+i4x4] );
h->quantf.dequant_4x4( dct4x4[i8x8*4+i4x4], dequant_mf, i_qp );
if( i_decimate_score < 7 )
i_decimate_score += h->quantf.decimate_score15( h->dct.luma4x4[idx] );
h->mb.cache.non_zero_count[x264_scan8[idx]] = 1;
}
}
}
......@@ -390,7 +434,7 @@ static ALWAYS_INLINE void x264_mb_encode_chroma_internal( x264_t *h, int b_inter
h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+ch]] = nz_dc;
if( (b_decimate && i_decimate_score < 7) || !nz_ac )
if( i_decimate_score < 7 || !nz_ac )
{
/* Decimate the block */
M16( &h->mb.cache.non_zero_count[x264_scan8[16+16*ch]] ) = 0;
......@@ -646,11 +690,8 @@ static ALWAYS_INLINE void x264_macroblock_encode_internal( x264_t *h, int plane_
{
h->mb.b_transform_8x8 = 0;
for( int p = 0; p < plane_count; p++ )
{
for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp )
x264_mb_encode_i16x16( h, p, i_qp );
i_qp = h->mb.i_chroma_qp;
}
}
else if( h->mb.i_type == I_8x8 )
{
......@@ -668,14 +709,13 @@ static ALWAYS_INLINE void x264_macroblock_encode_internal( x264_t *h, int plane_
if( h->mb.i_skip_intra == 2 )
h->mc.memcpy_aligned( h->dct.luma8x8, h->mb.pic.i8x8_dct_buf, sizeof(h->mb.pic.i8x8_dct_buf) );
}
for( int p = 0; p < plane_count; p++ )
for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp )
{
for( int i = (p == 0 && h->mb.i_skip_intra) ? 3 : 0 ; i < 4; i++ )
{
int i_mode = h->mb.cache.intra4x4_pred_mode[x264_scan8[4*i]];
x264_mb_encode_i8x8( h, p, i, i_qp, i_mode, NULL, 1 );
}
i_qp = h->mb.i_chroma_qp;
}
}
else if( h->mb.i_type == I_4x4 )
......@@ -694,7 +734,7 @@ static ALWAYS_INLINE void x264_macroblock_encode_internal( x264_t *h, int plane_
if( h->mb.i_skip_intra == 2 )
h->mc.memcpy_aligned( h->dct.luma4x4, h->mb.pic.i4x4_dct_buf, sizeof(h->mb.pic.i4x4_dct_buf) );
}
for( int p = 0; p < plane_count; p++ )
for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp )
{
for( int i = (p == 0 && h->mb.i_skip_intra) ? 15 : 0 ; i < 16; i++ )
{
......@@ -707,7 +747,6 @@ static ALWAYS_INLINE void x264_macroblock_encode_internal( x264_t *h, int plane_
x264_mb_encode_i4x4( h, p, i, i_qp, i_mode, 1 );
}
i_qp = h->mb.i_chroma_qp;
}
}
else /* Inter MB */
......@@ -747,8 +786,9 @@ static ALWAYS_INLINE void x264_macroblock_encode_internal( x264_t *h, int plane_
ALIGNED_ARRAY_16( dctcoef, dct8x8,[4],[64] );
b_decimate &= !h->mb.b_trellis || !h->param.b_cabac; // 8x8 trellis is inherently optimal decimation for CABAC
for( int p = 0; p < plane_count; p++ )
for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp )
{
CLEAR_16x16_NNZ( p );
h->dctf.sub16x16_dct8( dct8x8, h->mb.pic.p_fenc[p], h->mb.pic.p_fdec[p] );
h->nr_count[1+!!p*2] += h->mb.b_noise_reduction * 4;
......@@ -772,99 +812,92 @@ static ALWAYS_INLINE void x264_macroblock_encode_internal( x264_t *h, int plane_
}
}
if( i_decimate_mb < 6 && b_decimate )
{
plane_cbp = 0;
CLEAR_16x16_NNZ( p );
}
else
if( i_decimate_mb >= 6 || !b_decimate )
{
for( int idx = 0; idx < 4; idx++ )
h->mb.i_cbp_luma |= plane_cbp;
FOREACH_BIT( idx, 0, plane_cbp )
{
int x = idx&1;
int y = idx>>1;
if( plane_cbp&(1<<idx) )
{
h->quantf.dequant_8x8( dct8x8[idx], h->dequant8_mf[p?CQM_8PC:CQM_8PY], i_qp );
h->dctf.add8x8_idct8( &h->mb.pic.p_fdec[p][8*x + 8*y*FDEC_STRIDE], dct8x8[idx] );
STORE_8x8_NNZ( p, idx, 1 );
}
else
STORE_8x8_NNZ( p, idx, 0 );
h->quantf.dequant_8x8( dct8x8[idx], h->dequant8_mf[p?CQM_8PC:CQM_8PY], i_qp );
h->dctf.add8x8_idct8( &h->mb.pic.p_fdec[p][8*(idx&1) + 8*(idx>>1)*FDEC_STRIDE], dct8x8[idx] );
STORE_8x8_NNZ( p, idx, 1 );
}
}
h->mb.i_cbp_luma |= plane_cbp;
i_qp = h->mb.i_chroma_qp;
}
}
else
{
ALIGNED_ARRAY_16( dctcoef, dct4x4,[16],[16] );
for( int p = 0; p < plane_count; p++ )
for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp )
{