Commit 205a032c authored by Fiona Glaser

Early termination for chroma encoding

Faster chroma encoding by terminating early if heuristics indicate that the block will be DC-only.
This works because the vast majority of inter chroma blocks have no coefficients at all, and those that do are almost always DC-only.
Add two new helper DSP functions for this: sub8x8_dct_dc (MMXEXT/SSE2) and var2_8x8 (MMXEXT/SSE2/SSSE3).
Early termination is disabled at very low QPs, where it is not useful.
Performance increase is ~1-2% without trellis, up to 5-6% with trellis=2.
The increase is greater at lower bitrates.
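In outline, the heuristic works as sketched below. This is a minimal standalone illustration, not the committed code: var2_8x8 mirrors the C reference added in the diff, while chroma_block_is_dc_only is a hypothetical wrapper around the decision made inline in x264_mb_encode_8x8_chroma.

    #include <stdint.h>

    /* Returns 64x the variance of the 8x8 residual between the encoded and
     * reconstructed planes, and its SSD via *ssd; mirrors the C reference
     * pixel_var2_8x8 in the diff below. */
    static int var2_8x8( const uint8_t *fenc, int enc_stride,
                         const uint8_t *fdec, int dec_stride, int *ssd )
    {
        int sum = 0, sqr = 0;
        for( int y = 0; y < 8; y++, fenc += enc_stride, fdec += dec_stride )
            for( int x = 0; x < 8; x++ )
            {
                int d = fenc[x] - fdec[x];
                sum += d;
                sqr += d * d;
            }
        *ssd = sqr;
        return sqr - (sum * sum >> 6); /* SSD minus the DC term: AC energy */
    }

    /* Hypothetical wrapper showing the decision: if the combined U+V AC
     * energy is under a lambda-derived threshold, skip the full 4x4
     * transforms; each plane then gets at most a DC-only 2x2 pass, and only
     * if its SSD still exceeds the threshold. */
    static int chroma_block_is_dc_only( int score_u, int score_v, int lambda2 )
    {
        int thresh = (lambda2 + 32) >> 6;
        return score_u + score_v < thresh * 4;
    }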
parent 8a96d510
......@@ -170,6 +170,28 @@ static void sub16x16_dct( int16_t dct[16][4][4], uint8_t *pix1, uint8_t *pix2 )
sub8x8_dct( &dct[12], &pix1[8*FENC_STRIDE+8], &pix2[8*FDEC_STRIDE+8] );
}
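/* DC-only "DCT": the DC coefficient of the 4x4 transform is just the sum of the residual */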
static int sub4x4_dct_dc( uint8_t *pix1, uint8_t *pix2 )
{
int16_t d[4][4];
int sum = 0;
pixel_sub_wxh( (int16_t*)d, 4, pix1, FENC_STRIDE, pix2, FDEC_STRIDE );
sum += d[0][0] + d[0][1] + d[0][2] + d[0][3];
sum += d[1][0] + d[1][1] + d[1][2] + d[1][3];
sum += d[2][0] + d[2][1] + d[2][2] + d[2][3];
sum += d[3][0] + d[3][1] + d[3][2] + d[3][3];
return sum;
}
static void sub8x8_dct_dc( int16_t dct[2][2], uint8_t *pix1, uint8_t *pix2 )
{
dct[0][0] = sub4x4_dct_dc( &pix1[0], &pix2[0] );
dct[0][1] = sub4x4_dct_dc( &pix1[4], &pix2[4] );
dct[1][0] = sub4x4_dct_dc( &pix1[4*FENC_STRIDE+0], &pix2[4*FDEC_STRIDE+0] );
dct[1][1] = sub4x4_dct_dc( &pix1[4*FENC_STRIDE+4], &pix2[4*FDEC_STRIDE+4] );
}
static void add4x4_idct( uint8_t *p_dst, int16_t dct[4][4] )
{
......@@ -391,6 +413,7 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
dctf->add4x4_idct = add4x4_idct;
dctf->sub8x8_dct = sub8x8_dct;
dctf->sub8x8_dct_dc = sub8x8_dct_dc;
dctf->add8x8_idct = add8x8_idct;
dctf->add8x8_idct_dc = add8x8_idct_dc;
......@@ -416,6 +439,7 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
dctf->add16x16_idct_dc = x264_add16x16_idct_dc_mmx;
dctf->dct4x4dc = x264_dct4x4dc_mmx;
dctf->idct4x4dc = x264_idct4x4dc_mmx;
dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_mmxext;
#ifndef ARCH_X86_64
dctf->sub8x8_dct = x264_sub8x8_dct_mmx;
......@@ -434,6 +458,7 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
{
dctf->sub8x8_dct8 = x264_sub8x8_dct8_sse2;
dctf->sub16x16_dct8 = x264_sub16x16_dct8_sse2;
dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_sse2;
dctf->add8x8_idct8 = x264_add8x8_idct8_sse2;
dctf->add16x16_idct8= x264_add16x16_idct8_sse2;
......
......@@ -95,6 +95,7 @@ typedef struct
void (*add4x4_idct) ( uint8_t *p_dst, int16_t dct[4][4] );
void (*sub8x8_dct) ( int16_t dct[4][4][4], uint8_t *pix1, uint8_t *pix2 );
void (*sub8x8_dct_dc)( int16_t dct[2][2], uint8_t *pix1, uint8_t *pix2 );
void (*add8x8_idct) ( uint8_t *p_dst, int16_t dct[4][4][4] );
void (*add8x8_idct_dc) ( uint8_t *p_dst, int16_t dct[2][2] );
......
......@@ -160,6 +160,30 @@ static int name( uint8_t *pix, int i_stride ) \
PIXEL_VAR_C( x264_pixel_var_16x16, 16, 8 )
PIXEL_VAR_C( x264_pixel_var_8x8, 8, 6 )
/****************************************************************************
* pixel_var2_wxh
****************************************************************************/
static int pixel_var2_8x8( uint8_t *pix1, int i_stride1, uint8_t *pix2, int i_stride2, int *ssd )
{
uint32_t var = 0, sum = 0, sqr = 0;
int x, y;
for( y = 0; y < 8; y++ )
{
for( x = 0; x < 8; x++ )
{
int diff = pix1[x] - pix2[x];
sum += diff;
sqr += diff * diff;
}
pix1 += i_stride1;
pix2 += i_stride2;
}
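/* 64x the variance: subtract the squared-sum (DC) term from the SSD */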
sum = abs(sum);
var = sqr - (sum * sum >> 6);
*ssd = sqr;
return var;
}
#define HADAMARD4(d0,d1,d2,d3,s0,s1,s2,s3) {\
int t0 = s0 + s1;\
......@@ -611,6 +635,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
pixf->ssim_4x4x2_core = ssim_4x4x2_core;
pixf->ssim_end4 = ssim_end4;
pixf->var2_8x8 = pixel_var2_8x8;
#ifdef HAVE_MMX
if( cpu&X264_CPU_MMX )
......@@ -636,6 +661,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_mmxext;
pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_mmxext;
pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_mmxext;
pixf->var2_8x8 = x264_pixel_var2_8x8_mmxext;
if( cpu&X264_CPU_CACHELINE_32 )
{
......@@ -682,6 +708,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
#ifdef ARCH_X86_64
pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_sse2;
#endif
pixf->var2_8x8 = x264_pixel_var2_8x8_sse2;
}
if( (cpu&X264_CPU_SSE2) && !(cpu&X264_CPU_SSE2_IS_SLOW) )
......@@ -761,6 +788,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
#ifdef ARCH_X86_64
pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_ssse3;
#endif
pixf->var2_8x8 = x264_pixel_var2_8x8_ssse3;
if( cpu&X264_CPU_CACHELINE_64 )
{
INIT2( sad, _cache64_ssse3 );
......
......@@ -73,6 +73,7 @@ typedef struct
x264_pixel_cmp_x3_t fpelcmp_x3[7];
x264_pixel_cmp_x4_t fpelcmp_x4[7];
x264_pixel_cmp_t sad_aligned[7]; /* Aligned SAD for mbcmp */
int (*var2_8x8)( uint8_t *, int, uint8_t *, int, int * );
int (*var[4])( uint8_t *pix, int stride );
uint64_t (*hadamard_ac[4])( uint8_t *pix, int stride );
......
......@@ -36,6 +36,7 @@ pb_scan4frameb: db 0,1,8,9,2,3,4,5,10,11,12,13,6,7,14,15
pb_idctdc_unpack: db 0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3
pb_idctdc_unpack2: db 4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7
pb_1: times 16 db 1
pw_1: times 8 dw 1
SECTION .text
......@@ -427,6 +428,79 @@ cglobal x264_add16x16_idct_dc_ssse3, 2,2,8
IDCT_DC_STORE 0, xmm2, xmm3
ret
;-----------------------------------------------------------------------------
; void sub8x8_dct_dc( int16_t dct[2][2], uint8_t *pix1, uint8_t *pix2 )
;-----------------------------------------------------------------------------
%macro DCTDC_2ROW_MMX 3
movq %1, [r1+FENC_STRIDE*(0+%3)]
movq m1, [r1+FENC_STRIDE*(1+%3)]
movq m2, [r2+FDEC_STRIDE*(0+%3)]
movq m3, [r2+FDEC_STRIDE*(1+%3)]
movq %2, %1
punpckldq %1, m1
punpckhdq %2, m1
movq m1, m2
punpckldq m2, m3
punpckhdq m1, m3
psadbw %1, m7
psadbw %2, m7
psadbw m2, m7
psadbw m1, m7
psubw %1, m2
psubw %2, m1
%endmacro
INIT_MMX
cglobal x264_sub8x8_dct_dc_mmxext, 3,3
pxor m7, m7
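; process the top 8x4 half with a direct call (the ret in .loop returns
; here), then advance the pointers and fall through into .loop for the
; bottom half, whose ret exits the function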
call .loop
add r1, FENC_STRIDE*4
add r2, FDEC_STRIDE*4
add r0, 4
.loop:
DCTDC_2ROW_MMX m0, m4, 0
DCTDC_2ROW_MMX m5, m6, 2
paddw m0, m5
paddw m4, m6
punpcklwd m0, m4
movd [r0], m0
ret
INIT_XMM
%macro DCTDC_2ROW_SSE2 3
movq m0, [r1+FENC_STRIDE*(0+%1)]
movq m1, [r1+FENC_STRIDE*(1+%1)]
movq m2, [r2+FDEC_STRIDE*(0+%1)]
movq m3, [r2+FDEC_STRIDE*(1+%1)]
punpckldq m0, m1
punpckldq m2, m3
psadbw m0, m7
psadbw m2, m7
%if %2
paddw %3, m0
paddw m6, m2
%else
SWAP %3, m0
SWAP m6, m2
%endif
%endmacro
cglobal x264_sub8x8_dct_dc_sse2, 3,3,8
pxor m7, m7
DCTDC_2ROW_SSE2 0, 0, m4
DCTDC_2ROW_SSE2 2, 1, m4
add r1, FENC_STRIDE*4
add r2, FDEC_STRIDE*4
psubq m4, m6
DCTDC_2ROW_SSE2 0, 0, m5
DCTDC_2ROW_SSE2 2, 1, m5
psubq m5, m6
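; m4/m5 each hold two small signed sums, one per 64-bit lane; two
; saturating packssdw passes narrow them to the four int16 DC values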
packssdw m4, m5
packssdw m4, m4
movq [r0], m4
RET
;-----------------------------------------------------------------------------
; void x264_zigzag_scan_8x8_frame_ssse3( int16_t level[64], int16_t dct[8][8] )
;-----------------------------------------------------------------------------
......
......@@ -32,7 +32,8 @@ void x264_sub16x16_dct_sse2 ( int16_t dct[16][4][4], uint8_t *pix1, uint8_t *p
void x264_sub4x4_dct_ssse3 ( int16_t dct[ 4][4] , uint8_t *pix1, uint8_t *pix2 );
void x264_sub8x8_dct_ssse3 ( int16_t dct[ 4][4][4], uint8_t *pix1, uint8_t *pix2 );
void x264_sub16x16_dct_ssse3 ( int16_t dct[16][4][4], uint8_t *pix1, uint8_t *pix2 );
void x264_sub8x8_dct_dc_mmxext( int16_t dct[2][2], uint8_t *pix1, uint8_t *pix2 );
void x264_sub8x8_dct_dc_sse2 ( int16_t dct[2][2], uint8_t *pix1, uint8_t *pix2 );
void x264_add4x4_idct_mmx ( uint8_t *p_dst, int16_t dct[ 4][4] );
void x264_add8x8_idct_mmx ( uint8_t *p_dst, int16_t dct[ 4][4][4] );
......
......@@ -386,6 +386,119 @@ cglobal x264_pixel_var_8x8_sse2, 2,4,8
jg .loop
VAR_END 6
%macro VAR2_END 0
HADDW m5, m7
movd r1d, m5
imul r1d, r1d
HADDD m6, m1
shr r1d, 6
movd eax, m6
mov [r4], eax
sub eax, r1d ; sqr - (sum * sum >> shift)
RET
%endmacro
;-----------------------------------------------------------------------------
; int x264_pixel_var2_8x8_mmxext( uint8_t *, int, uint8_t *, int, int * )
;-----------------------------------------------------------------------------
%ifndef ARCH_X86_64
INIT_MMX
cglobal x264_pixel_var2_8x8_mmxext, 5,6
VAR_START 0
mov r5d, 8
.loop:
movq m0, [r0]
movq m1, m0
movq m4, m0
movq m2, [r2]
movq m3, m2
punpcklbw m0, m7
punpckhbw m1, m7
punpcklbw m2, m7
punpckhbw m3, m7
psubw m0, m2
psubw m1, m3
paddw m5, m0
paddw m5, m1
pmaddwd m0, m0
pmaddwd m1, m1
paddd m6, m0
paddd m6, m1
add r0, r1
add r2, r3
dec r5d
jg .loop
VAR2_END
%endif
INIT_XMM
cglobal x264_pixel_var2_8x8_sse2, 5,6,8
VAR_START 1
mov r5d, 4
.loop:
movq m1, [r0]
movhps m1, [r0+r1]
movq m3, [r2]
movhps m3, [r2+r3]
DEINTB 0, 1, 2, 3, 7
psubw m0, m2
psubw m1, m3
paddw m5, m0
paddw m5, m1
pmaddwd m0, m0
pmaddwd m1, m1
paddd m6, m0
paddd m6, m1
lea r0, [r0+r1*2]
lea r2, [r2+r3*2]
dec r5d
jg .loop
VAR2_END
cglobal x264_pixel_var2_8x8_ssse3, 5,6,8
pxor m5, m5 ; sum
pxor m6, m6 ; sum squared
mova m7, [hsub_mul GLOBAL]
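; hsub_mul is a {1,-1} byte pattern: pmaddubsw on interleaved
; fenc/fdec bytes computes the fenc-fdec differences directly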
mov r5d, 2
.loop:
movq m0, [r0]
movq m2, [r2]
movq m1, [r0+r1]
movq m3, [r2+r3]
lea r0, [r0+r1*2]
lea r2, [r2+r3*2]
punpcklbw m0, m2
punpcklbw m1, m3
movq m2, [r0]
movq m3, [r2]
punpcklbw m2, m3
movq m3, [r0+r1]
movq m4, [r2+r3]
punpcklbw m3, m4
pmaddubsw m0, m7
pmaddubsw m1, m7
pmaddubsw m2, m7
pmaddubsw m3, m7
paddw m5, m0
paddw m5, m1
paddw m5, m2
paddw m5, m3
pmaddwd m0, m0
pmaddwd m1, m1
pmaddwd m2, m2
pmaddwd m3, m3
paddd m6, m0
paddd m6, m1
paddd m6, m2
paddd m6, m3
lea r0, [r0+r1*2]
lea r2, [r2+r3*2]
dec r5d
jg .loop
VAR2_END
;=============================================================================
; SATD
......
......@@ -102,6 +102,9 @@ void x264_pixel_ssim_4x4x2_core_mmxext( const uint8_t *pix1, int stride1,
void x264_pixel_ssim_4x4x2_core_sse2( const uint8_t *pix1, int stride1,
const uint8_t *pix2, int stride2, int sums[2][4] );
float x264_pixel_ssim_end4_sse2( int sum0[5][4], int sum1[5][4], int width );
int x264_pixel_var2_8x8_mmxext( uint8_t *, int, uint8_t *, int, int * );
int x264_pixel_var2_8x8_sse2( uint8_t *, int, uint8_t *, int, int * );
int x264_pixel_var2_8x8_ssse3( uint8_t *, int, uint8_t *, int, int * );
#define DECL_ADS( size, suffix ) \
int x264_pixel_ads##size##_##suffix( int enc_dc[size], uint16_t *sums, int delta,\
......
......@@ -84,6 +84,18 @@ static inline void dct2x2dc( int16_t d[2][2], int16_t dct4x4[4][4][4] )
dct4x4[3][0][0] = 0;
}
static inline void dct2x2dc_dconly( int16_t d[2][2] )
{
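/* 2x2 Hadamard transform applied to the four chroma DC coefficients */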
int d0 = d[0][0] + d[0][1];
int d1 = d[1][0] + d[1][1];
int d2 = d[0][0] - d[0][1];
int d3 = d[1][0] - d[1][1];
d[0][0] = d0 + d1;
d[1][0] = d2 + d3;
d[0][1] = d0 - d1;
d[1][1] = d2 - d3;
}
static ALWAYS_INLINE int x264_quant_4x4( x264_t *h, int16_t dct[4][4], int i_qp, int i_ctxBlockCat, int b_intra, int idx )
{
int i_quant_cat = b_intra ? CQM_4IY : CQM_4PY;
......@@ -273,8 +285,55 @@ void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qp )
{
int i, ch, nz, nz_dc;
int b_decimate = b_inter && (h->sh.i_type == SLICE_TYPE_B || h->param.analyse.b_dct_decimate);
DECLARE_ALIGNED_16( int16_t dct2x2[2][2] );
h->mb.i_cbp_chroma = 0;
/* Early termination: check variance of chroma residual before encoding.
* Don't bother trying early termination at low QPs.
* Values are experimentally derived. */
if( b_decimate && i_qp >= (h->mb.b_trellis ? 12 : 18) )
{
int thresh = (x264_lambda2_tab[i_qp] + 32) >> 6;
int ssd[2];
int score = h->pixf.var2_8x8( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE, &ssd[0] );
score += h->pixf.var2_8x8( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE, &ssd[1] );
if( score < thresh*4 )
{
for( i = 0; i < 8; i++ )
h->mb.cache.non_zero_count[x264_scan8[16+i]] = 0;
h->mb.cache.non_zero_count[x264_scan8[25]] = 0;
h->mb.cache.non_zero_count[x264_scan8[26]] = 0;
for( ch = 0; ch < 2; ch++ )
{
if( ssd[ch] > thresh )
{
h->dctf.sub8x8_dct_dc( dct2x2, h->mb.pic.p_fenc[1+ch], h->mb.pic.p_fdec[1+ch] );
dct2x2dc_dconly( dct2x2 );
if( h->mb.b_trellis )
nz_dc = x264_quant_dc_trellis( h, (int16_t*)dct2x2, CQM_4IC+b_inter, i_qp, DCT_CHROMA_DC, !b_inter );
else
nz_dc = h->quantf.quant_2x2_dc( dct2x2, h->quant4_mf[CQM_4IC+b_inter][i_qp][0]>>1, h->quant4_bias[CQM_4IC+b_inter][i_qp][0]<<1 );
if( nz_dc )
{
h->mb.cache.non_zero_count[x264_scan8[25]+ch] = 1;
zigzag_scan_2x2_dc( h->dct.chroma_dc[ch], dct2x2 );
idct_dequant_2x2_dconly( dct2x2, h->dequant4_mf[CQM_4IC + b_inter], i_qp );
h->dctf.add8x8_idct_dc( h->mb.pic.p_fdec[1+ch], dct2x2 );
h->mb.i_cbp_chroma = 1;
}
}
}
return;
}
}
for( ch = 0; ch < 2; ch++ )
{
uint8_t *p_src = h->mb.pic.p_fenc[1+ch];
......@@ -282,7 +341,6 @@ void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qp )
int i_decimate_score = 0;
int nz_ac = 0;
DECLARE_ALIGNED_16( int16_t dct2x2[2][2] );
DECLARE_ALIGNED_16( int16_t dct4x4[4][4][4] );
if( h->mb.b_lossless )
......
......@@ -354,6 +354,23 @@ static int check_pixel( int cpu_ref, int cpu_new )
TEST_PIXEL_VAR( PIXEL_8x8 );
report( "pixel var :" );
ok = 1; used_asm = 0;
if( pixel_asm.var2_8x8 != pixel_ref.var2_8x8 )
{
int res_c, res_asm, ssd_c, ssd_asm;
set_func_name( "var2_8x8" );
used_asm = 1;
res_c = call_c( pixel_c.var2_8x8, buf1, 16, buf2, 16, &ssd_c );
res_asm = call_a( pixel_asm.var2_8x8, buf1, 16, buf2, 16, &ssd_asm );
if( res_c != res_asm || ssd_c != ssd_asm )
{
ok = 0;
fprintf( stderr, "var2_8x8: %d != %d or %d != %d [FAILED]\n", res_c, res_asm, ssd_c, ssd_asm );
}
}
report( "pixel var2 :" );
for( i=0, ok=1, used_asm=0; i<4; i++ )
if( pixel_asm.hadamard_ac[i] != pixel_ref.hadamard_ac[i] )
{
......@@ -480,6 +497,7 @@ static int check_dct( int cpu_ref, int cpu_new )
DECLARE_ALIGNED_16( int16_t dct2[16][4][4] );
DECLARE_ALIGNED_16( int16_t dct4[16][4][4] );
DECLARE_ALIGNED_16( int16_t dct8[4][8][8] );
DECLARE_ALIGNED_8( int16_t dctdc[2][2][2] );
x264_t h_buf;
x264_t *h = &h_buf;
......@@ -514,6 +532,7 @@ static int check_dct( int cpu_ref, int cpu_new )
ok = 1; used_asm = 0;
TEST_DCT( sub4x4_dct, dct1[0], dct2[0], 16*2 );
TEST_DCT( sub8x8_dct, dct1, dct2, 16*2*4 );
TEST_DCT( sub8x8_dct_dc, dctdc[0], dctdc[1], 4*2 );
TEST_DCT( sub16x16_dct, dct1, dct2, 16*2*16 );
report( "sub_dct4 :" );
......