Commit 83d805fe authored by Fiona Glaser

Much faster chroma encoding and other opts

~15% faster chroma encode by reorganizing CBP calculation and adding special-case idct_dc function, since most coded chroma blocks are DC-only.
Small optimization in cache_save (skip_bp)
Fix array_non_zero to not violate strict aliasing (should eliminate miscompilation issues in the future)
Add in automatic substitutions for some asm instructions that have an equivalent smaller representation.
parent 360946d0
......@@ -348,6 +348,27 @@ static void add16x16_idct8( uint8_t *dst, int16_t dct[4][8][8] )
add8x8_idct8( &dst[8*FDEC_STRIDE+8], dct[3] );
}
/* Add one rounded DC residual to every pixel of a 4x4 block.
 *   p_dst: top-left pixel of the block; successive rows are FDEC_STRIDE bytes apart.
 *   dc:    DC coefficient; it is scaled as (dc + 32) >> 6 before being added.
 * Every sum goes through x264_clip_uint8() before it is stored back. */
static void inline add4x4_idct_dc( uint8_t *p_dst, int16_t dc )
{
    const int offset = ( dc + 32 ) >> 6;
    int row, col;
    for( row = 0; row < 4; row++ )
    {
        uint8_t *line = p_dst + row * FDEC_STRIDE;
        for( col = 0; col < 4; col++ )
            line[col] = x264_clip_uint8( line[col] + offset );
    }
}
/* DC-only reconstruction of an 8x8 block: each entry dct[y][x] is applied by
 * add4x4_idct_dc() to the 4x4 quadrant at row 4*y, column 4*x of p_dst.
 * Quadrants are visited in row-major order. */
static void add8x8_idct_dc( uint8_t *p_dst, int16_t dct[2][2] )
{
    int by, bx;
    for( by = 0; by < 2; by++ )
        for( bx = 0; bx < 2; bx++ )
            add4x4_idct_dc( &p_dst[4*FDEC_STRIDE*by + 4*bx], dct[by][bx] );
}
/****************************************************************************
* x264_dct_init:
......@@ -359,6 +380,7 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
dctf->sub8x8_dct = sub8x8_dct;
dctf->add8x8_idct = add8x8_idct;
dctf->add8x8_idct_dc = add8x8_idct_dc;
dctf->sub16x16_dct = sub16x16_dct;
dctf->add16x16_idct = add16x16_idct;
......@@ -377,6 +399,7 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
{
dctf->sub4x4_dct = x264_sub4x4_dct_mmx;
dctf->add4x4_idct = x264_add4x4_idct_mmx;
dctf->add8x8_idct_dc = x264_add8x8_idct_dc_mmx;
dctf->dct4x4dc = x264_dct4x4dc_mmx;
dctf->idct4x4dc = x264_idct4x4dc_mmx;
......@@ -405,6 +428,9 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
dctf->add8x8_idct = x264_add8x8_idct_sse2;
dctf->add16x16_idct = x264_add16x16_idct_sse2;
}
if( cpu&X264_CPU_SSSE3 )
dctf->add8x8_idct_dc = x264_add8x8_idct_dc_ssse3;
#endif //HAVE_MMX
#ifdef ARCH_PPC
......
......@@ -96,6 +96,7 @@ typedef struct
void (*sub8x8_dct) ( int16_t dct[4][4][4], uint8_t *pix1, uint8_t *pix2 );
void (*add8x8_idct) ( uint8_t *p_dst, int16_t dct[4][4][4] );
void (*add8x8_idct_dc) ( uint8_t *p_dst, int16_t dct[2][2] );
void (*sub16x16_dct) ( int16_t dct[16][4][4], uint8_t *pix1, uint8_t *pix2 );
void (*add16x16_idct)( uint8_t *p_dst, int16_t dct[16][4][4] );
......
......@@ -1354,9 +1354,10 @@ void x264_macroblock_cache_save( x264_t *h )
h->mb.skipbp[i_mb_xy] = 0xf;
else if( i_mb_type == B_8x8 )
{
int skipbp = 0;
for( i = 0; i < 4; i++ )
skipbp |= ( h->mb.i_sub_partition[i] == D_DIRECT_8x8 ) << i;
int skipbp = ( h->mb.i_sub_partition[0] == D_DIRECT_8x8 ) << 0;
skipbp |= ( h->mb.i_sub_partition[1] == D_DIRECT_8x8 ) << 1;
skipbp |= ( h->mb.i_sub_partition[2] == D_DIRECT_8x8 ) << 2;
skipbp |= ( h->mb.i_sub_partition[3] == D_DIRECT_8x8 ) << 3;
h->mb.skipbp[i_mb_xy] = skipbp;
}
else
......
......@@ -404,19 +404,19 @@ static ALWAYS_INLINE void x264_macroblock_cache_intra8x8_pred( x264_t *h, int x,
#define array_non_zero_int array_non_zero_int_c
/* Report whether a buffer holds any nonzero data, testing it 64 bits at a time.
 *   v:       start of the buffer.
 *   i_count: buffer size. 8, 16 and 32 select one, two and four 64-bit words; any
 *            other value is divided by sizeof(uint64_t) to get the word count, so
 *            i_count acts as a byte count and a remainder is not examined.
 * Returns 1 if any examined word is nonzero, otherwise 0.
 * NOTE(review): this listing carries two declarations of x and paired return
 * statements (plain uint64_t access next to union .l access). It looks like the
 * before/after lines of one change shown together -- confirm which set is live. */
static ALWAYS_INLINE int array_non_zero_int_c( void *v, int i_count )
{
uint64_t *x = v;
union {uint16_t s[4]; uint64_t l;} *x = v;
/* Fixed sizes are tested with a single OR-reduction instead of a loop. */
if(i_count == 8)
return !!x[0];
return !!x[0].l;
else if(i_count == 16)
return !!(x[0]|x[1]);
return !!(x[0].l|x[1].l);
else if(i_count == 32)
return !!(x[0]|x[1]|x[2]|x[3]);
return !!(x[0].l|x[1].l|x[2].l|x[3].l);
else
{
/* Any other size: scan word by word and stop at the first nonzero one. */
int i;
i_count /= sizeof(uint64_t);
for( i = 0; i < i_count; i++ )
if( x[i] ) return 1;
if( x[i].l ) return 1;
return 0;
}
}
......
......@@ -32,6 +32,7 @@ pw_8000: times 8 dw 0x8000
pb_sub4frame: db 0,1,4,8,5,2,3,6,9,12,13,10,7,11,14,15
pb_scan4framea: db 12,13,6,7,14,15,0,1,8,9,2,3,4,5,10,11
pb_scan4frameb: db 0,1,8,9,2,3,4,5,10,11,12,13,6,7,14,15
pb_idctdc_unpack: db 0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3
SECTION .text
......@@ -244,6 +245,85 @@ cextern x264_add8x8_idct8_sse2
SUB_NxN_DCT x264_sub16x16_dct8_sse2, x264_sub8x8_dct8_sse2, 128, 8, 0, 0
ADD_NxN_IDCT x264_add16x16_idct8_sse2, x264_add8x8_idct8_sse2, 128, 8, 0, 0
;-----------------------------------------------------------------------------
; void add8x8_idct_dc( uint8_t *p_dst, int16_t *dct2x2 )
;-----------------------------------------------------------------------------
; ADD_DC %1, %2, %3
;   %1 = MMX register of 8 bytes that are added to each row
;   %2 = MMX register of 8 bytes that are then subtracted from each row
;   %3 = address of the first of four 8-byte rows, FDEC_STRIDE bytes apart
; Both steps use unsigned saturation (paddusb / psubusb), so results stay in 0..255.
; The MMX caller in this file passes the saturated positive DC in %1 and the
; saturated negated DC in %2, so at most one of the two is nonzero per byte.
; Clobbers mm4, mm5, mm6 and %1 (row 3 is accumulated directly in %1).
%macro ADD_DC 3
; Rows 0-2 go through scratch registers.
movq mm4, [%3+FDEC_STRIDE*0]
movq mm5, [%3+FDEC_STRIDE*1]
movq mm6, [%3+FDEC_STRIDE*2]
paddusb mm4, %1
paddusb mm5, %1
paddusb mm6, %1
; Row 3 is added into %1 itself, which must therefore be the last use of %1 as an addend.
paddusb %1, [%3+FDEC_STRIDE*3]
psubusb mm4, %2
psubusb mm5, %2
psubusb mm6, %2
psubusb %1, %2
; Write the four updated rows back in place.
movq [%3+FDEC_STRIDE*0], mm4
movq [%3+FDEC_STRIDE*1], mm5
movq [%3+FDEC_STRIDE*2], mm6
movq [%3+FDEC_STRIDE*3], %1
%endmacro
; MMX version of add8x8_idct_dc: r0 is the destination pointer and r1 points at the
; four 16-bit DC values (see the prototype comment: p_dst, dct2x2).
; Computes (dc + bias) >> 6 per value and adds it to the matching 4x4 quadrant.
cglobal x264_add8x8_idct_dc_mmx, 2,2
; mm0 = four 16-bit DC values, mm1 = 0
movq mm0, [r1]
pxor mm1, mm1
; Point r0 four rows down; the top half is then addressed as r0-FDEC_STRIDE*4.
add r0, FDEC_STRIDE*4
; Rounding bias, then arithmetic shift by 6 (pw_32 is presumably 32 in each word,
; matching the C version's (dc + 32) >> 6 -- its definition is not in this excerpt).
paddw mm0, [pw_32 GLOBAL]
psraw mm0, 6
; mm1 = 0 - mm0, i.e. the negated DC values
psubw mm1, mm0
; Pack to bytes with unsigned saturation: negative words become 0, so mm0 keeps only
; positive offsets and mm1 keeps only the magnitudes of negative ones.
packuswb mm0, mm0
packuswb mm1, mm1
; Duplicate each byte: d0 d0 d1 d1 d2 d2 d3 d3
punpcklbw mm0, mm0
punpcklbw mm1, mm1
; 0xFA selects words 2,2,3,3: mm2/mm3 = d2 x4, d3 x4 (bottom two quadrants)
pshufw mm2, mm0, 0xFA
pshufw mm3, mm1, 0xFA
; Duplicate the low four bytes again: mm0/mm1 = d0 x4, d1 x4 (top two quadrants)
punpcklbw mm0, mm0
punpcklbw mm1, mm1
; Top four rows, then bottom four rows.
ADD_DC mm0, mm1, r0-FDEC_STRIDE*4
ADD_DC mm2, mm3, r0
ret
; SSSE3 version of add8x8_idct_dc: r0 is the destination pointer and r1 points at the
; four 16-bit DC values (see the prototype comment: p_dst, dct2x2).
; Handles a top row and the bottom row four lines below it in one 128-bit register.
cglobal x264_add8x8_idct_dc_ssse3, 2,2
; Low 64 bits of xmm0 = four 16-bit DC values, xmm1 = 0
movq xmm0, [r1]
pxor xmm1, xmm1
; Point r0 four rows down so rows are addressed as offsets -4..+3.
add r0, FDEC_STRIDE*4
; Rounding bias and shift by 6, then xmm1 = negated DC values.
paddw xmm0, [pw_32 GLOBAL]
psraw xmm0, 6
psubw xmm1, xmm0
; Shuffle mask 0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3 (pb_idctdc_unpack)
movdqa xmm5, [pb_idctdc_unpack GLOBAL]
; Unsigned-saturating pack: xmm0 keeps positive offsets, xmm1 the magnitudes of negative ones.
packuswb xmm0, xmm0
packuswb xmm1, xmm1
; Replicate each of the four DC bytes four times: d0 x4, d1 x4, d2 x4, d3 x4.
pshufb xmm0, xmm5
pshufb xmm1, xmm5
; Low halves: rows -4..-1 (top quadrants, lined up with d0/d1).
; xmm5 is reused for pixel data here; the shuffle mask is no longer needed.
movq xmm2, [r0+FDEC_STRIDE*-4]
movq xmm3, [r0+FDEC_STRIDE*-3]
movq xmm4, [r0+FDEC_STRIDE*-2]
movq xmm5, [r0+FDEC_STRIDE*-1]
; High halves: rows 0..3 (bottom quadrants, lined up with d2/d3).
movhps xmm2, [r0+FDEC_STRIDE* 0]
movhps xmm3, [r0+FDEC_STRIDE* 1]
movhps xmm4, [r0+FDEC_STRIDE* 2]
movhps xmm5, [r0+FDEC_STRIDE* 3]
; Add the positive part, then subtract the negative part, both saturating to 0..255.
paddusb xmm2, xmm0
paddusb xmm3, xmm0
paddusb xmm4, xmm0
paddusb xmm5, xmm0
psubusb xmm2, xmm1
psubusb xmm3, xmm1
psubusb xmm4, xmm1
psubusb xmm5, xmm1
; Store the low halves back to the top rows and the high halves to the bottom rows.
movq [r0+FDEC_STRIDE*-4], xmm2
movq [r0+FDEC_STRIDE*-3], xmm3
movq [r0+FDEC_STRIDE*-2], xmm4
movq [r0+FDEC_STRIDE*-1], xmm5
movhps [r0+FDEC_STRIDE* 0], xmm2
movhps [r0+FDEC_STRIDE* 1], xmm3
movhps [r0+FDEC_STRIDE* 2], xmm4
movhps [r0+FDEC_STRIDE* 3], xmm5
ret
;-----------------------------------------------------------------------------
; void x264_zigzag_scan_8x8_frame_ssse3( int16_t level[64], int16_t dct[8][8] )
;-----------------------------------------------------------------------------
......
......@@ -32,9 +32,11 @@ void x264_sub16x16_dct_sse2 ( int16_t dct[16][4][4], uint8_t *pix1, uint8_t *p
void x264_add4x4_idct_mmx ( uint8_t *p_dst, int16_t dct[ 4][4] );
void x264_add8x8_idct_mmx ( uint8_t *p_dst, int16_t dct[ 4][4][4] );
void x264_add8x8_idct_dc_mmx ( uint8_t *p_dst, int16_t dct[2][2] );
void x264_add16x16_idct_mmx ( uint8_t *p_dst, int16_t dct[16][4][4] );
void x264_add8x8_idct_sse2 ( uint8_t *p_dst, int16_t dct[ 4][4][4] );
void x264_add16x16_idct_sse2 ( uint8_t *p_dst, int16_t dct[16][4][4] );
void x264_add8x8_idct_dc_ssse3( uint8_t *p_dst, int16_t dct[2][2] );
void x264_dct4x4dc_mmx ( int16_t d[4][4] );
void x264_idct4x4dc_mmx ( int16_t d[4][4] );
......
......@@ -497,3 +497,27 @@ INIT_MMX
%endif
%endmacro
;Substitutions that reduce instruction size but are functionally equivalent
; Wrapper around the add instruction: when the second operand is the number 128 it
; emits "sub %1, -128" instead; every other operand form is passed through unchanged.
; NOTE(review): the size saving presumably comes from -128 fitting a sign-extended
; 8-bit immediate while +128 does not -- confirm against the instruction encoding.
; NOTE(review): relies on NASM not re-expanding a %macro inside its own body, so the
; inner add/sub lines assemble as plain instructions.
%macro add 2
%ifnum %2
%if %2==128
sub %1, -128
%else
add %1, %2
%endif
%else
; Not a numeric operand (register, memory, ...): emit the instruction as written.
add %1, %2
%endif
%endmacro
; Wrapper around the sub instruction: when the second operand is the number 128 it
; emits "add %1, -128" instead; every other operand form is passed through unchanged.
; NOTE(review): relies on NASM not re-expanding a %macro inside its own body, so the
; inner add/sub lines assemble as plain instructions.
%macro sub 2
%ifnum %2
%if %2==128
add %1, -128
%else
sub %1, %2
%endif
%else
; Not a numeric operand (register, memory, ...): emit the instruction as written.
sub %1, %2
%endif
%endmacro
......@@ -37,25 +37,37 @@ static inline void zigzag_scan_2x2_dc( int16_t level[4], int16_t dct[2][2] )
}
#undef ZIG
/* Shared prologue for the 2x2 DC dequant helpers. Expects dct (int16_t [2][2]),
 * dequant_mf and i_qp to be in scope, and declares in the caller's scope:
 *   d0..d3 - sums and differences of the two entries in each row of dct
 *   dmf    - dequant_mf[i_qp%6][0][0]
 *   qbits  - i_qp/6 - 5
 * A positive qbits is folded into dmf as a left shift and then reset to 0, so
 * users can always finish with a right shift by -qbits. */
#define IDCT_DEQUANT_START \
int d0 = dct[0][0] + dct[0][1]; \
int d1 = dct[1][0] + dct[1][1]; \
int d2 = dct[0][0] - dct[0][1]; \
int d3 = dct[1][0] - dct[1][1]; \
int dmf = dequant_mf[i_qp%6][0][0]; \
int qbits = i_qp/6 - 5; \
if( qbits > 0 ) \
{ \
dmf <<= qbits; \
qbits = 0; \
}
/* Inverse 2x2 DC transform plus dequantization, scattering the four results into the
 * [0][0] slot of four 4x4 coefficient blocks.
 *   dct:        2x2 DC coefficients; only read here.
 *   dct4x4:     destination; only dct4x4[0..3][0][0] are written.
 *   dequant_mf: dequant table; only dequant_mf[i_qp%6][0][0] is used.
 *   i_qp:       quantizer; i_qp/6 - 5 decides between scaling dmf up and shifting down.
 * NOTE(review): this listing carries both the spelled-out setup lines and the
 * IDCT_DEQUANT_START invocation, which declares the same names. It looks like the
 * before/after lines of one change shown together -- confirm which set is live. */
static inline void idct_dequant_2x2_dc( int16_t dct[2][2], int16_t dct4x4[4][4][4], int dequant_mf[6][4][4], int i_qp )
{
int d0 = dct[0][0] + dct[0][1];
int d1 = dct[1][0] + dct[1][1];
int d2 = dct[0][0] - dct[0][1];
int d3 = dct[1][0] - dct[1][1];
int dmf = dequant_mf[i_qp%6][0][0];
int qbits = i_qp/6 - 5;
if( qbits > 0 )
{
dmf <<= qbits;
qbits = 0;
}
IDCT_DEQUANT_START
/* qbits is <= 0 at this point, so -qbits is a non-negative right-shift amount. */
dct4x4[0][0][0] = (d0 + d1) * dmf >> -qbits;
dct4x4[1][0][0] = (d0 - d1) * dmf >> -qbits;
dct4x4[2][0][0] = (d2 + d3) * dmf >> -qbits;
dct4x4[3][0][0] = (d2 - d3) * dmf >> -qbits;
}
/* In-place variant of the 2x2 DC inverse transform and dequantization: the four
 * results overwrite dct itself instead of going to separate 4x4 blocks.
 *   dct:        2x2 DC coefficients, replaced by their dequantized values.
 *   dequant_mf: dequant table; only dequant_mf[i_qp%6][0][0] is used.
 *   i_qp:       quantizer; i_qp/6 - 5 decides between scaling dmf up and shifting down.
 * IDCT_DEQUANT_START reads all four inputs into d0..d3 before anything is stored,
 * so overwriting dct is safe. */
static inline void idct_dequant_2x2_dconly( int16_t dct[2][2], int dequant_mf[6][4][4], int i_qp )
{
    IDCT_DEQUANT_START
    /* qbits is <= 0 after the prologue; flip it into a plain right-shift amount. */
    qbits = -qbits;
    dct[0][0] = ( ( d0 + d1 ) * dmf ) >> qbits;
    dct[0][1] = ( ( d0 - d1 ) * dmf ) >> qbits;
    dct[1][0] = ( ( d2 + d3 ) * dmf ) >> qbits;
    dct[1][1] = ( ( d2 - d3 ) * dmf ) >> qbits;
}
static inline void dct2x2dc( int16_t d[2][2], int16_t dct4x4[4][4][4] )
{
int d0 = dct4x4[0][0][0] + dct4x4[1][0][0];
......@@ -202,8 +214,9 @@ static void x264_mb_encode_i16x16( x264_t *h, int i_qp )
void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qp )
{
int i, ch;
int i, ch, nz;
int b_decimate = b_inter && (h->sh.i_type == SLICE_TYPE_B || h->param.analyse.b_dct_decimate);
h->mb.i_cbp_chroma = 0;
for( ch = 0; ch < 2; ch++ )
{
......@@ -223,7 +236,11 @@ void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qp )
h->zigzagf.sub_4x4( h->dct.luma4x4[16+i+ch*4], p_src+oe, p_dst+od );
h->dct.chroma_dc[ch][i] = h->dct.luma4x4[16+i+ch*4][0];
h->dct.luma4x4[16+i+ch*4][0] = 0;
nz = array_non_zero( h->dct.luma4x4[16+i+ch*4] );
h->mb.cache.non_zero_count[x264_scan8[16+i+ch*4]] = nz;
h->mb.i_cbp_chroma |= nz;
}
h->mb.cache.non_zero_count[x264_scan8[25]+ch] = array_non_zero( h->dct.chroma_dc[ch] );
continue;
}
......@@ -249,36 +266,40 @@ void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qp )
if( b_decimate && i_decimate_score < 7 )
{
/* Near null chroma 8x8 block so make it null (bits saving) */
memset( &h->dct.luma4x4[16+ch*4], 0, 4 * sizeof( *h->dct.luma4x4 ) );
if( !array_non_zero( dct2x2 ) )
/* Decimate the block */
h->mb.cache.non_zero_count[x264_scan8[16+0]+24*ch] = 0;
h->mb.cache.non_zero_count[x264_scan8[16+1]+24*ch] = 0;
h->mb.cache.non_zero_count[x264_scan8[16+2]+24*ch] = 0;
h->mb.cache.non_zero_count[x264_scan8[16+3]+24*ch] = 0;
if( !array_non_zero( dct2x2 ) ) /* Whole block is empty */
{
memset( h->dct.chroma_dc[ch], 0, sizeof( h->dct.chroma_dc[ch] ) );
h->mb.cache.non_zero_count[x264_scan8[25]+ch] = 0;
continue;
}
memset( dct4x4, 0, sizeof( dct4x4 ) );
/* DC-only */
h->mb.cache.non_zero_count[x264_scan8[25]+ch] = 1;
zigzag_scan_2x2_dc( h->dct.chroma_dc[ch], dct2x2 );
idct_dequant_2x2_dconly( dct2x2, h->dequant4_mf[CQM_4IC + b_inter], i_qp );
h->dctf.add8x8_idct_dc( p_dst, dct2x2 );
}
else
{
for( i = 0; i < 4; i++ )
h->quantf.dequant_4x4( dct4x4[i], h->dequant4_mf[CQM_4IC + b_inter], i_qp );
{
nz = array_non_zero( h->dct.luma4x4[16+ch*4+i] );
h->mb.cache.non_zero_count[x264_scan8[16+ch*4+i]] = nz;
h->mb.i_cbp_chroma |= nz;
if( nz )
h->quantf.dequant_4x4( dct4x4[i], h->dequant4_mf[CQM_4IC + b_inter], i_qp );
}
/* Don't optimize for the AC-only case--it's very rare */
h->mb.cache.non_zero_count[x264_scan8[25]+ch] = array_non_zero( dct2x2 );
zigzag_scan_2x2_dc( h->dct.chroma_dc[ch], dct2x2 );
idct_dequant_2x2_dc( dct2x2, dct4x4, h->dequant4_mf[CQM_4IC + b_inter], i_qp );
h->dctf.add8x8_idct( p_dst, dct4x4 );
}
zigzag_scan_2x2_dc( h->dct.chroma_dc[ch], dct2x2 );
idct_dequant_2x2_dc( dct2x2, dct4x4, h->dequant4_mf[CQM_4IC + b_inter], i_qp );
h->dctf.add8x8_idct( p_dst, dct4x4 );
}
/* coded block pattern */
h->mb.i_cbp_chroma = 0;
for( i = 0; i < 8; i++ )
{
int nz = array_non_zero( h->dct.luma4x4[16+i] );
h->mb.cache.non_zero_count[x264_scan8[16+i]] = nz;
h->mb.i_cbp_chroma |= nz;
}
h->mb.cache.non_zero_count[x264_scan8[25]] = array_non_zero( h->dct.chroma_dc[0] );
h->mb.cache.non_zero_count[x264_scan8[26]] = array_non_zero( h->dct.chroma_dc[1] );
if( h->mb.i_cbp_chroma )
h->mb.i_cbp_chroma = 2; /* dc+ac (we can't do only ac) */
else if( h->mb.cache.non_zero_count[x264_scan8[25]] |
......
......@@ -551,6 +551,7 @@ static int check_dct( int cpu_ref, int cpu_new )
ok = 1; used_asm = 0;
TEST_IDCT( add4x4_idct, dct4 );
TEST_IDCT( add8x8_idct, dct4 );
TEST_IDCT( add8x8_idct_dc, dct4 );
TEST_IDCT( add16x16_idct, dct4 );
report( "add_idct4 :" );
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment