Commit d43e46cf authored by Fiona Glaser

Faster probe_skip, 2x2 DC transform handling

Move the 2x2 DC DCT into the dct_dc asm function to avoid some store-to-load forwarding penalties and extra register loads.
Use dct_dc as part of the early termination in probe_skip.
x86 asm partially by Holger Lubitz.
ARM NEON asm by David Conrad.
......@@ -639,12 +639,20 @@ function x264_sub8x8_dct_dc_neon
vld1.64 {d30}, [r1,:64], r3
vadd.s16 q1, q12, q13
vld1.64 {d31}, [r2,:64], ip
vpadd.s16 d0, d0, d1
vadd.s16 q1, q1, q14
vsubl.u8 q15, d30, d31
vadd.s16 q1, q1, q14
vadd.s16 d4, d0, d1
vadd.s16 q1, q1, q15
vpadd.s16 d2, d2, d3
vsub.s16 d5, d0, d1
vadd.s16 d6, d2, d3
vsub.s16 d7, d2, d3
vadd.s16 q0, q2, q3
vsub.s16 q1, q2, q3
vpadd.s16 d0, d0, d2
vpadd.s16 d1, d1, d3
vpadd.s16 d0, d0, d1
vst1.64 {d0}, [r0,:64]
bx lr
......@@ -184,10 +184,21 @@ static int sub4x4_dct_dc( uint8_t *pix1, uint8_t *pix2 )
/* Fill dct[0..3] with one sub4x4_dct_dc() result per 4x4 quadrant of an 8x8
 * region, then combine those four values in place with a 2x2 butterfly.
 *
 * dct:  output, 4 coefficients.
 * pix1: first pixel block, indexed with FENC_STRIDE.
 * pix2: second pixel block, indexed with FDEC_STRIDE.
 * NOTE(review): sub4x4_dct_dc() is not shown here; presumably it returns the
 * DC (sum) of the 4x4 difference pix1 - pix2 -- confirm against its body. */
static void sub8x8_dct_dc( int16_t dct[4], uint8_t *pix1, uint8_t *pix2 )
int d0, d1, d2, d3;
/* Quadrants: column offsets 0 and 4, row offsets 0 and 4 strides. */
dct[0] = sub4x4_dct_dc( &pix1[0], &pix2[0] );
dct[1] = sub4x4_dct_dc( &pix1[4], &pix2[4] );
dct[2] = sub4x4_dct_dc( &pix1[4*FENC_STRIDE+0], &pix2[4*FDEC_STRIDE+0] );
dct[3] = sub4x4_dct_dc( &pix1[4*FENC_STRIDE+4], &pix2[4*FDEC_STRIDE+4] );
/* 2x2 DC transform */
/* First stage: sum and difference within each pair (0,1) and (2,3). */
d0 = dct[0] + dct[1];
d1 = dct[2] + dct[3];
d2 = dct[0] - dct[1];
d3 = dct[2] - dct[3];
/* Second stage: sum and difference across the pairs. Note the store order:
 * dct[1] receives the difference of sums, dct[2] the sum of differences. */
dct[0] = d0 + d1;
dct[2] = d2 + d3;
dct[1] = d0 - d1;
dct[3] = d2 - d3;
static void add4x4_idct( uint8_t *p_dst, int16_t dct[16] )
......@@ -509,28 +509,43 @@ cglobal x264_add16x16_idct_dc_ssse3, 2,2,8
movq m1, m2
punpckldq m2, m3
punpckhdq m1, m3
psadbw %1, m7
psadbw %2, m7
psadbw m2, m7
psadbw m1, m7
pxor m3, m3
psadbw %1, m3
psadbw %2, m3
psadbw m2, m3
psadbw m1, m3
psubw %1, m2
psubw %2, m1
;-----------------------------------------------------------------------------
; DCT2x2 %1, %2
; Combines four 16-bit values held in two MMX registers with a 2x2 butterfly.
; In:    %1 = register holding s1/s0, %2 = register holding s3/s2
;        (word positions as given by the lane annotations below).
; Out:   mm0 = d02-d13, s02-s13, d02+d13, s02+s13
; Clobb: mm0, mm1 (both are hard-coded scratch/result registers).
; %1 is only read by the first instruction, before anything is written.
; %2 is read again after mm0 and mm1 have been overwritten, so it must not
; be either of those registers.
; Lane annotations list the four words from most- to least-significant;
; "__" marks a lane whose contents are not used.
;-----------------------------------------------------------------------------
%macro DCT2x2 2 ; reg s1/s0 (!=m1), reg s3/s2
pshufw mm1, %1, 10100000b ; s1 s1 s0 s0
pshufw mm0, %2, 10110001b ; s3 __ s2 __
paddw mm1, %2 ; s1 s13 s0 s02
psubw mm1, mm0 ; d13 s13 d02 s02
pshufw mm0, mm1, 01000100b ; d02 s02 d02 s02
psrlq mm1, 32 ; __ __ d13 s13
paddw mm0, mm1 ; d02 s02 d02+d13 s02+s13
psllq mm1, 32 ; d13 s13
psubw mm0, mm1 ; d02-d13 s02-s13 d02+d13 s02+s13
cglobal x264_sub8x8_dct_dc_mmxext, 3,3
pxor m7, m7
call .loop
add r1, FENC_STRIDE*4
add r2, FDEC_STRIDE*4
add r0, 4
DCTDC_2ROW_MMX m0, m4, 0
DCTDC_2ROW_MMX m5, m6, 2
paddw m0, m5
paddw m4, m6
punpcklwd m0, m4
movd [r0], m0
punpckldq m0, m4
add r1, FENC_STRIDE*4
add r2, FDEC_STRIDE*4
DCTDC_2ROW_MMX m7, m4, 0
DCTDC_2ROW_MMX m5, m6, 2
paddw m7, m5
paddw m4, m6
punpckldq m7, m4
DCT2x2 m0, m7
movq [r0], m0
......@@ -558,13 +573,16 @@ cglobal x264_sub8x8_dct_dc_sse2, 3,3,8
DCTDC_2ROW_SSE2 2, 1, m4
add r1, FENC_STRIDE*4
add r2, FDEC_STRIDE*4
psubq m4, m6
psubd m4, m6
DCTDC_2ROW_SSE2 0, 0, m5
DCTDC_2ROW_SSE2 2, 1, m5
psubq m5, m6
psubd m5, m6
packssdw m4, m5
packssdw m4, m4
movq [r0], m4
movhlps m5, m4
movdq2q mm0, m4
movdq2q mm7, m5
DCT2x2 mm0, mm7
movq [r0], mm0
......@@ -365,7 +365,6 @@ void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qp )
if( ssd[ch] > thresh )
h->dctf.sub8x8_dct_dc( dct2x2, h->mb.pic.p_fenc[1+ch], h->mb.pic.p_fdec[1+ch] );
dct2x2dc_dconly( dct2x2 );
if( h->mb.b_trellis )
nz_dc = x264_quant_dc_trellis( h, dct2x2, CQM_4IC+b_inter, i_qp, DCT_CHROMA_DC, !b_inter, 1 );
......@@ -980,10 +979,10 @@ int x264_macroblock_probe_skip( x264_t *h, int b_bidir )
if( ssd < thresh )
h->dctf.sub8x8_dct( dct4x4, p_src, p_dst );
/* The vast majority of chroma checks will terminate during the DC check or the higher
* threshold check, so we can save time by doing a DC-only DCT. */
h->dctf.sub8x8_dct_dc( dct2x2, p_src, p_dst );
/* calculate dct DC */
dct2x2dc( dct2x2, dct4x4 );
if( h->quantf.quant_2x2_dc( dct2x2, h->quant4_mf[CQM_4PC][i_qp][0]>>1, h->quant4_bias[CQM_4PC][i_qp][0]<<1 ) )
return 0;
......@@ -991,9 +990,15 @@ int x264_macroblock_probe_skip( x264_t *h, int b_bidir )
if( ssd < thresh*4 )
h->dctf.sub8x8_dct( dct4x4, p_src, p_dst );
/* calculate dct coeffs */
for( i4x4 = 0, i_decimate_mb = 0; i4x4 < 4; i4x4++ )
/* We don't need to zero the DC coefficient before quantization because we already
* checked that all the DCs were zero above at twice the precision that quant4x4
* uses. This applies even though the DC here is being quantized before the 2x2
* transform. */
if( !h->quantf.quant_4x4( dct4x4[i4x4], h->quant4_mf[CQM_4PC][i_qp], h->quant4_bias[CQM_4PC][i_qp] ) )
h->zigzagf.scan_4x4( dctscan, dct4x4[i4x4] );
