Commit ecb04a3b authored by Fiona Glaser

dequant_4x4_dc assembly

About 3.5x faster DC dequant on Conroe
parent 6ce71ce7
......@@ -139,7 +139,7 @@ static void dequant_8x8( int16_t dct[8][8], int dequant_mf[6][8][8], int i_qp )
}
}
void x264_mb_dequant_4x4_dc( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp )
static void dequant_4x4_dc( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp )
{
const int i_qbits = i_qp/6 - 6;
int y;
......@@ -253,6 +253,7 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
pf->quant_2x2_dc = quant_2x2_dc;
pf->dequant_4x4 = dequant_4x4;
pf->dequant_4x4_dc = dequant_4x4_dc;
pf->dequant_8x8 = dequant_8x8;
pf->denoise_dct = x264_denoise_dct;
......@@ -267,6 +268,7 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
pf->quant_4x4 = x264_quant_4x4_mmx;
pf->quant_8x8 = x264_quant_8x8_mmx;
pf->dequant_4x4 = x264_dequant_4x4_mmx;
pf->dequant_4x4_dc = x264_dequant_4x4dc_mmxext;
pf->dequant_8x8 = x264_dequant_8x8_mmx;
if( h->param.i_cqm_preset == X264_CQM_FLAT )
{
......@@ -294,6 +296,7 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
pf->quant_4x4 = x264_quant_4x4_sse2;
pf->quant_8x8 = x264_quant_8x8_sse2;
pf->dequant_4x4 = x264_dequant_4x4_sse2;
pf->dequant_4x4_dc = x264_dequant_4x4dc_sse2;
pf->dequant_8x8 = x264_dequant_8x8_sse2;
if( h->param.i_cqm_preset == X264_CQM_FLAT )
{
......
......@@ -30,8 +30,9 @@ typedef struct
void (*quant_4x4_dc)( int16_t dct[4][4], int mf, int bias );
void (*quant_2x2_dc)( int16_t dct[2][2], int mf, int bias );
void (*dequant_4x4)( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp );
void (*dequant_8x8)( int16_t dct[8][8], int dequant_mf[6][8][8], int i_qp );
void (*dequant_4x4)( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp );
void (*dequant_4x4_dc)( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp );
void (*denoise_dct)( int16_t *dct, uint32_t *sum, uint16_t *offset, int size );
......@@ -42,6 +43,4 @@ typedef struct
void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf );
void x264_mb_dequant_4x4_dc( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qscale );
#endif
......@@ -255,26 +255,30 @@ QUANT_DC x264_quant_2x2_dc_ssse3, 1
%define t2d r1d
%endif
;-----------------------------------------------------------------------------
; void x264_dequant_4x4_mmx( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp )
;-----------------------------------------------------------------------------
%macro DEQUANT 4
cglobal x264_dequant_%2x%2_%1, 0,3
%macro DEQUANT_START 2
    ; Shared prologue for the dequant functions.
    ;   first argument : log2 of the byte size of one dequant_mf[i_mf] table
    ;                    (6 for int[4][4], 8 for int[8][8])
    ;   second argument: amount subtracted from i_qp/6 to obtain the shift count
    ; On fall-through: r0 = dct, r1 = &dequant_mf[i_mf], t0d = shift count (>= 0).
    ; If the shift count is negative, control transfers to .rshift32, which the
    ; function using this macro must define.
    movifnidn t2d, r2m
    imul t0d, t2d, 0x2b ; 43/256 ~= 1/6; exact for i_qp in 0..51
    shr  t0d, 8     ; i_qbits = i_qp / 6
    lea  t1, [t0*3]
    sub  t2d, t1d
    sub  t2d, t1d   ; i_mf = i_qp % 6
    shl  t2d, %1    ; byte offset of dequant_mf[i_mf]
%ifdef ARCH_X86_64
    add  r1, t2     ; dequant_mf[i_mf]
%else
    add  r1, r1m    ; dequant_mf[i_mf] (r1 aliases t2 here, so add the pointer to the offset)
    mov  r0, r0m    ; dct
%endif
    sub  t0d, %2
    jl   .rshift32  ; negative qbits => rightshift
%endmacro
;-----------------------------------------------------------------------------
; void x264_dequant_4x4_mmx( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp )
;-----------------------------------------------------------------------------
%macro DEQUANT 4
cglobal x264_dequant_%2x%2_%1, 0,3
DEQUANT_START %3+2, %3
.lshift:
movd m5, t0d
......@@ -339,7 +343,67 @@ INIT_XMM
DEQUANT sse2, 4, 4, 2
DEQUANT sse2, 8, 6, 2
;-----------------------------------------------------------------------------
; void x264_dequant_4x4dc_<cpu>( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp )
;
; In-place dequantization of a 4x4 block of 16-bit coefficients using a single
; scale factor, the first entry of dequant_mf[i_qp%6].
; Macro argument: cpu suffix appended to the symbol name. The vector width
; (mmsize) and the meaning of m0-m7 come from the INIT_MMX / INIT_XMM that
; precedes each instantiation.
;-----------------------------------------------------------------------------
%macro DEQUANT_DC 1
cglobal x264_dequant_4x4dc_%1, 0,3
; Sets r0 = dct, r1 = dequant_mf[i_mf], t0d = i_qp/6 - 6,
; and branches to .rshift32 when t0d is negative.
DEQUANT_START 6, 6
.lshift:
; Non-negative shift: fold the shift into the factor, then multiply.
; m6 = first dequant_mf entry << t0d
movd m6, [r1]
movd m5, t0d
pslld m6, m5
; Broadcast the low 16 bits of the shifted factor to every word lane of m6.
%if mmsize==16
pshuflw m6, m6, 0
punpcklqdq m6, m6
%else
pshufw m6, m6, 0
%endif
; Each iteration handles two vectors; 16/mmsize iterations cover all 32 bytes
; (16 coefficients). pmullw keeps the low 16 bits of each product.
%assign x 0
%rep 16/mmsize
mova m0, [r0+mmsize*0+x]
mova m1, [r0+mmsize*1+x]
pmullw m0, m6
pmullw m1, m6
mova [r0+mmsize*0+x], m0
mova [r0+mmsize*1+x], m1
%assign x x+mmsize*2
%endrep
RET
.rshift32:
; Negative shift: compute (coef * factor + round) >> n in 32-bit precision,
; where n = -t0d.
neg t0d
movd m5, t0d
; NOTE(review): pw_1 is defined outside this view; presumably a vector of
; 16-bit ones. Under that assumption m7 = ones and each word of m6 becomes
; 1 << (n-1), the rounding offset.
mova m6, [pw_1 GLOBAL]
mova m7, m6
pslld m6, m5
psrld m6, 1
; Build m4 = interleaved (factor, round) word pairs.
movd m4, [r1]
%if mmsize==8
punpcklwd m4, m4
%else
pshuflw m4, m4, 0
%endif
punpcklwd m4, m6
; One vector per iteration; 32/mmsize iterations cover all 32 bytes.
; Each coefficient is paired with a word from m7, so pmaddwd yields
; coef*factor + m7word*round as a 32-bit value; psrad shifts it right by n
; and packssdw narrows back to 16 bits with signed saturation.
%assign x 0
%rep 32/mmsize
mova m0, [r0+x]
mova m1, m0
punpcklwd m0, m7
punpckhwd m1, m7
pmaddwd m0, m4
pmaddwd m1, m4
psrad m0, m5
psrad m1, m5
packssdw m0, m1
mova [r0+x], m0
%assign x x+mmsize
%endrep
RET
%endmacro
INIT_MMX
DEQUANT_DC mmxext
INIT_XMM
DEQUANT_DC sse2
;-----------------------------------------------------------------------------
; void x264_denoise_dct_mmx( int16_t *dct, uint32_t *sum, uint16_t *offset, int size )
......
......@@ -36,8 +36,10 @@ void x264_quant_4x4_dc_ssse3( int16_t dct[4][4], int mf, int bias );
void x264_quant_4x4_ssse3( int16_t dct[4][4], uint16_t mf[16], uint16_t bias[16] );
void x264_quant_8x8_ssse3( int16_t dct[8][8], uint16_t mf[64], uint16_t bias[64] );
void x264_dequant_4x4_mmx( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp );
void x264_dequant_4x4dc_mmxext( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp );
void x264_dequant_8x8_mmx( int16_t dct[8][8], int dequant_mf[6][8][8], int i_qp );
void x264_dequant_4x4_sse2( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp );
void x264_dequant_4x4dc_sse2( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp );
void x264_dequant_8x8_sse2( int16_t dct[8][8], int dequant_mf[6][8][8], int i_qp );
void x264_dequant_4x4_flat16_mmx( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp );
void x264_dequant_8x8_flat16_mmx( int16_t dct[8][8], int dequant_mf[6][8][8], int i_qp );
......
......@@ -188,7 +188,7 @@ static void x264_mb_encode_i16x16( x264_t *h, int i_qp )
/* output samples to fdec */
h->dctf.idct4x4dc( dct_dc4x4 );
x264_mb_dequant_4x4_dc( dct_dc4x4, h->dequant4_mf[CQM_4IY], i_qp ); /* XXX not inversed */
h->quantf.dequant_4x4_dc( dct_dc4x4, h->dequant4_mf[CQM_4IY], i_qp ); /* XXX not inversed */
/* calculate dct coeffs */
for( i = 0; i < 16; i++ )
......
......@@ -1050,7 +1050,7 @@ static int check_quant( int cpu_ref, int cpu_new )
for( qp = 51; qp > 0; qp-- ) \
{ \
INIT_QUANT##w() \
call_c( qf_c.qname, (void*)dct1, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \
call_c1( qf_c.qname, (void*)dct1, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \
memcpy( dct2, dct1, w*w*2 ); \
call_c1( qf_c.dqname, (void*)dct1, h->dequant##w##_mf[block], qp ); \
call_a1( qf_a.dqname, (void*)dct2, h->dequant##w##_mf[block], qp ); \
......@@ -1070,6 +1070,31 @@ static int check_quant( int cpu_ref, int cpu_new )
TEST_DEQUANT( quant_4x4, dequant_4x4, CQM_4IY, 4 );
TEST_DEQUANT( quant_4x4, dequant_4x4, CQM_4PY, 4 );
/* TEST_DEQUANT_DC( qname, dqname, block, w )
 *
 * Compares the dequant function pointer `dqname` of qf_a against the one in
 * qf_c. The body runs only when qf_a.dqname differs from qf_ref.dqname.
 *
 *   qname  - member of qf_c used to quantize the random input first
 *   dqname - dequant member under test (also used, stringified, in messages)
 *   block  - index into h->quant<w>_mf / h->quant<w>_bias / h->dequant<w>_mf
 *   w      - block width; w*w*2 bytes are copied and compared
 *
 * For each qp from 51 down to 1:
 *   1. the first 16 entries of dct1 are filled with rand();
 *   2. dct1 is passed to qf_c.qname with the first mf and bias entries for
 *      this qp, each shifted right by one;
 *   3. dct1 is copied to dct2, then qf_c.dqname runs on dct1 and qf_a.dqname
 *      on dct2;
 *   4. if the two buffers differ, oks[1] is cleared and a [FAILED] line is
 *      written to stderr (the loop continues with the next qp);
 *   5. both functions are invoked once more through call_c2 / call_a2.
 *      NOTE(review): these helpers are defined outside this view - presumably
 *      the timing pass; confirm.
 *
 * Uses qp, i, dct1, dct2, h, i_cqm, qf_a, qf_c, qf_ref, oks and used_asms
 * from the enclosing scope. */
#define TEST_DEQUANT_DC( qname, dqname, block, w ) \
if( qf_a.dqname != qf_ref.dqname ) \
{ \
set_func_name( "%s_%s", #dqname, i_cqm?"cqm":"flat" ); \
used_asms[1] = 1; \
for( qp = 51; qp > 0; qp-- ) \
{ \
for( i = 0; i < 16; i++ ) \
dct1[i] = rand(); \
call_c1( qf_c.qname, (void*)dct1, h->quant##w##_mf[block][qp][0]>>1, h->quant##w##_bias[block][qp][0]>>1 ); \
memcpy( dct2, dct1, w*w*2 ); \
call_c1( qf_c.dqname, (void*)dct1, h->dequant##w##_mf[block], qp ); \
call_a1( qf_a.dqname, (void*)dct2, h->dequant##w##_mf[block], qp ); \
if( memcmp( dct1, dct2, w*w*2 ) ) \
{ \
oks[1] = 0; \
fprintf( stderr, #dqname "(qp=%d, cqm=%d, block="#block"): [FAILED]\n", qp, i_cqm ); \
} \
call_c2( qf_c.dqname, (void*)dct1, h->dequant##w##_mf[block], qp ); \
call_a2( qf_a.dqname, (void*)dct2, h->dequant##w##_mf[block], qp ); \
} \
}
TEST_DEQUANT_DC( quant_4x4_dc, dequant_4x4_dc, CQM_4IY, 4 );
x264_cqm_delete( h );
}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment