Commit fb660325, authored by Fiona Glaser and committed by Loren Merritt

denoise_dct asm

parent 223eedb0
......@@ -293,8 +293,8 @@ struct x264_t
uint16_t (*quant4_bias[4])[16]; /* [4][52][16] */
uint16_t (*quant8_bias[2])[64]; /* [2][52][64] */
uint32_t nr_residual_sum[2][64];
uint32_t nr_offset[2][64];
DECLARE_ALIGNED_16( uint32_t nr_residual_sum[2][64] );
DECLARE_ALIGNED_16( uint16_t nr_offset[2][64] );
uint32_t nr_count[2];
/* Slice header */
......
......@@ -193,6 +193,20 @@ void x264_mb_dequant_4x4_dc( int16_t dct[4][4], int dequant_mf[6][4][4], int i_q
}
}
/* Reference (C) noise-reduction kernel for one block of DCT coefficients.
 *
 *   dct    : coefficients, modified in place
 *   sum    : per-position running total of coefficient magnitudes
 *   offset : per-position amount to subtract from each magnitude
 *   size   : number of coefficients in the block
 *
 * Position 0 is never touched: the loop starts at index 1, so both dct[0]
 * and sum[0] are left as they were. */
void x264_denoise_dct_core( int16_t *dct, uint32_t *sum, uint16_t *offset, int size )
{
    int idx;
    for( idx = 1; idx < size; idx++ )
    {
        const int coef = dct[idx];
        const int is_negative = coef < 0;
        const int magnitude = is_negative ? -coef : coef;
        int reduced = magnitude - offset[idx];

        /* The statistic is gathered on the magnitude before the offset is applied. */
        sum[idx] += magnitude;

        /* Shrink toward zero; never let the coefficient cross zero. */
        if( reduced < 0 )
            reduced = 0;

        dct[idx] = is_negative ? -reduced : reduced;
    }
}
void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
{
pf->quant_8x8 = quant_8x8;
......@@ -203,6 +217,8 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
pf->dequant_4x4 = dequant_4x4;
pf->dequant_8x8 = dequant_8x8;
pf->denoise_dct_core = x264_denoise_dct_core;
#ifdef HAVE_MMX
if( cpu&X264_CPU_MMX )
{
......@@ -216,6 +232,7 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
pf->dequant_4x4 = x264_dequant_4x4_flat16_mmx;
pf->dequant_8x8 = x264_dequant_8x8_flat16_mmx;
}
pf->denoise_dct_core = x264_denoise_dct_core_mmx;
#endif
}
......@@ -239,6 +256,7 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
pf->dequant_4x4 = x264_dequant_4x4_flat16_sse2;
pf->dequant_8x8 = x264_dequant_8x8_flat16_sse2;
}
pf->denoise_dct_core = x264_denoise_dct_core_sse2;
}
if( cpu&X264_CPU_SSSE3 )
......@@ -247,6 +265,7 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
pf->quant_4x4_dc = x264_quant_4x4_dc_ssse3;
pf->quant_4x4 = x264_quant_4x4_ssse3;
pf->quant_8x8 = x264_quant_8x8_ssse3;
pf->denoise_dct_core = x264_denoise_dct_core_ssse3;
}
#endif // HAVE_MMX
......
......@@ -32,6 +32,8 @@ typedef struct
void (*dequant_4x4)( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp );
void (*dequant_8x8)( int16_t dct[8][8], int dequant_mf[6][8][8], int i_qp );
void (*denoise_dct_core)( int16_t *dct, uint32_t *sum, uint16_t *offset, int size );
} x264_quant_function_t;
void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf );
......
......@@ -328,3 +328,56 @@ INIT_XMM
DEQUANT sse2, 4, 4, 2
DEQUANT sse2, 8, 6, 2
;-----------------------------------------------------------------------------
; void x264_denoise_dct_core_mmx( int16_t *dct, uint32_t *sum, uint16_t *offset, int size )
;-----------------------------------------------------------------------------
; DENOISE_DCT <suffix>
;   Emits x264_denoise_dct_core_<suffix> with the prototype shown above.
;   Register use as seen in the addressing below: r0 = dct (16-bit elements),
;   r1 = sum (32-bit elements), r2 = offset (16-bit elements), r3 = size
;   (coefficient count, used as a descending index), r4 = saved dct[0].
;
;   For each coefficient: the absolute value is added to sum[], offset[] is
;   subtracted from the absolute value with unsigned saturation (so it clamps
;   at zero), and the original sign is re-applied before storing back to dct[].
;
;   Each iteration handles `regsize` coefficients (two registers of 16-bit
;   values), walking from the end of the block down to index 0.
;   Assumes size is a positive multiple of regsize and that all three arrays
;   satisfy whatever alignment `mova` requires -- TODO confirm against x86inc.
;
;   NOTE(review): the loop also covers index 0. dct[0] is saved/restored around
;   the loop, but sum[0] is not restored, so it does get the DC magnitude
;   added here (the C reference x264_denoise_dct_core starts at i=1 and leaves
;   sum[0] alone).
;   NOTE(review): r3 is used at full register width although `size` is an int;
;   presumably the upper bits are clean on x86_64 -- confirm, or sign-extend.
%macro DENOISE_DCT 1
cglobal x264_denoise_dct_core_%1, 4,5
movzx r4d, word [r0] ; backup DC coefficient
; m7 = 0, used below to zero-extend 16-bit magnitudes to 32 bits
pxor m7, m7
.loop:
; Step the index down by one iteration's worth of coefficients. The flags set
; here are the ones tested by `jg .loop` at the bottom (this relies on nothing
; in between modifying flags; assumes PABSW/PSIGNW expand to SIMD-only code).
sub r3, regsize
; m2/m3 = signed coefficients
mova m2, [r0+r3*2+0*regsize]
mova m3, [r0+r3*2+1*regsize]
; m0/m1 = |coefficients|
PABSW m0, m2
PABSW m1, m3
; keep an untouched copy of the magnitudes (m4/m5) for the sum[] update
mova m4, m0
mova m5, m1
; magnitude - offset, saturating at 0
psubusw m0, [r2+r3*2+0*regsize]
psubusw m1, [r2+r3*2+1*regsize]
; re-apply the sign of the original coefficients and store the result
PSIGNW m0, m2
PSIGNW m1, m3
mova [r0+r3*2+0*regsize], m0
mova [r0+r3*2+1*regsize], m1
; widen the saved magnitudes to 32 bits: m4/m2 = low/high halves of the first
; register, m5/m3 = low/high halves of the second
mova m2, m4
mova m3, m5
punpcklwd m4, m7
punpckhwd m2, m7
punpcklwd m5, m7
punpckhwd m3, m7
; sum[] += magnitudes (32-bit lanes, four registers per iteration)
paddd m4, [r1+r3*4+0*regsize]
paddd m2, [r1+r3*4+1*regsize]
paddd m5, [r1+r3*4+2*regsize]
paddd m3, [r1+r3*4+3*regsize]
mova [r1+r3*4+0*regsize], m4
mova [r1+r3*4+1*regsize], m2
mova [r1+r3*4+2*regsize], m5
mova [r1+r3*4+3*regsize], m3
; continue while the index (after the sub above) is still > 0
jg .loop
mov [r0], r4w ; restore DC coefficient
RET
%endmacro
; Instantiate DENOISE_DCT for each instruction set.
; The mmx and sse2 variants are built with PABSW/PSIGNW bound to the *_MMX
; helper macros (presumably emulations for CPUs without SSSE3 -- defined
; elsewhere, confirm).
%define PABSW PABSW_MMX
%define PSIGNW PSIGNW_MMX
; The mmx variant is only assembled when ARCH_X86_64 is not defined.
%ifndef ARCH_X86_64
INIT_MMX
DENOISE_DCT mmx
%endif
INIT_XMM
DENOISE_DCT sse2
; Rebind the helpers to the *_SSSE3 macros before emitting the ssse3 variant
; (still under INIT_XMM).
%define PABSW PABSW_SSSE3
%define PSIGNW PSIGNW_SSSE3
DENOISE_DCT ssse3
......@@ -42,5 +42,8 @@ void x264_dequant_4x4_flat16_mmx( int16_t dct[4][4], int dequant_mf[6][4][4], in
void x264_dequant_8x8_flat16_mmx( int16_t dct[8][8], int dequant_mf[6][8][8], int i_qp );
void x264_dequant_4x4_flat16_sse2( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp );
void x264_dequant_8x8_flat16_sse2( int16_t dct[8][8], int dequant_mf[6][8][8], int i_qp );
void x264_denoise_dct_core_mmx( int16_t *dct, uint32_t *sum, uint16_t *offset, int size );
void x264_denoise_dct_core_sse2( int16_t *dct, uint32_t *sum, uint16_t *offset, int size );
void x264_denoise_dct_core_ssse3( int16_t *dct, uint32_t *sum, uint16_t *offset, int size );
#endif
......@@ -2593,7 +2593,7 @@ void x264_macroblock_analyse( x264_t *h )
x264_mb_analyse_transform( h );
h->mb.b_trellis = h->param.analyse.i_trellis;
h->mb.b_noise_reduction = h->param.analyse.i_noise_reduction;
h->mb.b_noise_reduction = !!h->param.analyse.i_noise_reduction;
if( h->mb.b_trellis == 1 || h->mb.b_noise_reduction )
h->mb.i_skip_intra = 0;
}
......
......@@ -443,11 +443,12 @@ void x264_macroblock_encode( x264_t *h )
DECLARE_ALIGNED_16( int16_t dct8x8[4][8][8] );
b_decimate &= !h->mb.b_trellis; // 8x8 trellis is inherently optimal decimation
h->dctf.sub16x16_dct8( dct8x8, h->mb.pic.p_fenc[0], h->mb.pic.p_fdec[0] );
h->nr_count[1] += h->mb.b_noise_reduction * 4;
for( idx = 0; idx < 4; idx++ )
{
if( h->mb.b_noise_reduction )
x264_denoise_dct( h, (int16_t*)dct8x8[idx] );
h->quantf.denoise_dct_core( *dct8x8[idx], h->nr_residual_sum[1], h->nr_offset[1], 64 );
if( h->mb.b_trellis )
x264_quant_8x8_trellis( h, dct8x8[idx], CQM_8PY, i_qp, 0 );
else
......@@ -482,6 +483,7 @@ void x264_macroblock_encode( x264_t *h )
{
DECLARE_ALIGNED_16( int16_t dct4x4[16][4][4] );
h->dctf.sub16x16_dct( dct4x4, h->mb.pic.p_fenc[0], h->mb.pic.p_fdec[0] );
h->nr_count[0] += h->mb.b_noise_reduction * 16;
for( i8x8 = 0; i8x8 < 4; i8x8++ )
{
......@@ -494,7 +496,7 @@ void x264_macroblock_encode( x264_t *h )
idx = i8x8 * 4 + i4x4;
if( h->mb.b_noise_reduction )
x264_denoise_dct( h, (int16_t*)dct4x4[idx] );
h->quantf.denoise_dct_core( *dct4x4[idx], h->nr_residual_sum[0], h->nr_offset[0], 16 );
if( h->mb.b_trellis )
x264_quant_4x4_trellis( h, dct4x4[idx], CQM_4PY, i_qp, DCT_LUMA_4x4, 0 );
else
......@@ -738,37 +740,6 @@ void x264_noise_reduction_update( x264_t *h )
}
}
/* Noise-reduce one block of DCT coefficients in place, using the statistics
 * kept in h.
 *
 * h->mb.b_transform_8x8 selects the statistics set: set 1 with 64 coefficients
 * when it is non-zero, set 0 with 16 coefficients otherwise. The block counter
 * for that set is bumped once per call. Coefficient 0 is never modified, and
 * coefficients that are already zero are skipped entirely. */
void x264_denoise_dct( x264_t *h, int16_t *dct )
{
    const int category = h->mb.b_transform_8x8;
    const int last = category ? 63 : 15;
    int idx;

    h->nr_count[category]++;

    for( idx = last; idx >= 1; idx-- )
    {
        int coef = dct[idx];

        if( !coef )
            continue;

        if( coef > 0 )
        {
            /* Positive: accumulate, pull toward zero, clamp at zero. */
            h->nr_residual_sum[category][idx] += coef;
            coef -= h->nr_offset[category][idx];
            coef = coef < 0 ? 0 : coef;
        }
        else
        {
            /* Negative: accumulate the magnitude, push toward zero, clamp at zero. */
            h->nr_residual_sum[category][idx] -= coef;
            coef += h->nr_offset[category][idx];
            coef = coef > 0 ? 0 : coef;
        }

        dct[idx] = coef;
    }
}
/*****************************************************************************
* RD only; 4 calls to this do not make up for one macroblock_encode.
* doesn't transform chroma dc.
......
......@@ -55,7 +55,6 @@ void x264_quant_8x8_trellis( x264_t *h, int16_t dct[8][8], int i_quant_cat,
int i_qp, int b_intra );
void x264_noise_reduction_update( x264_t *h );
void x264_denoise_dct( x264_t *h, int16_t *dct );
#endif
......@@ -1023,6 +1023,27 @@ static int check_quant( int cpu_ref, int cpu_new )
ok = oks[1]; used_asm = used_asms[1];
report( "dequant :" );
if( qf_a.denoise_dct_core != qf_ref.denoise_dct_core )
{
int size;
for( size = 16; size <= 64; size += 48 )
{
set_func_name( "denoise_dct" );
used_asm = 1;
memcpy(dct1, buf1, size*2);
memcpy(dct2, buf1, size*2);
memcpy(buf3+256, buf3, 256);
call_c1( qf_c.denoise_dct_core, dct1, (uint32_t*)buf3, (uint16_t*)buf2, size );
call_a1( qf_a.denoise_dct_core, dct2, (uint32_t*)(buf3+256), (uint16_t*)buf2, size );
if( memcmp( dct1, dct2, size*2 ) || memcmp( buf3+4, buf3+256+4, (size-1)*sizeof(uint32_t) ) )
ok = 0;
call_c2( qf_c.denoise_dct_core, dct1, (uint32_t*)buf3, (uint16_t*)buf2, size );
call_a2( qf_a.denoise_dct_core, dct2, (uint32_t*)(buf3+256), (uint16_t*)buf2, size );
}
}
report( "denoise dct :" );
return ret;
}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment