Commit 34e3a697, authored by Henrik Gramner and committed by Fiona Glaser

x86 SIMD versions of optimize_chroma_dc

SSE2/SSSE3/SSE4/AVX implementations.
About 3x faster.
parent 49a32b91
......@@ -141,6 +141,66 @@ static void dequant_4x4_dc( dctcoef dct[16], int dequant_mf[6][16], int i_qp )
}
}
/* Apply the 2x2 sum/difference butterfly to the four DC coefficients in dct[],
 * scale each result by dequant_mf and shift right by 5, storing into out[].
 * Every input is read before any output is written. */
static ALWAYS_INLINE void idct_dequant_2x2_dconly( dctcoef out[4], dctcoef dct[4], int dequant_mf )
{
    /* First butterfly stage: pairwise sums and differences. */
    const int sum01 = dct[0] + dct[1];
    const int sum23 = dct[2] + dct[3];
    const int dif01 = dct[0] - dct[1];
    const int dif23 = dct[2] - dct[3];

    /* Second butterfly stage, kept unscaled until the store below. */
    const int stage2[4] =
    {
        sum01 + sum23,
        sum01 - sum23,
        dif01 + dif23,
        dif01 - dif23,
    };

    for( int i = 0; i < 4; i++ )
        out[i] = stage2[i] * dequant_mf >> 5;
}
/* Transform/dequantize dct[] and compare it against ref[], whose entries already
 * include the +32 rounding term.
 * Returns nonzero when any reconstructed value (+32) differs from its reference
 * in bit 6 or above, i.e. when the two would not be equal after >>6. */
static ALWAYS_INLINE int idct_dequant_round_2x2_dc( dctcoef ref[4], dctcoef dct[4], int dequant_mf )
{
    dctcoef recon[4];
    int mismatch = 0;

    idct_dequant_2x2_dconly( recon, dct, dequant_mf );

    /* Collect every differing bit across the four outputs. */
    for( int i = 0; i < 4; i++ )
        mismatch |= ref[i] ^ (recon[i] + 32);

    /* Differences confined to the low six bits are discarded. */
    return mismatch >> 6;
}
/* Shrink the four chroma DC coefficients in dct[] towards zero as far as possible
 * without changing the transformed, dequantized and rounded (>>6) result.
 *
 * dequant_mf = h->dequant4_mf[CQM_4IC + b_inter][i_qp%6][0] << i_qp/6, max 32*64
 *
 * Returns 0 if the block already rounds to zero or every coefficient could be
 * reduced to zero; returns 1 if at least one reduction step had to be undone. */
static int optimize_chroma_dc( dctcoef dct[4], int dequant_mf )
{
    dctcoef target[4];
    int high_bits = 0;
    int nonzero = 0;

    /* Reference output of the untouched coefficients, rounding term included. */
    idct_dequant_2x2_dconly( target, dct, dequant_mf );
    for( int i = 0; i < 4; i++ )
    {
        target[i] += 32;
        high_bits |= target[i];
    }

    /* If the DC coefficients already round to zero, terminate early. */
    if( (high_bits >> 6) == 0 )
        return 0;

    /* Start with the highest frequency coefficient... is this the best option? */
    for( int idx = 3; idx >= 0; idx-- )
    {
        int cur = dct[idx];
        const int step = cur>>31 | 1; /* dct[idx] < 0 ? -1 : 1 */

        /* Move one step towards zero at a time until the output would change. */
        for( ; cur != 0; cur -= step )
        {
            dct[idx] = cur - step;
            if( idct_dequant_round_2x2_dc( target, dct, dequant_mf ) )
            {
                /* That step altered the rounded result: restore and stop. */
                dct[idx] = cur;
                nonzero = 1;
                break;
            }
        }
    }

    return nonzero;
}
static void x264_denoise_dct( dctcoef *dct, uint32_t *sum, udctcoef *offset, int size )
{
for( int i = 0; i < size; i++ )
......@@ -272,6 +332,8 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
pf->dequant_4x4_dc = dequant_4x4_dc;
pf->dequant_8x8 = dequant_8x8;
pf->optimize_chroma_dc = optimize_chroma_dc;
pf->denoise_dct = x264_denoise_dct;
pf->decimate_score15 = x264_decimate_score15;
pf->decimate_score16 = x264_decimate_score16;
......@@ -427,6 +489,7 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
pf->dequant_4x4 = x264_dequant_4x4_flat16_sse2;
pf->dequant_8x8 = x264_dequant_8x8_flat16_sse2;
}
pf->optimize_chroma_dc = x264_optimize_chroma_dc_sse2;
pf->denoise_dct = x264_denoise_dct_sse2;
pf->decimate_score15 = x264_decimate_score15_sse2;
pf->decimate_score16 = x264_decimate_score16_sse2;
......@@ -457,6 +520,7 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
pf->quant_4x4_dc = x264_quant_4x4_dc_ssse3;
pf->quant_4x4 = x264_quant_4x4_ssse3;
pf->quant_8x8 = x264_quant_8x8_ssse3;
pf->optimize_chroma_dc = x264_optimize_chroma_dc_ssse3;
pf->denoise_dct = x264_denoise_dct_ssse3;
pf->decimate_score15 = x264_decimate_score15_ssse3;
pf->decimate_score16 = x264_decimate_score16_ssse3;
......@@ -473,6 +537,7 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
pf->quant_4x4_dc = x264_quant_4x4_dc_sse4;
pf->quant_4x4 = x264_quant_4x4_sse4;
pf->quant_8x8 = x264_quant_8x8_sse4;
pf->optimize_chroma_dc = x264_optimize_chroma_dc_sse4;
}
if( cpu&X264_CPU_AVX )
......@@ -480,6 +545,7 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
pf->dequant_4x4 = x264_dequant_4x4_avx;
pf->dequant_8x8 = x264_dequant_8x8_avx;
pf->dequant_4x4_dc = x264_dequant_4x4dc_avx;
pf->optimize_chroma_dc = x264_optimize_chroma_dc_avx;
pf->denoise_dct = x264_denoise_dct_avx;
}
#endif // HAVE_MMX
......
......@@ -38,6 +38,8 @@ typedef struct
void (*dequant_4x4)( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
void (*dequant_4x4_dc)( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
int (*optimize_chroma_dc)( dctcoef dct[4], int dequant_mf );
void (*denoise_dct)( dctcoef *dct, uint32_t *sum, udctcoef *offset, int size );
int (*decimate_score15)( dctcoef *dct );
......
......@@ -51,6 +51,7 @@ const pw_pixel_max,times 8 dw ((1 << BIT_DEPTH)-1)
const pd_1, times 4 dd 1
const pd_32, times 4 dd 32
const pd_1024, times 4 dd 1024
const pd_ffff, times 4 dd 0xffff
const pw_00ff, times 8 dw 0x00ff
const pw_ff00, times 8 dw 0xff00
......
......@@ -7,6 +7,7 @@
;* Fiona Glaser <fiona@x264.com>
;* Christian Heine <sennindemokrit@gmx.net>
;* Oskar Arvidsson <oskar@irock.se>
;* Henrik Gramner <hengar-6@student.ltu.se>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
......@@ -69,12 +70,18 @@ decimate_mask_table4:
db 9,13,9,12,12,16,9,13,12,16,13,16,16,20,7,10,9,13,10,13,13,17,9,13,12,16
db 13,16,16,20,10,13,13,17,13,16,16,20,13,17,16,20,17,20,20,24
; Word masks loaded by OPTIMIZE_CHROMA_DC and applied with PSIGNW.
; The sse2 build loads the *_mmx tables (entries 0 / -1); every other build
; loads the tables with entries 1 / -1.
; NOTE(review): presumably 0 / -1 is the form the emulated PSIGNW_MMX expects,
; while 1 / -1 feeds a real psignw -- confirm against PSIGNW_MMX.
chroma_dc_dct_mask_mmx: dw 0, 0,-1,-1, 0, 0,-1,-1
chroma_dc_dmf_mask_mmx: dw 0, 0,-1,-1, 0,-1,-1, 0
chroma_dc_dct_mask: dw 1, 1,-1,-1, 1, 1,-1,-1
chroma_dc_dmf_mask: dw 1, 1,-1,-1, 1,-1,-1, 1
SECTION .text
cextern pb_1
cextern pw_1
cextern pd_1
cextern pb_01
cextern pd_1024
%macro QUANT_DC_START_MMX 0
movd m6, r1m ; mf
......@@ -117,12 +124,18 @@ cextern pb_01
psignw %1, %2
%endmacro
%macro PSIGND_MMX 2
%macro PSIGND_MMX 2-3
; Conditional dword negate built from pxor/psubd: (x ^ s) - s gives x where
; s == 0 and -x where s == -1, so every dword of the sign operand must be 0 or -1.
; 3 operands: %1 = %2 negated according to %3 (%2 is copied into %1 first).
; 2 operands: %1 is negated in place according to %2.
%if %0==3
mova %1, %2
pxor %1, %3
psubd %1, %3
%else
pxor %1, %2
psubd %1, %2
%endif
%endmacro
%macro PSIGND_SSSE3 2
%macro PSIGND_SSSE3 2+
; "2+" makes %2 greedy: any operands after the second are forwarded to psignd
; as part of %2, so this macro accepts the same 2- or 3-operand calls as
; PSIGND_MMX.
; NOTE(review): the 3-operand psignd form presumably relies on the project's
; instruction wrappers to emulate it on non-AVX targets -- confirm.
psignd %1, %2
%endmacro
......@@ -747,6 +760,126 @@ INIT_AVX
DEQUANT_DC avx , w
%endif
; t4 is eax for return value.
%ifdef ARCH_X86_64
DECLARE_REG_TMP 0,1,2,3,6,4 ; Identical for both Windows and *NIX
%else
DECLARE_REG_TMP 4,1,2,3,0,5
%endif
;-----------------------------------------------------------------------------
; x264_optimize_chroma_dc( dctcoef dct[4], int dequant_mf )
;-----------------------------------------------------------------------------
; SIMD counterpart of the C optimize_chroma_dc: each coefficient is stepped
; towards zero for as long as the rounded 2x2 output does not change.
; %1 is the instruction-set suffix appended to the function name.
; %2 selects how "rounds to the same value" is tested:
;    1 -> psrad 11 / pcmpeqd against zero / pmovmskb (needs the extra GPR t5)
;    0 -> ptest against a mask of bits 11 and above
; The result is built in t4: 0 initially, 1 once any step has to be undone.
; %2 == 1 for sse2 or ssse3, 0 for sse4/avx
%macro OPTIMIZE_CHROMA_DC 2
%assign %%regs 4+%2 ; the pmovmskb path uses t5 in addition to t0-t4
%ifndef ARCH_X86_64
%assign %%regs %%regs+1 ; t0-t4 are volatile on x86-64
%endif
cglobal optimize_chroma_dc_%1, 0,%%regs,7
movifnidn t0, r0mp ; t0 = dct
movd m2, r1m ; m2 = dequant_mf
movq m1, [t0] ; m1 = the four 16-bit coefficients
%if %2
pxor m4, m4 ; m4 = 0, compared against after the shift by 11
%else ; sse4, avx
pcmpeqb m4, m4
pslld m4, 11 ; m4 = mask of bits 11 and above in every dword
%endif
%ifidn %1, sse2
mova m3, [chroma_dc_dct_mask_mmx]
mova m5, [chroma_dc_dmf_mask_mmx]
%else
mova m3, [chroma_dc_dct_mask]
mova m5, [chroma_dc_dmf_mask]
%endif
pshuflw m2, m2, 0 ; broadcast dequant_mf to the low four words
pshufd m0, m1, 00010001b ; 1 0 3 2 1 0 3 2
punpcklqdq m2, m2 ; ... and to all eight words
punpcklqdq m1, m1 ; 3 2 1 0 3 2 1 0
mova m6, [pd_1024] ; 32<<5, elements are shifted 5 bits to the left
PSIGNW m0, m3 ; -1 -0 3 2 -1 -0 3 2
PSIGNW m2, m5 ; + - - + - - + +
paddw m0, m1 ; -1+3 -0+2 1+3 0+2 -1+3 -0+2 1+3 0+2
pmaddwd m0, m2 ; 0-1-2+3 0-1+2-3 0+1-2-3 0+1+2+3 * dmf
punpcklwd m1, m1 ; each coefficient word duplicated into its own dword
psrad m2, 16 ; + - - +
mov t1d, 3 ; coefficient index, starting at the last coefficient
paddd m0, m6 ; add the rounding term
xor t4d, t4d ; return value = 0
%ifidn %1, sse2
psrad m1, 31 ; has to be 0 or -1 in order for PSIGND_MMX to work correctly
%endif
; Early-out test: ZF is set when no output has a bit set at position 11 or above.
%if %2
mova m6, m0
SWAP 0, 6
psrad m6, 11
pcmpeqd m6, m4
pmovmskb t5d, m6
cmp t5d, 0xffff
%else ; sse4, avx
ptest m0, m4
%endif
jz .ret ; if the DC coefficients already round to zero, terminate early
mova m3, m0 ; m3 = working outputs, m0 stays as the reference
.outer_loop:
movsx t3d, word [t0+2*t1] ; dct[coeff]
pshufd m6, m1, 11111111b ; broadcast the high dword (sign source of the current coefficient)
pshufd m1, m1, 10010000b ; move the next element to high dword
PSIGND m5, m2, m6 ; m5 = m2 with the sign taken from m6 applied
test t3d, t3d
jz .loop_end ; coefficient is already zero, nothing to reduce
.outer_loop_0:
mov t2d, t3d ; t2 = level
sar t3d, 31
or t3d, 1 ; t3 = sign = level < 0 ? -1 : 1
.inner_loop:
psubd m3, m5 ; coeff -= sign
pxor m6, m0, m3 ; bits in which the new outputs differ from the reference
; ZF is set when the difference has no bit at position 11 or above.
%if %2
psrad m6, 11
pcmpeqd m6, m4
pmovmskb t5d, m6
cmp t5d, 0xffff
%else ; sse4, avx
ptest m6, m4
%endif
jz .round_coeff ; rounded result unchanged: keep the step
paddd m3, m5 ; coeff += sign
mov t4d, 1 ; return value = 1
.loop_end:
dec t1d
jz .last_coeff
pshufd m2, m2, 01111000b ; - + - + / - - + +
jg .outer_loop ; flags still come from dec t1d; falls through once t1 is negative
.ret:
REP_RET
.round_coeff:
sub t2d, t3d ; level -= sign
mov [t0+2*t1], t2w ; store the reduced coefficient
jnz .inner_loop ; flags still come from the sub: loop while level != 0
jmp .loop_end
.last_coeff:
movsx t3d, word [t0] ; dct[0]
punpcklqdq m2, m2 ; + + + +
PSIGND m5, m2, m1
test t3d, t3d
jnz .outer_loop_0
REP_RET
%endmacro
; Instantiate optimize_chroma_dc for each instruction set.
INIT_XMM
; sse2: sign operations use the emulated *_MMX macros, pmovmskb-based test (%2 = 1).
%define PSIGNW PSIGNW_MMX
%define PSIGND PSIGND_MMX
OPTIMIZE_CHROMA_DC sse2, 1
; ssse3 and later: sign operations use the *_SSSE3 macros. These defines stay
; in effect for the sse4 and avx instantiations below.
%define PSIGNW PSIGNW_SSSE3
%define PSIGND PSIGND_SSSE3
OPTIMIZE_CHROMA_DC ssse3, 1
; sse4/avx: ptest-based test (%2 = 0).
OPTIMIZE_CHROMA_DC sse4, 0
INIT_AVX
OPTIMIZE_CHROMA_DC avx, 0
%ifdef HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; void denoise_dct( int32_t *dct, uint32_t *sum, uint32_t *offset, int size )
......
......@@ -57,6 +57,10 @@ void x264_dequant_4x4_flat16_mmx( int16_t dct[16], int dequant_mf[6][16], int i_
void x264_dequant_8x8_flat16_mmx( int16_t dct[64], int dequant_mf[6][64], int i_qp );
void x264_dequant_4x4_flat16_sse2( int16_t dct[16], int dequant_mf[6][16], int i_qp );
void x264_dequant_8x8_flat16_sse2( int16_t dct[64], int dequant_mf[6][64], int i_qp );
int x264_optimize_chroma_dc_sse2( dctcoef dct[4], int dequant_mf );
int x264_optimize_chroma_dc_ssse3( dctcoef dct[4], int dequant_mf );
int x264_optimize_chroma_dc_sse4( dctcoef dct[4], int dequant_mf );
int x264_optimize_chroma_dc_avx( dctcoef dct[4], int dequant_mf );
void x264_denoise_dct_mmx ( dctcoef *dct, uint32_t *sum, udctcoef *offset, int size );
void x264_denoise_dct_sse2 ( dctcoef *dct, uint32_t *sum, udctcoef *offset, int size );
void x264_denoise_dct_ssse3( dctcoef *dct, uint32_t *sum, udctcoef *offset, int size );
......
......@@ -273,59 +273,19 @@ static void x264_mb_encode_i16x16( x264_t *h, int i_qp )
h->dctf.add16x16_idct_dc( p_dst, dct_dc4x4 );
}
static inline int idct_dequant_round_2x2_dc( dctcoef ref[4], dctcoef dct[4], int dequant_mf[6][16], int i_qp )
{
dctcoef out[4];
idct_dequant_2x2_dconly( out, dct, dequant_mf, i_qp );
return ((ref[0] ^ (out[0]+32))
| (ref[1] ^ (out[1]+32))
| (ref[2] ^ (out[2]+32))
| (ref[3] ^ (out[3]+32))) >> 6;
}
/* Round down coefficients losslessly in DC-only chroma blocks.
* Unlike luma blocks, this can't be done with a lookup table or
* other shortcut technique because of the interdependencies
* between the coefficients due to the chroma DC transform. */
static inline int x264_mb_optimize_chroma_dc( x264_t *h, int b_inter, int i_qp, dctcoef dct2x2[4] )
static ALWAYS_INLINE int x264_mb_optimize_chroma_dc( x264_t *h, dctcoef dct2x2[4], int dequant_mf[6][16], int i_qp )
{
dctcoef dct2x2_orig[4];
int coeff, nz;
int dmf = dequant_mf[i_qp%6][0] << i_qp/6;
/* If the QP is too high, there's no benefit to rounding optimization. */
if( h->dequant4_mf[CQM_4IC + b_inter][i_qp%6][0] << (i_qp/6) > 32*64 )
if( dmf > 32*64 )
return 1;
idct_dequant_2x2_dconly( dct2x2_orig, dct2x2, h->dequant4_mf[CQM_4IC + b_inter], i_qp );
dct2x2_orig[0] += 32;
dct2x2_orig[1] += 32;
dct2x2_orig[2] += 32;
dct2x2_orig[3] += 32;
/* If the DC coefficients already round to zero, terminate early. */
if( !((dct2x2_orig[0]|dct2x2_orig[1]|dct2x2_orig[2]|dct2x2_orig[3])>>6) )
return 0;
/* Start with the highest frequency coefficient... is this the best option? */
for( nz = 0, coeff = h->quantf.coeff_last[DCT_CHROMA_DC]( dct2x2 ); coeff >= 0; coeff-- )
{
int level = dct2x2[coeff];
int sign = level>>31 | 1; /* dct2x2[coeff] < 0 ? -1 : 1 */
while( level )
{
dct2x2[coeff] = level - sign;
if( idct_dequant_round_2x2_dc( dct2x2_orig, dct2x2, h->dequant4_mf[CQM_4IC + b_inter], i_qp ) )
{
nz = 1;
dct2x2[coeff] = level;
break;
}
level -= sign;
}
}
return nz;
return h->quantf.optimize_chroma_dc( dct2x2, dmf );
}
void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qp )
......@@ -370,7 +330,7 @@ void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qp )
if( nz_dc )
{
if( !x264_mb_optimize_chroma_dc( h, b_inter, i_qp, dct2x2 ) )
if( !x264_mb_optimize_chroma_dc( h, dct2x2, h->dequant4_mf[CQM_4IC + b_inter], i_qp ) )
continue;
h->mb.cache.non_zero_count[x264_scan8[25]+ch] = 1;
zigzag_scan_2x2_dc( h->dct.chroma_dc[ch], dct2x2 );
......@@ -446,7 +406,7 @@ void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qp )
h->mb.cache.non_zero_count[x264_scan8[16+3]+24*ch] = 0;
if( !nz_dc ) /* Whole block is empty */
continue;
if( !x264_mb_optimize_chroma_dc( h, b_inter, i_qp, dct2x2 ) )
if( !x264_mb_optimize_chroma_dc( h, dct2x2, h->dequant4_mf[CQM_4IC + b_inter], i_qp ) )
{
h->mb.cache.non_zero_count[x264_scan8[25]+ch] = 0;
continue;
......
......@@ -1382,7 +1382,7 @@ static int check_quant( int cpu_ref, int cpu_new )
ALIGNED_16( dctcoef dct2[64] );
ALIGNED_16( uint8_t cqm_buf[64] );
int ret = 0, ok, used_asm;
int oks[2] = {1,1}, used_asms[2] = {0,0};
int oks[3] = {1,1,1}, used_asms[3] = {0,0,0};
x264_t h_buf;
x264_t *h = &h_buf;
memset( h, 0, sizeof(*h) );
......@@ -1556,6 +1556,41 @@ static int check_quant( int cpu_ref, int cpu_new )
TEST_DEQUANT_DC( quant_4x4_dc, dequant_4x4_dc, CQM_4IY, 4 );
/* Check qf_a.optname against qf_c.optname; skipped when qf_a.optname is the
 * same pointer as qf_ref.optname.
 *   qname   - quant function (from qf_c) used to prepare the coefficients
 *   optname - optimize function under test
 *   w       - block width; w*w coefficients are generated and compared
 * For every qp from i_qp_max down to i_qp_min whose dmf does not exceed 32*64,
 * and for magnitudes i doubling from 16 (capped at PIXEL_MAX*16, which also ends
 * the loop): fill dct1 with random values in [-max, max], quantize them with
 * qf_c.qname, copy to dct2, then run the C version on dct1 and the asm version
 * on dct2. A differing return value or differing coefficients clears oks[2]
 * and prints a failure line; the loops keep running after a failure.
 * NOTE(review): call_c2/call_a2 are presumably the benchmark-only invocations
 * -- confirm against their definitions. */
#define TEST_OPTIMIZE_CHROMA_DC( qname, optname, w ) \
if( qf_a.optname != qf_ref.optname ) \
{ \
set_func_name( #optname ); \
used_asms[2] = 1; \
for( int qp = h->param.rc.i_qp_max; qp >= h->param.rc.i_qp_min; qp-- ) \
{ \
int dmf = h->dequant4_mf[CQM_4IC][qp%6][0] << qp/6; \
if( dmf > 32*64 ) \
continue; \
for( int i = 16; ; i <<= 1 )\
{ \
int res_c, res_asm; \
int max = X264_MIN( i, PIXEL_MAX*16 ); \
for( int j = 0; j < w*w; j++ ) \
dct1[j] = rand()%(max*2+1) - max; \
call_c1( qf_c.qname, dct1, h->quant4_mf[CQM_4IC][qp][0]>>1, h->quant4_bias[CQM_4IC][qp][0]>>1 ); \
memcpy( dct2, dct1, w*w*sizeof(dctcoef) ); \
res_c = call_c1( qf_c.optname, dct1, dmf ); \
res_asm = call_a1( qf_a.optname, dct2, dmf ); \
if( res_c != res_asm || memcmp( dct1, dct2, w*w*sizeof(dctcoef) ) ) \
{ \
oks[2] = 0; \
fprintf( stderr, #optname "(qp=%d, res_c=%d, res_asm=%d): [FAILED]\n", qp, res_c, res_asm ); \
} \
call_c2( qf_c.optname, dct1, dmf ); \
call_a2( qf_a.optname, dct2, dmf ); \
if( i >= PIXEL_MAX*16 ) \
break; \
} \
} \
}
TEST_OPTIMIZE_CHROMA_DC( quant_2x2_dc, optimize_chroma_dc, 2 );
x264_cqm_delete( h );
}
......@@ -1565,6 +1600,9 @@ static int check_quant( int cpu_ref, int cpu_new )
ok = oks[1]; used_asm = used_asms[1];
report( "dequant :" );
ok = oks[2]; used_asm = used_asms[2];
report( "optimize chroma dc :" );
ok = 1; used_asm = 0;
if( qf_a.denoise_dct != qf_ref.denoise_dct )
{
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment