Commit c82c7374 authored by Henrik Gramner

x86: Add asm for mbtree fixed point conversion

The QP offsets of each macroblock are stored as floats internally and
converted to big-endian Q8.8 fixed-point numbers when written to the 2-pass
stats file, then converted back to floats when read from the stats file.

Add SSSE3 and AVX2 implementations for conversions in both directions.

About 8x faster than C on Haswell.
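
For reference, a single value round-trips as in the C code added below; e.g.
a QP offset of -1.25 scales to -320 (0xFEC0) and is stored big-endian:

    dst[i] = endian_fix16( (int16_t)(src[i] * 256.0f) );      /* pack   */
    dst[i] = (int16_t)endian_fix16( src[i] ) * (1.0f/256.0f); /* unpack */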
parent be677efc
@@ -589,6 +589,19 @@ static void mbtree_propagate_list( x264_t *h, uint16_t *ref_costs, int16_t (*mvs
    }
}

/* Conversion between float and Q8.8 fixed point (big-endian) for storage */
static void mbtree_fix8_pack( uint16_t *dst, float *src, int count )
{
    for( int i = 0; i < count; i++ )
        dst[i] = endian_fix16( (int16_t)(src[i] * 256.0f) );
}

static void mbtree_fix8_unpack( float *dst, uint16_t *src, int count )
{
    for( int i = 0; i < count; i++ )
        dst[i] = (int16_t)endian_fix16( src[i] ) * (1.0f/256.0f);
}

void x264_mc_init( int cpu, x264_mc_functions_t *pf, int cpu_independent )
{
    pf->mc_luma = mc_luma;
@@ -646,6 +659,8 @@ void x264_mc_init( int cpu, x264_mc_functions_t *pf, int cpu_independent )
    pf->mbtree_propagate_cost = mbtree_propagate_cost;
    pf->mbtree_propagate_list = mbtree_propagate_list;
    pf->mbtree_fix8_pack = mbtree_fix8_pack;
    pf->mbtree_fix8_unpack = mbtree_fix8_unpack;

#if HAVE_MMX
    x264_mc_init_mmx( cpu, pf );
@@ -201,10 +201,11 @@ typedef struct
    void (*mbtree_propagate_cost)( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs,
                                   uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
    void (*mbtree_propagate_list)( x264_t *h, uint16_t *ref_costs, int16_t (*mvs)[2],
                                   int16_t *propagate_amount, uint16_t *lowres_costs,
                                   int bipred_weight, int mb_y, int len, int list );
    void (*mbtree_fix8_pack)( uint16_t *dst, float *src, int count );
    void (*mbtree_fix8_unpack)( float *dst, uint16_t *src, int count );
} x264_mc_functions_t;
void x264_mc_init( int cpu, x264_mc_functions_t *pf, int cpu_independent );
@@ -59,6 +59,13 @@ deinterleave_shuf32a: db 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30
deinterleave_shuf32b: db 1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31
%endif ; !HIGH_BIT_DEPTH
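; Shuffle masks for the fix8 conversions. In pshufb, bytes of -1 (sign bit
; set) zero the destination byte, so the unpack mask byte-swaps each
; big-endian word into the high half of a zeroed dword.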
mbtree_fix8_unpack_shuf: db -1,-1, 1, 0,-1,-1, 3, 2,-1,-1, 5, 4,-1,-1, 7, 6
                         db -1,-1, 9, 8,-1,-1,11,10,-1,-1,13,12,-1,-1,15,14
mbtree_fix8_pack_shuf: db 1, 0, 3, 2, 5, 4, 7, 6, 9, 8,11,10,13,12,15,14
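; Q8.8 scale factors: 256.0 for packing, 1/256 (0.00390625) for unpacking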
pf_256: times 4 dd 256.0
pf_inv256: times 4 dd 0.00390625
pd_16: times 4 dd 16
pd_0f: times 4 dd 0xffff
@@ -2260,3 +2267,109 @@ INIT_XMM ssse3
MBTREE_PROPAGATE_LIST
INIT_XMM avx
MBTREE_PROPAGATE_LIST
%macro MBTREE_FIX8 0
;-----------------------------------------------------------------------------
; void mbtree_fix8_pack( uint16_t *dst, float *src, int count )
;-----------------------------------------------------------------------------
cglobal mbtree_fix8_pack, 3,4
%if mmsize == 32
    vbroadcastf128 m2, [pf_256]
    vbroadcasti128 m3, [mbtree_fix8_pack_shuf]
%else
    movaps m2, [pf_256]
    mova   m3, [mbtree_fix8_pack_shuf]
%endif
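    ; Offset the pointers by the element count so that a single negative
    ; index register (r2) serves as both loop counter and array offset.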
    sub          r2d, mmsize/2
    movsxdifnidn r2, r2d
    lea          r1, [r1+4*r2]
    lea          r0, [r0+2*r2]
    neg          r2
    jg .skip_loop
.loop:
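    ; Scale by 256, truncate to int32, saturate-pack to int16, then
    ; byte-swap each word to big-endian.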
    mulps     m0, m2, [r1+4*r2]
    mulps     m1, m2, [r1+4*r2+mmsize]
    cvttps2dq m0, m0
    cvttps2dq m1, m1
    packssdw  m0, m1
    pshufb    m0, m3
%if mmsize == 32
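    ; packssdw interleaves the two 128-bit lanes; restore linear order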
    vpermq m0, m0, q3120
%endif
    mova [r0+2*r2], m0
    add  r2, mmsize/2
    jle .loop
.skip_loop:
    sub r2, mmsize/2
    jz .end
    ; Do the remaining values in scalar in order to avoid overreading src.
.scalar:
    mulss     xm0, xm2, [r1+4*r2+2*mmsize]
    cvttss2si r3d, xm0
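    ; rotating the low word by 8 bits byte-swaps it to big-endian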
    rol r3w, 8
    mov [r0+2*r2+mmsize], r3w
    inc r2
    jl .scalar
.end:
    RET
;-----------------------------------------------------------------------------
; void mbtree_fix8_unpack( float *dst, uint16_t *src, int count )
;-----------------------------------------------------------------------------
cglobal mbtree_fix8_unpack, 3,4
%if mmsize == 32
    vbroadcastf128 m2, [pf_inv256]
%else
    movaps m2, [pf_inv256]
    mova   m4, [mbtree_fix8_unpack_shuf+16]
%endif
    mova         m3, [mbtree_fix8_unpack_shuf]
    sub          r2d, mmsize/2
    movsxdifnidn r2, r2d
    lea          r1, [r1+2*r2]
    lea          r0, [r0+4*r2]
    neg          r2
    jg .skip_loop
.loop:
%if mmsize == 32
    vbroadcasti128 m0, [r1+2*r2]
    vbroadcasti128 m1, [r1+2*r2+16]
    pshufb m0, m3
    pshufb m1, m3
%else
    mova   m1, [r1+2*r2]
    pshufb m0, m1, m3
    pshufb m1, m4
%endif
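    ; each word now sits byte-swapped in the high half of its dword, so the
    ; arithmetic shift below performs the sign extension for free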
    psrad    m0, 16 ; sign-extend
    psrad    m1, 16
    cvtdq2ps m0, m0
    cvtdq2ps m1, m1
    mulps    m0, m2
    mulps    m1, m2
    movaps [r0+4*r2], m0
    movaps [r0+4*r2+mmsize], m1
    add r2, mmsize/2
    jle .loop
.skip_loop:
    sub r2, mmsize/2
    jz .end
.scalar:
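    ; load one big-endian word, byte-swap it, and sign-extend to 32 bits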
    movzx r3d, word [r1+2*r2+mmsize]
    rol   r3w, 8
    movsx r3d, r3w
    ; Use 3-arg cvtsi2ss as a workaround for the fact that the instruction has a stupid dependency on
    ; dst which causes terrible performance when used in a loop otherwise. Blame Intel for poor design.
    cvtsi2ss xm0, xm2, r3d
    mulss    xm0, xm2
    movss [r0+4*r2+2*mmsize], xm0
    inc r2
    jl .scalar
.end:
    RET
%endmacro
INIT_XMM ssse3
MBTREE_FIX8
INIT_YMM avx2
MBTREE_FIX8
@@ -173,6 +173,10 @@ void x264_mbtree_propagate_cost_fma4( int16_t *dst, uint16_t *propagate_in, uint
                                      uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
void x264_mbtree_propagate_cost_avx2( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs,
                                      uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
void x264_mbtree_fix8_pack_ssse3( uint16_t *dst, float *src, int count );
void x264_mbtree_fix8_pack_avx2 ( uint16_t *dst, float *src, int count );
void x264_mbtree_fix8_unpack_ssse3( float *dst, uint16_t *src, int count );
void x264_mbtree_fix8_unpack_avx2 ( float *dst, uint16_t *src, int count );
#define MC_CHROMA(cpu)\
void x264_mc_chroma_##cpu( pixel *dstu, pixel *dstv, intptr_t i_dst, pixel *src, intptr_t i_src,\
@@ -736,6 +740,8 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
    pf->plane_copy_swap = x264_plane_copy_swap_ssse3;
    pf->plane_copy_deinterleave_v210 = x264_plane_copy_deinterleave_v210_ssse3;
    pf->mbtree_propagate_list = x264_mbtree_propagate_list_ssse3;
    pf->mbtree_fix8_pack = x264_mbtree_fix8_pack_ssse3;
    pf->mbtree_fix8_unpack = x264_mbtree_fix8_unpack_ssse3;

    if( !(cpu&(X264_CPU_SLOW_SHUFFLE|X264_CPU_SLOW_ATOM|X264_CPU_SLOW_PALIGNR)) )
        pf->integral_init4v = x264_integral_init4v_ssse3;
@@ -841,6 +847,8 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
    pf->plane_copy_swap = x264_plane_copy_swap_ssse3;
    pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_ssse3;
    pf->mbtree_propagate_list = x264_mbtree_propagate_list_ssse3;
    pf->mbtree_fix8_pack = x264_mbtree_fix8_pack_ssse3;
    pf->mbtree_fix8_unpack = x264_mbtree_fix8_unpack_ssse3;

    if( !(cpu&X264_CPU_SLOW_PSHUFB) )
    {
@@ -928,4 +936,6 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
        pf->plane_copy_swap = x264_plane_copy_swap_avx2;
        pf->get_ref = get_ref_avx2;
        pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_avx2;
        pf->mbtree_fix8_pack = x264_mbtree_fix8_pack_avx2;
        pf->mbtree_fix8_unpack = x264_mbtree_fix8_unpack_avx2;
    }
@@ -565,11 +565,7 @@ int x264_macroblock_tree_read( x264_t *h, x264_frame_t *frame, float *quant_offs
        }
        float *dst = rc->mbtree.rescale_enabled ? rc->mbtree.scale_buffer[0] : frame->f_qp_offset;
        for( int i = 0; i < rc->mbtree.src_mb_count; i++ )
        {
            int16_t qp_fix8 = endian_fix16( rc->mbtree.qp_buffer[rc->mbtree.qpbuf_pos][i] );
            dst[i] = qp_fix8 * (1.f/256.f);
        }
        h->mc.mbtree_fix8_unpack( dst, rc->mbtree.qp_buffer[rc->mbtree.qpbuf_pos], rc->mbtree.src_mb_count );
        if( rc->mbtree.rescale_enabled )
            x264_macroblock_tree_rescale( h, rc, frame->f_qp_offset );
        if( h->frames.b_have_lowres )
@@ -1889,9 +1885,7 @@ int x264_ratecontrol_end( x264_t *h, int bits, int *filler )
    if( h->param.rc.b_mb_tree && h->fenc->b_kept_as_ref && !h->param.rc.b_stat_read )
    {
        uint8_t i_type = h->sh.i_type;
        /* Values are stored as big-endian FIX8.8 */
        for( int i = 0; i < h->mb.i_mb_count; i++ )
            rc->mbtree.qp_buffer[0][i] = endian_fix16( (int16_t)(h->fenc->f_qp_offset[i]*256.0) );
        h->mc.mbtree_fix8_pack( rc->mbtree.qp_buffer[0], h->fenc->f_qp_offset, h->mb.i_mb_count );
        if( fwrite( &i_type, 1, 1, rc->p_mbtree_stat_file_out ) < 1 )
            goto fail;
        if( fwrite( rc->mbtree.qp_buffer[0], sizeof(uint16_t), h->mb.i_mb_count, rc->p_mbtree_stat_file_out ) < h->mb.i_mb_count )
@@ -1745,6 +1745,60 @@ static int check_mc( int cpu_ref, int cpu_new )
            call_a2( mc_a.mbtree_propagate_list, &h, ref_costsa, mvs, propagate_amount, lowres_costs, bipred_weight, 0, width, list );
        }
    }
    if( mc_a.mbtree_fix8_pack != mc_ref.mbtree_fix8_pack )
    {
        set_func_name( "mbtree_fix8_pack" );
        used_asm = 1;
        float *fix8_src = (float*)(buf3 + 0x800);
        uint16_t *dstc = (uint16_t*)buf3;
        uint16_t *dsta = (uint16_t*)buf4;
        for( int i = 0; i < 5; i++ )
        {
            int count = 256 + i;
            for( int j = 0; j < count; j++ )
                fix8_src[j] = (int16_t)(rand()) / 256.0f;
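            /* write a canary word just past the end to catch out-of-bounds stores */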
            dsta[count] = 0xAAAA;
            call_c( mc_c.mbtree_fix8_pack, dstc, fix8_src, count );
            call_a( mc_a.mbtree_fix8_pack, dsta, fix8_src, count );
            if( memcmp( dsta, dstc, count * sizeof(uint16_t) ) || dsta[count] != 0xAAAA )
            {
                ok = 0;
                fprintf( stderr, "mbtree_fix8_pack FAILED\n" );
                break;
            }
        }
    }
    if( mc_a.mbtree_fix8_unpack != mc_ref.mbtree_fix8_unpack )
    {
        set_func_name( "mbtree_fix8_unpack" );
        used_asm = 1;
        uint16_t *fix8_src = (uint16_t*)(buf3 + 0x800);
        float *dstc = (float*)buf3;
        float *dsta = (float*)buf4;
        for( int i = 0; i < 5; i++ )
        {
            int count = 256 + i;
            for( int j = 0; j < count; j++ )
                fix8_src[j] = rand();
            M32( &dsta[count] ) = 0xAAAAAAAA;
            call_c( mc_c.mbtree_fix8_unpack, dstc, fix8_src, count );
            call_a( mc_a.mbtree_fix8_unpack, dsta, fix8_src, count );
            if( memcmp( dsta, dstc, count * sizeof(float) ) || M32( &dsta[count] ) != 0xAAAAAAAA )
            {
                ok = 0;
                fprintf( stderr, "mbtree_fix8_unpack FAILED\n" );
                break;
            }
        }
    }
    report( "mbtree :" );

    if( mc_a.memcpy_aligned != mc_ref.memcpy_aligned )