Commit 3451ba3a authored by Henrik Gramner
Browse files

x86: AVX-512 mbtree_propagate_cost

Also make the AVX and AVX2 implementations slightly faster.
parent 75f6f9b2
......@@ -223,11 +223,13 @@ static x264_frame_t *x264_frame_new( x264_t *h, int b_fdec )
PREALLOC( frame->lowres_mvs[j][i], 2*h->mb.i_mb_count*sizeof(int16_t) );
PREALLOC( frame->lowres_mv_costs[j][i], h->mb.i_mb_count*sizeof(int) );
}
PREALLOC( frame->i_propagate_cost, (i_mb_count+7) * sizeof(uint16_t) );
PREALLOC( frame->i_propagate_cost, i_mb_count * sizeof(uint16_t) );
for( int j = 0; j <= h->param.i_bframe+1; j++ )
for( int i = 0; i <= h->param.i_bframe+1; i++ )
PREALLOC( frame->lowres_costs[j][i], (i_mb_count+3) * sizeof(uint16_t) );
PREALLOC( frame->lowres_costs[j][i], i_mb_count * sizeof(uint16_t) );
/* mbtree asm can overread the input buffers, make sure we don't read outside of allocated memory. */
prealloc_size += NATIVE_ALIGN;
}
if( h->param.rc.i_aq_mode )
{
......
......@@ -388,7 +388,7 @@ int x264_macroblock_thread_allocate( x264_t *h, int b_lookahead )
((me_range*2+24) * sizeof(int16_t) + (me_range+4) * (me_range+1) * 4 * sizeof(mvsad_t));
scratch_size = X264_MAX3( buf_hpel, buf_ssim, buf_tesa );
}
int buf_mbtree = h->param.rc.b_mb_tree * ((h->mb.i_mb_width+7)&~7) * sizeof(int16_t);
int buf_mbtree = h->param.rc.b_mb_tree * ((h->mb.i_mb_width+15)&~15) * sizeof(int16_t);
scratch_size = X264_MAX( scratch_size, buf_mbtree );
if( scratch_size )
CHECKED_MALLOC( h->scratch_buffer, scratch_size );
......
......@@ -2147,13 +2147,13 @@ MBTREE
cglobal mbtree_propagate_cost, 6,6,8-2*cpuflag(avx2)
vbroadcastss m5, [r5]
mov r5d, r6m
lea r0, [r0+r5*2]
lea r2, [r2+r5*2]
add r5d, r5d
add r1, r5
add r2, r5
add r3, r5
add r4, r5
neg r5
sub r1, r5
sub r3, r5
sub r0, r5
mova xm4, [pw_3fff]
%if notcpuflag(avx2)
pxor xm7, xm7
......@@ -2165,9 +2165,8 @@ cglobal mbtree_propagate_cost, 6,6,8-2*cpuflag(avx2)
pmovzxwd m2, [r1+r5] ; prop
pand xm3, xm4, [r3+r5] ; inter
pmovzxwd m3, xm3
pminsd m3, m0
pmaddwd m1, m0
psubd m3, m0, m3
psubusw m3, m0, m3
cvtdq2ps m0, m0
cvtdq2ps m1, m1
cvtdq2ps m2, m2
......@@ -2184,7 +2183,7 @@ cglobal mbtree_propagate_cost, 6,6,8-2*cpuflag(avx2)
movu xm1, [r4+r5]
movu xm2, [r1+r5]
pand xm3, xm4, [r3+r5]
pminsw xm3, xm0
psubusw xm3, xm0, xm3
INT16_UNPACK 0
INT16_UNPACK 1
INT16_UNPACK 2
......@@ -2194,7 +2193,6 @@ cglobal mbtree_propagate_cost, 6,6,8-2*cpuflag(avx2)
cvtdq2ps m2, m2
cvtdq2ps m3, m3
mulps m1, m0
subps m3, m0, m3
mulps m1, m5 ; intra*invq*fps_factor>>8
addps m1, m2 ; prop + (intra*invq*fps_factor>>8)
rcpps m2, m0 ; 1 / intra 1st approximation
......@@ -2205,7 +2203,7 @@ cglobal mbtree_propagate_cost, 6,6,8-2*cpuflag(avx2)
subps m2, m0 ; 2nd approximation for 1/intra
mulps m1, m2 ; / intra
%endif
vcvtps2dq m1, m1
cvtps2dq m1, m1
vextractf128 xm2, m1, 1
packssdw xm1, xm2
mova [r0+r5], xm1
......@@ -2219,6 +2217,39 @@ MBTREE_AVX
INIT_YMM avx2
MBTREE_AVX
;-----------------------------------------------------------------------------
; void mbtree_propagate_cost( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs,
;                             uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len )
;
; AVX-512 version. For every element i this computes, in single precision,
;   dst[i] = ( prop[i]/intra[i] + invq[i]*fps_factor ) * max( intra[i] - (inter[i] & 0x3fff), 0 )
; which is algebraically (prop + intra*invq*fps_factor) * (intra - inter) / intra,
; then converts to int32 and stores with signed saturation to int16.
;
; 16 elements (32 bytes of 16-bit input per array) are handled per iteration and
; the loop only stops once the byte index reaches >= 0, so when len is not a
; multiple of 16 the last iteration reads the inputs and writes dst past len
; elements. NOTE(review): callers presumably pad/round their buffers for this;
; confirm against the allocation sites.
;-----------------------------------------------------------------------------
INIT_ZMM avx512
cglobal mbtree_propagate_cost, 6,6
vbroadcastss m5, [r5] ; m5 = *fps_factor replicated into every float lane
mov r5d, 0x3fff3fff ; r5 is free again: build the per-word 0x3fff mask
vpbroadcastd ym4, r5d ; ym4 = 0x3fff in each of the 16 word lanes
mov r5d, r6m ; r5 = len (7th argument)
lea r2, [r2+r5*2] ; intra += len elements (2 bytes each), done while r5 still holds len
add r5d, r5d ; r5 = len*2 = array size in bytes
add r1, r5 ; prop += len*2 bytes
neg r5 ; r5 = -len*2: negative byte index that counts up towards 0
sub r4, r5 ; invq  += len*2 bytes (subtracting the negative index)
sub r3, r5 ; inter += len*2 bytes
sub r0, r5 ; dst   += len*2 bytes
.loop:
pmovzxwd m0, [r2+r5] ; intra: 16 words zero-extended to dwords
pmovzxwd m1, [r1+r5] ; prop: 16 words zero-extended to dwords
pmovzxwd m2, [r4+r5] ; invq: 16 words zero-extended to dwords
pand ym3, ym4, [r3+r5] ; inter: keep only the low 14 bits of each word
pmovzxwd m3, ym3 ; masked inter zero-extended to dwords
psubusw m3, m0, m3 ; m3 = intra - inter, clamped at 0 (upper words are zero, so word-wise saturation suffices)
cvtdq2ps m0, m0 ; intra -> float
cvtdq2ps m1, m1 ; prop -> float
cvtdq2ps m2, m2 ; invq -> float
cvtdq2ps m3, m3 ; (intra - inter) -> float
vdivps m1, m0, {rn-sae} ; m1 = prop / intra, round-to-nearest with FP exceptions suppressed
fmaddps m1, m2, m5, m1 ; m1 = invq*fps_factor + prop/intra
mulps m1, m3 ; m1 *= (intra - inter)
cvtps2dq m1, m1 ; float -> int32
vpmovsdw [r0+r5], m1 ; narrow to int16 with signed saturation and store 16 results to dst
add r5, 32 ; advance 16 elements (32 bytes) in every array
jl .loop ; continue while the byte index is still negative
RET
%macro MBTREE_PROPAGATE_LIST 0
;-----------------------------------------------------------------------------
; void mbtree_propagate_list_internal( int16_t (*mvs)[2], int16_t *propagate_amount, uint16_t *lowres_costs,
......
......@@ -160,14 +160,16 @@ void x264_integral_init4v_avx2( uint16_t *sum8, uint16_t *sum4, intptr_t stride
void x264_integral_init8v_mmx ( uint16_t *sum8, intptr_t stride );
void x264_integral_init8v_sse2( uint16_t *sum8, intptr_t stride );
void x264_integral_init8v_avx2( uint16_t *sum8, intptr_t stride );
void x264_mbtree_propagate_cost_sse2( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs,
uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
void x264_mbtree_propagate_cost_avx ( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs,
uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
void x264_mbtree_propagate_cost_fma4( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs,
uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
void x264_mbtree_propagate_cost_avx2( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs,
uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
void x264_mbtree_propagate_cost_sse2 ( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs,
uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
void x264_mbtree_propagate_cost_avx ( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs,
uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
void x264_mbtree_propagate_cost_fma4 ( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs,
uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
void x264_mbtree_propagate_cost_avx2 ( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs,
uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
void x264_mbtree_propagate_cost_avx512( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs,
uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
void x264_mbtree_fix8_pack_ssse3( uint16_t *dst, float *src, int count );
void x264_mbtree_fix8_pack_avx2 ( uint16_t *dst, float *src, int count );
void x264_mbtree_fix8_unpack_ssse3( float *dst, uint16_t *src, int count );
......@@ -864,4 +866,8 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
pf->mbtree_propagate_list = x264_mbtree_propagate_list_avx2;
pf->mbtree_fix8_pack = x264_mbtree_fix8_pack_avx2;
pf->mbtree_fix8_unpack = x264_mbtree_fix8_unpack_avx2;
if( !(cpu&X264_CPU_AVX512) )
return;
pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_avx512;
}
......@@ -1743,7 +1743,7 @@ static int check_mc( int cpu_ref, int cpu_new )
{
ok &= abs( dstc[j]-dsta[j] ) <= 1 || fabs( (double)dstc[j]/dsta[j]-1 ) < 1e-4;
if( !ok )
fprintf( stderr, "mbtree_propagate_cost FAILED: %f !~= %f\n", (double)dstc[j], (double)dsta[j] );
fprintf( stderr, "mbtree_propagate_cost FAILED: %d !~= %d\n", dstc[j], dsta[j] );
}
}
}
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment