Commit d2e86861 authored by Fiona Glaser
Browse files

AVX mbtree_propagate

Up to ~20-30% faster than SSE2 on Sandy Bridge.
parent 6d2b51a3
......@@ -179,7 +179,7 @@ x264_frame_t *x264_frame_new( x264_t *h, int b_fdec )
CHECKED_MALLOCZERO( frame->lowres_mvs[j][i], 2*h->mb.i_mb_count*sizeof(int16_t) );
CHECKED_MALLOC( frame->lowres_mv_costs[j][i], h->mb.i_mb_count*sizeof(int) );
}
CHECKED_MALLOC( frame->i_propagate_cost, (i_mb_count+3) * sizeof(uint16_t) );
CHECKED_MALLOC( frame->i_propagate_cost, (i_mb_count+7) * sizeof(uint16_t) );
for( int j = 0; j <= h->param.i_bframe+1; j++ )
for( int i = 0; i <= h->param.i_bframe+1; i++ )
CHECKED_MALLOC( frame->lowres_costs[j][i], (i_mb_count+3) * sizeof(uint16_t) );
......
......@@ -342,7 +342,7 @@ int x264_macroblock_thread_allocate( x264_t *h, int b_lookahead )
((me_range*2+24) * sizeof(int16_t) + (me_range+4) * (me_range+1) * 4 * sizeof(mvsad_t));
scratch_size = X264_MAX3( buf_hpel, buf_ssim, buf_tesa );
}
int buf_mbtree = h->param.rc.b_mb_tree * ((h->mb.i_mb_width+3)&~3) * sizeof(int);
int buf_mbtree = h->param.rc.b_mb_tree * ((h->mb.i_mb_width+7)&~7) * sizeof(int);
scratch_size = X264_MAX( scratch_size, buf_mbtree );
if( scratch_size )
CHECKED_MALLOC( h->scratch_buffer, scratch_size );
......
......@@ -40,7 +40,7 @@ deinterleave_shuf: db 0,2,4,6,8,10,12,14,1,3,5,7,9,11,13,15
pd_16: times 4 dd 16
pd_0f: times 4 dd 0xffff
pf_inv256: times 4 dd 0.00390625
pf_inv256: times 8 dd 0.00390625
pad10: times 8 dw 10*PIXEL_MAX
pad20: times 8 dw 20*PIXEL_MAX
......@@ -1630,7 +1630,7 @@ FRAME_INIT_LOWRES ssse3
; uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len )
;-----------------------------------------------------------------------------
cglobal mbtree_propagate_cost_sse2, 7,7,7
shl r6d, 1
add r6d, r6d
lea r0, [r0+r6*2]
add r1, r6
add r2, r6
......@@ -1673,3 +1673,49 @@ cglobal mbtree_propagate_cost_sse2, 7,7,7
jl .loop
REP_RET
; INT16_TO_FLOAT n
; Widens the eight 16-bit words held in xmm<n> into eight single-precision
; floats in ymm<n>.
; Each word is interleaved with the matching word of xmm7, so xmm7 must be
; zero on entry for this to act as a zero-extension (the AVX routine that
; uses this macro clears it with vpxor before its loop).
; Clobbers xmm4, which temporarily holds the widened upper four words.
%macro INT16_TO_FLOAT 1
; words 4..7 interleaved with xmm7 -> four dwords in xmm4
vpunpckhwd xmm4, xmm%1, xmm7
; words 0..3 interleaved with xmm7 -> four dwords, in place
vpunpcklwd xmm%1, xmm7
; place xmm4 in the upper 128-bit lane: ymm<n> now holds all eight dwords
vinsertf128 ymm%1, ymm%1, xmm4, 1
; packed int32 -> packed float
vcvtdq2ps ymm%1, ymm%1
%endmacro
; FIXME: align loads/stores to 16 bytes
;
; mbtree_propagate_cost_avx: AVX variant processing eight elements per loop
; iteration.
; Register/argument mapping (order as in the C prototype; r2/r4/r1/r3 are
; confirmed by the load comments in the loop):
;   r0 = dst (int*), r1 = propagate_in, r2 = intra_costs, r3 = inter_costs,
;   r4 = inv_qscales (all uint16_t*), r5 = fps_factor (float*), r6 = len.
; Per element the loop computes
;   dst = (prop + intra*invq*fps_factor/256) * (intra - (inter & mask)) / intra
; where the division is a vrcpps estimate refined by one Newton-Raphson step
; and the result is converted to int32.
; The loop body always runs at least once and consumes 8 uint16 from each
; input and writes 8 int32 per pass, so when len is not a multiple of 8 it
; reads and writes past element len-1; buffers need padding for that.
; "7,7,8" is presumably x86inc's args / GPRs / vector-registers triple -
; confirm against x86inc.asm.
cglobal mbtree_propagate_cost_avx, 7,7,8
; r6 = len*2 = byte length of each uint16 input array
add r6d, r6d
; dst elements are 4 bytes wide, so its end is dst + r6*2
lea r0, [r0+r6*2]
; bias every input pointer to its array end ...
add r1, r6
add r2, r6
add r3, r6
add r4, r6
; ... and count a negative byte offset up towards zero
neg r6
; mask applied to inter costs; pw_3fff is defined outside this view
; (presumably 0x3fff in every word, judging by the name - confirm)
vmovdqa xmm5, [pw_3fff]
; splat *fps_factor across all eight lanes
vbroadcastss ymm6, [r5]
; pre-scale by 1/256 (pf_inv256 = 0.00390625) so the loop needs no shift
vmulps ymm6, ymm6, [pf_inv256]
; xmm7 = 0, the zero source INT16_TO_FLOAT interleaves with
vpxor xmm7, xmm7
.loop:
vmovdqu xmm0, [r2+r6] ; intra
vmovdqu xmm1, [r4+r6] ; invq
vmovdqu xmm2, [r1+r6] ; prop
vpand xmm3, xmm5, [r3+r6] ; inter
; widen the four inputs from 8 x uint16 to 8 x float (clobbers xmm4)
INT16_TO_FLOAT 0
INT16_TO_FLOAT 1
INT16_TO_FLOAT 2
INT16_TO_FLOAT 3
; ymm1 = invq * intra
vmulps ymm1, ymm1, ymm0
; ymm4 = intra - inter (masked)
vsubps ymm4, ymm0, ymm3
vmulps ymm1, ymm1, ymm6 ; intra*invq*fps_factor>>8
vaddps ymm1, ymm1, ymm2 ; prop + (intra*invq*fps_factor>>8)
vrcpps ymm3, ymm0 ; 1 / intra 1st approximation
vmulps ymm2, ymm0, ymm3 ; intra * (1/intra 1st approx)
vmulps ymm2, ymm2, ymm3 ; intra * (1/intra 1st approx)^2
vmulps ymm1, ymm1, ymm4 ; (prop + (intra*invq*fps_factor>>8)) * (intra - inter)
vaddps ymm3, ymm3, ymm3 ; 2 * (1/intra 1st approx)
vsubps ymm3, ymm3, ymm2 ; 2nd approximation for 1/intra
vmulps ymm1, ymm1, ymm3 ; / intra
; float -> int32, rounded according to the current MXCSR rounding mode
vcvtps2dq ymm1, ymm1
; store 8 int32 (32 bytes); the dst offset is twice the uint16 byte offset
vmovdqu [r0+r6*2], ymm1
; advance 16 bytes = 8 uint16 elements; loop while the offset is negative
add r6, 16
jl .loop
; zero the upper ymm halves before returning to code that may use legacy SSE
vzeroupper
RET
......@@ -140,6 +140,8 @@ void x264_integral_init8v_sse2( uint16_t *sum8, int stride );
void x264_integral_init4v_ssse3( uint16_t *sum8, uint16_t *sum4, int stride );
/* SIMD implementations of mbtree_propagate_cost; both variants share one
 * signature.
 *   dst          - output, one int per element
 *   propagate_in - uint16_t input array
 *   intra_costs  - uint16_t input array
 *   inter_costs  - uint16_t input array
 *   inv_qscales  - uint16_t input array
 *   fps_factor   - pointer to a single float scale factor
 *   len          - number of elements to process
 * NOTE(review): the AVX variant works in groups of 8 elements, so it can
 * touch up to 7 elements past len in every array - confirm that callers pad
 * their buffers accordingly. */
void x264_mbtree_propagate_cost_sse2( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
void x264_mbtree_propagate_cost_avx( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
#define MC_CHROMA(cpu)\
void x264_mc_chroma_##cpu( pixel *dstu, pixel *dstv, int i_dst,\
......@@ -728,4 +730,8 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
if( !(cpu&X264_CPU_STACK_MOD4) )
pf->mc_chroma = x264_mc_chroma_avx;
#endif // HIGH_BIT_DEPTH
if( !(cpu&X264_CPU_AVX) )
return;
pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_avx;
}
......@@ -1255,8 +1255,8 @@ static int check_mc( int cpu_ref, int cpu_new )
int *dstc = dsta+400;
uint16_t *prop = (uint16_t*)buf1;
uint16_t *intra = (uint16_t*)buf4;
uint16_t *inter = intra+100;
uint16_t *qscale = inter+100;
uint16_t *inter = intra+128;
uint16_t *qscale = inter+128;
uint16_t *rnd = (uint16_t*)buf2;
x264_emms();
for( int j = 0; j < 100; j++ )
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment