Commit e9ff8c4b authored by Loren Merritt

simd part of x264_macroblock_tree_propagate.

1.6x faster on conroe.
parent 5599c478
......@@ -743,7 +743,8 @@ int x264_macroblock_cache_init( x264_t *h )
int me_range = X264_MIN(h->param.analyse.i_me_range, h->param.analyse.i_mv_range);
int buf_tesa = (h->param.analyse.i_me_method >= X264_ME_ESA) *
((me_range*2+18) * sizeof(int16_t) + (me_range+4) * (me_range+1) * 4 * sizeof(mvsad_t));
CHECKED_MALLOC( h->scratch_buffer, X264_MAX3( buf_hpel, buf_ssim, buf_tesa ) );
int buf_mbtree = h->param.rc.b_mb_tree * ((h->sps->i_mb_width+3)&~3) * sizeof(int);
CHECKED_MALLOC( h->scratch_buffer, X264_MAX4( buf_hpel, buf_ssim, buf_tesa, buf_mbtree ) );
return 0;
fail: return -1;
......
......@@ -356,6 +356,33 @@ static void frame_init_lowres_core( uint8_t *src0, uint8_t *dst0, uint8_t *dsth,
}
}
#if defined(__GNUC__) && (defined(ARCH_X86) || defined(ARCH_X86_64))
/* Signed 64-bit by 32-bit division producing a 32-bit quotient.
 * gcc isn't smart enough to use the "idiv" instruction for this on its own,
 * so issue it directly: the dividend is passed in edx:eax, the quotient comes
 * back in eax and the (unused) remainder in edx.
 * NOTE(review): idiv raises a divide error when y is 0 or when the quotient
 * does not fit in 32 bits, so callers have to rule both out. */
static ALWAYS_INLINE int32_t div_64_32(int64_t x, int32_t y) {
    int32_t quotient, remainder;
    asm("idiv %4"
        :"=a"(quotient), "=d"(remainder)
        :"a"((uint32_t)x), "d"((int32_t)(x>>32)), "r"(y)
    );
    return quotient;
}
#else
/* Portable fallback: ordinary C division. */
#define div_64_32(x,y) ((x)/(y))
#endif
/* Estimate the total amount of influence on future quality that could be had if we
 * were to improve the reference samples used to inter predict any given macroblock.
 *
 * For every i in [0, len):
 *   amount = propagate_in[i] + ((intra_costs[i] * inv_qscales[i] + 128) >> 8)
 *   dst[i] = amount * (intra_costs[i] - inter_costs[i]) / intra_costs[i]
 *
 * intra_costs[i] is the divisor and therefore must be nonzero. */
static void mbtree_propagate_cost( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
                                   uint16_t *inter_costs, uint16_t *inv_qscales, int len )
{
    int i;
    for( i=0; i<len; i++ )
    {
        /* Multiply as uint32_t: two uint16_t operands would otherwise promote to
         * signed int, whose range the product can exceed. Even 0xffff*0xffff+128
         * still fits in 32 unsigned bits, and after >>8 the sum fits in an int. */
        uint32_t scaled_intra = ((uint32_t)intra_costs[i] * inv_qscales[i] + 128) >> 8;
        int propagate_amount = (int)(propagate_in[i] + scaled_intra);
        /* 64-bit product so the multiplication cannot overflow before the division. */
        dst[i] = div_64_32((int64_t)propagate_amount * (intra_costs[i] - inter_costs[i]), intra_costs[i]);
    }
}
void x264_mc_init( int cpu, x264_mc_functions_t *pf )
{
pf->mc_luma = mc_luma;
......@@ -392,6 +419,8 @@ void x264_mc_init( int cpu, x264_mc_functions_t *pf )
pf->integral_init4v = integral_init4v;
pf->integral_init8v = integral_init8v;
pf->mbtree_propagate_cost = mbtree_propagate_cost;
#ifdef HAVE_MMX
x264_mc_init_mmx( cpu, pf );
#endif
......
......@@ -74,6 +74,9 @@ typedef struct
void (*frame_init_lowres_core)( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
int src_stride, int dst_stride, int width, int height );
void (*mbtree_propagate_cost)( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
uint16_t *inter_costs, uint16_t *inv_qscales, int len );
} x264_mc_functions_t;
void x264_mc_init( int cpu, x264_mc_functions_t *pf );
......
......@@ -34,6 +34,7 @@ filt_mul51: times 8 db 1, -5
pw_1: times 8 dw 1
pw_16: times 8 dw 16
pw_32: times 8 dw 32
pd_128: times 4 dd 128
SECTION .text
......@@ -1081,3 +1082,43 @@ INIT_XMM
FRAME_INIT_LOWRES sse2, 12
%define PALIGNR PALIGNR_SSSE3
FRAME_INIT_LOWRES ssse3, 12
;-----------------------------------------------------------------------------
; void mbtree_propagate_cost( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
; uint16_t *inter_costs, uint16_t *inv_qscales, int len )
;
; Per element: dst = (prop + ((intra*invq + 128) >> 8)) * (intra - inter) / intra
; Four elements are handled per loop iteration and the loop only stops once the
; index reaches zero or above, so a len that is not a multiple of 4 is processed
; as if rounded up: the buffers must tolerate that overrun.
; dst is written with movdqa and therefore has to be 16-byte aligned.
; The multiply/divide stage runs in single-precision float, so results are an
; approximation of the integer formula.
;-----------------------------------------------------------------------------
cglobal x264_mbtree_propagate_cost_sse2, 6,6
; r5 = len*2 = size in bytes of each uint16_t input array
shl r5d, 1
; move every pointer to the end of its array; dst holds 4-byte ints, hence r5*2
lea r0, [r0+r5*2]
lea r1, [r1+r5]
lea r2, [r2+r5]
lea r3, [r3+r5]
lea r4, [r4+r5]
; r5 becomes a negative byte offset that counts up towards zero
neg r5
; xmm5 = 0, used below to zero-extend words into dwords
pxor xmm5, xmm5
; xmm4 = 128 in each dword: rounding term for the >>8
movdqa xmm4, [pd_128 GLOBAL]
.loop:
movq xmm2, [r2+r5] ; intra
movq xmm0, [r4+r5] ; invq
punpcklwd xmm2, xmm5
punpcklwd xmm0, xmm5
; the high word of every dword is zero, so each lane is just intra*invq.
; NOTE(review): pmaddwd multiplies signed 16-bit words, so this equals the
; unsigned product only while both inputs are below 0x8000 - confirm callers.
pmaddwd xmm0, xmm2
paddd xmm0, xmm4
psrld xmm0, 8 ; intra*invq>>8
movq xmm1, [r1+r5] ; prop
movq xmm3, [r3+r5] ; inter
punpcklwd xmm1, xmm5
punpcklwd xmm3, xmm5
paddd xmm0, xmm1 ; prop + (intra*invq>>8)
cvtdq2ps xmm1, xmm2 ; intra
psubd xmm2, xmm3 ; intra - inter
cvtdq2ps xmm0, xmm0
cvtdq2ps xmm2, xmm2
mulps xmm0, xmm2 ; (prop + (intra*invq>>8)) * (intra - inter)
divps xmm0, xmm1 ; / intra
cvttps2dq xmm0, xmm0 ; truncation isn't really desired, but matches the integer implementation
; store 4 int results; r5*2 because dst elements are twice as wide as the inputs
movdqa [r0+r5*2], xmm0
; advance by 4 uint16_t elements (8 bytes) and repeat while the offset is negative
add r5, 8
jl .loop
REP_RET
......@@ -74,6 +74,8 @@ extern void x264_integral_init4v_sse2( uint16_t *sum8, uint16_t *sum4, int strid
extern void x264_integral_init8v_mmx( uint16_t *sum8, int stride );
extern void x264_integral_init8v_sse2( uint16_t *sum8, int stride );
extern void x264_integral_init4v_ssse3( uint16_t *sum8, uint16_t *sum4, int stride );
/* SSE2 assembly version of mbtree_propagate_cost; same signature as the
 * mbtree_propagate_cost function pointer in x264_mc_functions_t. */
extern void x264_mbtree_propagate_cost_sse2( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
uint16_t *inter_costs, uint16_t *inv_qscales, int len );
#define LOWRES(cpu) \
extern void x264_frame_init_lowres_core_##cpu( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,\
int src_stride, int dst_stride, int width, int height );
......@@ -303,6 +305,7 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
pf->integral_init4v = x264_integral_init4v_sse2;
pf->integral_init8v = x264_integral_init8v_sse2;
pf->hpel_filter = x264_hpel_filter_sse2_amd;
pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_sse2;
if( cpu&X264_CPU_SSE2_IS_SLOW )
return;
......
......@@ -648,6 +648,7 @@ static int x264_validate_parameters( x264_t *h )
BOOLIFY( analyse.b_fast_pskip );
BOOLIFY( rc.b_stat_write );
BOOLIFY( rc.b_stat_read );
BOOLIFY( rc.b_mb_tree );
#undef BOOLIFY
return 0;
......
......@@ -406,22 +406,21 @@ static void x264_macroblock_tree_propagate( x264_t *h, x264_frame_t **frames, in
int dist_scale_factor = ( ((b-p0) << 8) + ((p1-p0) >> 1) ) / (p1-p0);
int i_bipred_weight = h->param.analyse.b_weighted_bipred ? 64 - (dist_scale_factor>>2) : 32;
int16_t (*mvs[2])[2] = { frames[b]->lowres_mvs[0][b-p0-1], frames[b]->lowres_mvs[1][p1-b-1] };
int *buf = h->scratch_buffer;
for( h->mb.i_mb_y = 0; h->mb.i_mb_y < h->sps->i_mb_height; h->mb.i_mb_y++ )
{
int mb_index = h->mb.i_mb_y*h->mb.i_mb_stride;
h->mc.mbtree_propagate_cost( buf, frames[b]->i_propagate_cost+mb_index,
frames[b]->i_intra_cost+mb_index, frames[b]->lowres_costs[b-p0][p1-b]+mb_index,
frames[b]->i_inv_qscale_factor+mb_index, h->sps->i_mb_width );
for( h->mb.i_mb_x = 0; h->mb.i_mb_x < h->sps->i_mb_width; h->mb.i_mb_x++, mb_index++ )
{
int inter_cost = frames[b]->lowres_costs[b-p0][p1-b][mb_index];
int intra_cost = frames[b]->i_intra_cost[mb_index];
int propagate_amount = buf[h->mb.i_mb_x];
/* Don't propagate for an intra block. */
if( inter_cost < intra_cost )
if( propagate_amount > 0 )
{
int lists_used = frames[b]->lowres_inter_types[b-p0][p1-b][mb_index];
/* The approximate amount of data that this block contains. */
int propagate_amount = frames[b]->i_propagate_cost[mb_index] + ((intra_cost * frames[b]->i_inv_qscale_factor[mb_index] + 128)>>8);
propagate_amount = ((uint64_t)propagate_amount*(intra_cost-inter_cost)) / intra_cost;
int list;
/* Follow the MVs to the previous frame(s). */
for( list = 0; list < 2; list++ )
......
......@@ -960,6 +960,32 @@ static int check_mc( int cpu_ref, int cpu_new )
INTEGRAL_INIT( integral_init8v, 9, sum, stride );
report( "integral init :" );
if( mc_a.mbtree_propagate_cost != mc_ref.mbtree_propagate_cost )
{
ok = 1; used_asm = 1;
set_func_name( "mbtree_propagate" );
int *dsta = (int*)buf3;
int *dstc = dsta+400;
uint16_t *prop = (uint16_t*)buf1;
uint16_t *intra = (uint16_t*)buf4;
uint16_t *inter = intra+400;
uint16_t *qscale = inter+400;
uint16_t *rand = (uint16_t*)buf2;
for( i=0; i<400; i++ )
{
intra[i] = *rand++ & 0x7fff;
intra[i] += !intra[i];
inter[i] = *rand++ & 0x7fff;
qscale[i] = *rand++ & 0x7fff;
}
call_c( mc_c.mbtree_propagate_cost, dstc, prop, intra, inter, qscale, 400 );
call_a( mc_a.mbtree_propagate_cost, dsta, prop, intra, inter, qscale, 400 );
// I don't care about exact rounding, this is just how close the floating-point implementation happens to be
for( i=0; i<400; i++ )
ok &= abs(dstc[i]-dsta[i]) <= (abs(dstc[i])>512) || fabs((double)dstc[i]/dsta[i]-1) < 1e-6;
report( "mbtree propagate :" );
}
return ret;
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment