Commit b3fb7184 authored by Fiona Glaser's avatar Fiona Glaser

Macroblock tree overhaul/optimization

Move the second core part of macroblock tree into an assembly function;
SIMD-optimize roughly half of it (for x86). Roughly 25-65% faster mbtree,
depending on content.

Slightly change how mbtree handles the tradeoff between range and precision
for propagation.

Overall a slight (but mostly negligible) effect on SSIM and ~2% faster.
parent 00a00cca
......@@ -389,7 +389,7 @@ int x264_macroblock_thread_allocate( x264_t *h, int b_lookahead )
((me_range*2+24) * sizeof(int16_t) + (me_range+4) * (me_range+1) * 4 * sizeof(mvsad_t));
scratch_size = X264_MAX3( buf_hpel, buf_ssim, buf_tesa );
}
int buf_mbtree = h->param.rc.b_mb_tree * ((h->mb.i_mb_width+7)&~7) * sizeof(int);
int buf_mbtree = h->param.rc.b_mb_tree * ((h->mb.i_mb_width+7)&~7) * sizeof(int16_t);
scratch_size = X264_MAX( scratch_size, buf_mbtree );
if( scratch_size )
CHECKED_MALLOC( h->scratch_buffer, scratch_size );
......@@ -397,7 +397,9 @@ int x264_macroblock_thread_allocate( x264_t *h, int b_lookahead )
h->scratch_buffer = NULL;
int buf_lookahead_threads = (h->mb.i_mb_height + (4 + 32) * h->param.i_lookahead_threads) * sizeof(int) * 2;
CHECKED_MALLOC( h->scratch_buffer2, buf_lookahead_threads );
int buf_mbtree2 = buf_mbtree * 12; /* size of the internal propagate_list asm buffer */
scratch_size = X264_MAX( buf_lookahead_threads, buf_mbtree2 );
CHECKED_MALLOC( h->scratch_buffer2, scratch_size );
return 0;
fail:
......
......@@ -483,20 +483,97 @@ static void frame_init_lowres_core( pixel *src0, pixel *dst0, pixel *dsth, pixel
/* Estimate the total amount of influence on future quality that could be had if we
* were to improve the reference samples used to inter predict any given macroblock. */
static void mbtree_propagate_cost( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
static void mbtree_propagate_cost( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs,
uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len )
{
float fps = *fps_factor;
for( int i = 0; i < len; i++ )
{
float intra_cost = intra_costs[i] * inv_qscales[i];
float propagate_amount = propagate_in[i] + intra_cost*fps;
float propagate_num = intra_costs[i] - (inter_costs[i] & LOWRES_COST_MASK);
float propagate_denom = intra_costs[i];
dst[i] = (int)(propagate_amount * propagate_num / propagate_denom + 0.5f);
int intra_cost = intra_costs[i];
int inter_cost = X264_MIN(intra_costs[i], inter_costs[i] & LOWRES_COST_MASK);
float propagate_intra = intra_cost * inv_qscales[i];
float propagate_amount = propagate_in[i] + propagate_intra*fps;
float propagate_num = intra_cost - inter_cost;
float propagate_denom = intra_cost;
dst[i] = X264_MIN((int)(propagate_amount * propagate_num / propagate_denom + 0.5f), 32767);
}
}
/* Scalar (C) implementation of the second half of macroblock tree:
 * for one row of macroblocks, scatter each block's propagate_amount back
 * along its motion vector into the reference frame's ref_costs plane,
 * splitting the amount bilinearly between the (up to) four reference
 * macroblocks the fractional MV overlaps.
 *
 * h                - encoder context (only mb width/height/stride are read)
 * ref_costs        - per-MB accumulator plane of the reference frame
 * mvs              - lowres MVs for this row; units are such that 32 == one
 *                    macroblock (see the >>5 / &31 below)
 * propagate_amount - per-MB amounts computed by mbtree_propagate_cost
 * lowres_costs     - per-MB costs; the bits above LOWRES_COST_SHIFT encode
 *                    which lists were used
 * bipred_weight    - weight applied when both lists were used (lists_used==3)
 * mb_y             - row index of the macroblocks being processed
 * len              - number of macroblocks in the row
 * list             - which reference list (0 or 1) to propagate along */
static void mbtree_propagate_list( x264_t *h, uint16_t *ref_costs, int16_t (*mvs)[2],
                                   int16_t *propagate_amount, uint16_t *lowres_costs,
                                   int bipred_weight, int mb_y, int len, int list )
{
    unsigned stride = h->mb.i_mb_stride;
    unsigned width = h->mb.i_mb_width;
    unsigned height = h->mb.i_mb_height;

    for( unsigned i = 0; i < len; i++ )
    {
        /* Saturating accumulate: totals are clamped to 2^15-1 so they stay
         * within the (signed) int16 range used by the asm implementation. */
#define CLIP_ADD(s,x) (s) = X264_MIN((s)+(x),(1<<15)-1)
        int lists_used = lowres_costs[i]>>LOWRES_COST_SHIFT;
        /* Skip blocks that were not predicted from this list. */
        if( !(lists_used & (1 << list)) )
            continue;

        int listamount = propagate_amount[i];
        /* Apply bipred weighting. */
        if( lists_used == 3 )
            listamount = (listamount * bipred_weight + 32) >> 6;

        /* Early termination for simple case of mv0. */
        if( !M32( mvs[i] ) )
        {
            CLIP_ADD( ref_costs[mb_y*stride + i], listamount );
            continue;
        }

        /* MV units: 32 == one macroblock, so >>5 selects the target MB and
         * &31 the fractional position within it (for bilinear weighting). */
        int x = mvs[i][0];
        int y = mvs[i][1];
        unsigned mbx = (x>>5)+i;
        unsigned mby = (y>>5)+mb_y;
        unsigned idx0 = mbx + mby * stride;
        unsigned idx2 = idx0 + stride;
        x &= 31;
        y &= 31;
        /* Bilinear weights; they sum to 32*32, and the /1024 is folded into
         * the rounded shift below. */
        int idx0weight = (32-y)*(32-x);
        int idx1weight = (32-y)*x;
        int idx2weight = y*(32-x);
        int idx3weight = y*x;
        idx0weight = (idx0weight * listamount + 512) >> 10;
        idx1weight = (idx1weight * listamount + 512) >> 10;
        idx2weight = (idx2weight * listamount + 512) >> 10;
        idx3weight = (idx3weight * listamount + 512) >> 10;

        /* Fast path: all four target MBs are strictly inside the frame. */
        if( mbx < width-1 && mby < height-1 )
        {
            CLIP_ADD( ref_costs[idx0+0], idx0weight );
            CLIP_ADD( ref_costs[idx0+1], idx1weight );
            CLIP_ADD( ref_costs[idx2+0], idx2weight );
            CLIP_ADD( ref_costs[idx2+1], idx3weight );
        }
        else
        {
            /* Note: this takes advantage of unsigned representation to
             * catch negative mbx/mby. */
            if( mby < height )
            {
                if( mbx < width )
                    CLIP_ADD( ref_costs[idx0+0], idx0weight );
                if( mbx+1 < width )
                    CLIP_ADD( ref_costs[idx0+1], idx1weight );
            }
            if( mby+1 < height )
            {
                if( mbx < width )
                    CLIP_ADD( ref_costs[idx2+0], idx2weight );
                if( mbx+1 < width )
                    CLIP_ADD( ref_costs[idx2+1], idx3weight );
            }
        }
    }
#undef CLIP_ADD
}
void x264_mc_init( int cpu, x264_mc_functions_t *pf, int cpu_independent )
{
pf->mc_luma = mc_luma;
......@@ -552,6 +629,7 @@ void x264_mc_init( int cpu, x264_mc_functions_t *pf, int cpu_independent )
pf->integral_init8v = integral_init8v;
pf->mbtree_propagate_cost = mbtree_propagate_cost;
pf->mbtree_propagate_list = mbtree_propagate_list;
#if HAVE_MMX
x264_mc_init_mmx( cpu, pf );
......@@ -565,7 +643,10 @@ void x264_mc_init( int cpu, x264_mc_functions_t *pf, int cpu_independent )
#endif
if( cpu_independent )
{
pf->mbtree_propagate_cost = mbtree_propagate_cost;
pf->mbtree_propagate_list = mbtree_propagate_list;
}
}
void x264_frame_filter( x264_t *h, x264_frame_t *frame, int mb_y, int b_end )
......
......@@ -122,8 +122,12 @@ typedef struct
weight_fn_t *offsetsub;
void (*weight_cache)( x264_t *, x264_weight_t * );
void (*mbtree_propagate_cost)( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
void (*mbtree_propagate_cost)( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs,
uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
void (*mbtree_propagate_list)( x264_t *h, uint16_t *ref_costs, int16_t (*mvs)[2],
int16_t *propagate_amount, uint16_t *lowres_costs,
int bipred_weight, int mb_y, int len, int list );
} x264_mc_functions_t;
void x264_mc_init( int cpu, x264_mc_functions_t *pf, int cpu_independent );
......
......@@ -36,6 +36,7 @@ const pw_32, times 16 dw 32
const pw_512, times 16 dw 512
const pw_00ff, times 16 dw 0x00ff
const pw_pixel_max,times 16 dw ((1 << BIT_DEPTH)-1)
const pw_0to15, dw 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
const pd_1, times 8 dd 1
const deinterleave_shufd, dd 0,4,1,5,2,6,3,7
const pb_unpackbd1, times 2 db 0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3
......
......@@ -32,6 +32,7 @@
SECTION_RODATA 32
pw_1024: times 16 dw 1024
filt_mul20: times 32 db 20
filt_mul15: times 16 db 1, -5
filt_mul51: times 16 db -5, 1
......@@ -56,8 +57,6 @@ deinterleave_shuf32a: db 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30
deinterleave_shuf32b: db 1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31
%endif ; !HIGH_BIT_DEPTH
pw_1024: times 16 dw 1024
pd_16: times 4 dd 16
pd_0f: times 4 dd 0xffff
......@@ -70,16 +69,22 @@ tap1: times 4 dw 1, -5
tap2: times 4 dw 20, 20
tap3: times 4 dw -5, 1
pw_0xc000: times 8 dw 0xc000
pw_31: times 8 dw 31
pd_4: times 4 dd 4
SECTION .text
cextern pb_0
cextern pw_1
cextern pw_8
cextern pw_16
cextern pw_32
cextern pw_512
cextern pw_00ff
cextern pw_3fff
cextern pw_pixel_max
cextern pw_0to15
cextern pd_ffff
%macro LOAD_ADD 4
......@@ -1986,7 +1991,7 @@ FRAME_INIT_LOWRES
cglobal mbtree_propagate_cost, 6,6,7
movss m6, [r5]
mov r5d, r6m
lea r0, [r0+r5*4]
lea r0, [r0+r5*2]
add r5d, r5d
add r1, r5
add r2, r5
......@@ -2001,10 +2006,11 @@ cglobal mbtree_propagate_cost, 6,6,7
movq m0, [r4+r5] ; invq
movq m3, [r3+r5] ; inter
movq m1, [r1+r5] ; prop
pand m3, m5
pminsw m3, m2
punpcklwd m2, m4
punpcklwd m0, m4
pmaddwd m0, m2
pand m3, m5
punpcklwd m1, m4
punpcklwd m3, m4
%if cpuflag(fma4)
......@@ -2037,7 +2043,8 @@ cglobal mbtree_propagate_cost, 6,6,7
mulps m0, m3 ; / intra
%endif
cvtps2dq m0, m0
mova [r0+r5*2], m0
packssdw m0, m0
movh [r0+r5], m0
add r5, 8
jl .loop
RET
......@@ -2060,7 +2067,7 @@ MBTREE
cglobal mbtree_propagate_cost, 6,6,%1
vbroadcastss m6, [r5]
mov r5d, r6m
lea r0, [r0+r5*4]
lea r0, [r0+r5*2]
add r5d, r5d
add r1, r5
add r2, r5
......@@ -2078,6 +2085,7 @@ cglobal mbtree_propagate_cost, 6,6,%1
pmovzxwd m2, [r1+r5] ; prop
pand xm3, xm5, [r3+r5] ; inter
pmovzxwd m3, xm3
pminsd m3, m0
pmaddwd m1, m0
psubd m4, m0, m3
cvtdq2ps m0, m0
......@@ -2096,6 +2104,7 @@ cglobal mbtree_propagate_cost, 6,6,%1
movu xm1, [r4+r5]
movu xm2, [r1+r5]
pand xm3, xm5, [r3+r5]
pminsw xm3, xm0
INT16_UNPACK 0
INT16_UNPACK 1
INT16_UNPACK 2
......@@ -2117,7 +2126,9 @@ cglobal mbtree_propagate_cost, 6,6,%1
mulps m1, m3 ; / intra
%endif
vcvtps2dq m1, m1
mova [r0+r5*2], m1
vextractf128 xm2, m1, 1
packssdw xm1, xm2
mova [r0+r5], xm1
add r5, 16
jl .loop
RET
......@@ -2127,3 +2138,95 @@ INIT_YMM avx
MBTREE_AVX 8
INIT_YMM avx2,fma3
MBTREE_AVX 7
; SIMD kernel for the first half of mbtree list propagation: for groups of 8
; macroblocks it precomputes the target {mbx,mby} coordinates and the four
; bilinearly-weighted propagate amounts.  The C wrapper (PROPAGATE_LIST in
; mc-c.c) then performs the scatter into ref_costs.
; Output buffer layout per group of 8 MBs (6*mmsize bytes):
;   [r3+mmsize*0..1] = interleaved {mbx,mby} word pairs
;   [r3+mmsize*2..3] = interleaved idx0/idx1 weights
;   [r3+mmsize*4..5] = interleaved idx2/idx3 weights
%macro MBTREE_PROPAGATE_LIST 0
;-----------------------------------------------------------------------------
; void mbtree_propagate_list_internal( int16_t (*mvs)[2], int16_t *propagate_amount, uint16_t *lowres_costs,
;                                      int16_t *output, int bipred_weight, int mb_y, int len )
;-----------------------------------------------------------------------------
cglobal mbtree_propagate_list_internal, 4,6,8
    movh      m6, [pw_0to15] ; mb_x
    movd      m7, r5m
    pshuflw   m7, m7, 0
    punpcklwd m6, m7         ; 0 y 1 y 2 y 3 y
    movd      m7, r4m
    SPLATW    m7, m7         ; bipred_weight
    psllw     m7, 9          ; bipred_weight << 9
    mov       r5d, r6m
    xor       r4d, r4d
.loop:
    mova      m3, [r1+r4*2]
    movu      m4, [r2+r4*2]
    mova      m5, [pw_0xc000] ; mask of the two lists_used bits in lowres_costs
    pand      m4, m5
    pcmpeqw   m4, m5         ; m4 = (lists_used == 3) mask, i.e. both list bits set
    pmulhrsw  m5, m3, m7     ; propagate_amount = (propagate_amount * bipred_weight + 32) >> 6
%if cpuflag(avx)
    pblendvb  m5, m3, m5, m4
%else
    pand      m5, m4
    pandn     m4, m3
    por       m5, m4         ; if( lists_used == 3 )
                             ;     propagate_amount = (propagate_amount * bipred_weight + 32) >> 6
%endif
    movu      m0, [r0+r4*4]  ; x,y
    movu      m1, [r0+r4*4+mmsize]
    psraw     m2, m0, 5
    psraw     m3, m1, 5
    mova      m4, [pd_4]     ; as words this is {4,0,4,0,...}: adds 4 to mb_x, 0 to mb_y
    paddw     m2, m6         ; {mbx, mby} = ({x,y}>>5)+{h->mb.i_mb_x,h->mb.i_mb_y}
    paddw     m6, m4         ; {mbx, mby} += {4, 0}
    paddw     m3, m6         ; {mbx, mby} = ({x,y}>>5)+{h->mb.i_mb_x,h->mb.i_mb_y}
    paddw     m6, m4         ; {mbx, mby} += {4, 0}
    mova [r3+mmsize*0], m2
    mova [r3+mmsize*1], m3
    mova      m3, [pw_31]
    pand      m0, m3         ; x &= 31
    pand      m1, m3         ; y &= 31
    packuswb  m0, m1         ; pack to bytes: each word now holds y<<8 | x
    psrlw     m1, m0, 3
    pand      m0, m3         ; x
    SWAP       1, 3
    pandn     m1, m3         ; y premultiplied by (1<<5) for later use of pmulhrsw
    mova      m3, [pw_32]
    psubw     m3, m0         ; 32 - x
    mova      m4, [pw_1024]
    psubw     m4, m1         ; (32 - y) << 5
    pmullw    m2, m3, m4     ; idx0weight = (32-y)*(32-x) << 5
    pmullw    m4, m0         ; idx1weight = (32-y)*x << 5
    pmullw    m0, m1         ; idx3weight = y*x << 5
    pmullw    m1, m3         ; idx2weight = y*(32-x) << 5
    ; avoid overflow in the input to pmulhrsw
    psrlw     m3, m2, 15
    psubw     m2, m3         ; idx0weight -= (idx0weight == 32768)
    pmulhrsw  m2, m5         ; idx0weight * propagate_amount + 512 >> 10
    pmulhrsw  m4, m5         ; idx1weight * propagate_amount + 512 >> 10
    pmulhrsw  m1, m5         ; idx2weight * propagate_amount + 512 >> 10
    pmulhrsw  m0, m5         ; idx3weight * propagate_amount + 512 >> 10
    ; interleave idx0/idx1 and idx2/idx3 pairs so the C side can add two
    ; adjacent ref_costs entries at once (CLIP_ADD2)
    SBUTTERFLY wd, 2, 4, 3
    SBUTTERFLY wd, 1, 0, 3
    mova [r3+mmsize*2], m2
    mova [r3+mmsize*3], m4
    mova [r3+mmsize*4], m1
    mova [r3+mmsize*5], m0
    add       r4d, mmsize/2
    add       r3, mmsize*6
    cmp       r4d, r5d
    jl .loop
    REP_RET
%endmacro

INIT_XMM ssse3
MBTREE_PROPAGATE_LIST
INIT_XMM avx
MBTREE_PROPAGATE_LIST
......@@ -161,13 +161,13 @@ void x264_integral_init4v_avx2( uint16_t *sum8, uint16_t *sum4, intptr_t stride
void x264_integral_init8v_mmx ( uint16_t *sum8, intptr_t stride );
void x264_integral_init8v_sse2( uint16_t *sum8, intptr_t stride );
void x264_integral_init8v_avx2( uint16_t *sum8, intptr_t stride );
void x264_mbtree_propagate_cost_sse2( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
void x264_mbtree_propagate_cost_sse2( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs,
uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
void x264_mbtree_propagate_cost_avx ( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
void x264_mbtree_propagate_cost_avx ( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs,
uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
void x264_mbtree_propagate_cost_fma4( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
void x264_mbtree_propagate_cost_fma4( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs,
uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
void x264_mbtree_propagate_cost_avx2_fma3( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
void x264_mbtree_propagate_cost_avx2_fma3( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs,
uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
#define MC_CHROMA(cpu)\
......@@ -533,6 +533,113 @@ PLANE_INTERLEAVE(sse2)
PLANE_INTERLEAVE(avx)
#endif
#if HAVE_X86_INLINE_ASM
/* Add x into s with signed 16-bit saturation (paddsw): the accumulator
 * clamps at 32767 instead of wrapping.  The value round-trips through an
 * integer temp because s is a 16-bit lvalue. */
#define CLIP_ADD(s,x)\
do\
{\
    int temp;\
    asm("movd %0, %%xmm0 \n"\
        "movd %2, %%xmm1 \n"\
        "paddsw %%xmm1, %%xmm0 \n"\
        "movd %%xmm0, %1 \n"\
        :"+m"(s), "=&r"(temp)\
        :"m"(x)\
    );\
    s = temp;\
} while(0)

/* Saturating add of two adjacent int16 entries at once; s and x are
 * pointers to pairs, accessed as a single 32-bit load/store via M32. */
#define CLIP_ADD2(s,x)\
do\
{\
    asm("movd %0, %%xmm0 \n"\
        "movd %1, %%xmm1 \n"\
        "paddsw %%xmm1, %%xmm0 \n"\
        "movd %%xmm0, %0 \n"\
        :"+m"(M32(s))\
        :"m"(M32(x))\
    );\
} while(0)
#else
/* Portable fallbacks: clamp to 2^15-1 to mirror paddsw saturation. */
#define CLIP_ADD(s,x) (s) = X264_MIN((s)+(x),(1<<15)-1)
#define CLIP_ADD2(s,x)\
do\
{\
    CLIP_ADD((s)[0], (x)[0]);\
    CLIP_ADD((s)[1], (x)[1]);\
} while(0)
#endif
/* Wrapper pairing the asm mbtree kernel with scalar accumulation: the
 * internal_##cpu kernel precomputes, per group of 8 macroblocks, the target
 * {mbx,mby} coordinates and the four bilinearly-weighted propagate amounts
 * into h->scratch_buffer2; this wrapper then scatters them into ref_costs
 * with saturating adds (the scatter itself does not vectorize well). */
#define PROPAGATE_LIST(cpu)\
void x264_mbtree_propagate_list_internal_##cpu( int16_t (*mvs)[2], int16_t *propagate_amount,\
                                                uint16_t *lowres_costs, int16_t *output,\
                                                int bipred_weight, int mb_y, int len );\
\
static void x264_mbtree_propagate_list_##cpu( x264_t *h, uint16_t *ref_costs, int16_t (*mvs)[2],\
                                              int16_t *propagate_amount, uint16_t *lowres_costs,\
                                              int bipred_weight, int mb_y, int len, int list )\
{\
    int16_t *current = h->scratch_buffer2;\
\
    x264_mbtree_propagate_list_internal_##cpu( mvs, propagate_amount, lowres_costs,\
                                               current, bipred_weight, mb_y, len );\
\
    unsigned stride = h->mb.i_mb_stride;\
    unsigned width = h->mb.i_mb_width;\
    unsigned height = h->mb.i_mb_height;\
\
    /* Buffer layout per group of 8 MBs: 16 words of {mbx,mby} pairs, */\
    /* 16 words of interleaved idx0/idx1 weights, then 16 words of */\
    /* interleaved idx2/idx3 weights -- hence +2 per MB and +32 per group. */\
    for( unsigned i = 0; i < len; current += 32 )\
    {\
        int end = X264_MIN( i+8, len );\
        for( ; i < end; i++, current += 2 )\
        {\
            /* Skip blocks not predicted from this list. */\
            if( !(lowres_costs[i] & (1 << (list+LOWRES_COST_SHIFT))) )\
                continue;\
\
            unsigned mbx = current[0];\
            unsigned mby = current[1];\
            unsigned idx0 = mbx + mby * stride;\
            unsigned idx2 = idx0 + stride;\
\
            /* Shortcut for the simple/common case of zero MV */\
            if( !M32( mvs[i] ) )\
            {\
                CLIP_ADD( ref_costs[idx0], current[16] );\
                continue;\
            }\
\
            /* Fast path: all four target MBs are inside the frame. */\
            if( mbx < width-1 && mby < height-1 )\
            {\
                CLIP_ADD2( ref_costs+idx0, current+16 );\
                CLIP_ADD2( ref_costs+idx2, current+32 );\
            }\
            else\
            {\
                /* Note: this takes advantage of unsigned representation to\
                 * catch negative mbx/mby. */\
                if( mby < height )\
                {\
                    if( mbx < width )\
                        CLIP_ADD( ref_costs[idx0+0], current[16] );\
                    if( mbx+1 < width )\
                        CLIP_ADD( ref_costs[idx0+1], current[17] );\
                }\
                if( mby+1 < height )\
                {\
                    if( mbx < width )\
                        CLIP_ADD( ref_costs[idx2+0], current[32] );\
                    if( mbx+1 < width )\
                        CLIP_ADD( ref_costs[idx2+1], current[33] );\
                }\
            }\
        }\
    }\
}

PROPAGATE_LIST(ssse3)
PROPAGATE_LIST(avx)
#undef CLIP_ADD
#undef CLIP_ADD2
void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
{
if( !(cpu&X264_CPU_MMX) )
......@@ -645,6 +752,7 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
pf->frame_init_lowres_core = x264_frame_init_lowres_core_ssse3;
pf->plane_copy_deinterleave_v210 = x264_plane_copy_deinterleave_v210_ssse3;
pf->mbtree_propagate_list = x264_mbtree_propagate_list_ssse3;
if( !(cpu&(X264_CPU_SLOW_SHUFFLE|X264_CPU_SLOW_ATOM|X264_CPU_SLOW_PALIGNR)) )
pf->integral_init4v = x264_integral_init4v_ssse3;
......@@ -748,6 +856,7 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
pf->avg[PIXEL_4x4] = x264_pixel_avg_4x4_ssse3;
pf->avg[PIXEL_4x2] = x264_pixel_avg_4x2_ssse3;
pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_ssse3;
pf->mbtree_propagate_list = x264_mbtree_propagate_list_ssse3;
if( !(cpu&X264_CPU_SLOW_PSHUFB) )
{
......@@ -824,6 +933,7 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
return;
pf->memzero_aligned = x264_memzero_aligned_avx;
pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_avx;
pf->mbtree_propagate_list = x264_mbtree_propagate_list_avx;
if( cpu&X264_CPU_FMA4 )
pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_fma4;
......
......@@ -31,7 +31,6 @@
SECTION_RODATA 32
pw_0to15: dw 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
pw_43210123: times 2 dw -3, -2, -1, 0, 1, 2, 3, 4
pw_m3: times 16 dw -3
pw_m7: times 16 dw -7
......@@ -56,6 +55,7 @@ cextern pw_8
cextern pw_16
cextern pw_00ff
cextern pw_pixel_max
cextern pw_0to15
%macro STORE8 1
mova [r0+0*FDEC_STRIDEB], %1
......
......@@ -1022,9 +1022,12 @@ static int x264_slicetype_frame_cost_recalculate( x264_t *h, x264_frame_t **fram
return i_score;
}
/* Trade off precision in mbtree for increased range */
#define MBTREE_PRECISION 0.5f
static void x264_macroblock_tree_finish( x264_t *h, x264_frame_t *frame, float average_duration, int ref0_distance )
{
int fps_factor = round( CLIP_DURATION(average_duration) / CLIP_DURATION(frame->f_duration) * 256 );
int fps_factor = round( CLIP_DURATION(average_duration) / CLIP_DURATION(frame->f_duration) * 256 / MBTREE_PRECISION );
float weightdelta = 0.0;
if( ref0_distance && frame->f_weighted_cost_delta[ref0_distance-1] > 0 )
weightdelta = (1.0 - frame->f_weighted_cost_delta[ref0_distance-1]);
......@@ -1051,11 +1054,12 @@ static void x264_macroblock_tree_propagate( x264_t *h, x264_frame_t **frames, fl
int i_bipred_weight = h->param.analyse.b_weighted_bipred ? 64 - (dist_scale_factor>>2) : 32;
int16_t (*mvs[2])[2] = { frames[b]->lowres_mvs[0][b-p0-1], frames[b]->lowres_mvs[1][p1-b-1] };
int bipred_weights[2] = {i_bipred_weight, 64 - i_bipred_weight};
int *buf = h->scratch_buffer;
int16_t *buf = h->scratch_buffer;
uint16_t *propagate_cost = frames[b]->i_propagate_cost;
uint16_t *lowres_costs = frames[b]->lowres_costs[b-p0][p1-b];
x264_emms();
float fps_factor = CLIP_DURATION(frames[b]->f_duration) / (CLIP_DURATION(average_duration) * 256.0f);
float fps_factor = CLIP_DURATION(frames[b]->f_duration) / (CLIP_DURATION(average_duration) * 256.0f) * MBTREE_PRECISION;
/* For non-reffed frames the source costs are always zero, so just memset one row and re-use it. */
if( !referenced )
......@@ -1065,72 +1069,17 @@ static void x264_macroblock_tree_propagate( x264_t *h, x264_frame_t **frames, fl
{
int mb_index = h->mb.i_mb_y*h->mb.i_mb_stride;
h->mc.mbtree_propagate_cost( buf, propagate_cost,
frames[b]->i_intra_cost+mb_index, frames[b]->lowres_costs[b-p0][p1-b]+mb_index,
frames[b]->i_intra_cost+mb_index, lowres_costs+mb_index,
frames[b]->i_inv_qscale_factor+mb_index, &fps_factor, h->mb.i_mb_width );
if( referenced )
propagate_cost += h->mb.i_mb_width;
for( h->mb.i_mb_x = 0; h->mb.i_mb_x < h->mb.i_mb_width; h->mb.i_mb_x++, mb_index++ )
h->mc.mbtree_propagate_list( h, ref_costs[0], &mvs[0][mb_index], buf, &lowres_costs[mb_index],
bipred_weights[0], h->mb.i_mb_y, h->mb.i_mb_width, 0 );
if( b != p1 )
{
int propagate_amount = buf[h->mb.i_mb_x];
/* Don't propagate for an intra block. */
if( propagate_amount > 0 )
{
/* Access width-2 bitfield. */
int lists_used = frames[b]->lowres_costs[b-p0][p1-b][mb_index] >> LOWRES_COST_SHIFT;
/* Follow the MVs to the previous frame(s). */
for( int list = 0; list < 2; list++ )
if( (lists_used >> list)&1 )
{
#define CLIP_ADD(s,x) (s) = X264_MIN((s)+(x),(1<<16)-1)
int listamount = propagate_amount;
/* Apply bipred weighting. */
if( lists_used == 3 )
listamount = (listamount * bipred_weights[list] + 32) >> 6;
/* Early termination for simple case of mv0. */
if( !M32( mvs[list][mb_index] ) )
{
CLIP_ADD( ref_costs[list][mb_index], listamount );
continue;
}
int x = mvs[list][mb_index][0];
int y = mvs[list][mb_index][1];
int mbx = (x>>5)+h->mb.i_mb_x;
int mby = (y>>5)+h->mb.i_mb_y;
int idx0 = mbx + mby * h->mb.i_mb_stride;
int idx1 = idx0 + 1;
int idx2 = idx0 + h->mb.i_mb_stride;
int idx3 = idx0 + h->mb.i_mb_stride + 1;
x &= 31;
y &= 31;