Commit c583687f authored by Fiona Glaser

VFR/framerate-aware ratecontrol, part 2

MB-tree and qcomp complexity estimation now consider the duration of a frame in their calculations.
This is very important for visual optimizations, as frames that last longer are inherently more important quality-wise.
Improves VFR-aware PSNR by as much as 1-2 dB on extreme test cases, and by ~0.5 dB on more ordinary VFR clips (e.g. deduped anime episodes).

WARNING: This change redefines x264's internal quality measurement.
x264 will now scale its quality based on the framerate of the video due to the aforementioned frame duration logic.
That is, --crf X will give lower quality per frame for a 60fps video than for a 30fps one.
This will make --crf closer to constant perceptual quality than previously.
The "center" for this change is 25fps: that is, videos below 25fps will go up in quality at the same CRF, and videos above 25fps will go down.
This choice is completely arbitrary.

Note that to take full advantage of this, x264 must encode your video at the correct framerate, with the correct timestamps.
parent 247f504d
......@@ -431,30 +431,19 @@ static void frame_init_lowres_core( pixel *src0, pixel *dst0, pixel *dsth, pixel
}
}
#if defined(__GNUC__) && (ARCH_X86 || ARCH_X86_64)
// gcc isn't smart enough to use the "idiv" instruction
static ALWAYS_INLINE int32_t div_64_32(int64_t x, int32_t y)
{
int32_t quotient, remainder;
asm("idiv %4"
:"=a"(quotient), "=d"(remainder)
:"a"((uint32_t)x), "d"((int32_t)(x>>32)), "r"(y)
);
return quotient;
}
#else
#define div_64_32(x,y) ((x)/(y))
#endif
/* Estimate the total amount of influence on future quality that could be had if we
* were to improve the reference samples used to inter predict any given macroblock. */
static void mbtree_propagate_cost( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
uint16_t *inter_costs, uint16_t *inv_qscales, int len )
uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len )
{
float fps = *fps_factor / 256.f;
for( int i = 0; i < len; i++ )
{
int propagate_amount = propagate_in[i] + ((intra_costs[i] * inv_qscales[i] + 128)>>8);
dst[i] = div_64_32((int64_t)propagate_amount * (intra_costs[i] - (inter_costs[i] & LOWRES_COST_MASK)), intra_costs[i]);
float intra_cost = intra_costs[i] * inv_qscales[i];
float propagate_amount = propagate_in[i] + intra_cost*fps;
float propagate_num = intra_costs[i] - (inter_costs[i] & LOWRES_COST_MASK);
float propagate_denom = intra_costs[i];
dst[i] = (int)(propagate_amount * propagate_num / propagate_denom + 0.5f);
}
}
......
......@@ -123,7 +123,7 @@ typedef struct
void (*weight_cache)( x264_t *, x264_weight_t * );
void (*mbtree_propagate_cost)( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
uint16_t *inter_costs, uint16_t *inv_qscales, int len );
uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
} x264_mc_functions_t;
void x264_mc_init( int cpu, x264_mc_functions_t *pf );
......
......@@ -51,7 +51,6 @@ const pw_pixel_max,times 8 dw ((1 << BIT_DEPTH)-1)
const pd_1, times 4 dd 1
const pd_32, times 4 dd 32
const pd_128, times 4 dd 128
const pd_ffff, times 4 dd 0xffff
const pw_00ff, times 8 dw 0x00ff
const pw_ff00, times 8 dw 0xff00
......
......@@ -40,6 +40,7 @@ deinterleave_shuf: db 0,2,4,6,8,10,12,14,1,3,5,7,9,11,13,15
pd_16: times 4 dd 16
pd_0f: times 4 dd 0xffff
pf_inv256: times 4 dd 0.00390625
pad10: times 8 dw 10*PIXEL_MAX
pad20: times 8 dw 20*PIXEL_MAX
......@@ -59,7 +60,6 @@ cextern pw_32
cextern pw_00ff
cextern pw_3fff
cextern pw_pixel_max
cextern pd_128
cextern pd_ffff
%macro LOAD_ADD 4
......@@ -1649,47 +1649,49 @@ FRAME_INIT_LOWRES ssse3
;-----------------------------------------------------------------------------
; void mbtree_propagate_cost( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
; uint16_t *inter_costs, uint16_t *inv_qscales, int len )
; uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len )
;-----------------------------------------------------------------------------
cglobal mbtree_propagate_cost_sse2, 6,6,7
shl r5d, 1
lea r0, [r0+r5*2]
add r1, r5
add r2, r5
add r3, r5
add r4, r5
neg r5
pxor xmm5, xmm5
movdqa xmm6, [pw_3fff]
movdqa xmm4, [pd_128]
cglobal mbtree_propagate_cost_sse2, 7,7,7
shl r6d, 1
lea r0, [r0+r6*2]
add r1, r6
add r2, r6
add r3, r6
add r4, r6
neg r6
pxor xmm4, xmm4
movss xmm6, [r5]
shufps xmm6, xmm6, 0
mulps xmm6, [pf_inv256]
movdqa xmm5, [pw_3fff]
.loop:
movq xmm2, [r2+r5] ; intra
movq xmm0, [r4+r5] ; invq
movq xmm3, [r3+r5] ; inter
movq xmm1, [r1+r5] ; prop
punpcklwd xmm2, xmm5
punpcklwd xmm0, xmm5
movq xmm2, [r2+r6] ; intra
movq xmm0, [r4+r6] ; invq
movq xmm3, [r3+r6] ; inter
movq xmm1, [r1+r6] ; prop
punpcklwd xmm2, xmm4
punpcklwd xmm0, xmm4
pmaddwd xmm0, xmm2
pand xmm3, xmm6
punpcklwd xmm1, xmm5
punpcklwd xmm3, xmm5
paddd xmm0, xmm4
psrld xmm0, 8 ; intra*invq>>8
paddd xmm0, xmm1 ; prop + (intra*invq>>8)
pand xmm3, xmm5
punpcklwd xmm1, xmm4
punpcklwd xmm3, xmm4
cvtdq2ps xmm0, xmm0
mulps xmm0, xmm6 ; intra*invq*fps_factor>>8
cvtdq2ps xmm1, xmm1 ; prop
addps xmm0, xmm1 ; prop + (intra*invq*fps_factor>>8)
cvtdq2ps xmm1, xmm2 ; intra
psubd xmm2, xmm3 ; intra - inter
cvtdq2ps xmm2, xmm2 ; intra - inter
rcpps xmm3, xmm1 ; 1 / intra 1st approximation
cvtdq2ps xmm0, xmm0
mulps xmm1, xmm3 ; intra * (1/intra 1st approx)
cvtdq2ps xmm2, xmm2
mulps xmm1, xmm3 ; intra * (1/intra 1st approx)^2
mulps xmm0, xmm2 ; (prop + (intra*invq>>8)) * (intra - inter)
mulps xmm0, xmm2 ; (prop + (intra*invq*fps_factor>>8)) * (intra - inter)
addps xmm3, xmm3 ; 2 * (1/intra 1st approx)
subps xmm3, xmm1 ; 2nd approximation for 1/intra
mulps xmm0, xmm3 ; / intra
cvttps2dq xmm0, xmm0 ; truncation isn't really desired, but matches the integer implementation
movdqa [r0+r5*2], xmm0
add r5, 8
cvtps2dq xmm0, xmm0
movdqa [r0+r6*2], xmm0
add r6, 8
jl .loop
REP_RET
......@@ -124,7 +124,7 @@ void x264_integral_init8v_mmx( uint16_t *sum8, int stride );
void x264_integral_init8v_sse2( uint16_t *sum8, int stride );
void x264_integral_init4v_ssse3( uint16_t *sum8, uint16_t *sum4, int stride );
void x264_mbtree_propagate_cost_sse2( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
uint16_t *inter_costs, uint16_t *inv_qscales, int len );
uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
#define MC_CHROMA(cpu)\
void x264_mc_chroma_##cpu( pixel *dstu, pixel *dstv, int i_dst,\
......
......@@ -1603,9 +1603,7 @@ int x264_ratecontrol_end( x264_t *h, int bits, int *filler )
rc->cplxr_sum += bits * qp2qscale( rc->qpa_rc ) / (rc->last_rceq * fabs( h->param.rc.f_pb_factor ));
}
rc->cplxr_sum *= rc->cbr_decay;
double frame_duration = (double)h->fenc->i_duration * h->sps->vui.i_num_units_in_tick / h->sps->vui.i_time_scale;
rc->wanted_bits_window += frame_duration * rc->bitrate;
rc->wanted_bits_window += h->fenc->f_duration * rc->bitrate;
rc->wanted_bits_window *= rc->cbr_decay;
}
......@@ -2184,7 +2182,7 @@ static float rate_estimate_qscale( x264_t *h )
rcc->last_satd = x264_rc_analyse_slice( h );
rcc->short_term_cplxsum *= 0.5;
rcc->short_term_cplxcount *= 0.5;
rcc->short_term_cplxsum += rcc->last_satd;
rcc->short_term_cplxsum += rcc->last_satd / (CLIP_DURATION(h->fenc->f_duration) / BASE_FRAME_DURATION);
rcc->short_term_cplxcount ++;
rce.tex_bits = rcc->last_satd;
......@@ -2541,10 +2539,11 @@ static int init_pass2( x264_t *h )
{
x264_ratecontrol_t *rcc = h->rc;
uint64_t all_const_bits = 0;
double timescale = (double)h->sps->vui.i_num_units_in_tick / h->sps->vui.i_time_scale;
double duration = 0;
for( int i = 0; i < rcc->num_entries; i++ )
duration += rcc->entry[i].i_duration;
duration *= (double)h->sps->vui.i_num_units_in_tick / h->sps->vui.i_time_scale;
duration *= timescale;
uint64_t all_available_bits = h->param.rc.i_bitrate * 1000. * duration;
double rate_factor, step_mult;
double qblur = h->param.rc.f_qblur;
......@@ -2583,21 +2582,23 @@ static int init_pass2( x264_t *h )
for( int j = 1; j < cplxblur*2 && j < rcc->num_entries-i; j++ )
{
ratecontrol_entry_t *rcj = &rcc->entry[i+j];
double frame_duration = CLIP_DURATION(rcj->i_duration * timescale) / BASE_FRAME_DURATION;
weight *= 1 - pow( (float)rcj->i_count / rcc->nmb, 2 );
if( weight < .0001 )
break;
gaussian_weight = weight * exp( -j*j/200.0 );
weight_sum += gaussian_weight;
cplx_sum += gaussian_weight * (qscale2bits(rcj, 1) - rcj->misc_bits);
cplx_sum += gaussian_weight * (qscale2bits( rcj, 1 ) - rcj->misc_bits) / frame_duration;
}
/* weighted average of cplx of past frames */
weight = 1.0;
for( int j = 0; j <= cplxblur*2 && j <= i; j++ )
{
ratecontrol_entry_t *rcj = &rcc->entry[i-j];
double frame_duration = CLIP_DURATION(rcj->i_duration * timescale) / BASE_FRAME_DURATION;
gaussian_weight = weight * exp( -j*j/200.0 );
weight_sum += gaussian_weight;
cplx_sum += gaussian_weight * (qscale2bits( rcj, 1 ) - rcj->misc_bits);
cplx_sum += gaussian_weight * (qscale2bits( rcj, 1 ) - rcj->misc_bits) / frame_duration;
weight *= 1 - pow( (float)rcj->i_count / rcc->nmb, 2 );
if( weight < .0001 )
break;
......
......@@ -27,6 +27,16 @@
#ifndef X264_RATECONTROL_H
#define X264_RATECONTROL_H
/* Completely arbitrary. Ratecontrol lowers relative quality at higher framerates
* and the reverse at lower framerates; this serves as the center of the curve. */
#define BASE_FRAME_DURATION (0.04f)
/* Arbitrary limitations as a sanity check. */
#define MAX_FRAME_DURATION 1.00f
#define MIN_FRAME_DURATION 0.01f
#define CLIP_DURATION(f) x264_clip3f(f,MIN_FRAME_DURATION,MAX_FRAME_DURATION)
int x264_ratecontrol_new ( x264_t * );
void x264_ratecontrol_delete( x264_t * );
......
......@@ -748,9 +748,10 @@ static int x264_slicetype_frame_cost_recalculate( x264_t *h, x264_frame_t **fram
return i_score;
}
static void x264_macroblock_tree_finish( x264_t *h, x264_frame_t *frame, int ref0_distance )
static void x264_macroblock_tree_finish( x264_t *h, x264_frame_t *frame, float average_duration, int ref0_distance )
{
x264_emms();
int fps_factor_intra = round( CLIP_DURATION(frame->f_duration) / BASE_FRAME_DURATION * 256 );
int fps_factor_propagate = round( CLIP_DURATION( average_duration) / BASE_FRAME_DURATION * 256 );
float weightdelta = 0.0;
if( ref0_distance && frame->f_weighted_cost_delta[ref0_distance-1] > 0 )
weightdelta = (1.0 - frame->f_weighted_cost_delta[ref0_distance-1]);
......@@ -760,17 +761,18 @@ static void x264_macroblock_tree_finish( x264_t *h, x264_frame_t *frame, int ref
float strength = 5.0f * (1.0f - h->param.rc.f_qcompress);
for( int mb_index = 0; mb_index < h->mb.i_mb_count; mb_index++ )
{
int intra_cost = (frame->i_intra_cost[mb_index] * frame->i_inv_qscale_factor[mb_index]+128)>>8;
int intra_cost = (frame->i_intra_cost[mb_index] * frame->i_inv_qscale_factor[mb_index] + 128) >> 8;
int intra_cost_scaled = (intra_cost * fps_factor_intra + 128) >> 8;
if( intra_cost )
{
int propagate_cost = frame->i_propagate_cost[mb_index];
float log2_ratio = x264_log2(intra_cost + propagate_cost) - x264_log2(intra_cost) + weightdelta;
int propagate_cost = (frame->i_propagate_cost[mb_index] * fps_factor_propagate + 128) >> 8;
float log2_ratio = x264_log2(intra_cost_scaled + propagate_cost) - x264_log2(intra_cost) + weightdelta;
frame->f_qp_offset[mb_index] = frame->f_qp_offset_aq[mb_index] - strength * log2_ratio;
}
}
}
static void x264_macroblock_tree_propagate( x264_t *h, x264_frame_t **frames, int p0, int p1, int b, int referenced )
static void x264_macroblock_tree_propagate( x264_t *h, x264_frame_t **frames, float average_duration, int p0, int p1, int b, int referenced )
{
uint16_t *ref_costs[2] = {frames[p0]->i_propagate_cost,frames[p1]->i_propagate_cost};
int dist_scale_factor = ( ((b-p0) << 8) + ((p1-p0) >> 1) ) / (p1-p0);
......@@ -780,6 +782,9 @@ static void x264_macroblock_tree_propagate( x264_t *h, x264_frame_t **frames, in
int *buf = h->scratch_buffer;
uint16_t *propagate_cost = frames[b]->i_propagate_cost;
x264_emms();
float fps_factor = CLIP_DURATION(frames[b]->f_duration) / CLIP_DURATION(average_duration);
/* For non-reffed frames the source costs are always zero, so just memset one row and re-use it. */
if( !referenced )
memset( frames[b]->i_propagate_cost, 0, h->mb.i_mb_width * sizeof(uint16_t) );
......@@ -789,7 +794,7 @@ static void x264_macroblock_tree_propagate( x264_t *h, x264_frame_t **frames, in
int mb_index = h->mb.i_mb_y*h->mb.i_mb_stride;
h->mc.mbtree_propagate_cost( buf, propagate_cost,
frames[b]->i_intra_cost+mb_index, frames[b]->lowres_costs[b-p0][p1-b]+mb_index,
frames[b]->i_inv_qscale_factor+mb_index, h->mb.i_mb_width );
frames[b]->i_inv_qscale_factor+mb_index, &fps_factor, h->mb.i_mb_width );
if( referenced )
propagate_cost += h->mb.i_mb_width;
for( h->mb.i_mb_x = 0; h->mb.i_mb_x < h->mb.i_mb_width; h->mb.i_mb_x++, mb_index++ )
......@@ -858,7 +863,7 @@ static void x264_macroblock_tree_propagate( x264_t *h, x264_frame_t **frames, in
}
if( h->param.rc.i_vbv_buffer_size && h->param.rc.i_lookahead && referenced )
x264_macroblock_tree_finish( h, frames[b], b == p1 ? b - p0 : 0 );
x264_macroblock_tree_finish( h, frames[b], average_duration, b == p1 ? b - p0 : 0 );
}
static void x264_macroblock_tree( x264_t *h, x264_mb_analysis_t *a, x264_frame_t **frames, int num_frames, int b_intra )
......@@ -866,6 +871,13 @@ static void x264_macroblock_tree( x264_t *h, x264_mb_analysis_t *a, x264_frame_t
int idx = !b_intra;
int last_nonb, cur_nonb = 1;
int bframes = 0;
x264_emms();
float total_duration = 0.0;
for( int j = 0; j <= num_frames; j++ )
total_duration += frames[j]->f_duration;
float average_duration = total_duration / (num_frames + 1);
int i = num_frames;
if( b_intra )
......@@ -918,34 +930,34 @@ static void x264_macroblock_tree( x264_t *h, x264_mb_analysis_t *a, x264_frame_t
if( i != middle )
{
x264_slicetype_frame_cost( h, a, frames, p0, p1, i, 0 );
x264_macroblock_tree_propagate( h, frames, p0, p1, i, 0 );
x264_macroblock_tree_propagate( h, frames, average_duration, p0, p1, i, 0 );
}
i--;
}
x264_macroblock_tree_propagate( h, frames, cur_nonb, last_nonb, middle, 1 );
x264_macroblock_tree_propagate( h, frames, average_duration, cur_nonb, last_nonb, middle, 1 );
}
else
{
while( i > cur_nonb )
{
x264_slicetype_frame_cost( h, a, frames, cur_nonb, last_nonb, i, 0 );
x264_macroblock_tree_propagate( h, frames, cur_nonb, last_nonb, i, 0 );
x264_macroblock_tree_propagate( h, frames, average_duration, cur_nonb, last_nonb, i, 0 );
i--;
}
}
x264_macroblock_tree_propagate( h, frames, cur_nonb, last_nonb, last_nonb, 1 );
x264_macroblock_tree_propagate( h, frames, average_duration, cur_nonb, last_nonb, last_nonb, 1 );
last_nonb = cur_nonb;
}
if( !h->param.rc.i_lookahead )
{
x264_macroblock_tree_propagate( h, frames, 0, last_nonb, last_nonb, 1 );
x264_macroblock_tree_propagate( h, frames, average_duration, 0, last_nonb, last_nonb, 1 );
XCHG( uint16_t*, frames[last_nonb]->i_propagate_cost, frames[0]->i_propagate_cost );
}
x264_macroblock_tree_finish( h, frames[last_nonb], last_nonb );
x264_macroblock_tree_finish( h, frames[last_nonb], average_duration, last_nonb );
if( h->param.i_bframe_pyramid && bframes > 1 && !h->param.rc.i_vbv_buffer_size )
x264_macroblock_tree_finish( h, frames[last_nonb+(bframes+1)/2], 0 );
x264_macroblock_tree_finish( h, frames[last_nonb+(bframes+1)/2], average_duration, 0 );
}
static int x264_vbv_frame_cost( x264_t *h, x264_mb_analysis_t *a, x264_frame_t **frames, int p0, int p1, int b )
......
......@@ -1236,29 +1236,34 @@ static int check_mc( int cpu_ref, int cpu_new )
if( mc_a.mbtree_propagate_cost != mc_ref.mbtree_propagate_cost )
{
ok = 1; used_asm = 1;
set_func_name( "mbtree_propagate" );
int *dsta = (int*)buf3;
int *dstc = dsta+400;
uint16_t *prop = (uint16_t*)buf1;
uint16_t *intra = (uint16_t*)buf4;
uint16_t *inter = intra+400;
uint16_t *qscale = inter+400;
uint16_t *rnd = (uint16_t*)buf2;
x264_emms();
for( int i = 0; i < 400; i++ )
for( int i = 0; i < 10; i++ )
{
intra[i] = *rnd++ & 0x7fff;
intra[i] += !intra[i];
inter[i] = *rnd++ & 0x7fff;
qscale[i] = *rnd++ & 0x7fff;
float fps_factor = (rand()&65535) / 256.;
ok = 1; used_asm = 1;
set_func_name( "mbtree_propagate" );
int *dsta = (int*)buf3;
int *dstc = dsta+400;
uint16_t *prop = (uint16_t*)buf1;
uint16_t *intra = (uint16_t*)buf4;
uint16_t *inter = intra+100;
uint16_t *qscale = inter+100;
uint16_t *rnd = (uint16_t*)buf2;
x264_emms();
for( int j = 0; j < 100; j++ )
{
intra[j] = *rnd++ & 0x7fff;
intra[j] += !intra[j];
inter[j] = *rnd++ & 0x7fff;
qscale[j] = *rnd++ & 0x7fff;
}
call_c( mc_c.mbtree_propagate_cost, dstc, prop, intra, inter, qscale, &fps_factor, 100 );
call_a( mc_a.mbtree_propagate_cost, dsta, prop, intra, inter, qscale, &fps_factor, 100 );
// I don't care about exact rounding, this is just how close the floating-point implementation happens to be
x264_emms();
for( int j = 0; j < 100; j++ )
ok &= abs( dstc[j]-dsta[j] ) <= 1 || fabs( (double)dstc[j]/dsta[j]-1 ) < 1e-4;
}
call_c( mc_c.mbtree_propagate_cost, dstc, prop, intra, inter, qscale, 400 );
call_a( mc_a.mbtree_propagate_cost, dsta, prop, intra, inter, qscale, 400 );
// I don't care about exact rounding, this is just how close the floating-point implementation happens to be
x264_emms();
for( int i = 0; i < 400; i++ )
ok &= abs( dstc[i]-dsta[i] ) <= 1 || fabs( (double)dstc[i]/dsta[i]-1 ) < 1e-6;
report( "mbtree propagate :" );
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment