Commit a40aa64d authored by Fiona Glaser

Reduce lookahead memory usage, cache misses

Merge lowres_types with lowres_costs.
parent a6410b8c
......@@ -148,10 +148,7 @@ x264_frame_t *x264_frame_new( x264_t *h, int b_fdec )
CHECKED_MALLOC( frame->i_propagate_cost, (i_mb_count+3) * sizeof(uint16_t) );
for( int j = 0; j <= h->param.i_bframe+1; j++ )
for( int i = 0; i <= h->param.i_bframe+1; i++ )
{
CHECKED_MALLOC( frame->lowres_costs[j][i], (i_mb_count+3) * sizeof(uint16_t) );
CHECKED_MALLOC( frame->lowres_inter_types[j][i], (i_mb_count+3)/4 * sizeof(uint8_t) );
}
frame->i_intra_cost = frame->lowres_costs[0][0];
memset( frame->i_intra_cost, -1, (i_mb_count+3) * sizeof(uint16_t) );
}
......@@ -199,10 +196,7 @@ void x264_frame_delete( x264_frame_t *frame )
x264_free( frame->i_propagate_cost );
for( int j = 0; j <= X264_BFRAME_MAX+1; j++ )
for( int i = 0; i <= X264_BFRAME_MAX+1; i++ )
{
x264_free( frame->lowres_costs[j][i] );
x264_free( frame->lowres_inter_types[j][i] );
}
x264_free( frame->f_qp_offset );
x264_free( frame->f_qp_offset_aq );
x264_free( frame->i_inv_qscale_factor );
......
......@@ -84,9 +84,14 @@ typedef struct x264_frame
uint8_t *mb_partition;
int16_t (*mv[2])[2];
int16_t (*lowres_mvs[2][X264_BFRAME_MAX+1])[2];
/* Stored as (lists_used << LOWRES_COST_SHIFT) + (cost).
* Doesn't need special addressing for intra cost because
* lists_used is guaranteed to be zero in that case. */
uint16_t (*lowres_costs[X264_BFRAME_MAX+2][X264_BFRAME_MAX+2]);
/* Actually a width-2 bitfield with 4 values per uint8_t. */
uint8_t (*lowres_inter_types[X264_BFRAME_MAX+2][X264_BFRAME_MAX+2]);
#define LOWRES_COST_MASK ((1<<14)-1)
#define LOWRES_COST_SHIFT 14
int *lowres_mv_costs[2][X264_BFRAME_MAX+1];
int8_t *ref[2];
int i_ref[2];
......
......@@ -427,7 +427,7 @@ static void mbtree_propagate_cost( int *dst, uint16_t *propagate_in, uint16_t *i
for( int i = 0; i < len; i++ )
{
int propagate_amount = propagate_in[i] + ((intra_costs[i] * inv_qscales[i] + 128)>>8);
dst[i] = div_64_32((int64_t)propagate_amount * (intra_costs[i] - inter_costs[i]), intra_costs[i]);
dst[i] = div_64_32((int64_t)propagate_amount * (intra_costs[i] - (inter_costs[i] & LOWRES_COST_MASK)), intra_costs[i]);
}
}
......
......@@ -37,6 +37,7 @@ pw_1: times 8 dw 1
pw_16: times 8 dw 16
pw_32: times 8 dw 32
pd_128: times 4 dd 128
pw_0x3fff: times 4 dw 0x3fff
SECTION .text
......@@ -1132,8 +1133,9 @@ cglobal x264_mbtree_propagate_cost_sse2, 6,6
pmaddwd xmm0, xmm2
paddd xmm0, xmm4
psrld xmm0, 8 ; intra*invq>>8
movq xmm1, [r1+r5] ; prop
movq xmm3, [r3+r5] ; inter
movq xmm1, [r1+r5] ; prop
pand xmm3, [pw_0x3fff]
punpcklwd xmm1, xmm5
punpcklwd xmm3, xmm5
paddd xmm0, xmm1 ; prop + (intra*invq>>8)
......
......@@ -416,10 +416,6 @@ static void x264_slicetype_mb_cost( x264_t *h, x264_mb_analysis_t *a,
if( b_bidir && ( M32( m[0].mv ) || M32( m[1].mv ) ) )
TRY_BIDIR( m[0].mv, m[1].mv, 5 );
/* Store to width-2 bitfield. */
frames[b]->lowres_inter_types[b-p0][p1-b][i_mb_xy>>2] &= ~(3<<((i_mb_xy&3)*2));
frames[b]->lowres_inter_types[b-p0][p1-b][i_mb_xy>>2] |= list_used<<((i_mb_xy&3)*2);
lowres_intra_mb:
if( !fenc->b_intra_calculated )
{
......@@ -481,7 +477,10 @@ lowres_intra_mb:
int i_icost = fenc->i_intra_cost[i_mb_xy];
int b_intra = i_icost < i_bcost;
if( b_intra )
{
i_bcost = i_icost;
list_used = 0;
}
if( b_frame_score_mb )
fenc->i_intra_mbs[b-p0] += b_intra;
}
......@@ -501,7 +500,8 @@ lowres_intra_mb:
}
}
fenc->lowres_costs[b-p0][p1-b][i_mb_xy] = i_bcost;
assert(i_bcost < (1<<14));
fenc->lowres_costs[b-p0][p1-b][i_mb_xy] = i_bcost + (list_used << LOWRES_COST_SHIFT);
}
#undef TRY_BIDIR
......@@ -615,7 +615,7 @@ static int x264_slicetype_frame_cost_recalculate( x264_t *h, x264_frame_t **fram
for( h->mb.i_mb_x = h->sps->i_mb_width - 1; h->mb.i_mb_x >= 0; h->mb.i_mb_x-- )
{
int i_mb_xy = h->mb.i_mb_x + h->mb.i_mb_y*h->mb.i_mb_stride;
int i_mb_cost = frames[b]->lowres_costs[b-p0][p1-b][i_mb_xy];
int i_mb_cost = frames[b]->lowres_costs[b-p0][p1-b][i_mb_xy] & LOWRES_COST_MASK;
float qp_adj = qp_offset[i_mb_xy];
i_mb_cost = (i_mb_cost * x264_exp2fix8(qp_adj) + 128) >> 8;
row_satd[ h->mb.i_mb_y ] += i_mb_cost;
......@@ -681,7 +681,7 @@ static void x264_macroblock_tree_propagate( x264_t *h, x264_frame_t **frames, in
if( propagate_amount > 0 )
{
/* Access width-2 bitfield. */
int lists_used = (frames[b]->lowres_inter_types[b-p0][p1-b][mb_index>>2] >> ((mb_index&3)*2))&3;
int lists_used = frames[b]->lowres_costs[b-p0][p1-b][mb_index] >> LOWRES_COST_SHIFT;
/* Follow the MVs to the previous frame(s). */
for( int list = 0; list < 2; list++ )
if( (lists_used >> list)&1 )
......@@ -1490,7 +1490,7 @@ int x264_rc_analyse_slice( x264_t *h )
for( int x = h->fdec->i_pir_start_col; x <= h->fdec->i_pir_end_col; x++, mb_xy++ )
{
int intra_cost = (h->fenc->i_intra_cost[mb_xy] * ip_factor + 128) >> 8;
int inter_cost = h->fenc->lowres_costs[b-p0][p1-b][mb_xy];
int inter_cost = h->fenc->lowres_costs[b-p0][p1-b][mb_xy] & LOWRES_COST_MASK;
int diff = intra_cost - inter_cost;
if( h->param.rc.i_aq_mode )
h->fdec->i_row_satd[y] += (diff * frames[b]->i_inv_qscale_factor[mb_xy] + 128) >> 8;
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment