Commit 65068aab authored by Fiona Glaser

Faster mbtree propagate and x264_log2, less memory usage

Avoid an int->float conversion with a small table.
Change lowres_inter_types to a bitfield; cut its size by 75%.
Somewhat lower memory usage with lots of bframes.
Make log2/exp2 tables global to avoid duplication.
parent adc25db9
......@@ -154,12 +154,9 @@ static inline uint32_t x264_cabac_amvd_sum( int16_t *mvdleft, int16_t *mvdtop )
return amvd0 + (amvd1<<16);
}
/* 64-entry fractional table for the fixed-point exp2 in x264_exp2fix8, which
 * reads entry (i&63), adds 256 and then shifts by (i>>6).
 * NOTE(review): entries appear to be round(256*(2^((i+0.5)/64) - 1)) -- confirm
 * against the generator before editing values. */
static const uint8_t exp2_lut[64] = {
1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 34, 37, 40, 44, 47,
50, 53, 57, 60, 64, 67, 71, 74, 78, 81, 85, 89, 93, 96, 100, 104,
108, 112, 116, 120, 124, 128, 132, 137, 141, 145, 150, 154, 159, 163, 168, 172,
177, 182, 186, 191, 196, 201, 206, 211, 216, 221, 226, 232, 237, 242, 248, 253,
};
extern const uint8_t x264_exp2_lut[64];
extern const float x264_log2_lut[128];
extern const float x264_log2_lz_lut[32];
/* Not a general-purpose function; multiplies input by -1/6 to convert
* qp to qscale. */
......@@ -168,32 +165,13 @@ static ALWAYS_INLINE int x264_exp2fix8( float x )
if( x >= 512.f/6.f ) return 0;
if( x <= -512.f/6.f ) return 0xffff;
int i = x*(-64.f/6.f) + 512;
return (exp2_lut[i&63]+256) << (i>>6) >> 8;
return (x264_exp2_lut[i&63]+256) << (i>>6) >> 8;
}
/* 128-entry fractional table for x264_log2, indexed by the 7 bits that follow
 * the leading set bit of the input.
 * NOTE(review): entries appear to be log2(1 + i/128) to five decimals -- confirm
 * before editing values. */
static const float log2_lut[128] = {
0.00000, 0.01123, 0.02237, 0.03342, 0.04439, 0.05528, 0.06609, 0.07682,
0.08746, 0.09803, 0.10852, 0.11894, 0.12928, 0.13955, 0.14975, 0.15987,
0.16993, 0.17991, 0.18982, 0.19967, 0.20945, 0.21917, 0.22882, 0.23840,
0.24793, 0.25739, 0.26679, 0.27612, 0.28540, 0.29462, 0.30378, 0.31288,
0.32193, 0.33092, 0.33985, 0.34873, 0.35755, 0.36632, 0.37504, 0.38370,
0.39232, 0.40088, 0.40939, 0.41785, 0.42626, 0.43463, 0.44294, 0.45121,
0.45943, 0.46761, 0.47573, 0.48382, 0.49185, 0.49985, 0.50779, 0.51570,
0.52356, 0.53138, 0.53916, 0.54689, 0.55459, 0.56224, 0.56986, 0.57743,
0.58496, 0.59246, 0.59991, 0.60733, 0.61471, 0.62205, 0.62936, 0.63662,
0.64386, 0.65105, 0.65821, 0.66534, 0.67243, 0.67948, 0.68650, 0.69349,
0.70044, 0.70736, 0.71425, 0.72110, 0.72792, 0.73471, 0.74147, 0.74819,
0.75489, 0.76155, 0.76818, 0.77479, 0.78136, 0.78790, 0.79442, 0.80090,
0.80735, 0.81378, 0.82018, 0.82655, 0.83289, 0.83920, 0.84549, 0.85175,
0.85798, 0.86419, 0.87036, 0.87652, 0.88264, 0.88874, 0.89482, 0.90087,
0.90689, 0.91289, 0.91886, 0.92481, 0.93074, 0.93664, 0.94251, 0.94837,
0.95420, 0.96000, 0.96578, 0.97154, 0.97728, 0.98299, 0.98868, 0.99435,
};
/* Table-driven approximation of log2(x), returned as a float.
 *
 * x is shifted left by lz and then right by 24, leaving its top 8 bits; the
 * low 7 of those (mask 0x7f) select the fractional part from a 128-entry
 * table, and (31 - lz) supplies the integer part.
 *
 * NOTE(review): lz is presumably the leading-zero count of x (x264_clz is not
 * visible here). If it can be 32, e.g. for x == 0, both the shift and the
 * 32-entry x264_log2_lz_lut index would be out of range -- confirm callers
 * never pass 0. */
static ALWAYS_INLINE float x264_log2( uint32_t x )
{
int lz = x264_clz( x );
/* NOTE(review): two return statements follow, the pre-change and post-change
 * forms of the same expression as shown in the commit view. Compiled as
 * written, only the first is reachable. */
return log2_lut[(x<<lz>>24)&0x7f] + (31 - lz);
/* Same value, but the integer part comes from a float table instead of an
 * int->float conversion of (31 - lz). */
return x264_log2_lut[(x<<lz>>24)&0x7f] + x264_log2_lz_lut[lz];
}
/****************************************************************************
......
......@@ -98,7 +98,7 @@ x264_frame_t *x264_frame_new( x264_t *h )
for( i = 0; i <= h->param.i_bframe+1; i++ )
{
CHECKED_MALLOC( frame->lowres_costs[j][i], (i_mb_count+3) * sizeof(uint16_t) );
CHECKED_MALLOC( frame->lowres_inter_types[j][i], (i_mb_count+3) * sizeof(uint8_t) );
CHECKED_MALLOC( frame->lowres_inter_types[j][i], (i_mb_count+3)/4 * sizeof(uint8_t) );
}
frame->i_intra_cost = frame->lowres_costs[0][0];
memset( frame->i_intra_cost, -1, (i_mb_count+3) * sizeof(uint16_t) );
......
......@@ -66,6 +66,7 @@ typedef struct
int16_t (*mv[2])[2];
int16_t (*lowres_mvs[2][X264_BFRAME_MAX+1])[2];
uint16_t (*lowres_costs[X264_BFRAME_MAX+2][X264_BFRAME_MAX+2]);
/* Actually a width-2 bitfield with 4 values per uint8_t. */
uint8_t (*lowres_inter_types[X264_BFRAME_MAX+2][X264_BFRAME_MAX+2]);
int *lowres_mv_costs[2][X264_BFRAME_MAX+1];
int8_t *ref[2];
......
......@@ -156,6 +156,37 @@ const int x264_lambda2_tab[52] = {
943718, 1189010, 1498059, 1887436 /* 48 - 51 */
};
/* Global 64-entry fractional table for the fixed-point exp2 in x264_exp2fix8,
 * which reads entry (i&63), adds 256 and then shifts by (i>>6).
 * NOTE(review): entries appear to be round(256*(2^((i+0.5)/64) - 1)) -- confirm
 * against the generator before editing values. */
const uint8_t x264_exp2_lut[64] = {
1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 34, 37, 40, 44, 47,
50, 53, 57, 60, 64, 67, 71, 74, 78, 81, 85, 89, 93, 96, 100, 104,
108, 112, 116, 120, 124, 128, 132, 137, 141, 145, 150, 154, 159, 163, 168, 172,
177, 182, 186, 191, 196, 201, 206, 211, 216, 221, 226, 232, 237, 242, 248, 253,
};
/* Global 128-entry fractional table for x264_log2, indexed by the 7 bits that
 * follow the leading set bit of the input ((x<<lz>>24)&0x7f).
 * NOTE(review): entries appear to be log2(1 + i/128) to five decimals -- confirm
 * before editing values. */
const float x264_log2_lut[128] = {
0.00000, 0.01123, 0.02237, 0.03342, 0.04439, 0.05528, 0.06609, 0.07682,
0.08746, 0.09803, 0.10852, 0.11894, 0.12928, 0.13955, 0.14975, 0.15987,
0.16993, 0.17991, 0.18982, 0.19967, 0.20945, 0.21917, 0.22882, 0.23840,
0.24793, 0.25739, 0.26679, 0.27612, 0.28540, 0.29462, 0.30378, 0.31288,
0.32193, 0.33092, 0.33985, 0.34873, 0.35755, 0.36632, 0.37504, 0.38370,
0.39232, 0.40088, 0.40939, 0.41785, 0.42626, 0.43463, 0.44294, 0.45121,
0.45943, 0.46761, 0.47573, 0.48382, 0.49185, 0.49985, 0.50779, 0.51570,
0.52356, 0.53138, 0.53916, 0.54689, 0.55459, 0.56224, 0.56986, 0.57743,
0.58496, 0.59246, 0.59991, 0.60733, 0.61471, 0.62205, 0.62936, 0.63662,
0.64386, 0.65105, 0.65821, 0.66534, 0.67243, 0.67948, 0.68650, 0.69349,
0.70044, 0.70736, 0.71425, 0.72110, 0.72792, 0.73471, 0.74147, 0.74819,
0.75489, 0.76155, 0.76818, 0.77479, 0.78136, 0.78790, 0.79442, 0.80090,
0.80735, 0.81378, 0.82018, 0.82655, 0.83289, 0.83920, 0.84549, 0.85175,
0.85798, 0.86419, 0.87036, 0.87652, 0.88264, 0.88874, 0.89482, 0.90087,
0.90689, 0.91289, 0.91886, 0.92481, 0.93074, 0.93664, 0.94251, 0.94837,
0.95420, 0.96000, 0.96578, 0.97154, 0.97728, 0.98299, 0.98868, 0.99435,
};
/* Float copy of (31 - lz) for lz = 0..31. x264_log2 adds entry [lz] as the
 * integer part of its result, which avoids an int->float conversion of
 * (31 - lz) at run time. Only indices 0..31 are valid. */
const float x264_log2_lz_lut[32] = {
31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
};
// should the intra and inter lambdas be different?
// I'm just matching the behaviour of deadzone quant.
static const int x264_trellis_lambda2_tab[2][52] = {
......
......@@ -194,7 +194,9 @@ static int x264_slicetype_mb_cost( x264_t *h, x264_mb_analysis_t *a,
if( b_bidir && ( *(uint32_t*)m[0].mv || *(uint32_t*)m[1].mv ) )
TRY_BIDIR( m[0].mv, m[1].mv, 5 );
frames[b]->lowres_inter_types[b-p0][p1-b][i_mb_xy] = list_used;
/* Store to width-2 bitfield. */
frames[b]->lowres_inter_types[b-p0][p1-b][i_mb_xy>>2] &= ~(3<<((i_mb_xy&3)*2));
frames[b]->lowres_inter_types[b-p0][p1-b][i_mb_xy>>2] |= list_used<<((i_mb_xy&3)*2);
lowres_intra_mb:
/* forbid intra-mbs in B-frames, because it's rare and not worth checking */
......@@ -428,10 +430,11 @@ static void x264_macroblock_tree_finish( x264_t *h, x264_frame_t *frame, int b_b
static void x264_macroblock_tree_propagate( x264_t *h, x264_frame_t **frames, int p0, int p1, int b )
{
x264_frame_t *refs[2] = {frames[p0],frames[p1]};
uint16_t *ref_costs[2] = {frames[p0]->i_propagate_cost,frames[p1]->i_propagate_cost};
int dist_scale_factor = ( ((b-p0) << 8) + ((p1-p0) >> 1) ) / (p1-p0);
int i_bipred_weight = h->param.analyse.b_weighted_bipred ? 64 - (dist_scale_factor>>2) : 32;
int16_t (*mvs[2])[2] = { frames[b]->lowres_mvs[0][b-p0-1], frames[b]->lowres_mvs[1][p1-b-1] };
int bipred_weights[2] = {i_bipred_weight, 64 - i_bipred_weight};
int *buf = h->scratch_buffer;
for( h->mb.i_mb_y = 0; h->mb.i_mb_y < h->sps->i_mb_height; h->mb.i_mb_y++ )
......@@ -446,7 +449,8 @@ static void x264_macroblock_tree_propagate( x264_t *h, x264_frame_t **frames, in
/* Don't propagate for an intra block. */
if( propagate_amount > 0 )
{
int lists_used = frames[b]->lowres_inter_types[b-p0][p1-b][mb_index];
/* Access width-2 bitfield. */
int lists_used = (frames[b]->lowres_inter_types[b-p0][p1-b][mb_index>>2] >> ((mb_index&3)*2))&3;
int list;
/* Follow the MVs to the previous frame(s). */
for( list = 0; list < 2; list++ )
......@@ -456,7 +460,7 @@ static void x264_macroblock_tree_propagate( x264_t *h, x264_frame_t **frames, in
int y = mvs[list][mb_index][1];
int listamount = propagate_amount;
int mbx = (x>>5)+h->mb.i_mb_x;
int mby = ((y>>5)+h->mb.i_mb_y);
int mby = (y>>5)+h->mb.i_mb_y;
int idx0 = mbx + mby*h->mb.i_mb_stride;
int idx1 = idx0 + 1;
int idx2 = idx0 + h->mb.i_mb_stride;
......@@ -470,7 +474,7 @@ static void x264_macroblock_tree_propagate( x264_t *h, x264_frame_t **frames, in
/* Apply bipred weighting. */
if( lists_used == 3 )
listamount = (listamount * (list?(64-i_bipred_weight):i_bipred_weight) + 32) >> 6;
listamount = (listamount * bipred_weights[list] + 32) >> 6;
#define CLIP_ADD(s,x) (s) = X264_MIN((s)+(x),(1<<16)-1)
......@@ -478,21 +482,21 @@ static void x264_macroblock_tree_propagate( x264_t *h, x264_frame_t **frames, in
* be counted. */
if( mbx < h->sps->i_mb_width-1 && mby < h->sps->i_mb_height-1 && mbx >= 0 && mby >= 0 )
{
CLIP_ADD( refs[list]->i_propagate_cost[idx0], (listamount*idx0weight+512)>>10 );
CLIP_ADD( refs[list]->i_propagate_cost[idx1], (listamount*idx1weight+512)>>10 );
CLIP_ADD( refs[list]->i_propagate_cost[idx2], (listamount*idx2weight+512)>>10 );
CLIP_ADD( refs[list]->i_propagate_cost[idx3], (listamount*idx3weight+512)>>10 );
CLIP_ADD( ref_costs[list][idx0], (listamount*idx0weight+512)>>10 );
CLIP_ADD( ref_costs[list][idx1], (listamount*idx1weight+512)>>10 );
CLIP_ADD( ref_costs[list][idx2], (listamount*idx2weight+512)>>10 );
CLIP_ADD( ref_costs[list][idx3], (listamount*idx3weight+512)>>10 );
}
else /* Check offsets individually */
{
if( mbx < h->sps->i_mb_width && mby < h->sps->i_mb_height && mbx >= 0 && mby >= 0 )
CLIP_ADD( refs[list]->i_propagate_cost[idx0], (listamount*idx0weight+512)>>10 );
CLIP_ADD( ref_costs[list][idx0], (listamount*idx0weight+512)>>10 );
if( mbx+1 < h->sps->i_mb_width && mby < h->sps->i_mb_height && mbx+1 >= 0 && mby >= 0 )
CLIP_ADD( refs[list]->i_propagate_cost[idx1], (listamount*idx1weight+512)>>10 );
CLIP_ADD( ref_costs[list][idx1], (listamount*idx1weight+512)>>10 );
if( mbx < h->sps->i_mb_width && mby+1 < h->sps->i_mb_height && mbx >= 0 && mby+1 >= 0 )
CLIP_ADD( refs[list]->i_propagate_cost[idx2], (listamount*idx2weight+512)>>10 );
CLIP_ADD( ref_costs[list][idx2], (listamount*idx2weight+512)>>10 );
if( mbx+1 < h->sps->i_mb_width && mby+1 < h->sps->i_mb_height && mbx+1 >= 0 && mby+1 >= 0 )
CLIP_ADD( refs[list]->i_propagate_cost[idx3], (listamount*idx3weight+512)>>10 );
CLIP_ADD( ref_costs[list][idx3], (listamount*idx3weight+512)>>10 );
}
}
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment