Commit f5af5f14 authored by Fiona Glaser

Various performance optimizations

Simplify and compact storage of direct motion vectors, faster --direct auto.
Shrink various arrays to save a bit of cache.
Simplify and reorganize B macroblock type writing in CABAC.
Add some missing ALIGNED macros.
parent c0474786
......@@ -611,10 +611,10 @@ struct x264_t
struct
{
/* real intra4x4_pred_mode if I_4X4 or I_8X8, I_PRED_4x4_DC if mb available, -1 if not */
int8_t intra4x4_pred_mode[X264_SCAN8_SIZE];
ALIGNED_8( int8_t intra4x4_pred_mode[X264_SCAN8_SIZE] );
/* i_non_zero_count if available else 0x80 */
uint8_t non_zero_count[X264_SCAN8_SIZE];
ALIGNED_4( uint8_t non_zero_count[X264_SCAN8_SIZE] );
/* -1 if unused, -2 if unavailable */
ALIGNED_4( int8_t ref[2][X264_SCAN8_SIZE] );
......@@ -626,8 +626,8 @@ struct x264_t
/* 1 if SKIP or DIRECT. set only for B-frames + CABAC */
ALIGNED_4( int8_t skip[X264_SCAN8_SIZE] );
ALIGNED_16( int16_t direct_mv[2][X264_SCAN8_SIZE][2] );
ALIGNED_4( int8_t direct_ref[2][X264_SCAN8_SIZE] );
ALIGNED_4( int16_t direct_mv[2][4][2] );
ALIGNED_4( int8_t direct_ref[2][4] );
ALIGNED_4( int16_t pskip_mv[2] );
/* number of neighbors (top and left) that used 8x8 dct */
......
......@@ -326,52 +326,58 @@ int x264_mb_predict_mv_direct16x16( x264_t *h, int *b_changed )
if( b_changed != NULL && b_available )
{
int type_col = h->fref1[0]->mb_type[ h->mb.i_mb_xy ];
if( IS_INTRA(type_col) || type_col == P_SKIP )
int type_col = h->fref1[0]->mb_type[h->mb.i_mb_xy];
int changed = 0;
if( IS_INTRA( type_col ) || type_col == P_SKIP )
{
*b_changed = h->mb.cache.direct_ref[0][0] != h->mb.cache.ref[0][X264_SCAN8_0]
|| h->mb.cache.direct_ref[1][0] != h->mb.cache.ref[1][X264_SCAN8_0]
|| M32( h->mb.cache.direct_mv[0][X264_SCAN8_0] ) != M32( h->mb.cache.mv[0][X264_SCAN8_0] )
|| M32( h->mb.cache.direct_mv[1][X264_SCAN8_0] ) != M32( h->mb.cache.mv[1][X264_SCAN8_0] );
changed |= M32( h->mb.cache.direct_mv[0][0] ) ^ M32( h->mb.cache.mv[0][X264_SCAN8_0] );
changed |= M32( h->mb.cache.direct_mv[1][0] ) ^ M32( h->mb.cache.mv[1][X264_SCAN8_0] );
changed |= h->mb.cache.direct_ref[0][0] ^ h->mb.cache.ref[0][X264_SCAN8_0];
changed |= h->mb.cache.direct_ref[1][0] ^ h->mb.cache.ref[1][X264_SCAN8_0];
}
else
{
int i, l;
*b_changed = 0;
int l;
for( l = 0; l < 2; l++ )
for( i = 0; i < 4; i++ )
*b_changed |= h->mb.cache.direct_ref[l][i] != h->mb.cache.ref[l][x264_scan8[i*4]];
*b_changed = *b_changed || memcmp(h->mb.cache.direct_mv, h->mb.cache.mv, sizeof(h->mb.cache.mv));
{
changed |= M32( h->mb.cache.direct_mv[l][0] ) ^ M32( h->mb.cache.mv[l][x264_scan8[ 0]] );
if( changed ) break;
changed |= M32( h->mb.cache.direct_mv[l][1] ) ^ M32( h->mb.cache.mv[l][x264_scan8[ 4]] );
changed |= M32( h->mb.cache.direct_mv[l][2] ) ^ M32( h->mb.cache.mv[l][x264_scan8[ 8]] );
changed |= M32( h->mb.cache.direct_mv[l][3] ) ^ M32( h->mb.cache.mv[l][x264_scan8[12]] );
if( changed ) break;
changed |= h->mb.cache.direct_ref[l][0] ^ h->mb.cache.ref[l][x264_scan8[ 0]];
changed |= h->mb.cache.direct_ref[l][1] ^ h->mb.cache.ref[l][x264_scan8[ 4]];
changed |= h->mb.cache.direct_ref[l][2] ^ h->mb.cache.ref[l][x264_scan8[ 8]];
changed |= h->mb.cache.direct_ref[l][3] ^ h->mb.cache.ref[l][x264_scan8[12]];
}
}
if( !*b_changed )
*b_changed = changed;
if( !changed )
return b_available;
}
/* cache ref & mv */
if( b_available )
{
int i, l;
int l;
for( l = 0; l < 2; l++ )
for( i = 0; i < 4; i++ )
h->mb.cache.direct_ref[l][i] = h->mb.cache.ref[l][x264_scan8[i*4]];
h->mc.memcpy_aligned(h->mb.cache.direct_mv, h->mb.cache.mv, sizeof(h->mb.cache.mv));
{
CP32( h->mb.cache.direct_mv[l][0], h->mb.cache.mv[l][x264_scan8[ 0]] );
CP32( h->mb.cache.direct_mv[l][1], h->mb.cache.mv[l][x264_scan8[ 4]] );
CP32( h->mb.cache.direct_mv[l][2], h->mb.cache.mv[l][x264_scan8[ 8]] );
CP32( h->mb.cache.direct_mv[l][3], h->mb.cache.mv[l][x264_scan8[12]] );
h->mb.cache.direct_ref[l][0] = h->mb.cache.ref[l][x264_scan8[ 0]];
h->mb.cache.direct_ref[l][1] = h->mb.cache.ref[l][x264_scan8[ 4]];
h->mb.cache.direct_ref[l][2] = h->mb.cache.ref[l][x264_scan8[ 8]];
h->mb.cache.direct_ref[l][3] = h->mb.cache.ref[l][x264_scan8[12]];
}
}
return b_available;
}
/* Load the saved direct-prediction data of 8x8 block `idx` into the
 * macroblock cache: for each of the two lists, the block's direct_ref is
 * written through x264_macroblock_cache_ref() and the matching direct_mv
 * entries are copied into h->mb.cache.mv. */
void x264_mb_load_mv_direct8x8( x264_t *h, int idx )
{
    /* Position of the 8x8 block inside the macroblock, in 4x4 units. */
    const int blk_x = 2*(idx%2);
    const int blk_y = 2*(idx/2);
    /* Cache index of the block's first 4x4 entry, and the entry 8 slots
     * further on (presumably the next cache row -- confirm against the
     * scan8 layout). */
    const int first = x264_scan8[idx*4];
    const int second = first + 8;
    int list;

    for( list = 0; list < 2; list++ )
        x264_macroblock_cache_ref( h, blk_x, blk_y, 2, 2, list, h->mb.cache.direct_ref[list][idx] );

    for( list = 0; list < 2; list++ )
    {
        CP64( h->mb.cache.mv[list][first], h->mb.cache.direct_mv[list][first] );
        CP64( h->mb.cache.mv[list][second], h->mb.cache.direct_mv[list][second] );
    }
}
/* This just improves encoder performance, it's not part of the spec */
void x264_mb_predict_mv_ref16x16( x264_t *h, int i_list, int i_ref, int16_t mvc[9][2], int *i_mvc )
{
......
......@@ -292,10 +292,6 @@ void x264_mb_predict_mv( x264_t *h, int i_list, int idx, int i_width, int16_t mv
* if b_changed != NULL, set it to whether refs or mvs differ from
* before this function call. */
int x264_mb_predict_mv_direct16x16( x264_t *h, int *b_changed );
/* x264_mb_load_mv_direct8x8:
* set h->mb.cache.mv and h->mb.cache.ref for B_DIRECT
* must be called only after x264_mb_predict_mv_direct16x16 */
void x264_mb_load_mv_direct8x8( x264_t *h, int idx );
/* x264_mb_predict_mv_ref16x16:
* set mvc with D_16x16 prediction.
* uses all neighbors, even those that didn't end up using this ref.
......
......@@ -132,7 +132,7 @@ typedef struct
} x264_mb_analysis_t;
/* lambda = pow(2,qp/6-2) */
const int x264_lambda_tab[52] = {
const uint8_t x264_lambda_tab[52] = {
1, 1, 1, 1, 1, 1, 1, 1, /* 0-7 */
1, 1, 1, 1, /* 8-11 */
1, 1, 1, 1, 2, 2, 2, 2, /* 12-19 */
......@@ -220,16 +220,16 @@ static const uint16_t x264_chroma_lambda2_offset_tab[] = {
};
/* TODO: calculate CABAC costs */
static const int i_mb_b_cost_table[X264_MBTYPE_MAX] = {
static const uint8_t i_mb_b_cost_table[X264_MBTYPE_MAX] = {
9, 9, 9, 9, 0, 0, 0, 1, 3, 7, 7, 7, 3, 7, 7, 7, 5, 9, 0
};
static const int i_mb_b16x8_cost_table[17] = {
static const uint8_t i_mb_b16x8_cost_table[17] = {
0, 0, 0, 0, 0, 0, 0, 0, 5, 7, 7, 7, 5, 7, 9, 9, 9
};
static const int i_sub_mb_b_cost_table[13] = {
static const uint8_t i_sub_mb_b_cost_table[13] = {
7, 5, 5, 3, 7, 5, 7, 3, 7, 7, 7, 5, 1
};
static const int i_sub_mb_p_cost_table[4] = {
static const uint8_t i_sub_mb_p_cost_table[4] = {
5, 3, 3, 1
};
......@@ -1773,6 +1773,16 @@ static inline void x264_mb_cache_mv_p8x8( x264_t *h, x264_mb_analysis_t *a, int
}
}
/* Load the saved direct-prediction ref and mv of 8x8 block `idx` into the
 * macroblock cache for both lists, using the compact per-8x8 storage
 * h->mb.cache.direct_ref[list][idx] / h->mb.cache.direct_mv[list][idx]. */
static void x264_mb_load_mv_direct8x8( x264_t *h, int idx )
{
    /* Position of the 8x8 block inside the macroblock, in 4x4 units. */
    const int blk_x = 2*(idx&1);
    const int blk_y = 2*(idx>>1);
    int list;

    /* Refs for both lists first, then mvs, each over a 2x2 area. */
    for( list = 0; list < 2; list++ )
        x264_macroblock_cache_ref( h, blk_x, blk_y, 2, 2, list, h->mb.cache.direct_ref[list][idx] );

    for( list = 0; list < 2; list++ )
        x264_macroblock_cache_mv_ptr( h, blk_x, blk_y, 2, 2, list, h->mb.cache.direct_mv[list][idx] );
}
#define CACHE_MV_BI(x,y,dx,dy,me0,me1,part) \
if( x264_mb_partition_listX_table[0][part] ) \
{ \
......
......@@ -144,30 +144,31 @@ static void x264_cabac_mb_type( x264_t *h, x264_cabac_t *cb )
}
else
{
static const uint8_t i_mb_bits[9*3][6] =
static const uint8_t i_mb_bits[9*3] =
{
{ 1,0,0,0,1,2 }, { 1,0,0,1,0,2 }, { 0,0,2,2,2,2 }, /* L0 L0 */
{ 1,0,1,0,1,2 }, { 1,0,1,1,0,2 }, {0}, /* L0 L1 */
{ 1,1,0,0,0,0 }, { 1,1,0,0,0,1 }, {0}, /* L0 BI */
{ 1,0,1,1,1,2 }, { 1,1,1,1,0,2 }, {0}, /* L1 L0 */
{ 1,0,0,1,1,2 }, { 1,0,1,0,0,2 }, { 0,1,2,2,2,2 }, /* L1 L1 */
{ 1,1,0,0,1,0 }, { 1,1,0,0,1,1 }, {0}, /* L1 BI */
{ 1,1,0,1,0,0 }, { 1,1,0,1,0,1 }, {0}, /* BI L0 */
{ 1,1,0,1,1,0 }, { 1,1,0,1,1,1 }, {0}, /* BI L1 */
{ 1,1,1,0,0,0 }, { 1,1,1,0,0,1 }, { 1,0,0,0,0,2 }, /* BI BI */
0x31, 0x29, 0x4, /* L0 L0 */
0x35, 0x2d, 0, /* L0 L1 */
0x43, 0x63, 0, /* L0 BI */
0x3d, 0x2f, 0, /* L1 L0 */
0x39, 0x25, 0x6, /* L1 L1 */
0x53, 0x73, 0, /* L1 BI */
0x4b, 0x6b, 0, /* BI L0 */
0x5b, 0x7b, 0, /* BI L1 */
0x47, 0x67, 0x21 /* BI BI */
};
const int idx = (i_mb_type - B_L0_L0) * 3 + (h->mb.i_partition - D_16x8);
int bits = i_mb_bits[idx];
x264_cabac_encode_decision_noup( cb, 27+3, i_mb_bits[idx][0] );
x264_cabac_encode_decision( cb, 27+5-i_mb_bits[idx][0], i_mb_bits[idx][1] );
if( i_mb_bits[idx][2] != 2 )
x264_cabac_encode_decision_noup( cb, 27+3, bits&1 );
x264_cabac_encode_decision( cb, 27+5-(bits&1), (bits>>1)&1 ); bits >>= 2;
if( bits != 1 )
{
x264_cabac_encode_decision( cb, 27+5, i_mb_bits[idx][2] );
x264_cabac_encode_decision( cb, 27+5, i_mb_bits[idx][3] );
x264_cabac_encode_decision( cb, 27+5, i_mb_bits[idx][4] );
if( i_mb_bits[idx][5] != 2 )
x264_cabac_encode_decision_noup( cb, 27+5, i_mb_bits[idx][5] );
x264_cabac_encode_decision( cb, 27+5, bits&1 ); bits >>= 1;
x264_cabac_encode_decision( cb, 27+5, bits&1 ); bits >>= 1;
x264_cabac_encode_decision( cb, 27+5, bits&1 ); bits >>= 1;
if( bits != 1 )
x264_cabac_encode_decision_noup( cb, 27+5, bits&1 );
}
}
}
......
......@@ -280,7 +280,7 @@ static inline void x264_macroblock_luma_write_cavlc( x264_t *h, bs_t *s, int i8s
void x264_macroblock_write_cavlc( x264_t *h, bs_t *s )
{
const int i_mb_type = h->mb.i_type;
static const int i_offsets[3] = {5,23,0};
static const uint8_t i_offsets[3] = {5,23,0};
int i_mb_i_offset = i_offsets[h->sh.i_type];
int i;
......
......@@ -27,7 +27,7 @@
#include "common/macroblock.h"
extern const int x264_lambda2_tab[52];
extern const int x264_lambda_tab[52];
extern const uint8_t x264_lambda_tab[52];
void x264_rdo_init( void );
......
......@@ -32,7 +32,7 @@
* and refine_* are run only on the winner.
* the subme=8,9 values are much higher because any amount of satd search makes
* up its time by reducing the number of qpel-rd iterations. */
static const int subpel_iterations[][4] =
static const uint8_t subpel_iterations[][4] =
{{0,0,0,0},
{1,1,0,0},
{0,1,1,0},
......@@ -46,7 +46,7 @@ static const int subpel_iterations[][4] =
{0,0,4,10}};
/* (x-1)%6 */
static const int mod6m1[8] = {5,0,1,2,3,4,5,0};
static const uint8_t mod6m1[8] = {5,0,1,2,3,4,5,0};
/* radius 2 hexagon. repeated entries are to avoid having to compute mod6 every time. */
static const int hex2[8][2] = {{-1,-2}, {-2,0}, {-1,2}, {1,2}, {2,0}, {1,-2}, {-1,-2}, {-2,0}};
static const int square1[9][2] = {{0,0}, {0,-1}, {0,1}, {-1,0}, {1,0}, {-1,-1}, {-1,1}, {1,-1}, {1,1}};
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment