Commit c9c7edf3 authored by Fiona Glaser

Various optimizations and cosmetics

Update AUTHORS file with Gabriel and me
Update XCHG macro to work correctly in if statements
Add new lookup tables for block_idx and fdec/fenc addresses
Slightly faster array_non_zero_count_mmx (patch by holger)
Eliminate branch in analyse_intra
Unroll loops in and clean up chroma encode
Convert some for loops to do/while loops for speed improvement
Do explicit write-combining on --me tesa mvsad_t struct
Shrink --me esa zero[] array
Speed up bime by reducing size of visited[][][] array
parent 65324952
......@@ -39,11 +39,21 @@ S: France
N: Francesco Corriga
D: VfW
N: Gabriel Bouvigne
E: gabriel.bouvigne AT joost DOT com
D: 2pass VBV
N: Guillaume Poirier
E: gpoirier CHEZ mplayerhq POINT hu
D: Altivec optimizations
S: Brittany, France
N: Fiona Glaser
E: fiona AT x264 DOT com
D: x86 asm, 1pass VBV, adaptive quantization, inline asm
D: various speed optimizations, bugfixes
S: USA
N: Justin Clay
E: justin_clay AT hotmail DOT com
C: wheatgerm
......
......@@ -33,7 +33,7 @@
#define X264_MAX3(a,b,c) X264_MAX((a),X264_MAX((b),(c)))
#define X264_MIN4(a,b,c,d) X264_MIN((a),X264_MIN3((b),(c),(d)))
#define X264_MAX4(a,b,c,d) X264_MAX((a),X264_MAX3((b),(c),(d)))
#define XCHG(type,a,b) { type t = a; a = b; b = t; }
#define XCHG(type,a,b) do{ type t = a; a = b; b = t; } while(0)
#define FIX8(f) ((int)(f*(1<<8)+.5))
#define CHECKED_MALLOC( var, size )\
......
......@@ -219,6 +219,32 @@ static const uint8_t block_idx_xy[4][4] =
{ 4, 6, 12, 14 },
{ 5, 7, 13, 15 }
};
/* Flattened (x + 4*y) position of each of the 16 4x4 luma blocks, indexed by
 * block number.  Entry i equals block_idx_x[i] + 4*block_idx_y[i], so
 * arr[0][block_idx_xy_1d[i]] addresses the same element as
 * arr[block_idx_y[i]][block_idx_x[i]] when arr has rows of 4 elements. */
static const uint8_t block_idx_xy_1d[16] =
{
    0 + 4*0, 1 + 4*0,
    0 + 4*1, 1 + 4*1,
    2 + 4*0, 3 + 4*0,
    2 + 4*1, 3 + 4*1,
    0 + 4*2, 1 + 4*2,
    0 + 4*3, 1 + 4*3,
    2 + 4*2, 3 + 4*2,
    2 + 4*3, 3 + 4*3
};
/* Offset of the top-left pixel of each 4x4 luma block, indexed by block
 * number, for a plane whose rows are FENC_STRIDE bytes apart: the value is
 * 4*x + 4*y*FENC_STRIDE for the block's (x,y) position in 4x4-block units.
 * The largest entry is 12 + 12*FENC_STRIDE, so the uint8_t element type is
 * only wide enough while FENC_STRIDE <= 20.
 * NOTE(review): FENC_STRIDE is defined outside this view -- confirm. */
#define FENC_BLK_OFS( x, y ) ( (x)*4 + (y)*4*FENC_STRIDE )
static const uint8_t block_idx_xy_fenc[16] =
{
    FENC_BLK_OFS( 0, 0 ), FENC_BLK_OFS( 1, 0 ),
    FENC_BLK_OFS( 0, 1 ), FENC_BLK_OFS( 1, 1 ),
    FENC_BLK_OFS( 2, 0 ), FENC_BLK_OFS( 3, 0 ),
    FENC_BLK_OFS( 2, 1 ), FENC_BLK_OFS( 3, 1 ),
    FENC_BLK_OFS( 0, 2 ), FENC_BLK_OFS( 1, 2 ),
    FENC_BLK_OFS( 0, 3 ), FENC_BLK_OFS( 1, 3 ),
    FENC_BLK_OFS( 2, 2 ), FENC_BLK_OFS( 3, 2 ),
    FENC_BLK_OFS( 2, 3 ), FENC_BLK_OFS( 3, 3 )
};
#undef FENC_BLK_OFS
/* Offset of the top-left pixel of each 4x4 luma block, indexed by block
 * number, for a plane whose rows are FDEC_STRIDE bytes apart: the value is
 * 4*x + 4*y*FDEC_STRIDE for the block's (x,y) position in 4x4-block units.
 * Stored as uint16_t, unlike the uint8_t FENC table; the largest entry is
 * 12 + 12*FDEC_STRIDE. */
#define FDEC_BLK_OFS( x, y ) ( (x)*4 + (y)*4*FDEC_STRIDE )
static const uint16_t block_idx_xy_fdec[16] =
{
    FDEC_BLK_OFS( 0, 0 ), FDEC_BLK_OFS( 1, 0 ),
    FDEC_BLK_OFS( 0, 1 ), FDEC_BLK_OFS( 1, 1 ),
    FDEC_BLK_OFS( 2, 0 ), FDEC_BLK_OFS( 3, 0 ),
    FDEC_BLK_OFS( 2, 1 ), FDEC_BLK_OFS( 3, 1 ),
    FDEC_BLK_OFS( 0, 2 ), FDEC_BLK_OFS( 1, 2 ),
    FDEC_BLK_OFS( 0, 3 ), FDEC_BLK_OFS( 1, 3 ),
    FDEC_BLK_OFS( 2, 2 ), FDEC_BLK_OFS( 3, 2 ),
    FDEC_BLK_OFS( 2, 3 ), FDEC_BLK_OFS( 3, 3 )
};
#undef FDEC_BLK_OFS
static const uint8_t i_chroma_qp_table[52] =
{
......
......@@ -77,24 +77,22 @@ static inline int x264_predictor_difference_mmxext( int16_t (*mvc)[2], intptr_t
#define array_non_zero_count array_non_zero_count_mmx
/* Count the non-zero coefficients among the 16 int16_t values at v with MMX.
 *
 * NOTE(review): this listing is a rendered diff, so the lines removed and the
 * lines added by the commit appear interleaved below (e.g. both input
 * constraint lists and both return statements); only one variant exists in
 * the real file.
 *
 * Both variants pack the 16 words to signed-saturated bytes (a non-zero word
 * stays non-zero after saturation), compare the bytes against zero so every
 * zero coefficient contributes -1, add the two registers, and sum the bytes
 * with psadbw.  The variant using pw_2 biases each byte lane by 2 before the
 * sum, which yields 16 minus the number of zeros directly.  The variant
 * without pw_2 sums the unbiased bytes and corrects afterwards with
 * (count+0x10)&0xff, giving the same result without the memory operand. */
static inline int array_non_zero_count_mmx( int16_t *v )
{
static const uint64_t pw_2 = 0x0202020202020202ULL;
int count;
asm(
"pxor %%mm7, %%mm7 \n"
"movq (%1), %%mm0 \n"
"movq 16(%1), %%mm1 \n"
"packsswb 8(%1), %%mm0 \n"
"movq 8(%1), %%mm1 \n"
"packsswb 16(%1), %%mm0 \n"
"packsswb 24(%1), %%mm1 \n"
"pcmpeqb %%mm7, %%mm0 \n"
"pcmpeqb %%mm7, %%mm1 \n"
"paddb %%mm0, %%mm1 \n"
"paddb %2, %%mm1 \n"
"psadbw %%mm7, %%mm1 \n"
"movd %%mm1, %0 \n"
:"=r"(count)
:"r"(v), "m"(pw_2)
:"r"(v)
);
return count;
return (count+0x10)&0xff;
}
#undef array_non_zero_int
#define array_non_zero_int array_non_zero_int_mmx
......
......@@ -606,10 +606,9 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_
if( b_merged_satd && i_max == 9 )
{
int satd[3];
int satd[9];
h->pixf.intra_sa8d_x3_8x8( p_src_by, edge, satd );
if( i_pred_mode < 3 )
satd[i_pred_mode] -= 3 * a->i_lambda;
satd[i_pred_mode] -= 3 * a->i_lambda;
for( i=2; i>=0; i-- )
{
int cost = a->i_satd_i8x8_dir[i][idx] = satd[i] + 4 * a->i_lambda;
......@@ -679,10 +678,8 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_
for( idx = 0;; idx++ )
{
int x = block_idx_x[idx];
int y = block_idx_y[idx];
uint8_t *p_src_by = p_src + 4*x + 4*y*FENC_STRIDE;
uint8_t *p_dst_by = p_dst + 4*x + 4*y*FDEC_STRIDE;
uint8_t *p_src_by = p_src + block_idx_xy_fenc[idx];
uint8_t *p_dst_by = p_dst + block_idx_xy_fdec[idx];
int i_best = COST_MAX;
int i_pred_mode = x264_mb_predict_intra4x4_mode( h, idx );
......@@ -694,10 +691,9 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_
if( b_merged_satd && i_max >= 6 )
{
int satd[3];
int satd[9];
h->pixf.intra_satd_x3_4x4( p_src_by, p_dst_by, satd );
if( i_pred_mode < 3 )
satd[i_pred_mode] -= 3 * a->i_lambda;
satd[i_pred_mode] -= 3 * a->i_lambda;
for( i=2; i>=0; i-- )
COPY2_IF_LT( i_best, satd[i] + 4 * a->i_lambda,
a->i_predict4x4[idx], i );
......@@ -808,16 +804,11 @@ static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
int i_nnz = 0;
for( idx = 0; idx < 16; idx++ )
{
uint8_t *p_src_by;
uint8_t *p_dst_by;
uint8_t *p_dst_by = p_dst + block_idx_xy_fdec[idx];
i_best = COST_MAX;
i_pred_mode = x264_mb_predict_intra4x4_mode( h, idx );
x = block_idx_x[idx];
y = block_idx_y[idx];
p_src_by = p_src + 4*x + 4*y*FENC_STRIDE;
p_dst_by = p_dst + 4*x + 4*y*FDEC_STRIDE;
predict_4x4_mode_available( h->mb.i_neighbour4[idx], predict_mode, &i_max );
if( (h->mb.i_neighbour4[idx] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
......
......@@ -19,13 +19,12 @@
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*****************************************************************************/
#include "common/common.h"
#include "macroblock.h"
#define ZIG(i,y,x) level[i] = dct[x][y];
static inline void zigzag_scan_2x2_dc( int16_t level[4], int16_t dct[2][2] )
{
......@@ -82,10 +81,8 @@ static int x264_mb_decimate_score( int16_t *dct, int i_max )
void x264_mb_encode_i4x4( x264_t *h, int idx, int i_qscale )
{
int x = 4 * block_idx_x[idx];
int y = 4 * block_idx_y[idx];
uint8_t *p_src = &h->mb.pic.p_fenc[0][x+y*FENC_STRIDE];
uint8_t *p_dst = &h->mb.pic.p_fdec[0][x+y*FDEC_STRIDE];
uint8_t *p_src = &h->mb.pic.p_fenc[0][block_idx_xy_fenc[idx]];
uint8_t *p_dst = &h->mb.pic.p_fdec[0][block_idx_xy_fdec[idx]];
DECLARE_ALIGNED_16( int16_t dct4x4[4][4] );
if( h->mb.b_lossless )
......@@ -147,10 +144,10 @@ static void x264_mb_encode_i16x16( x264_t *h, int i_qscale )
{
for( i = 0; i < 16; i++ )
{
int oe = block_idx_x[i]*4 + block_idx_y[i]*4*FENC_STRIDE;
int od = block_idx_x[i]*4 + block_idx_y[i]*4*FDEC_STRIDE;
int oe = block_idx_xy_fenc[i];
int od = block_idx_xy_fdec[i];
h->zigzagf.sub_4x4( h->dct.luma4x4[i], p_src+oe, p_dst+od );
dct_dc4x4[block_idx_x[i]][block_idx_y[i]] = h->dct.luma4x4[i][0];
dct_dc4x4[0][block_idx_xy_1d[i]] = h->dct.luma4x4[i][0];
h->dct.luma4x4[i][0] = 0;
}
h->zigzagf.scan_4x4( h->dct.luma16x16_dc, dct_dc4x4 );
......@@ -161,7 +158,7 @@ static void x264_mb_encode_i16x16( x264_t *h, int i_qscale )
for( i = 0; i < 16; i++ )
{
/* copy dc coeff */
dct_dc4x4[block_idx_y[i]][block_idx_x[i]] = dct4x4[i][0][0];
dct_dc4x4[0][block_idx_xy_1d[i]] = dct4x4[i][0][0];
dct4x4[i][0][0] = 0;
/* quant/scan/dequant */
......@@ -186,7 +183,7 @@ static void x264_mb_encode_i16x16( x264_t *h, int i_qscale )
for( i = 0; i < 16; i++ )
{
/* copy dc coeff */
dct4x4[i][0][0] = dct_dc4x4[block_idx_y[i]][block_idx_x[i]];
dct4x4[i][0][0] = dct_dc4x4[0][block_idx_xy_1d[i]];
}
/* put pixels to fdec */
h->dctf.add16x16_idct( p_dst, dct4x4 );
......@@ -224,7 +221,7 @@ void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qscale )
for( i = 0; i < 4; i++ )
{
/* copy dc coeff */
dct2x2[block_idx_y[i]][block_idx_x[i]] = dct4x4[i][0][0];
dct2x2[i>>1][i&1] = dct4x4[i][0][0];
dct4x4[i][0][0] = 0;
/* no trellis; it doesn't seem to help chroma noticeably */
......@@ -258,9 +255,10 @@ void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qscale )
for( i = 0; i < 4; i++ )
h->quantf.dequant_4x4( dct4x4[i], h->dequant4_mf[CQM_4IC + b_inter], i_qscale );
}
for( i = 0; i < 4; i++ )
dct4x4[i][0][0] = dct2x2[0][i];
dct4x4[0][0][0] = dct2x2[0][0];
dct4x4[1][0][0] = dct2x2[0][1];
dct4x4[2][0][0] = dct2x2[1][0];
dct4x4[3][0][0] = dct2x2[1][1];
h->dctf.add8x8_idct( p_dst, dct4x4 );
}
......@@ -408,7 +406,7 @@ void x264_macroblock_encode( x264_t *h )
}
for( i = h->mb.i_skip_intra ? 15 : 0 ; i < 16; i++ )
{
uint8_t *p_dst = &h->mb.pic.p_fdec[0][4 * block_idx_x[i] + 4 * block_idx_y[i] * FDEC_STRIDE];
uint8_t *p_dst = &h->mb.pic.p_fdec[0][block_idx_xy_fdec[i]];
int i_mode = h->mb.cache.intra4x4_pred_mode[x264_scan8[i]];
if( (h->mb.i_neighbour4[i] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
......@@ -432,11 +430,9 @@ void x264_macroblock_encode( x264_t *h )
{
for( i4x4 = 0; i4x4 < 16; i4x4++ )
{
int x = 4*block_idx_x[i4x4];
int y = 4*block_idx_y[i4x4];
h->zigzagf.sub_4x4( h->dct.luma4x4[i4x4],
h->mb.pic.p_fenc[0]+x+y*FENC_STRIDE,
h->mb.pic.p_fdec[0]+x+y*FDEC_STRIDE );
h->mb.pic.p_fenc[0]+block_idx_xy_fenc[i4x4],
h->mb.pic.p_fdec[0]+block_idx_xy_fdec[i4x4] );
}
}
else if( h->mb.b_transform_8x8 )
......
......@@ -196,8 +196,7 @@ void x264_me_search_ref( x264_t *h, x264_me_t *m, int16_t (*mvc)[2], int i_mvc,
int my = x264_clip3( mvc[i][1], mv_y_min*4, mv_y_max*4 );
COST_MV_HPEL( mx, my );
}
i++;
} while( i < i_mvc );
} while( ++i < i_mvc );
bmx = ( bpred_mx + 2 ) >> 2;
bmy = ( bpred_my + 2 ) >> 2;
COST_MV( bmx, bmy );
......@@ -223,8 +222,7 @@ void x264_me_search_ref( x264_t *h, x264_me_t *m, int16_t (*mvc)[2], int i_mvc,
my = x264_clip3( my, mv_y_min, mv_y_max );
COST_MV( mx, my );
}
i++;
} while( i < i_mvc );
} while( ++i < i_mvc );
}
COST_MV( 0, 0 );
......@@ -232,14 +230,15 @@ void x264_me_search_ref( x264_t *h, x264_me_t *m, int16_t (*mvc)[2], int i_mvc,
{
case X264_ME_DIA:
/* diamond search, radius 1 */
for( i = 0; i < i_me_range; i++ )
i = 0;
do
{
DIA1_ITER( bmx, bmy );
if( (bmx == omx) & (bmy == omy) )
break;
if( !CHECK_MVRANGE(bmx, bmy) )
break;
}
} while( ++i < i_me_range );
break;
case X264_ME_HEX:
......@@ -410,7 +409,9 @@ me_hex2:
/* hexagon grid */
omx = bmx; omy = bmy;
for( i = 1; i <= i_me_range/4; i++ )
i = 1;
do
{
static const int hex4[16][2] = {
{-4, 2}, {-4, 1}, {-4, 0}, {-4,-1}, {-4,-2},
......@@ -437,7 +438,7 @@ me_hex2:
COST_MV_X4( 4*i, 1*i, 4*i, 2*i, 2*i, 3*i, 0*i, 4*i );
COST_MV_X4( -2*i, 3*i, -2*i,-3*i, 0*i,-4*i, 2*i,-3*i );
}
}
} while( ++i <= i_me_range/4 );
if( bmy <= mv_y_max )
goto me_hex2;
break;
......@@ -464,7 +465,10 @@ me_hex2:
* because sum(abs(diff)) >= abs(diff(sum)). */
const int stride = m->i_stride[0];
uint16_t *sums_base = m->integral;
DECLARE_ALIGNED_16( static uint8_t zero[16*16] );
/* due to a GCC bug on some platforms (win32?), zero[] may not actually be aligned.
* unlike the similar case in ratecontrol.c, this is not a problem because it is not used for any
* SSE instructions and the only loss is a tiny bit of performance. */
DECLARE_ALIGNED_16( static uint8_t zero[8*FENC_STRIDE] );
DECLARE_ALIGNED_16( int enc_dc[4] );
int sad_size = i_pixel <= PIXEL_8x8 ? PIXEL_8x8 : PIXEL_4x4;
int delta = x264_pixel_size[sad_size].w;
......@@ -546,7 +550,13 @@ me_hex2:
for( i=0; i<nmvsad && mvsads[i].sad <= bsad; i++ );
for( j=i; j<nmvsad; j++ )
if( mvsads[j].sad <= bsad )
mvsads[i++] = mvsads[j];
{
/* mvsad_t is not guaranteed to be 8 bytes on all archs, so check before using explicit write-combining */
if( sizeof( mvsad_t ) == sizeof( uint64_t ) )
*(uint64_t*)&mvsads[i++] = *(uint64_t*)&mvsads[j];
else
mvsads[i++] = mvsads[j];
}
nmvsad = i;
}
if( nmvsad > limit )
......@@ -558,7 +568,12 @@ me_hex2:
for( j=i+1; j<nmvsad; j++ )
COPY2_IF_LT( bsad, mvsads[j].sad, bj, j );
if( bj > i )
XCHG( mvsad_t, mvsads[i], mvsads[bj] );
{
if( sizeof( mvsad_t ) == sizeof( uint64_t ) )
XCHG( uint64_t, *(uint64_t*)&mvsads[i], *(uint64_t*)&mvsads[bj] );
else
XCHG( mvsad_t, mvsads[i], mvsads[bj] );
}
}
nmvsad = limit;
}
......@@ -781,12 +796,12 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite
BIME_CACHE(-(a),-(b))
#define COST_BIMV_SATD( m0x, m0y, m1x, m1y ) \
if( pass == 0 || !visited[(m0x)&7][(m0y)&7][(m1x)&7][(m1y)&7] ) \
if( pass == 0 || !((visited[(m0x)&7][(m0y)&7][(m1x)&7] & (1<<((m1y)&7)))) ) \
{ \
int cost; \
int i0 = 4 + 3*(m0x-om0x) + (m0y-om0y); \
int i1 = 4 + 3*(m1x-om1x) + (m1y-om1y); \
visited[(m0x)&7][(m0y)&7][(m1x)&7][(m1y)&7] = 1; \
visited[(m0x)&7][(m0y)&7][(m1x)&7] |= (1<<((m1y)&7));\
h->mc.memcpy_aligned( pix, pix0[i0], bs ); \
if( i_weight == 32 ) \
h->mc.avg[i_pixel]( pix, bw, pix1[i1], bw ); \
......@@ -837,7 +852,8 @@ int x264_me_refine_bidir( x264_t *h, x264_me_t *m0, x264_me_t *m1, int i_weight
int bm1y = m1->mv[1], om1y = bm1y;
int bcost = COST_MAX;
int pass = 0;
uint8_t visited[8][8][8][8];
/* each byte of visited represents 8 possible m1y positions, so a 4D array isn't needed */
uint8_t visited[8][8][8];
h->mc.memzero_aligned( visited, sizeof(visited) );
BIME_CACHE( 0, 0 );
......@@ -898,8 +914,7 @@ int x264_me_refine_bidir( x264_t *h, x264_me_t *m0, x264_me_t *m1, int i_weight
if( satd <= bsatd * SATD_THRESH )\
{ \
int cost; \
cache_mv[0] = cache_mv2[0] = mx; \
cache_mv[1] = cache_mv2[1] = my; \
*(uint32_t*)cache_mv = *(uint32_t*)cache_mv2 = pack16to32_mask(mx,my); \
cost = x264_rd_cost_part( h, i_lambda2, i8, m->i_pixel ); \
COPY4_IF_LT( bcost, cost, bmx, mx, bmy, my, dir, do_dir?mdir:dir ); \
} \
......@@ -937,7 +952,7 @@ void x264_me_refine_qpel_rd( x264_t *h, x264_me_t *m, int i_lambda2, int i8 )
p_cost_mvx = m->p_cost_mv - pmx;
p_cost_mvy = m->p_cost_mv - pmy;
COST_MV_SATD( bmx, bmy, bsatd );
COST_MV_RD( bmx, bmy, 0, 0, 0);
COST_MV_RD( bmx, bmy, 0, 0, 0 );
/* check the predicted mv */
if( (bmx != pmx || bmy != pmy)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment