Commit ec3d0955 authored by Fiona Glaser's avatar Fiona Glaser

Convert NNZ to raster order and other optimizations

Converting NNZ to raster order simplifies a lot of the load/store code and allows more use of write-combining.
More use of write-combining throughout load/save code in common/macroblock.c
GCC has aliasing issues in the case of stores to 8-bit heap-allocated arrays; dereferencing the pointer once avoids this problem and significantly increases performance.
More manual loop unrolling and such.
Move all packXtoY functions to macroblock.h so any function can use them.
Add pack8to32.
Minor optimizations to encoder/macroblock.c
parent d97bcbcb
......@@ -405,7 +405,8 @@ struct x264_t
int8_t *type; /* mb type */
int8_t *qp; /* mb qp */
int16_t *cbp; /* mb cbp: 0x0?: luma, 0x?0: chroma, 0x100: luma dc, 0x0200 and 0x0400: chroma dc (all set for PCM)*/
int8_t (*intra4x4_pred_mode)[7]; /* intra4x4 pred mode. for non I4x4 set to I_PRED_4x4_DC(2) */
int8_t (*intra4x4_pred_mode)[8]; /* intra4x4 pred mode. for non I4x4 set to I_PRED_4x4_DC(2) */
/* actually has only 7 entries; set to 8 for write-combining optimizations */
uint8_t (*non_zero_count)[16+4+4]; /* nzc. for I_PCM set to 16 */
int8_t *chroma_pred_mode; /* chroma_pred_mode. cabac only. for non intra I_PRED_CHROMA_DC(0) */
int16_t (*mv[2])[2]; /* mb mv. set to 0 for intra mb */
......
......@@ -306,16 +306,16 @@ void munge_cavlc_nnz_row( x264_t *h, int mb_y, uint8_t (*buf)[16] )
{
uint32_t (*src)[6] = (uint32_t(*)[6])h->mb.non_zero_count + mb_y * h->sps->i_mb_width;
int8_t *transform = h->mb.mb_transform_size + mb_y * h->sps->i_mb_width;
int x;
int x, nnz;
for( x=0; x<h->sps->i_mb_width; x++ )
{
memcpy( buf+x, src+x, 16 );
if( transform[x] )
{
if( src[x][0] ) src[x][0] = 0x01010101;
if( src[x][1] ) src[x][1] = 0x01010101;
if( src[x][2] ) src[x][2] = 0x01010101;
if( src[x][3] ) src[x][3] = 0x01010101;
nnz = src[x][0] | src[x][1];
src[x][0] = src[x][1] = ((uint16_t)nnz ? 0x0101 : 0) + (nnz>>16 ? 0x01010000 : 0);
nnz = src[x][2] | src[x][3];
src[x][2] = src[x][3] = ((uint16_t)nnz ? 0x0101 : 0) + (nnz>>16 ? 0x01010000 : 0);
}
}
}
......@@ -642,8 +642,8 @@ void x264_frame_deblock_row( x264_t *h, int mb_y )
int y = i_dir == 0 ? i : i_edge;\
int xn = (x - (i_dir == 0 ? 1 : 0 ))&0x03;\
int yn = (y - (i_dir == 0 ? 0 : 1 ))&0x03;\
if( h->mb.non_zero_count[mb_xy][block_idx_xy[x][y]] != 0 ||\
h->mb.non_zero_count[mbn_xy][block_idx_xy[xn][yn]] != 0 )\
if( h->mb.non_zero_count[mb_xy][x+y*4] != 0 ||\
h->mb.non_zero_count[mbn_xy][xn+yn*4] != 0 )\
{\
bS[i] = 2;\
}\
......
......@@ -855,7 +855,7 @@ int x264_macroblock_cache_init( x264_t *h )
CHECKED_MALLOC( h->mb.mb_transform_size, i_mb_count * sizeof(int8_t) );
/* 0 -> 3 top(4), 4 -> 6 : left(3) */
CHECKED_MALLOC( h->mb.intra4x4_pred_mode, i_mb_count * 7 * sizeof(int8_t) );
CHECKED_MALLOC( h->mb.intra4x4_pred_mode, i_mb_count * 8 * sizeof(int8_t) );
/* all coeffs */
CHECKED_MALLOC( h->mb.non_zero_count, i_mb_count * 24 * sizeof(uint8_t) );
......@@ -1045,27 +1045,18 @@ void x264_macroblock_cache_load( x264_t *h, int i_mb_x, int i_mb_y )
h->mb.i_neighbour |= MB_TOP;
/* load intra4x4 */
h->mb.cache.intra4x4_pred_mode[x264_scan8[0] - 8] = h->mb.intra4x4_pred_mode[i_top_xy][0];
h->mb.cache.intra4x4_pred_mode[x264_scan8[1] - 8] = h->mb.intra4x4_pred_mode[i_top_xy][1];
h->mb.cache.intra4x4_pred_mode[x264_scan8[4] - 8] = h->mb.intra4x4_pred_mode[i_top_xy][2];
h->mb.cache.intra4x4_pred_mode[x264_scan8[5] - 8] = h->mb.intra4x4_pred_mode[i_top_xy][3];
*(uint32_t*)&h->mb.cache.intra4x4_pred_mode[x264_scan8[0] - 8] = *(uint32_t*)&h->mb.intra4x4_pred_mode[i_top_xy][0];
/* load non_zero_count */
h->mb.cache.non_zero_count[x264_scan8[0] - 8] = h->mb.non_zero_count[i_top_xy][10];
h->mb.cache.non_zero_count[x264_scan8[1] - 8] = h->mb.non_zero_count[i_top_xy][11];
h->mb.cache.non_zero_count[x264_scan8[4] - 8] = h->mb.non_zero_count[i_top_xy][14];
h->mb.cache.non_zero_count[x264_scan8[5] - 8] = h->mb.non_zero_count[i_top_xy][15];
h->mb.cache.non_zero_count[x264_scan8[16+0] - 8] = h->mb.non_zero_count[i_top_xy][16+2];
h->mb.cache.non_zero_count[x264_scan8[16+1] - 8] = h->mb.non_zero_count[i_top_xy][16+3];
h->mb.cache.non_zero_count[x264_scan8[16+4+0] - 8] = h->mb.non_zero_count[i_top_xy][16+4+2];
h->mb.cache.non_zero_count[x264_scan8[16+4+1] - 8] = h->mb.non_zero_count[i_top_xy][16+4+3];
*(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[0] - 8] = *(uint32_t*)&h->mb.non_zero_count[i_top_xy][12];
/* shift because x264_scan8[16] is misaligned */
*(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[16] - 9] = *(uint16_t*)&h->mb.non_zero_count[i_top_xy][18] << 8;
*(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[16+4] - 9] = *(uint16_t*)&h->mb.non_zero_count[i_top_xy][22] << 8;
}
else
{
h->mb.i_mb_type_top = -1;
/* load intra4x4 */
h->mb.cache.intra4x4_pred_mode[x264_scan8[0] - 8] =
h->mb.cache.intra4x4_pred_mode[x264_scan8[1] - 8] =
......@@ -1081,7 +1072,6 @@ void x264_macroblock_cache_load( x264_t *h, int i_mb_x, int i_mb_y )
h->mb.cache.non_zero_count[x264_scan8[16+1] - 8] =
h->mb.cache.non_zero_count[x264_scan8[16+4+0] - 8] =
h->mb.cache.non_zero_count[x264_scan8[16+4+1] - 8] = 0x80;
}
if( i_mb_x > 0 && i_mb_xy > h->sh.i_first_mb )
......@@ -1099,9 +1089,9 @@ void x264_macroblock_cache_load( x264_t *h, int i_mb_x, int i_mb_y )
h->mb.cache.intra4x4_pred_mode[x264_scan8[10] - 1] = h->mb.intra4x4_pred_mode[i_left_xy][3];
/* load non_zero_count */
h->mb.cache.non_zero_count[x264_scan8[0 ] - 1] = h->mb.non_zero_count[i_left_xy][5];
h->mb.cache.non_zero_count[x264_scan8[0 ] - 1] = h->mb.non_zero_count[i_left_xy][3];
h->mb.cache.non_zero_count[x264_scan8[2 ] - 1] = h->mb.non_zero_count[i_left_xy][7];
h->mb.cache.non_zero_count[x264_scan8[8 ] - 1] = h->mb.non_zero_count[i_left_xy][13];
h->mb.cache.non_zero_count[x264_scan8[8 ] - 1] = h->mb.non_zero_count[i_left_xy][11];
h->mb.cache.non_zero_count[x264_scan8[10] - 1] = h->mb.non_zero_count[i_left_xy][15];
h->mb.cache.non_zero_count[x264_scan8[16+0] - 1] = h->mb.non_zero_count[i_left_xy][16+1];
......@@ -1329,13 +1319,15 @@ void x264_macroblock_cache_load( x264_t *h, int i_mb_x, int i_mb_y )
memset( h->mb.cache.skip, 0, X264_SCAN8_SIZE * sizeof( int8_t ) );
if( i_left_type >= 0 )
{
h->mb.cache.skip[x264_scan8[0] - 1] = h->mb.skipbp[i_left_xy] & 0x2;
h->mb.cache.skip[x264_scan8[8] - 1] = h->mb.skipbp[i_left_xy] & 0x8;
uint8_t skipbp = h->mb.skipbp[i_left_xy];
h->mb.cache.skip[x264_scan8[0] - 1] = skipbp & 0x2;
h->mb.cache.skip[x264_scan8[8] - 1] = skipbp & 0x8;
}
if( i_top_type >= 0 )
{
h->mb.cache.skip[x264_scan8[0] - 8] = h->mb.skipbp[i_top_xy] & 0x4;
h->mb.cache.skip[x264_scan8[4] - 8] = h->mb.skipbp[i_top_xy] & 0x8;
uint8_t skipbp = h->mb.skipbp[i_top_xy];
h->mb.cache.skip[x264_scan8[0] - 8] = skipbp & 0x4;
h->mb.cache.skip[x264_scan8[4] - 8] = skipbp & 0x8;
}
}
......@@ -1367,6 +1359,19 @@ void x264_macroblock_cache_load( x264_t *h, int i_mb_x, int i_mb_y )
h->mb.i_neighbour4[14] = MB_LEFT|MB_TOP|MB_TOPLEFT|MB_TOPRIGHT;
}
/* Copy the reconstructed macroblock for plane i back from the per-MB
 * scratch buffer (p_fdec, fixed FDEC_STRIDE) into the full decoded
 * frame plane. i==0 is the 16x16 luma plane; i==1/2 are the 8x8
 * chroma planes (callers invoke this with i = 0, 1, 2). */
static void ALWAYS_INLINE x264_macroblock_store_pic( x264_t *h, int i)
{
int w = i ? 8 : 16; /* block width/height: 16 for luma, 8 for chroma */
int i_stride = h->fdec->i_stride[i];
/* interlaced coding: rows of one field are 2*i_stride apart */
int i_stride2 = i_stride << h->mb.b_interlaced;
/* interlaced: MB pairs share the same frame rows; the even MB of the
 * pair starts at row (i_mb_y&~1) and the odd MB is offset by one line */
int i_pix_offset = h->mb.b_interlaced
? w * (h->mb.i_mb_x + (h->mb.i_mb_y&~1) * i_stride) + (h->mb.i_mb_y&1) * i_stride
: w * (h->mb.i_mb_x + h->mb.i_mb_y * i_stride);
h->mc.copy[i?PIXEL_8x8:PIXEL_16x16](
&h->fdec->plane[i][i_pix_offset], i_stride2,
h->mb.pic.p_fdec[i], FDEC_STRIDE, w );
}
void x264_macroblock_cache_save( x264_t *h )
{
const int i_mb_xy = h->mb.i_mb_xy;
......@@ -1376,20 +1381,16 @@ void x264_macroblock_cache_save( x264_t *h )
const int i_mb_4x4 = h->mb.i_b4_xy;
const int i_mb_8x8 = h->mb.i_b8_xy;
int i;
/* GCC pessimizes direct stores to heap-allocated 8-bit arrays due to aliasing.*/
/* By only dereferencing them once, we avoid this issue. */
int8_t *intra4x4_pred_mode = h->mb.intra4x4_pred_mode[i_mb_xy];
uint8_t *non_zero_count = h->mb.non_zero_count[i_mb_xy];
for( i = 0; i < 3; i++ )
{
int w = i ? 8 : 16;
int i_stride = h->fdec->i_stride[i];
int i_stride2 = i_stride << h->mb.b_interlaced;
int i_pix_offset = h->mb.b_interlaced
? w * (h->mb.i_mb_x + (h->mb.i_mb_y&~1) * i_stride) + (h->mb.i_mb_y&1) * i_stride
: w * (h->mb.i_mb_x + h->mb.i_mb_y * i_stride);
h->mc.copy[i?PIXEL_8x8:PIXEL_16x16](
&h->fdec->plane[i][i_pix_offset], i_stride2,
h->mb.pic.p_fdec[i], FDEC_STRIDE, w );
}
int i, y;
x264_macroblock_store_pic( h, 0 );
x264_macroblock_store_pic( h, 1 );
x264_macroblock_store_pic( h, 2 );
x264_prefetch_fenc( h, h->fdec, h->mb.i_mb_x, h->mb.i_mb_y );
......@@ -1406,40 +1407,28 @@ void x264_macroblock_cache_save( x264_t *h )
/* save intra4x4 */
if( i_mb_type == I_4x4 )
{
h->mb.intra4x4_pred_mode[i_mb_xy][0] = h->mb.cache.intra4x4_pred_mode[x264_scan8[10] ];
h->mb.intra4x4_pred_mode[i_mb_xy][1] = h->mb.cache.intra4x4_pred_mode[x264_scan8[11] ];
h->mb.intra4x4_pred_mode[i_mb_xy][2] = h->mb.cache.intra4x4_pred_mode[x264_scan8[14] ];
h->mb.intra4x4_pred_mode[i_mb_xy][3] = h->mb.cache.intra4x4_pred_mode[x264_scan8[15] ];
h->mb.intra4x4_pred_mode[i_mb_xy][4] = h->mb.cache.intra4x4_pred_mode[x264_scan8[5] ];
h->mb.intra4x4_pred_mode[i_mb_xy][5] = h->mb.cache.intra4x4_pred_mode[x264_scan8[7] ];
h->mb.intra4x4_pred_mode[i_mb_xy][6] = h->mb.cache.intra4x4_pred_mode[x264_scan8[13] ];
*(uint32_t*)&intra4x4_pred_mode[0] = *(uint32_t*)&h->mb.cache.intra4x4_pred_mode[x264_scan8[10] ];
*(uint32_t*)&intra4x4_pred_mode[4] = pack8to32(h->mb.cache.intra4x4_pred_mode[x264_scan8[5] ],
h->mb.cache.intra4x4_pred_mode[x264_scan8[7] ],
h->mb.cache.intra4x4_pred_mode[x264_scan8[13] ], 0);
}
else
{
h->mb.intra4x4_pred_mode[i_mb_xy][0] =
h->mb.intra4x4_pred_mode[i_mb_xy][1] =
h->mb.intra4x4_pred_mode[i_mb_xy][2] =
h->mb.intra4x4_pred_mode[i_mb_xy][3] =
h->mb.intra4x4_pred_mode[i_mb_xy][4] =
h->mb.intra4x4_pred_mode[i_mb_xy][5] =
h->mb.intra4x4_pred_mode[i_mb_xy][6] = I_PRED_4x4_DC;
}
*(uint64_t*)intra4x4_pred_mode = I_PRED_4x4_DC * 0x0101010101010101ULL;
if( i_mb_type == I_PCM )
{
h->mb.cbp[i_mb_xy] = 0x72f; /* all set */
for( i = 0; i < 16 + 2*4; i++ )
{
h->mb.non_zero_count[i_mb_xy][i] = 16;
}
non_zero_count[i] = 16;
}
else
{
/* save non zero count */
for( i = 0; i < 16 + 2*4; i++ )
{
h->mb.non_zero_count[i_mb_xy][i] = h->mb.cache.non_zero_count[x264_scan8[i]];
}
for( y = 0; y < 4; y++ )
*(uint32_t*)&non_zero_count[y*4] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[0]+y*8];
for( y = 0; y < 4; y++ )
*(uint16_t*)&non_zero_count[16+y*2] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[16+y*2]-1] >> 8;
}
if( h->mb.i_cbp_luma == 0 && h->mb.i_type != I_8x8 )
......@@ -1448,20 +1437,25 @@ void x264_macroblock_cache_save( x264_t *h )
if( !IS_INTRA( i_mb_type ) )
{
int i_list;
for( i_list = 0; i_list < (h->sh.i_type == SLICE_TYPE_B ? 2 : 1 ); i_list++ )
h->mb.ref[0][i_mb_8x8+0+0*s8x8] = h->mb.cache.ref[0][x264_scan8[0]];
h->mb.ref[0][i_mb_8x8+1+0*s8x8] = h->mb.cache.ref[0][x264_scan8[4]];
h->mb.ref[0][i_mb_8x8+0+1*s8x8] = h->mb.cache.ref[0][x264_scan8[8]];
h->mb.ref[0][i_mb_8x8+1+1*s8x8] = h->mb.cache.ref[0][x264_scan8[12]];
for( y = 0; y < 4; y++ )
{
int y;
h->mb.ref[i_list][i_mb_8x8+0+0*s8x8] = h->mb.cache.ref[i_list][x264_scan8[0]];
h->mb.ref[i_list][i_mb_8x8+1+0*s8x8] = h->mb.cache.ref[i_list][x264_scan8[4]];
h->mb.ref[i_list][i_mb_8x8+0+1*s8x8] = h->mb.cache.ref[i_list][x264_scan8[8]];
h->mb.ref[i_list][i_mb_8x8+1+1*s8x8] = h->mb.cache.ref[i_list][x264_scan8[12]];
*(uint64_t*)h->mb.mv[0][i_mb_4x4+y*s4x4+0] = *(uint64_t*)h->mb.cache.mv[0][x264_scan8[0]+8*y+0];
*(uint64_t*)h->mb.mv[0][i_mb_4x4+y*s4x4+2] = *(uint64_t*)h->mb.cache.mv[0][x264_scan8[0]+8*y+2];
}
if(h->sh.i_type == SLICE_TYPE_B)
{
h->mb.ref[1][i_mb_8x8+0+0*s8x8] = h->mb.cache.ref[1][x264_scan8[0]];
h->mb.ref[1][i_mb_8x8+1+0*s8x8] = h->mb.cache.ref[1][x264_scan8[4]];
h->mb.ref[1][i_mb_8x8+0+1*s8x8] = h->mb.cache.ref[1][x264_scan8[8]];
h->mb.ref[1][i_mb_8x8+1+1*s8x8] = h->mb.cache.ref[1][x264_scan8[12]];
for( y = 0; y < 4; y++ )
{
*(uint64_t*)h->mb.mv[i_list][i_mb_4x4+y*s4x4+0] = *(uint64_t*)h->mb.cache.mv[i_list][x264_scan8[0]+8*y+0];
*(uint64_t*)h->mb.mv[i_list][i_mb_4x4+y*s4x4+2] = *(uint64_t*)h->mb.cache.mv[i_list][x264_scan8[0]+8*y+2];
*(uint64_t*)h->mb.mv[1][i_mb_4x4+y*s4x4+0] = *(uint64_t*)h->mb.cache.mv[1][x264_scan8[0]+8*y+0];
*(uint64_t*)h->mb.mv[1][i_mb_4x4+y*s4x4+2] = *(uint64_t*)h->mb.cache.mv[1][x264_scan8[0]+8*y+2];
}
}
}
......@@ -1470,11 +1464,8 @@ void x264_macroblock_cache_save( x264_t *h )
int i_list;
for( i_list = 0; i_list < (h->sh.i_type == SLICE_TYPE_B ? 2 : 1 ); i_list++ )
{
int y;
*(uint16_t*)&h->mb.ref[i_list][i_mb_8x8+0*s8x8] = (uint8_t)(-1) * 0x0101;
*(uint16_t*)&h->mb.ref[i_list][i_mb_8x8+1*s8x8] = (uint8_t)(-1) * 0x0101;
for( y = 0; y < 4; y++ )
{
*(uint64_t*)h->mb.mv[i_list][i_mb_4x4+y*s4x4+0] = 0;
......@@ -1492,32 +1483,33 @@ void x264_macroblock_cache_save( x264_t *h )
if( !IS_INTRA( i_mb_type ) && !IS_SKIP( i_mb_type ) && !IS_DIRECT( i_mb_type ) )
{
int i_list;
for( i_list = 0; i_list < 2; i_list++ )
for( y = 0; y < 4; y++ )
{
const int s4x4 = 4 * h->mb.i_mb_stride;
int y;
*(uint64_t*)h->mb.mvd[0][i_mb_4x4+y*s4x4+0] = *(uint64_t*)h->mb.cache.mvd[0][x264_scan8[0]+8*y+0];
*(uint64_t*)h->mb.mvd[0][i_mb_4x4+y*s4x4+2] = *(uint64_t*)h->mb.cache.mvd[0][x264_scan8[0]+8*y+2];
}
if( h->sh.i_type == SLICE_TYPE_B )
for( y = 0; y < 4; y++ )
{
*(uint64_t*)h->mb.mvd[i_list][i_mb_4x4+y*s4x4+0] = *(uint64_t*)h->mb.cache.mvd[i_list][x264_scan8[0]+8*y+0];
*(uint64_t*)h->mb.mvd[i_list][i_mb_4x4+y*s4x4+2] = *(uint64_t*)h->mb.cache.mvd[i_list][x264_scan8[0]+8*y+2];
*(uint64_t*)h->mb.mvd[1][i_mb_4x4+y*s4x4+0] = *(uint64_t*)h->mb.cache.mvd[1][x264_scan8[0]+8*y+0];
*(uint64_t*)h->mb.mvd[1][i_mb_4x4+y*s4x4+2] = *(uint64_t*)h->mb.cache.mvd[1][x264_scan8[0]+8*y+2];
}
}
}
else
{
int i_list;
for( i_list = 0; i_list < 2; i_list++ )
for( y = 0; y < 4; y++ )
{
const int s4x4 = 4 * h->mb.i_mb_stride;
int y;
*(uint64_t*)h->mb.mvd[0][i_mb_4x4+y*s4x4+0] = 0;
*(uint64_t*)h->mb.mvd[0][i_mb_4x4+y*s4x4+2] = 0;
}
if( h->sh.i_type == SLICE_TYPE_B )
for( y = 0; y < 4; y++ )
{
*(uint64_t*)h->mb.mvd[i_list][i_mb_4x4+y*s4x4+0] = 0;
*(uint64_t*)h->mb.mvd[i_list][i_mb_4x4+y*s4x4+2] = 0;
*(uint64_t*)h->mb.mvd[1][i_mb_4x4+y*s4x4+0] = 0;
*(uint64_t*)h->mb.mvd[1][i_mb_4x4+y*s4x4+2] = 0;
}
}
}
if( h->sh.i_type == SLICE_TYPE_B )
{
if( i_mb_type == B_SKIP || i_mb_type == B_DIRECT )
......
......@@ -293,6 +293,30 @@ int x264_mb_transform_8x8_allowed( x264_t *h );
void x264_mb_mc( x264_t *h );
void x264_mb_mc_8x8( x264_t *h, int i8 );
/* Pack two 16-bit values into a 32-bit word such that a single 32-bit
 * store writes 'a' to the lower memory address on either endianness. */
static ALWAYS_INLINE uint32_t pack16to32( int a, int b )
{
#ifdef WORDS_BIGENDIAN
    return ((uint32_t)a<<16) + b;
#else
    return ((uint32_t)b<<16) + a;
#endif
}
/* Pack two 8-bit values into a 16-bit quantity such that a 16-bit
 * store writes 'a' to the lower memory address on either endianness. */
static ALWAYS_INLINE uint32_t pack8to16( int a, int b )
{
#ifdef WORDS_BIGENDIAN
    return ((uint32_t)a<<8) + b;
#else
    return ((uint32_t)b<<8) + a;
#endif
}
/* Pack four 8-bit values into a 32-bit word such that a single 32-bit
 * store writes them to memory in the order a, b, c, d regardless of
 * host endianness. */
static ALWAYS_INLINE uint32_t pack8to32( int a, int b, int c, int d )
{
#ifdef WORDS_BIGENDIAN
    return ((uint32_t)a<<24) + ((uint32_t)b<<16) + ((uint32_t)c<<8) + d;
#else
    return ((uint32_t)d<<24) + ((uint32_t)c<<16) + ((uint32_t)b<<8) + a;
#endif
}
static ALWAYS_INLINE uint32_t pack16to32_mask( int a, int b )
{
#ifdef WORDS_BIGENDIAN
......
......@@ -37,24 +37,6 @@
# include "ppc/predict.h"
#endif
/* Pack two 16-bit values into a 32-bit word so that a 32-bit store
 * writes 'a' at the lower address on either endianness. */
static ALWAYS_INLINE uint32_t pack16to32( int a, int b )
{
#ifdef WORDS_BIGENDIAN
    return ((uint32_t)a<<16) + b;
#else
    return ((uint32_t)b<<16) + a;
#endif
}
/* Pack two 8-bit values into a 16-bit quantity so that a 16-bit store
 * writes 'a' at the lower address on either endianness. */
static ALWAYS_INLINE uint32_t pack8to16( int a, int b )
{
#ifdef WORDS_BIGENDIAN
    return ((uint32_t)a<<8) + b;
#else
    return ((uint32_t)b<<8) + a;
#endif
}
/****************************************************************************
* 16x16 prediction for intra luma block
****************************************************************************/
......
......@@ -549,23 +549,26 @@ void x264_macroblock_encode( x264_t *h )
for( i = 0; i < 4; i++)
{
if(!nnz8x8[i])
for( j = 0; j < 4; j++ )
h->mb.cache.non_zero_count[x264_scan8[j+i*4]] = 0;
{
*(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[0+i*4]] = 0;
*(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[2+i*4]] = 0;
}
else if( h->mb.b_transform_8x8 )
{
int nz = nnz8x8[i];
for( j = 0; j < 4; j++ )
h->mb.cache.non_zero_count[x264_scan8[j+4*i]] = nz;
h->mb.i_cbp_luma |= nz << i;
*(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[0+4*i]] = nnz8x8[i] * 0x0101;
*(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[2+4*i]] = nnz8x8[i] * 0x0101;
h->mb.i_cbp_luma |= nnz8x8[i] << i;
}
else
{
int nz, cbp = 0;
for( j = 0; j < 4; j++ )
{
int nz = array_non_zero( h->dct.luma4x4[j+i*4] );
h->mb.cache.non_zero_count[x264_scan8[j+i*4]] = nz;
h->mb.i_cbp_luma |= nz << i;
nz = array_non_zero( h->dct.luma4x4[j+4*i] );
h->mb.cache.non_zero_count[x264_scan8[j+4*i]] = nz;
cbp |= nz;
}
h->mb.i_cbp_luma |= cbp << i;
}
}
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment