Commit f9307df8 authored by Fiona Glaser

Faster macroblock_cache_rect

Explicit loop unrolling
parent 69dc9f4d
......@@ -351,35 +351,44 @@ static ALWAYS_INLINE uint32_t pack16to32_mask( int a, int b )
}
/* Fill a rectangle of one-byte entries with `val`.
 *
 * dst    - first entry of the rectangle; consecutive rows are 8 bytes apart.
 * width  - entries per row: 4 uses one 32-bit store per row, any other value
 *          is treated as 2 (one 16-bit store per row).
 * height - rows to fill. The stores are unrolled for heights 1, 2 and 4 only;
 *          a height of 3 would fill just the first two rows.
 * val    - byte replicated into every entry.
 *
 * NOTE(review): the casts assume dst is suitably aligned for 32-/16-bit
 * stores and that no caller passes width 1 - confirm against the callers. */
static ALWAYS_INLINE void x264_macroblock_cache_rect1( void *dst, int width, int height, uint8_t val )
{
    if( width == 4 )
    {
        uint32_t *row = (uint32_t*)dst;
        /* Unsigned multiplier: `val` promotes to int, and any val >= 0x80
         * times 0x01010101 would overflow a signed 32-bit int. */
        const uint32_t val4 = val * 0x01010101u;
        row[0] = val4;
        if( height >= 2 )
            row[2] = val4;
        if( height == 4 )
        {
            row[4] = val4;
            row[6] = val4;
        }
    }
    else // 2
    {
        uint16_t *row = (uint16_t*)dst;
        /* 0xFF * 0x0101 == 0xFFFF, so the product always fits in 16 bits. */
        const uint16_t val2 = (uint16_t)( val * 0x0101 );
        row[ 0] = val2;
        if( height >= 2 )
            row[ 4] = val2;
        if( height == 4 )
        {
            row[ 8] = val2;
            row[12] = val2;
        }
    }
}
/* Fill a rectangle of 32-bit entries with `val`.
 *
 * dst    - first entry of the rectangle; consecutive rows are 8 entries
 *          (32 bytes) apart.
 * width  - entries per row. The stores are unrolled for widths 1, 2 and 4
 *          only; a width of 3 would fill just the first two entries.
 * height - rows to fill.
 * val    - 32-bit value written to every entry.
 *
 * When WORD_SIZE >= 8 and the row is wider than one entry, entries are
 * written in pairs with 64-bit stores.
 *
 * NOTE(review): WORD_SIZE is defined outside this view (presumably the
 * machine word size in bytes), and the 64-bit path assumes dst is 8-byte
 * aligned - confirm against the callers. */
static ALWAYS_INLINE void x264_macroblock_cache_rect4( void *dst, int width, int height, uint32_t val )
{
    int dy;
    if( width == 1 || WORD_SIZE < 8 )
    {
        uint32_t *row = (uint32_t*)dst;
        for( dy = 0; dy < height; dy++, row += 8 )
        {
            row[0] = val;
            if( width >= 2 )
                row[1] = val;
            if( width == 4 )
            {
                row[2] = val;
                row[3] = val;
            }
        }
    }
    else
    {
        uint64_t *row = (uint64_t*)dst;
        /* Same 32-bit value in both halves, so one store covers two entries. */
        const uint64_t val64 = val + ((uint64_t)val<<32);
        for( dy = 0; dy < height; dy++, row += 4 )
        {
            row[0] = val64;
            if( width == 4 )
                row[1] = val64;
        }
    }
}
/* Pointer-taking wrapper for x264_macroblock_cache_mv: loads the 32-bit value
 * that `mv` points at and forwards it with the other arguments unchanged.
 * `mv` is parenthesized so that a pointer expression such as `p+1` is cast and
 * dereferenced as a whole instead of expanding to `*(uint32_t*)p+1`.
 * NOTE(review): the load goes through a uint32_t*, so `mv` presumably points
 * at 4 suitably aligned bytes (e.g. a pair of 16-bit components) - confirm. */
#define x264_macroblock_cache_mv_ptr(a,x,y,w,h,l,mv) x264_macroblock_cache_mv(a,x,y,w,h,l,*(uint32_t*)(mv))
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment