Commit c6213d2f authored by Loren Merritt

mc_chroma width2 mmx



git-svn-id: svn://svn.videolan.org/x264/trunk@534 df754926-b1dd-0310-bc7b-ec298dee348c
parent eff6a5e2
......@@ -410,19 +410,31 @@ ALIGN 16
;-----------------------------------------------------------------------------
x264_mc_chroma_mmxext:
mov r10d, parm6d
mov r11d, parm5d
sar r10d, 3
sar r11d, 3
imul r10d, parm2d
pxor mm3, mm3
add r10d, r11d
movsxd r10, r10d
add parm1q, r10 ; src += (dx>>3) + (dy>>3) * src_stride
and parm5d, 7 ; dx &= 7
je .mc1d
and parm6d, 7 ; dy &= 7
je .mc1d
movd mm0, parm5d
movd mm1, parm6d
pxor mm3, mm3
pshufw mm5, mm0, 0 ; mm5 - dx
pshufw mm6, mm1, 0 ; mm6 - dy
pshufw mm5, mm0, 0 ; mm5 = dx
pshufw mm6, mm1, 0 ; mm6 = dy
movq mm4, [pw_8 GLOBAL]
movq mm0, mm4
psubw mm4, mm5 ; mm4 - 8-dx
psubw mm0, mm6 ; mm0 - 8-dy
psubw mm4, mm5 ; mm4 = 8-dx
psubw mm0, mm6 ; mm0 = 8-dy
movq mm7, mm5
pmullw mm5, mm0 ; mm5 = dx*(8-dy) = cB
......@@ -457,8 +469,9 @@ ALIGN 4
pmullw mm1, mm7 ; line * cD
paddw mm0, mm2
paddw mm0, mm1
psrlw mm0, 6
%macro HEIGHT_LOOP_END 1
packuswb mm0, mm3 ; 00 00 00 00 px1 px2 px3 px4
movd [r10], mm0
......@@ -466,7 +479,7 @@ ALIGN 4
add r10, parm4q ; i_dst_stride
dec r11d
jnz .height_loop
jnz %1
sub parm7d, 8
jnz .finish ; width != 8 so assume 4
......@@ -476,7 +489,45 @@ ALIGN 4
mov r11d, parm8d ; i_height
add r10, 4
add rax, 4
jmp .height_loop
jmp %1
%endmacro
HEIGHT_LOOP_END .height_loop
.finish
ret
;-----------------------------------------------------------------------------
; .mc1d -- one-dimensional interpolation path.
; Entered through the two `je .mc1d` branches above, i.e. when (dx & 7) == 0
; or (dy & 7) == 0, so at most one 1/8-pel fraction is non-zero and every
; output pixel is a blend of two source pixels:
;     out = ( p0 * (8 - frac) + p1 * frac + 4 ) >> 3
; where p1 lies pel_offset bytes after p0.
; mm3 is expected to be zero here (it is cleared before the branches).
;-----------------------------------------------------------------------------
ALIGN 4
.mc1d
; Scratch register holding the byte distance between the two blended pixels;
; a different register is selected for WIN64 than for other targets.
%ifdef WIN64
%define pel_offset rsi
%else
%define pel_offset r9
%endif
; eax = (dx | dy) & 7. parm5d was already masked with 7 before the first
; branch; parm6d may still be unmasked (the first `je .mc1d` is taken before
; dy is masked), hence the explicit `and`. With one fraction being zero this
; leaves the other one in eax.
mov eax, parm5d
or eax, parm6d
and eax, 7
cmp parm5d, 0
; `mov` leaves the flags untouched, so the cmove below still acts on the cmp.
mov pel_offset, 1
cmove pel_offset, parm2q ; pel_offset = dx ? 1 : src_stride
movd mm6, eax
movq mm5, [pw_8 GLOBAL]
pshufw mm6, mm6, 0 ; mm6 = frac replicated into all four words
; pw_4 is presumably four words of 4, i.e. the rounding term for the >>3
; below -- TODO confirm against the constant's definition.
movq mm7, [pw_4 GLOBAL]
psubw mm5, mm6 ; mm5 = pw_8 - frac (8 - frac, assuming pw_8 is four words of 8)
mov rax, parm1q ; rax = src, already advanced by the whole-pel offset
; parm3q is presumably dst (third argument of the C prototype) -- HEIGHT_LOOP_END stores through r10.
mov r10, parm3q
mov r11d, parm8d ; row counter, decremented by HEIGHT_LOOP_END (i_height)
ALIGN 4
.height_loop1
; Load four pixels from each of the two positions and widen bytes to words
; by interleaving with the zero register mm3.
movd mm0, [rax+pel_offset]
movd mm1, [rax]
punpcklbw mm0, mm3
punpcklbw mm1, mm3
pmullw mm0, mm6 ; p1 * frac
pmullw mm1, mm5 ; p0 * (8 - frac)
paddw mm0, mm7 ; add the pw_4 term before shifting
paddw mm0, mm1
psrlw mm0, 3
; The shared loop tail packs mm0 to bytes, stores it through r10 and jumps
; back to the label passed as its argument.
HEIGHT_LOOP_END .height_loop1
nop
......@@ -507,14 +507,28 @@ ALIGN 16
;-----------------------------------------------------------------------------
x264_mc_chroma_mmxext:
picpush ebx
picgetgot ebx
push edi
mov ecx, [picesp+4+24]
mov edx, [picesp+4+20]
mov eax, ecx
mov edi, edx
sar ecx, 3
sar edx, 3
imul ecx, [picesp+4+8]
add ecx, edx
add [picesp+4+4], ecx ; src += (dx>>3) + (dy>>3) * src_stride
pxor mm3, mm3
pshufw mm5, [picesp+20], 0 ; mm5 = dx
pshufw mm6, [picesp+24], 0 ; mm6 = dy
and edi, 7
and eax, 7
movd mm5, edi
movd mm6, eax
pshufw mm5, mm5, 0 ; mm5 = dx&7
pshufw mm6, mm6, 0 ; mm6 = dy&7
movq mm4, [pw_8 GOT_ebx]
movq mm0, mm4
......@@ -528,8 +542,6 @@ x264_mc_chroma_mmxext:
pmullw mm6, mm4 ; mm6 = (8-dx)*dy = cC
pmullw mm4, mm0 ; mm4 = (8-dx)*(8-dy) = cA
push edi
mov eax, [picesp+4+4] ; src
mov edi, [picesp+4+12] ; dst
mov ecx, [picesp+4+8] ; i_src_stride
......
......@@ -307,27 +307,6 @@ static void motion_compensation_chroma( uint8_t *src, int i_src_stride,
}
}
#ifdef HAVE_MMXEXT
/* Chroma motion compensation entry point for MMXEXT builds.
 *
 * Blocks that are 2 pixels wide are forwarded unchanged to the C routine
 * motion_compensation_chroma().  Every other width goes to the assembly
 * routine x264_mc_chroma_mmxext(), which is given a source pointer already
 * advanced by the whole-pel part of the vector (mv >> 3) together with the
 * remaining 1/8-pel fractions (mv & 7). */
static void motion_compensation_chroma_mmxext( uint8_t *src, int i_src_stride,
                                               uint8_t *dst, int i_dst_stride,
                                               int mvx, int mvy,
                                               int i_width, int i_height )
{
    if( i_width != 2 )
    {
        /* Fold the integer displacement into the pointer; pass only the fractions on. */
        uint8_t *p_src = src + (mvy >> 3) * i_src_stride + (mvx >> 3);

        x264_mc_chroma_mmxext( p_src, i_src_stride, dst, i_dst_stride,
                               mvx & 0x07, mvy & 0x07, i_width, i_height );
        return;
    }

    /* Width 2: use the C routine with the untouched vector. */
    motion_compensation_chroma( src, i_src_stride, dst, i_dst_stride,
                                mvx, mvy, i_width, i_height );
}
#endif
#define MC_COPY(W) \
static void mc_copy_w##W( uint8_t *dst, int i_dst, uint8_t *src, int i_src, int i_height ) \
{ \
......@@ -372,7 +351,7 @@ void x264_mc_init( int cpu, x264_mc_functions_t *pf )
#ifdef HAVE_MMXEXT
if( cpu&X264_CPU_MMXEXT ) {
x264_mc_mmxext_init( pf );
pf->mc_chroma = motion_compensation_chroma_mmxext;
pf->mc_chroma = x264_mc_chroma_mmxext;
}
#endif
#ifdef HAVE_SSE2
......
......@@ -41,6 +41,8 @@ typedef struct
int mvx, int mvy,
int i_width, int i_height );
/* mc_chroma may write up to 2 bytes of garbage to the right of dst,
* so it must be run from left to right. */
void (*mc_chroma)(uint8_t *, int, uint8_t *, int,
int mvx, int mvy,
int i_width, int i_height );
......
......@@ -238,40 +238,6 @@ uint8_t *get_ref_altivec( uint8_t *src[4], int i_src_stride,
}
}
/* C chroma motion compensation: bilinear interpolation at 1/8-pel precision.
 *
 * src / i_src_stride : reference plane and its stride
 * dst / i_dst_stride : destination block and its stride
 * mvx, mvy           : motion vector in 1/8-pel units; the upper bits
 *                      (mv >> 3) select the whole-pel position, the low
 *                      three bits (mv & 7) are the interpolation fraction
 * i_width, i_height  : size of the block to produce
 *
 * Each output pixel is the weighted sum of a 2x2 source neighbourhood.  The
 * four weights add up to 64, hence the +32 rounding term and the >>6.
 * Note that one extra source column and one extra source row are read. */
static void mc_chroma_c( uint8_t *src, int i_src_stride,
                         uint8_t *dst, int i_dst_stride,
                         int mvx, int mvy,
                         int i_width, int i_height )
{
    const int i_frac_x = mvx & 0x07;
    const int i_frac_y = mvy & 0x07;
    uint8_t *p_top;
    uint8_t *p_bot;
    int i_row, i_col;

    DECLARE_ALIGNED( uint16_t, coeff[4], 16 );

    /* Weights for top-left, top-right, bottom-left and bottom-right. */
    coeff[0] = (8-i_frac_x)*(8-i_frac_y);
    coeff[1] = i_frac_x    *(8-i_frac_y);
    coeff[2] = (8-i_frac_x)*i_frac_y;
    coeff[3] = i_frac_x    *i_frac_y;

    /* Move to the whole-pel position; p_bot tracks the row below p_top. */
    p_top = src + (mvy >> 3) * i_src_stride + (mvx >> 3);
    p_bot = p_top + i_src_stride;

    /* TODO: optimize */
    for( i_row = 0; i_row < i_height; i_row++ )
    {
        for( i_col = 0; i_col < i_width; i_col++ )
        {
            const int i_sum = coeff[0]*p_top[i_col] + coeff[1]*p_top[i_col+1] +
                              coeff[2]*p_bot[i_col] + coeff[3]*p_bot[i_col+1];

            dst[i_col] = ( i_sum + 32 ) >> 6;
        }

        /* The bottom row becomes the top row of the next output line. */
        dst  += i_dst_stride;
        p_top = p_bot;
        p_bot += i_src_stride;
    }
}
#define DO_PROCESS(a) \
src##a##v_16 = vec_u8_to_u16( src##a##v_8 ); \
src##a##v_16 = vec_mladd( coeff##a##v, src##a##v_16, zero_u16v ); \
......@@ -418,17 +384,12 @@ static void mc_chroma_altivec( uint8_t *src, int i_src_stride,
{
mc_chroma_altivec_8xh( src, i_src_stride, dst, i_dst_stride,
mvx, mvy, i_height );
return;
}
if( i_width == 4 )
else
{
mc_chroma_altivec_4xh( src, i_src_stride, dst, i_dst_stride,
mvx, mvy, i_height );
return;
}
mc_chroma_c( src, i_src_stride, dst, i_dst_stride,
mvx, mvy, i_width, i_height );
}
void x264_mc_altivec_init( x264_mc_functions_t *pf )
......
......@@ -274,7 +274,7 @@ static int check_mc( int cpu_ref, int cpu_new )
uint8_t *dst1 = &buf3[2*32+2];
uint8_t *dst2 = &buf4[2*32+2];
int dx, dy, i, w;
int dx, dy, i, j, w;
int ret = 0, ok, used_asm;
x264_mc_init( 0, &mc_c );
......@@ -304,6 +304,10 @@ static int check_mc( int cpu_ref, int cpu_new )
memset(buf4, 0xCD, 1024); \
mc_c.mc_chroma( src, 32, dst1, 16, dx, dy, w, h ); \
mc_a.mc_chroma( src, 32, dst2, 16, dx, dy, w, h ); \
/* mc_chroma width=2 may write garbage to the right of dst. ignore that. */\
for( j=0; j<h; j++ ) \
for( i=w; i<4; i++ ) \
dst2[i+j*16] = dst1[i+j*16]; \
if( memcmp( buf3, buf4, 1024 ) ) \
{ \
fprintf( stderr, "mc_chroma[mv(%d,%d) %2dx%-2d] [FAILED]\n", dx, dy, w, h ); \
......@@ -325,8 +329,8 @@ static int check_mc( int cpu_ref, int cpu_new )
report( "mc luma :" );
ok = 1; used_asm = 0;
for( dy = 0; dy < 9; dy++ )
for( dx = 0; dx < 9; dx++ )
for( dy = -1; dy < 9; dy++ )
for( dx = -1; dx < 9; dx++ )
{
MC_TEST_CHROMA( 8, 8 );
MC_TEST_CHROMA( 8, 4 );
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment