Commit 8850b6fa authored by Loren Merritt's avatar Loren Merritt

faster ESA


git-svn-id: svn://svn.videolan.org/x264/trunk@562 df754926-b1dd-0310-bc7b-ec298dee348c
parent f8652aab
......@@ -456,6 +456,10 @@ cglobal x264_intra_satd_x3_4x4_mmxext
cglobal x264_intra_satd_x3_8x8c_mmxext
cglobal x264_intra_satd_x3_16x16_mmxext
cglobal x264_pixel_ads4_mmxext
cglobal x264_pixel_ads2_mmxext
cglobal x264_pixel_ads1_mmxext
%macro SAD_START 0
pxor mm0, mm0
......@@ -1110,3 +1114,83 @@ x264_intra_satd_x3_8x8c_mmxext:
movd [parm3q+4], mm1 ; i8x8c_h satd
movd [parm3q+8], mm2 ; i8x8c_v satd
ret
;-----------------------------------------------------------------------------
; void x264_pixel_ads4_mmxext( int enc_dc[4], uint16_t *sums, int delta,
;                              uint16_t *res, int width )
;
; Absolute-difference sum of four DC values, for x in [0, width):
;   res[x] = |enc_dc[0] - sums[x]|       + |enc_dc[1] - sums[x+8]|
;          + |enc_dc[2] - sums[x+delta]| + |enc_dc[3] - sums[x+delta+8]|
;
; In:  parm1q = enc_dc   parm2q = sums   parm3d = delta (in uint16_t units)
;      parm4q = res      parm5d = width
; Four results are produced per iteration, so reads and writes cover width
; rounded up to a multiple of 4.
; Only the low 16 bits of each enc_dc[] entry are used; all arithmetic is
; 16-bit and wraps.
; Uses mm0-mm7; no emms is issued here.
;-----------------------------------------------------------------------------
ALIGN 16
x264_pixel_ads4_mmxext:
    movq    mm6, [parm1q]               ; enc_dc[0], enc_dc[1] (two 32-bit ints)
    movq    mm4, [parm1q+8]             ; enc_dc[2], enc_dc[3]
    pshufw  mm7, mm6, 0                 ; mm7 = 4 x low word of enc_dc[0]
    pshufw  mm6, mm6, 0xAA              ; mm6 = 4 x low word of enc_dc[1]
    pshufw  mm5, mm4, 0                 ; mm5 = 4 x low word of enc_dc[2]
    pshufw  mm4, mm4, 0xAA              ; mm4 = 4 x low word of enc_dc[3]
    ; delta is a 32-bit int: the upper half of its register is not guaranteed
    ; by the calling convention, so extend it before using it as a 64-bit index.
    movsxd  parm3q, parm3d
    shl     parm3q, 1                   ; element offset -> byte offset
.loop:
    movq    mm0, [parm2q]               ; sums[x .. x+3]
    movq    mm1, [parm2q+16]            ; sums[x+8 .. x+11]
    psubw   mm0, mm7
    psubw   mm1, mm6
    ; MMX_ABS is defined outside this chunk; presumably per-word absolute
    ; value of the first operand with the second as scratch - confirm.
    MMX_ABS mm0, mm2
    MMX_ABS mm1, mm3
    movq    mm2, [parm2q+parm3q]        ; sums[x+delta .. x+delta+3]
    movq    mm3, [parm2q+parm3q+16]     ; sums[x+delta+8 .. x+delta+11]
    psubw   mm2, mm5
    psubw   mm3, mm4
    paddw   mm0, mm1                    ; mm1 is free from here on
    MMX_ABS mm2, mm1
    MMX_ABS mm3, mm1
    paddw   mm0, mm2
    paddw   mm0, mm3
    movq    [parm4q], mm0               ; res[x .. x+3]
    add     parm2q, 8                   ; advance 4 uint16_t
    add     parm4q, 8
    sub     parm5d, 4
    jg      .loop
    ; NOTE(review): the nop keeps ret from directly following the jcc;
    ; presumably a branch-predictor workaround - confirm before removing.
    nop
    ret
;-----------------------------------------------------------------------------
; void x264_pixel_ads2_mmxext( int enc_dc[2], uint16_t *sums, int delta,
;                              uint16_t *res, int width )
;
; Absolute-difference sum of two DC values, for x in [0, width):
;   res[x] = |enc_dc[0] - sums[x]| + |enc_dc[1] - sums[x+delta]|
;
; In:  parm1q = enc_dc   parm2q = sums   parm3d = delta (in uint16_t units)
;      parm4q = res      parm5d = width
; Four results are produced per iteration, so reads and writes cover width
; rounded up to a multiple of 4.
; Only the low 16 bits of each enc_dc[] entry are used; arithmetic is 16-bit.
; Uses mm0-mm3, mm6, mm7; no emms is issued here.
;-----------------------------------------------------------------------------
ALIGN 16
x264_pixel_ads2_mmxext:
    movq    mm6, [parm1q]               ; enc_dc[0], enc_dc[1] (two 32-bit ints)
    pshufw  mm7, mm6, 0                 ; mm7 = 4 x low word of enc_dc[0]
    pshufw  mm6, mm6, 0xAA              ; mm6 = 4 x low word of enc_dc[1]
    ; delta is a 32-bit int: the upper half of its register is not guaranteed
    ; by the calling convention, so extend it before using it as a 64-bit index.
    movsxd  parm3q, parm3d
    shl     parm3q, 1                   ; element offset -> byte offset
.loop:
    movq    mm0, [parm2q]               ; sums[x .. x+3]
    movq    mm1, [parm2q+parm3q]        ; sums[x+delta .. x+delta+3]
    psubw   mm0, mm7
    psubw   mm1, mm6
    ; MMX_ABS is defined outside this chunk; presumably per-word absolute
    ; value of the first operand with the second as scratch - confirm.
    MMX_ABS mm0, mm2
    MMX_ABS mm1, mm3
    paddw   mm0, mm1
    movq    [parm4q], mm0               ; res[x .. x+3]
    add     parm2q, 8                   ; advance 4 uint16_t
    add     parm4q, 8
    sub     parm5d, 4
    jg      .loop
    ; NOTE(review): the nop keeps ret from directly following the jcc;
    ; presumably a branch-predictor workaround - confirm before removing.
    nop
    ret
;-----------------------------------------------------------------------------
; void x264_pixel_ads1_mmxext( int enc_dc[1], uint16_t *sums, int delta,
;                              uint16_t *res, int width )
;
; For x in [0, width): res[x] = |enc_dc[0] - sums[x]|
; parm1q = enc_dc, parm2q = sums, parm4q = res, parm5d = width.
; The third argument (delta) is never referenced by this routine.
; Eight results are produced per iteration, so reads and writes cover width
; rounded up to a multiple of 8. Only the low 16 bits of enc_dc[0] are used.
;-----------------------------------------------------------------------------
ALIGN 16
x264_pixel_ads1_mmxext:
pshufw mm7, [parm1q], 0 ; mm7 = 4 x low word of enc_dc[0]
.loop:
movq mm0, [parm2q] ; sums[x .. x+3]
movq mm1, [parm2q+8] ; sums[x+4 .. x+7]
psubw mm0, mm7
psubw mm1, mm7
; MMX_ABS is defined outside this chunk; presumably per-word absolute value
; of the first operand with the second as scratch - confirm.
MMX_ABS mm0, mm2
MMX_ABS mm1, mm3
movq [parm4q], mm0 ; res[x .. x+3]
movq [parm4q+8], mm1 ; res[x+4 .. x+7]
add parm2q, 16 ; advance 8 uint16_t
add parm4q, 16
sub parm5d, 8
jg .loop
; NOTE(review): the nop keeps ret from directly following the jcc;
; presumably a branch-predictor workaround - confirm before removing.
nop
ret
......@@ -492,6 +492,10 @@ cglobal x264_intra_sa8d_x3_8x8_core_mmxext
cglobal x264_pixel_ssim_4x4x2_core_mmxext
cglobal x264_pixel_ads4_mmxext
cglobal x264_pixel_ads2_mmxext
cglobal x264_pixel_ads1_mmxext
%macro SAD_START 0
push ebx
......@@ -1635,3 +1639,99 @@ x264_pixel_ssim_4x4x2_core_mmxext:
pop ebx
emms
ret
;-----------------------------------------------------------------------------
; void x264_pixel_ads4_mmxext( int enc_dc[4], uint16_t *sums, int delta,
; uint16_t *res, int width )
;
; 32-bit version; arguments are read from the stack.
; For x in [0, width):
;   res[x] = |enc_dc[0] - sums[x]|       + |enc_dc[1] - sums[x+8]|
;          + |enc_dc[2] - sums[x+delta]| + |enc_dc[3] - sums[x+delta+8]|
; Register roles in the loop: eax = sums, ebx = delta in bytes, ecx = res,
; edx = remaining width. Four results per iteration, so width is effectively
; rounded up to a multiple of 4. Only the low 16 bits of each enc_dc[] entry
; are used. ebx is saved and restored; no emms is issued here.
;-----------------------------------------------------------------------------
ALIGN 16
x264_pixel_ads4_mmxext:
push ebx
; after the push the first argument is at [esp+8]
mov eax, [esp+8] ; enc_dc
movq mm6, [eax] ; enc_dc[0], enc_dc[1]
movq mm4, [eax+8] ; enc_dc[2], enc_dc[3]
pshufw mm7, mm6, 0 ; mm7 = 4 x low word of enc_dc[0]
pshufw mm6, mm6, 0xAA ; mm6 = 4 x low word of enc_dc[1]
pshufw mm5, mm4, 0 ; mm5 = 4 x low word of enc_dc[2]
pshufw mm4, mm4, 0xAA ; mm4 = 4 x low word of enc_dc[3]
mov eax, [esp+12] ; sums
mov ebx, [esp+16] ; delta
mov ecx, [esp+20] ; res
mov edx, [esp+24] ; width
shl ebx, 1 ; element offset -> byte offset
.loop:
movq mm0, [eax] ; sums[x .. x+3]
movq mm1, [eax+16] ; sums[x+8 .. x+11]
psubw mm0, mm7
psubw mm1, mm6
; MMX_ABS is defined outside this chunk; presumably per-word absolute value
; of the first operand with the second as scratch - confirm.
MMX_ABS mm0, mm2
MMX_ABS mm1, mm3
movq mm2, [eax+ebx] ; sums[x+delta .. x+delta+3]
movq mm3, [eax+ebx+16] ; sums[x+delta+8 .. x+delta+11]
psubw mm2, mm5
psubw mm3, mm4
paddw mm0, mm1 ; mm1 is free from here on
MMX_ABS mm2, mm1
MMX_ABS mm3, mm1
paddw mm0, mm2
paddw mm0, mm3
movq [ecx], mm0 ; res[x .. x+3]
add eax, 8 ; advance 4 uint16_t
add ecx, 8
sub edx, 4
jg .loop
pop ebx
ret
;-----------------------------------------------------------------------------
; void x264_pixel_ads2_mmxext( int enc_dc[2], uint16_t *sums, int delta,
;                              uint16_t *res, int width )
;
; 32-bit version; arguments are read from the stack.
; For x in [0, width): res[x] = |enc_dc[0] - sums[x]| + |enc_dc[1] - sums[x+delta]|
; Register roles in the loop: eax = sums, ebx = delta in bytes, ecx = res,
; edx = remaining width. Four results per iteration, so width is effectively
; rounded up to a multiple of 4. Only the low 16 bits of each enc_dc[] entry
; are used. ebx is saved and restored; no emms is issued here.
;-----------------------------------------------------------------------------
ALIGN 16
x264_pixel_ads2_mmxext:
push ebx
; after the push the first argument is at [esp+8]
mov eax, [esp+8] ; enc_dc
movq mm6, [eax] ; enc_dc[0], enc_dc[1]
pshufw mm7, mm6, 0 ; mm7 = 4 x low word of enc_dc[0]
pshufw mm6, mm6, 0xAA ; mm6 = 4 x low word of enc_dc[1]
mov eax, [esp+12] ; sums
mov ebx, [esp+16] ; delta
mov ecx, [esp+20] ; res
mov edx, [esp+24] ; width
shl ebx, 1 ; element offset -> byte offset
.loop:
movq mm0, [eax] ; sums[x .. x+3]
movq mm1, [eax+ebx] ; sums[x+delta .. x+delta+3]
psubw mm0, mm7
psubw mm1, mm6
; MMX_ABS is defined outside this chunk; presumably per-word absolute value
; of the first operand with the second as scratch - confirm.
MMX_ABS mm0, mm2
MMX_ABS mm1, mm3
paddw mm0, mm1
movq [ecx], mm0 ; res[x .. x+3]
add eax, 8 ; advance 4 uint16_t
add ecx, 8
sub edx, 4
jg .loop
pop ebx
ret
;-----------------------------------------------------------------------------
; void x264_pixel_ads1_mmxext( int enc_dc[1], uint16_t *sums, int delta,
;                              uint16_t *res, int width )
;
; 32-bit version; arguments are read from the stack.
; For x in [0, width): res[x] = |enc_dc[0] - sums[x]|
; Register roles in the loop: eax = sums, ecx = res, edx = remaining width.
; The delta argument at [esp+12] is never loaded.
; Eight results per iteration, so width is effectively rounded up to a
; multiple of 8. Only the low 16 bits of enc_dc[0] are used.
;-----------------------------------------------------------------------------
ALIGN 16
x264_pixel_ads1_mmxext:
mov eax, [esp+4] ; enc_dc
pshufw mm7, [eax], 0 ; mm7 = 4 x low word of enc_dc[0]
mov eax, [esp+8] ; sums
mov ecx, [esp+16] ; res
mov edx, [esp+20] ; width
.loop:
movq mm0, [eax] ; sums[x .. x+3]
movq mm1, [eax+8] ; sums[x+4 .. x+7]
psubw mm0, mm7
psubw mm1, mm7
; MMX_ABS is defined outside this chunk; presumably per-word absolute value
; of the first operand with the second as scratch - confirm.
MMX_ABS mm0, mm2
MMX_ABS mm1, mm3
movq [ecx], mm0 ; res[x .. x+3]
movq [ecx+8], mm1 ; res[x+4 .. x+7]
add eax, 16 ; advance 8 uint16_t
add ecx, 16
sub edx, 8
jg .loop
; NOTE(review): the nop keeps ret from directly following the jcc;
; presumably a branch-predictor workaround - confirm before removing.
nop
ret
......@@ -104,4 +104,11 @@ void x264_pixel_ssim_4x4x2_core_sse2( const uint8_t *pix1, int stride1,
const uint8_t *pix2, int stride2, int sums[2][4] );
float x264_pixel_ssim_end4_sse2( int sum0[5][4], int sum1[5][4], int width );
void x264_pixel_ads4_mmxext( int enc_dc[4], uint16_t *sums, int delta,
uint16_t *res, int width );
void x264_pixel_ads2_mmxext( int enc_dc[2], uint16_t *sums, int delta,
uint16_t *res, int width );
void x264_pixel_ads1_mmxext( int enc_dc[1], uint16_t *sums, int delta,
uint16_t *res, int width );
#endif
......@@ -399,6 +399,38 @@ float x264_pixel_ssim_wxh( x264_pixel_function_t *pf,
}
/****************************************************************************
* successive elimination
****************************************************************************/
static void pixel_ads4( int enc_dc[4], uint16_t *sums, int delta,
                        uint16_t *res, int width )
{
    /* Reference abs-diff-sum over four DC values: for every position x the
     * result combines sums at x, x+8, x+delta and x+delta+8. The total is
     * accumulated as int and narrowed to uint16_t on store. */
    int x;
    for( x = 0; x < width; x++ )
    {
        int acc = abs( enc_dc[0] - sums[x] );
        acc += abs( enc_dc[1] - sums[x + 8] );
        acc += abs( enc_dc[2] - sums[x + delta] );
        acc += abs( enc_dc[3] - sums[x + delta + 8] );
        res[x] = acc;
    }
}
static void pixel_ads2( int enc_dc[2], uint16_t *sums, int delta,
                        uint16_t *res, int width )
{
    /* Reference abs-diff-sum over two DC values: position x combines
     * sums[x] and sums[x+delta]. Accumulated as int, stored as uint16_t. */
    int x;
    for( x = 0; x < width; x++ )
    {
        int first  = abs( enc_dc[0] - sums[x] );
        int second = abs( enc_dc[1] - sums[x + delta] );
        res[x] = first + second;
    }
}
static void pixel_ads1( int enc_dc[1], uint16_t *sums, int delta,
                        uint16_t *res, int width )
{
    /* Reference abs-diff against a single DC value; delta is not read. */
    const int dc = enc_dc[0];
    uint16_t *out = res;
    uint16_t *const end = res + width;
    while( out < end )
        *out++ = abs( dc - *sums++ );
}
/****************************************************************************
* x264_pixel_init:
****************************************************************************/
......@@ -428,6 +460,10 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
pixf->ssim_4x4x2_core = ssim_4x4x2_core;
pixf->ssim_end4 = ssim_end4;
pixf->ads[PIXEL_16x16] = pixel_ads4;
pixf->ads[PIXEL_16x8] = pixel_ads2;
pixf->ads[PIXEL_8x8] = pixel_ads1;
#ifdef HAVE_MMXEXT
if( cpu&X264_CPU_MMX )
{
......@@ -445,6 +481,10 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
pixf->sad_pde[PIXEL_16x8 ] = x264_pixel_sad_pde_16x8_mmxext;
pixf->sad_pde[PIXEL_8x16 ] = x264_pixel_sad_pde_8x16_mmxext;
pixf->ads[PIXEL_16x16] = x264_pixel_ads4_mmxext;
pixf->ads[PIXEL_16x8 ] = x264_pixel_ads2_mmxext;
pixf->ads[PIXEL_8x8 ] = x264_pixel_ads1_mmxext;
#ifdef ARCH_X86
pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_mmxext;
pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_mmxext;
......@@ -516,5 +556,10 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
pixf->sad_x4[PIXEL_16x8] = x264_pixel_sad_x4_16x8_vis;
pixf->sad_x4[PIXEL_16x16] = x264_pixel_sad_x4_16x16_vis;
#endif
pixf->ads[PIXEL_8x16] =
pixf->ads[PIXEL_8x4] =
pixf->ads[PIXEL_4x8] = pixf->ads[PIXEL_16x8];
pixf->ads[PIXEL_4x4] = pixf->ads[PIXEL_8x8];
}
......@@ -85,6 +85,11 @@ typedef struct
x264_pixel_cmp_x3_t sad_x3[7];
x264_pixel_cmp_x4_t sad_x4[7];
/* abs-diff-sum for successive elimination.
* may round width up to a multiple of 8. */
void (*ads[7])( int enc_dc[4], uint16_t *sums, int delta,
uint16_t *res, int width );
/* calculate satd of V, H, and DC modes.
* may be NULL, in which case just use pred+satd instead. */
void (*intra_satd_x3_16x16)( uint8_t *fenc, uint8_t *fdec, int res[3] );
......
......@@ -462,64 +462,46 @@ me_hex2:
/* successive elimination by comparing DC before a full SAD,
* because sum(abs(diff)) >= abs(diff(sum)). */
const int stride = m->i_stride[0];
const uint16_t *integral_base = m->integral;
static uint8_t zero[16*16] = {0,};
uint16_t *sums_base = m->integral;
int enc_dc[4];
int sad_size = i_pixel <= PIXEL_8x8 ? PIXEL_8x8 : PIXEL_4x4;
int sad_w = x264_pixel_size[sad_size].w;
h->pixf.sad_x4[sad_size]( zero, m->p_fenc[0], m->p_fenc[0]+sad_w,
m->p_fenc[0]+sad_w*FENC_STRIDE, m->p_fenc[0]+sad_w+sad_w*FENC_STRIDE,
int delta = x264_pixel_size[sad_size].w;
uint16_t *ads = alloca((max_x-min_x+8) * sizeof(uint16_t));
h->pixf.sad_x4[sad_size]( zero, m->p_fenc[0], m->p_fenc[0]+delta,
m->p_fenc[0]+delta*FENC_STRIDE, m->p_fenc[0]+delta+delta*FENC_STRIDE,
FENC_STRIDE, enc_dc );
if( sad_w == 4 )
integral_base += stride * (h->fenc->i_lines[0] + 64);
#define ESA(ADS) \
for( my = min_y; my <= max_y; my++ )\
{\
int mvs[3], i_mvs=0;\
bcost -= p_cost_mvy[my<<2];\
for( mx = min_x; mx <= max_x; mx++ )\
{\
const uint16_t *integral = &integral_base[ mx + my * stride ];\
if( ADS < bcost - p_cost_mvx[mx<<2] )\
{\
if( i_mvs == 3 )\
{\
COST_MV_X4_ABS( mvs[0],my, mvs[1],my, mvs[2],my, mx,my );\
i_mvs = 0;\
}\
else\
mvs[i_mvs++] = mx;\
}\
}\
bcost += p_cost_mvy[my<<2];\
for( i=0; i<i_mvs; i++ )\
COST_MV( mvs[i], my );\
}
if( delta == 4 )
sums_base += stride * (h->fenc->i_lines[0] + 64);
if( i_pixel == PIXEL_16x16 || i_pixel == PIXEL_8x16 || i_pixel == PIXEL_4x8 )
delta *= stride;
if( i_pixel == PIXEL_8x16 || i_pixel == PIXEL_4x8 )
enc_dc[1] = enc_dc[2];
if( i_pixel == PIXEL_16x16 )
{
ESA( abs( enc_dc[0] - integral[0] )
+ abs( enc_dc[1] - integral[8] )
+ abs( enc_dc[2] - integral[8*stride] )
+ abs( enc_dc[3] - integral[8*stride+8] ) );
}
else if( i_pixel == PIXEL_8x8 || i_pixel == PIXEL_4x4 )
{
ESA( abs( enc_dc[0] - integral[0] ) );
}
else
for( my = min_y; my <= max_y; my++ )
{
int dw = i_pixel < PIXEL_8x8 ? 8 : 4;
if( i_pixel == PIXEL_8x16 || i_pixel == PIXEL_4x8 )
int mvs[3], i_mvs=0;
bcost -= p_cost_mvy[my<<2];
h->pixf.ads[i_pixel]( enc_dc, sums_base + min_x + my * stride, delta,
ads, max_x-min_x+1 );
for( mx = min_x; mx <= max_x; mx++ )
{
dw *= stride;
enc_dc[1] = enc_dc[2];
if( ads[mx-min_x] < bcost - p_cost_mvx[mx<<2] )
{
if( i_mvs == 3 )
{
COST_MV_X4_ABS( mvs[0],my, mvs[1],my, mvs[2],my, mx,my );
i_mvs = 0;
}
else
mvs[i_mvs++] = mx;
}
}
ESA( abs( enc_dc[0] - integral[0] )
+ abs( enc_dc[1] - integral[dw] ) );
bcost += p_cost_mvy[my<<2];
for( i=0; i<i_mvs; i++ )
COST_MV( mvs[i], my );
}
#undef ESA
#endif
}
break;
......
......@@ -38,7 +38,7 @@ static int check_pixel( int cpu_ref, int cpu_new )
x264_predict8x8_t predict_8x8[9+3];
DECLARE_ALIGNED( uint8_t, edge[33], 8 );
int ret = 0, ok, used_asm;
int i;
int i, j;
x264_pixel_init( 0, &pixel_c );
x264_pixel_init( cpu_ref, &pixel_ref );
......@@ -147,6 +147,25 @@ static int check_pixel( int cpu_ref, int cpu_new )
report( "ssim :" );
}
ok = 1; used_asm = 0;
for( i=0; i<4; i++ )
if( pixel_asm.ads[i] != pixel_ref.ads[i] )
{
uint16_t res_a[32], res_c[32];
uint16_t sums[72];
int dc[4];
for( j=0; j<72; j++ )
sums[j] = rand() & 0x3fff;
for( j=0; j<4; j++ )
dc[j] = rand() & 0x3fff;
used_asm = 1;
pixel_c.ads[i]( dc, sums, 32, res_c, 32 );
pixel_asm.ads[i]( dc, sums, 32, res_a, 32 );
if( memcmp(res_a, res_c, sizeof(res_c)) )
ok = 0;
}
report( "esa ads:" );
return ret;
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment