Commit 6bf39eaa authored by Loren Merritt

early termination within large SADs. ~1% faster UMH, ~4% faster ESA.



git-svn-id: svn://svn.videolan.org/x264/trunk@397 df754926-b1dd-0310-bc7b-ec298dee348c
parent 73a45ef2
......@@ -262,6 +262,10 @@ cglobal x264_pixel_sad_8x4_mmxext
cglobal x264_pixel_sad_4x8_mmxext
cglobal x264_pixel_sad_4x4_mmxext
; SAD variants that take a fifth argument (prev_score) and can return early
; from PDE_CHECK once the partial sum is no longer below it.
cglobal x264_pixel_sad_pde_16x16_mmxext
cglobal x264_pixel_sad_pde_16x8_mmxext
cglobal x264_pixel_sad_pde_8x16_mmxext
cglobal x264_pixel_ssd_16x16_mmxext
cglobal x264_pixel_ssd_16x8_mmxext
cglobal x264_pixel_ssd_8x16_mmxext
......@@ -377,6 +381,64 @@ x264_pixel_sad_4x4_mmxext:
;-----------------------------------------------------------------------------
; PDE_CHECK: early-termination test for the sad_pde functions.
;
; Copies the low dword of mm0 into eax and does a signed compare against
; parm5d (the prev_score argument). If eax >= prev_score the enclosing
; function returns at once with that value in eax; otherwise execution
; continues after the macro.
; NOTE(review): mm0 is presumably the running SAD accumulated by the
; SAD_INC_* macros, which are defined outside this excerpt.
;
; The branch target is a macro-local label (%%) so the macro can be expanded
; more than once under the same global label without a duplicate definition.
;-----------------------------------------------------------------------------
%macro PDE_CHECK 0
movd eax, mm0
cmp eax, parm5d ; prev_score
jl %%continue
ret
ALIGN 4
%%continue:
%endmacro
ALIGN 16
;-----------------------------------------------------------------------------
; int __cdecl x264_pixel_sad_pde_16x16_mmxext (uint8_t *, int, uint8_t *, int, int )
;
; Eight SAD_INC_2x16P steps with a single PDE_CHECK after the fourth; the
; check may return before the remaining four steps are executed.
; The fifth argument is the prev_score that PDE_CHECK compares against.
;-----------------------------------------------------------------------------
x264_pixel_sad_pde_16x16_mmxext:
SAD_START
%rep 4
SAD_INC_2x16P
%endrep
PDE_CHECK
%rep 4
SAD_INC_2x16P
%endrep
SAD_END
ALIGN 16
;-----------------------------------------------------------------------------
; int __cdecl x264_pixel_sad_pde_16x8_mmxext (uint8_t *, int, uint8_t *, int, int )
;
; Four SAD_INC_2x16P steps with a single PDE_CHECK after the second; the
; check may return before the remaining two steps are executed.
; The fifth argument is the prev_score that PDE_CHECK compares against.
;-----------------------------------------------------------------------------
x264_pixel_sad_pde_16x8_mmxext:
SAD_START
%rep 2
SAD_INC_2x16P
%endrep
PDE_CHECK
%rep 2
SAD_INC_2x16P
%endrep
SAD_END
ALIGN 16
;-----------------------------------------------------------------------------
; int __cdecl x264_pixel_sad_pde_8x16_mmxext (uint8_t *, int, uint8_t *, int, int )
;
; Eight SAD_INC_2x8P steps with a single PDE_CHECK after the fourth; the
; check may return before the remaining four steps are executed.
; The fifth argument is the prev_score that PDE_CHECK compares against.
;-----------------------------------------------------------------------------
x264_pixel_sad_pde_8x16_mmxext:
SAD_START
%rep 4
SAD_INC_2x8P
%endrep
PDE_CHECK
%rep 4
SAD_INC_2x8P
%endrep
SAD_END
%macro SSD_START 0
firstpush rbx
pushreg rbx
......
......@@ -269,6 +269,10 @@ cglobal x264_pixel_sad_8x4_mmxext
cglobal x264_pixel_sad_4x8_mmxext
cglobal x264_pixel_sad_4x4_mmxext
; SAD variants that take a fifth argument (prev_score) and can return early
; from PDE_CHECK once the partial sum is no longer below it.
cglobal x264_pixel_sad_pde_16x16_mmxext
cglobal x264_pixel_sad_pde_16x8_mmxext
cglobal x264_pixel_sad_pde_8x16_mmxext
cglobal x264_pixel_ssd_16x16_mmxext
cglobal x264_pixel_ssd_16x8_mmxext
cglobal x264_pixel_ssd_8x16_mmxext
......@@ -391,6 +395,66 @@ x264_pixel_sad_4x4_mmxext:
SAD_END
;-----------------------------------------------------------------------------
; PDE_CHECK: early-termination test for the sad_pde functions.
;
; Uses ebx as scratch: copies the low dword of mm0 into it and does a signed
; compare against [esp+24] (the prev_score argument).
;   - If ebx >= prev_score: pops ebx, loads 0xffff into eax and returns.
;   - Otherwise: reloads ebx from [esp+12] and continues after the macro.
; NOTE(review): the pop and the [esp+12] reload presumably mirror a
; "push ebx" / "mov ebx, [esp+12]" pair in SAD_START, which is defined
; outside this excerpt - confirm if SAD_START changes.
; NOTE(review): mm0 is presumably the running SAD accumulated by the
; SAD_INC_* macros, also outside this excerpt.
;
; The branch target is a macro-local label (%%) so the macro can be expanded
; more than once under the same global label without a duplicate definition.
;-----------------------------------------------------------------------------
%macro PDE_CHECK 0
movd ebx, mm0
cmp ebx, [esp+24] ; prev_score
jl %%continue
pop ebx
mov eax, 0xffff
ret
ALIGN 4
%%continue:
mov ebx, [esp+12]
%endmacro
ALIGN 16
;-----------------------------------------------------------------------------
; int __cdecl x264_pixel_sad_pde_16x16_mmxext (uint8_t *, int, uint8_t *, int, int )
;
; Eight SAD_INC_2x16P steps with a single PDE_CHECK after the fourth; the
; check may return before the remaining four steps are executed.
; The fifth argument is the prev_score that PDE_CHECK compares against.
;-----------------------------------------------------------------------------
x264_pixel_sad_pde_16x16_mmxext:
SAD_START
%rep 4
SAD_INC_2x16P
%endrep
PDE_CHECK
%rep 4
SAD_INC_2x16P
%endrep
SAD_END
ALIGN 16
;-----------------------------------------------------------------------------
; int __cdecl x264_pixel_sad_pde_16x8_mmxext (uint8_t *, int, uint8_t *, int, int )
;
; Four SAD_INC_2x16P steps with a single PDE_CHECK after the second; the
; check may return before the remaining two steps are executed.
; The fifth argument is the prev_score that PDE_CHECK compares against.
;-----------------------------------------------------------------------------
x264_pixel_sad_pde_16x8_mmxext:
SAD_START
%rep 2
SAD_INC_2x16P
%endrep
PDE_CHECK
%rep 2
SAD_INC_2x16P
%endrep
SAD_END
ALIGN 16
;-----------------------------------------------------------------------------
; int __cdecl x264_pixel_sad_pde_8x16_mmxext (uint8_t *, int, uint8_t *, int, int )
;
; Eight SAD_INC_2x8P steps with a single PDE_CHECK after the fourth; the
; check may return before the remaining four steps are executed.
; The fifth argument is the prev_score that PDE_CHECK compares against.
;-----------------------------------------------------------------------------
x264_pixel_sad_pde_8x16_mmxext:
SAD_START
%rep 4
SAD_INC_2x8P
%endrep
PDE_CHECK
%rep 4
SAD_INC_2x8P
%endrep
SAD_END
%macro SSD_START 0
push ebx
......
......@@ -32,6 +32,10 @@ int x264_pixel_sad_8x4_mmxext( uint8_t *, int, uint8_t *, int );
int x264_pixel_sad_4x8_mmxext( uint8_t *, int, uint8_t *, int );
int x264_pixel_sad_4x4_mmxext( uint8_t *, int, uint8_t *, int );
/* SAD with partial distortion elimination. Same leading arguments as the
 * plain sad functions, plus a trailing int that the assembly refers to as
 * prev_score: the routine may return early once its partial sum is no
 * longer below that value. */
int x264_pixel_sad_pde_16x16_mmxext( uint8_t *, int, uint8_t *, int, int );
int x264_pixel_sad_pde_16x8_mmxext( uint8_t *, int, uint8_t *, int, int );
int x264_pixel_sad_pde_8x16_mmxext( uint8_t *, int, uint8_t *, int, int );
int x264_pixel_ssd_16x16_mmxext( uint8_t *, int, uint8_t *, int );
int x264_pixel_ssd_16x8_mmxext( uint8_t *, int, uint8_t *, int );
int x264_pixel_ssd_8x16_mmxext( uint8_t *, int, uint8_t *, int );
......
......@@ -346,6 +346,10 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
pixf->sad[PIXEL_4x8 ] = x264_pixel_sad_4x8_mmxext;
pixf->sad[PIXEL_4x4] = x264_pixel_sad_4x4_mmxext;
pixf->sad_pde[PIXEL_16x16] = x264_pixel_sad_pde_16x16_mmxext;
pixf->sad_pde[PIXEL_16x8 ] = x264_pixel_sad_pde_16x8_mmxext;
pixf->sad_pde[PIXEL_8x16 ] = x264_pixel_sad_pde_8x16_mmxext;
pixf->ssd[PIXEL_16x16] = x264_pixel_ssd_16x16_mmxext;
pixf->ssd[PIXEL_16x8] = x264_pixel_ssd_16x8_mmxext;
pixf->ssd[PIXEL_8x16] = x264_pixel_ssd_8x16_mmxext;
......
......@@ -25,6 +25,7 @@
#define _PIXEL_H 1
/* Block comparison function: takes two (uint8_t pointer, int) pairs and
 * returns an int score. */
typedef int (*x264_pixel_cmp_t) ( uint8_t *, int, uint8_t *, int );
/* Same as x264_pixel_cmp_t with one extra trailing int: the threshold used
 * for partial distortion elimination (early termination). */
typedef int (*x264_pixel_cmp_pde_t) ( uint8_t *, int, uint8_t *, int, int );
enum
{
......@@ -66,6 +67,11 @@ typedef struct
x264_pixel_cmp_t satd[7];
x264_pixel_cmp_t sa8d[4];
x264_pixel_cmp_t mbcmp[7]; /* either satd or sad for subpel refine and mode decision */
/* partial distortion elimination:
* terminate early if partial score is worse than a threshold.
* may be NULL, in which case just use sad instead. */
x264_pixel_cmp_pde_t sad_pde[7];
} x264_pixel_function_t;
void x264_pixel_init( int cpu, x264_pixel_function_t *pixf );
......
......@@ -62,6 +62,19 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite
/* Shorthand forms of COST_MV_INT (defined outside this excerpt):
 * COST_MV passes 0, 0 as the last two arguments; COST_MV_DIR passes 1 and d.
 * NOTE(review): the meaning of those two arguments is set by COST_MV_INT -
 * check its definition before relying on it. */
#define COST_MV( mx, my ) COST_MV_INT( mx, my, 0, 0 )
#define COST_MV_DIR( mx, my, d ) COST_MV_INT( mx, my, 1, d )
/* Evaluate the candidate vector (mx,my) with the early-terminating SAD.
 *
 * The threshold handed to sad_pde is the current best cost (bcost) minus this
 * candidate's p_cost_mvx/p_cost_mvy entries, i.e. the score the SAD alone
 * must stay below for the candidate to win. If the returned score is below
 * that threshold, bcost/bmx/bmy are updated to this candidate.
 *
 * Expects h, m, i_pixel, p_fref, p_cost_mvx, p_cost_mvy, bcost, bmx and bmy
 * to be in scope at the expansion site, and h->pixf.sad_pde[i_pixel] to be
 * non-NULL (callers test it before using this macro).
 *
 * Each argument is evaluated exactly once and captured in a local, so
 * expressions with side effects or low-precedence operators are safe. */
#define COST_MV_PDE( mx, my ) \
{ \
    const int pde_mx = (mx); \
    const int pde_my = (my); \
    const int pde_cost_x = p_cost_mvx[ pde_mx<<2 ]; \
    const int pde_cost_y = p_cost_mvy[ pde_my<<2 ]; \
    const int pde_thresh = bcost - pde_cost_x - pde_cost_y; \
    const int pde_sad = h->pixf.sad_pde[i_pixel]( m->p_fenc[0], m->i_stride[0], \
                   &p_fref[pde_my*m->i_stride[0]+pde_mx], m->i_stride[0], \
                   pde_thresh ); \
    if( pde_sad < pde_thresh ) \
    { \
        bcost = pde_sad + pde_cost_x + pde_cost_y; \
        bmx = pde_mx; \
        bmy = pde_my; \
    } \
}
#define DIA1_ITER( mx, my )\
{\
omx = mx; omy = my;\
......@@ -234,24 +247,40 @@ me_hex2:
};
COST_MV( omx + square2[i][0], omy + square2[i][1] );
}
/* hexagon grid */
omx = bmx; omy = bmy;
for( i = 1; i <= i_me_range/4; i++ )
{
int bounds_check = 4*i > X264_MIN4( mv_x_max-omx, mv_y_max-omy, omx-mv_x_min, omy-mv_y_min );
for( j = 0; j < 16; j++ )
static const int hex4[16][2] = {
{-4, 2}, {-4, 1}, {-4, 0}, {-4,-1}, {-4,-2},
{ 4,-2}, { 4,-1}, { 4, 0}, { 4, 1}, { 4, 2},
{ 2, 3}, { 0, 4}, {-2, 3},
{-2,-3}, { 0,-4}, { 2,-3},
};
const int bounds_check = 4*i > X264_MIN4( mv_x_max-omx, mv_y_max-omy, omx-mv_x_min, omy-mv_y_min );
if( h->pixf.sad_pde[i_pixel] )
{
static const int hex4[16][2] = {
{-4, 2}, {-4, 1}, {-4, 0}, {-4,-1}, {-4,-2},
{ 4,-2}, { 4,-1}, { 4, 0}, { 4, 1}, { 4, 2},
{ 2, 3}, { 0, 4}, {-2, 3},
{-2,-3}, { 0,-4}, { 2,-3},
};
int mx = omx + hex4[j][0]*i;
int my = omy + hex4[j][1]*i;
if( !bounds_check || ( mx >= mv_x_min && mx <= mv_x_max
&& my >= mv_y_min && my <= mv_y_max ) )
COST_MV( mx, my );
for( j = 0; j < 16; j++ )
{
int mx = omx + hex4[j][0]*i;
int my = omy + hex4[j][1]*i;
if( !bounds_check || ( mx >= mv_x_min && mx <= mv_x_max
&& my >= mv_y_min && my <= mv_y_max ) )
COST_MV_PDE( mx, my );
}
}
else
{
for( j = 0; j < 16; j++ )
{
int mx = omx + hex4[j][0]*i;
int my = omy + hex4[j][1]*i;
if( !bounds_check || ( mx >= mv_x_min && mx <= mv_x_max
&& my >= mv_y_min && my <= mv_y_max ) )
COST_MV( mx, my );
}
}
}
goto me_hex2;
......@@ -279,15 +308,32 @@ me_hex2:
const int enc_dc = h->pixf.sad[i_pixel]( m->p_fenc[0], stride, zero, 16 );
const uint16_t *integral_base = &m->integral[ -1 - 1*stride ];
for( my = min_y; my <= max_y; my++ )
for( mx = min_x; mx <= max_x; mx++ )
{
const uint16_t *integral = &integral_base[ mx + my * stride ];
const uint16_t ref_dc = integral[ 0 ] + integral[ dh + dw ]
- integral[ dw ] - integral[ dh ];
if( abs( ref_dc - enc_dc ) < bcost - p_cost_mvx[ (mx)<<2 ] - p_cost_mvy[ (my)<<2 ] )
COST_MV( mx, my );
}
if( h->pixf.sad_pde[i_pixel] )
{
for( my = min_y; my <= max_y; my++ )
for( mx = min_x; mx <= max_x; mx++ )
{
const uint16_t *integral = &integral_base[ mx + my * stride ];
const uint16_t ref_dc = integral[ 0 ] + integral[ dh + dw ]
- integral[ dw ] - integral[ dh ];
const int bsad = bcost - p_cost_mvx[ (mx)<<2 ] - p_cost_mvy[ (my)<<2 ];
if( abs( ref_dc - enc_dc ) < bsad )
COST_MV_PDE( mx, my );
}
}
else
{
for( my = min_y; my <= max_y; my++ )
for( mx = min_x; mx <= max_x; mx++ )
{
const uint16_t *integral = &integral_base[ mx + my * stride ];
const uint16_t ref_dc = integral[ 0 ] + integral[ dh + dw ]
- integral[ dw ] - integral[ dh ];
const int bsad = bcost - p_cost_mvx[ (mx)<<2 ] - p_cost_mvy[ (my)<<2 ];
if( abs( ref_dc - enc_dc ) < bsad )
COST_MV( mx, my );
}
}
#endif
}
break;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment