Commit f8652aab authored by Loren Merritt's avatar Loren Merritt

faster ESA


git-svn-id: svn://svn.videolan.org/x264/trunk@561 df754926-b1dd-0310-bc7b-ec298dee348c
parent a020d6ec
......@@ -92,7 +92,7 @@ x264_frame_t *x264_frame_new( x264_t *h )
if( h->param.analyse.i_me_method == X264_ME_ESA )
{
CHECKED_MALLOC( frame->buffer[11],
frame->i_stride[0] * (frame->i_lines[0] + 64) * sizeof(uint16_t) );
2 * frame->i_stride[0] * (frame->i_lines[0] + 64) * sizeof(uint16_t) );
frame->integral = (uint16_t*)frame->buffer[11] + frame->i_stride[0] * 32 + 32;
}
......
......@@ -417,27 +417,31 @@ void x264_frame_filter( int cpu, x264_frame_t *frame )
}
/* generate integral image:
* each entry in frame->integral is the sum of all luma samples above and
* to the left of its location (inclusive).
* this allows us to calculate the DC of any rectangle by looking only
* at the corner entries.
* individual entries will overflow 16 bits, but that's ok:
* we only need the differences between entries, and those will be correct
* as long as we don't try to evaluate a rectangle bigger than 16x16.
* likewise, we don't really have to init the edges to 0, leaving garbage
* there wouldn't affect the results.*/
* frame->integral contains 2 planes. in the upper plane, each element is
* the sum of an 8x8 pixel region with top-left corner on that point.
* in the lower plane, 4x4 sums (needed only with --analyse p4x4). */
if( frame->integral )
{
memset( frame->integral - 32 * stride - 32, 0, stride * sizeof(uint16_t) );
for( y = -31; y < frame->i_lines[0] + 32; y++ )
for( y = -32; y < frame->i_lines[0] + 31; y++ )
{
uint8_t *ref = frame->plane[0] + y * stride - 32;
uint16_t *line = frame->integral + y * stride - 32;
uint16_t *line = frame->integral + (y+1) * stride - 31;
uint16_t v = line[0] = 0;
for( x = 1; x < stride; x++ )
for( x = 0; x < stride-1; x++ )
line[x] = v += ref[x] + line[x-stride] - line[x-stride-1];
}
for( y = -31; y < frame->i_lines[0] + 24; y++ )
{
uint16_t *line = frame->integral + y * stride - 31;
uint16_t *sum4 = line + frame->i_stride[0] * (frame->i_lines[0] + 64);
for( x = -31; x < stride - 40; x++, line++, sum4++ )
{
sum4[0] = line[4+4*stride] - line[4] - line[4*stride] + line[0];
line[0] += line[8+8*stride] - line[8] - line[8*stride];
}
}
}
}
......
......@@ -104,10 +104,10 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite
p_fref + (m2x) + (m2y)*m->i_stride[0],\
p_fref + (m3x) + (m3y)*m->i_stride[0],\
m->i_stride[0], costs );\
costs[0] += BITS_MVD( m0x, m0y );\
costs[1] += BITS_MVD( m1x, m1y );\
costs[2] += BITS_MVD( m2x, m2y );\
costs[3] += BITS_MVD( m3x, m3y );\
costs[0] += p_cost_mvx[m0x<<2]; /* no cost_mvy */\
costs[1] += p_cost_mvx[m1x<<2];\
costs[2] += p_cost_mvx[m2x<<2];\
costs[3] += p_cost_mvx[m3x<<2];\
COPY3_IF_LT( bcost, costs[0], bmx, m0x, bmy, m0y );\
COPY3_IF_LT( bcost, costs[1], bmx, m1x, bmy, m1y );\
COPY3_IF_LT( bcost, costs[2], bmx, m2x, bmy, m2y );\
......@@ -462,35 +462,64 @@ me_hex2:
/* successive elimination by comparing DC before a full SAD,
* because sum(abs(diff)) >= abs(diff(sum)). */
const int stride = m->i_stride[0];
const int dw = x264_pixel_size[i_pixel].w;
const int dh = x264_pixel_size[i_pixel].h * stride;
const uint16_t *integral_base = m->integral;
static uint8_t zero[16*16] = {0,};
const int enc_dc = h->pixf.sad[i_pixel]( m->p_fenc[0], FENC_STRIDE, zero, 16 );
const uint16_t *integral_base = &m->integral[ -1 - 1*stride ];
int enc_dc[4];
int sad_size = i_pixel <= PIXEL_8x8 ? PIXEL_8x8 : PIXEL_4x4;
int sad_w = x264_pixel_size[sad_size].w;
h->pixf.sad_x4[sad_size]( zero, m->p_fenc[0], m->p_fenc[0]+sad_w,
m->p_fenc[0]+sad_w*FENC_STRIDE, m->p_fenc[0]+sad_w+sad_w*FENC_STRIDE,
FENC_STRIDE, enc_dc );
if( sad_w == 4 )
integral_base += stride * (h->fenc->i_lines[0] + 64);
#define ESA(ADS) \
for( my = min_y; my <= max_y; my++ )\
{\
int mvs[3], i_mvs=0;\
bcost -= p_cost_mvy[my<<2];\
for( mx = min_x; mx <= max_x; mx++ )\
{\
const uint16_t *integral = &integral_base[ mx + my * stride ];\
if( ADS < bcost - p_cost_mvx[mx<<2] )\
{\
if( i_mvs == 3 )\
{\
COST_MV_X4_ABS( mvs[0],my, mvs[1],my, mvs[2],my, mx,my );\
i_mvs = 0;\
}\
else\
mvs[i_mvs++] = mx;\
}\
}\
bcost += p_cost_mvy[my<<2];\
for( i=0; i<i_mvs; i++ )\
COST_MV( mvs[i], my );\
}
for( my = min_y; my <= max_y; my++ )
if( i_pixel == PIXEL_16x16 )
{
int mvs[3], i_mvs=0;
for( mx = min_x; mx <= max_x; mx++ )
ESA( abs( enc_dc[0] - integral[0] )
+ abs( enc_dc[1] - integral[8] )
+ abs( enc_dc[2] - integral[8*stride] )
+ abs( enc_dc[3] - integral[8*stride+8] ) );
}
else if( i_pixel == PIXEL_8x8 || i_pixel == PIXEL_4x4 )
{
ESA( abs( enc_dc[0] - integral[0] ) );
}
else
{
int dw = i_pixel < PIXEL_8x8 ? 8 : 4;
if( i_pixel == PIXEL_8x16 || i_pixel == PIXEL_4x8 )
{
const uint16_t *integral = &integral_base[ mx + my * stride ];
const uint16_t ref_dc = integral[ 0 ] + integral[ dh + dw ]
- integral[ dw ] - integral[ dh ];
const int bsad = bcost - BITS_MVD(mx,my);
if( abs( ref_dc - enc_dc ) < bsad )
{
if( i_mvs == 3 )
{
COST_MV_X4_ABS( mvs[0],my, mvs[1],my, mvs[2],my, mx,my );
i_mvs = 0;
}
else
mvs[i_mvs++] = mx;
}
dw *= stride;
enc_dc[1] = enc_dc[2];
}
for( i=0; i<i_mvs; i++ )
COST_MV( mvs[i], my );
ESA( abs( enc_dc[0] - integral[0] )
+ abs( enc_dc[1] - integral[dw] ) );
}
#undef ESA
#endif
}
break;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment