Commit 9e1f3000, authored by Steven Walters and committed by Loren Merritt

factor mallocs out of hpel, ssim, and esa.

there should now be no memory allocation outside of init-time.
parent ffd73767
......@@ -605,6 +605,8 @@ struct x264_t
} stat;
void *scratch_buffer; /* for any temporary storage that doesn't want repeated malloc */
/* CPU functions dependents */
x264_predict_t predict_16x16[4+3];
x264_predict_t predict_8x8c[4+3];
......
......@@ -23,6 +23,7 @@
*****************************************************************************/
#include "common.h"
#include "encoder/me.h"
void x264_mb_predict_mv( x264_t *h, int i_list, int idx, int i_width, int16_t mvp[2] )
{
......@@ -838,6 +839,13 @@ int x264_macroblock_cache_init( x264_t *h )
h->mb.i_neighbour4[15] =
h->mb.i_neighbour8[3] = MB_LEFT|MB_TOP|MB_TOPLEFT;
int buf_hpel = (h->param.i_width+40) * sizeof(int16_t);
int buf_ssim = h->param.analyse.b_ssim * 8 * (h->param.i_width/4+3) * sizeof(int);
int me_range = X264_MIN(h->param.analyse.i_me_range, h->param.analyse.i_mv_range);
int buf_tesa = (h->param.analyse.i_me_method >= X264_ME_ESA) *
((me_range*2+18) * sizeof(int16_t) + (me_range+1) * (me_range+1) * 4 * sizeof(mvsad_t));
CHECKED_MALLOC( h->scratch_buffer, X264_MAX3( buf_hpel, buf_ssim, buf_tesa ) );
return 0;
fail: return -1;
}
......@@ -863,6 +871,7 @@ void x264_macroblock_cache_end( x264_t *h )
x264_free( h->mb.skipbp );
x264_free( h->mb.cbp );
x264_free( h->mb.qp );
x264_free( h->scratch_buffer );
}
void x264_macroblock_slice_init( x264_t *h )
{
......
......@@ -132,9 +132,8 @@ static void mc_copy( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_str
#define TAPFILTER(pix, d) ((pix)[x-2*d] + (pix)[x+3*d] - 5*((pix)[x-d] + (pix)[x+2*d]) + 20*((pix)[x] + (pix)[x+d]))
static void hpel_filter( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src,
int stride, int width, int height )
int stride, int width, int height, int16_t *buf )
{
int16_t *buf = x264_malloc((width+5)*sizeof(int16_t));
int x, y;
for( y=0; y<height; y++ )
{
......@@ -153,7 +152,6 @@ static void hpel_filter( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *s
dstc += stride;
src += stride;
}
x264_free(buf);
}
static const int hpel_ref0[16] = {0,1,1,1,0,1,1,1,2,3,3,3,0,1,1,1};
......@@ -423,7 +421,8 @@ void x264_frame_filter( x264_t *h, x264_frame_t *frame, int mb_y, int b_end )
frame->filtered[2] + offs,
frame->filtered[3] + offs,
frame->plane[0] + offs,
stride, width + 16, height - start );
stride, width + 16, height - start,
h->scratch_buffer );
}
/* generate integral image:
......
......@@ -55,7 +55,7 @@ typedef struct
uint8_t *src, int i_src, int w, int h);
void (*hpel_filter)( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src,
int i_stride, int i_width, int i_height );
int i_stride, int i_width, int i_height, int16_t *buf );
/* prefetch the next few macroblocks of fenc or fdec */
void (*prefetch_fenc)( uint8_t *pix_y, int stride_y,
......
......@@ -488,12 +488,12 @@ static float ssim_end4( int sum0[5][4], int sum1[5][4], int width )
float x264_pixel_ssim_wxh( x264_pixel_function_t *pf,
uint8_t *pix1, int stride1,
uint8_t *pix2, int stride2,
int width, int height )
int width, int height, void *buf )
{
int x, y, z;
float ssim = 0.0;
int (*sum0)[4] = x264_malloc(4 * (width/4+3) * sizeof(int));
int (*sum1)[4] = x264_malloc(4 * (width/4+3) * sizeof(int));
int (*sum0)[4] = buf;
int (*sum1)[4] = sum0 + width/4+3;
width >>= 2;
height >>= 2;
z = 0;
......@@ -508,8 +508,6 @@ float x264_pixel_ssim_wxh( x264_pixel_function_t *pf,
for( x = 0; x < width-1; x += 4 )
ssim += pf->ssim_end4( sum0+x, sum1+x, X264_MIN(4,width-x-1) );
}
x264_free(sum0);
x264_free(sum1);
return ssim;
}
......
......@@ -104,6 +104,6 @@ typedef struct
void x264_pixel_init( int cpu, x264_pixel_function_t *pixf );
int64_t x264_pixel_ssd_wxh( x264_pixel_function_t *pf, uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2, int i_width, int i_height );
float x264_pixel_ssim_wxh( x264_pixel_function_t *pf, uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2, int i_width, int i_height );
float x264_pixel_ssim_wxh( x264_pixel_function_t *pf, uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2, int i_width, int i_height, void *buf );
#endif
......@@ -202,16 +202,14 @@ void x264_hpel_filter_c_##cpuc( uint8_t *dst, int16_t *buf, int width );\
void x264_hpel_filter_h_##cpuh( uint8_t *dst, uint8_t *src, int width );\
void x264_sfence( void );\
static void x264_hpel_filter_##cpu( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src,\
int stride, int width, int height )\
int stride, int width, int height, int16_t *buf )\
{\
int16_t *buf;\
int realign = (long)src & (align-1);\
src -= realign;\
dstv -= realign;\
dstc -= realign;\
dsth -= realign;\
width += realign;\
buf = x264_malloc((width+16)*sizeof(int16_t));\
while( height-- )\
{\
x264_hpel_filter_v_##cpuv( dstv, src, buf+8, stride, width );\
......@@ -223,14 +221,13 @@ static void x264_hpel_filter_##cpu( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
src += stride;\
}\
x264_sfence();\
x264_free(buf);\
}
HPEL(8, mmxext, mmxext, mmxext, mmxext)
HPEL(16, sse2_amd, mmxext, mmxext, sse2)
#ifdef ARCH_X86_64
void x264_hpel_filter_sse2( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, int stride, int width, int height );
void x264_hpel_filter_ssse3( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, int stride, int width, int height );
void x264_hpel_filter_sse2( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, int stride, int width, int height, int16_t *buf );
void x264_hpel_filter_ssse3( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, int stride, int width, int height, int16_t *buf );
#else
HPEL(16, sse2, sse2, sse2, sse2)
HPEL(16, ssse3, sse2, ssse3, ssse3)
......
......@@ -1049,7 +1049,7 @@ static void x264_fdec_filter_row( x264_t *h, int mb_y )
x264_pixel_ssim_wxh( &h->pixf,
h->fdec->plane[0] + 2+min_y*h->fdec->i_stride[0], h->fdec->i_stride[0],
h->fenc->plane[0] + 2+min_y*h->fenc->i_stride[0], h->fenc->i_stride[0],
h->param.i_width-2, max_y-min_y );
h->param.i_width-2, max_y-min_y, h->scratch_buffer );
}
}
......
......@@ -474,8 +474,7 @@ me_hex2:
DECLARE_ALIGNED_16( int enc_dc[4] );
int sad_size = i_pixel <= PIXEL_8x8 ? PIXEL_8x8 : PIXEL_4x4;
int delta = x264_pixel_size[sad_size].w;
int16_t xs_buf[64];
int16_t *xs = width<=64 ? xs_buf : x264_malloc( (width+15)*sizeof(int16_t) );
int16_t *xs = h->scratch_buffer;
int xn;
uint16_t *cost_fpel_mvx = x264_cost_mv_fpel[h->mb.i_qp][-m->mvp[0]&3] + (-m->mvp[0]>>2);
......@@ -492,11 +491,7 @@ me_hex2:
if( h->mb.i_me_method == X264_ME_TESA )
{
// ADS threshold, then SAD threshold, then keep the best few SADs, then SATD
typedef struct {
int sad;
int16_t mx, my;
} mvsad_t;
mvsad_t *mvsads = x264_malloc( width*(max_y-min_y+1)*sizeof(mvsad_t) );
mvsad_t *mvsads = (mvsad_t *)(xs + ((width+15)&~15));
int nmvsad = 0, limit;
int sad_thresh = i_me_range <= 16 ? 10 : i_me_range <= 24 ? 11 : 12;
int bsad = h->pixf.sad[i_pixel]( m->p_fenc[0], FENC_STRIDE, p_fref+bmy*stride+bmx, stride )
......@@ -581,7 +576,6 @@ me_hex2:
}
for( i=0; i<nmvsad; i++ )
COST_MV( mvsads[i].mx, mvsads[i].my );
x264_free( mvsads );
}
else
{
......@@ -601,9 +595,6 @@ me_hex2:
COST_MV( min_x+xs[i], my );
}
}
if( xs != xs_buf )
x264_free( xs );
#endif
}
break;
......
......@@ -48,6 +48,11 @@ typedef struct
DECLARE_ALIGNED_4( int16_t mv[2] );
} DECLARE_ALIGNED_16( x264_me_t );
/* One motion-search candidate: a motion vector together with its cost.
 * Declared in this header so that code outside the search routine can use
 * sizeof(mvsad_t), e.g. when sizing a buffer that will hold these records. */
typedef struct {
int sad; /* cost recorded for this candidate (presumably its SAD -- confirm against me.c) */
int16_t mx, my; /* candidate motion vector: x and y components */
} mvsad_t;
void x264_me_search_ref( x264_t *h, x264_me_t *m, int16_t (*mvc)[2], int i_mvc, int *p_fullpel_thresh );
/* Convenience wrapper around x264_me_search_ref() that passes NULL for
 * p_fullpel_thresh; h, m, mvc and i_mvc are forwarded unchanged. */
static inline void x264_me_search( x264_t *h, x264_me_t *m, int16_t (*mvc)[2], int i_mvc )
{ x264_me_search_ref( h, m, mvc, i_mvc, NULL ); }
......
......@@ -407,8 +407,8 @@ static int check_pixel( int cpu_ref, int cpu_new )
int sums[5][4] = {{0}};
used_asm = ok = 1;
x264_emms();
res_c = x264_pixel_ssim_wxh( &pixel_c, buf1+2, 32, buf2+2, 32, 32, 28 );
res_a = x264_pixel_ssim_wxh( &pixel_asm, buf1+2, 32, buf2+2, 32, 32, 28 );
res_c = x264_pixel_ssim_wxh( &pixel_c, buf1+2, 32, buf2+2, 32, 32, 28, buf3 );
res_a = x264_pixel_ssim_wxh( &pixel_asm, buf1+2, 32, buf2+2, 32, 32, 28, buf3 );
if( fabs(res_c - res_a) > 1e-6 )
{
ok = 0;
......@@ -792,12 +792,13 @@ static int check_mc( int cpu_ref, int cpu_new )
uint8_t *src = buf1+8+2*64;
uint8_t *dstc[3] = { buf3+8, buf3+8+16*64, buf3+8+32*64 };
uint8_t *dsta[3] = { buf4+8, buf4+8+16*64, buf4+8+32*64 };
void *tmp = buf3+49*64;
set_func_name( "hpel_filter" );
ok = 1; used_asm = 1;
memset( buf3, 0, 4096 );
memset( buf4, 0, 4096 );
call_c( mc_c.hpel_filter, dstc[0], dstc[1], dstc[2], src, 64, 48, 10 );
call_a( mc_a.hpel_filter, dsta[0], dsta[1], dsta[2], src, 64, 48, 10 );
call_c( mc_c.hpel_filter, dstc[0], dstc[1], dstc[2], src, 64, 48, 10, tmp );
call_a( mc_a.hpel_filter, dsta[0], dsta[1], dsta[2], src, 64, 48, 10, tmp );
for( i=0; i<3; i++ )
for( j=0; j<10; j++ )
//FIXME ideally the first pixels would match too, but they aren't actually used
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment