Commit bc29c635 authored by Loren Merritt

faster ESA init

reduce memory if using ESA and not p4x4
parent 8e5d63a5
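
For context: ESA (exhaustive search) motion estimation uses a successive-elimination prefilter. Because SAD(src, ref) >= |sum(src) - sum(ref)|, any candidate whose block sum is already too far from the source block's sum can be discarded without computing its SAD. The buffers and integral_init* functions touched by this commit precompute those per-position 8x8 (and, when sub-8x8 partitions are analysed, 4x4) block sums. A minimal sketch of the idea, with assumed names, not x264's actual search code:

#include <stdint.h>
#include <stdlib.h>

/* sums[] is assumed to hold one 8x8 block sum per pixel position, which is
 * what the integral_init* functions in this commit build.  Returns nonzero
 * if the candidate at (mx,my) survives the cheap lower-bound test. */
static int esa_prefilter_keep( const uint16_t *sums, int stride,
                               int mx, int my, int sum_src, int best_sad )
{
    int sum_ref = sums[my*stride + mx];          /* 8x8 sum at the candidate */
    return abs( sum_src - sum_ref ) <= best_sad; /* SAD cannot beat best_sad otherwise */
}
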
@@ -338,6 +338,7 @@ struct x264_t
int i_max_ref1;
int i_delay; /* Number of frames buffered for B reordering */
int b_have_lowres; /* Whether 1/2 resolution luma planes are being used */
int b_have_sub8x8_esa;
} frames;
/* current frame being encoded */
......
@@ -99,7 +99,7 @@ x264_frame_t *x264_frame_new( x264_t *h )
if( h->param.analyse.i_me_method >= X264_ME_ESA )
{
CHECKED_MALLOC( frame->buffer[3],
2 * frame->i_stride[0] * (frame->i_lines[0] + 2*i_padv) * sizeof(uint16_t) );
frame->i_stride[0] * (frame->i_lines[0] + 2*i_padv) * sizeof(uint16_t) << h->frames.b_have_sub8x8_esa );
frame->integral = (uint16_t*)frame->buffer[3] + frame->i_stride[0] * i_padv + PADH;
}
......
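
The allocation above changes from always reserving two uint16_t planes to shifting by b_have_sub8x8_esa: the plane of 8x8 block sums is always needed for ESA, while the second plane of 4x4 sums is only allocated when sub-8x8 partitions can use it. Roughly, with assumed names:

#include <stddef.h>
#include <stdint.h>

/* Illustrative model of the size computed above (names are assumptions). */
static size_t integral_buffer_size( int stride, int lines, int padv,
                                    int b_have_sub8x8_esa )
{
    size_t plane = (size_t)stride * (lines + 2*padv) * sizeof(uint16_t);
    return plane << b_have_sub8x8_esa; /* 1 plane of 8x8 sums, or 2 planes with 4x4 sums */
}
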
@@ -269,6 +269,42 @@ static void memzero_aligned( void * dst, int n )
memset( dst, 0, n );
}
static void integral_init4h( uint16_t *sum, uint8_t *pix, int stride )
{
int x, v = pix[0]+pix[1]+pix[2]+pix[3];
for( x=0; x<stride-4; x++ )
{
sum[x] = v + sum[x-stride];
v += pix[x+4] - pix[x];
}
}
static void integral_init8h( uint16_t *sum, uint8_t *pix, int stride )
{
int x, v = pix[0]+pix[1]+pix[2]+pix[3]+pix[4]+pix[5]+pix[6]+pix[7];
for( x=0; x<stride-8; x++ )
{
sum[x] = v + sum[x-stride];
v += pix[x+8] - pix[x];
}
}
static void integral_init4v( uint16_t *sum8, uint16_t *sum4, int stride )
{
int x;
for( x=0; x<stride-8; x++ )
sum4[x] = sum8[x+4*stride] - sum8[x];
for( x=0; x<stride-8; x++ )
sum8[x] = sum8[x+8*stride] + sum8[x+8*stride+4] - sum8[x] - sum8[x+4];
}
static void integral_init8v( uint16_t *sum8, int stride )
{
int x;
for( x=0; x<stride-8; x++ )
sum8[x] = sum8[x+8*stride] - sum8[x];
}
void x264_frame_init_lowres( x264_t *h, x264_frame_t *frame )
{
uint8_t *src = frame->plane[0];
@@ -353,6 +389,11 @@ void x264_mc_init( int cpu, x264_mc_functions_t *pf )
pf->memzero_aligned = memzero_aligned;
pf->frame_init_lowres_core = frame_init_lowres_core;
pf->integral_init4h = integral_init4h;
pf->integral_init8h = integral_init8h;
pf->integral_init4v = integral_init4v;
pf->integral_init8v = integral_init8v;
#ifdef HAVE_MMX
x264_mc_init_mmx( cpu, pf );
#endif
@@ -370,7 +411,7 @@ void x264_frame_filter( x264_t *h, x264_frame_t *frame, int mb_y, int b_end )
int start = (mb_y*16 >> b_interlaced) - 8; // buffer = 4 for deblock + 3 for 6tap, rounded to 8
int height = ((b_end ? frame->i_lines[0] : mb_y*16) >> b_interlaced) + 8;
int offs = start*stride - 8; // buffer = 3 for 6tap, aligned to 8 for simd
int x, y;
int y;
if( mb_y & b_interlaced )
return;
@@ -401,20 +442,22 @@ void x264_frame_filter( x264_t *h, x264_frame_t *frame, int mb_y, int b_end )
height += PADV-8;
for( y = start; y < height; y++ )
{
uint8_t *ref = frame->plane[0] + y * stride - PADH;
uint16_t *line = frame->integral + (y+1) * stride - PADH + 1;
uint16_t v = line[0] = 0;
for( x = 1; x < stride-1; x++ )
line[x] = v += ref[x] + line[x-stride] - line[x-stride-1];
line -= 8*stride;
if( y >= 9-PADV )
uint8_t *pix = frame->plane[0] + y * stride - PADH;
uint16_t *sum8 = frame->integral + (y+1) * stride - PADH;
uint16_t *sum4;
if( h->frames.b_have_sub8x8_esa )
{
h->mc.integral_init4h( sum8, pix, stride );
sum8 -= 8*stride;
sum4 = sum8 + stride * (frame->i_lines[0] + PADV*2);
if( y >= 8-PADV )
h->mc.integral_init4v( sum8, sum4, stride );
}
else
{
uint16_t *sum4 = line + stride * (frame->i_lines[0] + PADV*2);
for( x = 1; x < stride-8; x++, line++, sum4++ )
{
sum4[0] = line[4+4*stride] - line[4] - line[4*stride] + line[0];
line[0] += line[8+8*stride] - line[8] - line[8*stride];
}
h->mc.integral_init8h( sum8, pix, stride );
if( y >= 8-PADV )
h->mc.integral_init8v( sum8-8*stride, stride );
}
}
}
......
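
Taken together, the horizontal passes (integral_init4h/8h: per-row running sums of 4- or 8-pixel-wide windows) and the vertical passes run 8 rows later (integral_init4v/8v: row differencing) produce planes of finished block sums. A plain reference loop for what the two-pass scheme should end up computing, for illustration only:

#include <stdint.h>

/* Reference computation (not the in-tree code): sum8[y*stride+x] ends up
 * holding the sum of the 8x8 pixel block whose top-left corner is (x,y),
 * and sum4 likewise for 4x4 blocks.  Splitting this into per-row h/v passes
 * lets the encoder build the planes incrementally as each frame row is
 * filtered, instead of in one big pass. */
static void block_sums_ref( uint16_t *sum8, uint16_t *sum4,
                            const uint8_t *pix, int stride, int w, int h )
{
    for( int y = 0; y+8 <= h; y++ )
        for( int x = 0; x+8 <= w; x++ )
        {
            int s8 = 0, s4 = 0;
            for( int dy = 0; dy < 8; dy++ )
                for( int dx = 0; dx < 8; dx++ )
                {
                    int p = pix[(y+dy)*stride + x+dx];
                    s8 += p;
                    if( dx < 4 && dy < 4 )
                        s4 += p;
                }
            sum8[y*stride + x] = s8;
            sum4[y*stride + x] = s4;
        }
}
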
@@ -66,6 +66,12 @@ typedef struct
void *(*memcpy_aligned)( void *dst, const void *src, size_t n );
void (*memzero_aligned)( void *dst, int n );
/* successive elimination prefilter */
void (*integral_init4h)( uint16_t *sum, uint8_t *pix, int stride );
void (*integral_init8h)( uint16_t *sum, uint8_t *pix, int stride );
void (*integral_init4v)( uint16_t *sum8, uint16_t *sum4, int stride );
void (*integral_init8v)( uint16_t *sum8, int stride );
void (*frame_init_lowres_core)( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
int src_stride, int dst_stride, int width, int height );
} x264_mc_functions_t;
......
@@ -694,6 +694,104 @@ MEMZERO sse2
;-----------------------------------------------------------------------------
; void x264_integral_init4h_sse4( uint16_t *sum, uint8_t *pix, int stride )
;-----------------------------------------------------------------------------
cglobal x264_integral_init4h_sse4, 3,4
lea r3, [r0+r2*2]
add r1, r2
neg r2
pxor m4, m4
.loop:
movdqa m0, [r1+r2]
movdqu m1, [r1+r2+8]
mpsadbw m0, m4, 0
mpsadbw m1, m4, 0
paddw m0, [r0+r2*2]
paddw m1, [r0+r2*2+16]
movdqa [r3+r2*2 ], m0
movdqa [r3+r2*2+16], m1
add r2, 16
jl .loop
REP_RET
cglobal x264_integral_init8h_sse4, 3,4
lea r3, [r0+r2*2]
add r1, r2
neg r2
pxor m4, m4
.loop:
movdqa m0, [r1+r2]
movdqu m1, [r1+r2+8]
movdqa m2, m0
movdqa m3, m1
mpsadbw m0, m4, 0
mpsadbw m1, m4, 0
mpsadbw m2, m4, 4
mpsadbw m3, m4, 4
paddw m0, [r0+r2*2]
paddw m1, [r0+r2*2+16]
paddw m0, m2
paddw m1, m3
movdqa [r3+r2*2 ], m0
movdqa [r3+r2*2+16], m1
add r2, 16
jl .loop
REP_RET
%macro INTEGRAL_INIT 1
;-----------------------------------------------------------------------------
; void x264_integral_init4v_mmx( uint16_t *sum8, uint16_t *sum4, int stride )
;-----------------------------------------------------------------------------
cglobal x264_integral_init4v_%1, 3,5
shl r2, 1
add r0, r2
add r1, r2
lea r3, [r0+r2*4]
lea r4, [r0+r2*8]
neg r2
.loop:
movu m0, [r0+r2+8]
mova m2, [r0+r2]
movu m1, [r4+r2+8]
paddw m0, m2
paddw m1, [r4+r2]
mova m3, [r3+r2]
psubw m1, m0
psubw m3, m2
mova [r0+r2], m1
mova [r1+r2], m3
add r2, mmsize
jl .loop
REP_RET
;-----------------------------------------------------------------------------
; void x264_integral_init8v_mmx( uint16_t *sum8, int stride )
;-----------------------------------------------------------------------------
cglobal x264_integral_init8v_%1, 3,3
shl r1, 1
add r0, r1
lea r2, [r0+r1*8]
neg r1
.loop:
mova m0, [r2+r1]
mova m1, [r2+r1+mmsize]
psubw m0, [r0+r1]
psubw m1, [r0+r1+mmsize]
mova [r0+r1], m0
mova [r0+r1+mmsize], m1
add r1, 2*mmsize
jl .loop
REP_RET
%endmacro
INIT_MMX
INTEGRAL_INIT mmx
INIT_XMM
INTEGRAL_INIT sse2
%macro FILT8x4 7
mova %3, [r0+%7]
mova %4, [r0+r5+%7]
......
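
The SSE4 routines above are built around mpsadbw with an all-zero second operand: each output word then degenerates into the sum of four consecutive pixels, i.e. exactly the 4-wide horizontal sums integral_init4h needs, and adding the imm=0 and imm=4 results gives the 8-wide sums for integral_init8h. A scalar model of that special case, for illustration:

#include <stdint.h>

/* Scalar model of `mpsadbw m0, m4, imm` when m4 is all zeros: each of the
 * 8 output words is the sum of 4 consecutive bytes of m0 starting at offset
 * i (plus 4 if the immediate selects the high window).  Against zero, the
 * "SAD" is just a plain 4-pixel sum. */
static void mpsadbw_vs_zero( uint16_t out[8], const uint8_t src[16], int high_window )
{
    int base = high_window ? 4 : 0; /* imm8 bit 2 in the real instruction */
    for( int i = 0; i < 8; i++ )
        out[i] = src[base+i] + src[base+i+1] + src[base+i+2] + src[base+i+3];
}
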
@@ -64,6 +64,12 @@ extern void *x264_memcpy_aligned_mmx( void * dst, const void * src, size_t n );
extern void *x264_memcpy_aligned_sse2( void * dst, const void * src, size_t n );
extern void x264_memzero_aligned_mmx( void * dst, int n );
extern void x264_memzero_aligned_sse2( void * dst, int n );
extern void x264_integral_init4h_sse4( uint16_t *sum, uint8_t *pix, int stride );
extern void x264_integral_init8h_sse4( uint16_t *sum, uint8_t *pix, int stride );
extern void x264_integral_init4v_mmx( uint16_t *sum8, uint16_t *sum4, int stride );
extern void x264_integral_init4v_sse2( uint16_t *sum8, uint16_t *sum4, int stride );
extern void x264_integral_init8v_mmx( uint16_t *sum8, int stride );
extern void x264_integral_init8v_sse2( uint16_t *sum8, int stride );
#define LOWRES(cpu) \
extern void x264_frame_init_lowres_core_##cpu( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,\
int src_stride, int dst_stride, int width, int height );
@@ -242,6 +248,8 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
pf->copy[PIXEL_4x4] = x264_mc_copy_w4_mmx;
pf->memcpy_aligned = x264_memcpy_aligned_mmx;
pf->memzero_aligned = x264_memzero_aligned_mmx;
pf->integral_init4v = x264_integral_init4v_mmx;
pf->integral_init8v = x264_integral_init8v_mmx;
if( !(cpu&X264_CPU_MMXEXT) )
return;
@@ -286,6 +294,8 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
pf->memcpy_aligned = x264_memcpy_aligned_sse2;
pf->memzero_aligned = x264_memzero_aligned_sse2;
pf->integral_init4v = x264_integral_init4v_sse2;
pf->integral_init8v = x264_integral_init8v_sse2;
pf->hpel_filter = x264_hpel_filter_sse2_amd;
if( cpu&X264_CPU_SSE2_IS_SLOW )
@@ -331,4 +341,10 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
pf->hpel_filter = x264_hpel_filter_ssse3;
pf->frame_init_lowres_core = x264_frame_init_lowres_core_ssse3;
pf->mc_chroma = x264_mc_chroma_ssse3;
if( !(cpu&X264_CPU_SSE4) )
return;
pf->integral_init4h = x264_integral_init4h_sse4;
pf->integral_init8h = x264_integral_init8h_sse4;
}
@@ -713,6 +713,7 @@ x264_t *x264_encoder_open ( x264_param_t *param )
|| h->param.i_bframe_adaptive
|| h->param.b_pre_scenecut );
h->frames.b_have_lowres |= (h->param.rc.b_stat_read && h->param.rc.i_vbv_buffer_size > 0);
h->frames.b_have_sub8x8_esa = !!(h->param.analyse.inter & X264_ANALYSE_PSUB8x8);
h->frames.i_last_idr = - h->param.i_keyint_max;
h->frames.i_input = 0;
@@ -839,6 +840,8 @@ int x264_encoder_reconfig( x264_t *h, x264_param_t *param )
// can only twiddle these if they were enabled to begin with:
if( h->param.analyse.i_me_method >= X264_ME_ESA || param->analyse.i_me_method < X264_ME_ESA )
COPY( analyse.i_me_method );
if( h->param.analyse.i_me_method >= X264_ME_ESA && !h->frames.b_have_sub8x8_esa )
h->param.analyse.inter &= ~X264_ANALYSE_PSUB8x8;
if( h->pps->b_transform_8x8_mode )
COPY( analyse.b_transform_8x8 );
if( h->frames.i_max_ref1 > 1 )
......
@@ -822,33 +822,57 @@ static int check_mc( int cpu_ref, int cpu_new )
uint8_t *dstc[4] = { buf3, buf3+1024, buf3+2048, buf3+3072 };
uint8_t *dsta[4] = { buf4, buf4+1024, buf4+2048, buf3+3072 };
set_func_name( "lowres_init" );
ok = 1; used_asm = 1;
for( w=40; w<=48; w+=8 )
if( mc_a.frame_init_lowres_core != mc_ref.frame_init_lowres_core )
{
int stride = (w+8)&~15;
call_c( mc_c.frame_init_lowres_core, buf1, dstc[0], dstc[1], dstc[2], dstc[3], w*2, stride, w, 16 );
call_a( mc_a.frame_init_lowres_core, buf1, dsta[0], dsta[1], dsta[2], dsta[3], w*2, stride, w, 16 );
for( i=0; i<16; i++)
{
int stride = (w+8)&~15;
used_asm = 1;
call_c( mc_c.frame_init_lowres_core, buf1, dstc[0], dstc[1], dstc[2], dstc[3], w*2, stride, w, 16 );
call_a( mc_a.frame_init_lowres_core, buf1, dsta[0], dsta[1], dsta[2], dsta[3], w*2, stride, w, 16 );
for( i=0; i<16; i++)
{
for( j=0; j<4; j++)
if( memcmp( dstc[j]+i*stride, dsta[j]+i*stride, w ) )
{
ok = 0;
fprintf( stderr, "frame_init_lowres differs at plane %d line %d\n", j, i );
for( k=0; k<w; k++ )
printf( "%d ", dstc[j][k+i*stride] );
printf("\n");
for( k=0; k<w; k++ )
printf( "%d ", dsta[j][k+i*stride] );
printf("\n");
break;
}
}
for( j=0; j<4; j++)
if( memcmp( dstc[j]+i*stride, dsta[j]+i*stride, w ) )
{
ok = 0;
fprintf( stderr, "frame_init_lowres differs at plane %d line %d\n", j, i );
for( k=0; k<w; k++ )
printf( "%d ", dstc[j][k+i*stride] );
printf("\n");
for( k=0; k<w; k++ )
printf( "%d ", dsta[j][k+i*stride] );
printf("\n");
break;
}
}
}
report( "lowres init :" );
}
#define INTEGRAL_INIT( name, size, ... )\
if( mc_a.name != mc_ref.name )\
{\
int stride = 80;\
set_func_name( #name );\
used_asm = 1;\
memcpy( buf3, buf1, size*2*stride );\
memcpy( buf4, buf1, size*2*stride );\
uint16_t *sum = (uint16_t*)buf3;\
call_c1( mc_c.name, __VA_ARGS__ );\
sum = (uint16_t*)buf4;\
call_a1( mc_a.name, __VA_ARGS__ );\
if( memcmp( buf3, buf4, (stride-8)*2 )\
|| (size>9 && memcmp( buf3+18*stride, buf4+18*stride, (stride-8)*2 )))\
ok = 0;\
call_c2( mc_c.name, __VA_ARGS__ );\
call_a2( mc_a.name, __VA_ARGS__ );\
}
ok = 1; used_asm = 0;
INTEGRAL_INIT( integral_init4h, 2, sum+stride, buf2, stride );
INTEGRAL_INIT( integral_init8h, 2, sum+stride, buf2, stride );
INTEGRAL_INIT( integral_init4v, 14, sum, sum+9*stride, stride );
INTEGRAL_INIT( integral_init8v, 9, sum, stride );
report( "integral init :" );
return ret;
}
......
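
For readers unfamiliar with checkasm, the INTEGRAL_INIT invocation for integral_init4v above expands to roughly the following: both buffers start from the same random data, the C and asm versions run on separate copies, and the first output row of each plane (8x8 sums, then 4x4 sums at byte offset 18*stride) is compared. Manual expansion, for illustration:

if( mc_a.integral_init4v != mc_ref.integral_init4v )
{
    int stride = 80;
    set_func_name( "integral_init4v" );
    used_asm = 1;
    memcpy( buf3, buf1, 14*2*stride );
    memcpy( buf4, buf1, 14*2*stride );
    uint16_t *sum = (uint16_t*)buf3;
    call_c1( mc_c.integral_init4v, sum, sum+9*stride, stride );
    sum = (uint16_t*)buf4;
    call_a1( mc_a.integral_init4v, sum, sum+9*stride, stride );
    if( memcmp( buf3, buf4, (stride-8)*2 )
        || memcmp( buf3+18*stride, buf4+18*stride, (stride-8)*2 ) )
        ok = 0;
    call_c2( mc_c.integral_init4v, sum, sum+9*stride, stride );
    call_a2( mc_a.integral_init4v, sum, sum+9*stride, stride );
}
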