Commit 9bdf19c2 authored by Fiona Glaser, committed by Loren Merritt

memzero_aligned_mmx

parent 57985796
@@ -271,13 +271,18 @@ static void plane_copy( uint8_t *dst, int i_dst,
    }
}
-void prefetch_fenc_null( uint8_t *pix_y, int stride_y,
-                         uint8_t *pix_uv, int stride_uv, int mb_x )
+static void prefetch_fenc_null( uint8_t *pix_y, int stride_y,
+                                uint8_t *pix_uv, int stride_uv, int mb_x )
{}
-void prefetch_ref_null( uint8_t *pix, int stride, int parity )
+static void prefetch_ref_null( uint8_t *pix, int stride, int parity )
{}
+static void memzero_aligned( void * dst, int n )
+{
+    memset( dst, 0, n );
+}
void x264_mc_init( int cpu, x264_mc_functions_t *pf )
{
    pf->mc_luma = mc_luma;
@@ -316,6 +321,7 @@ void x264_mc_init( int cpu, x264_mc_functions_t *pf )
    pf->prefetch_fenc = prefetch_fenc_null;
    pf->prefetch_ref = prefetch_ref_null;
    pf->memcpy_aligned = memcpy;
+    pf->memzero_aligned = memzero_aligned;
#ifdef HAVE_MMX
    x264_mc_init_mmx( cpu, pf );
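The hunks above follow x264's usual dispatch pattern: x264_mc_init installs the portable C fallback, and x264_mc_init_mmx (under HAVE_MMX) later overwrites the pointer with a SIMD version when the CPU supports it, so every caller goes through pf->memzero_aligned. A minimal self-contained sketch of that pattern, with illustrative names and a stand-in CPU flag rather than the real x264 init code:

#include <stdint.h>
#include <string.h>

typedef struct
{
    void (*memzero_aligned)( void *dst, int n );
} mc_functions_t;                                    /* trimmed-down stand-in for x264_mc_functions_t */

static void memzero_aligned_c( void *dst, int n )    /* portable fallback, same as the hunk above */
{
    memset( dst, 0, n );
}

static void memzero_aligned_simd( void *dst, int n ) /* stand-in for the asm implementation */
{
    memset( dst, 0, n );
}

#define CPU_HAS_SIMD 0x1                             /* illustrative flag, not a real X264_CPU_* value */

static void mc_init( int cpu, mc_functions_t *pf )
{
    pf->memzero_aligned = memzero_aligned_c;         /* C fallback first */
    if( cpu & CPU_HAS_SIMD )
        pf->memzero_aligned = memzero_aligned_simd;  /* SIMD override, as x264_mc_init_mmx does */
}

int main( void )
{
    static uint8_t buf[256];
    mc_functions_t mc;
    mc_init( CPU_HAS_SIMD, &mc );
    mc.memzero_aligned( buf, sizeof(buf) );          /* callers always go through the pointer */
    return 0;
}

The same pattern is what lets the me.c hunk further down call h->mc.memzero_aligned without caring which implementation ended up in the table.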
@@ -67,6 +67,7 @@ typedef struct
    void (*prefetch_ref)( uint8_t *pix, int stride, int parity );
    void *(*memcpy_aligned)( void *dst, const void *src, size_t n );
+    void (*memzero_aligned)( void *dst, int n );
} x264_mc_functions_t;
@@ -387,6 +387,12 @@ cglobal x264_plane_copy_mmxext, 6,7
    emms
    RET
; These functions are not general-use; not only do the SSE ones require aligned input,
; but they also will fail if given a non-mod16 size or a size less than 64.
+; memzero SSE will fail for non-mod128.
;-----------------------------------------------------------------------------
; void *x264_memcpy_aligned_mmx( void *dst, const void *src, size_t n );
;-----------------------------------------------------------------------------
@@ -440,3 +446,25 @@ cglobal x264_memcpy_aligned_sse2, 3,3
    movdqa [r0 + r2 + 48], xmm3
    jg .copy64
    REP_RET
+;-----------------------------------------------------------------------------
+; void *x264_memzero_aligned( void *dst, size_t n );
+;-----------------------------------------------------------------------------
+%macro MEMZERO 1
+cglobal x264_memzero_aligned_%1, 2,2
+    pxor m0, m0
+.loop:
+    sub r1d, regsize*8
+%assign i 0
+%rep 8
+    mova [r0 + r1 + i], m0
+%assign i i+regsize
+%endrep
+    jg .loop
+    REP_RET
+%endmacro
+INIT_MMX
+MEMZERO mmx
+INIT_XMM
+MEMZERO sse2
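What the MEMZERO macro above does, restated in plain C (a rough model, not the generated code; the function name and the REGSIZE constant here are illustrative): each pass subtracts 8*regsize from the count and clears 8 registers' worth of memory at dst + count, walking from the end of the buffer back toward the start. With regsize = 8 (MMX) that is 64 bytes per pass, with regsize = 16 (SSE2) it is 128 bytes, which is exactly why the comment earlier warns that memzero SSE fails for non-mod-128 sizes.

#include <stdint.h>
#include <string.h>

#define REGSIZE 16   /* models the SSE2 variant; use 8 to model the MMX one */

/* Rough C model of x264_memzero_aligned_%1: clear 8*REGSIZE bytes per pass,
 * counting n down until it reaches zero.  If n is not a positive multiple of
 * 8*REGSIZE, the final pass writes before dst, so the size restriction is real.
 * The real SSE2 code additionally requires dst to be 16-byte aligned; memset
 * in this model does not, so that constraint is only noted here. */
static void memzero_aligned_model( uint8_t *dst, int n )
{
    do
    {
        n -= REGSIZE * 8;                          /* sub r1d, regsize*8 */
        for( int i = 0; i < REGSIZE * 8; i += REGSIZE )
            memset( dst + n + i, 0, REGSIZE );     /* mova [r0 + r1 + i], m0 */
    } while( n > 0 );                              /* jg .loop */
}

int main( void )
{
    static uint8_t buf[4096];                      /* 4096 is a multiple of 128 */
    memzero_aligned_model( buf, sizeof(buf) );
    return buf[0];                                 /* 0 on success */
}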
@@ -55,6 +55,8 @@ extern void x264_mc_chroma_mmxext( uint8_t *src, int i_src_stride,
extern void x264_plane_copy_mmxext( uint8_t *, int, uint8_t *, int, int w, int h);
extern void *x264_memcpy_aligned_mmx( void * dst, const void * src, size_t n );
extern void *x264_memcpy_aligned_sse2( void * dst, const void * src, size_t n );
+extern void x264_memzero_aligned_mmx( void * dst, int n );
+extern void x264_memzero_aligned_sse2( void * dst, int n );
#define PIXEL_AVG_W(width,cpu)\
extern void x264_pixel_avg2_w##width##_##cpu( uint8_t *, int, uint8_t *, int, uint8_t *, int );
@@ -230,6 +232,7 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
    pf->copy[PIXEL_8x8] = x264_mc_copy_w8_mmx;
    pf->copy[PIXEL_4x4] = x264_mc_copy_w4_mmx;
    pf->memcpy_aligned = x264_memcpy_aligned_mmx;
+    pf->memzero_aligned = x264_memzero_aligned_mmx;
    if( !(cpu&X264_CPU_MMXEXT) )
        return;
@@ -278,6 +281,7 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
        return;
    pf->memcpy_aligned = x264_memcpy_aligned_sse2;
+    pf->memzero_aligned = x264_memzero_aligned_sse2;
    pf->hpel_filter = x264_hpel_filter_sse2_amd;
    // disable on AMD processors since it is slower
@@ -837,7 +837,7 @@ int x264_me_refine_bidir( x264_t *h, x264_me_t *m0, x264_me_t *m1, int i_weight
    int bcost = COST_MAX;
    int pass = 0;
    uint8_t visited[8][8][8][8];
-    memset( visited, 0, sizeof(visited) );
+    h->mc.memzero_aligned( visited, sizeof(visited) );
    BIME_CACHE( 0, 0 );
    CHECK_BIDIR( 0, 0, 0, 0 );
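This caller fits the new function's restrictions: sizeof(visited) is 8*8*8*8 = 4096 bytes, a multiple of 128 and well above 64, so even the SSE2 memzero is legal here, provided the array is 16-byte aligned as the SIMD path assumes. A tiny standalone check of that arithmetic (illustrative only, not x264 code):

#include <assert.h>
#include <stdint.h>

int main( void )
{
    uint8_t visited[8][8][8][8];
    /* 8*8*8*8 = 4096 = 32 * 128, so the mod-128 and >= 64 size rules hold. */
    assert( sizeof(visited) == 4096 );
    assert( sizeof(visited) % 128 == 0 && sizeof(visited) >= 64 );
    (void)visited;
    return 0;
}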