Commit 5469a4ba authored by Fiona Glaser's avatar Fiona Glaser Committed by Loren Merritt

memcpy_aligned_sse2

parent 9d0c0a90
......@@ -27,7 +27,9 @@
typedef struct
{
/* context */
uint8_t state[460];
DECLARE_ALIGNED( uint8_t, state[460], 16 );
int f8_bits_encoded; // only if using x264_cabac_size_decision()
/* state */
int i_low;
......@@ -36,7 +38,6 @@ typedef struct
/* bit stream */
int i_queue;
int i_bytes_outstanding;
int f8_bits_encoded; // only if using x264_cabac_size_decision()
uint8_t *p_start;
uint8_t *p;
......
......@@ -502,7 +502,7 @@ int x264_mb_predict_mv_direct16x16( x264_t *h, int *b_changed )
for( l = 0; l < 2; l++ )
for( i = 0; i < 4; i++ )
h->mb.cache.direct_ref[l][i] = h->mb.cache.ref[l][x264_scan8[i*4]];
memcpy(h->mb.cache.direct_mv, h->mb.cache.mv, sizeof(h->mb.cache.mv));
h->mc.memcpy_aligned(h->mb.cache.direct_mv, h->mb.cache.mv, sizeof(h->mb.cache.mv));
}
return b_available;
......
......@@ -372,6 +372,7 @@ void x264_mc_init( int cpu, x264_mc_functions_t *pf )
pf->prefetch_fenc = prefetch_fenc_null;
pf->prefetch_ref = prefetch_ref_null;
pf->memcpy_aligned = memcpy;
#ifdef HAVE_MMX
x264_mc_init_mmx( cpu, pf );
......
......@@ -65,6 +65,8 @@ typedef struct
uint8_t *pix_uv, int stride_uv, int mb_x );
/* prefetch the next few macroblocks of a hpel reference frame */
void (*prefetch_ref)( uint8_t *pix, int stride, int parity );
void *(*memcpy_aligned)( void *dst, const void *src, size_t n );
} x264_mc_functions_t;
......
......@@ -336,3 +336,56 @@ cglobal x264_plane_copy_mmxext, 6,7
emms
RET
;-----------------------------------------------------------------------------
; void *x264_memcpy_aligned_mmx( void *dst, const void *src, size_t n );
;-----------------------------------------------------------------------------
; Copy n bytes from src to dst using MMX registers, walking downward from
; the end of the buffers.
; NOTE(review): n is assumed to be a multiple of 16 and large enough that
; the 32-byte loop runs at least once (the loop subtracts before testing,
; so (n & ~16) must be >= 32 or the first iteration reads out of bounds) --
; confirm against callers.
; NOTE(review): no emms here; presumably the caller or calling convention
; wrappers handle MMX/FPU state -- confirm against x264's x86inc conventions.
cglobal x264_memcpy_aligned_mmx, 3,3
; if n is an odd multiple of 16, peel off the trailing 16 bytes first
; so the remaining length is a multiple of 32
test r2d, 16
jz .copy32
sub r2d, 16
movq mm0, [r1 + r2 + 0]
movq mm1, [r1 + r2 + 8]
movq [r0 + r2 + 0], mm0
movq [r0 + r2 + 8], mm1
; main loop: 32 bytes per iteration; `jg` repeats while the new offset
; in r2d is still > 0 after the subtraction
.copy32:
sub r2d, 32
movq mm0, [r1 + r2 + 0]
movq mm1, [r1 + r2 + 8]
movq mm2, [r1 + r2 + 16]
movq mm3, [r1 + r2 + 24]
movq [r0 + r2 + 0], mm0
movq [r0 + r2 + 8], mm1
movq [r0 + r2 + 16], mm2
movq [r0 + r2 + 24], mm3
jg .copy32
REP_RET
;-----------------------------------------------------------------------------
; void *x264_memcpy_aligned_sse2( void *dst, const void *src, size_t n );
;-----------------------------------------------------------------------------
; Copy n bytes from src to dst with aligned SSE2 loads/stores (movdqa),
; walking downward from the end of the buffers.
; NOTE(review): movdqa requires dst and src to be 16-byte aligned, and n
; is assumed to be a multiple of 16 with the residue after peeling the
; 16- and 32-byte tails being >= 64 (the final loop subtracts before
; testing) -- confirm against callers.
cglobal x264_memcpy_aligned_sse2, 3,3
; peel a trailing 16-byte chunk if n is an odd multiple of 16
test r2d, 16
jz .copy32
sub r2d, 16
movdqa xmm0, [r1 + r2]
movdqa [r0 + r2], xmm0
; peel a trailing 32-byte chunk so the remainder is a multiple of 64
.copy32:
test r2d, 32
jz .copy64
sub r2d, 32
movdqa xmm0, [r1 + r2 + 0]
movdqa xmm1, [r1 + r2 + 16]
movdqa [r0 + r2 + 0], xmm0
movdqa [r0 + r2 + 16], xmm1
; main loop: 64 bytes per iteration; `jg` repeats while the new offset
; in r2d is still > 0 after the subtraction
.copy64:
sub r2d, 64
movdqa xmm0, [r1 + r2 + 0]
movdqa xmm1, [r1 + r2 + 16]
movdqa xmm2, [r1 + r2 + 32]
movdqa xmm3, [r1 + r2 + 48]
movdqa [r0 + r2 + 0], xmm0
movdqa [r0 + r2 + 16], xmm1
movdqa [r0 + r2 + 32], xmm2
movdqa [r0 + r2 + 48], xmm3
jg .copy64
REP_RET
......@@ -56,6 +56,8 @@ extern void x264_mc_chroma_mmxext( uint8_t *src, int i_src_stride,
extern void x264_plane_copy_mmxext( uint8_t *, int, uint8_t *, int, int w, int h);
extern void x264_hpel_filter_mmxext( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src,
int i_stride, int i_width, int i_height );
extern void *x264_memcpy_aligned_mmx( void * dst, const void * src, size_t n );
extern void *x264_memcpy_aligned_sse2( void * dst, const void * src, size_t n );
#define AVG_WEIGHT(W,H) \
void x264_pixel_avg_weight_ ## W ## x ## H ## _mmxext( uint8_t *dst, int i_dst, uint8_t *src, int i_src, int i_weight_dst ) \
......@@ -144,6 +146,7 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
pf->copy[PIXEL_16x16] = x264_mc_copy_w16_mmx;
pf->copy[PIXEL_8x8] = x264_mc_copy_w8_mmx;
pf->copy[PIXEL_4x4] = x264_mc_copy_w4_mmx;
pf->memcpy_aligned = x264_memcpy_aligned_mmx;
if( !(cpu&X264_CPU_MMXEXT) )
return;
......@@ -175,5 +178,8 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
pf->prefetch_fenc = x264_prefetch_fenc_mmxext;
pf->prefetch_ref = x264_prefetch_ref_mmxext;
/* todo: use sse2 */
if( !(cpu&X264_CPU_SSE2) )
return;
pf->memcpy_aligned = x264_memcpy_aligned_sse2;
}
......@@ -82,8 +82,8 @@ static int x264_rd_cost_mb( x264_t *h, int i_lambda2 )
}
else if( h->param.b_cabac )
{
x264_cabac_t cabac_tmp = h->cabac;
cabac_tmp.f8_bits_encoded = 0;
x264_cabac_t cabac_tmp;
h->mc.memcpy_aligned( &cabac_tmp, &h->cabac, offsetof(x264_cabac_t,i_low) );
x264_macroblock_size_cabac( h, &cabac_tmp );
i_bits = ( cabac_tmp.f8_bits_encoded * i_lambda2 + 128 ) >> 8;
}
......@@ -124,8 +124,8 @@ int x264_rd_cost_part( x264_t *h, int i_lambda2, int i8, int i_pixel )
if( h->param.b_cabac )
{
x264_cabac_t cabac_tmp = h->cabac;
cabac_tmp.f8_bits_encoded = 0;
x264_cabac_t cabac_tmp;
h->mc.memcpy_aligned( &cabac_tmp, &h->cabac, offsetof(x264_cabac_t,i_low) );
x264_partition_size_cabac( h, &cabac_tmp, i8, i_pixel );
i_bits = ( cabac_tmp.f8_bits_encoded * i_lambda2 + 128 ) >> 8;
}
......@@ -146,8 +146,8 @@ int x264_rd_cost_i8x8( x264_t *h, int i_lambda2, int i8, int i_mode )
if( h->param.b_cabac )
{
x264_cabac_t cabac_tmp = h->cabac;
cabac_tmp.f8_bits_encoded = 0;
x264_cabac_t cabac_tmp;
h->mc.memcpy_aligned( &cabac_tmp, &h->cabac, offsetof(x264_cabac_t,i_low) );
x264_partition_i8x8_size_cabac( h, &cabac_tmp, i8, i_mode );
i_bits = ( cabac_tmp.f8_bits_encoded * i_lambda2 + 128 ) >> 8;
}
......@@ -168,8 +168,9 @@ int x264_rd_cost_i4x4( x264_t *h, int i_lambda2, int i4, int i_mode )
if( h->param.b_cabac )
{
x264_cabac_t cabac_tmp = h->cabac;
cabac_tmp.f8_bits_encoded = 0;
x264_cabac_t cabac_tmp;
h->mc.memcpy_aligned( &cabac_tmp, &h->cabac, offsetof(x264_cabac_t,i_low) );
x264_partition_i4x4_size_cabac( h, &cabac_tmp, i4, i_mode );
i_bits = ( cabac_tmp.f8_bits_encoded * i_lambda2 + 128 ) >> 8;
}
......@@ -194,8 +195,8 @@ int x264_rd_cost_i8x8_chroma( x264_t *h, int i_lambda2, int i_mode, int b_dct )
if( h->param.b_cabac )
{
x264_cabac_t cabac_tmp = h->cabac;
cabac_tmp.f8_bits_encoded = 0;
x264_cabac_t cabac_tmp;
h->mc.memcpy_aligned( &cabac_tmp, &h->cabac, offsetof(x264_cabac_t,i_low) );
x264_i8x8_chroma_size_cabac( h, &cabac_tmp );
i_bits = ( cabac_tmp.f8_bits_encoded * i_lambda2 + 128 ) >> 8;
}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment