Commit 8d1ebe2e authored by Loren Merritt

prefetch pixels for motion compensation and deblocking.


git-svn-id: svn://svn.videolan.org/x264/trunk@590 df754926-b1dd-0310-bc7b-ec298dee348c
parent 9fadbd7b
@@ -72,6 +72,9 @@ cglobal x264_mc_copy_w16_sse2
cglobal x264_mc_chroma_mmxext
cglobal x264_prefetch_fenc_mmxext
cglobal x264_prefetch_ref_mmxext
;=============================================================================
; pixel avg
;=============================================================================
@@ -549,3 +552,49 @@ ALIGN 4
dec r11d
jnz .height_loop1_w8
rep ret
;-----------------------------------------------------------------------------
; void x264_prefetch_fenc_mmxext( uint8_t *pix_y, int stride_y,
; uint8_t *pix_uv, int stride_uv, int mb_x )
;-----------------------------------------------------------------------------
ALIGN 16
x264_prefetch_fenc_mmxext:
mov eax, parm5d
and eax, 3
imul eax, parm2d
lea parm1q, [parm1q+rax*4+64]
prefetcht0 [parm1q]
prefetcht0 [parm1q+parm2q]
lea parm1q, [parm1q+parm2q*2]
prefetcht0 [parm1q]
prefetcht0 [parm1q+parm2q]
mov eax, parm5d
and eax, 6
imul eax, parm4d
lea parm3q, [parm3q+rax+64]
prefetcht0 [parm3q]
prefetcht0 [parm3q+parm4q]
ret
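;-----------------------------------------------------------------------------
; Roughly the same pattern in C (a sketch, not part of this commit; the name
; prefetch_fenc_c is hypothetical and GCC's __builtin_prefetch stands in for
; prefetcht0). Each call touches 4 of the 16 luma rows and 2 of the 8 chroma
; rows, rotating with mb_x so that consecutive macroblocks cover the rest:
;
; static void prefetch_fenc_c( uint8_t *pix_y, int stride_y,
;                              uint8_t *pix_uv, int stride_uv, int mb_x )
; {
;     /* +64 bytes = one cacheline ahead of the current position */
;     pix_y += (mb_x&3) * stride_y * 4 + 64;
;     __builtin_prefetch( &pix_y[0] );
;     __builtin_prefetch( &pix_y[stride_y] );
;     pix_y += stride_y * 2;
;     __builtin_prefetch( &pix_y[0] );
;     __builtin_prefetch( &pix_y[stride_y] );
;     pix_uv += (mb_x&6) * stride_uv + 64;
;     __builtin_prefetch( &pix_uv[0] );
;     __builtin_prefetch( &pix_uv[stride_uv] );
; }
;-----------------------------------------------------------------------------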
;-----------------------------------------------------------------------------
; void x264_prefetch_ref_mmxext( uint8_t *pix, int stride, int parity )
;-----------------------------------------------------------------------------
ALIGN 16
x264_prefetch_ref_mmxext:
dec parm3d
and parm3d, parm2d
lea parm1q, [parm1q+parm3q*8+64]
lea rax, [parm2q*3]
prefetcht0 [parm1q]
prefetcht0 [parm1q+parm2q]
prefetcht0 [parm1q+parm2q*2]
prefetcht0 [parm1q+rax]
lea parm1q, [parm1q+parm2q*4]
prefetcht0 [parm1q]
prefetcht0 [parm1q+parm2q]
prefetcht0 [parm1q+parm2q*2]
prefetcht0 [parm1q+rax]
ret
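;-----------------------------------------------------------------------------
; In C, the above is roughly (again a sketch, not part of this commit;
; prefetch_ref_c is a hypothetical name):
;
; static void prefetch_ref_c( uint8_t *pix, int stride, int parity )
; {
;     int i;
;     /* (parity-1)&stride is 0 when parity==1 and stride when parity==0,
;      * so the two calls per macroblock each cover a different 8 rows */
;     pix += ((parity-1) & stride) * 8 + 64;
;     for( i = 0; i < 8; i++ )
;         __builtin_prefetch( &pix[i*stride] );
; }
;-----------------------------------------------------------------------------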
@@ -536,6 +536,8 @@ void x264_frame_deblocking_filter( x264_t *h, int i_slice_type )
i_pix_y[2] -= 7*h->fdec->i_stride[2];
}
x264_prefetch_fenc( h, h->fdec, mb_x, mb_y );
/* cavlc + 8x8 transform stores nnz per 16 coeffs for the purpose of
* entropy coding, but per 64 coeffs for the purpose of deblocking */
if( !h->param.b_cabac && b_8x8_transform )
......
@@ -72,6 +72,9 @@ cglobal x264_mc_copy_w16_sse2
cglobal x264_mc_chroma_mmxext
cglobal x264_prefetch_fenc_mmxext
cglobal x264_prefetch_ref_mmxext
;=============================================================================
; pixel avg
;=============================================================================
@@ -595,3 +598,59 @@ ALIGN 4
pop edi
picpop ebx
ret
; prefetches tuned for 64 byte cachelines (K7/K8/Core2)
; TODO add 32 and 128 byte versions for P3/P4
;-----------------------------------------------------------------------------
; void x264_prefetch_fenc_mmxext( uint8_t *pix_y, int stride_y,
; uint8_t *pix_uv, int stride_uv, int mb_x )
;-----------------------------------------------------------------------------
ALIGN 16
x264_prefetch_fenc_mmxext:
mov eax, [esp+20]
mov ecx, [esp+8]
mov edx, [esp+4]
and eax, 3
imul eax, ecx
lea edx, [edx+eax*4+64]
prefetcht0 [edx]
prefetcht0 [edx+ecx]
lea edx, [edx+ecx*2]
prefetcht0 [edx]
prefetcht0 [edx+ecx]
mov eax, [esp+20]
mov ecx, [esp+16]
mov edx, [esp+12]
and eax, 6
imul eax, ecx
lea edx, [edx+eax+64]
prefetcht0 [edx]
prefetcht0 [edx+ecx]
ret
;-----------------------------------------------------------------------------
; void x264_prefetch_ref_mmxext( uint8_t *pix, int stride, int parity )
;-----------------------------------------------------------------------------
ALIGN 16
x264_prefetch_ref_mmxext:
mov eax, [esp+12]
mov ecx, [esp+8]
mov edx, [esp+4]
sub eax, 1
and eax, ecx
lea edx, [edx+eax*8+64]
lea eax, [ecx*3]
prefetcht0 [edx]
prefetcht0 [edx+ecx]
prefetcht0 [edx+ecx*2]
prefetcht0 [edx+eax]
lea edx, [edx+ecx*4]
prefetcht0 [edx]
prefetcht0 [edx+ecx]
prefetcht0 [edx+ecx*2]
prefetcht0 [edx+eax]
ret
@@ -40,6 +40,8 @@ extern void x264_mc_copy_w8_mmx( uint8_t *, int, uint8_t *, int, int );
extern void x264_mc_copy_w16_mmx( uint8_t *, int, uint8_t *, int, int );
extern void x264_mc_copy_w16_sse2( uint8_t *, int, uint8_t *, int, int );
extern void x264_plane_copy_mmxext( uint8_t *, int, uint8_t *, int, int w, int h);
extern void x264_prefetch_fenc_mmxext( uint8_t *, int, uint8_t *, int, int );
extern void x264_prefetch_ref_mmxext( uint8_t *, int, int );
#define AVG(W,H) \
static void x264_pixel_avg_ ## W ## x ## H ## _mmxext( uint8_t *dst, int i_dst, uint8_t *src, int i_src ) \
@@ -161,6 +163,9 @@ void x264_mc_mmxext_init( x264_mc_functions_t *pf )
pf->copy[PIXEL_4x4] = x264_mc_copy_w4_mmx;
pf->plane_copy = x264_plane_copy_mmxext;
pf->prefetch_fenc = x264_prefetch_fenc_mmxext;
pf->prefetch_ref = x264_prefetch_ref_mmxext;
}
void x264_mc_sse2_init( x264_mc_functions_t *pf )
{
......
@@ -927,6 +927,15 @@ void x264_macroblock_slice_init( x264_t *h )
memset( h->mb.cache.skip, 0, X264_SCAN8_SIZE * sizeof( int8_t ) );
}
void x264_prefetch_fenc( x264_t *h, x264_frame_t *fenc, int i_mb_x, int i_mb_y )
{
int stride_y = fenc->i_stride[0];
int stride_uv = fenc->i_stride[1];
int off_y = 16 * (i_mb_x + i_mb_y * stride_y);
int off_uv = 8 * (i_mb_x + i_mb_y * stride_uv);
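/* plane[1+(i_mb_x&1)] alternates between U and V from one macroblock
 * to the next, so both chroma planes get prefetched over time */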
h->mc.prefetch_fenc( fenc->plane[0]+off_y, stride_y,
fenc->plane[1+(i_mb_x&1)]+off_uv, stride_uv, i_mb_x );
}
void x264_macroblock_cache_load( x264_t *h, int i_mb_x, int i_mb_y )
{
@@ -1143,6 +1152,8 @@ void x264_macroblock_cache_load( x264_t *h, int i_mb_x, int i_mb_y )
h->mb.pic.p_integral[1][i] = &h->fref1[i]->integral[ 16 * ( i_mb_x + i_mb_y * h->fdec->i_stride[0] )];
}
x264_prefetch_fenc( h, h->fenc, i_mb_x, i_mb_y );
/* load ref/mv/mvd */
if( h->sh.i_type != SLICE_TYPE_I )
{
@@ -1359,6 +1370,8 @@ void x264_macroblock_cache_save( x264_t *h )
h->mb.pic.p_fdec[i], FDEC_STRIDE, w );
}
x264_prefetch_fenc( h, h->fdec, h->mb.i_mb_x, h->mb.i_mb_y );
h->mb.type[i_mb_xy] = i_mb_type;
if( h->mb.i_type != I_16x16 && h->mb.i_cbp_luma == 0 && h->mb.i_cbp_chroma == 0 )
......
@@ -248,6 +248,8 @@ void x264_macroblock_cache_end( x264_t *h );
void x264_macroblock_bipred_init( x264_t *h );
void x264_prefetch_fenc( x264_t *h, x264_frame_t *fenc, int i_mb_x, int i_mb_y );
/* x264_mb_predict_mv_16x16:
* set mvp with predicted mv for D_16x16 block
* h->mb. need only valid values from other blocks */
......
@@ -327,6 +327,13 @@ static void plane_copy( uint8_t *dst, int i_dst,
}
}
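/* no-op fallbacks, so call sites can invoke the prefetch hooks
 * unconditionally even when no asm version is available */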
void prefetch_fenc_null( uint8_t *pix_y, int stride_y,
uint8_t *pix_uv, int stride_uv, int mb_x )
{}
void prefetch_ref_null( uint8_t *pix, int stride, int parity )
{}
void x264_mc_init( int cpu, x264_mc_functions_t *pf )
{
pf->mc_luma = mc_luma;
@@ -361,6 +368,9 @@ void x264_mc_init( int cpu, x264_mc_functions_t *pf )
pf->plane_copy = plane_copy;
pf->prefetch_fenc = prefetch_fenc_null;
pf->prefetch_ref = prefetch_ref_null;
#ifdef HAVE_MMXEXT
if( cpu&X264_CPU_MMXEXT ) {
x264_mc_mmxext_init( pf );
......
@@ -55,6 +55,13 @@ typedef struct
void (*plane_copy)( uint8_t *dst, int i_dst,
uint8_t *src, int i_src, int w, int h);
/* prefetch the next few macroblocks of fenc or fdec */
void (*prefetch_fenc)( uint8_t *pix_y, int stride_y,
uint8_t *pix_uv, int stride_uv, int mb_x );
/* prefetch the next few macroblocks of a hpel reference frame */
void (*prefetch_ref)( uint8_t *pix, int stride, int parity );
} x264_mc_functions_t;
void x264_mc_init( int cpu, x264_mc_functions_t *pf );
......
@@ -1996,6 +1996,8 @@ void x264_macroblock_analyse( x264_t *h )
int b_skip = 0;
int i_intra_cost, i_intra_type;
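/* each prefetch_ref call covers 8 of the reference MB's 16 rows:
 * parity 0 here and parity 1 below together cover the whole MB */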
h->mc.prefetch_ref( h->mb.pic.p_fref[0][0][h->mb.i_mb_x&3], h->mb.pic.i_stride[0], 0 );
/* Fast P_SKIP detection */
analysis.b_try_pskip = 0;
if( h->param.analyse.b_fast_pskip )
@@ -2009,6 +2011,8 @@ void x264_macroblock_analyse( x264_t *h )
b_skip = x264_macroblock_probe_pskip( h );
}
h->mc.prefetch_ref( h->mb.pic.p_fref[0][0][h->mb.i_mb_x&3], h->mb.pic.i_stride[0], 1 );
if( b_skip )
{
h->mb.i_type = P_SKIP;
......