Commit 46141bf2 authored by Loren Merritt's avatar Loren Merritt

Cache half-pixel interpolated reference frames, to avoid duplicate motion compensation.

30-50% speedup at subq=5.
Patch by Radek Czyz.


git-svn-id: svn://svn.videolan.org/x264/trunk@123 df754926-b1dd-0310-bc7b-ec298dee348c
parent d81fa19a
......@@ -305,7 +305,7 @@ struct x264_t
uint8_t *p_fdec[3];
/* pointer over mb of the references */
uint8_t *p_fref[2][16][3];
uint8_t *p_fref[2][16][4+2]; /* last: lN, lH, lV, lHV, cU, cV */
/* common stride */
int i_stride[3];
......@@ -393,7 +393,7 @@ struct x264_t
x264_predict_t predict_4x4[9+3];
x264_pixel_function_t pixf;
x264_mc_function_t mc[2];
x264_mc_functions_t mc;
x264_dct_function_t dctf;
x264_csp_function_t csp;
......
......@@ -66,6 +66,17 @@ x264_frame_t *x264_frame_new( x264_t *h )
frame->buffer[3] = NULL;
frame->plane[3] = NULL;
frame->filtered[0] = frame->plane[0];
for( i = 0; i < 3; i++ )
{
frame->buffer[4+i] = x264_malloc( frame->i_stride[0] *
( frame->i_lines[0] + 64 ) );
frame->filtered[i+1] = ((uint8_t*)frame->buffer[4+i]) +
frame->i_stride[0] * 32 + 32;
}
frame->i_poc = -1;
frame->i_type = X264_TYPE_AUTO;
frame->i_qpplus1 = 0;
......@@ -95,6 +106,10 @@ void x264_frame_delete( x264_frame_t *frame )
{
x264_free( frame->buffer[i] );
}
for( i = 4; i < 7; i++ ) /* filtered planes */
{
x264_free( frame->buffer[i] );
}
x264_free( frame->mv[0] );
x264_free( frame->mv[1] );
x264_free( frame->ref[0] );
......@@ -180,6 +195,47 @@ void x264_frame_expand_border( x264_frame_t *frame )
}
}
/* Pad the three half-pel filtered planes (H, V, HV) out to the full
 * reference border so motion compensation may read outside the picture.
 * The interpolation filter already produced 8 valid pixels beyond each
 * picture edge, so replication starts from that last filtered pixel
 * rather than from the picture edge itself.
 * NOTE(review): assumes i_stride[0] = picture width + 2*32 of horizontal
 * padding (see the 32-row over-allocation in x264_frame_new) — confirm. */
void x264_frame_expand_border_filtered( x264_frame_t *frame )
{
/* during filtering, 8 extra pixels were filtered on each edge.
we want to expand border from the last filtered pixel */
int w;
int i, y;
/* filtered[0] is plane[0] (already expanded elsewhere); only pad 1..3 */
for( i = 1; i < 4; i++ )
{
/* PPIXEL(0,0) addresses the top-left picture pixel of filtered plane i */
#define PPIXEL(x, y) ( frame->filtered[i] + (x) +(y)*frame->i_stride[0] )
w = 32; /* total padding per side; inner 8 columns/rows are already filtered */
/* replicate rows 8..31 above and below from the outermost filtered row */
for( y = 8; y < w; y++ )
{
/* upper band */
memcpy( PPIXEL(-8,-y-1), PPIXEL(-8,-8), frame->i_stride[0] - 2 * w + 16 );
/* up left corner */
memset( PPIXEL(-w,-y-1), PPIXEL(-8,-8)[0], w - 8 );
/* up right corner */
memset( PPIXEL(frame->i_stride[0] - 2*w + 8,-y-1), PPIXEL( frame->i_stride[0]-1-2*w+8,-8)[0], w - 8 );
/* lower band */
memcpy( PPIXEL(-8, frame->i_lines[0]+y), PPIXEL(-8,frame->i_lines[0]+7), frame->i_stride[0] - 2 * w + 16 );
/* low left corner */
memset( PPIXEL(-w, frame->i_lines[0]+y), PPIXEL(-8,frame->i_lines[0]+7)[0], w - 8);
/* low right corner */
memset( PPIXEL(frame->i_stride[0]-2*w+8, frame->i_lines[0]+y), PPIXEL(frame->i_stride[0]+7-2*w,frame->i_lines[0]+7)[0], w-8);
}
/* replicate columns 8..31 left and right of every row (including the
 * freshly created top/bottom bands' vertical extent) */
for( y = -8; y < frame->i_lines[0]+8; y++ )
{
/* left band */
memset( PPIXEL( -w, y ), PPIXEL( -8, y )[0], w - 8 );
/* right band */
memset( PPIXEL( frame->i_stride[0]-2*w + 8, y ), PPIXEL( frame->i_stride[0] + 7 - 2*w, y )[0], w - 8 );
}
#undef PPIXEL
}
}
/* FIXME these tables are duplicated with the ones in macroblock.c */
static const uint8_t block_idx_xy[4][4] =
{
......
......@@ -38,10 +38,11 @@ typedef struct
int i_stride[4];
int i_lines[4];
uint8_t *plane[4];
uint8_t *filtered[4]; /* plane[0], H, V, HV */
/* for unrestricted mv we allocate more data than needed
* allocated data are stored in buffer */
void *buffer[4];
void *buffer[7];
/* motion data */
int16_t (*mv[2])[2];
......@@ -58,6 +59,10 @@ void x264_frame_copy_picture( x264_t *h, x264_frame_t *dst, x264_pictur
void x264_frame_expand_border( x264_frame_t *frame );
void x264_frame_expand_border_filtered( x264_frame_t *frame );
void x264_frame_deblocking_filter( x264_t *h, int i_slice_type );
void x264_frame_filter( int cpu, x264_frame_t *frame );
#endif
......@@ -1021,12 +1021,136 @@ static void motion_compensation_luma_sse2( uint8_t *src, int i_src_stride,
MOTION_COMPENSATION_LUMA
}
void x264_mc_mmxext_init( x264_mc_function_t pf[2] )
/* Quarter-pel luma motion compensation using the cached half-pel planes
 * (MMXEXT kernels).
 * src[4] holds the four filtered reference planes {full, H, V, HV}, all
 * sharing i_src_stride; (mvx,mvy) is a quarter-pel motion vector.
 * Half-pel positions are copied straight from the matching cached plane;
 * quarter-pel positions average the two nearest half-pel samples. */
void mc_luma_mmx( uint8_t *src[4], int i_src_stride,
                  uint8_t *dst, int i_dst_stride,
                  int mvx,int mvy,
                  int i_width, int i_height )
{
    uint8_t *src1, *src2;

    /* todo : fixme... */
    /* On the two "diagonal" qpel positions the rounding of the two
     * half-pel source coordinates must be biased in opposite directions.
     * Parenthesized explicitly: && binds tighter than ||. */
    int correction = (((mvx&3) == 3 && (mvy&3) == 1) || ((mvx&3) == 1 && (mvy&3) == 3)) ? 1 : 0;

    int hpel1x = mvx>>1;
    int hpel1y = (mvy+1-correction)>>1;
    /* plane selector: bit0 = horizontal half-pel, bit1 = vertical half-pel */
    int filter1 = (hpel1x & 1) + ( (hpel1y & 1) << 1 );

    src1 = src[filter1] + (hpel1y >> 1) * i_src_stride + (hpel1x >> 1);

    if ( (mvx|mvy) & 1 ) /* qpel interpolation needed */
    {
        int hpel2x = (mvx+1)>>1;
        int hpel2y = (mvy+correction)>>1;
        int filter2 = (hpel2x & 1) + ( (hpel2y & 1) << 1 );

        src2 = src[filter2] + (hpel2y >> 1) * i_src_stride + (hpel2x >> 1);

        switch( i_width ) {
        case 4:
            x264_pixel_avg_w4_mmxext( dst, i_dst_stride, src1, i_src_stride,
                                      src2, i_src_stride, i_height );
            break;
        case 8:
            x264_pixel_avg_w8_mmxext( dst, i_dst_stride, src1, i_src_stride,
                                      src2, i_src_stride, i_height );
            break;
        case 16:
        default:
            x264_pixel_avg_w16_mmxext( dst, i_dst_stride, src1, i_src_stride,
                                       src2, i_src_stride, i_height );
        }
    }
    else
    {
        /* pure half-pel (or full-pel) position: plain copy from the plane */
        switch( i_width ) {
        case 4:
            x264_mc_copy_w4_mmxext( src1, i_src_stride, dst, i_dst_stride, i_height );
            break;
        case 8:
            x264_mc_copy_w8_mmxext( src1, i_src_stride, dst, i_dst_stride, i_height );
            break;
        case 16:
            x264_mc_copy_w16_mmxext( src1, i_src_stride, dst, i_dst_stride, i_height );
            break;
        }
    }
}
/* Like mc_luma_mmx, but avoids the copy for half-pel positions: when no
 * quarter-pel averaging is needed it returns a pointer directly into the
 * cached plane and writes the plane's stride to *i_dst_stride.  Otherwise
 * the average is written to dst (stride *i_dst_stride, unchanged) and dst
 * is returned. */
uint8_t *get_ref_mmx( uint8_t *src[4], int i_src_stride,
                      uint8_t *dst, int *i_dst_stride,
                      int mvx,int mvy,
                      int i_width, int i_height )
{
    uint8_t *src1, *src2;

    /* todo : fixme... */
    /* diagonal qpel positions need opposite rounding bias on the two
     * half-pel coordinates; parenthesized: && binds tighter than || */
    int correction = (((mvx&3) == 3 && (mvy&3) == 1) || ((mvx&3) == 1 && (mvy&3) == 3)) ? 1 : 0;

    int hpel1x = mvx>>1;
    int hpel1y = (mvy+1-correction)>>1;
    /* plane selector: bit0 = horizontal half-pel, bit1 = vertical half-pel */
    int filter1 = (hpel1x & 1) + ( (hpel1y & 1) << 1 );

    src1 = src[filter1] + (hpel1y >> 1) * i_src_stride + (hpel1x >> 1);

    if ( (mvx|mvy) & 1 ) /* qpel interpolation needed */
    {
        int hpel2x = (mvx+1)>>1;
        int hpel2y = (mvy+correction)>>1;
        int filter2 = (hpel2x & 1) + ( (hpel2y & 1) << 1 );

        src2 = src[filter2] + (hpel2y >> 1) * i_src_stride + (hpel2x >> 1);

        switch( i_width ) {
        case 4:
            x264_pixel_avg_w4_mmxext( dst, *i_dst_stride, src1, i_src_stride,
                                      src2, i_src_stride, i_height );
            break;
        case 8:
            x264_pixel_avg_w8_mmxext( dst, *i_dst_stride, src1, i_src_stride,
                                      src2, i_src_stride, i_height );
            break;
        case 16:
        default:
            x264_pixel_avg_w16_mmxext( dst, *i_dst_stride, src1, i_src_stride,
                                       src2, i_src_stride, i_height );
        }
        return dst;
    }
    else
    {
        /* half-pel position: hand back the cached plane directly */
        *i_dst_stride = i_src_stride;
        return src1;
    }
}
/* Install the MMXEXT motion-compensation entry points.
 * (Removed a stale pre-refactor line that indexed pf with MC_LUMA;
 * pf is now a struct pointer, not a function-pointer array.) */
void x264_mc_mmxext_init( x264_mc_functions_t *pf )
{
    pf->mc_luma = mc_luma_mmx;
    pf->get_ref = get_ref_mmx;
}
void x264_mc_sse2_init( x264_mc_function_t pf[2] )
void x264_mc_sse2_init( x264_mc_functions_t *pf )
{
pf[MC_LUMA] = motion_compensation_luma_sse2;
/* todo: use sse2 */
pf->mc_luma = mc_luma_mmx;
pf->get_ref = get_ref_mmx;
}
/* Hand back the 16-pixel-wide half-pel interpolation kernels
 * (horizontal, vertical, diagonal) for use by x264_frame_filter(). */
void get_funcs_mmx( pf_mc_t *int_h, pf_mc_t *int_v, pf_mc_t *int_hv )
{
    *int_hv = mc_hc_w16;
    *int_v  = mc_hv_w16;
    *int_h  = mc_hh_w16;
}
/* SSE2 variant of get_funcs_mmx; currently returns the exact same
 * w16 kernels as the MMX version (no SSE2-specific kernels yet). */
void get_funcs_sse2( pf_mc_t *int_h, pf_mc_t *int_v, pf_mc_t *int_hv )
{
    *int_hv = mc_hc_w16;
    *int_v  = mc_hv_w16;
    *int_h  = mc_hh_w16;
}
......@@ -24,7 +24,7 @@
#ifndef _I386_MC_H
#define _I386_MC_H 1
void x264_mc_mmxext_init( x264_mc_function_t pf[2] );
void x264_mc_sse2_init( x264_mc_function_t pf[2] );
void x264_mc_mmxext_init( x264_mc_functions_t *pf );
void x264_mc_sse2_init( x264_mc_functions_t *pf );
#endif
......@@ -577,15 +577,15 @@ static inline void x264_mb_mc_0xywh( x264_t *h, int x, int y, int width, int hei
const int mvx = x264_clip3( h->mb.cache.mv[0][i8][0], h->mb.mv_min[0], h->mb.mv_max[0] );
const int mvy = x264_clip3( h->mb.cache.mv[0][i8][1], h->mb.mv_min[1], h->mb.mv_max[1] );
h->mc[MC_LUMA]( &h->mb.pic.p_fref[0][i_ref][0][4*y * h->mb.pic.i_stride[0]+4*x], h->mb.pic.i_stride[0],
h->mc.mc_luma( h->mb.pic.p_fref[0][i_ref], h->mb.pic.i_stride[0],
&h->mb.pic.p_fdec[0][4*y * h->mb.pic.i_stride[0]+4*x], h->mb.pic.i_stride[0],
mvx, mvy, 4*width, 4*height );
mvx + 4*4*x, mvy + 4*4*y, 4*width, 4*height );
h->mc[MC_CHROMA]( &h->mb.pic.p_fref[0][i_ref][1][2*y*h->mb.pic.i_stride[1]+2*x], h->mb.pic.i_stride[1],
h->mc.mc_chroma( &h->mb.pic.p_fref[0][i_ref][4][2*y*h->mb.pic.i_stride[1]+2*x], h->mb.pic.i_stride[1],
&h->mb.pic.p_fdec[1][2*y*h->mb.pic.i_stride[1]+2*x], h->mb.pic.i_stride[1],
mvx, mvy, 2*width, 2*height );
h->mc[MC_CHROMA]( &h->mb.pic.p_fref[0][i_ref][2][2*y*h->mb.pic.i_stride[2]+2*x], h->mb.pic.i_stride[2],
h->mc.mc_chroma( &h->mb.pic.p_fref[0][i_ref][5][2*y*h->mb.pic.i_stride[2]+2*x], h->mb.pic.i_stride[2],
&h->mb.pic.p_fdec[2][2*y*h->mb.pic.i_stride[2]+2*x], h->mb.pic.i_stride[2],
mvx, mvy, 2*width, 2*height );
}
......@@ -596,15 +596,15 @@ static inline void x264_mb_mc_1xywh( x264_t *h, int x, int y, int width, int hei
const int mvx = x264_clip3( h->mb.cache.mv[1][i8][0], h->mb.mv_min[0], h->mb.mv_max[0] );
const int mvy = x264_clip3( h->mb.cache.mv[1][i8][1], h->mb.mv_min[1], h->mb.mv_max[1] );
h->mc[MC_LUMA]( &h->mb.pic.p_fref[1][i_ref][0][4*y * h->mb.pic.i_stride[0]+4*x], h->mb.pic.i_stride[0],
h->mc.mc_luma( h->mb.pic.p_fref[1][i_ref], h->mb.pic.i_stride[0],
&h->mb.pic.p_fdec[0][4*y *h->mb.pic.i_stride[0]+4*x], h->mb.pic.i_stride[0],
mvx, mvy, 4*width, 4*height );
mvx + 4*4*x, mvy + 4*4*y, 4*width, 4*height );
h->mc[MC_CHROMA]( &h->mb.pic.p_fref[1][i_ref][1][2*y*h->mb.pic.i_stride[1]+2*x], h->mb.pic.i_stride[1],
h->mc.mc_chroma( &h->mb.pic.p_fref[1][i_ref][4][2*y*h->mb.pic.i_stride[1]+2*x], h->mb.pic.i_stride[1],
&h->mb.pic.p_fdec[1][2*y*h->mb.pic.i_stride[1]+2*x], h->mb.pic.i_stride[1],
mvx, mvy, 2*width, 2*height );
h->mc[MC_CHROMA]( &h->mb.pic.p_fref[1][i_ref][2][2*y*h->mb.pic.i_stride[2]+2*x], h->mb.pic.i_stride[2],
h->mc.mc_chroma( &h->mb.pic.p_fref[1][i_ref][5][2*y*h->mb.pic.i_stride[2]+2*x], h->mb.pic.i_stride[2],
&h->mb.pic.p_fdec[2][2*y*h->mb.pic.i_stride[2]+2*x], h->mb.pic.i_stride[2],
mvx, mvy, 2*width, 2*height );
}
......@@ -631,26 +631,26 @@ static inline void x264_mb_mc_01xywh( x264_t *h, int x, int y, int width, int he
else if( width == 1 && height == 2 ) i_mode = PIXEL_4x8;
else if( width == 1 && height == 1 ) i_mode = PIXEL_4x4;
h->mc[MC_LUMA]( &h->mb.pic.p_fref[0][i_ref0][0][4*y * h->mb.pic.i_stride[0]+4*x], h->mb.pic.i_stride[0],
h->mc.mc_luma( h->mb.pic.p_fref[0][i_ref0], h->mb.pic.i_stride[0],
&h->mb.pic.p_fdec[0][4*y *h->mb.pic.i_stride[0]+4*x], h->mb.pic.i_stride[0],
mvx0, mvy0, 4*width, 4*height );
h->mc[MC_CHROMA]( &h->mb.pic.p_fref[0][i_ref0][1][2*y*h->mb.pic.i_stride[1]+2*x], h->mb.pic.i_stride[1],
mvx0 + 4*4*x, mvy0 + 4*4*y, 4*width, 4*height );
h->mc.mc_chroma( &h->mb.pic.p_fref[0][i_ref0][4][2*y*h->mb.pic.i_stride[1]+2*x], h->mb.pic.i_stride[1],
&h->mb.pic.p_fdec[1][2*y*h->mb.pic.i_stride[1]+2*x], h->mb.pic.i_stride[1],
mvx0, mvy0, 2*width, 2*height );
h->mc[MC_CHROMA]( &h->mb.pic.p_fref[0][i_ref0][2][2*y*h->mb.pic.i_stride[2]+2*x], h->mb.pic.i_stride[2],
h->mc.mc_chroma( &h->mb.pic.p_fref[0][i_ref0][5][2*y*h->mb.pic.i_stride[2]+2*x], h->mb.pic.i_stride[2],
&h->mb.pic.p_fdec[2][2*y*h->mb.pic.i_stride[2]+2*x], h->mb.pic.i_stride[2],
mvx0, mvy0, 2*width, 2*height );
h->mc[MC_LUMA]( &h->mb.pic.p_fref[1][i_ref1][0][4*y * h->mb.pic.i_stride[0]+4*x], h->mb.pic.i_stride[0],
tmp, 16, mvx1, mvy1, 4*width, 4*height );
h->mc.mc_luma( h->mb.pic.p_fref[1][i_ref1], h->mb.pic.i_stride[0],
tmp, 16, mvx1 + 4*4*x, mvy1 + 4*4*y, 4*width, 4*height );
h->pixf.avg[i_mode]( &h->mb.pic.p_fdec[0][4*y *h->mb.pic.i_stride[0]+4*x], h->mb.pic.i_stride[0], tmp, 16 );
h->mc[MC_CHROMA]( &h->mb.pic.p_fref[1][i_ref1][1][2*y*h->mb.pic.i_stride[1]+2*x], h->mb.pic.i_stride[1],
h->mc.mc_chroma( &h->mb.pic.p_fref[1][i_ref1][4][2*y*h->mb.pic.i_stride[1]+2*x], h->mb.pic.i_stride[1],
tmp, 16, mvx1, mvy1, 2*width, 2*height );
h->pixf.avg[i_mode]( &h->mb.pic.p_fdec[1][2*y*h->mb.pic.i_stride[1]+2*x], h->mb.pic.i_stride[1], tmp, 16 );
h->mc[MC_CHROMA]( &h->mb.pic.p_fref[1][i_ref1][2][2*y*h->mb.pic.i_stride[2]+2*x], h->mb.pic.i_stride[2],
h->mc.mc_chroma( &h->mb.pic.p_fref[1][i_ref1][5][2*y*h->mb.pic.i_stride[2]+2*x], h->mb.pic.i_stride[2],
tmp, 16, mvx1, mvy1, 2*width, 2*height );
h->pixf.avg[i_mode]( &h->mb.pic.p_fdec[2][2*y*h->mb.pic.i_stride[2]+2*x], h->mb.pic.i_stride[2], tmp, 16 );
}
......@@ -946,11 +946,13 @@ void x264_macroblock_cache_load( x264_t *h, int i_mb_x, int i_mb_y )
for( j = 0; j < h->i_ref0; j++ )
{
h->mb.pic.p_fref[0][j][i] = &h->fref0[j]->plane[i][ w * ( i_mb_x + i_mb_y * i_stride )];
h->mb.pic.p_fref[0][j][i==0 ? 0:i+3] = &h->fref0[j]->plane[i][ w * ( i_mb_x + i_mb_y * i_stride )];
h->mb.pic.p_fref[0][j][i+1] = &h->fref0[j]->filtered[i+1][ 16 * ( i_mb_x + i_mb_y * h->fdec->i_stride[0] )];
}
for( j = 0; j < h->i_ref1; j++ )
{
h->mb.pic.p_fref[1][j][i] = &h->fref1[j]->plane[i][ w * ( i_mb_x + i_mb_y * i_stride )];
h->mb.pic.p_fref[1][j][i==0 ? 0:i+3] = &h->fref1[j]->plane[i][ w * ( i_mb_x + i_mb_y * i_stride )];
h->mb.pic.p_fref[1][j][i+1] = &h->fref1[j]->filtered[i+1][ 16 * ( i_mb_x + i_mb_y * h->fdec->i_stride[0] )];
}
}
......
......@@ -35,6 +35,7 @@
#include "mc.h"
#include "clip1.h"
#include "frame.h"
#ifdef _MSC_VER
#undef HAVE_MMXEXT /* not finished now */
......@@ -270,6 +271,80 @@ static void motion_compensation_luma( uint8_t *src, int i_src_stride,
pf_mc[mvy&0x03][mvx&0x03]( src, i_src_stride, dst, i_dst_stride, i_width, i_height );
}
/* Portable C quarter-pel luma motion compensation from the cached
 * half-pel planes.
 * src[4] holds the filtered reference planes {full, H, V, HV}, all with
 * stride i_src_stride; (mvx,mvy) is a quarter-pel motion vector.
 * Half-pel positions are copied from the matching plane; quarter-pel
 * positions average the two nearest half-pel samples. */
void mc_luma( uint8_t *src[4], int i_src_stride,
              uint8_t *dst, int i_dst_stride,
              int mvx,int mvy,
              int i_width, int i_height )
{
    uint8_t *src1, *src2;

    /* todo : fixme... */
    /* diagonal qpel positions need opposite rounding bias on the two
     * half-pel coordinates; parenthesized: && binds tighter than || */
    int correction = (((mvx&3) == 3 && (mvy&3) == 1) || ((mvx&3) == 1 && (mvy&3) == 3)) ? 1 : 0;

    int hpel1x = mvx>>1;
    int hpel1y = (mvy+1-correction)>>1;
    /* plane selector: bit0 = horizontal half-pel, bit1 = vertical half-pel */
    int filter1 = (hpel1x & 1) + ( (hpel1y & 1) << 1 );

    src1 = src[filter1] + (hpel1y >> 1) * i_src_stride + (hpel1x >> 1);

    if ( (mvx|mvy) & 1 ) /* qpel interpolation needed */
    {
        int hpel2x = (mvx+1)>>1;
        int hpel2y = (mvy+correction)>>1;
        int filter2 = (hpel2x & 1) + ( (hpel2y & 1) << 1 );

        src2 = src[filter2] + (hpel2y >> 1) * i_src_stride + (hpel2x >> 1);

        pixel_avg( dst, i_dst_stride, src1, i_src_stride,
                   src2, i_src_stride, i_width, i_height );
    }
    else
    {
        mc_copy( src1, i_src_stride, dst, i_dst_stride, i_width, i_height );
    }
}
/* Copy-avoiding variant of mc_luma: for half-pel positions it returns a
 * pointer directly into the cached plane and stores the plane's stride
 * in *i_dst_stride; for quarter-pel positions it averages into dst
 * (stride *i_dst_stride, left unchanged) and returns dst. */
uint8_t *get_ref( uint8_t *src[4], int i_src_stride,
                  uint8_t *dst, int * i_dst_stride,
                  int mvx,int mvy,
                  int i_width, int i_height )
{
    uint8_t *src1, *src2;

    /* todo : fixme... */
    /* diagonal qpel positions need opposite rounding bias on the two
     * half-pel coordinates; parenthesized: && binds tighter than || */
    int correction = (((mvx&3) == 3 && (mvy&3) == 1) || ((mvx&3) == 1 && (mvy&3) == 3)) ? 1 : 0;

    int hpel1x = mvx>>1;
    int hpel1y = (mvy+1-correction)>>1;
    /* plane selector: bit0 = horizontal half-pel, bit1 = vertical half-pel */
    int filter1 = (hpel1x & 1) + ( (hpel1y & 1) << 1 );

    src1 = src[filter1] + (hpel1y >> 1) * i_src_stride + (hpel1x >> 1);

    if ( (mvx|mvy) & 1 ) /* qpel interpolation needed */
    {
        int hpel2x = (mvx+1)>>1;
        int hpel2y = (mvy+correction)>>1;
        int filter2 = (hpel2x & 1) + ( (hpel2y & 1) << 1 );

        src2 = src[filter2] + (hpel2y >> 1) * i_src_stride + (hpel2x >> 1);

        pixel_avg( dst, *i_dst_stride, src1, i_src_stride,
                   src2, i_src_stride, i_width, i_height );
        return dst;
    }
    else
    {
        /* half-pel position: hand back the cached plane directly */
        *i_dst_stride = i_src_stride;
        return src1;
    }
}
/* full chroma mc (ie until 1/8 pixel)*/
static void motion_compensation_chroma( uint8_t *src, int i_src_stride,
uint8_t *dst, int i_dst_stride,
......@@ -304,10 +379,11 @@ static void motion_compensation_chroma( uint8_t *src, int i_src_stride,
}
}
void x264_mc_init( int cpu, x264_mc_function_t pf[2] )
void x264_mc_init( int cpu, x264_mc_functions_t *pf )
{
pf[MC_LUMA] = motion_compensation_luma;
pf[MC_CHROMA] = motion_compensation_chroma;
pf->mc_luma = mc_luma;
pf->get_ref = get_ref;
pf->mc_chroma = motion_compensation_chroma;
#ifdef HAVE_MMXEXT
if( cpu&X264_CPU_MMXEXT )
......@@ -317,10 +393,54 @@ void x264_mc_init( int cpu, x264_mc_function_t pf[2] )
if( cpu&X264_CPU_SSE2 )
x264_mc_sse2_init( pf );
#endif
/*
#ifdef ARCH_PPC
if( cpu&X264_CPU_ALTIVEC )
x264_mc_altivec_init( pf );
#endif
*/
}
void get_funcs_mmx(pf_mc_t*, pf_mc_t*, pf_mc_t*);
void get_funcs_sse2(pf_mc_t*, pf_mc_t*, pf_mc_t*);
/* Pre-compute the three half-pel interpolated luma planes (H, V, HV) for
 * a whole frame, writing into frame->filtered[1..3].  Caching these
 * planes lets motion compensation/estimation reuse them instead of
 * re-filtering per block (the point of this commit).
 * Filtering covers 8 extra pixels beyond every picture edge so qpel
 * reads near the border stay valid (see expand_border_filtered). */
void x264_frame_filter( int cpu, x264_frame_t *frame )
{
const int x_inc = 16, y_inc = 16;
const int stride = frame->i_stride[0];
int x, y;
/* default to the C kernels; CPU-specific versions may override below */
pf_mc_t int_h = mc_hh;
pf_mc_t int_v = mc_hv;
pf_mc_t int_hv = mc_hc;
#ifdef HAVE_MMXEXT
if( cpu&X264_CPU_MMXEXT )
get_funcs_mmx(&int_h, &int_v, &int_hv);
#endif
#ifdef HAVE_SSE2
if( cpu&X264_CPU_SSE2 )
get_funcs_sse2(&int_h, &int_v, &int_hv);
#endif
/* walk the frame in 16x16 tiles, starting 8 pixels outside the picture */
for( y = -8; y < frame->i_lines[0]+8; y += y_inc ) {
uint8_t *p_in = frame->plane[0] + y * stride - 8;
uint8_t *p_h = frame->filtered[1] + y * stride - 8;
uint8_t *p_v = frame->filtered[2] + y * stride - 8;
uint8_t *p_hv = frame->filtered[3] + y * stride - 8;
/* NOTE(review): bound assumes stride = width + 64 of padding, so this
 * covers [-8, width+8) — confirm against frame allocation */
for( x = -8; x < stride - 64 + 8; x += x_inc )
{
int_h( p_in, stride, p_h, stride, x_inc, y_inc );
int_v( p_in, stride, p_v, stride, x_inc, y_inc );
int_hv( p_in, stride, p_hv, stride, x_inc, y_inc );
p_h += x_inc;
p_v += x_inc;
p_hv += x_inc;
p_in += x_inc;
}
}
}
......@@ -31,15 +31,21 @@
* width == 16-> height == 8 or 16
* */
typedef void (*x264_mc_function_t)(uint8_t *, int, uint8_t *, int,
int mvx, int mvy,
int i_width, int i_height );
/* Table of motion-compensation entry points, filled by x264_mc_init()
 * and overridden by the CPU-specific initializers.
 * (Cleaned up the diff overlay: removed the stale MC_LUMA/MC_CHROMA enum
 * and the old array-based x264_mc_init prototype.) */
typedef struct
{
    /* luma qpel MC from the 4 cached half-pel planes into dst */
    void (*mc_luma)(uint8_t **, int, uint8_t *, int,
                    int mvx, int mvy,
                    int i_width, int i_height );

    /* like mc_luma, but for half-pel positions may return a pointer
     * straight into a cached plane (writing its stride) instead of copying */
    uint8_t* (*get_ref)(uint8_t **, int, uint8_t *, int *,
                        int mvx, int mvy,
                        int i_width, int i_height );

    /* chroma MC (down to 1/8 pel) */
    void (*mc_chroma)(uint8_t *, int, uint8_t *, int,
                      int mvx, int mvy,
                      int i_width, int i_height );
} x264_mc_functions_t;

void x264_mc_init( int cpu, x264_mc_functions_t *pf );
#endif
......@@ -486,6 +486,12 @@ static void x264_mb_analyse_intra_chroma( x264_t *h, x264_mb_analysis_t *res )
}
}
#define LOAD_HPELS(dst, src, offset) \
dst[0] = &src[0][offset]; \
dst[1] = &src[1][offset]; \
dst[2] = &src[2][offset]; \
dst[3] = &src[3][offset]; \
static void x264_mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a )
{
x264_me_t m;
......@@ -507,7 +513,7 @@ static void x264_mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a )
i_fullpel_thresh -= i_ref_cost;
/* search with ref */
m.p_fref = h->mb.pic.p_fref[0][i_ref][0];
LOAD_HPELS( m.p_fref, h->mb.pic.p_fref[0][i_ref], 0 );
x264_mb_predict_mv_16x16( h, 0, i_ref, m.mvp );
x264_mb_predict_mv_ref16x16( h, 0, i_ref, mvc, &i_mvc );
x264_me_search_ref( h, &m, mvc, i_mvc, p_fullpel_thresh );
......@@ -535,7 +541,7 @@ static void x264_mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a )
static void x264_mb_analyse_inter_p8x8( x264_t *h, x264_mb_analysis_t *a )
{
uint8_t *p_fref = h->mb.pic.p_fref[0][a->l0.i_ref][0];
uint8_t **p_fref = h->mb.pic.p_fref[0][a->l0.i_ref];
uint8_t *p_fenc = h->mb.pic.p_fenc[0];
int mvc[5][2], i_mvc;
int i;
......@@ -556,9 +562,9 @@ static void x264_mb_analyse_inter_p8x8( x264_t *h, x264_mb_analysis_t *a )
m->i_pixel = PIXEL_8x8;
m->lm = a->i_lambda;
m->p_fenc = &p_fenc[8*(y8*h->mb.pic.i_stride[0]+x8)];
m->p_fref = &p_fref[8*(y8*h->mb.pic.i_stride[0]+x8)];
m->p_fenc = &p_fenc[8*(y8*h->mb.pic.i_stride[0]+x8)];
m->i_stride= h->mb.pic.i_stride[0];
LOAD_HPELS( m->p_fref, p_fref, 8*(y8*m->i_stride + x8) );
x264_mb_predict_mv( h, 0, 4*i, 2, m->mvp );
x264_me_search( h, m, mvc, i_mvc );
......@@ -579,7 +585,7 @@ static void x264_mb_analyse_inter_p8x8( x264_t *h, x264_mb_analysis_t *a )
static void x264_mb_analyse_inter_p16x8( x264_t *h, x264_mb_analysis_t *a )
{
uint8_t *p_fref = h->mb.pic.p_fref[0][a->l0.i_ref][0];
uint8_t **p_fref = h->mb.pic.p_fref[0][a->l0.i_ref];
uint8_t *p_fenc = h->mb.pic.p_fenc[0];
int mvc[2][2];
int i;
......@@ -594,9 +600,9 @@ static void x264_mb_analyse_inter_p16x8( x264_t *h, x264_mb_analysis_t *a )
m->i_pixel = PIXEL_16x8;
m->lm = a->i_lambda;
m->p_fenc = &p_fenc[8*i*h->mb.pic.i_stride[0]];
m->p_fref = &p_fref[8*i*h->mb.pic.i_stride[0]];
m->p_fenc = &p_fenc[8*i*h->mb.pic.i_stride[0]];
m->i_stride= h->mb.pic.i_stride[0];
LOAD_HPELS( m->p_fref, p_fref, 8*i*m->i_stride );
mvc[0][0] = a->l0.me8x8[2*i].mv[0];
mvc[0][1] = a->l0.me8x8[2*i].mv[1];
......@@ -614,7 +620,7 @@ static void x264_mb_analyse_inter_p16x8( x264_t *h, x264_mb_analysis_t *a )
static void x264_mb_analyse_inter_p8x16( x264_t *h, x264_mb_analysis_t *a )
{
uint8_t *p_fref = h->mb.pic.p_fref[0][a->l0.i_ref][0];
uint8_t **p_fref = h->mb.pic.p_fref[0][a->l0.i_ref];
uint8_t *p_fenc = h->mb.pic.p_fenc[0];
int mvc[2][2];
int i;
......@@ -630,8 +636,8 @@ static void x264_mb_analyse_inter_p8x16( x264_t *h, x264_mb_analysis_t *a )
m->lm = a->i_lambda;
m->p_fenc = &p_fenc[8*i];
m->p_fref = &p_fref[8*i];
m->i_stride= h->mb.pic.i_stride[0];
LOAD_HPELS( m->p_fref, p_fref, 8*i );
mvc[0][0] = a->l0.me8x8[i].mv[0];
mvc[0][1] = a->l0.me8x8[i].mv[1];
......@@ -649,7 +655,7 @@ static void x264_mb_analyse_inter_p8x16( x264_t *h, x264_mb_analysis_t *a )
static void x264_mb_analyse_inter_p4x4( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
{
uint8_t *p_fref = h->mb.pic.p_fref[0][a->l0.i_ref][0];
uint8_t **p_fref = h->mb.pic.p_fref[0][a->l0.i_ref];
uint8_t *p_fenc = h->mb.pic.p_fenc[0];
int i4x4;
......@@ -670,8 +676,8 @@ static void x264_mb_analyse_inter_p4x4( x264_t *h, x264_mb_analysis_t *a, int i8
m->lm = a->i_lambda;
m->p_fenc = &p_fenc[4*(y4*h->mb.pic.i_stride[0]+x4)];
m->p_fref = &p_fref[4*(y4*h->mb.pic.i_stride[0]+x4)];
m->i_stride= h->mb.pic.i_stride[0];
LOAD_HPELS( m->p_fref, p_fref, 4*(y4*m->i_stride + x4) );
x264_mb_predict_mv( h, 0, idx, 1, m->mvp );
x264_me_search( h, m, &a->l0.me8x8[i8x8].mv, i_mvc );
......@@ -688,7 +694,7 @@ static void x264_mb_analyse_inter_p4x4( x264_t *h, x264_mb_analysis_t *a, int i8
static void x264_mb_analyse_inter_p8x4( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
{
uint8_t *p_fref = h->mb.pic.p_fref[0][a->l0.i_ref][0];
uint8_t **p_fref = h->mb.pic.p_fref[0][a->l0.i_ref];
uint8_t *p_fenc = h->mb.pic.p_fenc[0];