Commit 8f05dffc authored by Loren Merritt's avatar Loren Merritt

copy current macroblock to a smaller buffer, to improve cache coherency and reduce stride computations.

copy current macroblock to a smaller buffer, to improve cache coherency and reduce stride computations.
part 1: memory arrangement.



git-svn-id: svn://svn.videolan.org/x264/trunk@443 df754926-b1dd-0310-bc7b-ec298dee348c
parent 38865823
......@@ -404,6 +404,12 @@ struct x264_t
struct
{
/* space for p_fenc and p_fdec */
#define FENC_STRIDE 16
#define FDEC_STRIDE 32
DECLARE_ALIGNED( uint8_t, fenc_buf[24*FENC_STRIDE], 16 );
DECLARE_ALIGNED( uint8_t, fdec_buf[27*FDEC_STRIDE], 16 );
/* pointer over mb of the frame to be compressed */
uint8_t *p_fenc[3];
......@@ -414,7 +420,7 @@ struct x264_t
uint8_t *p_fref[2][16][4+2]; /* last: lN, lH, lV, lHV, cU, cV */
uint16_t *p_integral[2][16];
/* common stride */
/* fref stride */
int i_stride[3];
} pic;
......
......@@ -545,15 +545,15 @@ static inline void x264_mb_mc_0xywh( x264_t *h, int x, int y, int width, int hei
const int mvy = x264_clip3( h->mb.cache.mv[0][i8][1], h->mb.mv_min[1], h->mb.mv_max[1] );
h->mc.mc_luma( h->mb.pic.p_fref[0][i_ref], h->mb.pic.i_stride[0],
&h->mb.pic.p_fdec[0][4*y * h->mb.pic.i_stride[0]+4*x], h->mb.pic.i_stride[0],
&h->mb.pic.p_fdec[0][4*y*FDEC_STRIDE+4*x], FDEC_STRIDE,
mvx + 4*4*x, mvy + 4*4*y, 4*width, 4*height );
h->mc.mc_chroma( &h->mb.pic.p_fref[0][i_ref][4][2*y*h->mb.pic.i_stride[1]+2*x], h->mb.pic.i_stride[1],
&h->mb.pic.p_fdec[1][2*y*h->mb.pic.i_stride[1]+2*x], h->mb.pic.i_stride[1],
&h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
mvx, mvy, 2*width, 2*height );
h->mc.mc_chroma( &h->mb.pic.p_fref[0][i_ref][5][2*y*h->mb.pic.i_stride[2]+2*x], h->mb.pic.i_stride[2],
&h->mb.pic.p_fdec[2][2*y*h->mb.pic.i_stride[2]+2*x], h->mb.pic.i_stride[2],
&h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
mvx, mvy, 2*width, 2*height );
}
static inline void x264_mb_mc_1xywh( x264_t *h, int x, int y, int width, int height )
......@@ -564,15 +564,15 @@ static inline void x264_mb_mc_1xywh( x264_t *h, int x, int y, int width, int hei
const int mvy = x264_clip3( h->mb.cache.mv[1][i8][1], h->mb.mv_min[1], h->mb.mv_max[1] );
h->mc.mc_luma( h->mb.pic.p_fref[1][i_ref], h->mb.pic.i_stride[0],
&h->mb.pic.p_fdec[0][4*y *h->mb.pic.i_stride[0]+4*x], h->mb.pic.i_stride[0],
&h->mb.pic.p_fdec[0][4*y*FDEC_STRIDE+4*x], FDEC_STRIDE,
mvx + 4*4*x, mvy + 4*4*y, 4*width, 4*height );
h->mc.mc_chroma( &h->mb.pic.p_fref[1][i_ref][4][2*y*h->mb.pic.i_stride[1]+2*x], h->mb.pic.i_stride[1],
&h->mb.pic.p_fdec[1][2*y*h->mb.pic.i_stride[1]+2*x], h->mb.pic.i_stride[1],
&h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
mvx, mvy, 2*width, 2*height );
h->mc.mc_chroma( &h->mb.pic.p_fref[1][i_ref][5][2*y*h->mb.pic.i_stride[2]+2*x], h->mb.pic.i_stride[2],
&h->mb.pic.p_fdec[2][2*y*h->mb.pic.i_stride[2]+2*x], h->mb.pic.i_stride[2],
&h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
mvx, mvy, 2*width, 2*height );
}
......@@ -596,27 +596,27 @@ static inline void x264_mb_mc_01xywh( x264_t *h, int x, int y, int width, int he
const int i_ref0 = h->mb.cache.ref[0][i8];
const int weight = h->mb.bipred_weight[i_ref0][i_ref1];
h->mc.avg_weight[i_mode]( &h->mb.pic.p_fdec[0][4*y *h->mb.pic.i_stride[0]+4*x], h->mb.pic.i_stride[0], tmp, 16, weight );
h->mc.avg_weight[i_mode]( &h->mb.pic.p_fdec[0][4*y*FDEC_STRIDE+4*x], FDEC_STRIDE, tmp, 16, weight );
h->mc.mc_chroma( &h->mb.pic.p_fref[1][i_ref1][4][2*y*h->mb.pic.i_stride[1]+2*x], h->mb.pic.i_stride[1],
tmp, 16, mvx1, mvy1, 2*width, 2*height );
h->mc.avg_weight[i_mode+3]( &h->mb.pic.p_fdec[1][2*y*h->mb.pic.i_stride[1]+2*x], h->mb.pic.i_stride[1], tmp, 16, weight );
h->mc.avg_weight[i_mode+3]( &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, tmp, 16, weight );
h->mc.mc_chroma( &h->mb.pic.p_fref[1][i_ref1][5][2*y*h->mb.pic.i_stride[2]+2*x], h->mb.pic.i_stride[2],
tmp, 16, mvx1, mvy1, 2*width, 2*height );
h->mc.avg_weight[i_mode+3]( &h->mb.pic.p_fdec[2][2*y*h->mb.pic.i_stride[2]+2*x], h->mb.pic.i_stride[2], tmp, 16, weight );
h->mc.avg_weight[i_mode+3]( &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, tmp, 16, weight );
}
else
{
h->mc.avg[i_mode]( &h->mb.pic.p_fdec[0][4*y *h->mb.pic.i_stride[0]+4*x], h->mb.pic.i_stride[0], tmp, 16 );
h->mc.avg[i_mode]( &h->mb.pic.p_fdec[0][4*y*FDEC_STRIDE+4*x], FDEC_STRIDE, tmp, 16 );
h->mc.mc_chroma( &h->mb.pic.p_fref[1][i_ref1][4][2*y*h->mb.pic.i_stride[1]+2*x], h->mb.pic.i_stride[1],
tmp, 16, mvx1, mvy1, 2*width, 2*height );
h->mc.avg[i_mode+3]( &h->mb.pic.p_fdec[1][2*y*h->mb.pic.i_stride[1]+2*x], h->mb.pic.i_stride[1], tmp, 16 );
h->mc.avg[i_mode+3]( &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, tmp, 16 );
h->mc.mc_chroma( &h->mb.pic.p_fref[1][i_ref1][5][2*y*h->mb.pic.i_stride[2]+2*x], h->mb.pic.i_stride[2],
tmp, 16, mvx1, mvy1, 2*width, 2*height );
h->mc.avg[i_mode+3]( &h->mb.pic.p_fdec[2][2*y*h->mb.pic.i_stride[2]+2*x], h->mb.pic.i_stride[2], tmp, 16 );
h->mc.avg[i_mode+3]( &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, tmp, 16 );
}
}
......@@ -917,17 +917,38 @@ void x264_macroblock_cache_load( x264_t *h, int i_mb_x, int i_mb_y )
h->mb.i_b4_xy = i_mb_4x4;
h->mb.i_neighbour = 0;
/* fdec: fenc:
* yyyyyyy
* yYYYY YYYY
* yYYYY YYYY
* yYYYY YYYY
* yYYYY YYYY
* uuu vvv UUVV
* uUU vVV UUVV
* uUU vVV
*/
h->mb.pic.p_fenc[0] = h->mb.pic.fenc_buf;
h->mb.pic.p_fenc[1] = h->mb.pic.fenc_buf + 16*FENC_STRIDE;
h->mb.pic.p_fenc[2] = h->mb.pic.fenc_buf + 16*FENC_STRIDE + 8;
h->mb.pic.p_fdec[0] = h->mb.pic.fdec_buf + 2*FDEC_STRIDE;
h->mb.pic.p_fdec[1] = h->mb.pic.fdec_buf + 19*FDEC_STRIDE;
h->mb.pic.p_fdec[2] = h->mb.pic.fdec_buf + 19*FDEC_STRIDE + 16;
/* load picture pointers */
for( i = 0; i < 3; i++ )
{
const int w = (i == 0 ? 16 : 8);
const int i_stride = h->fdec->i_stride[i];
const uint8_t *plane_fdec = &h->fdec->plane[i][ w * (i_mb_x + i_mb_y * i_stride) ];
int j;
h->mb.pic.i_stride[i] = i_stride;
h->mb.pic.p_fenc[i] = &h->fenc->plane[i][ w * ( i_mb_x + i_mb_y * i_stride )];
h->mb.pic.p_fdec[i] = &h->fdec->plane[i][ w * ( i_mb_x + i_mb_y * i_stride )];
h->mc.copy[i?PIXEL_8x8:PIXEL_16x16]( h->mb.pic.p_fenc[i], FENC_STRIDE,
&h->fenc->plane[i][ w * (i_mb_x + i_mb_y * i_stride) ], i_stride, w );
memcpy( &h->mb.pic.p_fdec[i][-1-FDEC_STRIDE], &plane_fdec[-1-i_stride], w*3/2+1 );
for( j = 0; j < w; j++ )
h->mb.pic.p_fdec[i][-1+j*FDEC_STRIDE] = plane_fdec[-1+j*i_stride];
for( j = 0; j < h->i_ref0; j++ )
{
......@@ -1268,6 +1289,15 @@ void x264_macroblock_cache_save( x264_t *h )
int i;
for( i = 0; i < 3; i++ )
{
int w = i ? 8 : 16;
h->mc.copy[i?PIXEL_8x8:PIXEL_16x16](
&h->fdec->plane[i][ w * (h->mb.i_mb_x + h->mb.i_mb_y * h->fdec->i_stride[i]) ],
h->fdec->i_stride[i],
h->mb.pic.p_fdec[i], FDEC_STRIDE, w );
}
h->mb.type[i_mb_xy] = i_mb_type;
if( h->mb.i_type != I_16x16 && h->mb.i_cbp_luma == 0 && h->mb.i_cbp_chroma == 0 )
......
......@@ -410,7 +410,6 @@ static void x264_mb_analyse_intra_chroma( x264_t *h, x264_mb_analysis_t *a )
int predict_mode[9];
uint8_t *p_dstc[2], *p_srcc[2];
int i_stride[2];
if( a->i_sad_i8x8chroma < COST_MAX )
return;
......@@ -421,9 +420,6 @@ static void x264_mb_analyse_intra_chroma( x264_t *h, x264_mb_analysis_t *a )
p_srcc[0] = h->mb.pic.p_fenc[1];
p_srcc[1] = h->mb.pic.p_fenc[2];
i_stride[0] = h->mb.pic.i_stride[1];
i_stride[1] = h->mb.pic.i_stride[2];
predict_8x8chroma_mode_available( h->mb.i_neighbour, predict_mode, &i_max );
a->i_sad_i8x8chroma = COST_MAX;
for( i = 0; i < i_max; i++ )
......@@ -434,14 +430,14 @@ static void x264_mb_analyse_intra_chroma( x264_t *h, x264_mb_analysis_t *a )
i_mode = predict_mode[i];
/* we do the prediction */
h->predict_8x8c[i_mode]( p_dstc[0], i_stride[0] );
h->predict_8x8c[i_mode]( p_dstc[1], i_stride[1] );
h->predict_8x8c[i_mode]( p_dstc[0], FDEC_STRIDE );
h->predict_8x8c[i_mode]( p_dstc[1], FDEC_STRIDE );
/* we calculate the cost */
i_sad = h->pixf.mbcmp[PIXEL_8x8]( p_dstc[0], i_stride[0],
p_srcc[0], i_stride[0] ) +
h->pixf.mbcmp[PIXEL_8x8]( p_dstc[1], i_stride[1],
p_srcc[1], i_stride[1] ) +
i_sad = h->pixf.mbcmp[PIXEL_8x8]( p_dstc[0], FDEC_STRIDE,
p_srcc[0], FENC_STRIDE ) +
h->pixf.mbcmp[PIXEL_8x8]( p_dstc[1], FDEC_STRIDE,
p_srcc[1], FENC_STRIDE ) +
a->i_lambda * bs_size_ue( x264_mb_pred_mode8x8c_fix[i_mode] );
/* if i_score is lower it is better */
......@@ -458,7 +454,6 @@ static void x264_mb_analyse_intra_chroma( x264_t *h, x264_mb_analysis_t *a )
static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_cost_inter )
{
const unsigned int flags = h->sh.i_type == SLICE_TYPE_I ? h->param.analyse.intra : h->param.analyse.inter;
const int i_stride = h->mb.pic.i_stride[0];
uint8_t *p_src = h->mb.pic.p_fenc[0];
uint8_t *p_dst = h->mb.pic.p_fdec[0];
int f8_satd_rd_ratio = 0;
......@@ -483,9 +478,9 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_cost_
int i_mode;
i_mode = predict_mode[i];
h->predict_16x16[i_mode]( p_dst, i_stride );
h->predict_16x16[i_mode]( p_dst, FDEC_STRIDE );
i_sad = h->pixf.mbcmp[PIXEL_16x16]( p_dst, i_stride, p_src, i_stride ) +
i_sad = h->pixf.mbcmp[PIXEL_16x16]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE ) +
a->i_lambda * bs_size_ue( x264_mb_pred_mode16x16_fix[i_mode] );
if( a->i_sad_i16x16 > i_sad )
{
......@@ -534,15 +529,15 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_cost_
x = block_idx_x[idx];
y = block_idx_y[idx];
p_src_by = p_src + 4 * x + 4 * y * i_stride;
p_dst_by = p_dst + 4 * x + 4 * y * i_stride;
p_src_by = p_src + 4 * x + 4 * y * FENC_STRIDE;
p_dst_by = p_dst + 4 * x + 4 * y * FDEC_STRIDE;
i_best = COST_MAX;
predict_4x4_mode_available( h->mb.i_neighbour4[idx], predict_mode, &i_max );
if( (h->mb.i_neighbour4[idx] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
/* emulate missing topright samples */
*(uint32_t*) &p_dst_by[4 - i_stride] = p_dst_by[3 - i_stride] * 0x01010101U;
*(uint32_t*) &p_dst_by[4 - FDEC_STRIDE] = p_dst_by[3 - FDEC_STRIDE] * 0x01010101U;
for( i = 0; i < i_max; i++ )
{
......@@ -550,10 +545,10 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_cost_
int i_mode;
i_mode = predict_mode[i];
h->predict_4x4[i_mode]( p_dst_by, i_stride );
h->predict_4x4[i_mode]( p_dst_by, FDEC_STRIDE );
i_sad = h->pixf.mbcmp[PIXEL_4x4]( p_dst_by, i_stride,
p_src_by, i_stride )
i_sad = h->pixf.mbcmp[PIXEL_4x4]( p_dst_by, FDEC_STRIDE,
p_src_by, FENC_STRIDE )
+ a->i_lambda * (i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) ? 1 : 4);
if( i_best > i_sad )
......@@ -565,7 +560,7 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_cost_
a->i_sad_i4x4 += i_best;
/* we need to encode this block now (for next ones) */
h->predict_4x4[a->i_predict4x4[x][y]]( p_dst_by, i_stride );
h->predict_4x4[a->i_predict4x4[x][y]]( p_dst_by, FDEC_STRIDE );
x264_mb_encode_i4x4( h, idx, a->i_qp );
h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] = a->i_predict4x4[x][y];
......@@ -607,8 +602,8 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_cost_
x = idx&1;
y = idx>>1;
p_src_by = p_src + 8 * x + 8 * y * i_stride;
p_dst_by = p_dst + 8 * x + 8 * y * i_stride;
p_src_by = p_src + 8 * x + 8 * y * FENC_STRIDE;
p_dst_by = p_dst + 8 * x + 8 * y * FDEC_STRIDE;
i_best = COST_MAX;
predict_4x4_mode_available( h->mb.i_neighbour8[idx], predict_mode, &i_max );
......@@ -618,11 +613,11 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_cost_
int i_mode;
i_mode = predict_mode[i];
h->predict_8x8[i_mode]( p_dst_by, i_stride, h->mb.i_neighbour8[idx] );
h->predict_8x8[i_mode]( p_dst_by, FDEC_STRIDE, h->mb.i_neighbour8[idx] );
/* could use sa8d, but it doesn't seem worth the speed cost (without mmx at least) */
i_sad = h->pixf.mbcmp[PIXEL_8x8]( p_dst_by, i_stride,
p_src_by, i_stride )
i_sad = h->pixf.mbcmp[PIXEL_8x8]( p_dst_by, FDEC_STRIDE,
p_src_by, FENC_STRIDE )
+ a->i_lambda * (i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) ? 1 : 4);
if( i_best > i_sad )
......@@ -634,7 +629,7 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_cost_
a->i_sad_i8x8 += i_best;
/* we need to encode this block now (for next ones) */
h->predict_8x8[a->i_predict8x8[x][y]]( p_dst_by, i_stride, h->mb.i_neighbour );
h->predict_8x8[a->i_predict8x8[x][y]]( p_dst_by, FDEC_STRIDE, h->mb.i_neighbour );
x264_mb_encode_i8x8( h, idx, a->i_qp );
x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, a->i_predict8x8[x][y] );
......@@ -664,9 +659,9 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_cost_
#define LOAD_FENC( m, src, xoff, yoff) \
(m)->i_stride[0] = h->mb.pic.i_stride[0]; \
(m)->i_stride[1] = h->mb.pic.i_stride[1]; \
(m)->p_fenc[0] = &(src)[0][(xoff)+(yoff)*(m)->i_stride[0]]; \
(m)->p_fenc[1] = &(src)[1][((xoff)>>1)+((yoff)>>1)*(m)->i_stride[1]]; \
(m)->p_fenc[2] = &(src)[2][((xoff)>>1)+((yoff)>>1)*(m)->i_stride[1]];
(m)->p_fenc[0] = &(src)[0][(xoff)+(yoff)*FENC_STRIDE]; \
(m)->p_fenc[1] = &(src)[1][((xoff)>>1)+((yoff)>>1)*FENC_STRIDE]; \
(m)->p_fenc[2] = &(src)[2][((xoff)>>1)+((yoff)>>1)*FENC_STRIDE];
#define LOAD_HPELS(m, src, list, ref, xoff, yoff) \
(m)->p_fref[0] = &(src)[0][(xoff)+(yoff)*(m)->i_stride[0]]; \
......@@ -1002,7 +997,8 @@ static void x264_mb_analyse_inter_p8x16( x264_t *h, x264_mb_analysis_t *a )
static int x264_mb_analyse_inter_p4x4_chroma( x264_t *h, x264_mb_analysis_t *a, uint8_t **p_fref, int i8x8, int pixel )
{
uint8_t pix1[8*8], pix2[8*8];
DECLARE_ALIGNED( uint8_t, pix1[8*8], 8 );
DECLARE_ALIGNED( uint8_t, pix2[8*8], 8 );
const int i_stride = h->mb.pic.i_stride[1];
const int off = 4*(i8x8&1) + 2*(i8x8&2)*i_stride;
......@@ -1028,8 +1024,8 @@ static int x264_mb_analyse_inter_p4x4_chroma( x264_t *h, x264_mb_analysis_t *a,
CHROMA4x4MC( 2,4, a->l0.me4x8[i8x8][1], 2,0 );
}
return h->pixf.mbcmp[PIXEL_4x4]( &h->mb.pic.p_fenc[1][off], i_stride, pix1, 8 )
+ h->pixf.mbcmp[PIXEL_4x4]( &h->mb.pic.p_fenc[2][off], i_stride, pix2, 8 );
return h->pixf.mbcmp[PIXEL_4x4]( &h->mb.pic.p_fenc[1][off], FENC_STRIDE, pix1, 8 )
+ h->pixf.mbcmp[PIXEL_4x4]( &h->mb.pic.p_fenc[2][off], FENC_STRIDE, pix2, 8 );
}
static void x264_mb_analyse_inter_p4x4( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
......@@ -1153,18 +1149,16 @@ static void x264_mb_analyse_inter_direct( x264_t *h, x264_mb_analysis_t *a )
uint8_t **p_fenc = h->mb.pic.p_fenc;
uint8_t **p_fdec = h->mb.pic.p_fdec;
int i_stride= h->mb.pic.i_stride[0];
int i;
a->i_cost16x16direct = 0;
for( i = 0; i < 4; i++ )
{
const int x8 = i%2;
const int y8 = i/2;
const int off = 8 * x8 + 8 * i_stride * y8;
const int x = (i&1)*8;
const int y = (i>>1)*8;
a->i_cost16x16direct +=
a->i_cost8x8direct[i] =
h->pixf.mbcmp[PIXEL_8x8]( &p_fenc[0][off], i_stride, &p_fdec[0][off], i_stride );
h->pixf.mbcmp[PIXEL_8x8]( &p_fenc[0][x+y*FENC_STRIDE], FENC_STRIDE, &p_fdec[0][x+y*FDEC_STRIDE], FDEC_STRIDE );
/* mb type cost */
a->i_cost8x8direct[i] += a->i_lambda * i_sub_mb_b_cost_table[D_DIRECT_8x8];
......@@ -1302,7 +1296,7 @@ static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
else
h->mc.avg[PIXEL_16x16]( pix1, 16, src2, stride2 );
a->i_cost16x16bi = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[0], h->mb.pic.i_stride[0], pix1, 16 )
a->i_cost16x16bi = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, pix1, 16 )
+ REF_COST( 0, a->l0.i_ref )
+ REF_COST( 1, a->l1.i_ref )
+ a->l0.me16x16.cost_mv
......@@ -1487,7 +1481,7 @@ static void x264_mb_analyse_inter_b8x8( x264_t *h, x264_mb_analysis_t *a )
}
WEIGHTED_AVG( PIXEL_8x8, pix[0], 8, pix[1], 8 );
i_part_cost_bi += h->pixf.mbcmp[PIXEL_8x8]( a->l0.me8x8[i].p_fenc[0], h->mb.pic.i_stride[0], pix[0], 8 )
i_part_cost_bi += h->pixf.mbcmp[PIXEL_8x8]( a->l0.me8x8[i].p_fenc[0], FENC_STRIDE, pix[0], 8 )
+ a->i_lambda * i_sub_mb_b_cost_table[D_BI_8x8];
a->l0.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L0_8x8];
a->l1.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L1_8x8];
......@@ -1539,7 +1533,7 @@ static void x264_mb_analyse_inter_b16x8( x264_t *h, x264_mb_analysis_t *a )
uint8_t **p_fref[2] =
{ h->mb.pic.p_fref[0][a->l0.i_ref],
h->mb.pic.p_fref[1][a->l1.i_ref] };
uint8_t pix[2][16*8];
DECLARE_ALIGNED( uint8_t, pix[2][16*8], 16 );
int mvc[2][2];
int i, l;
......@@ -1579,7 +1573,7 @@ static void x264_mb_analyse_inter_b16x8( x264_t *h, x264_mb_analysis_t *a )
}
WEIGHTED_AVG( PIXEL_16x8, pix[0], 16, pix[1], 16 );
i_part_cost_bi += h->pixf.mbcmp[PIXEL_16x8]( a->l0.me16x8[i].p_fenc[0], h->mb.pic.i_stride[0], pix[0], 16 );
i_part_cost_bi += h->pixf.mbcmp[PIXEL_16x8]( a->l0.me16x8[i].p_fenc[0], FENC_STRIDE, pix[0], 16 );
i_part_cost = a->l0.me16x8[i].cost;
a->i_mb_partition16x8[i] = D_L0_8x8; /* not actually 8x8, only the L0 matters */
......@@ -1663,7 +1657,7 @@ static void x264_mb_analyse_inter_b8x16( x264_t *h, x264_mb_analysis_t *a )
}
WEIGHTED_AVG( PIXEL_8x16, pix[0], 8, pix[1], 8 );
i_part_cost_bi += h->pixf.mbcmp[PIXEL_8x16]( a->l0.me8x16[i].p_fenc[0], h->mb.pic.i_stride[0], pix[0], 8 );
i_part_cost_bi += h->pixf.mbcmp[PIXEL_8x16]( a->l0.me8x16[i].p_fenc[0], FENC_STRIDE, pix[0], 8 );
i_part_cost = a->l0.me8x16[i].cost;
a->i_mb_partition8x16[i] = D_L0_8x8;
......@@ -1745,10 +1739,10 @@ static inline void x264_mb_analyse_transform( x264_t *h )
/* FIXME only luma mc is needed */
x264_mb_mc( h );
i_cost8 = h->pixf.sa8d[PIXEL_16x16]( h->mb.pic.p_fenc[0], h->mb.pic.i_stride[0],
h->mb.pic.p_fdec[0], h->mb.pic.i_stride[0] );
i_cost4 = h->pixf.satd[PIXEL_16x16]( h->mb.pic.p_fenc[0], h->mb.pic.i_stride[0],
h->mb.pic.p_fdec[0], h->mb.pic.i_stride[0] );
i_cost8 = h->pixf.sa8d[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE,
h->mb.pic.p_fdec[0], FDEC_STRIDE );
i_cost4 = h->pixf.satd[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE,
h->mb.pic.p_fdec[0], FDEC_STRIDE );
h->mb.b_transform_8x8 = i_cost8 < i_cost4;
}
......
......@@ -97,18 +97,19 @@ static inline void scan_zigzag_2x2_dc( int level[4], int16_t dct[2][2] )
#undef ZIG
#define ZIG(i,y,x) {\
int o = x+y*i_stride;\
level[i] = p_src[o] - p_dst[o];\
p_dst[o] = p_src[o];\
int oe = x+y*FENC_STRIDE;\
int od = x+y*FDEC_STRIDE;\
level[i] = p_src[oe] - p_dst[od];\
p_dst[od] = p_src[oe];\
}
static inline void sub_zigzag_4x4full( int level[16], const uint8_t *p_src, uint8_t *p_dst, int i_stride )
static inline void sub_zigzag_4x4full( int level[16], const uint8_t *p_src, uint8_t *p_dst )
{
ZIG( 0,0,0) ZIG( 1,0,1) ZIG( 2,1,0) ZIG( 3,2,0)
ZIG( 4,1,1) ZIG( 5,0,2) ZIG( 6,0,3) ZIG( 7,1,2)
ZIG( 8,2,1) ZIG( 9,3,0) ZIG(10,3,1) ZIG(11,2,2)
ZIG(12,1,3) ZIG(13,2,3) ZIG(14,3,2) ZIG(15,3,3)
}
static inline void sub_zigzag_4x4( int level[15], const uint8_t *p_src, uint8_t *p_dst, int i_stride )
static inline void sub_zigzag_4x4( int level[15], const uint8_t *p_src, uint8_t *p_dst )
{
ZIG( 0,0,1) ZIG( 1,1,0) ZIG( 2,2,0)
ZIG( 3,1,1) ZIG( 4,0,2) ZIG( 5,0,3) ZIG( 6,1,2)
......@@ -192,19 +193,19 @@ static int x264_mb_decimate_score( int *dct, int i_max )
void x264_mb_encode_i4x4( x264_t *h, int idx, int i_qscale )
{
const int i_stride = h->mb.pic.i_stride[0];
const int i_offset = 4 * block_idx_x[idx] + 4 * block_idx_y[idx] * i_stride;
uint8_t *p_src = &h->mb.pic.p_fenc[0][i_offset];
uint8_t *p_dst = &h->mb.pic.p_fdec[0][i_offset];
int x = 4 * block_idx_x[idx];
int y = 4 * block_idx_y[idx];
uint8_t *p_src = &h->mb.pic.p_fenc[0][x+y*FENC_STRIDE];
uint8_t *p_dst = &h->mb.pic.p_fdec[0][x+y*FDEC_STRIDE];
int16_t dct4x4[4][4];
if( h->mb.b_lossless )
{
sub_zigzag_4x4full( h->dct.block[idx].luma4x4, p_src, p_dst, i_stride );
sub_zigzag_4x4full( h->dct.block[idx].luma4x4, p_src, p_dst );
return;
}
h->dctf.sub4x4_dct( dct4x4, p_src, i_stride, p_dst, i_stride );
h->dctf.sub4x4_dct( dct4x4, p_src, FENC_STRIDE, p_dst, FDEC_STRIDE );
if( h->mb.b_noise_reduction )
x264_denoise_dct( h, (int16_t*)dct4x4 );
......@@ -217,18 +218,18 @@ void x264_mb_encode_i4x4( x264_t *h, int idx, int i_qscale )
h->quantf.dequant_4x4( dct4x4, h->dequant4_mf[CQM_4IY], i_qscale );
/* output samples to fdec */
h->dctf.add4x4_idct( p_dst, i_stride, dct4x4 );
h->dctf.add4x4_idct( p_dst, FDEC_STRIDE, dct4x4 );
}
void x264_mb_encode_i8x8( x264_t *h, int idx, int i_qscale )
{
const int i_stride = h->mb.pic.i_stride[0];
const int i_offset = 8 * (idx&1) + 8 * (idx>>1) * i_stride;
uint8_t *p_src = &h->mb.pic.p_fenc[0][i_offset];
uint8_t *p_dst = &h->mb.pic.p_fdec[0][i_offset];
int x = 8 * (idx&1);
int y = 8 * (idx>>1);
uint8_t *p_src = &h->mb.pic.p_fenc[0][x+y*FENC_STRIDE];
uint8_t *p_dst = &h->mb.pic.p_fdec[0][x+y*FDEC_STRIDE];
int16_t dct8x8[8][8];
h->dctf.sub8x8_dct8( dct8x8, p_src, i_stride, p_dst, i_stride );
h->dctf.sub8x8_dct8( dct8x8, p_src, FENC_STRIDE, p_dst, FDEC_STRIDE );
if( h->mb.b_noise_reduction )
x264_denoise_dct( h, (int16_t*)dct8x8 );
......@@ -239,12 +240,11 @@ void x264_mb_encode_i8x8( x264_t *h, int idx, int i_qscale )
scan_zigzag_8x8full( h->dct.luma8x8[idx], dct8x8 );
h->quantf.dequant_8x8( dct8x8, h->dequant8_mf[CQM_8IY], i_qscale );
h->dctf.add8x8_idct8( p_dst, i_stride, dct8x8 );
h->dctf.add8x8_idct8( p_dst, FDEC_STRIDE, dct8x8 );
}
static void x264_mb_encode_i16x16( x264_t *h, int i_qscale )
{
const int i_stride = h->mb.pic.i_stride[0];
uint8_t *p_src = h->mb.pic.p_fenc[0];
uint8_t *p_dst = h->mb.pic.p_fdec[0];
......@@ -256,16 +256,17 @@ static void x264_mb_encode_i16x16( x264_t *h, int i_qscale )
{
for( i = 0; i < 16; i++ )
{
int o = block_idx_x[i]*4 + block_idx_y[i]*4*i_stride;
sub_zigzag_4x4( h->dct.block[i].residual_ac, p_src+o, p_dst+o, i_stride );
dct4x4[0][block_idx_y[i]][block_idx_x[i]] = p_src[o] - p_dst[o];
p_dst[o] = p_src[o];
int oe = block_idx_x[i]*4 + block_idx_y[i]*4*FENC_STRIDE;
int od = block_idx_x[i]*4 + block_idx_y[i]*4*FDEC_STRIDE;
sub_zigzag_4x4( h->dct.block[i].residual_ac, p_src+oe, p_dst+od );
dct4x4[0][block_idx_y[i]][block_idx_x[i]] = p_src[oe] - p_dst[od];
p_dst[od] = p_src[oe];
}
scan_zigzag_4x4full( h->dct.luma16x16_dc, dct4x4[0] );
return;
}
h->dctf.sub16x16_dct( &dct4x4[1], p_src, i_stride, p_dst, i_stride );
h->dctf.sub16x16_dct( &dct4x4[1], p_src, FENC_STRIDE, p_dst, FDEC_STRIDE );
for( i = 0; i < 16; i++ )
{
/* copy dc coeff */
......@@ -298,7 +299,7 @@ static void x264_mb_encode_i16x16( x264_t *h, int i_qscale )
dct4x4[1+i][0][0] = dct4x4[0][block_idx_y[i]][block_idx_x[i]];
}
/* put pixels to fdec */
h->dctf.add16x16_idct( p_dst, i_stride, &dct4x4[1] );
h->dctf.add16x16_idct( p_dst, FDEC_STRIDE, &dct4x4[1] );
}
static void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qscale )
......@@ -307,7 +308,6 @@ static void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qscale )
for( ch = 0; ch < 2; ch++ )
{
const int i_stride = h->mb.pic.i_stride[1+ch];
uint8_t *p_src = h->mb.pic.p_fenc[1+ch];
uint8_t *p_dst = h->mb.pic.p_fdec[1+ch];
int i_decimate_score = 0;
......@@ -319,15 +319,16 @@ static void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qscale )
{
for( i = 0; i < 4; i++ )
{
int o = block_idx_x[i]*4 + block_idx_y[i]*4*i_stride;
sub_zigzag_4x4( h->dct.block[16+i+ch*4].residual_ac, p_src+o, p_dst+o, i_stride );
h->dct.chroma_dc[ch][i] = p_src[o] - p_dst[o];
p_dst[o] = p_src[o];
int oe = block_idx_x[i]*4 + block_idx_y[i]*4*FENC_STRIDE;
int od = block_idx_x[i]*4 + block_idx_y[i]*4*FDEC_STRIDE;
sub_zigzag_4x4( h->dct.block[16+i+ch*4].residual_ac, p_src+oe, p_dst+od );
h->dct.chroma_dc[ch][i] = p_src[oe] - p_dst[od];
p_dst[od] = p_src[oe];
}
continue;
}
h->dctf.sub8x8_dct( dct4x4, p_src, i_stride, p_dst, i_stride );
h->dctf.sub8x8_dct( dct4x4, p_src, FENC_STRIDE, p_dst, FDEC_STRIDE );
/* calculate dct coeffs */
for( i = 0; i < 4; i++ )
{
......@@ -370,7 +371,7 @@ static void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qscale )
/* copy dc coeff */
dct4x4[i][0][0] = dct2x2[0][i];
}
h->dctf.add8x8_idct( p_dst, i_stride, dct4x4 );
h->dctf.add8x8_idct( p_dst, FDEC_STRIDE, dct4x4 );
}
}
......@@ -402,17 +403,17 @@ void x264_macroblock_encode_pskip( x264_t *h )
/* Motion compensation XXX probably unneeded */
h->mc.mc_luma( h->mb.pic.p_fref[0][0], h->mb.pic.i_stride[0],
h->mb.pic.p_fdec[0], h->mb.pic.i_stride[0],
mvx, mvy, 16, 16 );
h->mb.pic.p_fdec[0],