/*****************************************************************************
 * mc.c: motion compensation
 *****************************************************************************
 * Copyright (C) 2003-2015 x264 project
 *
 * Authors: Laurent Aimar <fenrir@via.ecp.fr>
 *          Loren Merritt <lorenm@u.washington.edu>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

Loren Merritt's avatar
Loren Merritt committed
27
#include "common.h"
Laurent Aimar's avatar
Laurent Aimar committed
28

Steven Walters's avatar
Steven Walters committed
29
#if HAVE_MMX
30
#include "x86/mc.h"
Laurent Aimar's avatar
Laurent Aimar committed
31
#endif
Steven Walters's avatar
Steven Walters committed
32
#if ARCH_PPC
33
#include "ppc/mc.h"
Laurent Aimar's avatar
Laurent Aimar committed
34
#endif
Steven Walters's avatar
Steven Walters committed
35
#if ARCH_ARM
36 37
#include "arm/mc.h"
#endif
38 39 40
#if ARCH_AARCH64
#include "aarch64/mc.h"
#endif
Laurent Aimar's avatar
Laurent Aimar committed
41 42


43 44 45
/* Rounded average of two pixel blocks: dst = (src1 + src2 + 1) >> 1 over an
 * i_width x i_height area. Each buffer has its own stride, expressed in
 * pixels (the pointers are advanced by it once per row). */
static inline void pixel_avg( pixel *dst,  intptr_t i_dst_stride,
                              pixel *src1, intptr_t i_src1_stride,
                              pixel *src2, intptr_t i_src2_stride, int i_width, int i_height )
{
    for( int y = 0; y < i_height; y++ )
    {
        for( int x = 0; x < i_width; x++ )
            dst[x] = ( src1[x] + src2[x] + 1 ) >> 1;
        dst  += i_dst_stride;
        src1 += i_src1_stride;
        src2 += i_src2_stride;
    }
}

57 58 59
/* Rounded average of two width x height pixel blocks into dst:
 * dst = (src1 + src2 + 1) >> 1. i_dst / i_src1 / i_src2 are the per-row
 * pointer increments (in pixels) of the respective buffers. */
static inline void pixel_avg_wxh( pixel *dst,  intptr_t i_dst,
                                  pixel *src1, intptr_t i_src1,
                                  pixel *src2, intptr_t i_src2, int width, int height )
{
    for( int y = 0; y < height; y++ )
    {
        for( int x = 0; x < width; x++ )
            dst[x] = ( src1[x] + src2[x] + 1 ) >> 1;
        src1 += i_src1;
        src2 += i_src2;
        dst += i_dst;
    }
}

/* Implicit weighted bipred only:
 * assumes log2_denom = 5, offset = 0, weight1 + weight2 = 64 */
73 74 75
static inline void pixel_avg_weight_wxh( pixel *dst,  intptr_t i_dst,
                                         pixel *src1, intptr_t i_src1,
                                         pixel *src2, intptr_t i_src2, int width, int height, int i_weight1 )
76
{
77
    int i_weight2 = 64 - i_weight1;
78
    for( int y = 0; y<height; y++, dst += i_dst, src1 += i_src1, src2 += i_src2 )
79 80
        for( int x = 0; x<width; x++ )
            dst[x] = x264_clip_pixel( (src1[x]*i_weight1 + src2[x]*i_weight2 + (1<<5)) >> 6 );
81
}
Fiona Glaser's avatar
Fiona Glaser committed
82
#undef op_scale2
83

Fiona Glaser's avatar
Fiona Glaser committed
84
/* Generates a fixed-size bipred averaging function `name` for a
 * width x height block. pix1 is the destination; pix2 and pix3 are the two
 * sources. A weight of 32 (i.e. 32/64 for each source) takes the plain
 * rounded-average path, any other weight the implicit-weighted path. */
#define PIXEL_AVG_C( name, width, height ) \
static void name( pixel *pix1, intptr_t i_stride_pix1, \
                  pixel *pix2, intptr_t i_stride_pix2, \
                  pixel *pix3, intptr_t i_stride_pix3, int weight ) \
{ \
    if( weight == 32 ) \
        pixel_avg_wxh( pix1, i_stride_pix1, pix2, i_stride_pix2, pix3, i_stride_pix3, width, height ); \
    else \
        pixel_avg_weight_wxh( pix1, i_stride_pix1, pix2, i_stride_pix2, pix3, i_stride_pix3, width, height, weight ); \
}
PIXEL_AVG_C( pixel_avg_16x16, 16, 16 )
PIXEL_AVG_C( pixel_avg_16x8,  16, 8 )
PIXEL_AVG_C( pixel_avg_8x16,  8, 16 )
PIXEL_AVG_C( pixel_avg_8x8,   8, 8 )
PIXEL_AVG_C( pixel_avg_8x4,   8, 4 )
PIXEL_AVG_C( pixel_avg_4x16,  4, 16 )
PIXEL_AVG_C( pixel_avg_4x8,   4, 8 )
PIXEL_AVG_C( pixel_avg_4x4,   4, 4 )
PIXEL_AVG_C( pixel_avg_4x2,   4, 2 )
PIXEL_AVG_C( pixel_avg_2x8,   2, 8 )
PIXEL_AVG_C( pixel_avg_2x4,   2, 4 )
PIXEL_AVG_C( pixel_avg_2x2,   2, 2 )
106

Dylan Yudaken's avatar
Dylan Yudaken committed
107 108 109 110
/* C fallback for the weight_cache hook: stores the currently selected
 * weight function table (h->mc.weight) into the weight descriptor so that
 * later code can test/call w->weightfn. */
static void x264_weight_cache( x264_t *h, x264_weight_t *w )
{
    w->weightfn = h->mc.weight;
}
111 112
#define opscale(x) dst[x] = x264_clip_pixel( ((src[x] * scale + (1<<(denom - 1))) >> denom) + offset )
#define opscale_noden(x) dst[x] = x264_clip_pixel( src[x] * scale + offset )
113 114
static void mc_weight( pixel *dst, intptr_t i_dst_stride, pixel *src, intptr_t i_src_stride,
                       const x264_weight_t *weight, int i_width, int i_height )
Dylan Yudaken's avatar
Dylan Yudaken committed
115
{
116 117 118 119
    int offset = weight->i_offset << (BIT_DEPTH-8);
    int scale = weight->i_scale;
    int denom = weight->i_denom;
    if( denom >= 1 )
Dylan Yudaken's avatar
Dylan Yudaken committed
120
    {
121 122
        for( int y = 0; y < i_height; y++, dst += i_dst_stride, src += i_src_stride )
            for( int x = 0; x < i_width; x++ )
Dylan Yudaken's avatar
Dylan Yudaken committed
123 124 125 126
                opscale( x );
    }
    else
    {
127 128
        for( int y = 0; y < i_height; y++, dst += i_dst_stride, src += i_src_stride )
            for( int x = 0; x < i_width; x++ )
Dylan Yudaken's avatar
Dylan Yudaken committed
129 130 131 132
                opscale_noden( x );
    }
}

133
#define MC_WEIGHT_C( name, width ) \
134
    static void name( pixel *dst, intptr_t i_dst_stride, pixel *src, intptr_t i_src_stride, const x264_weight_t *weight, int height ) \
Dylan Yudaken's avatar
Dylan Yudaken committed
135
{ \
136
    mc_weight( dst, i_dst_stride, src, i_src_stride, weight, width, height );\
Dylan Yudaken's avatar
Dylan Yudaken committed
137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154
}

MC_WEIGHT_C( mc_weight_w20, 20 )
MC_WEIGHT_C( mc_weight_w16, 16 )
MC_WEIGHT_C( mc_weight_w12, 12 )
MC_WEIGHT_C( mc_weight_w8,   8 )
MC_WEIGHT_C( mc_weight_w4,   4 )
MC_WEIGHT_C( mc_weight_w2,   2 )

static weight_fn_t x264_mc_weight_wtab[6] =
{
    mc_weight_w2,
    mc_weight_w4,
    mc_weight_w8,
    mc_weight_w12,
    mc_weight_w16,
    mc_weight_w20,
};
Anton Mitrofanov's avatar
Anton Mitrofanov committed
155
/* All-zero weight descriptors: every member, including weightfn, is zero, so
 * mc_luma()/get_ref() skip the weighting step when handed one of these. */
const x264_weight_t x264_weight_none[3] = { {{0}} };
156
/* Copies an i_width x i_height pixel block row by row with memcpy.
 * Note the argument order: source first, destination second. Strides are in
 * pixels. Rows must not overlap between src and dst (memcpy semantics). */
static void mc_copy( pixel *src, intptr_t i_src_stride, pixel *dst, intptr_t i_dst_stride, int i_width, int i_height )
{
    for( int y = 0; y < i_height; y++ )
    {
        memcpy( dst, src, i_width * sizeof(pixel) );

        src += i_src_stride;
        dst += i_dst_stride;
    }
}

Loren Merritt's avatar
Loren Merritt committed
167
#define TAPFILTER(pix, d) ((pix)[x-2*d] + (pix)[x+3*d] - 5*((pix)[x-d] + (pix)[x+2*d]) + 20*((pix)[x] + (pix)[x+d]))
168
static void hpel_filter( pixel *dsth, pixel *dstv, pixel *dstc, pixel *src,
169
                         intptr_t stride, int width, int height, int16_t *buf )
Laurent Aimar's avatar
Laurent Aimar committed
170
{
171
    const int pad = (BIT_DEPTH > 9) ? (-10 * PIXEL_MAX) : 0;
172
    for( int y = 0; y < height; y++ )
Laurent Aimar's avatar
Laurent Aimar committed
173
    {
174
        for( int x = -2; x < width+3; x++ )
Laurent Aimar's avatar
Laurent Aimar committed
175
        {
Loren Merritt's avatar
Loren Merritt committed
176
            int v = TAPFILTER(src,stride);
177
            dstv[x] = x264_clip_pixel( (v + 16) >> 5 );
178 179
            /* transform v for storage in a 16-bit integer */
            buf[x+2] = v + pad;
Laurent Aimar's avatar
Laurent Aimar committed
180
        }
181
        for( int x = 0; x < width; x++ )
182
            dstc[x] = x264_clip_pixel( (TAPFILTER(buf+2,1) - 32*pad + 512) >> 10 );
183
        for( int x = 0; x < width; x++ )
184
            dsth[x] = x264_clip_pixel( (TAPFILTER(src,1) + 16) >> 5 );
Loren Merritt's avatar
Loren Merritt committed
185 186 187 188
        dsth += stride;
        dstv += stride;
        dstc += stride;
        src += stride;
Laurent Aimar's avatar
Laurent Aimar committed
189
    }
190 191
}

192 193
/* Lookup tables indexed by qpel_idx = ((mvy&3)<<2) + (mvx&3), as computed in
 * mc_luma()/get_ref(). Each entry selects which of the four src[] planes to
 * read: ref0 gives the first plane, ref1 the second plane that is averaged
 * with it when quarter-pel interpolation is needed.
 * NOTE(review): presumably 0 = full-pel, 1 = h, 2 = v, 3 = centre half-pel
 * plane (matching the filtered[][1..3] order used with hpel_filter) - confirm. */
const uint8_t x264_hpel_ref0[16] = {0,1,1,1,0,1,1,1,2,3,3,3,0,1,1,1};
const uint8_t x264_hpel_ref1[16] = {0,0,1,0,2,2,3,2,2,2,3,2,2,2,3,2};
Loren Merritt's avatar
Loren Merritt committed
194

195 196
/* Luma motion compensation with quarter-pel motion vectors.
 * src[4] holds four planes sharing i_src_stride; the low two bits of mvx/mvy
 * pick the plane(s) via x264_hpel_ref0/1, the remaining bits the integer
 * offset. If either vector component is odd (qpel_idx & 5), two planes are
 * averaged into dst; otherwise the selected plane is copied. When
 * weight->weightfn is non-NULL the result is additionally weighted with
 * mc_weight(). The output always ends up in dst. */
static void mc_luma( pixel *dst,    intptr_t i_dst_stride,
                     pixel *src[4], intptr_t i_src_stride,
                     int mvx, int mvy,
                     int i_width, int i_height, const x264_weight_t *weight )
{
    int qpel_idx = ((mvy&3)<<2) + (mvx&3);
    int offset = (mvy>>2)*i_src_stride + (mvx>>2);
    /* A vertical fraction of 3/4 reads the first plane one row further down. */
    pixel *src1 = src[x264_hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride;

    if( qpel_idx & 5 ) /* qpel interpolation needed */
    {
        /* A horizontal fraction of 3/4 reads the second plane one column further right. */
        pixel *src2 = src[x264_hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);
        pixel_avg( dst, i_dst_stride, src1, i_src_stride,
                   src2, i_src_stride, i_width, i_height );
        if( weight->weightfn )
            mc_weight( dst, i_dst_stride, dst, i_dst_stride, weight, i_width, i_height );
    }
    else if( weight->weightfn )
        mc_weight( dst, i_dst_stride, src1, i_src_stride, weight, i_width, i_height );
    else
        mc_copy( src1, i_src_stride, dst, i_dst_stride, i_width, i_height );
}

218 219
/* Like mc_luma(), but avoids the copy when no interpolation or weighting is
 * needed: in that case a pointer directly into the source plane is returned
 * and *i_dst_stride is overwritten with i_src_stride. Otherwise the
 * prediction is written to dst (using the incoming *i_dst_stride) and dst
 * is returned. Callers must use the returned pointer together with the
 * possibly updated *i_dst_stride. */
static pixel *get_ref( pixel *dst,   intptr_t *i_dst_stride,
                       pixel *src[4], intptr_t i_src_stride,
                       int mvx, int mvy,
                       int i_width, int i_height, const x264_weight_t *weight )
{
    int qpel_idx = ((mvy&3)<<2) + (mvx&3);
    int offset = (mvy>>2)*i_src_stride + (mvx>>2);
    pixel *src1 = src[x264_hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride;

    if( qpel_idx & 5 ) /* qpel interpolation needed */
    {
        pixel *src2 = src[x264_hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);
        pixel_avg( dst, *i_dst_stride, src1, i_src_stride,
                   src2, i_src_stride, i_width, i_height );
        if( weight->weightfn )
            mc_weight( dst, *i_dst_stride, dst, *i_dst_stride, weight, i_width, i_height );
        return dst;
    }
    else if( weight->weightfn )
    {
        mc_weight( dst, *i_dst_stride, src1, i_src_stride, weight, i_width, i_height );
        return dst;
    }
    else
    {
        *i_dst_stride = i_src_stride;
        return src1;
    }
}

Laurent Aimar's avatar
Laurent Aimar committed
248
/* full chroma mc (ie until 1/8 pixel)*/
249 250
static void mc_chroma( pixel *dstu, pixel *dstv, intptr_t i_dst_stride,
                       pixel *src, intptr_t i_src_stride,
251 252
                       int mvx, int mvy,
                       int i_width, int i_height )
Laurent Aimar's avatar
Laurent Aimar committed
253
{
254
    pixel *srcp;
Laurent Aimar's avatar
Laurent Aimar committed
255

256 257 258 259 260 261
    int d8x = mvx&0x07;
    int d8y = mvy&0x07;
    int cA = (8-d8x)*(8-d8y);
    int cB = d8x    *(8-d8y);
    int cC = (8-d8x)*d8y;
    int cD = d8x    *d8y;
Laurent Aimar's avatar
Laurent Aimar committed
262

263
    src += (mvy >> 3) * i_src_stride + (mvx >> 3)*2;
Laurent Aimar's avatar
Laurent Aimar committed
264 265
    srcp = &src[i_src_stride];

266
    for( int y = 0; y < i_height; y++ )
Laurent Aimar's avatar
Laurent Aimar committed
267
    {
268
        for( int x = 0; x < i_width; x++ )
269 270 271 272 273 274 275 276
        {
            dstu[x] = ( cA*src[2*x]  + cB*src[2*x+2] +
                        cC*srcp[2*x] + cD*srcp[2*x+2] + 32 ) >> 6;
            dstv[x] = ( cA*src[2*x+1]  + cB*src[2*x+3] +
                        cC*srcp[2*x+1] + cD*srcp[2*x+3] + 32 ) >> 6;
        }
        dstu += i_dst_stride;
        dstv += i_dst_stride;
Laurent Aimar's avatar
Laurent Aimar committed
277 278 279 280 281
        src   = srcp;
        srcp += i_src_stride;
    }
}

Loren Merritt's avatar
Loren Merritt committed
282
#define MC_COPY(W) \
283
static void mc_copy_w##W( pixel *dst, intptr_t i_dst, pixel *src, intptr_t i_src, int i_height ) \
Loren Merritt's avatar
Loren Merritt committed
284 285 286 287 288 289 290
{ \
    mc_copy( src, i_src, dst, i_dst, W, i_height ); \
}
MC_COPY( 16 )
MC_COPY( 8 )
MC_COPY( 4 )

291 292
/* Copies a w x h pixel plane from src to dst, one memcpy per row.
 * i_dst / i_src are the row strides in pixels. */
void x264_plane_copy_c( pixel *dst, intptr_t i_dst,
                        pixel *src, intptr_t i_src, int w, int h )
{
    while( h-- )
    {
        memcpy( dst, src, w * sizeof(pixel) );
        dst += i_dst;
        src += i_src;
    }
}

Xiaolei Yu's avatar
Xiaolei Yu committed
302 303 304 305 306 307 308 309 310 311 312
/* Copies a plane of interleaved sample pairs while exchanging the two
 * samples of every pair. w is the number of pairs per row (2*w pixels are
 * read and written), h the number of rows; i_dst / i_src are row strides
 * in pixels. */
void x264_plane_copy_swap_c( pixel *dst, intptr_t i_dst,
                             pixel *src, intptr_t i_src, int w, int h )
{
    for( int row = 0; row < h; row++ )
    {
        for( int pair = 0; pair < w; pair++ )
        {
            /* Store order is kept as-is: first element, then second. */
            dst[2*pair]   = src[2*pair+1];
            dst[2*pair+1] = src[2*pair];
        }
        dst += i_dst;
        src += i_src;
    }
}

313 314 315
/* Interleaves two w x h planes into one: dst[2*x] = srcu[x],
 * dst[2*x+1] = srcv[x]. Each buffer has its own row stride in pixels. */
void x264_plane_copy_interleave_c( pixel *dst,  intptr_t i_dst,
                                   pixel *srcu, intptr_t i_srcu,
                                   pixel *srcv, intptr_t i_srcv, int w, int h )
{
    for( int y=0; y<h; y++, dst+=i_dst, srcu+=i_srcu, srcv+=i_srcv )
        for( int x=0; x<w; x++ )
        {
            dst[2*x]   = srcu[x];
            dst[2*x+1] = srcv[x];
        }
}

325 326 327
/* Splits an interleaved plane into two w x h planes: dstu[x] = src[2*x],
 * dstv[x] = src[2*x+1]. Each buffer has its own row stride in pixels. */
static void x264_plane_copy_deinterleave_c( pixel *dstu, intptr_t i_dstu,
                                            pixel *dstv, intptr_t i_dstv,
                                            pixel *src,  intptr_t i_src, int w, int h )
{
    for( int y=0; y<h; y++, dstu+=i_dstu, dstv+=i_dstv, src+=i_src )
        for( int x=0; x<w; x++ )
        {
            dstu[x] = src[2*x];
            dstv[x] = src[2*x+1];
        }
}

337 338 339 340
/* Splits packed pixels into three w x h planes. pw is the distance (in
 * samples) between consecutive packed pixels in src; the first three samples
 * of each packed pixel go to dsta, dstb and dstc respectively, any further
 * samples are skipped. */
static void x264_plane_copy_deinterleave_rgb_c( pixel *dsta, intptr_t i_dsta,
                                                pixel *dstb, intptr_t i_dstb,
                                                pixel *dstc, intptr_t i_dstc,
                                                pixel *src,  intptr_t i_src, int pw, int w, int h )
{
    for( int y=0; y<h; y++, dsta+=i_dsta, dstb+=i_dstb, dstc+=i_dstc, src+=i_src )
    {
        for( int x=0; x<w; x++ )
        {
            dsta[x] = src[x*pw];
            dstb[x] = src[x*pw+1];
            dstc[x] = src[x*pw+2];
        }
    }
}

James Weaver's avatar
James Weaver committed
353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380
/* Unpacks 10-bit samples stored three to a 32-bit word (bits 0-9, 10-19,
 * 20-29) into separate luma (dsty) and chroma (dstc) rows. Every pair of
 * input words yields three chroma and three luma samples, alternating
 * chroma, luma, chroma / luma, chroma, luma. The inner loop advances in
 * steps of 3 up to w. i_dsty / i_dstc are row strides in pixels, i_src is
 * the row stride in 32-bit words. */
void x264_plane_copy_deinterleave_v210_c( pixel *dsty, intptr_t i_dsty,
                                          pixel *dstc, intptr_t i_dstc,
                                          uint32_t *src, intptr_t i_src, int w, int h )
{
    for( int row = 0; row < h; row++, dsty += i_dsty, dstc += i_dstc, src += i_src )
    {
        pixel *luma = dsty;
        pixel *chroma = dstc;
        uint32_t *word = src;

        for( int n = 0; n < w; n += 3, luma += 3, chroma += 3, word += 2 )
        {
            chroma[0] =   word[0]         & 0x03FF;
            luma[0]   = ( word[0] >> 10 ) & 0x03FF;
            chroma[1] = ( word[0] >> 20 ) & 0x03FF;
            luma[1]   =   word[1]         & 0x03FF;
            chroma[2] = ( word[1] >> 10 ) & 0x03FF;
            luma[2]   = ( word[1] >> 20 ) & 0x03FF;
        }
    }
}

381
/* Interleaves two 8-pixel-wide chroma blocks (srcu, srcv; both with row
 * stride FDEC_STRIDE) into dst as U,V pairs for `height` rows. */
static void store_interleave_chroma( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height )
{
    for( int y=0; y<height; y++, dst+=i_dst, srcu+=FDEC_STRIDE, srcv+=FDEC_STRIDE )
        for( int x=0; x<8; x++ )
        {
            dst[2*x]   = srcu[x];
            dst[2*x+1] = srcv[x];
        }
}

391
/* Deinterleaves 8 chroma sample pairs per row from src into an
 * FENC_STRIDE-strided buffer: U goes to dst, V to dst+FENC_STRIDE/2
 * (i.e. side by side within the same rows). */
static void load_deinterleave_chroma_fenc( pixel *dst, pixel *src, intptr_t i_src, int height )
{
    x264_plane_copy_deinterleave_c( dst, FENC_STRIDE, dst+FENC_STRIDE/2, FENC_STRIDE, src, i_src, 8, height );
}

396
/* Deinterleaves 8 chroma sample pairs per row from src into an
 * FDEC_STRIDE-strided buffer: U goes to dst, V to dst+FDEC_STRIDE/2
 * (i.e. side by side within the same rows). */
static void load_deinterleave_chroma_fdec( pixel *dst, pixel *src, intptr_t i_src, int height )
{
    x264_plane_copy_deinterleave_c( dst, FDEC_STRIDE, dst+FDEC_STRIDE/2, FDEC_STRIDE, src, i_src, 8, height );
}

401 402
/* No-op default for the prefetch_fenc_420 / prefetch_fenc_422 hooks. */
static void prefetch_fenc_null( pixel *pix_y,  intptr_t stride_y,
                                pixel *pix_uv, intptr_t stride_uv, int mb_x )
{}

405
/* No-op default for the prefetch_ref hook. */
static void prefetch_ref_null( pixel *pix, intptr_t stride, int parity )
{}

408
/* C fallback for the memzero_aligned hook: zeroes n bytes at dst. This
 * version has no alignment requirement of its own. */
static void memzero_aligned( void * dst, size_t n )
{
    memset( dst, 0, n );
}

413
/* Horizontal pass of the 4-wide integral image for one row:
 * sum[x] = pix[x] + pix[x+1] + pix[x+2] + pix[x+3] + sum[x-stride]
 * for x in [0, stride-4), i.e. the 4-pixel row sum accumulated onto the
 * value one row above in `sum`. Results wrap modulo 2^16. */
static void integral_init4h( uint16_t *sum, pixel *pix, intptr_t stride )
{
    int v = pix[0]+pix[1]+pix[2]+pix[3];
    for( int x = 0; x < stride-4; x++ )
    {
        sum[x] = v + sum[x-stride];
        v += pix[x+4] - pix[x]; /* slide the 4-pixel window one to the right */
    }
}

423
/* Horizontal pass of the 8-wide integral image for one row:
 * sum[x] = pix[x] + ... + pix[x+7] + sum[x-stride] for x in [0, stride-8),
 * i.e. the 8-pixel row sum accumulated onto the value one row above in
 * `sum`. Results wrap modulo 2^16. */
static void integral_init8h( uint16_t *sum, pixel *pix, intptr_t stride )
{
    int v = pix[0]+pix[1]+pix[2]+pix[3]+pix[4]+pix[5]+pix[6]+pix[7];
    for( int x = 0; x < stride-8; x++ )
    {
        sum[x] = v + sum[x-stride];
        v += pix[x+8] - pix[x]; /* slide the 8-pixel window one to the right */
    }
}

433
/* Vertical pass following integral_init4h, for x in [0, stride-8):
 *   sum4[x] = sum8[x+4*stride] - sum8[x]
 *   sum8[x] = sum8[x+8*stride] + sum8[x+8*stride+4] - sum8[x] - sum8[x+4]
 * With sum8 holding vertically accumulated 4-wide row sums, the first line
 * yields 4x4 block sums and the second combines two adjacent 4-wide columns
 * over 8 rows into 8x8 block sums (stored in place). Arithmetic wraps
 * modulo 2^16. */
static void integral_init4v( uint16_t *sum8, uint16_t *sum4, intptr_t stride )
{
    for( int x = 0; x < stride-8; x++ )
        sum4[x] = sum8[x+4*stride] - sum8[x];
    for( int x = 0; x < stride-8; x++ )
        sum8[x] = sum8[x+8*stride] + sum8[x+8*stride+4] - sum8[x] - sum8[x+4];
}

441
/* Vertical pass following integral_init8h, for x in [0, stride-8):
 *   sum8[x] = sum8[x+8*stride] - sum8[x]
 * i.e. the difference of the accumulated row sums 8 rows apart, stored in
 * place. Arithmetic wraps modulo 2^16. */
static void integral_init8v( uint16_t *sum8, intptr_t stride )
{
    for( int x = 0; x < stride-8; x++ )
        sum8[x] = sum8[x+8*stride] - sum8[x];
}

Loren Merritt's avatar
Loren Merritt committed
447 448
/* Builds the four low-resolution planes of `frame` from its luma plane and
 * resets the per-frame lookahead caches:
 *  - pads the luma plane by one column and one row,
 *  - runs the (possibly arch-specific) frame_init_lowres_core,
 *  - expands the lowres borders,
 *  - marks cost estimates / row SATDs as unset (-1) and the first lowres MV
 *    of each used (list, distance) slot with the sentinel 0x7FFF. */
void x264_frame_init_lowres( x264_t *h, x264_frame_t *frame )
{
    pixel *src = frame->plane[0];
    int i_stride = frame->i_stride[0];
    int i_height = frame->i_lines[0];
    int i_width  = frame->i_width[0];

    // duplicate last row and column so that their interpolation doesn't have to be special-cased
    for( int y = 0; y < i_height; y++ )
        src[i_width+y*i_stride] = src[i_width-1+y*i_stride];
    memcpy( src+i_stride*i_height, src+i_stride*(i_height-1), (i_width+1) * sizeof(pixel) );
    h->mc.frame_init_lowres_core( src, frame->lowres[0], frame->lowres[1], frame->lowres[2], frame->lowres[3],
                                  i_stride, frame->i_stride_lowres, frame->i_width_lowres, frame->i_lines_lowres );
    x264_frame_expand_border_lowres( frame );

    memset( frame->i_cost_est, -1, sizeof(frame->i_cost_est) );

    for( int y = 0; y < h->param.i_bframe + 2; y++ )
        for( int x = 0; x < h->param.i_bframe + 2; x++ )
            frame->i_row_satds[y][x][0] = -1;

    /* Second list (y == 1) is only touched when B-frames are enabled. */
    for( int y = 0; y <= !!h->param.i_bframe; y++ )
        for( int x = 0; x <= h->param.i_bframe; x++ )
            frame->lowres_mvs[y][x][0][0] = 0x7FFF;
}

473
/* Downscales src0 by 2 in each direction into four width x height planes
 * that differ by a one-source-pixel shift of the 2x2 averaging window:
 *   dst0 - no shift, dsth - one column right,
 *   dstv - one row down, dstc - one column right and one row down.
 * Reads source rows 2*y .. 2*y+2 and columns up to 2*x+2, so the source needs
 * one extra row and column beyond 2*height x 2*width. The averaging is done
 * as two nested rounded averages to match the assembly versions exactly. */
static void frame_init_lowres_core( pixel *src0, pixel *dst0, pixel *dsth, pixel *dstv, pixel *dstc,
                                    intptr_t src_stride, intptr_t dst_stride, int width, int height )
{
    for( int y = 0; y < height; y++ )
    {
        pixel *src1 = src0+src_stride;
        pixel *src2 = src1+src_stride;
        for( int x = 0; x<width; x++ )
        {
            // slower than naive bilinear, but matches asm
#define FILTER(a,b,c,d) ((((a+b+1)>>1)+((c+d+1)>>1)+1)>>1)
            dst0[x] = FILTER(src0[2*x  ], src1[2*x  ], src0[2*x+1], src1[2*x+1]);
            dsth[x] = FILTER(src0[2*x+1], src1[2*x+1], src0[2*x+2], src1[2*x+2]);
            dstv[x] = FILTER(src1[2*x  ], src2[2*x  ], src1[2*x+1], src2[2*x+1]);
            dstc[x] = FILTER(src1[2*x+1], src2[2*x+1], src1[2*x+2], src2[2*x+2]);
#undef FILTER
        }
        src0 += src_stride*2;
        dst0 += dst_stride;
        dsth += dst_stride;
        dstv += dst_stride;
        dstc += dst_stride;
    }
}

498 499
/* Estimate the total amount of influence on future quality that could be had if we
 * were to improve the reference samples used to inter predict any given macroblock. */
500
static void mbtree_propagate_cost( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs,
501
                                   uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len )
502
{
503
    float fps = *fps_factor;
504
    for( int i = 0; i < len; i++ )
505
    {
506 507 508 509 510 511 512
        int intra_cost = intra_costs[i];
        int inter_cost = X264_MIN(intra_costs[i], inter_costs[i] & LOWRES_COST_MASK);
        float propagate_intra  = intra_cost * inv_qscales[i];
        float propagate_amount = propagate_in[i] + propagate_intra*fps;
        float propagate_num    = intra_cost - inter_cost;
        float propagate_denom  = intra_cost;
        dst[i] = X264_MIN((int)(propagate_amount * propagate_num / propagate_denom + 0.5f), 32767);
513 514 515
    }
}

516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590
/* Distributes the propagate amounts of one macroblock row onto the reference
 * frame's cost map for reference list `list`.
 *
 * For every macroblock i in the row that uses this list (per the list bits
 * stored above LOWRES_COST_SHIFT in lowres_costs[i]), its amount - scaled by
 * bipred_weight/64 when both lists are used - is added to ref_costs at the
 * position its motion vector points to. A non-zero vector generally straddles
 * four macroblocks; the amount is split between them with bilinear weights
 * derived from the low 5 bits of each vector component. Every accumulation
 * saturates at 32767, and targets outside the frame are dropped. */
static void mbtree_propagate_list( x264_t *h, uint16_t *ref_costs, int16_t (*mvs)[2],
                                   int16_t *propagate_amount, uint16_t *lowres_costs,
                                   int bipred_weight, int mb_y, int len, int list )
{
#define CLIP_ADD(s,x) (s) = X264_MIN((s)+(x),(1<<15)-1)
    unsigned mb_stride = h->mb.i_mb_stride;
    unsigned mb_width  = h->mb.i_mb_width;
    unsigned mb_height = h->mb.i_mb_height;

    for( unsigned i = 0; i < len; i++ )
    {
        int lists_used = lowres_costs[i]>>LOWRES_COST_SHIFT;
        if( !(lists_used & (1 << list)) )
            continue;

        /* Apply bipred weighting. */
        int amount = propagate_amount[i];
        if( lists_used == 3 )
            amount = (amount * bipred_weight + 32) >> 6;

        /* Early termination for simple case of mv0. */
        if( !M32( mvs[i] ) )
        {
            CLIP_ADD( ref_costs[mb_y*mb_stride + i], amount );
            continue;
        }

        int mvx = mvs[i][0];
        int mvy = mvs[i][1];
        /* Top-left target macroblock; negative coordinates wrap to huge values. */
        unsigned mbx = (mvx>>5)+i;
        unsigned mby = (mvy>>5)+mb_y;
        unsigned top = mbx + mby * mb_stride;
        unsigned bottom = top + mb_stride;
        int fx = mvx & 31;
        int fy = mvy & 31;
        int w_tl = ((32-fy)*(32-fx) * amount + 512) >> 10;
        int w_tr = ((32-fy)*fx      * amount + 512) >> 10;
        int w_bl = (fy*(32-fx)      * amount + 512) >> 10;
        int w_br = (fy*fx           * amount + 512) >> 10;

        if( mbx < mb_width-1 && mby < mb_height-1 )
        {
            /* All four targets are inside the frame. */
            CLIP_ADD( ref_costs[top+0], w_tl );
            CLIP_ADD( ref_costs[top+1], w_tr );
            CLIP_ADD( ref_costs[bottom+0], w_bl );
            CLIP_ADD( ref_costs[bottom+1], w_br );
            continue;
        }

        /* Note: this takes advantage of unsigned representation to
         * catch negative mbx/mby. */
        int left_ok  = mbx < mb_width;
        int right_ok = mbx+1 < mb_width;
        if( mby < mb_height )
        {
            if( left_ok )
                CLIP_ADD( ref_costs[top+0], w_tl );
            if( right_ok )
                CLIP_ADD( ref_costs[top+1], w_tr );
        }
        if( mby+1 < mb_height )
        {
            if( left_ok )
                CLIP_ADD( ref_costs[bottom+0], w_bl );
            if( right_ok )
                CLIP_ADD( ref_costs[bottom+1], w_br );
        }
    }
#undef CLIP_ADD
}

591
/* Fills the motion-compensation function table `pf`.
 * First every entry is set to the portable C implementation from this file;
 * then, depending on build-time architecture macros, the arch-specific init
 * routine may override entries based on the `cpu` flags.
 * If cpu_independent is non-zero, the two mbtree propagation functions are
 * reset to the C versions afterwards, regardless of what the arch init
 * installed. */
void x264_mc_init( int cpu, x264_mc_functions_t *pf, int cpu_independent )
{
    pf->mc_luma   = mc_luma;
    pf->get_ref   = get_ref;

    pf->mc_chroma = mc_chroma;

    pf->avg[PIXEL_16x16]= pixel_avg_16x16;
    pf->avg[PIXEL_16x8] = pixel_avg_16x8;
    pf->avg[PIXEL_8x16] = pixel_avg_8x16;
    pf->avg[PIXEL_8x8]  = pixel_avg_8x8;
    pf->avg[PIXEL_8x4]  = pixel_avg_8x4;
    pf->avg[PIXEL_4x16] = pixel_avg_4x16;
    pf->avg[PIXEL_4x8]  = pixel_avg_4x8;
    pf->avg[PIXEL_4x4]  = pixel_avg_4x4;
    pf->avg[PIXEL_4x2]  = pixel_avg_4x2;
    pf->avg[PIXEL_2x8]  = pixel_avg_2x8;
    pf->avg[PIXEL_2x4]  = pixel_avg_2x4;
    pf->avg[PIXEL_2x2]  = pixel_avg_2x2;

    /* The C code uses the same generic table for all three weight variants. */
    pf->weight    = x264_mc_weight_wtab;
    pf->offsetadd = x264_mc_weight_wtab;
    pf->offsetsub = x264_mc_weight_wtab;
    pf->weight_cache = x264_weight_cache;

    pf->copy_16x16_unaligned = mc_copy_w16;
    pf->copy[PIXEL_16x16] = mc_copy_w16;
    pf->copy[PIXEL_8x8]   = mc_copy_w8;
    pf->copy[PIXEL_4x4]   = mc_copy_w4;

    pf->store_interleave_chroma       = store_interleave_chroma;
    pf->load_deinterleave_chroma_fenc = load_deinterleave_chroma_fenc;
    pf->load_deinterleave_chroma_fdec = load_deinterleave_chroma_fdec;

    pf->plane_copy = x264_plane_copy_c;
    pf->plane_copy_swap = x264_plane_copy_swap_c;
    pf->plane_copy_interleave = x264_plane_copy_interleave_c;
    pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_c;
    pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_c;
    pf->plane_copy_deinterleave_v210 = x264_plane_copy_deinterleave_v210_c;

    pf->hpel_filter = hpel_filter;

    pf->prefetch_fenc_420 = prefetch_fenc_null;
    pf->prefetch_fenc_422 = prefetch_fenc_null;
    pf->prefetch_ref  = prefetch_ref_null;
    pf->memcpy_aligned = memcpy;
    pf->memzero_aligned = memzero_aligned;
    pf->frame_init_lowres_core = frame_init_lowres_core;

    pf->integral_init4h = integral_init4h;
    pf->integral_init8h = integral_init8h;
    pf->integral_init4v = integral_init4v;
    pf->integral_init8v = integral_init8v;

    pf->mbtree_propagate_cost = mbtree_propagate_cost;
    pf->mbtree_propagate_list = mbtree_propagate_list;

#if HAVE_MMX
    x264_mc_init_mmx( cpu, pf );
#endif
#if HAVE_ALTIVEC
    if( cpu&X264_CPU_ALTIVEC )
        x264_mc_altivec_init( pf );
#endif
#if HAVE_ARMV6
    x264_mc_init_arm( cpu, pf );
#endif
#if ARCH_AARCH64
    x264_mc_init_aarch64( cpu, pf );
#endif

    if( cpu_independent )
    {
        /* Undo any arch-specific override of the mbtree functions. */
        pf->mbtree_propagate_cost = mbtree_propagate_cost;
        pf->mbtree_propagate_list = mbtree_propagate_list;
    }
}

670
/* Generates the half-pel planes and (if allocated) the integral image for
 * the part of `frame` that became available with macroblock row mb_y.
 * b_end is non-zero for the final call, which extends the range to the
 * bottom of the frame. With interlacing enabled, odd mb_y values are skipped
 * and the field planes are filtered with a doubled stride; the frame planes
 * are filtered only when not interlaced or when adaptive MBAFF is on.
 * The luma plane, or all three planes for CHROMA444, is processed. */
void x264_frame_filter( x264_t *h, x264_frame_t *frame, int mb_y, int b_end )
{
    const int b_interlaced = PARAM_INTERLACED;
    int start = mb_y*16 - 8; // buffer = 4 for deblock + 3 for 6tap, rounded to 8
    int height = (b_end ? frame->i_lines[0] + 16*PARAM_INTERLACED : (mb_y+b_interlaced)*16) + 8;

    if( mb_y & b_interlaced )
        return;

    for( int p = 0; p < (CHROMA444 ? 3 : 1); p++ )
    {
        int stride = frame->i_stride[p];
        const int width = frame->i_width[p];
        int offs = start*stride - 8; // buffer = 3 for 6tap, aligned to 8 for simd

        if( !b_interlaced || h->mb.b_adaptive_mbaff )
            h->mc.hpel_filter(
                frame->filtered[p][1] + offs,
                frame->filtered[p][2] + offs,
                frame->filtered[p][3] + offs,
                frame->plane[p] + offs,
                stride, width + 16, height - start,
                h->scratch_buffer );

        if( b_interlaced )
        {
            /* MC must happen between pixels in the same field. */
            stride = frame->i_stride[p] << 1;
            /* NOTE(review): `start` is reassigned here and the new value is
             * what later planes and the integral section below see - confirm
             * this carry-over is intended. */
            start = (mb_y*16 >> 1) - 8;
            int height_fld = ((b_end ? frame->i_lines[p] : mb_y*16) >> 1) + 8;
            offs = start*stride - 8;
            /* One pass per field; the second field starts one frame row lower. */
            for( int i = 0; i < 2; i++, offs += frame->i_stride[p] )
            {
                h->mc.hpel_filter(
                    frame->filtered_fld[p][1] + offs,
                    frame->filtered_fld[p][2] + offs,
                    frame->filtered_fld[p][3] + offs,
                    frame->plane_fld[p] + offs,
                    stride, width + 16, height_fld - start,
                    h->scratch_buffer );
            }
        }
    }

    /* generate integral image:
     * frame->integral contains 2 planes. in the upper plane, each element is
     * the sum of an 8x8 pixel region with top-left corner on that point.
     * in the lower plane, 4x4 sums (needed only with --partitions p4x4). */

    if( frame->integral )
    {
        int stride = frame->i_stride[0];
        if( start < 0 )
        {
            /* First call: zero the row above the padded area so the horizontal
             * pass has a defined "previous row" to accumulate onto. */
            memset( frame->integral - PADV * stride - PADH, 0, stride * sizeof(uint16_t) );
            start = -PADV;
        }
        if( b_end )
            height += PADV-9;
        for( int y = start; y < height; y++ )
        {
            pixel    *pix  = frame->plane[0] + y * stride - PADH;
            uint16_t *sum8 = frame->integral + (y+1) * stride - PADH;
            uint16_t *sum4;
            if( h->frames.b_have_sub8x8_esa )
            {
                h->mc.integral_init4h( sum8, pix, stride );
                sum8 -= 8*stride;
                sum4 = sum8 + stride * (frame->i_lines[0] + PADV*2);
                /* The vertical pass needs 8 accumulated rows above it. */
                if( y >= 8-PADV )
                    h->mc.integral_init4v( sum8, sum4, stride );
            }
            else
            {
                h->mc.integral_init8h( sum8, pix, stride );
                if( y >= 8-PADV )
                    h->mc.integral_init8v( sum8-8*stride, stride );
            }
        }
    }
}