/*****************************************************************************
 * mc.c: motion compensation
 *****************************************************************************
 * Copyright (C) 2003-2014 x264 project
 *
 * Authors: Laurent Aimar <fenrir@via.ecp.fr>
 *          Loren Merritt <lorenm@u.washington.edu>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

Loren Merritt's avatar
Loren Merritt committed
27
#include "common.h"
Laurent Aimar's avatar
Laurent Aimar committed
28

Steven Walters's avatar
Steven Walters committed
29
#if HAVE_MMX
30
#include "x86/mc.h"
Laurent Aimar's avatar
Laurent Aimar committed
31
#endif
Steven Walters's avatar
Steven Walters committed
32
#if ARCH_PPC
33
#include "ppc/mc.h"
Laurent Aimar's avatar
Laurent Aimar committed
34
#endif
Steven Walters's avatar
Steven Walters committed
35
#if ARCH_ARM
36 37
#include "arm/mc.h"
#endif
38 39 40
#if ARCH_AARCH64
#include "aarch64/mc.h"
#endif
Laurent Aimar's avatar
Laurent Aimar committed
41 42


43 44 45
/* Rounded average of two pixel blocks.
 * For an i_width x i_height area writes dst[x] = (src1[x] + src2[x] + 1) >> 1.
 * Each pointer is advanced by its own stride (counted in pixels, not bytes)
 * after every row. */
static inline void pixel_avg( pixel *dst,  intptr_t i_dst_stride,
                              pixel *src1, intptr_t i_src1_stride,
                              pixel *src2, intptr_t i_src2_stride, int i_width, int i_height )
{
    for( int y = 0; y < i_height; y++ )
    {
        for( int x = 0; x < i_width; x++ )
            dst[x] = ( src1[x] + src2[x] + 1 ) >> 1;
        dst  += i_dst_stride;
        src1 += i_src1_stride;
        src2 += i_src2_stride;
    }
}

57 58 59
/* Rounded average of two width x height pixel blocks:
 * dst[x] = (src1[x] + src2[x] + 1) >> 1.
 * i_dst / i_src1 / i_src2 are the row strides of the respective buffers,
 * counted in pixels. Used by the fixed-size PIXEL_AVG_C wrappers when both
 * inputs carry equal weight. */
static inline void pixel_avg_wxh( pixel *dst,  intptr_t i_dst,
                                  pixel *src1, intptr_t i_src1,
                                  pixel *src2, intptr_t i_src2, int width, int height )
{
    for( int y = 0; y < height; y++ )
    {
        for( int x = 0; x < width; x++ )
            dst[x] = ( src1[x] + src2[x] + 1 ) >> 1;
        src1 += i_src1;
        src2 += i_src2;
        dst += i_dst;
    }
}

/* Implicit weighted bipred only:
 * assumes log2_denom = 5, offset = 0, weight1 + weight2 = 64 */
73 74 75
static inline void pixel_avg_weight_wxh( pixel *dst,  intptr_t i_dst,
                                         pixel *src1, intptr_t i_src1,
                                         pixel *src2, intptr_t i_src2, int width, int height, int i_weight1 )
76
{
77
    int i_weight2 = 64 - i_weight1;
78
    for( int y = 0; y<height; y++, dst += i_dst, src1 += i_src1, src2 += i_src2 )
79 80
        for( int x = 0; x<width; x++ )
            dst[x] = x264_clip_pixel( (src1[x]*i_weight1 + src2[x]*i_weight2 + (1<<5)) >> 6 );
81
}
Fiona Glaser's avatar
Fiona Glaser committed
82
#undef op_scale2
83

Fiona Glaser's avatar
Fiona Glaser committed
84
/* Generates a fixed-size averaging function `name` for a width x height block.
 * pix1 is the destination; pix2 and pix3 are the two inputs.
 * weight == 32 is the equal-weight case (32 out of 64) and takes the plain
 * rounded average; any other value is applied to pix2, with 64-weight
 * applied to pix3, through pixel_avg_weight_wxh. */
#define PIXEL_AVG_C( name, width, height ) \
static void name( pixel *pix1, intptr_t i_stride_pix1, \
                  pixel *pix2, intptr_t i_stride_pix2, \
                  pixel *pix3, intptr_t i_stride_pix3, int weight ) \
{ \
    if( weight == 32 ) \
        pixel_avg_wxh( pix1, i_stride_pix1, pix2, i_stride_pix2, pix3, i_stride_pix3, width, height ); \
    else \
        pixel_avg_weight_wxh( pix1, i_stride_pix1, pix2, i_stride_pix2, pix3, i_stride_pix3, width, height, weight ); \
}
PIXEL_AVG_C( pixel_avg_16x16, 16, 16 )
PIXEL_AVG_C( pixel_avg_16x8,  16, 8 )
PIXEL_AVG_C( pixel_avg_8x16,  8, 16 )
PIXEL_AVG_C( pixel_avg_8x8,   8, 8 )
PIXEL_AVG_C( pixel_avg_8x4,   8, 4 )
PIXEL_AVG_C( pixel_avg_4x16,  4, 16 )
PIXEL_AVG_C( pixel_avg_4x8,   4, 8 )
PIXEL_AVG_C( pixel_avg_4x4,   4, 4 )
PIXEL_AVG_C( pixel_avg_4x2,   4, 2 )
PIXEL_AVG_C( pixel_avg_2x8,   2, 8 )
PIXEL_AVG_C( pixel_avg_2x4,   2, 4 )
PIXEL_AVG_C( pixel_avg_2x2,   2, 2 )
106

Dylan Yudaken's avatar
Dylan Yudaken committed
107 108 109 110
/* C version of the weight_cache hook: points the weight descriptor's
 * function table at the encoder's currently selected weight functions
 * (h->mc.weight). No per-weight data is precomputed here. */
static void x264_weight_cache( x264_t *h, x264_weight_t *w )
{
    w->weightfn = h->mc.weight;
}
111 112
#define opscale(x) dst[x] = x264_clip_pixel( ((src[x] * scale + (1<<(denom - 1))) >> denom) + offset )
#define opscale_noden(x) dst[x] = x264_clip_pixel( src[x] * scale + offset )
113 114
static void mc_weight( pixel *dst, intptr_t i_dst_stride, pixel *src, intptr_t i_src_stride,
                       const x264_weight_t *weight, int i_width, int i_height )
Dylan Yudaken's avatar
Dylan Yudaken committed
115
{
116 117 118 119
    int offset = weight->i_offset << (BIT_DEPTH-8);
    int scale = weight->i_scale;
    int denom = weight->i_denom;
    if( denom >= 1 )
Dylan Yudaken's avatar
Dylan Yudaken committed
120
    {
121 122
        for( int y = 0; y < i_height; y++, dst += i_dst_stride, src += i_src_stride )
            for( int x = 0; x < i_width; x++ )
Dylan Yudaken's avatar
Dylan Yudaken committed
123 124 125 126
                opscale( x );
    }
    else
    {
127 128
        for( int y = 0; y < i_height; y++, dst += i_dst_stride, src += i_src_stride )
            for( int x = 0; x < i_width; x++ )
Dylan Yudaken's avatar
Dylan Yudaken committed
129 130 131 132
                opscale_noden( x );
    }
}

133
#define MC_WEIGHT_C( name, width ) \
134
    static void name( pixel *dst, intptr_t i_dst_stride, pixel *src, intptr_t i_src_stride, const x264_weight_t *weight, int height ) \
Dylan Yudaken's avatar
Dylan Yudaken committed
135
{ \
136
    mc_weight( dst, i_dst_stride, src, i_src_stride, weight, width, height );\
Dylan Yudaken's avatar
Dylan Yudaken committed
137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154
}

MC_WEIGHT_C( mc_weight_w20, 20 )
MC_WEIGHT_C( mc_weight_w16, 16 )
MC_WEIGHT_C( mc_weight_w12, 12 )
MC_WEIGHT_C( mc_weight_w8,   8 )
MC_WEIGHT_C( mc_weight_w4,   4 )
MC_WEIGHT_C( mc_weight_w2,   2 )

static weight_fn_t x264_mc_weight_wtab[6] =
{
    mc_weight_w2,
    mc_weight_w4,
    mc_weight_w8,
    mc_weight_w12,
    mc_weight_w16,
    mc_weight_w20,
};
Anton Mitrofanov's avatar
Anton Mitrofanov committed
155
const x264_weight_t x264_weight_none[3] = { {{0}} };
156
/* Copies an i_width x i_height block of pixels row by row.
 * Note the argument order: source first, destination second.
 * Strides are counted in pixels; rows are copied with memcpy, so the source
 * and destination rows must not overlap. */
static void mc_copy( pixel *src, intptr_t i_src_stride, pixel *dst, intptr_t i_dst_stride, int i_width, int i_height )
{
    for( int y = 0; y < i_height; y++ )
    {
        memcpy( dst, src, i_width * sizeof(pixel) );

        src += i_src_stride;
        dst += i_dst_stride;
    }
}

Loren Merritt's avatar
Loren Merritt committed
167
#define TAPFILTER(pix, d) ((pix)[x-2*d] + (pix)[x+3*d] - 5*((pix)[x-d] + (pix)[x+2*d]) + 20*((pix)[x] + (pix)[x+d]))
168
static void hpel_filter( pixel *dsth, pixel *dstv, pixel *dstc, pixel *src,
169
                         intptr_t stride, int width, int height, int16_t *buf )
Laurent Aimar's avatar
Laurent Aimar committed
170
{
171
    const int pad = (BIT_DEPTH > 9) ? (-10 * PIXEL_MAX) : 0;
172
    for( int y = 0; y < height; y++ )
Laurent Aimar's avatar
Laurent Aimar committed
173
    {
174
        for( int x = -2; x < width+3; x++ )
Laurent Aimar's avatar
Laurent Aimar committed
175
        {
Loren Merritt's avatar
Loren Merritt committed
176
            int v = TAPFILTER(src,stride);
177
            dstv[x] = x264_clip_pixel( (v + 16) >> 5 );
178 179
            /* transform v for storage in a 16-bit integer */
            buf[x+2] = v + pad;
Laurent Aimar's avatar
Laurent Aimar committed
180
        }
181
        for( int x = 0; x < width; x++ )
182
            dstc[x] = x264_clip_pixel( (TAPFILTER(buf+2,1) - 32*pad + 512) >> 10 );
183
        for( int x = 0; x < width; x++ )
184
            dsth[x] = x264_clip_pixel( (TAPFILTER(src,1) + 16) >> 5 );
Loren Merritt's avatar
Loren Merritt committed
185 186 187 188
        dsth += stride;
        dstv += stride;
        dstc += stride;
        src += stride;
Laurent Aimar's avatar
Laurent Aimar committed
189
    }
190 191
}

192 193
/* Plane selectors for quarter-pel luma prediction, indexed by
 * qpel_idx = ((mvy&3)<<2) + (mvx&3). hpel_ref0 picks the first plane out of
 * src[0..3]; hpel_ref1 picks the second plane that is averaged with it when
 * qpel interpolation is needed.
 * NOTE(review): presumably 0 = full-pel, 1 = h, 2 = v, 3 = centre half-pel
 * plane, matching the order hpel_filter's outputs are stored - confirm. */
static const uint8_t hpel_ref0[16] = {0,1,1,1,0,1,1,1,2,3,3,3,0,1,1,1};
static const uint8_t hpel_ref1[16] = {0,0,0,0,2,2,3,2,2,2,3,2,2,2,3,2};
Loren Merritt's avatar
Loren Merritt committed
194

195 196
/* Luma motion compensation into dst.
 * mvx/mvy are in quarter-pel units: the low two bits select the fractional
 * position, the remaining bits the integer offset into the planes of src[].
 * Quarter-pel positions are produced by averaging two planes chosen through
 * hpel_ref0/hpel_ref1. If weight->weightfn is set, explicit weighting is
 * applied to the result. Otherwise the block is averaged or copied as is. */
static void mc_luma( pixel *dst,    intptr_t i_dst_stride,
                     pixel *src[4], intptr_t i_src_stride,
                     int mvx, int mvy,
                     int i_width, int i_height, const x264_weight_t *weight )
{
    int qpel_idx = ((mvy&3)<<2) + (mvx&3);
    int offset = (mvy>>2)*i_src_stride + (mvx>>2);
    /* A vertical fraction of 3/4 takes its first sample from the next row. */
    pixel *src1 = src[hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride;

    /* Bits 0 and 2 are the lowest fraction bit of mvx and mvy respectively. */
    if( qpel_idx & 5 ) /* qpel interpolation needed */
    {
        /* A horizontal fraction of 3/4 takes its second sample from the next column. */
        pixel *src2 = src[hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);
        pixel_avg( dst, i_dst_stride, src1, i_src_stride,
                   src2, i_src_stride, i_width, i_height );
        if( weight->weightfn )
            mc_weight( dst, i_dst_stride, dst, i_dst_stride, weight, i_width, i_height );
    }
    else if( weight->weightfn )
        mc_weight( dst, i_dst_stride, src1, i_src_stride, weight, i_width, i_height );
    else
        mc_copy( src1, i_src_stride, dst, i_dst_stride, i_width, i_height );
}

218 219
/* Like mc_luma, but avoids the copy when no processing is needed.
 * Returns a pointer to the predicted block:
 *   - dst, when quarter-pel averaging and/or weighting had to be written
 *     (*i_dst_stride is left untouched);
 *   - a pointer straight into the selected source plane otherwise, in which
 *     case *i_dst_stride is overwritten with i_src_stride.
 * Callers must therefore use the returned pointer with *i_dst_stride. */
static pixel *get_ref( pixel *dst,   intptr_t *i_dst_stride,
                       pixel *src[4], intptr_t i_src_stride,
                       int mvx, int mvy,
                       int i_width, int i_height, const x264_weight_t *weight )
{
    int qpel_idx = ((mvy&3)<<2) + (mvx&3);
    int offset = (mvy>>2)*i_src_stride + (mvx>>2);
    pixel *src1 = src[hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride;

    if( qpel_idx & 5 ) /* qpel interpolation needed */
    {
        pixel *src2 = src[hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);
        pixel_avg( dst, *i_dst_stride, src1, i_src_stride,
                   src2, i_src_stride, i_width, i_height );
        if( weight->weightfn )
            mc_weight( dst, *i_dst_stride, dst, *i_dst_stride, weight, i_width, i_height );
        return dst;
    }
    else if( weight->weightfn )
    {
        mc_weight( dst, *i_dst_stride, src1, i_src_stride, weight, i_width, i_height );
        return dst;
    }
    else
    {
        *i_dst_stride = i_src_stride;
        return src1;
    }
}

Laurent Aimar's avatar
Laurent Aimar committed
248
/* full chroma mc (ie until 1/8 pixel)*/
249 250
static void mc_chroma( pixel *dstu, pixel *dstv, intptr_t i_dst_stride,
                       pixel *src, intptr_t i_src_stride,
251 252
                       int mvx, int mvy,
                       int i_width, int i_height )
Laurent Aimar's avatar
Laurent Aimar committed
253
{
254
    pixel *srcp;
Laurent Aimar's avatar
Laurent Aimar committed
255

256 257 258 259 260 261
    int d8x = mvx&0x07;
    int d8y = mvy&0x07;
    int cA = (8-d8x)*(8-d8y);
    int cB = d8x    *(8-d8y);
    int cC = (8-d8x)*d8y;
    int cD = d8x    *d8y;
Laurent Aimar's avatar
Laurent Aimar committed
262

263
    src += (mvy >> 3) * i_src_stride + (mvx >> 3)*2;
Laurent Aimar's avatar
Laurent Aimar committed
264 265
    srcp = &src[i_src_stride];

266
    for( int y = 0; y < i_height; y++ )
Laurent Aimar's avatar
Laurent Aimar committed
267
    {
268
        for( int x = 0; x < i_width; x++ )
269 270 271 272 273 274 275 276
        {
            dstu[x] = ( cA*src[2*x]  + cB*src[2*x+2] +
                        cC*srcp[2*x] + cD*srcp[2*x+2] + 32 ) >> 6;
            dstv[x] = ( cA*src[2*x+1]  + cB*src[2*x+3] +
                        cC*srcp[2*x+1] + cD*srcp[2*x+3] + 32 ) >> 6;
        }
        dstu += i_dst_stride;
        dstv += i_dst_stride;
Laurent Aimar's avatar
Laurent Aimar committed
277 278 279 280 281
        src   = srcp;
        srcp += i_src_stride;
    }
}

Loren Merritt's avatar
Loren Merritt committed
282
#define MC_COPY(W) \
283
static void mc_copy_w##W( pixel *dst, intptr_t i_dst, pixel *src, intptr_t i_src, int i_height ) \
Loren Merritt's avatar
Loren Merritt committed
284 285 286 287 288 289 290
{ \
    mc_copy( src, i_src, dst, i_dst, W, i_height ); \
}
MC_COPY( 16 )
MC_COPY( 8 )
MC_COPY( 4 )

291 292
/* Copies h rows of w pixels from src to dst.
 * i_dst / i_src are the row strides in pixels. Rows are copied with memcpy,
 * so source and destination rows must not overlap. */
void x264_plane_copy_c( pixel *dst, intptr_t i_dst,
                        pixel *src, intptr_t i_src, int w, int h )
{
    while( h-- )
    {
        memcpy( dst, src, w * sizeof(pixel) );
        dst += i_dst;
        src += i_src;
    }
}

302 303 304
/* Interleaves two planes into one: for each of h rows and w positions,
 * dst[2*x] = srcu[x] and dst[2*x+1] = srcv[x]. A destination row therefore
 * holds 2*w pixels. Strides are counted in pixels. */
void x264_plane_copy_interleave_c( pixel *dst,  intptr_t i_dst,
                                   pixel *srcu, intptr_t i_srcu,
                                   pixel *srcv, intptr_t i_srcv, int w, int h )
{
    for( int y=0; y<h; y++, dst+=i_dst, srcu+=i_srcu, srcv+=i_srcv )
        for( int x=0; x<w; x++ )
        {
            dst[2*x]   = srcu[x];
            dst[2*x+1] = srcv[x];
        }
}

314 315 316
/* Splits an interleaved plane into two: for each of h rows and w positions,
 * dstu[x] = src[2*x] and dstv[x] = src[2*x+1]. A source row therefore holds
 * 2*w pixels. Strides are counted in pixels. */
static void x264_plane_copy_deinterleave_c( pixel *dstu, intptr_t i_dstu,
                                            pixel *dstv, intptr_t i_dstv,
                                            pixel *src,  intptr_t i_src, int w, int h )
{
    for( int y=0; y<h; y++, dstu+=i_dstu, dstv+=i_dstv, src+=i_src )
        for( int x=0; x<w; x++ )
        {
            dstu[x] = src[2*x];
            dstv[x] = src[2*x+1];
        }
}

326 327 328 329
/* Splits packed pixels into three planes.
 * pw is the number of samples per packed source pixel. The first three
 * samples of each packed pixel go to dsta, dstb and dstc respectively, and
 * any further samples (pw > 3) are skipped. w and h are counted in output
 * pixels. Strides are counted in samples. */
static void x264_plane_copy_deinterleave_rgb_c( pixel *dsta, intptr_t i_dsta,
                                                pixel *dstb, intptr_t i_dstb,
                                                pixel *dstc, intptr_t i_dstc,
                                                pixel *src,  intptr_t i_src, int pw, int w, int h )
{
    for( int y=0; y<h; y++, dsta+=i_dsta, dstb+=i_dstb, dstc+=i_dstc, src+=i_src )
    {
        for( int x=0; x<w; x++ )
        {
            dsta[x] = src[x*pw];
            dstb[x] = src[x*pw+1];
            dstc[x] = src[x*pw+2];
        }
    }
}

James Weaver's avatar
James Weaver committed
342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369
/* Unpacks 10-bit samples from 32-bit words into a luma plane (dsty) and a
 * chroma plane (dstc). Each word carries three 10-bit fields at bits 0, 10
 * and 20. A pair of words yields three luma and three chroma samples in the
 * order C Y C / Y C Y. w is counted in luma samples and consumed three at a
 * time, so a width that is not a multiple of 3 is rounded up. i_dsty and
 * i_dstc are strides in pixels, i_src is a stride in 32-bit words. */
void x264_plane_copy_deinterleave_v210_c( pixel *dsty, intptr_t i_dsty,
                                          pixel *dstc, intptr_t i_dstc,
                                          uint32_t *src, intptr_t i_src, int w, int h )
{
    for( int row = 0; row < h; row++, dsty += i_dsty, dstc += i_dstc, src += i_src )
    {
        pixel *luma = dsty;
        pixel *chroma = dstc;
        uint32_t *in = src;

        for( int n = 0; n < w; n += 3, luma += 3, chroma += 3, in += 2 )
        {
            chroma[0] = in[0] & 0x03FF;
            luma[0]   = ( in[0] >> 10 ) & 0x03FF;
            chroma[1] = ( in[0] >> 20 ) & 0x03FF;
            luma[1]   = in[1] & 0x03FF;
            chroma[2] = ( in[1] >> 10 ) & 0x03FF;
            luma[2]   = ( in[1] >> 20 ) & 0x03FF;
        }
    }
}

370
/* Writes `height` rows of 8 U and 8 V pixels as 16 interleaved pixels per
 * row into dst (stride i_dst). Both sources are read with a row stride of
 * FDEC_STRIDE. */
static void store_interleave_chroma( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height )
{
    for( int y=0; y<height; y++, dst+=i_dst, srcu+=FDEC_STRIDE, srcv+=FDEC_STRIDE )
        for( int x=0; x<8; x++ )
        {
            dst[2*x]   = srcu[x];
            dst[2*x+1] = srcv[x];
        }
}

380
/* Deinterleaves `height` rows of 8 chroma pairs from src (stride i_src) into
 * a buffer with row stride FENC_STRIDE: even samples land at dst, odd
 * samples at dst + FENC_STRIDE/2, i.e. side by side within each row. */
static void load_deinterleave_chroma_fenc( pixel *dst, pixel *src, intptr_t i_src, int height )
{
    x264_plane_copy_deinterleave_c( dst, FENC_STRIDE, dst+FENC_STRIDE/2, FENC_STRIDE, src, i_src, 8, height );
}

385
/* Deinterleaves `height` rows of 8 chroma pairs from src (stride i_src) into
 * a buffer with row stride FDEC_STRIDE: even samples land at dst, odd
 * samples at dst + FDEC_STRIDE/2, i.e. side by side within each row. */
static void load_deinterleave_chroma_fdec( pixel *dst, pixel *src, intptr_t i_src, int height )
{
    x264_plane_copy_deinterleave_c( dst, FDEC_STRIDE, dst+FDEC_STRIDE/2, FDEC_STRIDE, src, i_src, 8, height );
}

390 391
/* No-op stand-in for the prefetch_fenc hooks in the C function table. */
static void prefetch_fenc_null( pixel *pix_y,  intptr_t stride_y,
                                pixel *pix_uv, intptr_t stride_uv, int mb_x )
{}

394
/* No-op stand-in for the prefetch_ref hook in the C function table. */
static void prefetch_ref_null( pixel *pix, intptr_t stride, int parity )
{}

397
/* C version of the memzero_aligned hook: clears n bytes starting at dst.
 * This plain memset has no alignment requirement of its own. */
static void memzero_aligned( void * dst, size_t n )
{
    memset( dst, 0, n );
}

402
/* Horizontal pass of the integral image for 4-wide sums.
 * For x in [0, stride-4): sum[x] = pix[x] + ... + pix[x+3] + sum[x-stride],
 * i.e. the 4-pixel row sum accumulated onto the entry one row above.
 * The row above sum (sum - stride) must already be valid. Results are stored
 * as uint16_t and wrap modulo 2^16. */
static void integral_init4h( uint16_t *sum, pixel *pix, intptr_t stride )
{
    int v = pix[0]+pix[1]+pix[2]+pix[3];
    for( int x = 0; x < stride-4; x++ )
    {
        sum[x] = v + sum[x-stride];
        /* slide the 4-pixel window one position to the right */
        v += pix[x+4] - pix[x];
    }
}

412
/* Horizontal pass of the integral image for 8-wide sums.
 * For x in [0, stride-8): sum[x] = pix[x] + ... + pix[x+7] + sum[x-stride],
 * i.e. the 8-pixel row sum accumulated onto the entry one row above.
 * The row above sum (sum - stride) must already be valid. Results are stored
 * as uint16_t and wrap modulo 2^16. */
static void integral_init8h( uint16_t *sum, pixel *pix, intptr_t stride )
{
    int v = pix[0]+pix[1]+pix[2]+pix[3]+pix[4]+pix[5]+pix[6]+pix[7];
    for( int x = 0; x < stride-8; x++ )
    {
        sum[x] = v + sum[x-stride];
        /* slide the 8-pixel window one position to the right */
        v += pix[x+8] - pix[x];
    }
}

422
/* Vertical pass for the 4-wide integral data, one row at a time.
 * On entry sum8 holds rows of accumulated 4-wide row sums.
 *   sum4[x] = sum8[x+4*stride] - sum8[x]                 (a 4x4 sum)
 *   sum8[x] = sum8[x+8*stride] + sum8[x+8*stride+4]
 *           - sum8[x]          - sum8[x+4]               (an 8x8 sum)
 * for x in [0, stride-8). sum4 is computed first because the second loop
 * overwrites sum8 in place. Arithmetic wraps modulo 2^16. */
static void integral_init4v( uint16_t *sum8, uint16_t *sum4, intptr_t stride )
{
    for( int x = 0; x < stride-8; x++ )
        sum4[x] = sum8[x+4*stride] - sum8[x];
    for( int x = 0; x < stride-8; x++ )
        sum8[x] = sum8[x+8*stride] + sum8[x+8*stride+4] - sum8[x] - sum8[x+4];
}

430
/* Vertical pass for the 8-wide integral data, one row at a time:
 * sum8[x] = sum8[x+8*stride] - sum8[x] for x in [0, stride-8), turning
 * accumulated 8-wide row sums into 8x8 sums in place.
 * Arithmetic wraps modulo 2^16. */
static void integral_init8v( uint16_t *sum8, intptr_t stride )
{
    for( int x = 0; x < stride-8; x++ )
        sum8[x] = sum8[x+8*stride] - sum8[x];
}

Loren Merritt's avatar
Loren Merritt committed
436 437
/* Builds the low-resolution planes of `frame` from its luma plane and resets
 * the cached lookahead data stored on the frame.
 * Side effects: writes one extra column and one extra row into plane[0],
 * fills lowres[0..3], expands the lowres borders, and marks cost estimates,
 * row SATDs and lowres motion vectors as not yet computed. */
void x264_frame_init_lowres( x264_t *h, x264_frame_t *frame )
{
    pixel *src = frame->plane[0];
    int i_stride = frame->i_stride[0];
    int i_height = frame->i_lines[0];
    int i_width  = frame->i_width[0];

    // duplicate last row and column so that their interpolation doesn't have to be special-cased
    for( int y = 0; y < i_height; y++ )
        src[i_width+y*i_stride] = src[i_width-1+y*i_stride];
    memcpy( src+i_stride*i_height, src+i_stride*(i_height-1), (i_width+1) * sizeof(pixel) );
    h->mc.frame_init_lowres_core( src, frame->lowres[0], frame->lowres[1], frame->lowres[2], frame->lowres[3],
                                  i_stride, frame->i_stride_lowres, frame->i_width_lowres, frame->i_lines_lowres );
    x264_frame_expand_border_lowres( frame );

    /* memset with -1 sets every byte to 0xFF, i.e. -1 for integer elements */
    memset( frame->i_cost_est, -1, sizeof(frame->i_cost_est) );

    for( int y = 0; y < h->param.i_bframe + 2; y++ )
        for( int x = 0; x < h->param.i_bframe + 2; x++ )
            frame->i_row_satds[y][x][0] = -1;

    /* 0x7FFF in the first mv component flags the mv set as unset */
    for( int y = 0; y <= !!h->param.i_bframe; y++ )
        for( int x = 0; x <= h->param.i_bframe; x++ )
            frame->lowres_mvs[y][x][0][0] = 0x7FFF;
}

462
/* Downsamples src0 by two in each direction into four planes of
 * width x height pixels:
 *   dst0 - aligned with the source 2x2 block
 *   dsth - shifted one source pixel to the right
 *   dstv - shifted one source row down
 *   dstc - shifted by one in both directions
 * Every output is a 2x2 average built from nested rounded averages.
 * Reads up to column 2*width and row 2*height of the source, i.e. one column
 * and one row beyond the 2*width x 2*height area. */
static void frame_init_lowres_core( pixel *src0, pixel *dst0, pixel *dsth, pixel *dstv, pixel *dstc,
                                    intptr_t src_stride, intptr_t dst_stride, int width, int height )
{
    for( int y = 0; y < height; y++ )
    {
        pixel *src1 = src0+src_stride;
        pixel *src2 = src1+src_stride;
        for( int x = 0; x<width; x++ )
        {
            // slower than naive bilinear, but matches asm
#define FILTER(a,b,c,d) ((((a+b+1)>>1)+((c+d+1)>>1)+1)>>1)
            dst0[x] = FILTER(src0[2*x  ], src1[2*x  ], src0[2*x+1], src1[2*x+1]);
            dsth[x] = FILTER(src0[2*x+1], src1[2*x+1], src0[2*x+2], src1[2*x+2]);
            dstv[x] = FILTER(src1[2*x  ], src2[2*x  ], src1[2*x+1], src2[2*x+1]);
            dstc[x] = FILTER(src1[2*x+1], src2[2*x+1], src1[2*x+2], src2[2*x+2]);
#undef FILTER
        }
        src0 += src_stride*2;
        dst0 += dst_stride;
        dsth += dst_stride;
        dstv += dst_stride;
        dstc += dst_stride;
    }
}

487 488
/* Estimate the total amount of influence on future quality that could be had if we
 * were to improve the reference samples used to inter predict any given macroblock. */
489
static void mbtree_propagate_cost( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs,
490
                                   uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len )
491
{
492
    float fps = *fps_factor;
493
    for( int i = 0; i < len; i++ )
494
    {
495 496 497 498 499 500 501
        int intra_cost = intra_costs[i];
        int inter_cost = X264_MIN(intra_costs[i], inter_costs[i] & LOWRES_COST_MASK);
        float propagate_intra  = intra_cost * inv_qscales[i];
        float propagate_amount = propagate_in[i] + propagate_intra*fps;
        float propagate_num    = intra_cost - inter_cost;
        float propagate_denom  = intra_cost;
        dst[i] = X264_MIN((int)(propagate_amount * propagate_num / propagate_denom + 0.5f), 32767);
502 503 504
    }
}

505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579
/* Distributes the propagate amounts of one macroblock row (mb_y, len entries)
 * onto the reference frame's cost map ref_costs for reference list `list`.
 * Each macroblock's amount is split bilinearly over the up to four
 * macroblocks its motion vector points at. Accumulated costs saturate at
 * 32767. Entries whose lowres cost does not flag `list` as used are skipped. */
static void mbtree_propagate_list( x264_t *h, uint16_t *ref_costs, int16_t (*mvs)[2],
                                   int16_t *propagate_amount, uint16_t *lowres_costs,
                                   int bipred_weight, int mb_y, int len, int list )
{
    unsigned stride = h->mb.i_mb_stride;
    unsigned width = h->mb.i_mb_width;
    unsigned height = h->mb.i_mb_height;

    for( unsigned i = 0; i < len; i++ )
    {
/* saturating add into a cost entry */
#define CLIP_ADD(s,x) (s) = X264_MIN((s)+(x),(1<<15)-1)
        /* the bits above LOWRES_COST_SHIFT hold a bitmask of the lists used */
        int lists_used = lowres_costs[i]>>LOWRES_COST_SHIFT;

        if( !(lists_used & (1 << list)) )
            continue;

        int listamount = propagate_amount[i];
        /* Apply bipred weighting. */
        if( lists_used == 3 )
            listamount = (listamount * bipred_weight + 32) >> 6;

        /* Early termination for simple case of mv0. */
        if( !M32( mvs[i] ) )
        {
            CLIP_ADD( ref_costs[mb_y*stride + i], listamount );
            continue;
        }

        int x = mvs[i][0];
        int y = mvs[i][1];
        /* >>5 gives the whole-macroblock displacement, &31 the remainder
         * used as the bilinear weight (weights sum to 32*32 = 1024) */
        unsigned mbx = (x>>5)+i;
        unsigned mby = (y>>5)+mb_y;
        unsigned idx0 = mbx + mby * stride;
        unsigned idx2 = idx0 + stride;
        x &= 31;
        y &= 31;
        int idx0weight = (32-y)*(32-x);
        int idx1weight = (32-y)*x;
        int idx2weight = y*(32-x);
        int idx3weight = y*x;
        idx0weight = (idx0weight * listamount + 512) >> 10;
        idx1weight = (idx1weight * listamount + 512) >> 10;
        idx2weight = (idx2weight * listamount + 512) >> 10;
        idx3weight = (idx3weight * listamount + 512) >> 10;

        /* fast path: all four target macroblocks are inside the frame */
        if( mbx < width-1 && mby < height-1 )
        {
            CLIP_ADD( ref_costs[idx0+0], idx0weight );
            CLIP_ADD( ref_costs[idx0+1], idx1weight );
            CLIP_ADD( ref_costs[idx2+0], idx2weight );
            CLIP_ADD( ref_costs[idx2+1], idx3weight );
        }
        else
        {
            /* Note: this takes advantage of unsigned representation to
             * catch negative mbx/mby. */
            if( mby < height )
            {
                if( mbx < width )
                    CLIP_ADD( ref_costs[idx0+0], idx0weight );
                if( mbx+1 < width )
                    CLIP_ADD( ref_costs[idx0+1], idx1weight );
            }
            if( mby+1 < height )
            {
                if( mbx < width )
                    CLIP_ADD( ref_costs[idx2+0], idx2weight );
                if( mbx+1 < width )
                    CLIP_ADD( ref_costs[idx2+1], idx3weight );
            }
        }
    }
#undef CLIP_ADD
}

580
/* Fills the motion-compensation function table *pf.
 * All entries are first set to the portable C implementations from this
 * file. The architecture-specific init routines enabled at build time may
 * then replace entries according to the `cpu` capability flags.
 * If cpu_independent is non-zero, the mbtree propagate functions are reset
 * to the C versions afterwards.
 * NOTE(review): presumably so results do not depend on the host CPU - confirm. */
void x264_mc_init( int cpu, x264_mc_functions_t *pf, int cpu_independent )
{
    pf->mc_luma   = mc_luma;
    pf->get_ref   = get_ref;

    pf->mc_chroma = mc_chroma;

    pf->avg[PIXEL_16x16]= pixel_avg_16x16;
    pf->avg[PIXEL_16x8] = pixel_avg_16x8;
    pf->avg[PIXEL_8x16] = pixel_avg_8x16;
    pf->avg[PIXEL_8x8]  = pixel_avg_8x8;
    pf->avg[PIXEL_8x4]  = pixel_avg_8x4;
    pf->avg[PIXEL_4x16] = pixel_avg_4x16;
    pf->avg[PIXEL_4x8]  = pixel_avg_4x8;
    pf->avg[PIXEL_4x4]  = pixel_avg_4x4;
    pf->avg[PIXEL_4x2]  = pixel_avg_4x2;
    pf->avg[PIXEL_2x8]  = pixel_avg_2x8;
    pf->avg[PIXEL_2x4]  = pixel_avg_2x4;
    pf->avg[PIXEL_2x2]  = pixel_avg_2x2;

    /* the C code uses one table for plain weighting, offset-add and offset-sub */
    pf->weight    = x264_mc_weight_wtab;
    pf->offsetadd = x264_mc_weight_wtab;
    pf->offsetsub = x264_mc_weight_wtab;
    pf->weight_cache = x264_weight_cache;

    pf->copy_16x16_unaligned = mc_copy_w16;
    pf->copy[PIXEL_16x16] = mc_copy_w16;
    pf->copy[PIXEL_8x8]   = mc_copy_w8;
    pf->copy[PIXEL_4x4]   = mc_copy_w4;

    pf->store_interleave_chroma       = store_interleave_chroma;
    pf->load_deinterleave_chroma_fenc = load_deinterleave_chroma_fenc;
    pf->load_deinterleave_chroma_fdec = load_deinterleave_chroma_fdec;

    pf->plane_copy = x264_plane_copy_c;
    pf->plane_copy_interleave = x264_plane_copy_interleave_c;
    pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_c;
    pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_c;
    pf->plane_copy_deinterleave_v210 = x264_plane_copy_deinterleave_v210_c;

    pf->hpel_filter = hpel_filter;

    pf->prefetch_fenc_420 = prefetch_fenc_null;
    pf->prefetch_fenc_422 = prefetch_fenc_null;
    pf->prefetch_ref  = prefetch_ref_null;
    pf->memcpy_aligned = memcpy;
    pf->memzero_aligned = memzero_aligned;
    pf->frame_init_lowres_core = frame_init_lowres_core;

    pf->integral_init4h = integral_init4h;
    pf->integral_init8h = integral_init8h;
    pf->integral_init4v = integral_init4v;
    pf->integral_init8v = integral_init8v;

    pf->mbtree_propagate_cost = mbtree_propagate_cost;
    pf->mbtree_propagate_list = mbtree_propagate_list;

    /* platform overrides */
#if HAVE_MMX
    x264_mc_init_mmx( cpu, pf );
#endif
#if HAVE_ALTIVEC
    if( cpu&X264_CPU_ALTIVEC )
        x264_mc_altivec_init( pf );
#endif
#if HAVE_ARMV6
    x264_mc_init_arm( cpu, pf );
#endif
#if ARCH_AARCH64
    x264_mc_init_aarch64( cpu, pf );
#endif

    if( cpu_independent )
    {
        pf->mbtree_propagate_cost = mbtree_propagate_cost;
        pf->mbtree_propagate_list = mbtree_propagate_list;
    }
}

658
/* Generates the half-pel planes, and the integral image if allocated, for
 * the rows of `frame` that became available with macroblock row mb_y.
 * b_end is non-zero for the final call of a frame, which extends the range
 * to the bottom of the picture. With interlacing, odd macroblock rows return
 * immediately and the per-field planes are filtered as well. */
void x264_frame_filter( x264_t *h, x264_frame_t *frame, int mb_y, int b_end )
{
    const int b_interlaced = PARAM_INTERLACED;
    int start = mb_y*16 - 8; // buffer = 4 for deblock + 3 for 6tap, rounded to 8
    int height = (b_end ? frame->i_lines[0] + 16*PARAM_INTERLACED : (mb_y+b_interlaced)*16) + 8;

    if( mb_y & b_interlaced )
        return;

    for( int p = 0; p < (CHROMA444 ? 3 : 1); p++ )
    {
        int stride = frame->i_stride[p];
        const int width = frame->i_width[p];
        int offs = start*stride - 8; // buffer = 3 for 6tap, aligned to 8 for simd

        if( !b_interlaced || h->mb.b_adaptive_mbaff )
            h->mc.hpel_filter(
                frame->filtered[p][1] + offs,
                frame->filtered[p][2] + offs,
                frame->filtered[p][3] + offs,
                frame->plane[p] + offs,
                stride, width + 16, height - start,
                h->scratch_buffer );

        if( b_interlaced )
        {
            /* MC must happen between pixels in the same field. */
            stride = frame->i_stride[p] << 1;
            /* NOTE(review): this overwrites the function-level `start`, so
             * later planes (4:4:4) and the integral pass below see the
             * field-based value - confirm this is intended. */
            start = (mb_y*16 >> 1) - 8;
            int height_fld = ((b_end ? frame->i_lines[p] : mb_y*16) >> 1) + 8;
            offs = start*stride - 8;
            /* one pass per field; the second field starts one frame row lower */
            for( int i = 0; i < 2; i++, offs += frame->i_stride[p] )
            {
                h->mc.hpel_filter(
                    frame->filtered_fld[p][1] + offs,
                    frame->filtered_fld[p][2] + offs,
                    frame->filtered_fld[p][3] + offs,
                    frame->plane_fld[p] + offs,
                    stride, width + 16, height_fld - start,
                    h->scratch_buffer );
            }
        }
    }

    /* generate integral image:
     * frame->integral contains 2 planes. in the upper plane, each element is
     * the sum of an 8x8 pixel region with top-left corner on that point.
     * in the lower plane, 4x4 sums (needed only with --partitions p4x4). */

    if( frame->integral )
    {
        int stride = frame->i_stride[0];
        if( start < 0 )
        {
            /* first call: zero the row above the padded area so the
             * horizontal pass has a valid "row above" to accumulate onto */
            memset( frame->integral - PADV * stride - PADH, 0, stride * sizeof(uint16_t) );
            start = -PADV;
        }
        if( b_end )
            height += PADV-9;
        for( int y = start; y < height; y++ )
        {
            pixel    *pix  = frame->plane[0] + y * stride - PADH;
            uint16_t *sum8 = frame->integral + (y+1) * stride - PADH;
            uint16_t *sum4;
            if( h->frames.b_have_sub8x8_esa )
            {
                h->mc.integral_init4h( sum8, pix, stride );
                sum8 -= 8*stride;
                sum4 = sum8 + stride * (frame->i_lines[0] + PADV*2);
                /* the vertical pass needs 8 rows of horizontal sums first */
                if( y >= 8-PADV )
                    h->mc.integral_init4v( sum8, sum4, stride );
            }
            else
            {
                h->mc.integral_init8h( sum8, pix, stride );
                if( y >= 8-PADV )
                    h->mc.integral_init8v( sum8-8*stride, stride );
            }
        }
    }
}