/*
 * Copyright © 2018, VideoLAN and dav1d authors
 * Copyright © 2018, Two Orioles, LLC
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "config.h"

#include <assert.h>
#include <stdlib.h>
#include <string.h>

#include "common/attributes.h"
#include "common/intops.h"

#include "src/ipred.h"
#include "src/tables.h"

40
static NOINLINE void
41
splat_dc(pixel *dst, const ptrdiff_t stride,
42
         const int width, const int height, const int dc HIGHBD_DECL_SUFFIX)
43 44
{
#if BITDEPTH == 8
45
    assert(dc <= 0xff);
46
    if (width > 4) {
47
        const uint64_t dcN = dc * 0x0101010101010101ULL;
48 49
        for (int y = 0; y < height; y++) {
            for (int x = 0; x < width; x += sizeof(dcN))
50 51 52 53 54
                *((uint64_t *) &dst[x]) = dcN;
            dst += PXSTRIDE(stride);
        }
    } else {
        const unsigned dcN = dc * 0x01010101U;
55 56
        for (int y = 0; y < height; y++) {
            for (int x = 0; x < width; x += sizeof(dcN))
57 58 59 60 61
                *((unsigned *) &dst[x]) = dcN;
            dst += PXSTRIDE(stride);
        }
    }
#else
62
    assert(dc <= bitdepth_max);
63
    const uint64_t dcN = dc * 0x0001000100010001ULL;
64 65
    for (int y = 0; y < height; y++) {
        for (int x = 0; x < width; x += sizeof(dcN) >> 1)
66 67 68 69 70 71
            *((uint64_t *) &dst[x]) = dcN;
        dst += PXSTRIDE(stride);
    }
#endif
}

72 73
static NOINLINE void
cfl_pred(pixel *dst, const ptrdiff_t stride,
74 75
         const int width, const int height, const int dc,
         const int16_t *ac, const int alpha HIGHBD_DECL_SUFFIX)
76 77 78 79 80 81 82 83 84 85 86
{
    for (int y = 0; y < height; y++) {
        for (int x = 0; x < width; x++) {
            const int diff = alpha * ac[x];
            dst[x] = iclip_pixel(dc + apply_sign((abs(diff) + 32) >> 6, diff));
        }
        ac += width;
        dst += PXSTRIDE(stride);
    }
}

87
static unsigned dc_gen_top(const pixel *const topleft, const int width) {
88 89 90
    unsigned dc = width >> 1;
    for (int i = 0; i < width; i++)
       dc += topleft[1 + i];
91 92
    return dc >> ctz(width);
}
93

94 95
static void ipred_dc_top_c(pixel *dst, const ptrdiff_t stride,
                           const pixel *const topleft,
96
                           const int width, const int height, const int a,
97 98
                           const int max_width, const int max_height
                           HIGHBD_DECL_SUFFIX)
99
{
100 101
    splat_dc(dst, stride, width, height, dc_gen_top(topleft, width)
             HIGHBD_TAIL_SUFFIX);
102 103
}

104
static void ipred_cfl_top_c(pixel *dst, const ptrdiff_t stride,
105
                            const pixel *const topleft,
106
                            const int width, const int height,
107 108
                            const int16_t *ac, const int alpha
                            HIGHBD_DECL_SUFFIX)
109
{
110 111
    cfl_pred(dst, stride, width, height, dc_gen_top(topleft, width), ac, alpha
             HIGHBD_TAIL_SUFFIX);
112 113
}

114
static unsigned dc_gen_left(const pixel *const topleft, const int height) {
115 116 117
    unsigned dc = height >> 1;
    for (int i = 0; i < height; i++)
       dc += topleft[-(1 + i)];
118 119
    return dc >> ctz(height);
}
120

121 122
static void ipred_dc_left_c(pixel *dst, const ptrdiff_t stride,
                            const pixel *const topleft,
123
                            const int width, const int height, const int a,
124 125
                            const int max_width, const int max_height
                            HIGHBD_DECL_SUFFIX)
126
{
127 128
    splat_dc(dst, stride, width, height, dc_gen_left(topleft, height)
             HIGHBD_TAIL_SUFFIX);
129 130 131 132 133
}

/*
 * CfL prediction whose DC term comes from the left edge only:
 * cfl_pred() is called with dc = dc_gen_left(topleft, height).
 */
static void ipred_cfl_left_c(pixel *dst, const ptrdiff_t stride,
                             const pixel *const topleft,
                             const int width, const int height,
                             const int16_t *ac, const int alpha
                             HIGHBD_DECL_SUFFIX)
{
    unsigned dc = dc_gen_left(topleft, height);
    cfl_pred(dst, stride, width, height, dc, ac, alpha HIGHBD_TAIL_SUFFIX);
}

141 142 143 144 145 146 147 148 149 150
#if BITDEPTH == 8
#define MULTIPLIER_1x2 0x5556
#define MULTIPLIER_1x4 0x3334
#define BASE_SHIFT 16
#else
#define MULTIPLIER_1x2 0xAAAB
#define MULTIPLIER_1x4 0x6667
#define BASE_SHIFT 17
#endif

151 152
static unsigned dc_gen(const pixel *const topleft,
                       const int width, const int height)
153 154 155 156 157 158 159 160 161 162 163 164 165
{
    unsigned dc = (width + height) >> 1;
    for (int i = 0; i < width; i++)
       dc += topleft[i + 1];
    for (int i = 0; i < height; i++)
       dc += topleft[-(i + 1)];
    dc >>= ctz(width + height);

    if (width != height) {
        dc *= (width > height * 2 || height > width * 2) ? MULTIPLIER_1x4 :
                                                           MULTIPLIER_1x2;
        dc >>= BASE_SHIFT;
    }
166 167 168 169 170
    return dc;
}

/*
 * DC prediction from both edges: fills the block with the value returned by
 * dc_gen(topleft, width, height).
 *
 * a, max_width and max_height are not used here; they are part of the
 * signature shared by all entries of the intra_pred table.
 */
static void ipred_dc_c(pixel *dst, const ptrdiff_t stride,
                       const pixel *const topleft,
                       const int width, const int height, const int a,
                       const int max_width, const int max_height
                       HIGHBD_DECL_SUFFIX)
{
    splat_dc(dst, stride, width, height, dc_gen(topleft, width, height)
             HIGHBD_TAIL_SUFFIX);
}

179 180 181
static void ipred_cfl_c(pixel *dst, const ptrdiff_t stride,
                        const pixel *const topleft,
                        const int width, const int height,
182 183
                        const int16_t *ac, const int alpha
                        HIGHBD_DECL_SUFFIX)
184 185
{
    unsigned dc = dc_gen(topleft, width, height);
186
    cfl_pred(dst, stride, width, height, dc, ac, alpha HIGHBD_TAIL_SUFFIX);
187
}
188

189 190 191 192
#undef MULTIPLIER_1x2
#undef MULTIPLIER_1x4
#undef BASE_SHIFT

193 194
static void ipred_dc_128_c(pixel *dst, const ptrdiff_t stride,
                           const pixel *const topleft,
195
                           const int width, const int height, const int a,
196 197
                           const int max_width, const int max_height
                           HIGHBD_DECL_SUFFIX)
198
{
199 200 201 202 203 204
#if BITDEPTH == 16
    const int dc = (bitdepth_max + 1) >> 1;
#else
    const int dc = 128;
#endif
    splat_dc(dst, stride, width, height, dc HIGHBD_TAIL_SUFFIX);
205
}
206

207 208 209
static void ipred_cfl_128_c(pixel *dst, const ptrdiff_t stride,
                            const pixel *const topleft,
                            const int width, const int height,
210 211
                            const int16_t *ac, const int alpha
                            HIGHBD_DECL_SUFFIX)
212
{
213 214 215 216 217 218
#if BITDEPTH == 16
    const int dc = (bitdepth_max + 1) >> 1;
#else
    const int dc = 128;
#endif
    cfl_pred(dst, stride, width, height, dc, ac, alpha HIGHBD_TAIL_SUFFIX);
219 220
}

221 222
static void ipred_v_c(pixel *dst, const ptrdiff_t stride,
                      const pixel *const topleft,
223
                      const int width, const int height, const int a,
224 225
                      const int max_width, const int max_height
                      HIGHBD_DECL_SUFFIX)
226 227 228 229 230 231 232
{
    for (int y = 0; y < height; y++) {
        pixel_copy(dst, topleft + 1, width);
        dst += PXSTRIDE(stride);
    }
}

233 234
static void ipred_h_c(pixel *dst, const ptrdiff_t stride,
                      const pixel *const topleft,
235
                      const int width, const int height, const int a,
236 237
                      const int max_width, const int max_height
                      HIGHBD_DECL_SUFFIX)
238 239 240 241 242 243 244
{
    for (int y = 0; y < height; y++) {
        pixel_set(dst, topleft[-(1 + y)], width);
        dst += PXSTRIDE(stride);
    }
}

245 246
static void ipred_paeth_c(pixel *dst, const ptrdiff_t stride,
                          const pixel *const tl_ptr,
247
                          const int width, const int height, const int a,
248 249
                          const int max_width, const int max_height
                          HIGHBD_DECL_SUFFIX)
250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267
{
    const int topleft = tl_ptr[0];
    for (int y = 0; y < height; y++) {
        const int left = tl_ptr[-(y + 1)];
        for (int x = 0; x < width; x++) {
            const int top = tl_ptr[1 + x];
            const int base = left + top - topleft;
            const int ldiff = abs(left - base);
            const int tdiff = abs(top - base);
            const int tldiff = abs(topleft - base);

            dst[x] = ldiff <= tdiff && ldiff <= tldiff ? left :
                     tdiff <= tldiff ? top : topleft;
        }
        dst += PXSTRIDE(stride);
    }
}

268 269
static void ipred_smooth_c(pixel *dst, const ptrdiff_t stride,
                           const pixel *const topleft,
270
                           const int width, const int height, const int a,
271 272
                           const int max_width, const int max_height
                           HIGHBD_DECL_SUFFIX)
273
{
274 275
    const uint8_t *const weights_hor = &dav1d_sm_weights[width];
    const uint8_t *const weights_ver = &dav1d_sm_weights[height];
276 277 278 279 280 281 282 283 284 285 286 287 288 289
    const int right = topleft[width], bottom = topleft[-height];

    for (int y = 0; y < height; y++) {
        for (int x = 0; x < width; x++) {
            const int pred = weights_ver[y]  * topleft[1 + x] +
                      (256 - weights_ver[y]) * bottom +
                             weights_hor[x]  * topleft[-(1 + y)] +
                      (256 - weights_hor[x]) * right;
            dst[x] = (pred + 256) >> 9;
        }
        dst += PXSTRIDE(stride);
    }
}

290 291
static void ipred_smooth_v_c(pixel *dst, const ptrdiff_t stride,
                             const pixel *const topleft,
292
                             const int width, const int height, const int a,
293 294
                             const int max_width, const int max_height
                             HIGHBD_DECL_SUFFIX)
295
{
296
    const uint8_t *const weights_ver = &dav1d_sm_weights[height];
297 298 299 300 301 302 303 304 305 306 307 308
    const int bottom = topleft[-height];

    for (int y = 0; y < height; y++) {
        for (int x = 0; x < width; x++) {
            const int pred = weights_ver[y]  * topleft[1 + x] +
                      (256 - weights_ver[y]) * bottom;
            dst[x] = (pred + 128) >> 8;
        }
        dst += PXSTRIDE(stride);
    }
}

309 310
static void ipred_smooth_h_c(pixel *dst, const ptrdiff_t stride,
                             const pixel *const topleft,
311
                             const int width, const int height, const int a,
312 313
                             const int max_width, const int max_height
                             HIGHBD_DECL_SUFFIX)
314
{
315
    const uint8_t *const weights_hor = &dav1d_sm_weights[width];
316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367
    const int right = topleft[width];

    for (int y = 0; y < height; y++) {
        for (int x = 0; x < width; x++) {
            const int pred = weights_hor[x]  * topleft[-(y + 1)] +
                      (256 - weights_hor[x]) * right;
            dst[x] = (pred + 128) >> 8;
        }
        dst += PXSTRIDE(stride);
    }
}

/*
 * Pick the edge filter strength (0 = no filtering, otherwise 1..3) from the
 * block size measure blk_wh, the angle delta d and the filter type.
 *
 * type == 0 and type != 0 use separate threshold tables; within each table
 * the size class is chosen first, then the largest threshold that d reaches
 * decides the strength.
 */
static int get_filter_strength(const unsigned blk_wh, const unsigned d,
                               const int type)
{
    if (type == 0) {
        if (blk_wh <= 8)
            return d >= 56;
        // the "<= 12" and "<= 16" size classes share the same threshold
        if (blk_wh <= 16)
            return d >= 40;
        if (blk_wh <= 24)
            return d >= 32 ? 3 : d >= 16 ? 2 : d >= 8 ? 1 : 0;
        if (blk_wh <= 32)
            return d >= 32 ? 3 : d >= 4 ? 2 : d >= 1 ? 1 : 0;
        return d >= 1 ? 3 : 0;
    }

    if (blk_wh <= 8)
        return d >= 64 ? 2 : d >= 40 ? 1 : 0;
    if (blk_wh <= 16)
        return d >= 48 ? 2 : d >= 20 ? 1 : 0;
    if (blk_wh <= 24)
        return d >= 4 ? 3 : 0;
    return d >= 1 ? 3 : 0;
}

368 369 370
static void filter_edge(pixel *const out, const int sz,
                        const int lim_from, const int lim_to,
                        const pixel *const in,
371
                        const int from, const int to, const unsigned strength)
372
{
373
    static const uint8_t kernel[3][5] = {
374 375 376 377 378 379
        { 0, 4, 8, 4, 0 },
        { 0, 5, 6, 5, 0 },
        { 2, 4, 4, 4, 2 }
    };

    assert(strength > 0);
380
    int i = 0;
381
    for (; i < imin(sz, lim_from); i++)
382 383
        out[i] = in[iclip(i, from, to - 1)];
    for (; i < imin(lim_to, sz); i++) {
384 385 386 387 388
        int s = 0;
        for (int j = 0; j < 5; j++)
            s += in[iclip(i - 2 + j, from, to - 1)] * kernel[strength - 1][j];
        out[i] = (s + 8) >> 4;
    }
389 390
    for (; i < sz; i++)
        out[i] = in[iclip(i, from, to - 1)];
391 392 393 394 395 396 397 398
}

/*
 * Decide whether an edge should be upsampled: returns 1 when the angle delta
 * d is below 40 and blk_wh does not exceed the size limit for the given type
 * (8 when type is non-zero, 16 otherwise), and 0 in every other case.
 */
static int get_upsample(const int blk_wh, const unsigned d, const int type) {
    const int max_wh = type ? 8 : 16;
    return d < 40 && blk_wh <= max_wh;
}

/*
 * Upsample an edge by a factor of two, writing 2 * hsz - 1 samples to out.
 *
 * Even outputs out[2 * i] are copies of in[i]. Odd outputs out[2 * i + 1]
 * are interpolated from in[i - 1 .. i + 2] with the 4-tap kernel
 * { -1, 9, 9, -1 } / 16 and passed through iclip_pixel(), since the negative
 * taps can overshoot. Every read from in uses an index clamped to
 * [from, to - 1].
 */
static void upsample_edge(pixel *const out, const int hsz,
                          const pixel *const in, const int from, const int to
                          HIGHBD_DECL_SUFFIX)
{
    static const int8_t kernel[4] = { -1, 9, 9, -1 };
    int i;
    for (i = 0; i < hsz - 1; i++) {
        out[i * 2] = in[iclip(i, from, to - 1)];

        int s = 0;
        for (int j = 0; j < 4; j++)
            s += in[iclip(i + j - 1, from, to - 1)] * kernel[j];
        out[i * 2 + 1] = iclip_pixel((s + 8) >> 4);
    }
    // last sample has no right neighbour to interpolate towards
    out[i * 2] = in[iclip(i, from, to - 1)];
}

415 416
static void ipred_z1_c(pixel *dst, const ptrdiff_t stride,
                       const pixel *const topleft_in,
417
                       const int width, const int height, int angle,
418 419
                       const int max_width, const int max_height
                       HIGHBD_DECL_SUFFIX)
420
{
421 422
    const int is_sm = (angle >> 9) & 0x1;
    const int enable_intra_edge_filter = angle >> 10;
423 424
    angle &= 511;
    assert(angle < 90);
425
    int dx = dav1d_dr_intra_derivative[angle >> 1];
426
    pixel top_out[(64 + 64) * 2];
427 428
    const pixel *top;
    int max_base_x;
429 430
    const int upsample_above = enable_intra_edge_filter ?
        get_upsample(width + height, 90 - angle, is_sm) : 0;
431
    if (upsample_above) {
432 433
        upsample_edge(top_out, width + height, &topleft_in[1], -1,
                      width + imin(width, height) HIGHBD_TAIL_SUFFIX);
434 435
        top = top_out;
        max_base_x = 2 * (width + height) - 2;
436
        dx <<= 1;
437
    } else {
438 439
        const int filter_strength = enable_intra_edge_filter ?
            get_filter_strength(width + height, 90 - angle, is_sm) : 0;
440
        if (filter_strength) {
441
            filter_edge(top_out, width + height, 0, width + height,
442 443 444 445 446 447 448 449 450
                        &topleft_in[1], -1, width + imin(width, height),
                        filter_strength);
            top = top_out;
            max_base_x = width + height - 1;
        } else {
            top = &topleft_in[1];
            max_base_x = width + imin(width, height) - 1;
        }
    }
451
    const int base_inc = 1 + upsample_above;
452 453 454
    for (int y = 0, xpos = dx; y < height;
         y++, dst += PXSTRIDE(stride), xpos += dx)
    {
455
        const int frac = (xpos >> 1) & 0x1F;
456

457
        for (int x = 0, base = xpos >> 6; x < width; x++, base += base_inc) {
458 459 460 461 462 463 464 465 466 467 468
            if (base < max_base_x) {
                const int v = top[base] * (32 - frac) + top[base + 1] * frac;
                dst[x] = iclip_pixel((v + 16) >> 5);
            } else {
                pixel_set(&dst[x], top[max_base_x], width - x);
                break;
            }
        }
    }
}

469 470
static void ipred_z2_c(pixel *dst, const ptrdiff_t stride,
                       const pixel *const topleft_in,
471
                       const int width, const int height, int angle,
472 473
                       const int max_width, const int max_height
                       HIGHBD_DECL_SUFFIX)
474
{
475 476
    const int is_sm = (angle >> 9) & 0x1;
    const int enable_intra_edge_filter = angle >> 10;
477 478
    angle &= 511;
    assert(angle > 90 && angle < 180);
479 480
    int dy = dav1d_dr_intra_derivative[(angle - 90) >> 1];
    int dx = dav1d_dr_intra_derivative[(180 - angle) >> 1];
481 482 483 484
    const int upsample_left = enable_intra_edge_filter ?
        get_upsample(width + height, 180 - angle, is_sm) : 0;
    const int upsample_above = enable_intra_edge_filter ?
        get_upsample(width + height, angle - 90, is_sm) : 0;
485
    pixel edge[64 * 2 + 64 * 2 + 1];
486 487 488
    pixel *const topleft = &edge[height * 2];

    if (upsample_above) {
489 490
        upsample_edge(topleft, width + 1, topleft_in, 0, width + 1
                      HIGHBD_TAIL_SUFFIX);
491
        dx <<= 1;
492
    } else {
493 494
        const int filter_strength = enable_intra_edge_filter ?
            get_filter_strength(width + height, angle - 90, is_sm) : 0;
495 496

        if (filter_strength) {
497 498
            filter_edge(&topleft[1], width, 0, max_width,
                        &topleft_in[1], -1, width,
499 500 501 502 503 504
                        filter_strength);
        } else {
            pixel_copy(&topleft[1], &topleft_in[1], width);
        }
    }
    if (upsample_left) {
505 506
        upsample_edge(edge, height + 1, &topleft_in[-height], 0, height + 1
                      HIGHBD_TAIL_SUFFIX);
507
        dy <<= 1;
508
    } else {
509 510
        const int filter_strength = enable_intra_edge_filter ?
            get_filter_strength(width + height, 180 - angle, is_sm) : 0;
511 512

        if (filter_strength) {
513 514
            filter_edge(&topleft[-height], height, height - max_height, height,
                        &topleft_in[-height],
515 516 517 518 519 520 521
                        0, height + 1, filter_strength);
        } else {
            pixel_copy(&topleft[-height], &topleft_in[-height], height);
        }
    }
    *topleft = *topleft_in;

522 523 524 525
    const int min_base_x = -(1 + upsample_above);
    const int base_inc_x = 1 + upsample_above;
    const pixel *const left = &topleft[-(1 + upsample_left)];
    const pixel *const top = &topleft[1 + upsample_above];
526 527 528
    for (int y = 0, xpos = -dx; y < height;
         y++, xpos -= dx, dst += PXSTRIDE(stride))
    {
529 530
        int base_x = xpos >> 6;
        const int frac_x = (xpos >> 1) & 0x1F;
531

532
        for (int x = 0, ypos = (y << (6 + upsample_left)) - dy; x < width;
533 534 535 536 537 538 539
             x++, base_x += base_inc_x, ypos -= dy)
        {
            int v;

            if (base_x >= min_base_x) {
                v = top[base_x] * (32 - frac_x) + top[base_x + 1] * frac_x;
            } else {
540 541 542
                const int base_y = ypos >> 6;
                assert(base_y >= -(1 + upsample_left));
                const int frac_y = (ypos >> 1) & 0x1F;
543 544 545 546 547 548 549
                v = left[-base_y] * (32 - frac_y) + left[-(base_y + 1)] * frac_y;
            }
            dst[x] = iclip_pixel((v + 16) >> 5);
        }
    }
}

550 551
static void ipred_z3_c(pixel *dst, const ptrdiff_t stride,
                       const pixel *const topleft_in,
552
                       const int width, const int height, int angle,
553 554
                       const int max_width, const int max_height
                       HIGHBD_DECL_SUFFIX)
555
{
556 557
    const int is_sm = (angle >> 9) & 0x1;
    const int enable_intra_edge_filter = angle >> 10;
558 559
    angle &= 511;
    assert(angle > 180);
560
    int dy = dav1d_dr_intra_derivative[(270 - angle) >> 1];
561
    pixel left_out[(64 + 64) * 2];
562 563
    const pixel *left;
    int max_base_y;
564 565
    const int upsample_left = enable_intra_edge_filter ?
        get_upsample(width + height, angle - 180, is_sm) : 0;
566 567 568
    if (upsample_left) {
        upsample_edge(left_out, width + height,
                      &topleft_in[-(width + height)],
569 570
                      imax(width - height, 0), width + height + 1
                      HIGHBD_TAIL_SUFFIX);
571 572
        left = &left_out[2 * (width + height) - 2];
        max_base_y = 2 * (width + height) - 2;
573
        dy <<= 1;
574
    } else {
575 576
        const int filter_strength = enable_intra_edge_filter ?
            get_filter_strength(width + height, angle - 180, is_sm) : 0;
577 578

        if (filter_strength) {
579
            filter_edge(left_out, width + height, 0, width + height,
580 581 582 583 584 585 586 587 588 589
                        &topleft_in[-(width + height)],
                        imax(width - height, 0), width + height + 1,
                        filter_strength);
            left = &left_out[width + height - 1];
            max_base_y = width + height - 1;
        } else {
            left = &topleft_in[-1];
            max_base_y = height + imin(width, height) - 1;
        }
    }
590
    const int base_inc = 1 + upsample_left;
591
    for (int x = 0, ypos = dy; x < width; x++, ypos += dy) {
592
        const int frac = (ypos >> 1) & 0x1F;
593

594
        for (int y = 0, base = ypos >> 6; y < height; y++, base += base_inc) {
595 596 597 598 599 600 601 602 603 604 605 606 607 608
            if (base < max_base_y) {
                const int v = left[-base] * (32 - frac) +
                              left[-(base + 1)] * frac;
                dst[y * PXSTRIDE(stride) + x] = iclip_pixel((v + 16) >> 5);
            } else {
                do {
                    dst[y * PXSTRIDE(stride) + x] = left[-max_base_y];
                } while (++y < height);
                break;
            }
        }
    }
}

609 610 611
/* Up to 32x32 only */
static void ipred_filter_c(pixel *dst, const ptrdiff_t stride,
                           const pixel *const topleft_in,
612
                           const int width, const int height, int filt_idx,
613 614
                           const int max_width, const int max_height
                           HIGHBD_DECL_SUFFIX)
615 616 617 618
{
    filt_idx &= 511;
    assert(filt_idx < 5);

619
    const int8_t *const filter = dav1d_filter_intra_taps[filt_idx];
620 621 622 623 624 625 626 627 628 629 630 631 632 633
    int x, y;
    ptrdiff_t left_stride;
    const pixel *left, *topleft, *top;

    top = &topleft_in[1];
    for (y = 0; y < height; y += 2) {
        topleft = &topleft_in[-y];
        left = &topleft[-1];
        left_stride = -1;
        for (x = 0; x < width; x += 4) {
            const int p0 = *topleft;
            const int p1 = top[0], p2 = top[1], p3 = top[2], p4 = top[3];
            const int p5 = left[0 * left_stride], p6 = left[1 * left_stride];
            pixel *ptr = &dst[x];
634
            const int8_t *flt_ptr = filter;
635 636

            for (int yy = 0; yy < 2; yy++) {
637 638 639 640 641
                for (int xx = 0; xx < 4; xx++, flt_ptr += 2) {
                    int acc = flt_ptr[ 0] * p0 + flt_ptr[ 1] * p1 +
                              flt_ptr[16] * p2 + flt_ptr[17] * p3 +
                              flt_ptr[32] * p4 + flt_ptr[33] * p5 +
                              flt_ptr[48] * p6;
642 643 644 645 646 647 648 649 650 651 652 653 654 655
                    ptr[xx] = iclip_pixel((acc + 8) >> 4);
                }
                ptr += PXSTRIDE(stride);
            }
            left = &dst[x + 4 - 1];
            left_stride = PXSTRIDE(stride);
            top += 4;
            topleft = &top[-1];
        }
        top = &dst[PXSTRIDE(stride)];
        dst = &dst[PXSTRIDE(stride) * 2];
    }
}

656
static NOINLINE void
657 658
cfl_ac_c(int16_t *ac, const pixel *ypx, const ptrdiff_t stride,
         const int w_pad, const int h_pad, const int width, const int height,
659
         const int ss_hor, const int ss_ver)
660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682
{
    int y, x;
    int16_t *const ac_orig = ac;

    assert(w_pad >= 0 && w_pad * 4 < width);
    assert(h_pad >= 0 && h_pad * 4 < height);

    for (y = 0; y < height - 4 * h_pad; y++) {
        for (x = 0; x < width - 4 * w_pad; x++) {
            int ac_sum = ypx[x << ss_hor];
            if (ss_hor) ac_sum += ypx[x * 2 + 1];
            if (ss_ver) {
                ac_sum += ypx[(x << ss_hor) + PXSTRIDE(stride)];
                if (ss_hor) ac_sum += ypx[x * 2 + 1 + PXSTRIDE(stride)];
            }
            ac[x] = ac_sum << (1 + !ss_ver + !ss_hor);
        }
        for (; x < width; x++)
            ac[x] = ac[x - 1];
        ac += width;
        ypx += PXSTRIDE(stride) << ss_ver;
    }
    for (; y < height; y++) {
683
        memcpy(ac, &ac[-width], width * sizeof(*ac));
684 685 686
        ac += width;
    }

687
    const int log2sz = ctz(width) + ctz(height);
688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703
    int sum = (1 << log2sz) >> 1;
    for (ac = ac_orig, y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            sum += ac[x];
        ac += width;
    }
    sum >>= log2sz;

    // subtract DC
    for (ac = ac_orig, y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            ac[x] -= sum;
        ac += width;
    }
}

704 705 706 707
#define cfl_ac_fn(fmt, ss_hor, ss_ver) \
static void cfl_ac_##fmt##_c(int16_t *const ac, const pixel *const ypx, \
                             const ptrdiff_t stride, const int w_pad, \
                             const int h_pad, const int cw, const int ch) \
708
{ \
709
    cfl_ac_c(ac, ypx, stride, w_pad, h_pad, cw, ch, ss_hor, ss_ver); \
710 711
}

712 713 714
cfl_ac_fn(420, 1, 1)
cfl_ac_fn(422, 1, 0)
cfl_ac_fn(444, 0, 0)
715 716 717 718 719 720 721

/*
 * Palette prediction: each output pixel is pal[idx[x]] cast to pixel.
 *
 * idx is read as a dense w x h array of palette indices (row stride == w);
 * dst advances by PXSTRIDE(stride) per row.
 */
static void pal_pred_c(pixel *dst, const ptrdiff_t stride,
                       const uint16_t *const pal, const uint8_t *idx,
                       const int w, const int h)
{
    for (int y = 0; y < h; y++) {
        for (int x = 0; x < w; x++)
            dst[x] = (pixel) pal[idx[x]];
        idx += w;
        dst += PXSTRIDE(stride);
    }
}

/*
 * Fill the intra prediction DSP context with the C implementations from this
 * file: the intra_pred table, the cfl_ac table (indexed by pixel layout minus
 * one), the cfl_pred table and pal_pred.
 *
 * When built with HAVE_ASM on x86, the x86 init function is called afterwards
 * on the same context (presumably to override entries with optimized
 * versions -- defined elsewhere).
 */
void bitfn(dav1d_intra_pred_dsp_init)(Dav1dIntraPredDSPContext *const c) {
    c->intra_pred[DC_PRED      ] = ipred_dc_c;
    c->intra_pred[DC_128_PRED  ] = ipred_dc_128_c;
    c->intra_pred[TOP_DC_PRED  ] = ipred_dc_top_c;
    c->intra_pred[LEFT_DC_PRED ] = ipred_dc_left_c;
    c->intra_pred[HOR_PRED     ] = ipred_h_c;
    c->intra_pred[VERT_PRED    ] = ipred_v_c;
    c->intra_pred[PAETH_PRED   ] = ipred_paeth_c;
    c->intra_pred[SMOOTH_PRED  ] = ipred_smooth_c;
    c->intra_pred[SMOOTH_V_PRED] = ipred_smooth_v_c;
    c->intra_pred[SMOOTH_H_PRED] = ipred_smooth_h_c;
    c->intra_pred[Z1_PRED      ] = ipred_z1_c;
    c->intra_pred[Z2_PRED      ] = ipred_z2_c;
    c->intra_pred[Z3_PRED      ] = ipred_z3_c;
    c->intra_pred[FILTER_PRED  ] = ipred_filter_c;

    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I420 - 1] = cfl_ac_420_c;
    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I422 - 1] = cfl_ac_422_c;
    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1] = cfl_ac_444_c;

    c->cfl_pred[DC_PRED     ] = ipred_cfl_c;
    c->cfl_pred[DC_128_PRED ] = ipred_cfl_128_c;
    c->cfl_pred[TOP_DC_PRED ] = ipred_cfl_top_c;
    c->cfl_pred[LEFT_DC_PRED] = ipred_cfl_left_c;

    c->pal_pred = pal_pred_c;

#if HAVE_ASM && ARCH_X86
    bitfn(dav1d_intra_pred_dsp_init_x86)(c);
#endif
}