ipred_tmpl.c 25.1 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33
/*
 * Copyright © 2018, VideoLAN and dav1d authors
 * Copyright © 2018, Two Orioles, LLC
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "config.h"

#include <assert.h>
#include <stdlib.h>
#include <string.h>

34
#include "common/attributes.h"
35 36 37
#include "common/intops.h"

#include "src/ipred.h"
38
#include "src/tables.h"
39

40
static NOINLINE void
41 42
splat_dc(pixel *dst, const ptrdiff_t stride,
         const int width, const int height, const unsigned dc)
43 44 45
{
    assert(dc <= (1 << BITDEPTH) - 1);
#if BITDEPTH == 8
46
    if (width > 4) {
47
        const uint64_t dcN = dc * 0x0101010101010101ULL;
48 49
        for (int y = 0; y < height; y++) {
            for (int x = 0; x < width; x += sizeof(dcN))
50 51 52 53 54
                *((uint64_t *) &dst[x]) = dcN;
            dst += PXSTRIDE(stride);
        }
    } else {
        const unsigned dcN = dc * 0x01010101U;
55 56
        for (int y = 0; y < height; y++) {
            for (int x = 0; x < width; x += sizeof(dcN))
57 58 59 60 61 62
                *((unsigned *) &dst[x]) = dcN;
            dst += PXSTRIDE(stride);
        }
    }
#else
    const uint64_t dcN = dc * 0x0001000100010001ULL;
63 64
    for (int y = 0; y < height; y++) {
        for (int x = 0; x < width; x += sizeof(dcN) >> 1)
65 66 67 68 69 70
            *((uint64_t *) &dst[x]) = dcN;
        dst += PXSTRIDE(stride);
    }
#endif
}

71 72 73
/*
 * Write dst[x] = clip(dc + round(alpha * ac[x] / 64)) for every pixel of a
 * width x height block.
 *
 * The magnitude of alpha * ac[x] is rounded to nearest ((|diff| + 32) >> 6)
 * and then passed through apply_sign() together with diff (presumably to
 * restore the sign of diff). ac is read as a packed array with a row pitch
 * of width entries.
 *
 * dc is converted to int before the addition so that a negative sum is
 * computed in signed arithmetic instead of wrapping in unsigned and relying
 * on an implementation-defined conversion back to int.
 */
static NOINLINE void
cfl_pred(pixel *dst, const ptrdiff_t stride,
         const int width, const int height, const unsigned dc,
         const int16_t *ac, const int alpha)
{
    for (int y = 0; y < height; y++) {
        for (int x = 0; x < width; x++) {
            const int diff = alpha * ac[x];
            dst[x] = iclip_pixel((int) dc +
                                 apply_sign((abs(diff) + 32) >> 6, diff));
        }
        ac += width;
        dst += PXSTRIDE(stride);
    }
}

86
/*
 * Rounded average of the width pixels topleft[1] .. topleft[width]
 * (the edge pixels stored after the corner pixel).
 *
 * The sum is seeded with width / 2 for rounding and divided by shifting
 * right by ctz(width), which equals the division by width only when width
 * is a power of two.
 */
static unsigned dc_gen_top(const pixel *const topleft, const int width) {
    unsigned dc = width >> 1;
    for (int i = 0; i < width; i++)
        dc += topleft[1 + i];
    return dc >> ctz(width);
}
92

93 94
/*
 * DC prediction from the top edge only: fill the block with the rounded
 * average of topleft[1 .. width]. a, max_width and max_height are unused
 * here; they are part of the signature shared by the intra_pred[] entries.
 */
static void ipred_dc_top_c(pixel *dst, const ptrdiff_t stride,
                           const pixel *const topleft,
                           const int width, const int height, const int a,
                           const int max_width, const int max_height)
{
    splat_dc(dst, stride, width, height, dc_gen_top(topleft, width));
}

101
/*
 * cfl_pred() with the DC term taken from the top edge only
 * (rounded average of topleft[1 .. width]).
 */
static void ipred_cfl_top_c(pixel *dst, const ptrdiff_t stride,
                            const pixel *const topleft,
                            const int width, const int height,
                            const int16_t *ac, const int alpha)
{
    cfl_pred(dst, stride, width, height, dc_gen_top(topleft, width), ac, alpha);
}

109
/*
 * Rounded average of the height pixels topleft[-1] .. topleft[-height]
 * (the edge pixels stored before the corner pixel).
 *
 * The sum is seeded with height / 2 for rounding and divided by shifting
 * right by ctz(height), which equals the division by height only when
 * height is a power of two.
 */
static unsigned dc_gen_left(const pixel *const topleft, const int height) {
    unsigned dc = height >> 1;
    for (int i = 0; i < height; i++)
        dc += topleft[-(1 + i)];
    return dc >> ctz(height);
}
115

116 117
/*
 * DC prediction from the left edge only: fill the block with the rounded
 * average of topleft[-1 .. -height]. a, max_width and max_height are
 * unused here; they are part of the signature shared by the intra_pred[]
 * entries.
 */
static void ipred_dc_left_c(pixel *dst, const ptrdiff_t stride,
                            const pixel *const topleft,
                            const int width, const int height, const int a,
                            const int max_width, const int max_height)
{
    splat_dc(dst, stride, width, height, dc_gen_left(topleft, height));
}

/*
 * cfl_pred() with the DC term taken from the left edge only
 * (rounded average of topleft[-1 .. -height]).
 */
static void ipred_cfl_left_c(pixel *dst, const ptrdiff_t stride,
                             const pixel *const topleft,
                             const int width, const int height,
                             const int16_t *ac, const int alpha)
{
    const unsigned dc = dc_gen_left(topleft, height);
    cfl_pred(dst, stride, width, height, dc, ac, alpha);
}
132

133 134 135 136 137 138 139 140 141 142
/*
 * Fixed-point reciprocals used by dc_gen() for non-square blocks.
 * dc_gen() first shifts out the power-of-two part of (width + height);
 * assuming power-of-two dimensions, what is left to divide by is 3 for a
 * 2:1 block and 5 for a 4:1 block:
 *   MULTIPLIER_1x2 / 2^BASE_SHIFT ~= 1/3
 *   MULTIPLIER_1x4 / 2^BASE_SHIFT ~= 1/5
 * The non-8-bit variant carries one more bit of precision (shift of 17).
 */
#if BITDEPTH == 8
#define MULTIPLIER_1x2 0x5556
#define MULTIPLIER_1x4 0x3334
#define BASE_SHIFT 16
#else
#define MULTIPLIER_1x2 0xAAAB
#define MULTIPLIER_1x4 0x6667
#define BASE_SHIFT 17
#endif

143 144
static unsigned dc_gen(const pixel *const topleft,
                       const int width, const int height)
145 146 147 148 149 150 151 152 153 154 155 156 157
{
    unsigned dc = (width + height) >> 1;
    for (int i = 0; i < width; i++)
       dc += topleft[i + 1];
    for (int i = 0; i < height; i++)
       dc += topleft[-(i + 1)];
    dc >>= ctz(width + height);

    if (width != height) {
        dc *= (width > height * 2 || height > width * 2) ? MULTIPLIER_1x4 :
                                                           MULTIPLIER_1x2;
        dc >>= BASE_SHIFT;
    }
158 159 160 161 162
    return dc;
}

/*
 * DC prediction: fill the block with the rounded average of the top and
 * left edge pixels. a, max_width and max_height are unused here; they are
 * part of the signature shared by the intra_pred[] entries.
 */
static void ipred_dc_c(pixel *dst, const ptrdiff_t stride,
                       const pixel *const topleft,
                       const int width, const int height, const int a,
                       const int max_width, const int max_height)
{
    splat_dc(dst, stride, width, height, dc_gen(topleft, width, height));
}
168

169 170 171
/*
 * cfl_pred() with the DC term taken from both edges
 * (rounded average of the top and left edge pixels).
 */
static void ipred_cfl_c(pixel *dst, const ptrdiff_t stride,
                        const pixel *const topleft,
                        const int width, const int height,
                        const int16_t *ac, const int alpha)
{
    const unsigned dc = dc_gen(topleft, width, height);
    cfl_pred(dst, stride, width, height, dc, ac, alpha);
}
177

178 179 180 181
/* The reciprocal constants are only used by dc_gen(); do not let them
 * leak into the rest of the translation unit. */
#undef MULTIPLIER_1x2
#undef MULTIPLIER_1x4
#undef BASE_SHIFT

182 183
/*
 * Fill the block with the mid-range value 1 << (BITDEPTH - 1) (128 for
 * 8-bit). No edge pixels are read; topleft, a, max_width and max_height
 * are unused.
 */
static void ipred_dc_128_c(pixel *dst, const ptrdiff_t stride,
                           const pixel *const topleft,
                           const int width, const int height, const int a,
                           const int max_width, const int max_height)
{
    splat_dc(dst, stride, width, height, 1 << (BITDEPTH - 1));
}
189

190 191 192
/*
 * cfl_pred() with the DC term fixed at the mid-range value
 * 1 << (BITDEPTH - 1). topleft is not read.
 */
static void ipred_cfl_128_c(pixel *dst, const ptrdiff_t stride,
                            const pixel *const topleft,
                            const int width, const int height,
                            const int16_t *ac, const int alpha)
{
    cfl_pred(dst, stride, width, height, 1 << (BITDEPTH - 1), ac, alpha);
}

198 199
/*
 * Vertical prediction: every row of the block is a copy of the width
 * pixels starting at topleft[1]. a, max_width and max_height are unused.
 */
static void ipred_v_c(pixel *dst, const ptrdiff_t stride,
                      const pixel *const topleft,
                      const int width, const int height, const int a,
                      const int max_width, const int max_height)
{
    for (int y = 0; y < height; y++) {
        pixel_copy(dst, topleft + 1, width);
        dst += PXSTRIDE(stride);
    }
}

209 210
/*
 * Horizontal prediction: row y of the block is filled with the single
 * pixel topleft[-(1 + y)]. a, max_width and max_height are unused.
 */
static void ipred_h_c(pixel *dst, const ptrdiff_t stride,
                      const pixel *const topleft,
                      const int width, const int height, const int a,
                      const int max_width, const int max_height)
{
    for (int y = 0; y < height; y++) {
        pixel_set(dst, topleft[-(1 + y)], width);
        dst += PXSTRIDE(stride);
    }
}

220 221
/*
 * Paeth prediction. For each pixel, with left = tl_ptr[-(y + 1)],
 * top = tl_ptr[1 + x] and topleft = tl_ptr[0], compute
 * base = left + top - topleft and output whichever of the three neighbours
 * is closest to base. Ties are resolved in the order left, top, topleft.
 * a, max_width and max_height are unused.
 */
static void ipred_paeth_c(pixel *dst, const ptrdiff_t stride,
                          const pixel *const tl_ptr,
                          const int width, const int height, const int a,
                          const int max_width, const int max_height)
{
    const int topleft = tl_ptr[0];
    for (int y = 0; y < height; y++) {
        const int left = tl_ptr[-(y + 1)];
        for (int x = 0; x < width; x++) {
            const int top = tl_ptr[1 + x];
            const int base = left + top - topleft;
            const int ldiff = abs(left - base);
            const int tdiff = abs(top - base);
            const int tldiff = abs(topleft - base);

            dst[x] = ldiff <= tdiff && ldiff <= tldiff ? left :
                     tdiff <= tldiff ? top : topleft;
        }
        dst += PXSTRIDE(stride);
    }
}

242 243
/*
 * Smooth prediction in both directions.
 *
 * Each output pixel is the sum of two blends weighted out of 256:
 *   vertical:   top[x]  against bottom = topleft[-height], weight weights_ver[y]
 *   horizontal: left[y] against right  = topleft[width],   weight weights_hor[x]
 * The two blends together carry a scale of 512, hence the rounding term of
 * 256 and the shift by 9. The weight rows are taken from dav1d_sm_weights
 * starting at index width (horizontal) and height (vertical).
 * a, max_width and max_height are unused.
 */
static void ipred_smooth_c(pixel *dst, const ptrdiff_t stride,
                           const pixel *const topleft,
                           const int width, const int height, const int a,
                           const int max_width, const int max_height)
{
    const uint8_t *const weights_hor = &dav1d_sm_weights[width];
    const uint8_t *const weights_ver = &dav1d_sm_weights[height];
    const int right = topleft[width], bottom = topleft[-height];

    for (int y = 0; y < height; y++) {
        for (int x = 0; x < width; x++) {
            const int pred = weights_ver[y]  * topleft[1 + x] +
                      (256 - weights_ver[y]) * bottom +
                             weights_hor[x]  * topleft[-(1 + y)] +
                      (256 - weights_hor[x]) * right;
            dst[x] = (pred + 256) >> 9;
        }
        dst += PXSTRIDE(stride);
    }
}

263 264
/*
 * Smooth prediction, vertical direction only: each pixel blends
 * topleft[1 + x] against bottom = topleft[-height] with weight
 * weights_ver[y] out of 256 (rounded, shift by 8). The weight row starts at
 * dav1d_sm_weights[height]. a, max_width and max_height are unused.
 */
static void ipred_smooth_v_c(pixel *dst, const ptrdiff_t stride,
                             const pixel *const topleft,
                             const int width, const int height, const int a,
                             const int max_width, const int max_height)
{
    const uint8_t *const weights_ver = &dav1d_sm_weights[height];
    const int bottom = topleft[-height];

    for (int y = 0; y < height; y++) {
        for (int x = 0; x < width; x++) {
            const int pred = weights_ver[y]  * topleft[1 + x] +
                      (256 - weights_ver[y]) * bottom;
            dst[x] = (pred + 128) >> 8;
        }
        dst += PXSTRIDE(stride);
    }
}

281 282
/*
 * Smooth prediction, horizontal direction only: each pixel blends
 * topleft[-(y + 1)] against right = topleft[width] with weight
 * weights_hor[x] out of 256 (rounded, shift by 8). The weight row starts at
 * dav1d_sm_weights[width]. a, max_width and max_height are unused.
 */
static void ipred_smooth_h_c(pixel *dst, const ptrdiff_t stride,
                             const pixel *const topleft,
                             const int width, const int height, const int a,
                             const int max_width, const int max_height)
{
    const uint8_t *const weights_hor = &dav1d_sm_weights[width];
    const int right = topleft[width];

    for (int y = 0; y < height; y++) {
        for (int x = 0; x < width; x++) {
            const int pred = weights_hor[x]  * topleft[-(y + 1)] +
                      (256 - weights_hor[x]) * right;
            dst[x] = (pred + 128) >> 8;
        }
        dst += PXSTRIDE(stride);
    }
}

/*
 * Select the edge filter strength (0 = no filtering, otherwise 1..3) from
 * the block size measure blk_wh, the angle delta d and the filter type.
 * type == 0 and type != 0 use separate threshold tables; within a table
 * the first size class that contains blk_wh decides.
 */
static int get_filter_strength(const unsigned blk_wh, const unsigned d,
                               const int type)
{
    if (type == 0) {
        if (blk_wh <= 8)
            return d >= 56;
        /* the 9..12 and 13..16 size classes share the same threshold */
        if (blk_wh <= 16)
            return d >= 40;
        if (blk_wh <= 24)
            return d >= 32 ? 3 : d >= 16 ? 2 : d >= 8;
        if (blk_wh <= 32)
            return d >= 32 ? 3 : d >= 4 ? 2 : d >= 1;
        return d >= 1 ? 3 : 0;
    }

    if (blk_wh <= 8)
        return d >= 64 ? 2 : d >= 40;
    if (blk_wh <= 16)
        return d >= 48 ? 2 : d >= 20;
    if (blk_wh <= 24)
        return d >= 4 ? 3 : 0;
    return d >= 1 ? 3 : 0;
}

339 340 341
/*
 * Produce sz edge samples in out[] from in[].
 *
 * Every read of in[] is clamped to the index range [from, to - 1].
 * Output positions in [lim_from, lim_to) (both limited to sz) are smoothed
 * with a 5-tap kernel centred on the sample and selected by strength
 * (1..3); positions before lim_from and from lim_to onwards are plain
 * copies. Each kernel row sums to 16, hence the rounding term of 8 and the
 * shift by 4.
 */
static void filter_edge(pixel *const out, const int sz,
                        const int lim_from, const int lim_to,
                        const pixel *const in,
                        const int from, const int to, const unsigned strength)
{
    static const uint8_t kernel[3][5] = {
        { 0, 4, 8, 4, 0 },
        { 0, 5, 6, 5, 0 },
        { 2, 4, 4, 4, 2 }
    };

    assert(strength > 0);
    int i = 0;
    /* unfiltered head */
    for (; i < imin(sz, lim_from); i++)
        out[i] = in[iclip(i, from, to - 1)];
    /* filtered middle */
    for (; i < imin(lim_to, sz); i++) {
        int s = 0;
        for (int j = 0; j < 5; j++)
            s += in[iclip(i - 2 + j, from, to - 1)] * kernel[strength - 1][j];
        out[i] = (s + 8) >> 4;
    }
    /* unfiltered tail */
    for (; i < sz; i++)
        out[i] = in[iclip(i, from, to - 1)];
}

/*
 * Decide whether an edge should be upsampled: returns 1 only when the
 * angle delta d is below 40 and blk_wh does not exceed the size limit
 * (8 when type is non-zero, 16 otherwise); returns 0 in all other cases.
 */
static int get_upsample(const int blk_wh, const unsigned d, const int type) {
    const int max_blk_wh = type ? 8 : 16;
    return d < 40 && blk_wh <= max_blk_wh;
}

/*
 * Upsample an edge by two: writes 2 * hsz - 1 samples to out[].
 *
 * Even outputs out[2 * i] are the input samples; odd outputs
 * out[2 * i + 1] are interpolated between in[i] and in[i + 1] with the
 * 4-tap kernel (-1, 9, 9, -1) / 16 and clipped to the pixel range. Every
 * read of in[] is clamped to the index range [from, to - 1].
 */
static void upsample_edge(pixel *const out, const int hsz,
                          const pixel *const in, const int from, const int to)
{
    static const int8_t kernel[4] = { -1, 9, 9, -1 };
    int i;
    for (i = 0; i < hsz - 1; i++) {
        out[i * 2] = in[iclip(i, from, to - 1)];

        int s = 0;
        for (int j = 0; j < 4; j++)
            s += in[iclip(i + j - 1, from, to - 1)] * kernel[j];
        out[i * 2 + 1] = iclip_pixel((s + 8) >> 4);
    }
    /* final input sample; it has no interpolated successor */
    out[i * 2] = in[iclip(i, from, to - 1)];
}

385 386
/*
 * Directional prediction for angles below 90 degrees; reads only the edge
 * starting at topleft_in[1].
 *
 * angle packs three fields: bits 0-8 the angle itself, bit 9 the is_sm
 * flag handed to get_upsample()/get_filter_strength(), and bits 10 and up
 * the enable_intra_edge_filter flag.
 *
 * Depending on those flags the edge is upsampled by two, smoothed, or used
 * directly. xpos advances by dx per row and carries 6 fractional bits:
 * xpos >> 6 is the integer edge index and (xpos >> 1) & 0x1F the 5-bit
 * interpolation weight (out of 32). Once the edge index reaches max_base_x
 * the rest of the row is filled with top[max_base_x].
 * max_width and max_height are unused.
 */
static void ipred_z1_c(pixel *dst, const ptrdiff_t stride,
                       const pixel *const topleft_in,
                       const int width, const int height, int angle,
                       const int max_width, const int max_height)
{
    const int is_sm = (angle >> 9) & 0x1;
    const int enable_intra_edge_filter = angle >> 10;
    angle &= 511;
    assert(angle < 90);
    int dx = dav1d_dr_intra_derivative[angle];
    pixel top_out[(64 + 64) * 2];
    const pixel *top;
    int max_base_x;
    const int upsample_above = enable_intra_edge_filter ?
        get_upsample(width + height, 90 - angle, is_sm) : 0;
    if (upsample_above) {
        upsample_edge(top_out, width + height,
                      &topleft_in[1], -1, width + imin(width, height));
        top = top_out;
        max_base_x = 2 * (width + height) - 2;
        dx <<= 1; /* the edge now has twice the resolution */
    } else {
        const int filter_strength = enable_intra_edge_filter ?
            get_filter_strength(width + height, 90 - angle, is_sm) : 0;
        if (filter_strength) {
            filter_edge(top_out, width + height, 0, width + height,
                        &topleft_in[1], -1, width + imin(width, height),
                        filter_strength);
            top = top_out;
            max_base_x = width + height - 1;
        } else {
            top = &topleft_in[1];
            max_base_x = width + imin(width, height) - 1;
        }
    }
    const int base_inc = 1 + upsample_above;
    for (int y = 0, xpos = dx; y < height;
         y++, dst += PXSTRIDE(stride), xpos += dx)
    {
        const int frac = (xpos >> 1) & 0x1F;

        for (int x = 0, base = xpos >> 6; x < width; x++, base += base_inc) {
            if (base < max_base_x) {
                const int v = top[base] * (32 - frac) + top[base + 1] * frac;
                dst[x] = iclip_pixel((v + 16) >> 5);
            } else {
                /* past the end of the edge: replicate its last sample */
                pixel_set(&dst[x], top[max_base_x], width - x);
                break;
            }
        }
    }
}

438 439
/*
 * Directional prediction for angles strictly between 90 and 180 degrees;
 * reads both edges around topleft_in.
 *
 * angle packs three fields: bits 0-8 the angle itself, bit 9 the is_sm
 * flag handed to get_upsample()/get_filter_strength(), and bits 10 and up
 * the enable_intra_edge_filter flag.
 *
 * Both edges are first gathered into the local edge[] buffer around
 * topleft (= &edge[height * 2]), each one independently upsampled by two,
 * smoothed, or copied. For every pixel the position is projected along
 * the row above (xpos, step dx); when the resulting index falls before
 * min_base_x the pixel is instead interpolated from the left column (ypos,
 * step dy). Positions carry 6 fractional bits, of which the top 5 are used
 * as the interpolation weight (out of 32). xpos and ypos may be negative,
 * so the >> 6 relies on arithmetic right shift of negative ints.
 */
static void ipred_z2_c(pixel *dst, const ptrdiff_t stride,
                       const pixel *const topleft_in,
                       const int width, const int height, int angle,
                       const int max_width, const int max_height)
{
    const int is_sm = (angle >> 9) & 0x1;
    const int enable_intra_edge_filter = angle >> 10;
    angle &= 511;
    assert(angle > 90 && angle < 180);
    int dy = dav1d_dr_intra_derivative[angle - 90];
    int dx = dav1d_dr_intra_derivative[180 - angle];
    const int upsample_left = enable_intra_edge_filter ?
        get_upsample(width + height, 180 - angle, is_sm) : 0;
    const int upsample_above = enable_intra_edge_filter ?
        get_upsample(width + height, angle - 90, is_sm) : 0;
    pixel edge[64 * 2 + 64 * 2 + 1];
    pixel *const topleft = &edge[height * 2];

    /* top edge -> topleft[1 ..] */
    if (upsample_above) {
        upsample_edge(topleft, width + 1, topleft_in, 0, width + 1);
        dx <<= 1;
    } else {
        const int filter_strength = enable_intra_edge_filter ?
            get_filter_strength(width + height, angle - 90, is_sm) : 0;

        if (filter_strength) {
            filter_edge(&topleft[1], width, 0, max_width,
                        &topleft_in[1], -1, width,
                        filter_strength);
        } else {
            pixel_copy(&topleft[1], &topleft_in[1], width);
        }
    }
    /* left edge -> entries below topleft */
    if (upsample_left) {
        upsample_edge(edge, height + 1, &topleft_in[-height], 0, height + 1);
        dy <<= 1;
    } else {
        const int filter_strength = enable_intra_edge_filter ?
            get_filter_strength(width + height, 180 - angle, is_sm) : 0;

        if (filter_strength) {
            filter_edge(&topleft[-height], height, height - max_height, height,
                        &topleft_in[-height],
                        0, height + 1, filter_strength);
        } else {
            pixel_copy(&topleft[-height], &topleft_in[-height], height);
        }
    }
    /* the corner pixel itself is always taken unmodified */
    *topleft = *topleft_in;

    const int min_base_x = -(1 + upsample_above);
    const int base_inc_x = 1 + upsample_above;
    const pixel *const left = &topleft[-(1 + upsample_left)];
    const pixel *const top = &topleft[1 + upsample_above];
    for (int y = 0, xpos = -dx; y < height;
         y++, xpos -= dx, dst += PXSTRIDE(stride))
    {
        int base_x = xpos >> 6;
        const int frac_x = (xpos >> 1) & 0x1F;

        for (int x = 0, ypos = (y << (6 + upsample_left)) - dy; x < width;
             x++, base_x += base_inc_x, ypos -= dy)
        {
            int v;

            if (base_x >= min_base_x) {
                /* projection lands on the top edge */
                v = top[base_x] * (32 - frac_x) + top[base_x + 1] * frac_x;
            } else {
                /* projection lands on the left edge */
                const int base_y = ypos >> 6;
                assert(base_y >= -(1 + upsample_left));
                const int frac_y = (ypos >> 1) & 0x1F;
                v = left[-base_y] * (32 - frac_y) + left[-(base_y + 1)] * frac_y;
            }
            dst[x] = iclip_pixel((v + 16) >> 5);
        }
    }
}

516 517
/*
 * Directional prediction for angles above 180 degrees; reads only the edge
 * stored before topleft_in (negative indices).
 *
 * angle packs three fields: bits 0-8 the angle itself, bit 9 the is_sm
 * flag handed to get_upsample()/get_filter_strength(), and bits 10 and up
 * the enable_intra_edge_filter flag.
 *
 * Depending on those flags the edge is upsampled by two, smoothed, or used
 * directly; left then points at the sample nearest the corner and is
 * indexed with negative offsets. The block is produced column by column:
 * ypos advances by dy per column and carries 6 fractional bits, of which
 * the top 5 are used as the interpolation weight (out of 32). Once the
 * edge index reaches max_base_y the rest of the column is filled with
 * left[-max_base_y]. max_width and max_height are unused.
 */
static void ipred_z3_c(pixel *dst, const ptrdiff_t stride,
                       const pixel *const topleft_in,
                       const int width, const int height, int angle,
                       const int max_width, const int max_height)
{
    const int is_sm = (angle >> 9) & 0x1;
    const int enable_intra_edge_filter = angle >> 10;
    angle &= 511;
    assert(angle > 180);
    int dy = dav1d_dr_intra_derivative[270 - angle];
    pixel left_out[(64 + 64) * 2];
    const pixel *left;
    int max_base_y;
    const int upsample_left = enable_intra_edge_filter ?
        get_upsample(width + height, angle - 180, is_sm) : 0;
    if (upsample_left) {
        upsample_edge(left_out, width + height,
                      &topleft_in[-(width + height)],
                      imax(width - height, 0), width + height + 1);
        left = &left_out[2 * (width + height) - 2];
        max_base_y = 2 * (width + height) - 2;
        dy <<= 1; /* the edge now has twice the resolution */
    } else {
        const int filter_strength = enable_intra_edge_filter ?
            get_filter_strength(width + height, angle - 180, is_sm) : 0;

        if (filter_strength) {
            filter_edge(left_out, width + height, 0, width + height,
                        &topleft_in[-(width + height)],
                        imax(width - height, 0), width + height + 1,
                        filter_strength);
            left = &left_out[width + height - 1];
            max_base_y = width + height - 1;
        } else {
            left = &topleft_in[-1];
            max_base_y = height + imin(width, height) - 1;
        }
    }
    const int base_inc = 1 + upsample_left;
    for (int x = 0, ypos = dy; x < width; x++, ypos += dy) {
        const int frac = (ypos >> 1) & 0x1F;

        for (int y = 0, base = ypos >> 6; y < height; y++, base += base_inc) {
            if (base < max_base_y) {
                const int v = left[-base] * (32 - frac) +
                              left[-(base + 1)] * frac;
                dst[y * PXSTRIDE(stride) + x] = iclip_pixel((v + 16) >> 5);
            } else {
                /* past the end of the edge: replicate its last sample */
                do {
                    dst[y * PXSTRIDE(stride) + x] = left[-max_base_y];
                } while (++y < height);
                break;
            }
        }
    }
}

573 574 575
/*
 * Filter intra prediction. Up to 32x32 only.
 *
 * filt_idx (masked to its low 9 bits, must be < 5) selects a tap set from
 * dav1d_filter_intra_taps. The block is produced in 4x2 sub-blocks, left
 * to right and top to bottom. Each sub-block is computed from seven
 * neighbours: p0 (top-left), p1..p4 (the four pixels above) and p5, p6
 * (the two pixels to the left). For the first sub-block of a row pair the
 * neighbours come from topleft_in; afterwards they are read back from
 * pixels already written to dst, so the processing order matters.
 * The tap for neighbour pN of output pixel k (k = 0..7) is read at
 * flt_ptr offsets 0, 1, 16, 17, 32, 33, 48 with flt_ptr advancing by 2 per
 * output pixel. max_width and max_height are unused.
 */
static void ipred_filter_c(pixel *dst, const ptrdiff_t stride,
                           const pixel *const topleft_in,
                           const int width, const int height, int filt_idx,
                           const int max_width, const int max_height)
{
    filt_idx &= 511;
    assert(filt_idx < 5);

    const int8_t *const filter = dav1d_filter_intra_taps[filt_idx];
    int x, y;
    ptrdiff_t left_stride;
    const pixel *left, *topleft, *top;

    top = &topleft_in[1];
    for (y = 0; y < height; y += 2) {
        /* first sub-block of the row pair: left neighbours come from the
         * edge buffer, where consecutive rows are one entry apart going
         * backwards */
        topleft = &topleft_in[-y];
        left = &topleft[-1];
        left_stride = -1;
        for (x = 0; x < width; x += 4) {
            const int p0 = *topleft;
            const int p1 = top[0], p2 = top[1], p3 = top[2], p4 = top[3];
            const int p5 = left[0 * left_stride], p6 = left[1 * left_stride];
            pixel *ptr = &dst[x];
            const int8_t *flt_ptr = filter;

            for (int yy = 0; yy < 2; yy++) {
                for (int xx = 0; xx < 4; xx++, flt_ptr += 2) {
                    int acc = flt_ptr[ 0] * p0 + flt_ptr[ 1] * p1 +
                              flt_ptr[16] * p2 + flt_ptr[17] * p3 +
                              flt_ptr[32] * p4 + flt_ptr[33] * p5 +
                              flt_ptr[48] * p6;
                    ptr[xx] = iclip_pixel((acc + 8) >> 4);
                }
                ptr += PXSTRIDE(stride);
            }
            /* next sub-block: its left neighbours are the last column just
             * written, one picture row apart */
            left = &dst[x + 4 - 1];
            left_stride = PXSTRIDE(stride);
            top += 4;
            topleft = &top[-1];
        }
        /* next row pair: the second row just written becomes the top row */
        top = &dst[PXSTRIDE(stride)];
        dst = &dst[PXSTRIDE(stride) * 2];
    }
}

619
static NOINLINE void
620 621
cfl_ac_c(int16_t *ac, const pixel *ypx, const ptrdiff_t stride,
         const int w_pad, const int h_pad, const int width, const int height,
622
         const int ss_hor, const int ss_ver)
623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645
{
    int y, x;
    int16_t *const ac_orig = ac;

    assert(w_pad >= 0 && w_pad * 4 < width);
    assert(h_pad >= 0 && h_pad * 4 < height);

    for (y = 0; y < height - 4 * h_pad; y++) {
        for (x = 0; x < width - 4 * w_pad; x++) {
            int ac_sum = ypx[x << ss_hor];
            if (ss_hor) ac_sum += ypx[x * 2 + 1];
            if (ss_ver) {
                ac_sum += ypx[(x << ss_hor) + PXSTRIDE(stride)];
                if (ss_hor) ac_sum += ypx[x * 2 + 1 + PXSTRIDE(stride)];
            }
            ac[x] = ac_sum << (1 + !ss_ver + !ss_hor);
        }
        for (; x < width; x++)
            ac[x] = ac[x - 1];
        ac += width;
        ypx += PXSTRIDE(stride) << ss_ver;
    }
    for (; y < height; y++) {
646
        memcpy(ac, &ac[-width], width * sizeof(*ac));
647 648 649
        ac += width;
    }

650
    const int log2sz = ctz(width) + ctz(height);
651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666
    int sum = (1 << log2sz) >> 1;
    for (ac = ac_orig, y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            sum += ac[x];
        ac += width;
    }
    sum >>= log2sz;

    // subtract DC
    for (ac = ac_orig, y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            ac[x] -= sum;
        ac += width;
    }
}

667 668 669 670
#define cfl_ac_fn(fmt, ss_hor, ss_ver) \
static void cfl_ac_##fmt##_c(int16_t *const ac, const pixel *const ypx, \
                             const ptrdiff_t stride, const int w_pad, \
                             const int h_pad, const int cw, const int ch) \
671
{ \
672
    cfl_ac_c(ac, ypx, stride, w_pad, h_pad, cw, ch, ss_hor, ss_ver); \
673 674
}

675 676 677
cfl_ac_fn(420, 1, 1)
cfl_ac_fn(422, 1, 0)
cfl_ac_fn(444, 0, 0)
678 679 680 681 682 683 684 685 686 687 688 689 690 691

/*
 * Palette prediction: every pixel of the w x h block is looked up in pal[]
 * using the matching entry of idx[], which is read as a packed array with
 * a row pitch of w entries.
 */
static void pal_pred_c(pixel *dst, const ptrdiff_t stride,
                       const uint16_t *const pal, const uint8_t *idx,
                       const int w, const int h)
{
    for (int row = 0; row < h; row++, idx += w, dst += PXSTRIDE(stride)) {
        for (int col = 0; col < w; col++)
            dst[col] = pal[idx[col]];
    }
}

/*
 * Fill the intra prediction DSP context with the C implementations from
 * this file. When built with HAVE_ASM && ARCH_X86 the x86 init function is
 * called afterwards on the same context (presumably so it can replace
 * entries with optimized versions).
 */
void bitfn(dav1d_intra_pred_dsp_init)(Dav1dIntraPredDSPContext *const c) {
    c->intra_pred[DC_PRED      ] = ipred_dc_c;
    c->intra_pred[DC_128_PRED  ] = ipred_dc_128_c;
    c->intra_pred[TOP_DC_PRED  ] = ipred_dc_top_c;
    c->intra_pred[LEFT_DC_PRED ] = ipred_dc_left_c;
    c->intra_pred[HOR_PRED     ] = ipred_h_c;
    c->intra_pred[VERT_PRED    ] = ipred_v_c;
    c->intra_pred[PAETH_PRED   ] = ipred_paeth_c;
    c->intra_pred[SMOOTH_PRED  ] = ipred_smooth_c;
    c->intra_pred[SMOOTH_V_PRED] = ipred_smooth_v_c;
    c->intra_pred[SMOOTH_H_PRED] = ipred_smooth_h_c;
    c->intra_pred[Z1_PRED      ] = ipred_z1_c;
    c->intra_pred[Z2_PRED      ] = ipred_z2_c;
    c->intra_pred[Z3_PRED      ] = ipred_z3_c;
    c->intra_pred[FILTER_PRED  ] = ipred_filter_c;

    /* cfl_ac[] is indexed by pixel layout minus one */
    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I420 - 1] = cfl_ac_420_c;
    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I422 - 1] = cfl_ac_422_c;
    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1] = cfl_ac_444_c;

    /* cfl_pred[] is indexed by the DC prediction mode */
    c->cfl_pred[DC_PRED     ] = ipred_cfl_c;
    c->cfl_pred[DC_128_PRED ] = ipred_cfl_128_c;
    c->cfl_pred[TOP_DC_PRED ] = ipred_cfl_top_c;
    c->cfl_pred[LEFT_DC_PRED] = ipred_cfl_left_c;

    c->pal_pred = pal_pred_c;

#if HAVE_ASM && ARCH_X86
    bitfn(dav1d_intra_pred_dsp_init_x86)(c);
#endif
}