/*
 * Copyright © 2018, VideoLAN and dav1d authors
 * Copyright © 2018, Two Orioles, LLC
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "config.h"

#include <assert.h>
#include <stdlib.h>
#include <string.h>

#include "common/attributes.h"
#include "common/intops.h"

#include "src/ipred.h"
#include "src/tables.h"
static NOINLINE void
41 42
splat_dc(pixel *dst, const ptrdiff_t stride,
         const int width, const int height, const unsigned dc)
43 44 45
{
    assert(dc <= (1 << BITDEPTH) - 1);
#if BITDEPTH == 8
46
    if (width > 4) {
47
        const uint64_t dcN = dc * 0x0101010101010101ULL;
48 49
        for (int y = 0; y < height; y++) {
            for (int x = 0; x < width; x += sizeof(dcN))
50 51 52 53 54
                *((uint64_t *) &dst[x]) = dcN;
            dst += PXSTRIDE(stride);
        }
    } else {
        const unsigned dcN = dc * 0x01010101U;
55 56
        for (int y = 0; y < height; y++) {
            for (int x = 0; x < width; x += sizeof(dcN))
57 58 59 60 61 62
                *((unsigned *) &dst[x]) = dcN;
            dst += PXSTRIDE(stride);
        }
    }
#else
    const uint64_t dcN = dc * 0x0001000100010001ULL;
63 64
    for (int y = 0; y < height; y++) {
        for (int x = 0; x < width; x += sizeof(dcN) >> 1)
65 66 67 68 69 70
            *((uint64_t *) &dst[x]) = dcN;
        dst += PXSTRIDE(stride);
    }
#endif
}

/*
 * Predict a width x height block as dc plus a scaled per-pixel AC term.
 *
 * ac is read as rows of `width` int16_t entries (the pointer advances by
 * width per row). For every pixel the product alpha * ac[x] is reduced by
 * 6 bits with rounding applied to its magnitude; the product itself is
 * handed to apply_sign() as the sign source (apply_sign is defined
 * elsewhere - presumably it gives the rounded magnitude the sign of diff).
 * The sum with dc goes through iclip_pixel() before being stored.
 */
static NOINLINE void
cfl_pred(pixel *dst, const ptrdiff_t stride,
         const int width, const int height, const unsigned dc,
         const int16_t *ac, const int alpha)
{
    for (int y = 0; y < height; y++) {
        for (int x = 0; x < width; x++) {
            const int diff = alpha * ac[x];
            dst[x] = iclip_pixel(dc + apply_sign((abs(diff) + 32) >> 6, diff));
        }
        ac += width;
        dst += PXSTRIDE(stride);
    }
}

/*
 * Rounded average of the `width` pixels topleft[1] .. topleft[width].
 *
 * The sum is seeded with width / 2 for rounding and divided by shifting
 * right by ctz(width) (ctz is defined elsewhere - presumably the number of
 * trailing zero bits, which makes this an exact division only when width
 * is a power of two).
 */
static unsigned dc_gen_top(const pixel *const topleft, const int width) {
    unsigned dc = width >> 1;
    for (int i = 0; i < width; i++)
       dc += topleft[1 + i];
    return dc >> ctz(width);
}
/*
 * Fill the block with the value returned by dc_gen_top(topleft, width).
 * a, max_width and max_height are not used by this predictor.
 */
static void ipred_dc_top_c(pixel *dst, const ptrdiff_t stride,
                           const pixel *const topleft,
                           const int width, const int height, const int a,
                           const int max_width, const int max_height)
{
    splat_dc(dst, stride, width, height, dc_gen_top(topleft, width));
}

/*
 * cfl_pred() with the DC term taken from dc_gen_top(topleft, width).
 */
static void ipred_cfl_top_c(pixel *dst, const ptrdiff_t stride,
                            const pixel *const topleft,
                            const int width, const int height,
                            const int16_t *ac, const int alpha)
{
    cfl_pred(dst, stride, width, height, dc_gen_top(topleft, width), ac, alpha);
}

/*
 * Rounded average of the `height` pixels topleft[-1] .. topleft[-height].
 *
 * The sum is seeded with height / 2 for rounding and divided by shifting
 * right by ctz(height) (ctz is defined elsewhere - presumably the number of
 * trailing zero bits, which makes this an exact division only when height
 * is a power of two).
 */
static unsigned dc_gen_left(const pixel *const topleft, const int height) {
    unsigned dc = height >> 1;
    for (int i = 0; i < height; i++)
       dc += topleft[-(1 + i)];
    return dc >> ctz(height);
}
/*
 * Fill the block with the value returned by dc_gen_left(topleft, height).
 * a, max_width and max_height are not used by this predictor.
 */
static void ipred_dc_left_c(pixel *dst, const ptrdiff_t stride,
                            const pixel *const topleft,
                            const int width, const int height, const int a,
                            const int max_width, const int max_height)
{
    splat_dc(dst, stride, width, height, dc_gen_left(topleft, height));
}

/*
 * cfl_pred() with the DC term taken from dc_gen_left(topleft, height).
 */
static void ipred_cfl_left_c(pixel *dst, const ptrdiff_t stride,
                             const pixel *const topleft,
                             const int width, const int height,
                             const int16_t *ac, const int alpha)
{
    unsigned dc = dc_gen_left(topleft, height);
    cfl_pred(dst, stride, width, height, dc, ac, alpha);
}
/*
 * Fixed-point reciprocals used by dc_gen() for non-square blocks:
 * MULTIPLIER_1x2 / 2^BASE_SHIFT is approximately 1/3 and
 * MULTIPLIER_1x4 / 2^BASE_SHIFT is approximately 1/5. The non-8-bit
 * variant uses one more fractional bit.
 */
#if BITDEPTH == 8
#define MULTIPLIER_1x2 0x5556
#define MULTIPLIER_1x4 0x3334
#define BASE_SHIFT 16
#else
#define MULTIPLIER_1x2 0xAAAB
#define MULTIPLIER_1x4 0x6667
#define BASE_SHIFT 17
#endif

static unsigned dc_gen(const pixel *const topleft,
                       const int width, const int height)
145 146 147 148 149 150 151 152 153 154 155 156 157
{
    unsigned dc = (width + height) >> 1;
    for (int i = 0; i < width; i++)
       dc += topleft[i + 1];
    for (int i = 0; i < height; i++)
       dc += topleft[-(i + 1)];
    dc >>= ctz(width + height);

    if (width != height) {
        dc *= (width > height * 2 || height > width * 2) ? MULTIPLIER_1x4 :
                                                           MULTIPLIER_1x2;
        dc >>= BASE_SHIFT;
    }
158 159 160 161 162
    return dc;
}

/*
 * Fill the block with the value returned by dc_gen(topleft, width, height).
 * a, max_width and max_height are not used by this predictor.
 */
static void ipred_dc_c(pixel *dst, const ptrdiff_t stride,
                       const pixel *const topleft,
                       const int width, const int height, const int a,
                       const int max_width, const int max_height)
{
    splat_dc(dst, stride, width, height, dc_gen(topleft, width, height));
}
/*
 * cfl_pred() with the DC term taken from dc_gen(topleft, width, height).
 */
static void ipred_cfl_c(pixel *dst, const ptrdiff_t stride,
                        const pixel *const topleft,
                        const int width, const int height,
                        const int16_t *ac, const int alpha)
{
    unsigned dc = dc_gen(topleft, width, height);
    cfl_pred(dst, stride, width, height, dc, ac, alpha);
}
/* The reciprocal constants are only used by dc_gen(); drop them again. */
#undef MULTIPLIER_1x2
#undef MULTIPLIER_1x4
#undef BASE_SHIFT

/*
 * Fill the block with the constant 1 << (BITDEPTH - 1), i.e. half of the
 * 2^BITDEPTH value range. topleft, a, max_width and max_height are not used.
 */
static void ipred_dc_128_c(pixel *dst, const ptrdiff_t stride,
                           const pixel *const topleft,
                           const int width, const int height, const int a,
                           const int max_width, const int max_height)
{
    splat_dc(dst, stride, width, height, 1 << (BITDEPTH - 1));
}
/*
 * cfl_pred() with the constant DC term 1 << (BITDEPTH - 1).
 * topleft is not used.
 */
static void ipred_cfl_128_c(pixel *dst, const ptrdiff_t stride,
                            const pixel *const topleft,
                            const int width, const int height,
                            const int16_t *ac, const int alpha)
{
    cfl_pred(dst, stride, width, height, 1 << (BITDEPTH - 1), ac, alpha);
}

/*
 * Vertical prediction: every one of the `height` output rows is produced by
 * pixel_copy(dst, topleft + 1, width), i.e. from the `width` pixels that
 * start at topleft[1]. a, max_width and max_height are not used.
 */
static void ipred_v_c(pixel *dst, const ptrdiff_t stride,
                      const pixel *const topleft,
                      const int width, const int height, const int a,
                      const int max_width, const int max_height)
{
    for (int y = 0; y < height; y++) {
        pixel_copy(dst, topleft + 1, width);
        dst += PXSTRIDE(stride);
    }
}

/*
 * Horizontal prediction: output row y is produced by
 * pixel_set(dst, topleft[-(1 + y)], width), i.e. from the single pixel
 * topleft[-(1 + y)]. a, max_width and max_height are not used.
 */
static void ipred_h_c(pixel *dst, const ptrdiff_t stride,
                      const pixel *const topleft,
                      const int width, const int height, const int a,
                      const int max_width, const int max_height)
{
    for (int y = 0; y < height; y++) {
        pixel_set(dst, topleft[-(1 + y)], width);
        dst += PXSTRIDE(stride);
    }
}

/*
 * Paeth prediction.
 *
 * For every pixel, with left = tl_ptr[-(y + 1)], top = tl_ptr[1 + x] and
 * topleft = tl_ptr[0], the estimate base = left + top - topleft is formed
 * and the output is whichever of the three neighbours is closest to it.
 * Ties are resolved in the order left, top, topleft.
 * a, max_width and max_height are not used.
 */
static void ipred_paeth_c(pixel *dst, const ptrdiff_t stride,
                          const pixel *const tl_ptr,
                          const int width, const int height, const int a,
                          const int max_width, const int max_height)
{
    const int topleft = tl_ptr[0];
    for (int y = 0; y < height; y++) {
        const int left = tl_ptr[-(y + 1)];
        for (int x = 0; x < width; x++) {
            const int top = tl_ptr[1 + x];
            const int base = left + top - topleft;
            const int ldiff = abs(left - base);
            const int tdiff = abs(top - base);
            const int tldiff = abs(topleft - base);

            dst[x] = ldiff <= tdiff && ldiff <= tldiff ? left :
                     tdiff <= tldiff ? top : topleft;
        }
        dst += PXSTRIDE(stride);
    }
}

/*
 * Smooth prediction blending in both directions.
 *
 * Each output is the sum of two weighted pairs:
 *   - topleft[1 + x] against bottom = topleft[-height], weighted by
 *     weights_ver[y] and 256 - weights_ver[y];
 *   - topleft[-(1 + y)] against right = topleft[width], weighted by
 *     weights_hor[x] and 256 - weights_hor[x].
 * Each pair of weights sums to 256, so the total weight is 512 and the
 * result is normalised with a rounded shift by 9.
 * The weight rows are read from dav1d_sm_weights starting at index `width`
 * (horizontal) and `height` (vertical).
 * a, max_width and max_height are not used.
 */
static void ipred_smooth_c(pixel *dst, const ptrdiff_t stride,
                           const pixel *const topleft,
                           const int width, const int height, const int a,
                           const int max_width, const int max_height)
{
    const uint8_t *const weights_hor = &dav1d_sm_weights[width];
    const uint8_t *const weights_ver = &dav1d_sm_weights[height];
    const int right = topleft[width], bottom = topleft[-height];

    for (int y = 0; y < height; y++) {
        for (int x = 0; x < width; x++) {
            const int pred = weights_ver[y]  * topleft[1 + x] +
                      (256 - weights_ver[y]) * bottom +
                             weights_hor[x]  * topleft[-(1 + y)] +
                      (256 - weights_hor[x]) * right;
            dst[x] = (pred + 256) >> 9;
        }
        dst += PXSTRIDE(stride);
    }
}

/*
 * Smooth prediction blending vertically only.
 *
 * Each output blends topleft[1 + x] against bottom = topleft[-height] with
 * the weights weights_ver[y] and 256 - weights_ver[y]; the weights sum to
 * 256, so the result is normalised with a rounded shift by 8.
 * The weight row is read from dav1d_sm_weights starting at index `height`.
 * a, max_width and max_height are not used.
 */
static void ipred_smooth_v_c(pixel *dst, const ptrdiff_t stride,
                             const pixel *const topleft,
                             const int width, const int height, const int a,
                             const int max_width, const int max_height)
{
    const uint8_t *const weights_ver = &dav1d_sm_weights[height];
    const int bottom = topleft[-height];

    for (int y = 0; y < height; y++) {
        for (int x = 0; x < width; x++) {
            const int pred = weights_ver[y]  * topleft[1 + x] +
                      (256 - weights_ver[y]) * bottom;
            dst[x] = (pred + 128) >> 8;
        }
        dst += PXSTRIDE(stride);
    }
}

/*
 * Smooth prediction blending horizontally only.
 *
 * Each output blends topleft[-(y + 1)] against right = topleft[width] with
 * the weights weights_hor[x] and 256 - weights_hor[x]; the weights sum to
 * 256, so the result is normalised with a rounded shift by 8.
 * The weight row is read from dav1d_sm_weights starting at index `width`.
 * a, max_width and max_height are not used.
 */
static void ipred_smooth_h_c(pixel *dst, const ptrdiff_t stride,
                             const pixel *const topleft,
                             const int width, const int height, const int a,
                             const int max_width, const int max_height)
{
    const uint8_t *const weights_hor = &dav1d_sm_weights[width];
    const int right = topleft[width];

    for (int y = 0; y < height; y++) {
        for (int x = 0; x < width; x++) {
            const int pred = weights_hor[x]  * topleft[-(y + 1)] +
                      (256 - weights_hor[x]) * right;
            dst[x] = (pred + 128) >> 8;
        }
        dst += PXSTRIDE(stride);
    }
}

/*
 * Select the edge filter strength (0 = no filtering, otherwise 1..3).
 *
 * blk_wh: block size measure; the callers in this file pass width + height
 * d:      angle distance; the callers pass the difference between the
 *         prediction angle and 90 or 180
 * type:   selects one of two threshold sets (0, or any non-zero value)
 *
 * Within one size class the strength is the one belonging to the largest
 * threshold that d reaches.
 */
static int get_filter_strength(const unsigned blk_wh, const unsigned d,
                               const int type)
{
    if (type == 0) {
        if (blk_wh <= 8)
            return d >= 56;
        if (blk_wh <= 16) // the 9..12 and 13..16 classes share one threshold
            return d >= 40;
        if (blk_wh <= 24)
            return d >= 32 ? 3 : d >= 16 ? 2 : d >= 8 ? 1 : 0;
        if (blk_wh <= 32)
            return d >= 32 ? 3 : d >= 4 ? 2 : d >= 1 ? 1 : 0;
        return d >= 1 ? 3 : 0;
    }

    if (blk_wh <= 8)
        return d >= 64 ? 2 : d >= 40 ? 1 : 0;
    if (blk_wh <= 16)
        return d >= 48 ? 2 : d >= 20 ? 1 : 0;
    if (blk_wh <= 24)
        return d >= 4 ? 3 : 0;
    return d >= 1 ? 3 : 0;
}

/*
 * Produce `sz` edge pixels in `out` from `in`, smoothing the index range
 * [lim_from, lim_to) with a 5-tap kernel.
 *
 * out:      destination, sz entries are written
 * lim_from: first output index that is filtered; lower indices are copied
 * lim_to:   first output index past the filtered range; the rest is copied
 * in:       source edge; every source index is passed through
 *           iclip(idx, from, to - 1) before use (iclip is defined elsewhere -
 *           presumably a clamp, so taps outside [from, to) repeat the
 *           nearest valid pixel)
 * strength: kernel selector, must be 1..3 (asserted > 0)
 *
 * Each kernel row sums to 16, hence the rounded shift by 4.
 */
static void filter_edge(pixel *const out, const int sz,
                        const int lim_from, const int lim_to,
                        const pixel *const in,
                        const int from, const int to, const unsigned strength)
{
    static const uint8_t kernel[3][5] = {
        { 0, 4, 8, 4, 0 },
        { 0, 5, 6, 5, 0 },
        { 2, 4, 4, 4, 2 }
    };

    assert(strength > 0);
    int i = 0;
    // unfiltered head
    for (; i < imin(sz, lim_from); i++)
        out[i] = in[iclip(i, from, to - 1)];
    // filtered middle: taps are centred on i (offsets -2 .. +2)
    for (; i < imin(lim_to, sz); i++) {
        int s = 0;
        for (int j = 0; j < 5; j++)
            s += in[iclip(i - 2 + j, from, to - 1)] * kernel[strength - 1][j];
        out[i] = (s + 8) >> 4;
    }
    // unfiltered tail
    for (; i < sz; i++)
        out[i] = in[iclip(i, from, to - 1)];
}

/*
 * Decide whether an edge is upsampled: returns 1 when d is below 40 and
 * blk_wh does not exceed the size limit (8 when type is non-zero, 16
 * otherwise), and 0 in every other case.
 */
static int get_upsample(const int blk_wh, const unsigned d, const int type) {
    const int max_blk_wh = type ? 8 : 16;

    return d < 40 && blk_wh <= max_blk_wh;
}

/*
 * Upsample an edge by a factor of two.
 *
 * Writes 2 * hsz - 1 entries to `out`: even entries are the source pixels,
 * odd entries are interpolated between source pixels i and i + 1 with the
 * 4-tap kernel { -1, 9, 9, -1 } / 16 (rounded, then passed through
 * iclip_pixel() because the negative taps can overshoot).
 * Every source index is passed through iclip(idx, from, to - 1) before use
 * (iclip is defined elsewhere - presumably a clamp to [from, to - 1]).
 */
static void upsample_edge(pixel *const out, const int hsz,
                          const pixel *const in, const int from, const int to)
{
    static const int8_t kernel[4] = { -1, 9, 9, -1 };
    int i;
    for (i = 0; i < hsz - 1; i++) {
        out[i * 2] = in[iclip(i, from, to - 1)];

        int s = 0;
        for (int j = 0; j < 4; j++)
            s += in[iclip(i + j - 1, from, to - 1)] * kernel[j];
        out[i * 2 + 1] = iclip_pixel((s + 8) >> 4);
    }
    // last source pixel has no right-hand interpolated neighbour
    out[i * 2] = in[iclip(i, from, to - 1)];
}

/*
 * Directional prediction for angles below 90, reading only the edge that
 * starts at topleft_in[1].
 *
 * angle packs three fields: bits 0-8 are the angle itself (asserted < 90),
 * bit 9 (is_sm) is forwarded as the `type` argument of get_upsample() /
 * get_filter_strength(), and the bits from 10 upwards enable edge
 * upsampling / filtering.
 *
 * The edge is optionally upsampled or filtered into the local top_out
 * buffer. Each row then walks along the edge from a fixed-point position
 * xpos (advanced by dx per row) and interpolates linearly between two
 * neighbouring edge pixels with a 5-bit fraction. Once the position reaches
 * max_base_x, the rest of the row is filled with top[max_base_x].
 * max_width and max_height are not used.
 */
static void ipred_z1_c(pixel *dst, const ptrdiff_t stride,
                       const pixel *const topleft_in,
                       const int width, const int height, int angle,
                       const int max_width, const int max_height)
{
    const int is_sm = (angle >> 9) & 0x1;
    const int enable_intra_edge_filter = angle >> 10;
    angle &= 511;
    assert(angle < 90);
    const int dx = dav1d_dr_intra_derivative[angle];
    pixel top_out[(64 + 64) * 2];
    const pixel *top;
    int max_base_x;
    const int upsample_above = enable_intra_edge_filter ?
        get_upsample(width + height, 90 - angle, is_sm) : 0;
    if (upsample_above) {
        upsample_edge(top_out, width + height,
                      &topleft_in[1], -1, width + imin(width, height));
        top = top_out;
        max_base_x = 2 * (width + height) - 2;
    } else {
        const int filter_strength = enable_intra_edge_filter ?
            get_filter_strength(width + height, 90 - angle, is_sm) : 0;
        if (filter_strength) {
            filter_edge(top_out, width + height, 0, width + height,
                        &topleft_in[1], -1, width + imin(width, height),
                        filter_strength);
            top = top_out;
            max_base_x = width + height - 1;
        } else {
            top = &topleft_in[1];
            max_base_x = width + imin(width, height) - 1;
        }
    }
    // an upsampled edge has twice the resolution: one fractional bit less
    // and two edge entries per output pixel
    const int frac_bits = 6 - upsample_above;
    const int base_inc = 1 << upsample_above;
    for (int y = 0, xpos = dx; y < height;
         y++, dst += PXSTRIDE(stride), xpos += dx)
    {
        int base = xpos >> frac_bits;
        const int frac = ((xpos << upsample_above) & 0x3F) >> 1;

        for (int x = 0; x < width; x++, base += base_inc) {
            if (base < max_base_x) {
                const int v = top[base] * (32 - frac) + top[base + 1] * frac;
                dst[x] = iclip_pixel((v + 16) >> 5);
            } else {
                pixel_set(&dst[x], top[max_base_x], width - x);
                break;
            }
        }
    }
}

/*
 * Directional prediction for angles strictly between 90 and 180, reading
 * both the edge at positive offsets from topleft_in and the edge at
 * negative offsets.
 *
 * angle packs three fields: bits 0-8 are the angle itself (asserted to be
 * in (90, 180)), bit 9 (is_sm) is forwarded as the `type` argument of
 * get_upsample() / get_filter_strength(), and the bits from 10 upwards
 * enable edge upsampling / filtering.
 *
 * Both edges are gathered (upsampled, filtered or copied) into the local
 * `edge` buffer around `topleft`, which sits 2 * height entries into the
 * buffer so that an upsampled negative-offset edge fits below it. The
 * corner pixel itself is always copied unmodified.
 *
 * For every output pixel the positive-offset edge is used while its
 * fixed-point position base_x is at least min_base_x; otherwise the pixel is
 * interpolated from the negative-offset edge at position ypos. Both cases
 * interpolate linearly with a 5-bit fraction.
 */
static void ipred_z2_c(pixel *dst, const ptrdiff_t stride,
                       const pixel *const topleft_in,
                       const int width, const int height, int angle,
                       const int max_width, const int max_height)
{
    const int is_sm = (angle >> 9) & 0x1;
    const int enable_intra_edge_filter = angle >> 10;
    angle &= 511;
    assert(angle > 90 && angle < 180);
    const int dy = dav1d_dr_intra_derivative[angle - 90];
    const int dx = dav1d_dr_intra_derivative[180 - angle];
    const int upsample_left = enable_intra_edge_filter ?
        get_upsample(width + height, 180 - angle, is_sm) : 0;
    const int upsample_above = enable_intra_edge_filter ?
        get_upsample(width + height, angle - 90, is_sm) : 0;
    pixel edge[64 * 2 + 64 * 2 + 1];
    pixel *const topleft = &edge[height * 2];

    if (upsample_above) {
        upsample_edge(topleft, width + 1, topleft_in, 0, width + 1);
    } else {
        const int filter_strength = enable_intra_edge_filter ?
            get_filter_strength(width + height, angle - 90, is_sm) : 0;

        if (filter_strength) {
            filter_edge(&topleft[1], width, 0, max_width,
                        &topleft_in[1], -1, width,
                        filter_strength);
        } else {
            pixel_copy(&topleft[1], &topleft_in[1], width);
        }
    }
    if (upsample_left) {
        upsample_edge(edge, height + 1, &topleft_in[-height], 0, height + 1);
    } else {
        const int filter_strength = enable_intra_edge_filter ?
            get_filter_strength(width + height, 180 - angle, is_sm) : 0;

        if (filter_strength) {
            filter_edge(&topleft[-height], height, height - max_height, height,
                        &topleft_in[-height],
                        0, height + 1, filter_strength);
        } else {
            pixel_copy(&topleft[-height], &topleft_in[-height], height);
        }
    }
    *topleft = *topleft_in;

    const int min_base_x = -(1 << upsample_above);
    const int frac_bits_y = 6 - upsample_left, frac_bits_x = 6 - upsample_above;
    const int base_inc_x = 1 << upsample_above;
    const pixel *const left = &topleft[-(1 << upsample_left)];
    const pixel *const top = &topleft[1 << upsample_above];
    for (int y = 0, xpos = -dx; y < height;
         y++, xpos -= dx, dst += PXSTRIDE(stride))
    {
        int base_x = xpos >> frac_bits_x;
        // xpos is negative here, so it is scaled by multiplication rather
        // than by a left shift
        const int frac_x = ((xpos * (1 << upsample_above)) & 0x3F) >> 1;

        for (int x = 0, ypos = (y << 6) - dy; x < width;
             x++, base_x += base_inc_x, ypos -= dy)
        {
            int v;

            if (base_x >= min_base_x) {
                v = top[base_x] * (32 - frac_x) + top[base_x + 1] * frac_x;
            } else {
                const int base_y = ypos >> frac_bits_y;
                assert(base_y >= -(1 << upsample_left));
                const int frac_y = ((ypos * (1 << upsample_left)) & 0x3F) >> 1;
                v = left[-base_y] * (32 - frac_y) + left[-(base_y + 1)] * frac_y;
            }
            dst[x] = iclip_pixel((v + 16) >> 5);
        }
    }
}

/*
 * Directional prediction for angles above 180, reading only the edge at
 * negative offsets from topleft_in.
 *
 * angle packs three fields: bits 0-8 are the angle itself (asserted > 180),
 * bit 9 (is_sm) is forwarded as the `type` argument of get_upsample() /
 * get_filter_strength(), and the bits from 10 upwards enable edge
 * upsampling / filtering.
 *
 * The edge is optionally upsampled or filtered into the local left_out
 * buffer; `left` then points at the entry that corresponds to
 * topleft_in[-1] and is indexed with negative offsets. The block is filled
 * column by column: each column walks along the edge from a fixed-point
 * position ypos (advanced by dy per column) and interpolates linearly
 * between two neighbouring edge pixels with a 5-bit fraction. Once the
 * position reaches max_base_y, the rest of the column is filled with
 * left[-max_base_y].
 * max_width and max_height are not used.
 */
static void ipred_z3_c(pixel *dst, const ptrdiff_t stride,
                       const pixel *const topleft_in,
                       const int width, const int height, int angle,
                       const int max_width, const int max_height)
{
    const int is_sm = (angle >> 9) & 0x1;
    const int enable_intra_edge_filter = angle >> 10;
    angle &= 511;
    assert(angle > 180);
    const int dy = dav1d_dr_intra_derivative[270 - angle];
    pixel left_out[(64 + 64) * 2];
    const pixel *left;
    int max_base_y;
    const int upsample_left = enable_intra_edge_filter ?
        get_upsample(width + height, angle - 180, is_sm) : 0;
    if (upsample_left) {
        upsample_edge(left_out, width + height,
                      &topleft_in[-(width + height)],
                      imax(width - height, 0), width + height + 1);
        left = &left_out[2 * (width + height) - 2];
        max_base_y = 2 * (width + height) - 2;
    } else {
        const int filter_strength = enable_intra_edge_filter ?
            get_filter_strength(width + height, angle - 180, is_sm) : 0;

        if (filter_strength) {
            filter_edge(left_out, width + height, 0, width + height,
                        &topleft_in[-(width + height)],
                        imax(width - height, 0), width + height + 1,
                        filter_strength);
            left = &left_out[width + height - 1];
            max_base_y = width + height - 1;
        } else {
            left = &topleft_in[-1];
            max_base_y = height + imin(width, height) - 1;
        }
    }
    // an upsampled edge has twice the resolution: one fractional bit less
    // and two edge entries per output pixel
    const int frac_bits = 6 - upsample_left;
    const int base_inc = 1 << upsample_left;
    for (int x = 0, ypos = dy; x < width; x++, ypos += dy) {
        int base = ypos >> frac_bits;
        const int frac = ((ypos << upsample_left) & 0x3F) >> 1;

        for (int y = 0; y < height; y++, base += base_inc) {
            if (base < max_base_y) {
                const int v = left[-base] * (32 - frac) +
                              left[-(base + 1)] * frac;
                dst[y * PXSTRIDE(stride) + x] = iclip_pixel((v + 16) >> 5);
            } else {
                do {
                    dst[y * PXSTRIDE(stride) + x] = left[-max_base_y];
                } while (++y < height);
                break;
            }
        }
    }
}

/*
 * Filter intra prediction. Up to 32x32 only.
 *
 * filt_idx: the low 9 bits select one of the tap sets in
 *           dav1d_filter_intra_taps (asserted < 5)
 *
 * The block is processed in units of 4x2 pixels, left to right within a
 * band of two rows, bands top to bottom. Each of the 8 pixels of a unit is
 * a weighted sum of the same 7 neighbours: p0 (top-left corner), p1..p4
 * (the four pixels above) and p5, p6 (the two pixels to the left), rounded
 * with a shift by 4 and passed through iclip_pixel().
 *
 * For the first unit of a band the left neighbours come from topleft_in
 * (walking downwards with stride -1); for later units they are the last
 * column of the unit just written. For the first band the top neighbours
 * come from topleft_in; for later bands they are the second row of the band
 * above. The output order therefore matters.
 * max_width and max_height are not used.
 */
static void ipred_filter_c(pixel *dst, const ptrdiff_t stride,
                           const pixel *const topleft_in,
                           const int width, const int height, int filt_idx,
                           const int max_width, const int max_height)
{
    filt_idx &= 511;
    assert(filt_idx < 5);

    const int8_t *const filter = dav1d_filter_intra_taps[filt_idx];
    int x, y;
    ptrdiff_t left_stride;
    const pixel *left, *topleft, *top;

    top = &topleft_in[1];
    for (y = 0; y < height; y += 2) {
        topleft = &topleft_in[-y];
        left = &topleft[-1];
        left_stride = -1;
        for (x = 0; x < width; x += 4) {
            const int p0 = *topleft;
            const int p1 = top[0], p2 = top[1], p3 = top[2], p4 = top[3];
            const int p5 = left[0 * left_stride], p6 = left[1 * left_stride];
            pixel *ptr = &dst[x];
            const int8_t *flt_ptr = filter;

            for (int yy = 0; yy < 2; yy++) {
                // flt_ptr advances by 2 per output pixel; the taps of one
                // pixel are read at offsets 0, 1, 16, 17, 32, 33 and 48
                for (int xx = 0; xx < 4; xx++, flt_ptr += 2) {
                    int acc = flt_ptr[ 0] * p0 + flt_ptr[ 1] * p1 +
                              flt_ptr[16] * p2 + flt_ptr[17] * p3 +
                              flt_ptr[32] * p4 + flt_ptr[33] * p5 +
                              flt_ptr[48] * p6;
                    ptr[xx] = iclip_pixel((acc + 8) >> 4);
                }
                ptr += PXSTRIDE(stride);
            }
            // the next unit uses the column just written as its left edge
            left = &dst[x + 4 - 1];
            left_stride = PXSTRIDE(stride);
            top += 4;
            topleft = &top[-1];
        }
        // the next band uses the second row of this band as its top edge
        top = &dst[PXSTRIDE(stride)];
        dst = &dst[PXSTRIDE(stride) * 2];
    }
}

static NOINLINE void
621 622
cfl_ac_c(int16_t *ac, const pixel *ypx, const ptrdiff_t stride,
         const int w_pad, const int h_pad, const int width, const int height,
623
         const int ss_hor, const int ss_ver)
624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646
{
    int y, x;
    int16_t *const ac_orig = ac;

    assert(w_pad >= 0 && w_pad * 4 < width);
    assert(h_pad >= 0 && h_pad * 4 < height);

    for (y = 0; y < height - 4 * h_pad; y++) {
        for (x = 0; x < width - 4 * w_pad; x++) {
            int ac_sum = ypx[x << ss_hor];
            if (ss_hor) ac_sum += ypx[x * 2 + 1];
            if (ss_ver) {
                ac_sum += ypx[(x << ss_hor) + PXSTRIDE(stride)];
                if (ss_hor) ac_sum += ypx[x * 2 + 1 + PXSTRIDE(stride)];
            }
            ac[x] = ac_sum << (1 + !ss_ver + !ss_hor);
        }
        for (; x < width; x++)
            ac[x] = ac[x - 1];
        ac += width;
        ypx += PXSTRIDE(stride) << ss_ver;
    }
    for (; y < height; y++) {
647
        memcpy(ac, &ac[-width], width * sizeof(*ac));
648 649 650
        ac += width;
    }

651
    const int log2sz = ctz(width) + ctz(height);
652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667
    int sum = (1 << log2sz) >> 1;
    for (ac = ac_orig, y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            sum += ac[x];
        ac += width;
    }
    sum >>= log2sz;

    // subtract DC
    for (ac = ac_orig, y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            ac[x] -= sum;
        ac += width;
    }
}

/*
 * Define cfl_ac_<fmt>_c(), a wrapper that calls cfl_ac_c() with the given
 * ss_hor / ss_ver subsampling flags fixed. Every line of the macro body
 * except the last must end in a backslash.
 */
#define cfl_ac_fn(fmt, ss_hor, ss_ver) \
static void cfl_ac_##fmt##_c(int16_t *const ac, const pixel *const ypx, \
                             const ptrdiff_t stride, const int w_pad, \
                             const int h_pad, const int cw, const int ch) \
{ \
    cfl_ac_c(ac, ypx, stride, w_pad, h_pad, cw, ch, ss_hor, ss_ver); \
}

/*
 * Instantiate cfl_ac_420_c, cfl_ac_422_c and cfl_ac_444_c; the two numeric
 * arguments are forwarded to cfl_ac_c() as ss_hor and ss_ver.
 */
cfl_ac_fn(420, 1, 1)
cfl_ac_fn(422, 1, 0)
cfl_ac_fn(444, 0, 0)

/*
 * Palette prediction: every output pixel is the palette entry selected by
 * the matching index, dst[x] = pal[idx[x]].
 *
 * idx is read as h rows of w entries (the pointer advances by w per row);
 * dst advances by PXSTRIDE(stride) per row.
 */
static void pal_pred_c(pixel *dst, const ptrdiff_t stride,
                       const uint16_t *const pal, const uint8_t *idx,
                       const int w, const int h)
{
    for (int row = 0; row < h; row++, idx += w, dst += PXSTRIDE(stride)) {
        for (int col = 0; col < w; col++)
            dst[col] = pal[idx[col]];
    }
}

/*
 * Fill the intra prediction DSP context with the C implementations from
 * this file: the per-mode predictors, the three cfl_ac variants (indexed by
 * pixel layout minus one), the four cfl predictors and the palette
 * predictor.
 *
 * When built with HAVE_ASM on ARCH_X86, the x86 init function is called
 * afterwards on the same context (presumably to replace some of the entries
 * with optimised versions - it is defined elsewhere).
 */
void bitfn(dav1d_intra_pred_dsp_init)(Dav1dIntraPredDSPContext *const c) {
    c->intra_pred[DC_PRED      ] = ipred_dc_c;
    c->intra_pred[DC_128_PRED  ] = ipred_dc_128_c;
    c->intra_pred[TOP_DC_PRED  ] = ipred_dc_top_c;
    c->intra_pred[LEFT_DC_PRED ] = ipred_dc_left_c;
    c->intra_pred[HOR_PRED     ] = ipred_h_c;
    c->intra_pred[VERT_PRED    ] = ipred_v_c;
    c->intra_pred[PAETH_PRED   ] = ipred_paeth_c;
    c->intra_pred[SMOOTH_PRED  ] = ipred_smooth_c;
    c->intra_pred[SMOOTH_V_PRED] = ipred_smooth_v_c;
    c->intra_pred[SMOOTH_H_PRED] = ipred_smooth_h_c;
    c->intra_pred[Z1_PRED      ] = ipred_z1_c;
    c->intra_pred[Z2_PRED      ] = ipred_z2_c;
    c->intra_pred[Z3_PRED      ] = ipred_z3_c;
    c->intra_pred[FILTER_PRED  ] = ipred_filter_c;

    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I420 - 1] = cfl_ac_420_c;
    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I422 - 1] = cfl_ac_422_c;
    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1] = cfl_ac_444_c;

    c->cfl_pred[DC_PRED     ] = ipred_cfl_c;
    c->cfl_pred[DC_128_PRED ] = ipred_cfl_128_c;
    c->cfl_pred[TOP_DC_PRED ] = ipred_cfl_top_c;
    c->cfl_pred[LEFT_DC_PRED] = ipred_cfl_left_c;

    c->pal_pred = pal_pred_c;

#if HAVE_ASM && ARCH_X86
    bitfn(dav1d_intra_pred_dsp_init_x86)(c);
#endif
}