loopfilter_tmpl.c 10.8 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
/*
 * Copyright © 2018, VideoLAN and dav1d authors
 * Copyright © 2018, Two Orioles, LLC
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "config.h"

#include <stdlib.h>

32
#include "common/attributes.h"
33 34 35 36
#include "common/intops.h"

#include "src/loopfilter.h"

37
static NOINLINE void
38
loop_filter(pixel *dst, int E, int I, int H,
39 40
            const ptrdiff_t stridea, const ptrdiff_t strideb, const int wd
            HIGHBD_DECL_SUFFIX)
41
{
42 43 44 45 46
    const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;
    const int F = 1 << bitdepth_min_8;
    E <<= bitdepth_min_8;
    I <<= bitdepth_min_8;
    H <<= bitdepth_min_8;
47

48
    for (int i = 0; i < 4; i++, dst += stridea) {
49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130
        int p6, p5, p4, p3, p2;
        int p1 = dst[strideb * -2], p0 = dst[strideb * -1];
        int q0 = dst[strideb * +0], q1 = dst[strideb * +1];
        int q2, q3, q4, q5, q6;
        int fm, flat8out, flat8in;

        fm = abs(p1 - p0) <= I && abs(q1 - q0) <= I &&
             abs(p0 - q0) * 2 + (abs(p1 - q1) >> 1) <= E;

        if (wd > 4) {
            p2 = dst[strideb * -3];
            q2 = dst[strideb * +2];

            fm &= abs(p2 - p1) <= I && abs(q2 - q1) <= I;

            if (wd > 6) {
                p3 = dst[strideb * -4];
                q3 = dst[strideb * +3];

                fm &= abs(p3 - p2) <= I && abs(q3 - q2) <= I;
            }
        }
        if (!fm) continue;

        if (wd >= 16) {
            p6 = dst[strideb * -7];
            p5 = dst[strideb * -6];
            p4 = dst[strideb * -5];
            q4 = dst[strideb * +4];
            q5 = dst[strideb * +5];
            q6 = dst[strideb * +6];

            flat8out = abs(p6 - p0) <= F && abs(p5 - p0) <= F &&
                       abs(p4 - p0) <= F && abs(q4 - q0) <= F &&
                       abs(q5 - q0) <= F && abs(q6 - q0) <= F;
        }

        if (wd >= 6)
            flat8in = abs(p2 - p0) <= F && abs(p1 - p0) <= F &&
                      abs(q1 - q0) <= F && abs(q2 - q0) <= F;

        if (wd >= 8)
            flat8in &= abs(p3 - p0) <= F && abs(q3 - q0) <= F;

        if (wd >= 16 && (flat8out & flat8in)) {
            dst[strideb * -6] = (p6 + p6 + p6 + p6 + p6 + p6 * 2 + p5 * 2 +
                                 p4 * 2 + p3 + p2 + p1 + p0 + q0 + 8) >> 4;
            dst[strideb * -5] = (p6 + p6 + p6 + p6 + p6 + p5 * 2 + p4 * 2 +
                                 p3 * 2 + p2 + p1 + p0 + q0 + q1 + 8) >> 4;
            dst[strideb * -4] = (p6 + p6 + p6 + p6 + p5 + p4 * 2 + p3 * 2 +
                                 p2 * 2 + p1 + p0 + q0 + q1 + q2 + 8) >> 4;
            dst[strideb * -3] = (p6 + p6 + p6 + p5 + p4 + p3 * 2 + p2 * 2 +
                                 p1 * 2 + p0 + q0 + q1 + q2 + q3 + 8) >> 4;
            dst[strideb * -2] = (p6 + p6 + p5 + p4 + p3 + p2 * 2 + p1 * 2 +
                                 p0 * 2 + q0 + q1 + q2 + q3 + q4 + 8) >> 4;
            dst[strideb * -1] = (p6 + p5 + p4 + p3 + p2 + p1 * 2 + p0 * 2 +
                                 q0 * 2 + q1 + q2 + q3 + q4 + q5 + 8) >> 4;
            dst[strideb * +0] = (p5 + p4 + p3 + p2 + p1 + p0 * 2 + q0 * 2 +
                                 q1 * 2 + q2 + q3 + q4 + q5 + q6 + 8) >> 4;
            dst[strideb * +1] = (p4 + p3 + p2 + p1 + p0 + q0 * 2 + q1 * 2 +
                                 q2 * 2 + q3 + q4 + q5 + q6 + q6 + 8) >> 4;
            dst[strideb * +2] = (p3 + p2 + p1 + p0 + q0 + q1 * 2 + q2 * 2 +
                                 q3 * 2 + q4 + q5 + q6 + q6 + q6 + 8) >> 4;
            dst[strideb * +3] = (p2 + p1 + p0 + q0 + q1 + q2 * 2 + q3 * 2 +
                                 q4 * 2 + q5 + q6 + q6 + q6 + q6 + 8) >> 4;
            dst[strideb * +4] = (p1 + p0 + q0 + q1 + q2 + q3 * 2 + q4 * 2 +
                                 q5 * 2 + q6 + q6 + q6 + q6 + q6 + 8) >> 4;
            dst[strideb * +5] = (p0 + q0 + q1 + q2 + q3 + q4 * 2 + q5 * 2 +
                                 q6 * 2 + q6 + q6 + q6 + q6 + q6 + 8) >> 4;
        } else if (wd >= 8 && flat8in) {
            dst[strideb * -3] = (p3 + p3 + p3 + 2 * p2 + p1 + p0 + q0 + 4) >> 3;
            dst[strideb * -2] = (p3 + p3 + p2 + 2 * p1 + p0 + q0 + q1 + 4) >> 3;
            dst[strideb * -1] = (p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2 + 4) >> 3;
            dst[strideb * +0] = (p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3 + 4) >> 3;
            dst[strideb * +1] = (p1 + p0 + q0 + 2 * q1 + q2 + q3 + q3 + 4) >> 3;
            dst[strideb * +2] = (p0 + q0 + q1 + 2 * q2 + q3 + q3 + q3 + 4) >> 3;
        } else if (wd == 6 && flat8in) {
            dst[strideb * -2] = (p2 + 2 * p2 + 2 * p1 + 2 * p0 + q0 + 4) >> 3;
            dst[strideb * -1] = (p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 + 4) >> 3;
            dst[strideb * +0] = (p1 + 2 * p0 + 2 * q0 + 2 * q1 + q2 + 4) >> 3;
            dst[strideb * +1] = (p0 + 2 * q0 + 2 * q1 + 2 * q2 + q2 + 4) >> 3;
        } else {
131
            const int hev = abs(p1 - p0) > H || abs(q1 - q0) > H;
132

133 134
#define iclip_diff(v) iclip(v, -128 * (1 << bitdepth_min_8), \
                                128 * (1 << bitdepth_min_8) - 1)
135 136 137 138 139

            if (hev) {
                int f = iclip_diff(p1 - q1), f1, f2;
                f = iclip_diff(3 * (q0 - p0) + f);

140 141
                f1 = imin(f + 4, (128 << bitdepth_min_8) - 1) >> 3;
                f2 = imin(f + 3, (128 << bitdepth_min_8) - 1) >> 3;
142 143 144 145 146 147

                dst[strideb * -1] = iclip_pixel(p0 + f2);
                dst[strideb * +0] = iclip_pixel(q0 - f1);
            } else {
                int f = iclip_diff(3 * (q0 - p0)), f1, f2;

148 149
                f1 = imin(f + 4, (128 << bitdepth_min_8) - 1) >> 3;
                f2 = imin(f + 3, (128 << bitdepth_min_8) - 1) >> 3;
150 151 152 153 154 155 156 157 158 159 160 161 162

                dst[strideb * -1] = iclip_pixel(p0 + f2);
                dst[strideb * +0] = iclip_pixel(q0 - f1);

                f = (f1 + 1) >> 1;
                dst[strideb * -2] = iclip_pixel(p1 + f);
                dst[strideb * +1] = iclip_pixel(q1 - f);
            }
#undef iclip_diff
        }
    }
}

163 164 165
static void loop_filter_h_sb128y_c(pixel *dst, const ptrdiff_t stride,
                                   const uint32_t *const vmask,
                                   const uint8_t (*l)[4], ptrdiff_t b4_stride,
166 167
                                   const Av1FilterLUT *lut, const int h
                                   HIGHBD_DECL_SUFFIX)
168
{
169
    const unsigned vm = vmask[0] | vmask[1] | vmask[2];
170 171 172 173 174 175 176 177 178
    for (unsigned y = 1; vm & ~(y - 1);
         y <<= 1, dst += 4 * PXSTRIDE(stride), l += b4_stride)
    {
        if (vm & y) {
            const int L = l[0][0] ? l[0][0] : l[-1][0];
            if (!L) continue;
            const int H = L >> 4;
            const int E = lut->e[L], I = lut->i[L];
            const int idx = (vmask[2] & y) ? 2 : !!(vmask[1] & y);
179 180
            loop_filter(dst, E, I, H, PXSTRIDE(stride), 1, 4 << idx
                        HIGHBD_TAIL_SUFFIX);
181 182
        }
    }
183 184
}

185 186 187
static void loop_filter_v_sb128y_c(pixel *dst, const ptrdiff_t stride,
                                   const uint32_t *const vmask,
                                   const uint8_t (*l)[4], ptrdiff_t b4_stride,
188 189
                                   const Av1FilterLUT *lut, const int w
                                   HIGHBD_DECL_SUFFIX)
190 191 192 193 194 195 196 197 198
{
    const unsigned vm = vmask[0] | vmask[1] | vmask[2];
    for (unsigned x = 1; vm & ~(x - 1); x <<= 1, dst += 4, l++) {
        if (vm & x) {
            const int L = l[0][0] ? l[0][0] : l[-b4_stride][0];
            if (!L) continue;
            const int H = L >> 4;
            const int E = lut->e[L], I = lut->i[L];
            const int idx = (vmask[2] & x) ? 2 : !!(vmask[1] & x);
199 200
            loop_filter(dst, E, I, H, 1, PXSTRIDE(stride), 4 << idx
                        HIGHBD_TAIL_SUFFIX);
201 202 203 204
        }
    }
}

205 206 207
static void loop_filter_h_sb128uv_c(pixel *dst, const ptrdiff_t stride,
                                    const uint32_t *const vmask,
                                    const uint8_t (*l)[4], ptrdiff_t b4_stride,
208 209
                                    const Av1FilterLUT *lut, const int h
                                    HIGHBD_DECL_SUFFIX)
210
{
211
    const unsigned vm = vmask[0] | vmask[1];
212 213 214 215 216 217 218 219 220
    for (unsigned y = 1; vm & ~(y - 1);
         y <<= 1, dst += 4 * PXSTRIDE(stride), l += b4_stride)
    {
        if (vm & y) {
            const int L = l[0][0] ? l[0][0] : l[-1][0];
            if (!L) continue;
            const int H = L >> 4;
            const int E = lut->e[L], I = lut->i[L];
            const int idx = !!(vmask[1] & y);
221 222
            loop_filter(dst, E, I, H, PXSTRIDE(stride), 1, 4 + 2 * idx
                        HIGHBD_TAIL_SUFFIX);
223 224 225 226
        }
    }
}

227 228 229
static void loop_filter_v_sb128uv_c(pixel *dst, const ptrdiff_t stride,
                                    const uint32_t *const vmask,
                                    const uint8_t (*l)[4], ptrdiff_t b4_stride,
230 231
                                    const Av1FilterLUT *lut, const int w
                                    HIGHBD_DECL_SUFFIX)
232 233 234 235 236 237 238 239 240
{
    const unsigned vm = vmask[0] | vmask[1];
    for (unsigned x = 1; vm & ~(x - 1); x <<= 1, dst += 4, l++) {
        if (vm & x) {
            const int L = l[0][0] ? l[0][0] : l[-b4_stride][0];
            if (!L) continue;
            const int H = L >> 4;
            const int E = lut->e[L], I = lut->i[L];
            const int idx = !!(vmask[1] & x);
241 242
            loop_filter(dst, E, I, H, 1, PXSTRIDE(stride), 4 + 2 * idx
                        HIGHBD_TAIL_SUFFIX);
243 244 245 246
        }
    }
}

247
void bitfn(dav1d_loop_filter_dsp_init)(Dav1dLoopFilterDSPContext *const c) {
248 249 250 251
    c->loop_filter_sb[0][0] = loop_filter_h_sb128y_c;
    c->loop_filter_sb[0][1] = loop_filter_v_sb128y_c;
    c->loop_filter_sb[1][0] = loop_filter_h_sb128uv_c;
    c->loop_filter_sb[1][1] = loop_filter_v_sb128uv_c;
252 253 254 255

#if HAVE_ASM && ARCH_X86
    bitfn(dav1d_loop_filter_dsp_init_x86)(c);
#endif
256
}