LCOV - code coverage report
Current view: top level - src/shaders - dithering.c (source / functions) Hit Total Coverage
Test: Code coverage Lines: 122 141 86.5 %
Date: 2025-03-29 09:04:10 Functions: 6 7 85.7 %
Legend: Lines: hit not hit | Branches: + taken - not taken # not executed Branches: 64 92 69.6 %

           Branch data     Line data    Source code
       1                 :            : /*
       2                 :            :  * This file is part of libplacebo.
       3                 :            :  *
       4                 :            :  * libplacebo is free software; you can redistribute it and/or
       5                 :            :  * modify it under the terms of the GNU Lesser General Public
       6                 :            :  * License as published by the Free Software Foundation; either
       7                 :            :  * version 2.1 of the License, or (at your option) any later version.
       8                 :            :  *
       9                 :            :  * libplacebo is distributed in the hope that it will be useful,
      10                 :            :  * but WITHOUT ANY WARRANTY; without even the implied warranty of
      11                 :            :  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      12                 :            :  * GNU Lesser General Public License for more details.
      13                 :            :  *
      14                 :            :  * You should have received a copy of the GNU Lesser General Public
      15                 :            :  * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
      16                 :            :  */
      17                 :            : 
      18                 :            : #include <math.h>
      19                 :            : #include "shaders.h"
      20                 :            : 
      21                 :            : #include <libplacebo/shaders/dithering.h>
      22                 :            : 
      23                 :            : const struct pl_dither_params pl_dither_default_params = { PL_DITHER_DEFAULTS };
      24                 :            : 
      25                 :            : struct sh_dither_obj {
      26                 :            :     pl_shader_obj lut;
      27                 :            : };
      28                 :            : 
      29                 :          5 : static void sh_dither_uninit(pl_gpu gpu, void *ptr)
      30                 :            : {
      31                 :            :     struct sh_dither_obj *obj = ptr;
      32                 :          5 :     pl_shader_obj_destroy(&obj->lut);
      33                 :          5 :     *obj = (struct sh_dither_obj) {0};
      34                 :          5 : }
      35                 :            : 
      36                 :         12 : static void fill_dither_matrix(void *data, const struct sh_lut_params *params)
      37                 :            : {
      38   [ +  -  +  -  :         12 :     pl_assert(params->width > 0 && params->height > 0 && params->comps == 1);
                   -  + ]
      39                 :            : 
      40                 :         12 :     const struct pl_dither_params *dpar = params->priv;
      41   [ +  +  -  - ]:         12 :     switch (dpar->method) {
      42                 :          4 :     case PL_DITHER_ORDERED_LUT:
      43         [ -  + ]:          4 :         pl_assert(params->width == params->height);
      44                 :          4 :         pl_generate_bayer_matrix(data, params->width);
      45                 :          4 :         return;
      46                 :            : 
      47                 :          8 :     case PL_DITHER_BLUE_NOISE:
      48         [ -  + ]:          8 :         pl_assert(params->width == params->height);
      49                 :          8 :         pl_generate_blue_noise(data, params->width);
      50                 :          8 :         return;
      51                 :            : 
      52                 :            :     case PL_DITHER_ORDERED_FIXED:
      53                 :            :     case PL_DITHER_WHITE_NOISE:
      54                 :            :     case PL_DITHER_METHOD_COUNT:
      55                 :            :         return;
      56                 :            :     }
      57                 :            : 
      58                 :          0 :     pl_unreachable();
      59                 :            : }
      60                 :            : 
      61                 :        143 : static bool dither_method_is_lut(enum pl_dither_method method)
      62                 :            : {
      63      [ +  -  + ]:        143 :     switch (method) {
      64                 :            :     case PL_DITHER_BLUE_NOISE:
      65                 :            :     case PL_DITHER_ORDERED_LUT:
      66                 :            :         return true;
      67                 :          8 :     case PL_DITHER_ORDERED_FIXED:
      68                 :            :     case PL_DITHER_WHITE_NOISE:
      69                 :          8 :         return false;
      70                 :            :     case PL_DITHER_METHOD_COUNT:
      71                 :            :         break;
      72                 :            :     }
      73                 :            : 
      74                 :          0 :     pl_unreachable();
      75                 :            : }
      76                 :            : 
      77                 :            : static inline float approx_gamma(enum pl_color_transfer trc)
      78                 :            : {
      79                 :            :     switch (trc) {
      80                 :            :     case PL_COLOR_TRC_UNKNOWN:  return 1.0f;
      81                 :            :     case PL_COLOR_TRC_LINEAR:   return 1.0f;
      82                 :            :     case PL_COLOR_TRC_PRO_PHOTO:return 1.8f;
      83                 :            :     case PL_COLOR_TRC_GAMMA18:  return 1.8f;
      84                 :            :     case PL_COLOR_TRC_GAMMA20:  return 2.0f;
      85                 :            :     case PL_COLOR_TRC_GAMMA24:  return 2.4f;
      86                 :            :     case PL_COLOR_TRC_GAMMA26:  return 2.6f;
      87                 :            :     case PL_COLOR_TRC_ST428:    return 2.6f;
      88                 :            :     case PL_COLOR_TRC_GAMMA28:  return 2.8f;
      89                 :            : 
      90                 :            :     case PL_COLOR_TRC_SRGB:
      91                 :            :     case PL_COLOR_TRC_BT_1886:
      92                 :            :     case PL_COLOR_TRC_GAMMA22:
      93                 :            :         return 2.2f;
      94                 :            : 
      95                 :            :     case PL_COLOR_TRC_PQ:
      96                 :            :     case PL_COLOR_TRC_HLG:
      97                 :            :     case PL_COLOR_TRC_V_LOG:
      98                 :            :     case PL_COLOR_TRC_S_LOG1:
      99                 :            :     case PL_COLOR_TRC_S_LOG2:
     100                 :            :         return 2.0f; // TODO: handle this better
     101                 :            : 
     102                 :            :     case PL_COLOR_TRC_COUNT: break;
     103                 :            :     }
     104                 :            : 
     105                 :          0 :     pl_unreachable();
     106                 :            : }
     107                 :            : 
     108                 :        143 : void pl_shader_dither(pl_shader sh, int new_depth,
     109                 :            :                       pl_shader_obj *dither_state,
     110                 :            :                       const struct pl_dither_params *params)
     111                 :            : {
     112         [ +  - ]:        143 :     if (!sh_require(sh, PL_SHADER_SIG_COLOR, 0, 0))
     113                 :            :         return;
     114                 :            : 
     115         [ -  + ]:        143 :     if (new_depth <= 0 || new_depth > 256) {
     116                 :          0 :         PL_WARN(sh, "Invalid dither depth: %d.. ignoring", new_depth);
     117                 :          0 :         return;
     118                 :            :     }
     119                 :            : 
     120                 :        143 :     sh_describef(sh, "dithering (%d bits)", new_depth);
     121                 :        143 :     GLSL("// pl_shader_dither \n"
     122                 :            :         "{                    \n"
     123                 :            :         "float bias;          \n");
     124                 :            : 
     125         [ +  + ]:        143 :     params = PL_DEF(params, &pl_dither_default_params);
     126         [ -  + ]:        143 :     if (params->lut_size < 0 || params->lut_size > 8) {
     127                 :          0 :         SH_FAIL(sh, "Invalid `lut_size` specified: %d", params->lut_size);
     128                 :          0 :         return;
     129                 :            :     }
     130                 :            : 
     131                 :        143 :     enum pl_dither_method method = params->method;
     132                 :            :     ident_t lut = NULL_IDENT;
     133                 :            :     int lut_size = 0;
     134                 :            : 
     135         [ +  + ]:        143 :     if (dither_method_is_lut(method)) {
     136         [ -  + ]:        135 :         if (!dither_state) {
     137                 :          0 :             PL_WARN(sh, "LUT-based dither method specified but no dither state "
     138                 :            :                     "object given, falling back to non-LUT based methods.");
     139                 :          1 :             goto fallback;
     140                 :            :         }
     141                 :            : 
     142                 :            :         struct sh_dither_obj *obj;
     143                 :        135 :         obj = SH_OBJ(sh, dither_state, PL_SHADER_OBJ_DITHER,
     144                 :            :                      struct sh_dither_obj, sh_dither_uninit);
     145         [ -  + ]:        135 :         if (!obj)
     146                 :          0 :             goto fallback;
     147                 :            : 
     148                 :            :         bool cache = method == PL_DITHER_BLUE_NOISE;
     149         [ +  - ]:        135 :         lut_size = 1 << PL_DEF(params->lut_size, pl_dither_default_params.lut_size);
     150         [ +  + ]:        135 :         lut = sh_lut(sh, sh_lut_params(
     151                 :            :             .object     = &obj->lut,
     152                 :            :             .var_type   = PL_VAR_FLOAT,
     153                 :            :             .width      = lut_size,
     154                 :            :             .height     = lut_size,
     155                 :            :             .comps      = 1,
     156                 :            :             .fill       = fill_dither_matrix,
     157                 :            :             .signature  = (CACHE_KEY_DITHER ^ method) * lut_size,
     158                 :            :             .cache      = cache ? SH_CACHE(sh) : NULL,
     159                 :            :             .priv       = (void *) params,
     160                 :            :         ));
     161         [ +  + ]:        135 :         if (!lut)
     162                 :          1 :             goto fallback;
     163                 :            :     }
     164                 :            : 
     165                 :        142 :     goto done;
     166                 :            : 
     167                 :            : fallback:
     168                 :            :     method = PL_DITHER_ORDERED_FIXED;
     169                 :            :     // fall through
     170                 :            : 
     171                 :        143 : done: ;
     172                 :            : 
     173                 :            :     int size = 0;
     174         [ +  + ]:        143 :     if (lut) {
     175                 :            :         size = lut_size;
     176         [ +  + ]:          9 :     } else if (method == PL_DITHER_ORDERED_FIXED) {
     177                 :            :         size = 16; // hard-coded size
     178                 :            :     }
     179                 :            : 
     180         [ +  - ]:        134 :     if (size) {
     181                 :            :         // Transform the screen position to the cyclic range [0,1)
     182                 :        139 :         GLSL("vec2 pos = fract(gl_FragCoord.xy * 1.0/"$"); \n", SH_FLOAT(size));
     183                 :            : 
     184         [ +  + ]:        139 :         if (params->temporal) {
     185                 :          4 :             int phase = SH_PARAMS(sh).index % 8;
     186                 :          4 :             float r = phase * (M_PI / 2); // rotate
     187         [ +  - ]:          4 :             float m = phase < 4 ? 1 : -1; // mirror
     188                 :          4 :             float mat[2][2] = {
     189                 :          4 :                 {cos(r),     -sin(r)    },
     190                 :          4 :                 {sin(r) * m,  cos(r) * m},
     191                 :            :             };
     192                 :            : 
     193                 :          4 :             ident_t rot = sh_var(sh, (struct pl_shader_var) {
     194                 :          4 :                 .var  = pl_var_mat2("dither_rot"),
     195                 :            :                 .data = &mat[0][0],
     196                 :            :                 .dynamic = true,
     197                 :            :             });
     198                 :          4 :             GLSL("pos = fract("$" * pos + vec2(1.0));\n", rot);
     199                 :            :         }
     200                 :            :     }
     201                 :            : 
     202   [ +  +  +  -  :        143 :     switch (method) {
                      - ]
     203                 :          4 :     case PL_DITHER_WHITE_NOISE: {
     204                 :          4 :         ident_t prng = sh_prng(sh, params->temporal, NULL);
     205                 :          4 :         GLSL("bias = "$".x;\n", prng);
     206                 :            :         break;
     207                 :            :     }
     208                 :            : 
     209                 :            :     case PL_DITHER_ORDERED_FIXED:
     210                 :            :         // Bitwise ordered dither using only 32-bit uints
     211                 :          5 :         GLSL("uvec2 xy = uvec2(pos * 16.0) %% 16u;     \n"
     212                 :            :              // Bitwise merge (morton number)
     213                 :            :              "xy.x = xy.x ^ xy.y;                      \n"
     214                 :            :              "xy = (xy | xy << 2) & uvec2(0x33333333); \n"
     215                 :            :              "xy = (xy | xy << 1) & uvec2(0x55555555); \n"
     216                 :            :              // Bitwise inversion
     217                 :            :              "uint b = xy.x + (xy.y << 1);             \n"
     218                 :            :              "b = (b * 0x0802u & 0x22110u) |           \n"
     219                 :            :              "    (b * 0x8020u & 0x88440u);            \n"
     220                 :            :              "b = 0x10101u * b;                        \n"
     221                 :            :              "b = (b >> 16) & 0xFFu;                   \n"
     222                 :            :              // Generate bias value
     223                 :            :              "bias = float(b) * 1.0/256.0;             \n");
     224                 :          5 :         break;
     225                 :            : 
     226                 :        134 :     case PL_DITHER_BLUE_NOISE:
     227                 :            :     case PL_DITHER_ORDERED_LUT:
     228         [ -  + ]:        134 :         pl_assert(lut);
     229                 :        134 :         GLSL("bias = "$"(ivec2(pos * "$"));\n", lut, SH_FLOAT(lut_size));
     230                 :        134 :         break;
     231                 :            : 
     232                 :            :     case PL_DITHER_METHOD_COUNT:
     233                 :          0 :         pl_unreachable();
     234                 :            :     }
     235                 :            : 
     236                 :            :     // Scale factor for dither rounding
     237                 :        143 :     GLSL("const float scale = %llu.0; \n", (1LLU << new_depth) - 1);
     238                 :            : 
     239         [ +  - ]:        143 :     const float gamma = approx_gamma(params->transfer);
     240         [ +  + ]:        143 :     if (gamma != 1.0f && new_depth <= 4) {
     241                 :         28 :         GLSL("const float gamma = "$";                  \n"
     242                 :            :              "vec4 color_lin = pow(color, vec4(gamma)); \n",
     243                 :            :              SH_FLOAT(gamma));
     244                 :            : 
     245         [ -  + ]:         28 :         if (new_depth == 1) {
     246                 :            :             // Special case for bit depth 1 dithering, in this case we can just
     247                 :            :             // ignore the low/high rounding because we know we are always
     248                 :            :             // dithering between 0.0 and 1.0.
     249                 :          0 :             GLSL("const vec4 low = vec4(0.0);           \n"
     250                 :            :                  "const vec4 high = vec4(1.0);          \n"
     251                 :            :                  "vec4 offset = color_lin;              \n");
     252                 :            :         } else {
     253                 :            :             // Linearize the low, high and current color values
     254                 :         28 :             GLSL("vec4 low = floor(color * scale) / scale;  \n"
     255                 :            :                  "vec4 high = ceil(color * scale) / scale;  \n"
     256                 :            :                  "vec4 low_lin = pow(low, vec4(gamma));     \n"
     257                 :            :                  "vec4 high_lin = pow(high, vec4(gamma));   \n"
     258                 :            :                  "vec4 range = high_lin - low_lin;          \n"
     259                 :            :                  "vec4 offset = (color_lin - low_lin) /     \n"
     260                 :            :                  "              max(range, 1e-6);           \n");
     261                 :            :         }
     262                 :            : 
     263                 :            :         // Mix in the correct ratio corresponding to the offset and bias
     264                 :         28 :         GLSL("color = mix(low, high, greaterThan(offset, vec4(bias))); \n");
     265                 :            :     } else {
     266                 :            :         // Approximate each gamma segment as a straight line, this simplifies
     267                 :            :         // the process of dithering down to a single scale and (biased) round.
     268                 :        115 :         GLSL("color = scale * color + vec4(bias);   \n"
     269                 :            :              "color = floor(color) * (1.0 / scale); \n");
     270                 :            :     }
     271                 :            : 
     272                 :        143 :     GLSL("} \n");
     273                 :            : }
     274                 :            : 
     275                 :            : /* Error diffusion code is taken from mpv, original copyright (c) 2019 Bin Jin
     276                 :            :  *
     277                 :            :  * mpv is free software; you can redistribute it and/or
     278                 :            :  * modify it under the terms of the GNU Lesser General Public
     279                 :            :  * License as published by the Free Software Foundation; either
     280                 :            :  * version 2.1 of the License, or (at your option) any later version.
     281                 :            :  *
     282                 :            :  * mpv is distributed in the hope that it will be useful,
     283                 :            :  * but WITHOUT ANY WARRANTY; without even the implied warranty of
     284                 :            :  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     285                 :            :  * GNU Lesser General Public License for more details.
     286                 :            :  *
     287                 :            :  * You should have received a copy of the GNU Lesser General Public
     288                 :            :  * License along with mpv.  If not, see <http://www.gnu.org/licenses/>.
     289                 :            :  */
     290                 :            : 
     291                 :            : // After a (y, x) -> (y, x + y * shift) mapping, find the right most column that
     292                 :            : // will be affected by the current column.
     293                 :         20 : static int compute_rightmost_shifted_column(const struct pl_error_diffusion_kernel *k)
     294                 :            : {
     295                 :            :     int ret = 0;
     296         [ +  + ]:         80 :     for (int y = 0; y <= PL_EDF_MAX_DY; y++) {
     297         [ +  + ]:        360 :         for (int x = PL_EDF_MIN_DX; x <= PL_EDF_MAX_DX; x++) {
     298         [ +  + ]:        300 :             if (k->pattern[y][x - PL_EDF_MIN_DX] != 0) {
     299                 :        132 :                 int shifted_x = x + y * k->shift;
     300                 :            : 
     301                 :            :                 // The shift mapping guarantees current column (or left of it)
     302                 :            :                 // won't be affected by error diffusion.
     303         [ -  + ]:        132 :                 assert(shifted_x > 0);
     304                 :            : 
     305                 :        132 :                 ret = PL_MAX(ret, shifted_x);
     306                 :            :             }
     307                 :            :         }
     308                 :            :     }
     309                 :         20 :     return ret;
     310                 :            : }
     311                 :            : 
     312                 :          0 : size_t pl_error_diffusion_shmem_req(const struct pl_error_diffusion_kernel *kernel,
     313                 :            :                                     int height)
     314                 :            : {
     315                 :            :     // We add PL_EDF_MAX_DY empty lines on the bottom to handle errors
     316                 :            :     // propagated out from bottom side.
     317                 :          0 :     int rows = height + PL_EDF_MAX_DY;
     318                 :          0 :     int shifted_columns = compute_rightmost_shifted_column(kernel) + 1;
     319                 :            : 
     320                 :            :     // The shared memory is an array of size rows*shifted_columns. Each element
     321                 :            :     // is a single uint for three RGB component.
     322                 :          0 :     return rows * shifted_columns * sizeof(uint32_t);
     323                 :            : }
     324                 :            : 
     325                 :         20 : bool pl_shader_error_diffusion(pl_shader sh, const struct pl_error_diffusion_params *params)
     326                 :            : {
     327                 :         20 :     const int width = params->input_tex->params.w, height = params->input_tex->params.h;
     328                 :         20 :     const struct pl_glsl_version glsl = sh_glsl(sh);
     329                 :            :     const struct pl_error_diffusion_kernel *kernel =
     330         [ +  - ]:         20 :         PL_DEF(params->kernel, &pl_error_diffusion_sierra_lite);
     331                 :            : 
     332         [ -  + ]:         20 :     pl_assert(params->output_tex->params.w == width);
     333         [ -  + ]:         20 :     pl_assert(params->output_tex->params.h == height);
     334         [ -  + ]:         20 :     if (!sh_require(sh, PL_SHADER_SIG_NONE, width, height))
     335                 :            :         return false;
     336                 :            : 
     337         [ -  + ]:         20 :     if (params->new_depth <= 0 || params->new_depth > 256) {
     338                 :          0 :         PL_WARN(sh, "Invalid dither depth: %d.. ignoring", params->new_depth);
     339                 :          0 :         return false;
     340                 :            :     }
     341                 :            : 
     342                 :            :     // The parallel error diffusion works by applying the shift mapping first.
     343                 :            :     // Taking the Floyd and Steinberg algorithm for example. After applying
     344                 :            :     // the (y, x) -> (y, x + y * shift) mapping (with shift=2), all errors are
     345                 :            :     // propagated into the next few columns, which makes parallel processing on
     346                 :            :     // the same column possible.
     347                 :            :     //
     348                 :            :     //           X    7/16                X    7/16
     349                 :            :     //    3/16  5/16  1/16   ==>    0     0    3/16  5/16  1/16
     350                 :            : 
     351                 :            :     // Figuring out the size of rectangle containing all shifted pixels.
     352                 :            :     // The rectangle height is not changed.
     353                 :         20 :     int shifted_width = width + (height - 1) * kernel->shift;
     354                 :            : 
     355                 :            :     // We process all pixels from the shifted rectangles column by column, with
     356                 :            :     // a single global work group of size |block_size|.
     357                 :            :     // Figuring out how many block are required to process all pixels. We need
     358                 :            :     // this explicitly to make the number of barrier() calls match.
     359                 :         20 :     int block_size = PL_MIN(glsl.max_group_threads, height);
     360                 :         20 :     int blocks = PL_DIV_UP(height * shifted_width, block_size);
     361                 :            : 
     362                 :            :     // If we figure out how many of the next columns will be affected while the
     363                 :            :     // current columns is being processed. We can store errors of only a few
     364                 :            :     // columns in the shared memory. Using a ring buffer will further save the
     365                 :            :     // cost while iterating to next column.
     366                 :            :     //
     367                 :         20 :     int ring_buffer_rows = height + PL_EDF_MAX_DY;
     368                 :         20 :     int ring_buffer_columns = compute_rightmost_shifted_column(kernel) + 1;
     369                 :         20 :     ident_t ring_buffer_size = sh_const(sh, (struct pl_shader_const) {
     370                 :            :         .type = PL_VAR_UINT,
     371                 :            :         .name = "ring_buffer_size",
     372                 :         20 :         .data = &(unsigned) { ring_buffer_rows * ring_buffer_columns },
     373                 :            :         .compile_time = true,
     374                 :            :     });
     375                 :            : 
     376                 :            :     // Compute shared memory requirements and try enabling compute shader.
     377                 :         20 :     size_t shmem_req = ring_buffer_rows * ring_buffer_columns * sizeof(uint32_t);
     378         [ -  + ]:         20 :     if (!sh_try_compute(sh, block_size, 1, false, shmem_req)) {
     379                 :          0 :         PL_ERR(sh, "Cannot execute error diffusion kernel: too old GPU or "
     380                 :            :                "insufficient compute shader memory!");
     381                 :          0 :         return false;
     382                 :            :     }
     383                 :            : 
     384                 :         20 :     ident_t in_tex = sh_desc(sh, (struct pl_shader_desc) {
     385                 :         20 :         .binding.object = params->input_tex,
     386                 :            :         .desc = {
     387                 :            :             .name   = "input_tex",
     388                 :            :             .type   = PL_DESC_SAMPLED_TEX,
     389                 :            :         },
     390                 :            :     });
     391                 :            : 
     392                 :         20 :     ident_t out_img = sh_desc(sh, (struct pl_shader_desc) {
     393                 :         20 :         .binding.object = params->output_tex,
     394                 :            :         .desc = {
     395                 :            :             .name    = "output_tex",
     396                 :            :             .type    = PL_DESC_STORAGE_IMG,
     397                 :            :             .access  = PL_DESC_ACCESS_WRITEONLY,
     398                 :            :         },
     399                 :            :     });
     400                 :            : 
     401                 :         20 :     sh->output = PL_SHADER_SIG_NONE;
     402                 :         20 :     sh_describef(sh, "error diffusion (%s, %d bits)",
     403                 :         20 :                  kernel->name, params->new_depth);
     404                 :            : 
     405                 :            :     // Defines the ring buffer in shared memory.
     406                 :         20 :     GLSLH("shared uint err_rgb8["$"]; \n", ring_buffer_size);
     407                 :         20 :     GLSL("// pl_shader_error_diffusion                                          \n"
     408                 :            :          // Safeguard against accidental over-execution
     409                 :            :          "if (gl_WorkGroupID != uvec3(0))                                       \n"
     410                 :            :          "    return;                                                           \n"
     411                 :            :          // Initialize the ring buffer.
     412                 :            :          "for (uint i = gl_LocalInvocationIndex; i < "$"; i+=gl_WorkGroupSize.x)\n"
     413                 :            :          "    err_rgb8[i] = 0u;                                                 \n"
     414                 :            : 
     415                 :            :         // Main block loop. The barrier ensures the previous block has been
     416                 :            :         // fully processed before processing of the next one starts.
     417                 :            :          "for (uint block_id = 0; block_id < "$"; block_id++) {                 \n"
     418                 :            :          "barrier();                                                            \n"
     419                 :            :         // Compute the coordinate of the pixel we are currently processing,
     420                 :            :         // both before and after the shift mapping.
     421                 :            :          "uint id = block_id * gl_WorkGroupSize.x + gl_LocalInvocationIndex;    \n"
     422                 :            :          "const uint height = "$";                                              \n"
     423                 :            :          "int y = int(id %% height), x_shifted = int(id / height);              \n"
     424                 :            :          "int x = x_shifted - y * %d;                                           \n"
     425                 :            :          // Proceed only if we are processing a valid pixel.
     426                 :            :          "if (x >= 0 && x < "$") {                                              \n"
     427                 :            :          // The index that the current pixel has in the ring buffer.
     428                 :            :          "uint idx = uint(x_shifted * "$" + y) %% "$";                          \n"
     429                 :            :          // Fetch the current pixel.
     430                 :            :          "vec4 pix_orig = texelFetch("$", ivec2(x, y), 0);                      \n"
     431                 :            :          "vec3 pix = pix_orig.rgb;                                              \n",
     432                 :            :          ring_buffer_size,
     433                 :            :          SH_UINT(blocks),
     434                 :            :          SH_UINT(height),
     435                 :            :          kernel->shift,
     436                 :            :          SH_INT(width),
     437                 :            :          SH_INT(ring_buffer_rows),
     438                 :            :          ring_buffer_size,
     439                 :            :          in_tex);
     440                 :            : 
     441                 :            :     // The dithering will quantize pixel value into multiples of 1/dither_quant.
     442                 :         20 :     int dither_quant = (1 << params->new_depth) - 1;
     443                 :            : 
     444                 :            :     // We encode errors in RGB components into a single 32-bit unsigned integer.
     445                 :            :     // The error we propagate from the current pixel is in range of
     446                 :            :     // [-0.5 / dither_quant, 0.5 / dither_quant]. While not quite obvious, the
     447                 :            :     // sum of all errors propagated into a pixel is also in the same range.
     448                 :            :     // It's possible to map errors in this range into [-127, 127], and use an
     449                 :            :     // unsigned 8-bit integer to store it (using standard two's complement).
     450                 :            :     // The three 8-bit unsigned integers can then be encoded into a single
     451                 :            :     // 32-bit unsigned integer, with two 4-bit paddings to prevent addition
     452                 :            :     // overflows from affecting other components. There are at most 12
     453                 :            :     // addition operations on each pixel, so 4-bit padding should be enough.
     454                 :            :     // The overflow from the R component will be discarded.
     455                 :            :     //
     456                 :            :     // The following figure shows what the encoding looks like.
     457                 :            :     //
     458                 :            :     //     +------------------------------------+
     459                 :            :     //     |RRRRRRRR|0000|GGGGGGGG|0000|BBBBBBBB|
     460                 :            :     //     +------------------------------------+
     461                 :            :     //
     462                 :            : 
     463                 :            :     // The bitshift positions for the R and G components.
     464                 :            :     const int bitshift_r = 24, bitshift_g = 12;
     465                 :            :     // The multiplier we use to map [-0.5, 0.5] to [-127, 127].
     466                 :            :     const int uint8_mul = 127 * 2;
     467                 :            : 
     468                 :         20 :     GLSL(// Add the error previously propagated into current pixel, and clear
     469                 :            :          // it in the ring buffer.
     470                 :            :          "uint err_u32 = err_rgb8[idx] + %uu;                                   \n"
     471                 :            :          "pix = pix * %d.0 + vec3(int((err_u32 >> %d) & 0xFFu) - 128,           \n"
     472                 :            :          "                        int((err_u32 >> %d) & 0xFFu) - 128,           \n"
     473                 :            :          "                        int( err_u32        & 0xFFu) - 128) / %d.0;   \n"
     474                 :            :          "err_rgb8[idx] = 0u;                                                   \n"
     475                 :            :          // Write the dithered pixel.
     476                 :            :          "vec3 dithered = round(pix);                                           \n"
     477                 :            :          "imageStore("$", ivec2(x, y), vec4(dithered / %d.0, pix_orig.a));      \n"
     478                 :            :          // Prepare for error propagation pass
     479                 :            :          "vec3 err_divided = (pix - dithered) * %d.0 / %d.0;                    \n"
     480                 :            :          "ivec3 tmp;                                                            \n",
     481                 :            :          (128u << bitshift_r) | (128u << bitshift_g) | 128u,
     482                 :            :          dither_quant, bitshift_r, bitshift_g, uint8_mul,
     483                 :            :          out_img, dither_quant,
     484                 :            :          uint8_mul, kernel->divisor);
     485                 :            : 
     486                 :            :     // Group error propagations with the same weight factor together, in order
     487                 :            :     // to reduce the number of annoying error encodings.
     488         [ +  + ]:        436 :     for (int dividend = 1; dividend <= kernel->divisor; dividend++) {
     489                 :            :         bool err_assigned = false;
     490                 :            : 
     491         [ +  + ]:       1664 :         for (int y = 0; y <= PL_EDF_MAX_DY; y++) {
     492         [ +  + ]:       7488 :             for (int x = PL_EDF_MIN_DX; x <= PL_EDF_MAX_DX; x++) {
     493         [ +  + ]:       6240 :                 if (kernel->pattern[y][x - PL_EDF_MIN_DX] != dividend)
     494                 :       6108 :                     continue;
     495                 :            : 
     496         [ +  + ]:        132 :                 if (!err_assigned) {
     497                 :            :                     err_assigned = true;
     498                 :            : 
     499                 :         58 :                     GLSL("tmp = ivec3(round(err_divided * %d.0));   \n"
     500                 :            :                          "err_u32 = (uint(tmp.r & 0xFF) << %d) |    \n"
     501                 :            :                          "          (uint(tmp.g & 0xFF) << %d) |    \n"
     502                 :            :                          "           uint(tmp.b & 0xFF);            \n",
     503                 :            :                          dividend,
     504                 :            :                          bitshift_r, bitshift_g);
     505                 :            :                 }
     506                 :            : 
     507                 :        132 :                 int shifted_x = x + y * kernel->shift;
     508                 :            : 
     509                 :            :                 // Unlike the right border, errors propagated out from left
     510                 :            :                 // border will remain in the ring buffer. This will produce
     511                 :            :                 // visible artifacts near the left border, especially for
     512                 :            :                 // shift=3 kernels.
     513         [ +  + ]:        132 :                 if (x < 0)
     514                 :         36 :                     GLSL("if (x >= %d) \n", -x);
     515                 :            : 
     516                 :            :                 // Calculate the new position in the ring buffer to propagate
     517                 :            :                 // the error into.
     518                 :        132 :                 int ring_buffer_delta = shifted_x * ring_buffer_rows + y;
     519                 :        132 :                 GLSL("atomicAdd(err_rgb8[(idx + %du) %% "$"], err_u32); \n",
     520                 :            :                      ring_buffer_delta, ring_buffer_size);
     521                 :            :             }
     522                 :            :         }
     523                 :            :     }
     524                 :            : 
     525                 :         20 :     GLSL("}} \n"); // end of main loop + valid pixel conditional
     526                 :         20 :     return true;
     527                 :            : }

Generated by: LCOV version 1.16