Error diffusion compute shader fails to compile on d3d11
Windows 10, gpu-api=d3d11; the same setup works fine with gpu-api=vulkan. Here is the log:
[ 2.497][e][vo/gpu-next] D3DCompile failed: Unspecified error (E_FAIL, 0x80004005)
[ 2.497][e][vo/gpu-next] F:\dev\Shader@0x000002D856513860(66,7-18): warning X3557: loop only executes for 1 iteration(s), forcing loop to unroll
[ 2.497][e][vo/gpu-next] F:\dev\Shader@0x000002D856513860(33,13-45): error X3663: thread sync operation found in varying flow control, consider reformulating your algorithm so all threads will hit the sync simultaneously
[ 2.497][e][vo/gpu-next] F:\dev\Shader@0x000002D856513860(73,5-65): error X3663: this variable dependent on potentially varying data: stage_input
[ 2.497][e][vo/gpu-next] F:\dev\Shader@0x000002D856513860(34,25-63): error X3663: this variable dependent on potentially varying data: gl_LocalInvocationIndex
[ 2.497][e][vo/gpu-next] F:\dev\Shader@0x000002D856513860(36,28-38): error X3663: this variable dependent on potentially varying data: _304
[ 2.497][e][vo/gpu-next] F:\dev\Shader@0x000002D856513860(37,24-40): error X3663: this variable dependent on potentially varying data: _310
[ 2.497][e][vo/gpu-next] F:\dev\Shader@0x000002D856513860(38,18-25): error X3663: this variable dependent on potentially varying data: _314
[ 2.498][e][vo/gpu-next] F:\dev\Shader@0x000002D856513860(41,13): error X3663: this variable dependent on potentially varying data: _461
[ 2.498][e][vo/gpu-next] F:\dev\Shader@0x000002D856513860(31,44-49): error X3663: this variable dependent on potentially varying data: _461
[ 2.498][e][vo/gpu-next] F:\dev\Shader@0x000002D856513860(31,30-41): error X3663: this variable dependent on potentially varying data: _461
[ 2.498][e][vo/gpu-next]
[ 2.498][e][vo/gpu-next] compute shader HLSL source:
[ 2.498][e][vo/gpu-next] [ 1] static const uint3 gl_WorkGroupSize = uint3(964u, 1u, 1u);
[ 2.498][e][vo/gpu-next] [ 2]
[ 2.498][e][vo/gpu-next] [ 3] Texture2D<float4> _101 : register(t0);
[ 2.498][e][vo/gpu-next] [ 4] SamplerState __101_sampler : register(s0);
[ 2.498][e][vo/gpu-next] [ 5] RWTexture2D<float4> _156 : register(u0);
[ 2.498][e][vo/gpu-next] [ 6]
[ 2.498][e][vo/gpu-next] [ 7] static uint3 gl_WorkGroupID;
[ 2.498][e][vo/gpu-next] [ 8] static uint gl_LocalInvocationIndex;
[ 2.498][e][vo/gpu-next] [ 9] struct SPIRV_Cross_Input
[ 2.498][e][vo/gpu-next] [ 10] {
[ 2.498][e][vo/gpu-next] [ 11] uint3 gl_WorkGroupID : SV_GroupID;
[ 2.498][e][vo/gpu-next] [ 12] uint gl_LocalInvocationIndex : SV_GroupIndex;
[ 2.498][e][vo/gpu-next] [ 13] };
[ 2.498][e][vo/gpu-next] [ 14]
[ 2.498][e][vo/gpu-next] [ 15] groupshared uint _37[2898];
[ 2.498][e][vo/gpu-next] [ 16]
[ 2.498][e][vo/gpu-next] [ 17] void comp_main()
[ 2.498][e][vo/gpu-next] [ 18] {
[ 2.498][e][vo/gpu-next] [ 19] do
[ 2.498][e][vo/gpu-next] [ 20] {
[ 2.498][e][vo/gpu-next] [ 21] if (any(bool3(gl_WorkGroupID.x != uint3(0u, 0u, 0u).x, gl_WorkGroupID.y != uint3(0u, 0u, 0u).y, gl_WorkGroupID.z != uint3(0u, 0u, 0u).z)))
[ 2.498][e][vo/gpu-next] [ 22] {
[ 2.498][e][vo/gpu-next] [ 23] break;
[ 2.498][e][vo/gpu-next] [ 24] }
[ 2.498][e][vo/gpu-next] [ 25] for (uint _460 = gl_LocalInvocationIndex; _460 < 2898u; )
[ 2.498][e][vo/gpu-next] [ 26] {
[ 2.498][e][vo/gpu-next] [ 27] _37[_460] = 0u;
[ 2.498][e][vo/gpu-next] [ 28] _460 += 964u;
[ 2.498][e][vo/gpu-next] [ 29] continue;
[ 2.498][e][vo/gpu-next] [ 30] }
[ 2.498][e][vo/gpu-next] [ 31] for (uint _461 = 0u; _461 < 3639u; _461++)
[ 2.498][e][vo/gpu-next] [ 32] {
[ 2.498][e][vo/gpu-next] [ 33] GroupMemoryBarrierWithGroupSync();
[ 2.498][e][vo/gpu-next] [ 34] uint _304 = (_461 * 964u) + gl_LocalInvocationIndex;
[ 2.498][e][vo/gpu-next] [ 35] int _307 = int(_304 % 964u);
[ 2.498][e][vo/gpu-next] [ 36] int _310 = int(_304 / 964u);
[ 2.498][e][vo/gpu-next] [ 37] int _314 = _310 - (_307 * 2);
[ 2.498][e][vo/gpu-next] [ 38] if ((_314 < 0) || (_314 > 1713))
[ 2.498][e][vo/gpu-next] [ 39] {
[ 2.498][e][vo/gpu-next] [ 40] continue;
[ 2.498][e][vo/gpu-next] [ 41] }
[ 2.498][e][vo/gpu-next] [ 42] uint _327 = uint((_310 * 966) + _307) % 2898u;
[ 2.498][e][vo/gpu-next] [ 43] int2 _331 = int2(_314, _307);
[ 2.498][e][vo/gpu-next] [ 44] float4 _333 = _101.Load(int3(_331, 0));
[ 2.498][e][vo/gpu-next] [ 45] uint _338 = _37[_327];
[ 2.498][e][vo/gpu-next] [ 46] uint _339 = _338 + 2148008064u;
[ 2.498][e][vo/gpu-next] [ 47] float3 _362 = mad(float3(float(int((_339 >> uint(24)) & 255u) - 128), float(int((_339 >> uint(12)) & 255u) - 128), float(int(_339 & 255u) - 128)), 0.00393700785934925079345703125f.xxx, _333.xyz * 255.0f);
[ 2.498][e][vo/gpu-next] [ 48] _37[_327] = 0u;
[ 2.498][e][vo/gpu-next] [ 49] float3 _366 = round(_362);
[ 2.498][e][vo/gpu-next] [ 50] _156[_331] = float4(_366 * 0.0039215688593685626983642578125f.xxx, _333.w);
[ 2.498][e][vo/gpu-next] [ 51] float3 _385 = ((_362 - _366) * 254.0f) * 0.25f.xxx;
[ 2.498][e][vo/gpu-next] [ 52] int3 _389 = int3(round(_385 * 1.0f));
[ 2.498][e][vo/gpu-next] [ 53] uint _405 = ((uint(_389.x & 255) << uint(24)) | (uint(_389.y & 255) << uint(12))) | uint(_389.z & 255);
[ 2.498][e][vo/gpu-next] [ 54] if (_314 >= 1)
[ 2.498][e][vo/gpu-next] [ 55] {
[ 2.498][e][vo/gpu-next] [ 56] uint _414;
[ 2.498][e][vo/gpu-next] [ 57] InterlockedAdd(_37[(_327 + 967u) % 2898u], _405, _414);
[ 2.498][e][vo/gpu-next] [ 58] }
[ 2.498][e][vo/gpu-next] [ 59] uint _421;
[ 2.498][e][vo/gpu-next] [ 60] InterlockedAdd(_37[(_327 + 1933u) % 2898u], _405, _421);
[ 2.498][e][vo/gpu-next] [ 61] int3 _425 = int3(round(_385 * 2.0f));
[ 2.498][e][vo/gpu-next] [ 62] uint _447;
[ 2.499][e][vo/gpu-next] [ 63] InterlockedAdd(_37[(_327 + 966u) % 2898u], ((uint(_425.x & 255) << uint(24)) | (uint(_425.y & 255) << uint(12))) | uint(_425.z & 255), _447);
[ 2.499][e][vo/gpu-next] [ 64] }
[ 2.499][e][vo/gpu-next] [ 65] break;
[ 2.499][e][vo/gpu-next] [ 66] } while(false);
[ 2.499][e][vo/gpu-next] [ 67] }
[ 2.499][e][vo/gpu-next] [ 68]
[ 2.499][e][vo/gpu-next] [ 69] [numthreads(964, 1, 1)]
[ 2.499][e][vo/gpu-next] [ 70] void main(SPIRV_Cross_Input stage_input)
[ 2.499][e][vo/gpu-next] [ 71] {
[ 2.499][e][vo/gpu-next] [ 72] gl_WorkGroupID = stage_input.gl_WorkGroupID;
[ 2.499][e][vo/gpu-next] [ 73] gl_LocalInvocationIndex = stage_input.gl_LocalInvocationIndex;
[ 2.499][e][vo/gpu-next] [ 74] comp_main();
[ 2.499][e][vo/gpu-next] [ 75] }
[ 2.499][e][vo/gpu-next] compute shader source:
[ 2.499][e][vo/gpu-next] [ 1] #version 450
[ 2.499][e][vo/gpu-next] [ 2] #extension GL_ARB_compute_shader : enable
[ 2.499][e][vo/gpu-next] [ 3] #extension GL_ARB_shader_image_load_store : enable
[ 2.499][e][vo/gpu-next] [ 4] #extension GL_ARB_texture_gather : enable
[ 2.499][e][vo/gpu-next] [ 5] layout(binding=0) uniform sampler2D _input_tex_1_0;
[ 2.499][e][vo/gpu-next] [ 6] layout(binding=1, rgba16f) writeonly restrict uniform image2D _output_tex_2_0;
[ 2.499][e][vo/gpu-next] [ 7] layout (local_size_x = 964, local_size_y = 1) in;
[ 2.499][e][vo/gpu-next] [ 8]
[ 2.499][e][vo/gpu-next] [ 9] const uint _ring_buffer_size_0_0 = 2898u;
[ 2.499][e][vo/gpu-next] [ 10] shared uint err_rgb8[_ring_buffer_size_0_0];
[ 2.499][e][vo/gpu-next] [ 11] const int _const_3_0 = 966;
[ 2.499][e][vo/gpu-next] [ 12] const int _const_4_0 = 1713;
[ 2.499][e][vo/gpu-next] [ 13] const uint _const_5_0 = 964u;
[ 2.499][e][vo/gpu-next] [ 14] const uint _const_6_0 = 3639u;
[ 2.499][e][vo/gpu-next] [ 15] void _main_7_0() {
[ 2.499][e][vo/gpu-next] [ 16] // pl_shader_error_diffusion
[ 2.499][e][vo/gpu-next] [ 17] if (gl_WorkGroupID != uvec3(0))
[ 2.499][e][vo/gpu-next] [ 18] return;
[ 2.499][e][vo/gpu-next] [ 19] for (uint i = gl_LocalInvocationIndex; i < _ring_buffer_size_0_0; i+=gl_WorkGroupSize.x)
[ 2.499][e][vo/gpu-next] [ 20] err_rgb8[i] = 0u;
[ 2.499][e][vo/gpu-next] [ 21] for (uint block_id = 0; block_id < _const_6_0; block_id++) {
[ 2.499][e][vo/gpu-next] [ 22] barrier();
[ 2.499][e][vo/gpu-next] [ 23] uint id = block_id * gl_WorkGroupSize.x + gl_LocalInvocationIndex;
[ 2.499][e][vo/gpu-next] [ 24] const uint height = _const_5_0;
[ 2.499][e][vo/gpu-next] [ 25] int y = int(id % height), x_shifted = int(id / height);
[ 2.499][e][vo/gpu-next] [ 26] int x = x_shifted - y * 2;
[ 2.499][e][vo/gpu-next] [ 27] if (x < 0 || x > _const_4_0)
[ 2.499][e][vo/gpu-next] [ 28] continue;
[ 2.499][e][vo/gpu-next] [ 29] uint idx = uint(x_shifted * _const_3_0 + y) % _ring_buffer_size_0_0;
[ 2.499][e][vo/gpu-next] [ 30] vec4 pix_orig = texelFetch(_input_tex_1_0, ivec2(x, y), 0);
[ 2.499][e][vo/gpu-next] [ 31] vec3 pix = pix_orig.rgb;
[ 2.499][e][vo/gpu-next] [ 32] uint err_u32 = err_rgb8[idx] + 2148008064u;
[ 2.499][e][vo/gpu-next] [ 33] pix = pix * 255.0 + vec3(int((err_u32 >> 24) & 0xFFu) - 128,
[ 2.499][e][vo/gpu-next] [ 34] int((err_u32 >> 12) & 0xFFu) - 128,
[ 2.499][e][vo/gpu-next] [ 35] int( err_u32 & 0xFFu) - 128) / 254.0;
[ 2.499][e][vo/gpu-next] [ 36] err_rgb8[idx] = 0u;
[ 2.499][e][vo/gpu-next] [ 37] vec3 dithered = round(pix);
[ 2.499][e][vo/gpu-next] [ 38] imageStore(_output_tex_2_0, ivec2(x, y), vec4(dithered / 255.0, pix_orig.a));
[ 2.499][e][vo/gpu-next] [ 39] vec3 err_divided = (pix - dithered) * 254.0 / 4.0;
[ 2.499][e][vo/gpu-next] [ 40] ivec3 tmp;
[ 2.500][e][vo/gpu-next] [ 41] tmp = ivec3(round(err_divided * 1.0));
[ 2.500][e][vo/gpu-next] [ 42] err_u32 = (uint(tmp.r & 0xFF) << 24) |
[ 2.500][e][vo/gpu-next] [ 43] (uint(tmp.g & 0xFF) << 12) |
[ 2.500][e][vo/gpu-next] [ 44] uint(tmp.b & 0xFF);
[ 2.500][e][vo/gpu-next] [ 45] if (x >= 1)
[ 2.500][e][vo/gpu-next] [ 46] atomicAdd(err_rgb8[(idx + 967u) % _ring_buffer_size_0_0], err_u32);
[ 2.500][e][vo/gpu-next] [ 47] atomicAdd(err_rgb8[(idx + 1933u) % _ring_buffer_size_0_0], err_u32);
[ 2.500][e][vo/gpu-next] [ 48] tmp = ivec3(round(err_divided * 2.0));
[ 2.500][e][vo/gpu-next] [ 49] err_u32 = (uint(tmp.r & 0xFF) << 24) |
[ 2.500][e][vo/gpu-next] [ 50] (uint(tmp.g & 0xFF) << 12) |
[ 2.500][e][vo/gpu-next] [ 51] uint(tmp.b & 0xFF);
[ 2.500][e][vo/gpu-next] [ 52] atomicAdd(err_rgb8[(idx + 966u) % _ring_buffer_size_0_0], err_u32);
[ 2.500][e][vo/gpu-next] [ 53] }
[ 2.500][e][vo/gpu-next] [ 54]
[ 2.500][e][vo/gpu-next] [ 55] }
[ 2.500][e][vo/gpu-next] [ 56]
[ 2.500][e][vo/gpu-next] [ 57] void main() {
[ 2.500][e][vo/gpu-next] [ 58] _main_7_0();
[ 2.500][e][vo/gpu-next] [ 59] }
[ 2.500][e][vo/gpu-next] Failed creating render pass for dispatch
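
In case it helps triage: the X3663 errors all point at the GroupMemoryBarrierWithGroupSync() on HLSL line 33, which is only reached when the gl_WorkGroupID break on line 23 is not taken. In other words, once SPIRV-Cross rewrites the GLSL's early "if (gl_WorkGroupID != uvec3(0)) return;" into the do { ... break; ... } while(false) wrapper, fxc treats the barrier() as sitting in varying flow control and refuses to compile it. Below is a hypothetical minimal reduction of that shape; the names are mine, it is not taken from the log, and I have not actually fed it through D3DCompile, so treat it as a sketch of the pattern rather than a verified repro.

    // Sketch of the control flow fxc appears to reject (X3663): a group sync
    // that is only reachable after a break keyed on SV_GroupID / SV_GroupIndex.
    groupshared uint ring[64];

    [numthreads(64, 1, 1)]
    void main(uint3 group_id : SV_GroupID, uint local_idx : SV_GroupIndex)
    {
        do
        {
            // Mirrors "if (gl_WorkGroupID != uvec3(0)) return;" after SPIRV-Cross.
            if (any(group_id != uint3(0u, 0u, 0u)))
            {
                break;
            }
            [loop]
            for (uint i = 0u; i < 4u; i++)
            {
                // The sync sits behind the varying break above, which seems to
                // be what "thread sync operation found in varying flow control"
                // is complaining about.
                GroupMemoryBarrierWithGroupSync();
                ring[(local_idx + i) % 64u] = i;
            }
            break;
        } while (false);
    }

The X3557 warning at line 66 just looks like fxc force-unrolling the while(false) wrapper and is probably harmless; the X3663 errors are what make D3DCompile return E_FAIL and abort the dispatch pass.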