Error diffusion compute shader fails to compile on d3d11
Windows 10, gpu-api=d3d11; the same setup works fine with gpu-api=vulkan. Here is the log:
[ 2.497][e][vo/gpu-next] D3DCompile failed: Unspecified error (E_FAIL, 0x80004005)
[ 2.497][e][vo/gpu-next] F:\dev\Shader@0x000002D856513860(66,7-18): warning X3557: loop only executes for 1 iteration(s), forcing loop to unroll
[ 2.497][e][vo/gpu-next] F:\dev\Shader@0x000002D856513860(33,13-45): error X3663: thread sync operation found in varying flow control, consider reformulating your algorithm so all threads will hit the sync simultaneously
[ 2.497][e][vo/gpu-next] F:\dev\Shader@0x000002D856513860(73,5-65): error X3663: this variable dependent on potentially varying data: stage_input
[ 2.497][e][vo/gpu-next] F:\dev\Shader@0x000002D856513860(34,25-63): error X3663: this variable dependent on potentially varying data: gl_LocalInvocationIndex
[ 2.497][e][vo/gpu-next] F:\dev\Shader@0x000002D856513860(36,28-38): error X3663: this variable dependent on potentially varying data: _304
[ 2.497][e][vo/gpu-next] F:\dev\Shader@0x000002D856513860(37,24-40): error X3663: this variable dependent on potentially varying data: _310
[ 2.497][e][vo/gpu-next] F:\dev\Shader@0x000002D856513860(38,18-25): error X3663: this variable dependent on potentially varying data: _314
[ 2.498][e][vo/gpu-next] F:\dev\Shader@0x000002D856513860(41,13): error X3663: this variable dependent on potentially varying data: _461
[ 2.498][e][vo/gpu-next] F:\dev\Shader@0x000002D856513860(31,44-49): error X3663: this variable dependent on potentially varying data: _461
[ 2.498][e][vo/gpu-next] F:\dev\Shader@0x000002D856513860(31,30-41): error X3663: this variable dependent on potentially varying data: _461
[ 2.498][e][vo/gpu-next]
[ 2.498][e][vo/gpu-next] compute shader HLSL source:
[ 2.498][e][vo/gpu-next] [ 1] static const uint3 gl_WorkGroupSize = uint3(964u, 1u, 1u);
[ 2.498][e][vo/gpu-next] [ 2]
[ 2.498][e][vo/gpu-next] [ 3] Texture2D<float4> _101 : register(t0);
[ 2.498][e][vo/gpu-next] [ 4] SamplerState __101_sampler : register(s0);
[ 2.498][e][vo/gpu-next] [ 5] RWTexture2D<float4> _156 : register(u0);
[ 2.498][e][vo/gpu-next] [ 6]
[ 2.498][e][vo/gpu-next] [ 7] static uint3 gl_WorkGroupID;
[ 2.498][e][vo/gpu-next] [ 8] static uint gl_LocalInvocationIndex;
[ 2.498][e][vo/gpu-next] [ 9] struct SPIRV_Cross_Input
[ 2.498][e][vo/gpu-next] [ 10] {
[ 2.498][e][vo/gpu-next] [ 11] uint3 gl_WorkGroupID : SV_GroupID;
[ 2.498][e][vo/gpu-next] [ 12] uint gl_LocalInvocationIndex : SV_GroupIndex;
[ 2.498][e][vo/gpu-next] [ 13] };
[ 2.498][e][vo/gpu-next] [ 14]
[ 2.498][e][vo/gpu-next] [ 15] groupshared uint _37[2898];
[ 2.498][e][vo/gpu-next] [ 16]
[ 2.498][e][vo/gpu-next] [ 17] void comp_main()
[ 2.498][e][vo/gpu-next] [ 18] {
[ 2.498][e][vo/gpu-next] [ 19] do
[ 2.498][e][vo/gpu-next] [ 20] {
[ 2.498][e][vo/gpu-next] [ 21] if (any(bool3(gl_WorkGroupID.x != uint3(0u, 0u, 0u).x, gl_WorkGroupID.y != uint3(0u, 0u, 0u).y, gl_WorkGroupID.z != uint3(0u, 0u, 0u).z)))
[ 2.498][e][vo/gpu-next] [ 22] {
[ 2.498][e][vo/gpu-next] [ 23] break;
[ 2.498][e][vo/gpu-next] [ 24] }
[ 2.498][e][vo/gpu-next] [ 25] for (uint _460 = gl_LocalInvocationIndex; _460 < 2898u; )
[ 2.498][e][vo/gpu-next] [ 26] {
[ 2.498][e][vo/gpu-next] [ 27] _37[_460] = 0u;
[ 2.498][e][vo/gpu-next] [ 28] _460 += 964u;
[ 2.498][e][vo/gpu-next] [ 29] continue;
[ 2.498][e][vo/gpu-next] [ 30] }
[ 2.498][e][vo/gpu-next] [ 31] for (uint _461 = 0u; _461 < 3639u; _461++)
[ 2.498][e][vo/gpu-next] [ 32] {
[ 2.498][e][vo/gpu-next] [ 33] GroupMemoryBarrierWithGroupSync();
[ 2.498][e][vo/gpu-next] [ 34] uint _304 = (_461 * 964u) + gl_LocalInvocationIndex;
[ 2.498][e][vo/gpu-next] [ 35] int _307 = int(_304 % 964u);
[ 2.498][e][vo/gpu-next] [ 36] int _310 = int(_304 / 964u);
[ 2.498][e][vo/gpu-next] [ 37] int _314 = _310 - (_307 * 2);
[ 2.498][e][vo/gpu-next] [ 38] if ((_314 < 0) || (_314 > 1713))
[ 2.498][e][vo/gpu-next] [ 39] {
[ 2.498][e][vo/gpu-next] [ 40] continue;
[ 2.498][e][vo/gpu-next] [ 41] }
[ 2.498][e][vo/gpu-next] [ 42] uint _327 = uint((_310 * 966) + _307) % 2898u;
[ 2.498][e][vo/gpu-next] [ 43] int2 _331 = int2(_314, _307);
[ 2.498][e][vo/gpu-next] [ 44] float4 _333 = _101.Load(int3(_331, 0));
[ 2.498][e][vo/gpu-next] [ 45] uint _338 = _37[_327];
[ 2.498][e][vo/gpu-next] [ 46] uint _339 = _338 + 2148008064u;
[ 2.498][e][vo/gpu-next] [ 47] float3 _362 = mad(float3(float(int((_339 >> uint(24)) & 255u) - 128), float(int((_339 >> uint(12)) & 255u) - 128), float(int(_339 & 255u) - 128)), 0.00393700785934925079345703125f.xxx, _333.xyz * 255.0f);
[ 2.498][e][vo/gpu-next] [ 48] _37[_327] = 0u;
[ 2.498][e][vo/gpu-next] [ 49] float3 _366 = round(_362);
[ 2.498][e][vo/gpu-next] [ 50] _156[_331] = float4(_366 * 0.0039215688593685626983642578125f.xxx, _333.w);
[ 2.498][e][vo/gpu-next] [ 51] float3 _385 = ((_362 - _366) * 254.0f) * 0.25f.xxx;
[ 2.498][e][vo/gpu-next] [ 52] int3 _389 = int3(round(_385 * 1.0f));
[ 2.498][e][vo/gpu-next] [ 53] uint _405 = ((uint(_389.x & 255) << uint(24)) | (uint(_389.y & 255) << uint(12))) | uint(_389.z & 255);
[ 2.498][e][vo/gpu-next] [ 54] if (_314 >= 1)
[ 2.498][e][vo/gpu-next] [ 55] {
[ 2.498][e][vo/gpu-next] [ 56] uint _414;
[ 2.498][e][vo/gpu-next] [ 57] InterlockedAdd(_37[(_327 + 967u) % 2898u], _405, _414);
[ 2.498][e][vo/gpu-next] [ 58] }
[ 2.498][e][vo/gpu-next] [ 59] uint _421;
[ 2.498][e][vo/gpu-next] [ 60] InterlockedAdd(_37[(_327 + 1933u) % 2898u], _405, _421);
[ 2.498][e][vo/gpu-next] [ 61] int3 _425 = int3(round(_385 * 2.0f));
[ 2.498][e][vo/gpu-next] [ 62] uint _447;
[ 2.499][e][vo/gpu-next] [ 63] InterlockedAdd(_37[(_327 + 966u) % 2898u], ((uint(_425.x & 255) << uint(24)) | (uint(_425.y & 255) << uint(12))) | uint(_425.z & 255), _447);
[ 2.499][e][vo/gpu-next] [ 64] }
[ 2.499][e][vo/gpu-next] [ 65] break;
[ 2.499][e][vo/gpu-next] [ 66] } while(false);
[ 2.499][e][vo/gpu-next] [ 67] }
[ 2.499][e][vo/gpu-next] [ 68]
[ 2.499][e][vo/gpu-next] [ 69] [numthreads(964, 1, 1)]
[ 2.499][e][vo/gpu-next] [ 70] void main(SPIRV_Cross_Input stage_input)
[ 2.499][e][vo/gpu-next] [ 71] {
[ 2.499][e][vo/gpu-next] [ 72] gl_WorkGroupID = stage_input.gl_WorkGroupID;
[ 2.499][e][vo/gpu-next] [ 73] gl_LocalInvocationIndex = stage_input.gl_LocalInvocationIndex;
[ 2.499][e][vo/gpu-next] [ 74] comp_main();
[ 2.499][e][vo/gpu-next] [ 75] }
[ 2.499][e][vo/gpu-next] compute shader source:
[ 2.499][e][vo/gpu-next] [ 1] #version 450
[ 2.499][e][vo/gpu-next] [ 2] #extension GL_ARB_compute_shader : enable
[ 2.499][e][vo/gpu-next] [ 3] #extension GL_ARB_shader_image_load_store : enable
[ 2.499][e][vo/gpu-next] [ 4] #extension GL_ARB_texture_gather : enable
[ 2.499][e][vo/gpu-next] [ 5] layout(binding=0) uniform sampler2D _input_tex_1_0;
[ 2.499][e][vo/gpu-next] [ 6] layout(binding=1, rgba16f) writeonly restrict uniform image2D _output_tex_2_0;
[ 2.499][e][vo/gpu-next] [ 7] layout (local_size_x = 964, local_size_y = 1) in;
[ 2.499][e][vo/gpu-next] [ 8]
[ 2.499][e][vo/gpu-next] [ 9] const uint _ring_buffer_size_0_0 = 2898u;
[ 2.499][e][vo/gpu-next] [ 10] shared uint err_rgb8[_ring_buffer_size_0_0];
[ 2.499][e][vo/gpu-next] [ 11] const int _const_3_0 = 966;
[ 2.499][e][vo/gpu-next] [ 12] const int _const_4_0 = 1713;
[ 2.499][e][vo/gpu-next] [ 13] const uint _const_5_0 = 964u;
[ 2.499][e][vo/gpu-next] [ 14] const uint _const_6_0 = 3639u;
[ 2.499][e][vo/gpu-next] [ 15] void _main_7_0() {
[ 2.499][e][vo/gpu-next] [ 16] // pl_shader_error_diffusion
[ 2.499][e][vo/gpu-next] [ 17] if (gl_WorkGroupID != uvec3(0))
[ 2.499][e][vo/gpu-next] [ 18] return;
[ 2.499][e][vo/gpu-next] [ 19] for (uint i = gl_LocalInvocationIndex; i < _ring_buffer_size_0_0; i+=gl_WorkGroupSize.x)
[ 2.499][e][vo/gpu-next] [ 20] err_rgb8[i] = 0u;
[ 2.499][e][vo/gpu-next] [ 21] for (uint block_id = 0; block_id < _const_6_0; block_id++) {
[ 2.499][e][vo/gpu-next] [ 22] barrier();
[ 2.499][e][vo/gpu-next] [ 23] uint id = block_id * gl_WorkGroupSize.x + gl_LocalInvocationIndex;
[ 2.499][e][vo/gpu-next] [ 24] const uint height = _const_5_0;
[ 2.499][e][vo/gpu-next] [ 25] int y = int(id % height), x_shifted = int(id / height);
[ 2.499][e][vo/gpu-next] [ 26] int x = x_shifted - y * 2;
[ 2.499][e][vo/gpu-next] [ 27] if (x < 0 || x > _const_4_0)
[ 2.499][e][vo/gpu-next] [ 28] continue;
[ 2.499][e][vo/gpu-next] [ 29] uint idx = uint(x_shifted * _const_3_0 + y) % _ring_buffer_size_0_0;
[ 2.499][e][vo/gpu-next] [ 30] vec4 pix_orig = texelFetch(_input_tex_1_0, ivec2(x, y), 0);
[ 2.499][e][vo/gpu-next] [ 31] vec3 pix = pix_orig.rgb;
[ 2.499][e][vo/gpu-next] [ 32] uint err_u32 = err_rgb8[idx] + 2148008064u;
[ 2.499][e][vo/gpu-next] [ 33] pix = pix * 255.0 + vec3(int((err_u32 >> 24) & 0xFFu) - 128,
[ 2.499][e][vo/gpu-next] [ 34] int((err_u32 >> 12) & 0xFFu) - 128,
[ 2.499][e][vo/gpu-next] [ 35] int( err_u32 & 0xFFu) - 128) / 254.0;
[ 2.499][e][vo/gpu-next] [ 36] err_rgb8[idx] = 0u;
[ 2.499][e][vo/gpu-next] [ 37] vec3 dithered = round(pix);
[ 2.499][e][vo/gpu-next] [ 38] imageStore(_output_tex_2_0, ivec2(x, y), vec4(dithered / 255.0, pix_orig.a));
[ 2.499][e][vo/gpu-next] [ 39] vec3 err_divided = (pix - dithered) * 254.0 / 4.0;
[ 2.499][e][vo/gpu-next] [ 40] ivec3 tmp;
[ 2.500][e][vo/gpu-next] [ 41] tmp = ivec3(round(err_divided * 1.0));
[ 2.500][e][vo/gpu-next] [ 42] err_u32 = (uint(tmp.r & 0xFF) << 24) |
[ 2.500][e][vo/gpu-next] [ 43] (uint(tmp.g & 0xFF) << 12) |
[ 2.500][e][vo/gpu-next] [ 44] uint(tmp.b & 0xFF);
[ 2.500][e][vo/gpu-next] [ 45] if (x >= 1)
[ 2.500][e][vo/gpu-next] [ 46] atomicAdd(err_rgb8[(idx + 967u) % _ring_buffer_size_0_0], err_u32);
[ 2.500][e][vo/gpu-next] [ 47] atomicAdd(err_rgb8[(idx + 1933u) % _ring_buffer_size_0_0], err_u32);
[ 2.500][e][vo/gpu-next] [ 48] tmp = ivec3(round(err_divided * 2.0));
[ 2.500][e][vo/gpu-next] [ 49] err_u32 = (uint(tmp.r & 0xFF) << 24) |
[ 2.500][e][vo/gpu-next] [ 50] (uint(tmp.g & 0xFF) << 12) |
[ 2.500][e][vo/gpu-next] [ 51] uint(tmp.b & 0xFF);
[ 2.500][e][vo/gpu-next] [ 52] atomicAdd(err_rgb8[(idx + 966u) % _ring_buffer_size_0_0], err_u32);
[ 2.500][e][vo/gpu-next] [ 53] }
[ 2.500][e][vo/gpu-next] [ 54]
[ 2.500][e][vo/gpu-next] [ 55] }
[ 2.500][e][vo/gpu-next] [ 56]
[ 2.500][e][vo/gpu-next] [ 57] void main() {
[ 2.500][e][vo/gpu-next] [ 58] _main_7_0();
[ 2.500][e][vo/gpu-next] [ 59] }
[ 2.500][e][vo/gpu-next] Failed creating render pass for dispatch
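
In case it helps triage: the X3663 errors all point at the GroupMemoryBarrierWithGroupSync() on HLSL line 33, which is only reached when the gl_WorkGroupID break on line 23 is not taken. In other words, once SPIRV-Cross rewrites the GLSL's early "if (gl_WorkGroupID != uvec3(0)) return;" into the do { ... break; ... } while(false) wrapper, fxc treats the barrier() as sitting in varying flow control and refuses to compile it. Below is a hypothetical minimal reduction of that shape; the names are mine, it is not taken from the log, and I have not actually fed it through D3DCompile, so treat it as a sketch of the pattern rather than a verified repro.

    // Sketch of the control flow fxc appears to reject (X3663): a group sync
    // that is only reachable after a break keyed on SV_GroupID / SV_GroupIndex.
    groupshared uint ring[64];

    [numthreads(64, 1, 1)]
    void main(uint3 group_id : SV_GroupID, uint local_idx : SV_GroupIndex)
    {
        do
        {
            // Mirrors "if (gl_WorkGroupID != uvec3(0)) return;" after SPIRV-Cross.
            if (any(group_id != uint3(0u, 0u, 0u)))
            {
                break;
            }
            [loop]
            for (uint i = 0u; i < 4u; i++)
            {
                // The sync sits behind the varying break above, which seems to
                // be what "thread sync operation found in varying flow control"
                // is complaining about.
                GroupMemoryBarrierWithGroupSync();
                ring[(local_idx + i) % 64u] = i;
            }
            break;
        } while (false);
    }

The X3557 warning at line 66 just looks like fxc force-unrolling the while(false) wrapper and is probably harmless; the X3663 errors are what make D3DCompile return E_FAIL and abort the dispatch pass.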