Search code examples
compute-shaderwebgpu

WebGPU compute shader gives unexpected result


I made a shader which gives unexpected result:

@compute @workgroup_size(256, 1)
fn test(@builtin(workgroup_id) id : vec3<u32>) {
  let thread = id.x;
  for(var c = 0u; c<1000; c++){
    any_compute(vec4f(1,2,3,4),vec4f(1,2,3,4));
    if(thread==0){
      resultBuffer[c]+=1;
    }
  }
}

Only one thread will write to resultBuffer. So every element should be 1.

The function any_compute() has no side effects. I guess heavy computation causes this problem.

The results are correct most of the time. But sometimes there are unexpected results. There will be some 2 or 3 in resultBuffer.

The full test program:

const cWGSL = `
@group(0) @binding(0) var<storage, read_write> resultBuffer: array<f32>;

const PI = 3.14159265358979323846;
const inv4PI = 0.25/PI;

const eps = 1e-6;
fn any_compute(a : vec4<f32>, b : vec4<f32>) -> vec3f
{
  let dist = a.xyz - b.xyz;
  let invDist = inverseSqrt(dot(dist, dist) + eps); 
  let invDistCube = invDist * invDist * invDist;
  let s = b.w * invDistCube;
  return -s * inv4PI * dist;
}


@compute @workgroup_size(256, 1)
fn test(@builtin(workgroup_id) id : vec3<u32>) {
  let thread = id.x;
  for(var c = 0u; c<1000; c++){
    any_compute(vec4f(1,2,3,4),vec4f(1,2,3,4));
    if(thread==0){
    resultBuffer[c]+=1;
    }
  }
}`;

async function main(){
const adapter = await navigator.gpu.requestAdapter();
  const device = await adapter.requestDevice();

  // to-do: check limit
  console.log(adapter);
  const maxGPUThread = 256;

  const resultBufferGPU = device.createBuffer({
    size: 4000,
    usage: GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_SRC
  });
  const readBufferGPU = device.createBuffer({
    size: 4000,
    usage: GPUBufferUsage.MAP_READ | GPUBufferUsage.COPY_DST
  });

  const shaderModule = device.createShaderModule({
    code: cWGSL,
  })
  const computePipeline = device.createComputePipeline({
    layout: 'auto', // infer from shader code.
    compute: {
      module: shaderModule,
      entryPoint: "test"
    }
  });

  const bindGroupLayout = computePipeline.getBindGroupLayout(0);
  const buffers = [resultBufferGPU];
  const entries = buffers.map((b, i) => {
    return { binding: i, resource: { buffer: b } };
  });
  const bindGroup = device.createBindGroup({
    layout: bindGroupLayout,
    entries: entries
  });
  const commandEncoder = device.createCommandEncoder();
  const computePassEncoder = commandEncoder.beginComputePass();
  computePassEncoder.setPipeline(computePipeline);
  computePassEncoder.setBindGroup(0, bindGroup);
  computePassEncoder.dispatchWorkgroups(256, 1);
  computePassEncoder.end();

  commandEncoder.copyBufferToBuffer(resultBufferGPU, 0, readBufferGPU, 0, 4000);


  const gpuCommands = commandEncoder.finish();
  device.queue.submit([gpuCommands]);
  await device.queue.onSubmittedWorkDone();
  await readBufferGPU.mapAsync(GPUMapMode.READ);
  const arrayBuffer = readBufferGPU.getMappedRange();
  const result = new Float32Array(arrayBuffer);
  console.log(result);
  const resultCount = {};
  Array.from(new Set(result)).forEach(v => resultCount[v] = result.filter(x => x == v).length);

  console.log(resultCount);

}

main();

Also a jsfiddle version. I tested it in Chrome Canary.

The count of occurrence will be printed in console. The expected output is {1: 1000}.


Solution

  • The issue is I think you're confusing workgroup size and and dispatchWorkgroups.

    @workgroup_size defines now many individual processes to run per workgroup. In other words, if you set @workgroup_size(256, 1) in your shader, then dispatchWorkgroups(1) will run 256 processes.

    You issue the command computePassEncoder.dispatchWorkgroups(256, 1); which means you're running 256 process, 256 times (64k processes in total)

    Since thread = id.x and id.x comes from workgroup_id, that means when the first workgroup is run, workgroup_id.x will be 0 for all 256 threads. (because workgroup_id is the id of the workgroup) So, there are 256 times that thread == 0 which means there's a race between threads trying to update resultBuffer[c]. This is why the result is random.

    There are 3 ids

    • local_invocation_id (the ID with in a workgroup)
    • workgroup_id (the ID of a workgroup)
    • global_invocation_id (workgroup_id * workgroup_size + local_invocation_id)

    There's one convenience builtin

    • local_invocation_index

      rowSize = workgroup_size.x
      sliceSize = workgroup_size.x * workgroup_size.y
      local_invocation_index =
           local_invocation_id.x + 
           local_invocation_id.y * rowSize +
           local_invocation_id.z * sliceSize
      

    Here's some code to hopefully illustrate the point

    const dispatchCount = [4, 3, 2];
    const workgroupSize = [2, 3, 4];
    
    const arrayProd = arr => arr.reduce((a, b) => a * b);
    
    const numThreadsPerWorkgroup = arrayProd(workgroupSize);
    
    const code = `
    // NOTE!: vec3u is padded to by 4 bytes
    @group(0) @binding(0) var<storage, read_write> workgroupResult: array<vec3u>;
    @group(0) @binding(1) var<storage, read_write> localResult: array<vec3u>;
    @group(0) @binding(2) var<storage, read_write> globalResult: array<vec3u>;
    
    const wg_size = ${numThreadsPerWorkgroup};
    @compute @workgroup_size(${workgroupSize})
    fn test(
        @builtin(workgroup_id) workgroup_id : vec3<u32>,
        @builtin(local_invocation_id) local_invocation_id : vec3<u32>,
        @builtin(global_invocation_id) global_invocation_id : vec3<u32>,
        @builtin(local_invocation_index) local_invocation_index: u32,
        @builtin(num_workgroups) num_workgroups: vec3<u32>
    ) {
      let workgroup_index =  
         workgroup_id.x +
         workgroup_id.y * num_workgroups.x +
         workgroup_id.z * num_workgroups.x * num_workgroups.y;
      let global_invocation_index =
         workgroup_index * wg_size +
         local_invocation_index;
      workgroupResult[global_invocation_index] = workgroup_id;
      localResult[global_invocation_index] = local_invocation_id;
      globalResult[global_invocation_index] = global_invocation_id;
    }
    `;
    
    async function main(){
      const adapter = await navigator.gpu?.requestAdapter();
      const device = await adapter?.requestDevice();
      if (!device) {
        console.error('need WebGPU');
        return;
      }
    
      const numWorkgroups = dispatchCount[0] * dispatchCount[1] * dispatchCount[2];
      const numResults = numWorkgroups * numThreadsPerWorkgroup;
      const size = numResults * 4 * 4;  // vec3f * u32
    
      let usage = GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_SRC;
      const workgroupBufferGPU = device.createBuffer({size, usage});
      const localBufferGPU = device.createBuffer({size, usage});
      const globalBufferGPU = device.createBuffer({size, usage});
    
      usage = GPUBufferUsage.MAP_READ | GPUBufferUsage.COPY_DST;
      const workgroupReadBufferGPU = device.createBuffer({size, usage});
      const localReadBufferGPU = device.createBuffer({size, usage});
      const globalReadBufferGPU = device.createBuffer({size, usage});
    
      const shaderModule = device.createShaderModule({code});
      const computePipeline = device.createComputePipeline({
        layout: 'auto',
        compute: {
          module: shaderModule,
          entryPoint: "test"
        }
      });
    
      const bindGroup = device.createBindGroup({
        layout: computePipeline.getBindGroupLayout(0),
        entries: [
          { binding: 0, resource: { buffer: workgroupBufferGPU }},
          { binding: 1, resource: { buffer: localBufferGPU }},
          { binding: 2, resource: { buffer: globalBufferGPU }},
        ],
      });
      const commandEncoder = device.createCommandEncoder();
      const computePassEncoder = commandEncoder.beginComputePass();
      computePassEncoder.setPipeline(computePipeline);
      computePassEncoder.setBindGroup(0, bindGroup);
      computePassEncoder.dispatchWorkgroups(...dispatchCount);
      computePassEncoder.end();
    
      commandEncoder.copyBufferToBuffer(workgroupBufferGPU, 0, workgroupReadBufferGPU, 0, size);
      commandEncoder.copyBufferToBuffer(localBufferGPU, 0, localReadBufferGPU, 0, size);
      commandEncoder.copyBufferToBuffer(globalBufferGPU, 0, globalReadBufferGPU, 0, size);
    
      device.queue.submit([commandEncoder.finish()]);
    
      await Promise.all([
        workgroupReadBufferGPU.mapAsync(GPUMapMode.READ),
        localReadBufferGPU.mapAsync(GPUMapMode.READ),
        globalReadBufferGPU.mapAsync(GPUMapMode.READ),
      ]);
    
      const workgroup = new Uint32Array(workgroupReadBufferGPU.getMappedRange());
      const local = new Uint32Array(localReadBufferGPU.getMappedRange());
      const global = new Uint32Array(globalReadBufferGPU.getMappedRange());
    
      const get3 = (arr, i) => {
        const off = i * 4;
        return `${arr[off]},${arr[off + 1]},${arr[off + 2]}`;
      };
    
      for (let i = 0; i < numResults; ++i) {
        if (i % numThreadsPerWorkgroup === 0) {
          log(`g-index   workgroup  local  global   dispatch: ${i / numThreadsPerWorkgroup}`);
        }
        log(`${i.toString().padStart(3)}:      ${get3(workgroup, i)}      ${get3(local, i)}   ${get3(global, i)}`)
      }
    }
    
    function log(...args) {
      const elem = document.createElement('pre');
      elem.textContent = args.join(' ');
      document.body.appendChild(elem);
    }
    
    main();
    pre { margin: 0; }

    notice in the first dispatch global and local are the same. For the rest of the dispatches they aren't

    Also notice, workgroup_id is the same for all threads in the same workgroup.

    I don't know all the reasons why it's this way but, within a single workgroup the threads can share var<workgroup> variables.

    note: you do not need to call await device.queue.onSubmittedWorkDone. From the spec:

    Under step 4 of mapAsync:

    after the completion of currently-enqueued operations that use this,

    ...

    Issue the map success steps on the contentTimeline.