Why is my compute shader not adjusting the positions of all of the input particles? Vulkan/GLSL/C++

I have just about figured out compute shader implementation with Vulkan. However, I am struggling to understand why only a fraction of the particles that I input to the shader are updating. int PARTICLE_COUNT = 32000 in the video.

YouTube upload of the problem. Please forgive me for not using Imgur, it has not been working for me for the last few hours and will not let me create any accessible uploads.

The compute shader code is below:

#version 450

struct camera {
    mat4 view;
    mat4 proj;
    vec3 position;
};

layout(binding = 0) uniform UniformBufferObject {
    float dt;
    mat4 model;
    camera cam;  
} ubo;

struct Particle {
    vec4 position;
    vec4 color;
    vec4 velocity; 
};

layout(std140, set = 2, binding = 0) readonly buffer inSSBO {
   Particle particlesIn[ ];
};

layout(std140, set = 2, binding = 1) buffer outSSBO {
   Particle particlesOut[ ];
};

layout (local_size_x = 10, local_size_y = 10, local_size_z = 10) in;

// Organization and Indexing
uvec3 nWG = gl_NumWorkGroups;
uvec3 sWG = gl_WorkGroupSize;
uint i = gl_WorkGroupID.x + (nWG.x * gl_WorkGroupID.y) + (nWG.x * nWG.y * gl_WorkGroupID.z);
uint j = gl_LocalInvocationID.x + (sWG.x * gl_LocalInvocationID.y) + (sWG.x * sWG.y *gl_LocalInvocationID.z);
    
// Globals
const float c = 1.0f;

// Calculates acceleration towards a position
vec3 Gravity(vec3 p1, vec3 p2, float m1, float m2) {
    vec3 rN = normalize(p2 - p1);
    float dist2 = distance(p2, p1);
    dist2 *= dist2;
    return rN * ((m1 * m2) / (dist2));
}

void main() 
{
    particlesOut[i].position.xyz = particlesIn[i].position.xyz;
    
    // Kinematic Motion of the Elements of the System
    vec3 Acceleration;
    if (i != j)
    {// Particle Interaction Calculations
        // Interacting Particle Properties
        float m0 = 1.f;
        float m1 = 1.f;
        vec3 p0 = particlesIn[i].position.xyz;
        vec3 p1 = particlesIn[j].position.xyz;
        
        // Velocity Calculation
        particlesOut[i].velocity.xyz += Gravity(p0, p1, m0, m1) * ubo.dt;

        if (length(particlesOut[i].velocity) > c/2)
        {// Sets the Velocity Maximum to the Speed of Light (divided by two bc ITS TOO FAST)
            normalize(particlesOut[i].velocity);
            particlesOut[i].velocity *= c/2;
        }

        particlesOut[i].position.xyz += particlesOut[i].velocity.xyz * ubo.dt;

        // Flip movement at volume border
        if ((particlesOut[i].position.x <= -1.0) || (particlesOut[i].position.x >= 1.0)) {
            particlesOut[i].velocity.x = -particlesOut[i].velocity.x;
        }
        if ((particlesOut[i].position.y <= -1.0) || (particlesOut[i].position.y >= 1.0)) {
            particlesOut[i].velocity.y = -particlesOut[i].velocity.y;
        }
        if ((particlesOut[i].position.z <= -1.0) || (particlesOut[i].position.z >= 1.0)) {
            particlesOut[i].velocity.z = -particlesOut[i].velocity.z;
        }
    }   
}

The compute shader dispatch code is below:

void computeCommand(VkCommandBuffer& commandBuffer, uint32_t setCount, VkDescriptorSet* sets) {
    VkCommandBufferBeginInfo beginInfo
    { VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };

    if (vkBeginCommandBuffer(commandBuffer, &beginInfo) != VK_SUCCESS) {
        throw std::runtime_error("failed to begin recording command buffer!");
    }

    vkCmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, mPipeline);

    vkCmdBindDescriptorSets(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, mLayout, 0, setCount, sets, 0, nullptr);

    vkCmdDispatch(commandBuffer, PARTICLE_COUNT / (1000), PARTICLE_COUNT / (1000), PARTICLE_COUNT / (1000));

    if (vkEndCommandBuffer(commandBuffer) != VK_SUCCESS) {
        throw std::runtime_error("failed to record compute command buffer!");
    }
}

and the final potential culprits, the Particle struct and the data buffer code:

struct Particle {
    glm::vec4 position;
    glm::vec4 color;
    glm::vec4 velocity;

    const static VkPrimitiveTopology topology = VK_PRIMITIVE_TOPOLOGY_POINT_LIST;
    static VkVertexInputBindingDescription vkCreateBindings() {
        VkVertexInputBindingDescription bindingDescription{};
        bindingDescription.binding = 0;
        bindingDescription.stride = sizeof(Particle);
        bindingDescription.inputRate = VK_VERTEX_INPUT_RATE_VERTEX;

        return bindingDescription;
    }

    static std::array<VkVertexInputAttributeDescription, 2> vkCreateAttributes() {
        std::array<VkVertexInputAttributeDescription, 2> attributeDescriptions{};

        attributeDescriptions[0].binding = 0;
        attributeDescriptions[0].location = 0;
        attributeDescriptions[0].format = VK_FORMAT_R32G32B32A32_SFLOAT;
        attributeDescriptions[0].offset = offsetof(Particle, position);

        attributeDescriptions[1].binding = 0;
        attributeDescriptions[1].location = 1;
        attributeDescriptions[1].format = VK_FORMAT_R32G32B32A32_SFLOAT;
        attributeDescriptions[1].offset = offsetof(Particle, color);

        return attributeDescriptions;
    }
    static VkPipelineVertexInputStateCreateInfo vkCreateVertexInput() {
        static auto bindingDescription = vkCreateBindings();
        static auto attributeDescriptions = vkCreateAttributes();

        VkPipelineVertexInputStateCreateInfo vertexInputInfo
        { VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO };
        vertexInputInfo.vertexBindingDescriptionCount = 1;
        vertexInputInfo.vertexAttributeDescriptionCount = static_cast<uint32_t>(attributeDescriptions.size());
        vertexInputInfo.pVertexBindingDescriptions = &bindingDescription;
        vertexInputInfo.pVertexAttributeDescriptions = attributeDescriptions.data();
        return vertexInputInfo;
    }
};

// SSBO struct initializes and stores an std::vector<Particle> particles;

void createDataBuffer(SSBO& ssbo) {
    void* data;
    VkBuffer stagingBuffer;
    VkDeviceMemory stagingBufferMemory;

    Buffer.resize(MAX_FRAMES_IN_FLIGHT);
    Memory.resize(MAX_FRAMES_IN_FLIGHT);

    bufferSize = sizeof(Particle)*PARTICLE_COUNT;

    createBuffer(stagingBuffer, stagingBufferMemory,
        VK_BUFFER_USAGE_TRANSFER_SRC_BIT,
        VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT);
    
    vkMapMemory(VkGPU::device, stagingBufferMemory, 0, bufferSize, 0, &data);
    memcpy(data, ssbo.particles.data(), (size_t)bufferSize);
    vkUnmapMemory(VkGPU::device, stagingBufferMemory);
    
    for (size_t i = 0; i < MAX_FRAMES_IN_FLIGHT; i++) {
        createBuffer(Buffer[i], Memory[i],
            VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_VERTEX_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT,
            VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT);

        copyBuffer(stagingBuffer, Buffer[i]);
    }

    vkDestroyBuffer(VkGPU::device, stagingBuffer, nullptr);
    vkFreeMemory(VkGPU::device, stagingBufferMemory, nullptr);
}

The problem is much more apparent when PARTICLE_COUNT = 2000. Even fewer particles are moving. At most ten out of the 2000 particles. Again, please forgive my use of YouTube for my uploads.

I have a feeling the problem is with my indexing within the compute shader, but I am not totally sure. My other thought was that the number of particles distributed to the workgroups might have been a source of the problem, but decreasing the particle count only made it more apparent.

Edit: Fixed a line in the compute shader that did not accurately update the positions of the ParticlesOut[] object, but that was a remnant from testing the SSBOs readonly property. Fixing that line has not made any difference in fixing the problem.

Solution

Sometimes it's good to take a break after working hard. After a good meal and some rest, I was able to look at my code with fresh eyes and figure out what I was doing wrong.

In dispatching the compute shader, I gave each work-group axis too few particles to work on. My original dispatch command was:

vkCmdDispatch(commandBuffer, PARTICLE_COUNT / (1000), PARTICLE_COUNT / (1000), PARTICLE_COUNT / (1000));

Whereas the layout for my compute shader was:

layout (local_size_x = 10, local_size_y = 10, local_size_z = 10) in;

Updating the dispatch command to the following line of code gives the appropriate number of particles to work on:

vkCmdDispatch(commandBuffer, PARTICLE_COUNT / (100), PARTICLE_COUNT / (100), PARTICLE_COUNT / (100));

I believe the problem stemmed from giving each workgroup too few particles at first, so not every particle was being processed. I tested this by lowering the dispatch command's groupCount to PARTICLE_COUNT / (10) which resulted in a horrendous framerate drop since each invocation had to process 10x more particles.

I am not totally clear as to why or how the math works behind the 3D workgroups, but it seems to have to do with the size of the other two workgroup axes, where the divisor is equal to the product of the other local workgroups. I.E. local_size_y = 10 and local_size_z = 10, so the divisor is equal to 10*10 or 100. I would appreciate it if someone could better explain the math behind calculating the groupCount as I do not fully understand it beyond what I could explain here.