Why is my OpenGL compute shader so slow?

I have been building an OpenGL compute shader that implements ray tracing. Currently it just computes the pixel color by casting a ray against an array of triangles.

#version 430 core
// One scene triangle as stored in the triangles[] array of TriangleBuffer.
// The struct lives in a std430 buffer, where every vec3 member is aligned to
// 16 bytes: the host-side layout needs 4 bytes of padding after each vec3
// (11 members -> 176 bytes per triangle).
struct Triangle {
 // Corner positions.
 vec3 vertex1, vertex2, vertex3;
 // Per-corner colors, blended with barycentric weights in getTriangleColor().
 vec3 color1, color2, color3;
 // Per-corner normals; not read by the shader code shown here.
 vec3 normal1, normal2, normal3;
 // Edge vectors consumed by rayTriangleIntersection().
 // NOTE(review): presumably vertex2 - vertex1 and vertex3 - vertex1,
 // precomputed by the host -- confirm against the upload code.
 vec3 edge1, edge2;
};
// Read-only scene geometry, bound as a shader storage buffer at binding 0.
// numTriangles is the loop bound used in main(); triangles[] is the unsized
// array that loop indexes.
// NOTE(review): under std430 the array starts at Triangle's 16-byte base
// alignment, so the host must leave 12 bytes of padding after numTriangles --
// confirm the upload code does this.
layout (std430, binding = 0) readonly buffer TriangleBuffer {
 int numTriangles;
 Triangle triangles[];
};
// Camera state, bound as a shader storage buffer at binding 1.
// Declared readonly (like TriangleBuffer) because this shader only ever reads
// it; the qualifier documents that and lets the driver skip write hazards.
// Only inverseViewProjection is read directly (in unProject()); view and
// projection are passed to unProject() by main(), and cameraPosition is not
// read by the shader code shown here.
layout (std430, binding = 1, column_major) readonly buffer CameraBuffer {
 vec3 cameraPosition;          // followed by 4 bytes of std430 padding before `view`
 mat4 view;
 mat4 projection;
 mat4 inverseViewProjection;   // maps normalized device coordinates back to world space
};
// Render target (image unit 2); main() writes exactly one texel per invocation
// with imageStore().
layout (rgba8, binding = 2) writeonly uniform image2D outputImage;
// 1x1x1 work groups: every work group contains a single invocation, so each
// pixel needs its own work group. This is the main performance bottleneck --
// a larger local size (with the glDispatchCompute group counts reduced to
// match) lets many pixels share one work group.
layout (local_size_x = 1, local_size_y = 1, local_size_z = 1) in;
// Returns the barycentric weights of `closestIntersectionPoint` relative to
// triangles[triangleIndex], ordered as (weight of vertex1, weight of vertex2,
// weight of vertex3).
// The weights are obtained by solving the 2x2 system built from dot products
// of the two triangle edges and the vector from vertex1 to the point.
// There is no guard against a zero denominator (degenerate triangle).
vec3 getBarycentricCoords(int triangleIndex, vec3 closestIntersectionPoint) {
 vec3 origin = triangles[triangleIndex].vertex1;
 vec3 edgeA = triangles[triangleIndex].vertex2 - origin;
 vec3 edgeB = triangles[triangleIndex].vertex3 - origin;
 vec3 toPoint = closestIntersectionPoint - origin;

 float aa = dot(edgeA, edgeA);
 float ab = dot(edgeA, edgeB);
 float bb = dot(edgeB, edgeB);
 float pa = dot(toPoint, edgeA);
 float pb = dot(toPoint, edgeB);

 // Determinant of the 2x2 Gram matrix of the two edges.
 float denom = aa * bb - ab * ab;
 float weight2 = (bb * pa - ab * pb) / denom;
 float weight3 = (aa * pb - ab * pa) / denom;
 // The three weights sum to one, so the first follows from the other two.
 return vec3(1.0f - weight2 - weight3, weight2, weight3);
}
// Interpolates the three corner colors of triangles[triangleIndex] at
// `closestIntersectionPoint`, using the barycentric weights returned by
// getBarycentricCoords().
vec3 getTriangleColor(int triangleIndex, vec3 closestIntersectionPoint) {
 vec3 weights = getBarycentricCoords(triangleIndex, closestIntersectionPoint);
 vec3 blended = weights.x * triangles[triangleIndex].color1;
 blended += weights.y * triangles[triangleIndex].color2;
 blended += weights.z * triangles[triangleIndex].color3;
 return blended;
}
// Ray/triangle test in the Moller-Trumbore form, using the edge1/edge2 vectors
// stored with triangles[triangleIndex].
//   rayOrigin, rayDirection : the ray to test
//   triangleIndex           : index into triangles[]
//   intersectionPoint (out) : hit position; written ONLY when the function
//                             returns true
// Returns true when the ray hits the triangle at a ray parameter greater than
// the epsilon; returns false for rays (nearly) parallel to the triangle plane,
// hits outside the triangle, and hits at or behind the origin.
bool rayTriangleIntersection(vec3 rayOrigin, vec3 rayDirection, int triangleIndex, out vec3 intersectionPoint) {
 const float EPSILON = 0.00001;
 vec3 e1 = triangles[triangleIndex].edge1;
 vec3 e2 = triangles[triangleIndex].edge2;

 vec3 pvec = cross(rayDirection, e2);
 float det = dot(e1, pvec);
 // A determinant near zero means the ray is parallel to the triangle plane.
 if (abs(det) < EPSILON) {
  return false;
 }
 float invDet = 1.0 / det;

 // First barycentric coordinate of the hit.
 vec3 tvec = rayOrigin - triangles[triangleIndex].vertex1;
 float u = invDet * dot(tvec, pvec);
 if (u < 0.0 || u > 1.0) {
  return false;
 }

 // Second barycentric coordinate; u + v must stay inside the triangle.
 vec3 qvec = cross(tvec, e1);
 float v = invDet * dot(rayDirection, qvec);
 if (v < 0.0 || u + v > 1.0) {
  return false;
 }

 // Ray parameter of the hit; only hits in front of the origin count.
 float t = invDet * dot(e2, qvec);
 if (t > EPSILON) {
  intersectionPoint = rayOrigin + rayDirection * t;
  return true;
 }
 return false;
}
// Maps a window-space position back through inverseViewProjection.
//   win      : xy in pixels; z is used as given (it is NOT remapped, so the
//              caller supplies it already in normalized device coordinates)
//   model    : unused -- the function reads the global inverseViewProjection
//   proj     : unused, for the same reason
//   viewport : (x, y, width, height) of the viewport in pixels
// Returns the transformed position after the perspective divide.
vec3 unProject(vec3 win, mat4 model, mat4 proj, vec4 viewport) {
 // Pixel position -> [0, 1] across the viewport -> [-1, 1].
 vec2 normalized = (win.xy - viewport.xy) / viewport.zw;
 vec4 ndc = vec4(normalized * 2.0 - 1.0, win.z, 1.0);
 vec4 world = inverseViewProjection * ndc;
 return world.xyz / world.w;
}
// Entry point: one invocation shades one pixel of outputImage.
// Builds a world-space ray through the pixel, tests it against every triangle
// in triangles[], and stores the interpolated color of the nearest hit
// (alpha 1), or vec4(0) when nothing is hit.
void main() {
 ivec2 pixelCoord = ivec2(gl_GlobalInvocationID.xy);
 ivec2 imageExtent = imageSize(outputImage);
 // Invocations outside the image (possible when the dispatch size is rounded
 // up to a multiple of the work group size) have nothing to shade; leave
 // before doing any ray work.
 if (pixelCoord.x >= imageExtent.x || pixelCoord.y >= imageExtent.y) {
  return;
 }

 // Un-project the pixel at two depths to get the ray through it.
 vec4 viewport = vec4(0, 0, vec2(imageExtent));
 vec3 nearWindow = vec3(pixelCoord.x, pixelCoord.y, -1);
 // NOTE(review): 0.9518 is an unexplained depth constant; any depth beyond
 // the near one should yield the same direction after normalize() -- confirm.
 vec3 farWindow = vec3(pixelCoord.x, pixelCoord.y, 0.9518f);
 vec3 rayOrigin = unProject(nearWindow, view, projection, viewport);
 vec3 rayWorldFar = unProject(farWindow, view, projection, viewport);
 vec3 rayDirection = normalize(rayWorldFar - rayOrigin);

 // Nearest-hit search. Only the winning triangle's index and hit point are
 // tracked inside the loop; the color is evaluated once afterwards instead of
 // being recomputed for every tentative closest hit.
 vec3 intersectionPoint;
 vec3 closestIntersectionPoint = vec3(0,0,0);
 float closestIntersectionDistance = 999999999.0f;
 int closestTriangleIndex = -1;
 for (int triangleIndex = 0; triangleIndex < numTriangles; triangleIndex++) {
  if (rayTriangleIntersection(rayOrigin, rayDirection, triangleIndex, intersectionPoint)) {
   float intersectionDistance = distance(intersectionPoint, rayOrigin);
   if (intersectionDistance < closestIntersectionDistance) {
    closestIntersectionDistance = intersectionDistance;
    closestIntersectionPoint = intersectionPoint;
    closestTriangleIndex = triangleIndex;
   }
  }
 }

 if (closestTriangleIndex >= 0) {
  vec3 finalColor = getTriangleColor(closestTriangleIndex, closestIntersectionPoint);
  imageStore(outputImage, pixelCoord, vec4(finalColor, 1.0f));
 }
 else {
  // No triangle hit: transparent black background.
  imageStore(outputImage, pixelCoord, vec4(0));
 }
}

However, when running the shader I only get 30 fps, even though the scene contains only 20 triangles, so there must be a significant bottleneck somewhere in the code.

What optimizations can I make to increase the performance of the code? Why is there a bottleneck?

Solution

I managed to more than double my framerate by making the following modifications:

Increase the work group local size in the shader's layout declaration

For this I queried GL_MAX_COMPUTE_WORK_GROUP_INVOCATIONS and used its square root as both the x and y local size:

 GLint glMaxComputeWorkGroupInvocations = 0;
 glGetIntegerv(GL_MAX_COMPUTE_WORK_GROUP_INVOCATIONS, &glMaxComputeWorkGroupInvocations);
 LIGHTING_SHADER_LOCAL_SIZE_Y = LIGHTING_SHADER_LOCAL_SIZE_X = sqrt(glMaxComputeWorkGroupInvocations);

and update the layout sizes:

// The ${...} placeholders are substituted with the host-side local size values
// before the shader source is compiled.
layout (local_size_x = ${LIGHTING_SHADER_LOCAL_SIZE_X}, local_size_y = ${LIGHTING_SHADER_LOCAL_SIZE_Y}, local_size_z = 1) in;

Get pixelCoord based on group_id and local_id

 // Derive this invocation's pixel from its work group and its position inside
 // that group.
 // NOTE: GLSL defines gl_GlobalInvocationID as
 // gl_WorkGroupID * gl_WorkGroupSize + gl_LocalInvocationID, so `coords` equals
 // `globalId`; `globalId` itself is not used in the lines shown here.
 ivec3 groupId = ivec3(gl_WorkGroupID);
 ivec3 localId = ivec3(gl_LocalInvocationID);
 ivec3 globalId = ivec3(gl_GlobalInvocationID);
 ivec3 coords = groupId * ivec3(gl_WorkGroupSize) + localId;
 ivec2 pixelCoord = ivec2(coords.xy);

Update glDispatchCompute

// Round the group counts UP so the whole window is covered even when its size
// is not a multiple of the local size; plain integer division would leave the
// right/top edge pixels unrendered. Invocations that fall outside the image
// are harmless: OpenGL discards image stores to texels that do not exist.
// NOTE(review): assumes the window size and local size values are integers.
glDispatchCompute((windowWidth + LIGHTING_SHADER_LOCAL_SIZE_X - 1) / LIGHTING_SHADER_LOCAL_SIZE_X,
                  (windowHeight + LIGHTING_SHADER_LOCAL_SIZE_Y - 1) / LIGHTING_SHADER_LOCAL_SIZE_Y,
                  1);