Search code examples
c++openglglslraytracingcompute-shader

GLSL Compute Shader doesn't run for large inputs


The shader takes an SSBO of Photons that have a position, direction, wavelength and intensity and each thread is responsible for tracing exactly one photon through the grid, where at each grid cell the photon hits, the intensity is accumulated for each wavelength to create a spectral distribution for each grid cell.

The problem is that the shader works perfectly for 100,000 photons, but doesn't return a result for 1,000,000 photons.

I looked into the sizes for the SSBOs and all were within my GPUs (NVIDIA Quadro P6000) limits of 2GB:

  • SSBO Grid size: 1.5GB
  • SSBO Photons Size: 0.02GB

If I change the logic at some places it works with one million photons (see lines 87 and 114 for comments).

I currently can't see any explanation of why the shader fails for 1,000,000 photons, but works for 100,000 photons. The logic is the same and the buffer sizes are within limits. (That the buffer size can't be a problem is also confirmed by that it works when changing the logic.)

Below is the source code. If you want to try it yourself here is the code on github: https://github.com/TheJhonny007/TextureTracerDebug

Compute Shader:

#version 430

#extension GL_EXT_compute_shader: enable
#extension GL_EXT_shader_storage_buffer_object: enable
#extension GL_ARB_compute_variable_group_size: enable

const uint TEX_WIDTH = 1024u;
const uint TEX_HEIGHT = TEX_WIDTH;

const uint MIN_WAVELENGTH = 380u;
const uint MAX_WAVELENGTH = 740u;
const uint NUM_WAVELENGTHS = MAX_WAVELENGTH - MIN_WAVELENGTH;

// Size: 24 bytes -> ~40,000,000 photons per available gigabyte of ram
struct Photon {
    vec2 position;// m
    vec2 direction;// normalized
    uint wavelength;// nm
    float intensity;// 0..1 should start at 1
};

layout(std430, binding = 0) buffer Photons {
    Photon photons[];
};

// Size: 1440 bytes -> ~700,000 pixels per available gigabyte of ram
struct Pixel {
    uint intensityAtWavelengths[NUM_WAVELENGTHS];// [0..1000]
};

layout(std430, binding = 1) buffer Pixels {
//Pixel pixels[TEX_WIDTH][TEX_HEIGHT];
// NVIDIAs linker takes ages to link if the sizes are specified :(
    Pixel[] pixels;
};

uniform float xAxisScalingFactor;

vec2 getHorizontalRectangleAt(int i) {
    float x = pow(float(i), xAxisScalingFactor);
    float w = pow(float(i + 1), xAxisScalingFactor);
    return vec2(x, w);
}

uniform float rectangleHeight;

struct Rectangle {
    float x;
    float y;
    float w;
    float h;
};

layout (local_size_variable) in;

void addToPixel(uvec2 idx, uint wavelength, uint intensity) {
    if (idx.x >= 0u && idx.x < TEX_WIDTH && idx.y >= 0u && idx.y < TEX_HEIGHT) {
        uint index = (idx.y * TEX_WIDTH) + idx.x;
        atomicAdd(pixels[index].intensityAtWavelengths[wavelength - MIN_WAVELENGTH], intensity);
    }
}

/// Returns the rectangle at the given indices.
Rectangle getRectangleAt(ivec2 indices) {
    vec2 horRect = getHorizontalRectangleAt(indices.x);
    return Rectangle(horRect.x, rectangleHeight * float(indices.y), horRect.y, rectangleHeight);
}

uniform float shadowLength;
uniform float shadowHeight;

/// Returns the indices of the rectangle at the given location
ivec2 getRectangleIdxAt(vec2 location) {
    int x = 0;
    int y = int(location.y / rectangleHeight);
    return ivec2(x, y);
}

float getRayIntersectAtX(Photon ray, float x) {
    float slope = ray.direction.y / ray.direction.x;
    return slope * (x - ray.position.x) + ray.position.y;
}

ivec2 getRayRectangleExitEdge(Photon ray, Rectangle rect) {
    float intersectHeight = getRayIntersectAtX(ray, rect.x + rect.w);

    // IF ONE OF THE FIRST TWO CONDITIONS GETS REMOVED IT WORKS WITH 1'000'000 PHOTONS OTHERWISE ONLY 100'000 WHY?
    if (intersectHeight < rect.y) {
        return ivec2(0, -1);
    } else if (intersectHeight > rect.y + rect.h) {
        return ivec2(0, 1);
    } else {
        return ivec2(1, 0);
    }
}

void main() {
    uint gid = gl_GlobalInvocationID.x;
    if (gid >= photons.length()) return;

    Photon photon = photons[gid];

    ivec2 photonTexIndices = getRectangleIdxAt(photon.position);
    while (photonTexIndices.x < TEX_WIDTH && photonTexIndices.y < TEX_HEIGHT &&
    photonTexIndices.x >= 0        && photonTexIndices.y >= 0) {
        // need to convert to uint for atomic add operations...
        addToPixel(uvec2(photonTexIndices), photon.wavelength, uint(photon.intensity * 100.0));

        ivec2 dir = getRayRectangleExitEdge(photon, getRectangleAt(photonTexIndices));
        photonTexIndices += dir;

        // When the ray goes out of bounds on the bottom then mirror it to simulate rays coming from
        // the other side of the planet. This works because of the rotational symmetry of the system.
        // IF COMMENTET OUT IT WORKS WITH 1'000'000 PHOTONS OTHERWISE ONLY 100'000 WHY?
        if (photonTexIndices.y < 0) {
            photonTexIndices.y = 0;
            photon.position.y *= -1.0;
            photon.direction.y *= -1.0;
        }
    }
}

Tracer.hpp

#ifndef TEXTURE_TRACER_HPP
#define TEXTURE_TRACER_HPP

#include <glm/glm.hpp>
#include <random>

namespace gpu {

    // 6 * 4 = 24 Bytes
    struct Photon {
        glm::vec2 position;  // m
        glm::vec2 direction; // normalized
        uint32_t waveLength; // nm
        float intensity;     // 0..1 should start at 1
    };

    class TextureTracer {
    public:
        TextureTracer();
        uint32_t createShadowMap(size_t numPhotons);

    private:
        void initTextureTracer();
        void traceThroughTexture(uint32_t ssboPhotons, size_t numPhotons);
        Photon emitPhoton();
        std::vector<Photon> generatePhotons(uint32_t count);

        struct {
            uint32_t uRectangleHeight;
            uint32_t uShadowLength;
            uint32_t uShadowHeight;
            uint32_t uXAxisScalingFactor;
        } mTextureTracerUniforms;

        uint32_t mTextureTracerProgram;

        std::mt19937_64 mRNG;
        std::uniform_real_distribution<> mDistributionSun;
        std::uniform_int_distribution<uint32_t> mDistributionWavelength;
        std::bernoulli_distribution mDistributionBoolean;
    };

} // namespace gpu

#endif // TEXTURE_TRACER_HPP

Tracer.cpp

#include "TextureTracer.hpp"
#include <GL/glew.h>
#include <algorithm>
#include <fstream>
#include <iostream>
#include <random>
#include <string>
#include <vector>

void GLAPIENTRY MessageCallback(GLenum source, GLenum type, GLuint id,
                                GLenum severity, GLsizei length,
                                const GLchar *message, const void *userParam) {
  if (type == GL_DEBUG_TYPE_ERROR)
    fprintf(stderr, "GL ERROR: type = 0x%x, severity = 0x%x, message = %s\n",
            type, severity, message);
  else
    fprintf(stdout, "GL INFO: type = 0x%x, severity = 0x%x, message = %s\n",
            type, severity, message);
}

namespace gpu {
const double TEX_HEIGHT_TO_RADIUS_FACTOR = 4;
const double TEX_SHADOW_LENGTH_FACTOR = 8;

const uint32_t TEX_WIDTH = 1024u;
const uint32_t TEX_HEIGHT = TEX_WIDTH;

const double RADIUS = 6'371'000.0;
const double RADIUS_FACTORED = RADIUS * TEX_HEIGHT_TO_RADIUS_FACTOR;

const double SUN_RADIUS = 695'510'000.0;
const double DIST_TO_SUN = 149'600'000'000.0;
const double ATMO_HEIGHT = 42'000.0;

std::string loadShader(const std::string &fileName) {
  std::ifstream shaderFileStream(fileName, std::ios::in);
  if (!shaderFileStream.is_open()) {
    std::cerr << "Could not load the GLSL shader from '" << fileName << "'!"
              << std::endl;
    exit(-1);
  }

  std::string shaderCode;
  while (!shaderFileStream.eof()) {
    std::string line;
    std::getline(shaderFileStream, line);
    shaderCode.append(line + "\n");
  }

  return shaderCode;
}

void TextureTracer::initTextureTracer() {
  mTextureTracerProgram = glCreateProgram();
  uint32_t rayTracingComputeShader = glCreateShader(GL_COMPUTE_SHADER);

  std::string code = loadShader("../resources/TextureTracer.glsl");
  const char *shader = code.c_str();
  glShaderSource(rayTracingComputeShader, 1, &shader, nullptr);
  glCompileShader(rayTracingComputeShader);

  glAttachShader(mTextureTracerProgram, rayTracingComputeShader);
  glLinkProgram(mTextureTracerProgram);

  mTextureTracerUniforms.uRectangleHeight =
      glGetUniformLocation(mTextureTracerProgram, "rectangleHeight");
  mTextureTracerUniforms.uShadowHeight =
      glGetUniformLocation(mTextureTracerProgram, "shadowHeight");
  mTextureTracerUniforms.uShadowLength =
      glGetUniformLocation(mTextureTracerProgram, "shadowLength");
  mTextureTracerUniforms.uXAxisScalingFactor =
      glGetUniformLocation(mTextureTracerProgram, "xAxisScalingFactor");

  glDetachShader(mTextureTracerProgram, rayTracingComputeShader);
  glDeleteShader(rayTracingComputeShader);
}

TextureTracer::TextureTracer()
    : mRNG(1L), mDistributionSun(
                    std::uniform_real_distribution<>(-SUN_RADIUS, SUN_RADIUS)),
      mDistributionWavelength(
          std::uniform_int_distribution<uint32_t>(380, 739)),
      mDistributionBoolean(std::bernoulli_distribution(0.5)) {
  glEnable(GL_DEBUG_OUTPUT);
  glDebugMessageCallback(MessageCallback, nullptr);

  initTextureTracer();
}

double raySphereDistance(glm::dvec2 origin, glm::dvec2 direction,
                         glm::dvec2 center, double radius) {
  glm::dvec2 m = origin - center;
  double b = glm::dot(m, direction);
  double c = glm::dot(m, m) - (radius * radius);
  if (c > 0.0 && b > 0.0)
    return -1.0;

  double discr = b * b - c;

  // A negative discriminant corresponds to ray missing sphere
  if (discr < 0.0)
    return -1.0;

  // Ray now found to intersect sphere, compute smallest t value of intersection
  return glm::max(0.0, -b - glm::sqrt(discr));
}

Photon TextureTracer::emitPhoton() {
  std::uniform_real_distribution<> distributionEarth(0.0, ATMO_HEIGHT);
  glm::dvec2 target = {0.0, RADIUS + distributionEarth(mRNG)};

  double d;
  do {
    d = glm::length(glm::dvec2(mDistributionSun(mRNG), mDistributionSun(mRNG)));
  } while (d > SUN_RADIUS);

  glm::dvec2 startPosition =
      glm::dvec2(-DIST_TO_SUN, mDistributionBoolean(mRNG) ? d : -d);
  glm::dvec2 direction = glm::normalize(target - startPosition);

  startPosition +=
      direction * raySphereDistance(startPosition, direction, {0.0, 0.0},
                                    RADIUS + ATMO_HEIGHT);

  return {glm::vec2(0.0, startPosition.y), glm::vec2(direction),
          mDistributionWavelength(mRNG), 1.0f};
}

std::vector<Photon> TextureTracer::generatePhotons(uint32_t count) {
  std::vector<Photon> photons(count);
  std::generate(photons.begin(), photons.end(),
                [this]() { return emitPhoton(); });
  return photons;
}

void TextureTracer::traceThroughTexture(uint32_t ssboPhotons,
                                        size_t numPhotons) {
  glUseProgram(mTextureTracerProgram);

  glUniform1f(mTextureTracerUniforms.uRectangleHeight,
              RADIUS_FACTORED / TEX_HEIGHT);

  const double shadowLength =
      TEX_SHADOW_LENGTH_FACTOR * (DIST_TO_SUN * RADIUS) / (SUN_RADIUS - RADIUS);

  glUniform1f(mTextureTracerUniforms.uShadowLength, shadowLength);
  glUniform1f(mTextureTracerUniforms.uShadowHeight, RADIUS_FACTORED);

  const double xAxisScalingFactor =
      glm::log(shadowLength) / glm::log(static_cast<double>(TEX_WIDTH));

  glUniform1f(mTextureTracerUniforms.uXAxisScalingFactor,
              static_cast<float>(xAxisScalingFactor));

  const uint32_t MIN_WAVELENGTH = 380u;
  const uint32_t MAX_WAVELENGTH = 740u;
  const uint32_t NUM_WAVELENGTHS = MAX_WAVELENGTH - MIN_WAVELENGTH;

  size_t pixelBufferSize =
      TEX_WIDTH * TEX_HEIGHT * NUM_WAVELENGTHS * sizeof(uint32_t);
  uint32_t ssboPixels;
  glGenBuffers(1, &ssboPixels);
  glBindBuffer(GL_SHADER_STORAGE_BUFFER, ssboPixels);
  glBufferData(GL_SHADER_STORAGE_BUFFER, pixelBufferSize, nullptr,
               GL_DYNAMIC_COPY);

  glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, ssboPhotons);
  glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, ssboPixels);

  const uint32_t numThreads = 32u;
  const uint32_t numBlocks = numPhotons / numThreads;
  std::cout << "numBlocks: " << numBlocks << std::endl;

  glDispatchComputeGroupSizeARB(numBlocks, 1, 1, numThreads, 1, 1);
  glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);

  struct Pixel {
    uint32_t intensityAtWavelengths[NUM_WAVELENGTHS];
  };

  std::vector<Pixel> pixels(TEX_WIDTH * TEX_HEIGHT);

  glBindBuffer(GL_SHADER_STORAGE_BUFFER, ssboPixels);
  glGetBufferSubData(GL_SHADER_STORAGE_BUFFER, 0, pixelBufferSize,
                     pixels.data());
  glBindBuffer(GL_SHADER_STORAGE_BUFFER, 0);

  for (int y = 0; y < TEX_HEIGHT; ++y) {
    printf("%4i | ", y);
    for (int x = 0; x < TEX_WIDTH; ++x) {
      Pixel p = pixels[y * TEX_WIDTH + x];
      int counter = 0;
      for (uint32_t i : p.intensityAtWavelengths) {
        counter += i;
      }

      if (counter == 0) {
        printf("  ");
      } else if (counter > 100'000'000) {
        printf("%4s", "\u25A0");
      } else if (counter > 10'000'000) {
        printf("%4s", "\u25A3");
      } else if (counter > 1'000'000) {
        printf("%4s", "\u25A6");
      } else if (counter > 100'000) {
        printf("%4s", "\u25A4");
      } else {
        printf("%4s", "\u25A1");
      }
    }

    std::cout << std::endl;
  }

  glDeleteBuffers(1, &ssboPixels);

  glUseProgram(0);
}

uint32_t TextureTracer::createShadowMap(size_t numPhotons) {
  std::vector<Photon> photons = generatePhotons(numPhotons);

  uint32_t ssboPhotons;
  glGenBuffers(1, &ssboPhotons);
  glBindBuffer(GL_SHADER_STORAGE_BUFFER, ssboPhotons);
  glBufferData(GL_SHADER_STORAGE_BUFFER, sizeof(Photon) * photons.size(),
               photons.data(), GL_DYNAMIC_COPY);

  traceThroughTexture(ssboPhotons, photons.size());

  glDeleteBuffers(1, &ssboPhotons);
  glDeleteProgram(mTextureTracerProgram);

  glDisable(GL_DEBUG_OUTPUT);
  glDebugMessageCallback(nullptr, nullptr);

  return 0;
}
}

main.cpp

#include <GL/glew.h>
#include <GL/glut.h>

#include "TextureTracer.hpp"

int main(int argc, char *argv[]) {
    glutInit(&argc, argv);
    glutCreateWindow("OpenGL needs a window o.O");
    glewInit();

    auto mapper = gpu::TextureTracer();

    // WITH 100'000 PHOTONS IT WORKS, WITH 1'000'000 PHOTONS NOT WHY?
    mapper.createShadowMap(100'000);

    return 0;
}

Solution

  • Operating systems cancel GPU program executions if they take too long. On Windows it is generally two seconds and on Linux it is five seconds most of the time, but it can vary.

    This is to detect GPU programs that are stuck and cancel them. There are different methods to get around this timeout, but they all require admin/root privileges, which is not always available.

    If possible the execution can be split up into multiple invocations like in the following snippet:

    const uint32_t passSize   = 2048u;
    const uint32_t numPasses  = (numPhotons / passSize) + 1;
    const uint32_t numThreads = 64u;
    const uint32_t numBlocks  = passSize / numThreads;
    
    glUniform1ui(glGetUniformLocation(mTextureTracerProgram, "passSize"), passSize);
    for (uint32_t pass = 0u; pass < numPasses; ++pass) {
      glUniform1ui(glGetUniformLocation(mTextureTracerProgram, "pass"), pass);
    
      glDispatchComputeGroupSizeARB(numBlocks, 1, 1, numThreads, 1, 1);
      glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);
      glFlush();
      glFinish();
    }
    

    The glFlush() and glFinish() calls are important or the executions will get bundled together and the OS triggers a timeout anyways.

    In the shader you just need to access the right sections of the input data like so:

    // other stuff
    
    uniform uint pass;
    uniform uint passSize;
    
    void main() {
      uint gid = gl_GlobalInvocationID.x;
      uint passId = pass * passSize + gid;
      if (passId >= photons.length()) return;
    
      Photon photon = photons[passId];
    
      // rest of program
    }
    

    This is all.

    If you want to disable the OS timeouts here is a relevant post for Linux: https://stackoverflow.com/a/30520538/5543884

    And here is a post regarding Windows: https://stackoverflow.com/a/29759823/5543884