I'm writing a numerical integration program which implements the trapezoidal rule with adaptive step size. Without going too much into detail, the algorithm uses recursion to compute the integral of the coded mathematical function over a given interval with a specified relative tolerance. I have simplified the code for posting but kept all the essential points, so some parts may seem unnecessary or overcomplicated. Here it is:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cmath>
#include <iostream>
#include <iomanip>
class Integral {
public:
double value; // the value of the integral
__device__ __host__ Integral() : value(0) {};
__device__ __host__ Integral& operator+=(const Integral &I);
};
__device__ Integral trapezoid(double a, double b, double tolerance, double fa, double fb);
__device__ __host__ double f(double x); // the integrand function
const int BLOCKS = 1;
const int THREADS = 1;
__global__ void controller(Integral *blockIntegrals, double a, double b, double tolerance) {
extern __shared__ Integral threadIntegrals[]; // an array of thread-local integrals
double fa = f(a), fb = f(b);
threadIntegrals[threadIdx.x] += trapezoid(a, b, tolerance, fa, fb);
blockIntegrals[blockIdx.x] += threadIntegrals[0];
}
int main() {
// *************** Input parameters ***************
double a = 1, b = 800; // integration bounds
double tolerance = 1e-7;
// ************************************************
cudaError cudaStatus;
Integral blockIntegrals[BLOCKS]; // an array of total integrals computed by each block
Integral *devBlockIntegrals;
cudaStatus = cudaMalloc((void**)&devBlockIntegrals, BLOCKS * sizeof(Integral));
if (cudaStatus != cudaSuccess)
std::cout << "cudaMalloc failed!\n";
double estimate = 0; // a rough 10-point estimate of the whole integral
double h = (b - a) / 10;
for (int i = 0; i < 10; i++)
estimate += f(a + i*h);
estimate *= h;
tolerance *= estimate; // compute relative tolerance
controller<<<BLOCKS, THREADS, THREADS*sizeof(Integral)>>>(devBlockIntegrals, a, b, tolerance);
cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess)
std::cout << "addKernel launch failed: " << cudaGetErrorString(cudaStatus) << "\n";
cudaStatus = cudaMemcpy(blockIntegrals, devBlockIntegrals, BLOCKS * sizeof(Integral), cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess)
std::cout << "cudaMemcpy failed: " << cudaGetErrorString(cudaStatus) << "\n";
Integral result; // final result
for (int i = 0; i < BLOCKS; i++) // final reduction that sums the results of all blocks
result += blockIntegrals[i];
std::cout << "Integral = " << std::setprecision(15) << result.value;
cudaFree(devBlockIntegrals);
getchar();
return 0;
}
__device__ __host__ double f(double x) {
return log(x);
}
__device__ Integral trapezoid(double a, double b, double tolerance, double fa, double fb) {
double h = b - a; // compute the new step
double I1 = h*(fa + fb) / 2; // compute the first integral
double m = (a + b) / 2; // find the middle point
double fm = f(m); // function value at the middle point
h = h / 2; // make step two times smaller
double I2 = h*(0.5*fa + fm + 0.5*fb); // compute the second integral
Integral I;
if (abs(I2 - I1) <= tolerance) { // if tolerance is satisfied
I.value = I2;
}
else { // if tolerance is not satisfied
if (tolerance > 1e-15) // check that we are not requiring too high precision
tolerance /= 2; // request higher precision in every half
I += trapezoid(a, m, tolerance, fa, fm); // integrate the first half [a m]
I += trapezoid(m, b, tolerance, fm, fb); // integrate the second half [m b]
}
return I;
}
__device__ __host__ Integral& Integral::operator+=(const Integral &I) {
this->value += I.value;
return *this;
}
For simplicity, I'm only using a single thread here. Now, if I run this code, I get a message "cudaMemcpy failed: an illegal memory access was encountered". When I run "cuda-memcheck" I get this error:
========= Invalid __local__ write of size 4
========= at 0x00000b18 in C:/Users/User/Desktop/Integrator Stack/Integrator_GPU/kernel.cu:73:controller(Integral*, double, double, double)
========= by thread (0,0,0) in block (0,0,0)
========= Address 0x00fff8ac is out of bounds
It says the problem is with line 73, which is just
double m = (a + b) / 2;
Could it be that at this point I am running out of memory?
If I make the integration interval smaller by changing the right bound from b = 800 to b = 700 in main, the program works just fine and gives the correct result.
Why am I getting the illegal memory access error when simply creating a new variable?
Also, I have an identical CPU version of this program and it works flawlessly, so the calculation algorithm is most probably correct.
Could it be that at this point I am running out of memory?
Not exactly. I would guess you are running out of call stack space as the recursion depth increases. The runtime allocates a preset per-thread call stack by default, typically around 1 kB (although it may vary with hardware and CUDA version). I don't think it will take long for that to be consumed once the recursion depth of that function gets past about 16.
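You can get a feel for the actual depth without touching the device code. Here is a quick host-only sketch (hypothetical, not part of your program) that mirrors the posted trapezoid() and f() but carries a depth counter, so you can see how deep the subdivision goes for a = 1, b = 800 and the same relative tolerance:
#include <algorithm>
#include <cmath>
#include <iostream>
static int maxDepth = 0; // deepest recursion level reached
double fHost(double x) { return std::log(x); }
double trapezoidHost(double a, double b, double tolerance,
                     double fa, double fb, int depth) {
    maxDepth = std::max(maxDepth, depth);
    double h = b - a;
    double I1 = h * (fa + fb) / 2;                        // one-panel trapezoid
    double m = (a + b) / 2;
    double fm = fHost(m);
    double I2 = (h / 2) * (0.5 * fa + fm + 0.5 * fb);     // two-panel trapezoid
    if (std::abs(I2 - I1) <= tolerance)
        return I2;
    if (tolerance > 1e-15)
        tolerance /= 2;
    return trapezoidHost(a, m, tolerance, fa, fm, depth + 1)
         + trapezoidHost(m, b, tolerance, fm, fb, depth + 1);
}
int main() {
    double a = 1, b = 800, tolerance = 1e-7;
    double estimate = 0, h = (b - a) / 10;                // same rough 10-point estimate
    for (int i = 0; i < 10; i++)
        estimate += fHost(a + i * h);
    estimate *= h;
    tolerance *= estimate;
    double I = trapezoidHost(a, b, tolerance, fHost(a), fHost(b), 0);
    std::cout << "Integral = " << I << ", max recursion depth = " << maxDepth << "\n";
}
If the reported depth goes well past 16 for b = 800 but not for b = 700, the default device stack is the likely culprit.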
You can query the exact stack size per thread with cudaDeviceGetLimit and raise it with cudaDeviceSetLimit.
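In your program that would go in main, before the kernel launch, along these lines (a sketch only; the 16 kB value is an illustrative guess, size it to the recursion depth you actually need):
size_t stackSize = 0;
cudaStatus = cudaDeviceGetLimit(&stackSize, cudaLimitStackSize);
std::cout << "Default stack size per thread: " << stackSize << " bytes\n";
cudaStatus = cudaDeviceSetLimit(cudaLimitStackSize, 16 * 1024); // e.g. 16 kB per thread
if (cudaStatus != cudaSuccess)
    std::cout << "cudaDeviceSetLimit failed: " << cudaGetErrorString(cudaStatus) << "\n";
// ... then launch controller<<<...>>> as before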
Raising that limit might let you make the code work correctly at large recursion depths. In general, though, highly recursive code in CUDA isn't a great idea, and the compiler and hardware will do better with a conventional loop than with deeply recursive code.
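For completeness, here is one way the recursion could be turned into a loop: keep an explicit stack of subintervals in a per-thread local array and push/pop instead of recursing. This is only a rough sketch built around the Integral and f from your code (trapezoidIterative and MAX_STACK are made-up names and it has not been tested against your program); controller would call it in place of trapezoid:
#define MAX_STACK 64  // arbitrary guess; must cover the deepest subdivision you expect
struct Interval {
    double a, b, fa, fb, tolerance;
};
__device__ Integral trapezoidIterative(double a, double b, double tolerance,
                                       double fa, double fb) {
    Interval stack[MAX_STACK];                  // explicit stack in per-thread local memory
    int top = 0;
    stack[top++] = { a, b, fa, fb, tolerance };
    Integral I;
    while (top > 0) {
        Interval iv = stack[--top];
        double h = iv.b - iv.a;
        double I1 = h * (iv.fa + iv.fb) / 2;                      // one-panel trapezoid
        double m = (iv.a + iv.b) / 2;
        double fm = f(m);
        double I2 = (h / 2) * (0.5 * iv.fa + fm + 0.5 * iv.fb);   // two-panel trapezoid
        if (fabs(I2 - I1) <= iv.tolerance || top + 2 > MAX_STACK) {
            I.value += I2;                                        // accept this subinterval
        } else {
            double t = iv.tolerance > 1e-15 ? iv.tolerance / 2 : iv.tolerance;
            stack[top++] = { iv.a, m, iv.fa, fm, t };             // refine left half
            stack[top++] = { m, iv.b, fm, iv.fb, t };             // refine right half
        }
    }
    return I;
}
Besides not consuming the device call stack, this form makes the worst-case per-thread memory use explicit, which is much easier to reason about on the GPU.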