I'm writing a numerical integration program which implements the trapezoidal rule with adaptive step size. Without going too much into detail, the algorithm uses recursion to compute the integral of the coded mathematical function over a given interval with a specified relative tolerance. I have simplified the code for posting but kept all the essential points, so some parts may seem unnecessary or overcomplicated. Here it is:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cmath>
#include <iostream>
#include <iomanip>
class Integral {
public:
double value; // the value of the integral
__device__ __host__ Integral() : value(0) {};
__device__ __host__ Integral& operator+=(const Integral &I);
};
__device__ Integral trapezoid(double a, double b, double tolerance, double fa, double fb);
__device__ __host__ double f(double x); // the integrand function
const int BLOCKS = 1;
const int THREADS = 1;
__global__ void controller(Integral *blockIntegrals, double a, double b, double tolerance) {
extern __shared__ Integral threadIntegrals[]; // an array of thread-local integrals
double fa = f(a), fb = f(b);
threadIntegrals[threadIdx.x] += trapezoid(a, b, tolerance, fa, fb);
blockIntegrals[blockIdx.x] += threadIntegrals[0];
}
int main() {
// *************** Input parameters ***************
double a = 1, b = 800; // integration bounds
double tolerance = 1e-7;
// ************************************************
cudaError cudaStatus;
Integral blockIntegrals[BLOCKS]; // an array of total integrals computed by each block
Integral *devBlockIntegrals;
cudaStatus = cudaMalloc((void**)&devBlockIntegrals, BLOCKS * sizeof(Integral));
if (cudaStatus != cudaSuccess)
std::cout << "cudaMalloc failed!\n";
double estimate = 0; // a rough 10-point estimate of the whole integral
double h = (b - a) / 10;
for (int i = 0; i < 10; i++)
estimate += f(a + i*h);
estimate *= h;
tolerance *= estimate; // compute relative tolerance
controller<<<BLOCKS, THREADS, THREADS*sizeof(Integral)>>>(devBlockIntegrals, a, b, tolerance);
cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess)
std::cout << "addKernel launch failed: " << cudaGetErrorString(cudaStatus) << "\n";
cudaStatus = cudaMemcpy(blockIntegrals, devBlockIntegrals, BLOCKS * sizeof(Integral), cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess)
std::cout << "cudaMemcpy failed: " << cudaGetErrorString(cudaStatus) << "\n";
Integral result; // final result
for (int i = 0; i < BLOCKS; i++) // final reduction that sums the results of all blocks
result += blockIntegrals[i];
std::cout << "Integral = " << std::setprecision(15) << result.value;
cudaFree(devBlockIntegrals);
getchar();
return 0;
}
__device__ __host__ double f(double x) {
return log(x);
}
__device__ Integral trapezoid(double a, double b, double tolerance, double fa, double fb) {
double h = b - a; // compute the new step
double I1 = h*(fa + fb) / 2; // compute the first integral
double m = (a + b) / 2; // find the middle point
double fm = f(m); // function value at the middle point
h = h / 2; // make step two times smaller
double I2 = h*(0.5*fa + fm + 0.5*fb); // compute the second integral
Integral I;
if (abs(I2 - I1) <= tolerance) { // if tolerance is satisfied
I.value = I2;
}
else { // if tolerance is not satisfied
if (tolerance > 1e-15) // check that we are not requiring too high precision
tolerance /= 2; // request higher precision in every half
I += trapezoid(a, m, tolerance, fa, fm); // integrate the first half [a m]
I += trapezoid(m, b, tolerance, fm, fb); // integrate the second half [m b]
}
return I;
}
__device__ __host__ Integral& Integral::operator+=(const Integral &I) {
this->value += I.value;
return *this;
}
For simplicity, I'm only using a single thread here. Now, if I run this code, I get a message "cudaMemcpy failed: an illegal memory access was encountered". When I run "cuda-memcheck" I get this error:
========= Invalid __local__ write of size 4
========= at 0x00000b18 in C:/Users/User/Desktop/Integrator Stack/Integrator_GPU/kernel.cu:73:controller(Integral*, double, double, double)
========= by thread (0,0,0) in block (0,0,0)
========= Address 0x00fff8ac is out of bounds
It says the problem is with line 73, which is just
double m = (a + b) / 2;
Could it be that at this point I am running out of memory?
If I make the integration interval smaller by changing the right bound from b = 800 to b = 700 in main, the program works just fine and gives the correct result.
Why am I getting the illegal memory access error when simply creating a new variable?
Also, I have an identical CPU version of this program and it works flawlessly, so the calculation algorithm is most probably correct.
Could it be that at this point I am running out of memory?
Not exactly. I would guess you are running out of call stack space as the recursion depth increases. The runtime allocates a preset per-thread call stack by default, typically around 1 kB (although it may vary with hardware and CUDA version). I don't think it will take long for that to be consumed once the recursion depth of that function gets past about 16.
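You can get a feel for the actual depth without touching the device code. Here is a quick host-only sketch (hypothetical, not part of your program) that mirrors the posted trapezoid() and f() but carries a depth counter, so you can see how deep the subdivision goes for a = 1, b = 800 and the same relative tolerance:
#include <algorithm>
#include <cmath>
#include <iostream>
static int maxDepth = 0; // deepest recursion level reached
double fHost(double x) { return std::log(x); }
double trapezoidHost(double a, double b, double tolerance,
                     double fa, double fb, int depth) {
    maxDepth = std::max(maxDepth, depth);
    double h = b - a;
    double I1 = h * (fa + fb) / 2;                        // one-panel trapezoid
    double m = (a + b) / 2;
    double fm = fHost(m);
    double I2 = (h / 2) * (0.5 * fa + fm + 0.5 * fb);     // two-panel trapezoid
    if (std::abs(I2 - I1) <= tolerance)
        return I2;
    if (tolerance > 1e-15)
        tolerance /= 2;
    return trapezoidHost(a, m, tolerance, fa, fm, depth + 1)
         + trapezoidHost(m, b, tolerance, fm, fb, depth + 1);
}
int main() {
    double a = 1, b = 800, tolerance = 1e-7;
    double estimate = 0, h = (b - a) / 10;                // same rough 10-point estimate
    for (int i = 0; i < 10; i++)
        estimate += fHost(a + i * h);
    estimate *= h;
    tolerance *= estimate;
    double I = trapezoidHost(a, b, tolerance, fHost(a), fHost(b), 0);
    std::cout << "Integral = " << I << ", max recursion depth = " << maxDepth << "\n";
}
If the reported depth goes well past 16 for b = 800 but not for b = 700, the default device stack is the likely culprit.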
You can query the exact stack size per thread with cudaDeviceGetLimit and raise it with cudaDeviceSetLimit.
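In your program that would go in main, before the kernel launch, along these lines (a sketch only; the 16 kB value is an illustrative guess, size it to the recursion depth you actually need):
size_t stackSize = 0;
cudaStatus = cudaDeviceGetLimit(&stackSize, cudaLimitStackSize);
std::cout << "Default stack size per thread: " << stackSize << " bytes\n";
cudaStatus = cudaDeviceSetLimit(cudaLimitStackSize, 16 * 1024); // e.g. 16 kB per thread
if (cudaStatus != cudaSuccess)
    std::cout << "cudaDeviceSetLimit failed: " << cudaGetErrorString(cudaStatus) << "\n";
// ... then launch controller<<<...>>> as before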
Raising that limit might let you make the code work correctly at large recursion depths. In general, though, highly recursive code in CUDA isn't a great idea, and the compiler and hardware will do better with a conventional loop than with deeply recursive code.
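For completeness, here is one way the recursion could be turned into a loop: keep an explicit stack of subintervals in a per-thread local array and push/pop instead of recursing. This is only a rough sketch built around the Integral and f from your code (trapezoidIterative and MAX_STACK are made-up names and it has not been tested against your program); controller would call it in place of trapezoid:
#define MAX_STACK 64  // arbitrary guess; must cover the deepest subdivision you expect
struct Interval {
    double a, b, fa, fb, tolerance;
};
__device__ Integral trapezoidIterative(double a, double b, double tolerance,
                                       double fa, double fb) {
    Interval stack[MAX_STACK];                  // explicit stack in per-thread local memory
    int top = 0;
    stack[top++] = { a, b, fa, fb, tolerance };
    Integral I;
    while (top > 0) {
        Interval iv = stack[--top];
        double h = iv.b - iv.a;
        double I1 = h * (iv.fa + iv.fb) / 2;                      // one-panel trapezoid
        double m = (iv.a + iv.b) / 2;
        double fm = f(m);
        double I2 = (h / 2) * (0.5 * iv.fa + fm + 0.5 * iv.fb);   // two-panel trapezoid
        if (fabs(I2 - I1) <= iv.tolerance || top + 2 > MAX_STACK) {
            I.value += I2;                                        // accept this subinterval
        } else {
            double t = iv.tolerance > 1e-15 ? iv.tolerance / 2 : iv.tolerance;
            stack[top++] = { iv.a, m, iv.fa, fm, t };             // refine left half
            stack[top++] = { m, iv.b, fm, iv.fb, t };             // refine right half
        }
    }
    return I;
}
Besides not consuming the device call stack, this form makes the worst-case per-thread memory use explicit, which is much easier to reason about on the GPU.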