I am new to cuda and trying to write a little code which should generate random points on a sphere. Here is the code.
__global__
void setup_kernel(curandStateMRG32k3a *state)
{
int id = threadIdx.x + blockIdx.x * blockDim.x;
curand_init(0, id, 0, &state[id]);
}
__global__
void computeRandomVectors(float* x, float* y, float* z, unsigned int numberOfElements,curandStateMRG32k3a *state)
{
float a,b;
unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
curandStateMRG32k3a localState = state[i];
if(i < numberOfElements)
{
a = curand_uniform(&localState);
b = curand_uniform(&localState);
while(a * a + b * b > 1.0f)
{
a = curand_uniform(&localState) * 2.0f - 1.0f;
b = curand_uniform(&localState) * 2.0f - 1.0f;
}
x[i] = 2.0f * a * sqrtf(1.0f - a * a - b * b);
y[i] = 2.0f * b * sqrtf(1.0f - a * a - b * b);
z[i] = 1.0f - 2.0f * (a * a + b * b);
}
}
void generatePointsOnASphere(thrust::host_vector<float>& h_x, thrust::host_vector<float>& h_y, thrust::host_vector<float>& h_z)
{
if(h_x.size() != h_y.size() && h_x.size() != h_z.size())
{
std::cout << "The three component vectors have unmatching size()" << std::endl;
return;
}
size_t size = h_x.size() * sizeof(float);
float* h_p_x = (float*) calloc(h_x.size(),sizeof(float));
float* h_p_y = (float*) calloc(h_x.size(),sizeof(float));
float* h_p_z = (float*) calloc(h_x.size(),sizeof(float));
if(h_p_x==NULL || h_p_y==NULL || h_p_z==NULL)
{
std::cout << "Host memory allocation failure" << std::endl;
return;
}
float* d_p_x;
float* d_p_y;
float* d_p_z;
if(cudaMalloc((void **)&d_p_x,size) != cudaSuccess ||
cudaMalloc((void **)&d_p_y,size) != cudaSuccess ||
cudaMalloc((void **)&d_p_z,size) != cudaSuccess)
{
std::string errorString(cudaGetErrorName(cudaGetLastError()));
std::cout << errorString << std::endl;
std::cout << "Device memory allocation failure" << std::endl;
return;
}
curandStateMRG32k3a *devStates;
if(cudaMalloc((void **)&devStates, h_x.size() * sizeof(curandStateMRG32k3a)) != cudaSuccess)
{
std::string errorString(cudaGetErrorName(cudaGetLastError()));
std::cout << errorString << std::endl;
std::cout << "Random generator states memory allocation failure" << std::endl;
return;
}
int threads = 256;
dim3 grid = size / threads;
setup_kernel<<<grid,threads>>>(devStates);
if(cudaMemcpy(d_p_x,h_p_x,size,cudaMemcpyHostToDevice) != cudaSuccess ||
cudaMemcpy(d_p_y,h_p_y,size,cudaMemcpyHostToDevice) != cudaSuccess ||
cudaMemcpy(d_p_z,h_p_z,size,cudaMemcpyHostToDevice) != cudaSuccess)
{
std::string errorString(cudaGetErrorName(cudaGetLastError()));
std::cout << errorString << std::endl;
std::cout << "Host to Device memory copy failure" << std::endl;
}
computeRandomVectors<<< grid, threads >>>(d_p_x,d_p_y,d_p_z,size / sizeof(float), devStates);
if(cudaMemcpy(h_p_x,d_p_x,size,cudaMemcpyDeviceToHost) != cudaSuccess ||
cudaMemcpy(h_p_y,d_p_y,size,cudaMemcpyDeviceToHost) != cudaSuccess ||
cudaMemcpy(h_p_z,d_p_z,size,cudaMemcpyDeviceToHost) != cudaSuccess)
{
std::string errorString(cudaGetErrorName(cudaGetLastError()));
std::cout << errorString << std::endl;
std::cout << "Device to Host memory copy failure" << std::endl;
}
for(size_t i = 0; i < h_x.size(); ++i)
{
h_x[i] = h_p_x[i];
h_y[i] = h_p_y[i];
h_z[i] = h_p_z[i];
}
free (h_p_x);
free (h_p_y);
free (h_p_z);
cudaFree (devStates);
cudaFree (d_p_x);
cudaFree (d_p_y);
cudaFree (d_p_z);
cudaDeviceReset();
}
This code works if the number of elements in the vectors is less than 4000 (I tried 1K,2K,3K and 4K). Than it gives me cuda Error Illegal Address in the first cudaMemcpy. I don't think I run out of memory, I am working with gtx 980 (4GB of global memory). Any idea how to fix this?
EDIT: The code after the suggested modifications is the following:
__global__
void setup_kernel(curandStateMRG32k3a *state, unsigned int numberOfElements)
{
int id = threadIdx.x + blockIdx.x * blockDim.x;
if(id < numberOfElements) curand_init(0, id, 0, &state[id]);
}
__global__
void computeRandomVectors(float* x, float* y, float* z, unsigned int numberOfElements,curandStateMRG32k3a *state)
{
float a,b;
unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
curandStateMRG32k3a localState = state[i];
if(i < numberOfElements)
{
a = curand_uniform(&localState);
b = curand_uniform(&localState);
while(a * a + b * b > 1.0f)
{
a = curand_uniform(&localState) * 2.0f - 1.0f;
b = curand_uniform(&localState) * 2.0f - 1.0f;
}
x[i] = 2.0f * a * sqrtf(1.0f - a * a - b * b);
y[i] = 2.0f * b * sqrtf(1.0f - a * a - b * b);
z[i] = 1.0f - 2.0f * (a * a + b * b);
}
}
void generatePointsOnASphere(thrust::host_vector<float>& h_x, thrust::host_vector<float>& h_y, thrust::host_vector<float>& h_z)
{
if(h_x.size() != h_y.size() && h_x.size() != h_z.size())
{
std::cout << "The three component vectors have unmatching size()" << std::endl;
return;
}
size_t size = h_x.size() * sizeof(float);
float* h_p_x = (float*) calloc(h_x.size(),sizeof(float));
float* h_p_y = (float*) calloc(h_x.size(),sizeof(float));
float* h_p_z = (float*) calloc(h_x.size(),sizeof(float));
if(h_p_x==NULL || h_p_y==NULL || h_p_z==NULL)
{
std::cout << "Host memory allocation failure" << std::endl;
return;
}
float* d_p_x;
float* d_p_y;
float* d_p_z;
if(cudaMalloc((void **)&d_p_x,size) != cudaSuccess ||
cudaMalloc((void **)&d_p_y,size) != cudaSuccess ||
cudaMalloc((void **)&d_p_z,size) != cudaSuccess)
{
std::string errorString(cudaGetErrorName(cudaGetLastError()));
std::cout << errorString << std::endl;
std::cout << "Device memory allocation failure" << std::endl;
return;
}
curandStateMRG32k3a *devStates;
if(cudaMalloc((void **)&devStates, h_x.size() * sizeof(curandStateMRG32k3a)) != cudaSuccess)
{
std::string errorString(cudaGetErrorName(cudaGetLastError()));
std::cout << errorString << std::endl;
std::cout << "Random generator states memory allocation failure" << std::endl;
return;
}
if(cudaMemcpy(d_p_x,h_p_x,size,cudaMemcpyHostToDevice) != cudaSuccess ||
cudaMemcpy(d_p_y,h_p_y,size,cudaMemcpyHostToDevice) != cudaSuccess ||
cudaMemcpy(d_p_z,h_p_z,size,cudaMemcpyHostToDevice) != cudaSuccess)
{
std::string errorString(cudaGetErrorName(cudaGetLastError()));
std::cout << errorString << std::endl;
std::cout << "Host to Device memory copy failure" << std::endl;
}
int threads = 512;
dim3 grid = (h_x.size() + threads - 1) / threads;
setup_kernel<<<grid,threads>>>(devStates, size / sizeof(float));
computeRandomVectors<<< grid, threads >>>(d_p_x,d_p_y,d_p_z,size / sizeof(float), devStates);
cudaDeviceSynchronize();
if(cudaMemcpy(h_p_x,d_p_x,size,cudaMemcpyDeviceToHost) != cudaSuccess ||
cudaMemcpy(h_p_y,d_p_y,size,cudaMemcpyDeviceToHost) != cudaSuccess ||
cudaMemcpy(h_p_z,d_p_z,size,cudaMemcpyDeviceToHost) != cudaSuccess)
{
std::string errorString(cudaGetErrorName(cudaGetLastError()));
std::cout << errorString << std::endl;
std::cout << "Device to Host memory copy failure" << std::endl;
}
for(size_t i = 0; i < h_x.size(); ++i)
{
h_x[i] = h_p_x[i];
h_y[i] = h_p_y[i];
h_z[i] = h_p_z[i];
}
free (h_p_x);
free (h_p_y);
free (h_p_z);
cudaFree (devStates);
cudaFree (d_p_x);
cudaFree (d_p_y);
cudaFree (d_p_z);
cudaDeviceReset();
}
I feel sorry for keeping posting here but I think by understanding what are my mistakes now I think I might get a better understanding of cuda. So, now I am getting errorIllegalAdress on cudaMemcpy device->host when h_x.size() is 20k. I still do not understand how the code works for small numbers but not for big ones.
The problem is here:
size_t size = h_x.size() * sizeof(float);
...
int threads = 256;
dim3 grid = size / threads;
Your size
variable is scaled by the number of bytes. So that is not the correct variable to use for the grid size. You should compute the grid size like this:
dim3 grid = h_x.size() / threads;
or similar. Also note that this construct won't properly initialize all curand state unless the vector length (h_x.size()
) is evenly divisible by threads
i.e. 256. The method to address this would be to include a thread check in your setup_kernel
similar to the one in your other kernel:
__global__
void setup_kernel(curandStateMRG32k3a *state, int size)
{
int id = threadIdx.x + blockIdx.x * blockDim.x;
if (id < size)
curand_init(0, id, 0, &state[id]);
}
and launch enough threads to cover the vector size:
dim3 grid = (h_x.size()+threads-1) / threads;