So I wrote a Cuda version of an OpenCL program I wrote. The OpenCL versions works, meanwhile the Cuda version doesn't. Now converting OpenCL code to Cuda code isn't 1-to-1, but I'm confused as to why the cuda version wouldn't work after all I did base my code around an cuda example when translating it over.
I am getting an illegal memory access was encountered (error code # = 77)
during a cudaMemcpy(... cudaMemcpyDeviceToHost); (line 227) Although it's during a memcpy the problem appears to be an illegal memory access during the kernel run. Here is an example of what I get with cuda-memcheck checking the program:
========= Invalid __global__ read of size 4
========= at 0x000002b8 in MoveoutAndStackCuda(float*, float*, float*, int*, int*, int*, unsigned int, unsigned int, unsigned int)
========= by thread (53,0,0) in block (130,0,0)
========= Address 0x130718e590 is out of bounds
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame:/usr/lib64/libcuda.so.1 (cuLaunchKernel + 0x2c5) [0x204235]
========= Host Frame:./MoveoutAndStackCudaMVC [0x19a11]
========= Host Frame:./MoveoutAndStackCudaMVC [0x375b3]
========= Host Frame:./MoveoutAndStackCudaMVC [0x4059]
========= Host Frame:./MoveoutAndStackCudaMVC [0x3f0a]
========= Host Frame:./MoveoutAndStackCudaMVC [0x3f85]
========= Host Frame:./MoveoutAndStackCudaMVC [0x3438]
========= Host Frame:./MoveoutAndStackCudaMVC [0x36c9]
========= Host Frame:./MoveoutAndStackCudaMVC [0x3c46]
========= Host Frame:./MoveoutAndStackCudaMVC [0x3d4b]
========= Host Frame:/lib64/libc.so.6 (__libc_start_main + 0xfd) [0x1ed1d]
========= Host Frame:./MoveoutAndStackCudaMVC [0x2b69]
=========
========= Invalid __global__ read of size 4
========= at 0x000002b8 in MoveoutAndStackCuda(float*, float*, float*, int*, int*, int*, unsigned int, unsigned int, unsigned int)
========= by thread (52,0,0) in block (130,0,0)
========= Address 0x130718e590 is out of bounds
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame:/usr/lib64/libcuda.so.1 (cuLaunchKernel + 0x2c5) [0x204235]
========= Host Frame:./MoveoutAndStackCudaMVC [0x19a11]
========= Host Frame:./MoveoutAndStackCudaMVC [0x375b3]
========= Host Frame:./MoveoutAndStackCudaMVC [0x4059]
========= Host Frame:./MoveoutAndStackCudaMVC [0x3f0a]
========= Host Frame:./MoveoutAndStackCudaMVC [0x3f85]
========= Host Frame:./MoveoutAndStackCudaMVC [0x3438]
========= Host Frame:./MoveoutAndStackCudaMVC [0x36c9]
========= Host Frame:./MoveoutAndStackCudaMVC [0x3c46]
========= Host Frame:./MoveoutAndStackCudaMVC [0x3d4b]
========= Host Frame:/lib64/libc.so.6 (__libc_start_main + 0xfd) [0x1ed1d]
========= Host Frame:./MoveoutAndStackCudaMVC [0x2b69]
I don't understand the differences between Cuda and OpenCL well enough to know what I am doing wrong. I tried mucking around with MoveoutAndStackCuda<<<grid, threads>>>
and change it to something like MoveoutAndStackCuda<<<grid, threads, (localGroupSize * sizeof(float))>>>
but no luck. I've also tried commenting out parts of my kernel the problem appears to occur even when I have commented out most of my kernel.
Hopefully this is verifiable for you, but there is a chance that it isn't since it could depend on my hardware. I am running a Quadro M5000 on CentOS 6.8 (Final).
I tried to cut out as much stuff that is useless for this problem as possible. I would also provide the working OpenCL version of this MCV example however I am out of text. I recommend debugging using the arguments 100 50 40 for now, because I also have a problem of spawning too many global threads that I will tackle after this one is solved.
Here is the Minimal, Complete, and Verifiable example:
#include <math.h>
#include <sstream>
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>
#include <cuda.h>
#include <assert.h>
#include <unistd.h>
const bool _VERBOSE = true;
const bool _PRINT_ALLOC_SIZE = true;
const bool _PRINT_RUN_TIME = true;
const int MIN_LOCAL_SIZE = 8;
__global__ void MoveoutAndStackCuda(float prestackTraces[], float stackTracesOut[],
float powerTracesOut[], int startIndices[], int exitIndices[],
int sampleShift[], const unsigned int samplesPerT, const unsigned int readIns,
const unsigned int nOuts) {
unsigned int globalId = (blockIdx.x * blockDim.x) + threadIdx.x;
float stackF = 0.0;
float powerF = 0.0;
unsigned int readIndex = (globalId % samplesPerT);
unsigned int jobNum = (globalId / samplesPerT);
for (unsigned int x = 0; x < readIns; x++) {
unsigned int offsetIndex = x + (jobNum * readIns);
unsigned int startInd = startIndices[offsetIndex];
if ((readIndex >= startInd) && (readIndex < (exitIndices[offsetIndex] + startInd))) {
float value = prestackTraces[readIndex + (x * samplesPerT) + sampleShift[offsetIndex]];
stackF += value;
powerF += (value * value);
}
}
stackTracesOut[globalId] = stackF;
powerTracesOut[globalId] = powerF;
}
/*
* Single threaded version that somewhat mimics what is executed in the OpenCL code as close as possible.
*/
void MoveoutAndStackSingleThread(const float prestackTraces[], float stackTracesOut[],
float powerTracesOut[], const int startIndices[], const int exitIndices[], const int shift[],
const unsigned int samplesPerT, const unsigned int readIns, const unsigned int nOuts,
const unsigned int jobNum, const unsigned int readIndex) {
float stackF = 0.0f;
float powerF = 0.0f;
int outputIndex = readIndex + (jobNum * samplesPerT);
for (unsigned int x = 0; x < readIns; x++) {
unsigned int offsetIndex = x + (jobNum * readIns);
unsigned int startInd = startIndices[offsetIndex];
bool shouldRead = ((readIndex >= startInd) && (readIndex < (exitIndices[offsetIndex] + startInd)));
if (shouldRead) {
float value = prestackTraces[readIndex + (x * samplesPerT) + shift[offsetIndex]];
stackF += value;
powerF += (value * value);
}
}
stackTracesOut[outputIndex] = stackF;
powerTracesOut[outputIndex] = powerF;
}
/**
* Used to keep track of how long it takes to execute this.
*/;
double GetTime() {
struct timeval tv;
gettimeofday(&tv, NULL);
return tv.tv_sec + (1e-6 * tv.tv_usec);
}
/*
* Print message to stderr and exit.
*/
void Fatal(const char* format, ...) {
va_list args;
va_start(args, format);
vfprintf(stderr, format, args);
va_end(args);
exit(1);
}
/*
* We have an error, which one? Also print out where this occured.
*/
void CudaWhichError(cudaError_t errorCode, char* location) {
if (errorCode == cudaSuccess) {
// This shouldn't happen. It should be made sure that errorCode != cudaSuccess before calling this function.
printf("Reported error not actually an error... (cudaSuccess) %s\n", location);
return;
}
Fatal("%s %s (error code # = %d)\n", location, cudaGetErrorString(errorCode), errorCode);
}
/*
* Check for errors.
*/
void CheckForErrors(char* location) {
cudaError_t errorCode = cudaGetLastError();
if (errorCode != cudaSuccess) {
CudaWhichError(errorCode, location);
}
}
/*
* Finds and initializes the fastest graphics card for CUDA use.
*
* Returns the max number of threads per block for the selected device.
*/
int GetFastestDevice() {
// Get the number of CUDA devices
int num;
if (cudaGetDeviceCount(&num)) Fatal("Cannot get number of CUDA devices\n");
if (num<1) Fatal("No CUDA devices found\n");
// Props
cudaDeviceProp currentDevice;
int fastestGflops = -1;
cudaDeviceProp bestDevice;
int fastestDeviceID = -1;
// Get fastest device
for (int dev=0;dev<num;dev++) {
if (cudaGetDeviceProperties(¤tDevice, dev)) {
Fatal("Error getting device %d properties\n", dev);
}
int Gflops = currentDevice.multiProcessorCount * currentDevice.clockRate;
if (_VERBOSE) {
printf("CUDA Device %d: %s Gflops %f Processors %d Threads/Block %d\n",
dev,
currentDevice.name,
(1e-6 * Gflops),
currentDevice.multiProcessorCount,
currentDevice.maxThreadsPerBlock);
}
if (Gflops > fastestGflops) {
fastestGflops = Gflops;
fastestDeviceID = dev;
bestDevice = currentDevice;
}
}
// Check to see if we get a device
if (fastestDeviceID == -1) {
Fatal("bestDevice == NULL");
}
// Print and set device
if (cudaGetDeviceProperties(&bestDevice, fastestDeviceID)) {
Fatal("Error getting device %d properties\n", fastestDeviceID);
}
cudaSetDevice(fastestDeviceID);
if (_VERBOSE) {
printf("Fastest CUDA Device %d: %s\n", fastestDeviceID, bestDevice.name);
printf("bestDevice.maxThreadsPerBlock: %d\n", bestDevice.maxThreadsPerBlock);
}
CheckForErrors((char*)("GetFastestDevice()"));
// Return max thread count
return bestDevice.maxThreadsPerBlock;
}
/*
* Allocate memory on the GPU, also copy the data over.
*
* CudaPtr variables point to the arrays on the GPU side.
* Host variables point to the arrays on the CPU side.
* Sizes variables determine sizes of the arrays.
*/
void AllocateAndCopyCudaDeviceMemory(float** prestackCudaPtr, float** stackOutCudaPtr, float** powerOutCudaPtr,
int** startIndicesCudaPtr, int** endIndicesCudaPtr, int** sampleShiftCudaPtr,
float *prestackHost, int *startIndicesHost, int *endIndicesHost, int *sampleShiftHost,
size_t prestackSizes, size_t outputSizes, size_t inputSizes) {
if (_PRINT_ALLOC_SIZE) {
size_t totalMemoryAllocated = (prestackSizes + (outputSizes * 2) + (inputSizes * 3));
printf(" Total memory allocated for run: %zu\n", totalMemoryAllocated);
printf(" Prestack array size: %zu\n", prestackSizes);
printf(" Output array sizes: %zu\n", outputSizes);
printf(" EtartIndices, EndIndices, & SampleShift array size: %zu\n", inputSizes);
}
cudaError_t cudaCode;
// Allocate memory on the graphics card
cudaCode = cudaMalloc((void**)prestackCudaPtr, prestackSizes);
if (cudaCode != cudaSuccess) CudaWhichError(cudaCode,
((char*)("cudaErrorMemoryAllocation ERROR: during device memory allocation for prestack array\n")));
cudaCode = cudaMalloc((void**)stackOutCudaPtr, outputSizes);
if (cudaCode != cudaSuccess) CudaWhichError(cudaCode,
((char*)("cudaErrorMemoryAllocation ERROR: during device memory allocation for stackOut array\n")));
cudaCode = cudaMalloc((void**)powerOutCudaPtr, outputSizes);
if (cudaCode != cudaSuccess) CudaWhichError(cudaCode,
((char*)("cudaErrorMemoryAllocation ERROR: during device memory allocation for powerOut array\n")));
cudaCode = cudaMalloc((void**)startIndicesCudaPtr, inputSizes);
if (cudaCode != cudaSuccess) CudaWhichError(cudaCode,
((char*)("cudaErrorMemoryAllocation ERROR: during device memory allocation for startIndices array\n")));
cudaCode = cudaMalloc((void**)endIndicesCudaPtr, inputSizes);
if (cudaCode != cudaSuccess) CudaWhichError(cudaCode,
((char*)("cudaErrorMemoryAllocation ERROR: during device memory allocation for endIndices array\n")));
cudaCode = cudaMalloc((void**)sampleShiftCudaPtr, inputSizes);
if (cudaCode != cudaSuccess) CudaWhichError(cudaCode,
((char*)("cudaErrorMemoryAllocation ERROR: during device memory allocation for sampleShift array\n")));
// Copy data over (for the arrays the need it)
cudaCode = cudaMemcpy(*prestackCudaPtr, prestackHost, prestackSizes, cudaMemcpyHostToDevice);
if (cudaCode != cudaSuccess) CudaWhichError(cudaCode,
((char*)("AllocateAndCopyCudaDeviceMemory ERROR: during copy prestack data over to device.\n")));
cudaCode = cudaMemcpy(*startIndicesCudaPtr, startIndicesHost, inputSizes, cudaMemcpyHostToDevice);
if (cudaCode != cudaSuccess) CudaWhichError(cudaCode,
((char*)("AllocateAndCopyCudaDeviceMemory ERROR: during copy startIndices data over to device.\n")));
cudaCode = cudaMemcpy(*endIndicesCudaPtr, endIndicesHost, inputSizes, cudaMemcpyHostToDevice);
if (cudaCode != cudaSuccess) CudaWhichError(cudaCode,
((char*)("AllocateAndCopyCudaDeviceMemory ERROR: during copy endIndices data over to device.\n")));
cudaCode = cudaMemcpy(*sampleShiftCudaPtr, sampleShiftHost, inputSizes, cudaMemcpyHostToDevice);
if (cudaCode != cudaSuccess) CudaWhichError(cudaCode,
((char*)("AllocateAndCopyCudaDeviceMemory ERROR: during copy sampleSgift data over to device.\n")));
}
/*
* Enqueue the kernels to be ran on the gpu. Pointers that are passed in are pointing to
* device side memory.
*/
void RunCudaMoveAndStackJobs(float** prestackTracesCudaPtr, float** stackTracesOutCudaPtr,
float** powerTracesOutCudaPtr, int** startIndicesCudaPtr, int** exitIndicesCudaPtr,
int** sampleShiftCudaPtr, unsigned int samplesPerT, unsigned int readIns,
unsigned int nOuts, size_t localGroupSize) {
// Set the size
dim3 threads(localGroupSize);
dim3 grid(samplesPerT * nOuts);
if (*prestackTracesCudaPtr == NULL) printf("*prestackTracesCudaPtr == NULL\n");
// Execute the kernel
MoveoutAndStackCuda<<<grid, threads>>>(*prestackTracesCudaPtr,
*stackTracesOutCudaPtr, *powerTracesOutCudaPtr, *startIndicesCudaPtr, *exitIndicesCudaPtr,
*sampleShiftCudaPtr, samplesPerT, readIns, nOuts);
CheckForErrors((char*)("RunCudaMoveAndStackJobs()"));
}
/*
* Free memory on the GPU device as well as free the remaining OpenCL objects for the host side.
*/
void RetrieveAndCleanupCudaDeviceMemory(float **prestackCudaPtr, float **stackOutCudaPtr,
float **powerOutCudaPtr, int **startIndicesCudaPtr, int **endIndicesCudaPtr, int **sampleShiftCudaPtr,
float *stackOutHost, float *powerOutHost, size_t outputSizes) {
// Copy C from device to host
cudaError_t cudaCode;
cudaCode = cudaMemcpy(stackOutHost, *stackOutCudaPtr, outputSizes, cudaMemcpyDeviceToHost);
if (cudaCode != cudaSuccess) CudaWhichError(cudaCode,
(char*)("RetrieveAndCleanupCudaDeviceMemory ERROR: Cannot copy stackOut data back to host.\n"));
cudaCode = cudaMemcpy(powerOutHost, *powerOutCudaPtr, outputSizes, cudaMemcpyDeviceToHost);
if (cudaCode != cudaSuccess) CudaWhichError(cudaCode,
(char*)("RetrieveAndCleanupCudaDeviceMemory ERROR: Cannot copy powerOut data back to host.\n"));
// Free device memory (TODO: reverse order)
cudaFree(*prestackCudaPtr);
cudaFree(*stackOutCudaPtr);
cudaFree(*powerOutCudaPtr);
cudaFree(*startIndicesCudaPtr);
cudaFree(*endIndicesCudaPtr);
cudaFree(*sampleShiftCudaPtr);
}
/*
* Runs the program given the arrays passed in the parameters.
*
* Return the time it took to run the program, if desired.
*/
double CommenceCUDAMoveoutAndStack(float* prestackTraces, float* stackOut, float* powerOut,
int* startIndices, int* endIndices, int* sampleShift,
unsigned int samplesPerTrace, unsigned int nTracesIn, unsigned int nTracesOut,
size_t localGroupSize, size_t prestackSizes, size_t outputSizes, size_t inputSizes) {
double returnVal = 0.0;
if (_PRINT_RUN_TIME) {
printf("CommenceCUDAMoveoutAndStack:\n samplesPerTrace=%u nTracesIn=%u nTracesOut=%u\n"
" localGroupSize=%zu\n",
samplesPerTrace, nTracesIn, nTracesOut, localGroupSize);
}
// Init CUDA
int maxThreadsPerBlock = GetFastestDevice();
// Check the desirec local size
if (((int)localGroupSize) > maxThreadsPerBlock) {
Fatal("Error: local group (%zu) size exceeds the max local group size of the selected graphics card (%d).\n",
localGroupSize, maxThreadsPerBlock);
} else if (((int)localGroupSize) < MIN_LOCAL_SIZE) {
Fatal("Error: local group (%zu) size is less than MIN_LOCAL_SIZE (%d).\n",
localGroupSize, MIN_LOCAL_SIZE);
}
// Allocate memory on the device. These pointers will point to memory on the GPU.
double preInitTime = GetTime();
float* prestackCudaPtr = NULL;
float* stackOutCudaPtr = NULL;
float* powerOutCudaPtr = NULL;
int* startIndicesCudaPtr = NULL;
int* endIndicesCudaPtr = NULL;
int* sampleShiftCudaPtr = NULL;
AllocateAndCopyCudaDeviceMemory(&prestackCudaPtr, &stackOutCudaPtr, &powerOutCudaPtr,
&startIndicesCudaPtr, &endIndicesCudaPtr, &sampleShiftCudaPtr,
prestackTraces, startIndices, endIndices, sampleShift,
prestackSizes, outputSizes, inputSizes);
// Run the program
RunCudaMoveAndStackJobs(&prestackCudaPtr, &stackOutCudaPtr, &powerOutCudaPtr,
&startIndicesCudaPtr, &endIndicesCudaPtr, &sampleShiftCudaPtr,
samplesPerTrace, nTracesIn, nTracesOut, localGroupSize);
// Retrieve the data and clean up graphics card memory
RetrieveAndCleanupCudaDeviceMemory(&prestackCudaPtr, &stackOutCudaPtr, &powerOutCudaPtr,
&startIndicesCudaPtr, &endIndicesCudaPtr, &sampleShiftCudaPtr,
stackOut, powerOut,
(size_t)(nTracesOut * samplesPerTrace * sizeof(float)));
// Print the run time (if requested)
if (_PRINT_RUN_TIME) {
returnVal = (GetTime() - preInitTime);
if (_PRINT_RUN_TIME) {
printf(" Run Time: %f secs\n", returnVal);
}
}
return returnVal;
}
// Returns a float 0.0 - 1.0, inclusive
float RandomFloat() {
return static_cast <float> (rand()) / static_cast <float>(RAND_MAX);
}
// Fill in the prestack traces array
void FillFloatArrayRandomly(float* fillArray, unsigned int length) {
for (unsigned int r = 0; r < length; r++) {
fillArray[r] = RandomFloat() * 1000.0f;
}
}
// Fill the start and end arrays randomly
void FillStartEndShiftArraysRandomly(int* startArray, int* nSampsArray, int* shiftArray,
int arrayLength, int rangeOfStartEndMax, int samplesPerT) {
for (int r = 0; r < arrayLength; r++) {
startArray[r] = (rand() % rangeOfStartEndMax);
int endIndex = samplesPerT - (rand() % rangeOfStartEndMax);
nSampsArray[r] = endIndex - startArray[r];
int range = startArray[r] + (samplesPerT - endIndex);
int ra = rand();
if (range != 0) shiftArray[r] = (ra % range) - startArray[r];
else shiftArray[r] = 0;
// Check to make sure we won't go out of bounds
assert((startArray[r] + nSampsArray[r]) <= samplesPerT);
assert(endIndex > startArray[r]);
assert(startArray[r] >= 0);
assert(nSampsArray[r] >= 0);
assert((startArray[r] + shiftArray[r]) >= 0);
assert((nSampsArray[r] + shiftArray[r]) <= samplesPerT);
}
}
// Create arrays for the OpenCL program to use
double GenerateArraysAndRun(unsigned int samplesPerTrace,
unsigned int nTracesIn, unsigned int nTracesOut, size_t localGroupS) {
srand(time(NULL)); // Set random seed to current time
double returnVal;
// Create the arrays to be used in the program
float* prestackTraces1D;
float* stackOut1D;
float* powerOut1D;
int* startIndices1D;
int* endIndices1D;
int* shift1D;
// Get sizes or arrays
size_t prestackSizes = samplesPerTrace * nTracesIn * sizeof(float);
size_t outputSizes = nTracesOut * samplesPerTrace * sizeof(float);
size_t inputSizes = nTracesOut * nTracesIn * sizeof(int);
// Fill in the arrays
prestackTraces1D = (float*)malloc(prestackSizes);
stackOut1D = (float*)malloc(outputSizes);
powerOut1D = (float*)malloc(outputSizes);
startIndices1D = (int*)malloc(inputSizes);
endIndices1D = (int*)malloc(inputSizes);
shift1D = (int*)malloc(inputSizes);
FillFloatArrayRandomly(prestackTraces1D, samplesPerTrace * nTracesIn);
FillStartEndShiftArraysRandomly(startIndices1D, endIndices1D, shift1D,
(int)(nTracesOut * nTracesIn), (int)(((float)samplesPerTrace) * 0.1), (int)samplesPerTrace);
// Check if arrays were created
if (prestackTraces1D == NULL) Fatal("GenerateArraysAndRun(): prestackTraces1D == NULL\n");
if (stackOut1D == NULL) Fatal("GenerateArraysAndRun(): stackOut1D == NULL\n");
if (powerOut1D == NULL) Fatal("GenerateArraysAndRun(): powerOut1D == NULL\n");
if (startIndices1D == NULL) Fatal("GenerateArraysAndRun(): startIndices1D == NULL\n");
if (endIndices1D == NULL) Fatal("GenerateArraysAndRun(): endIndices1D == NULL\n");
if (shift1D == NULL) Fatal("GenerateArraysAndRun(): shift1D == NULL\n");
// Run the program
returnVal = CommenceCUDAMoveoutAndStack(prestackTraces1D, stackOut1D, powerOut1D, startIndices1D,
endIndices1D, shift1D, samplesPerTrace, nTracesIn, nTracesOut,
localGroupS, prestackSizes, outputSizes, inputSizes);
// Finished: free the memory on CPU side in reverse order
free(shift1D);
free(endIndices1D);
free(startIndices1D);
free(powerOut1D);
free(stackOut1D);
free(prestackTraces1D);
// Return the time that the program gave us
return returnVal;
}
// Main
int main(int argc, char* argv[]) {
// TODO: Errors here
if (argc != 5)
Fatal("Incorrect # of Arguments (5 Needed) <samplesPerTrace> <nTracesIn> <nTracesOut> <LocalGroupSize>\n"
" argc = %d\n", argc);
unsigned int samplesPerTrace = atoi(argv[1]);
unsigned int nTracesIn = atoi(argv[2]);
unsigned int nTracesOut = atoi(argv[3]);
size_t localGroupS = atoi(argv[4]);
GenerateArraysAndRun(samplesPerTrace, nTracesIn, nTracesOut, localGroupS);
return 0;
}
The problem was that I was spawning too many blocks. In OpenCL, you tell the kernel the total number of threads and how many threads are in each block, and the total # of blocks is determined from that. Meanwhile in Cuda, you tell the kernel how many blocks there are and how many threads per block there is, and the total # of threads is determined by those. So:
dim3 threads(localGroupSize);
dim3 grid(samplesPerT * nOuts);
Should be:
dim3 threads(localGroupSize);
dim3 grid((samplesPerT * nOuts) / localGroupSize);