I am trying to do the following: assume a system with heterogeneous processing units (PUs), including CPUs, GPUs, and Intel Xeon Phis. The GPU devices can also have different characteristics, so splitting the workload across these devices is not as simple as N/num_devices.
omp_set_num_threads(system->getPUCount());
#pragma omp parallel
{
unsigned int cpu_thread_id = omp_get_thread_num();
unsigned int num_cpu_threads = omp_get_num_threads();
Each thread iterates in a loop until the end of the data is reached.
PU pu = listOfPUs[cpu_thread_id];
//threads are active until all data is processed
while (finish_0 < N) {
//the my_start and my_finish are private to a specific device.
int my_start = 0;
int my_finish = 0;
I have set a constant chunk_size for each PU, and I create as many CPU threads as there are PUs, meaning that each CPU thread controls one of the PUs. Each thread determines the start and end of its own chunk of the data in a critical section:
#pragma omp critical (chunkdetermination_0)
{
start_0 = finish_0;
finish_0 = start_0 + pu.getChunkSize();
if(finish_0 > N)
finish_0 = N;
my_start = start_0;
my_finish = finish_0;
}
Now I check the type of the PU, and execute the corresponding kernel.
if(pu.getType() == GPU) {
int myN = my_finish-my_start;
CudaSafeCall(cudaSetDevice(pu.getId()));
unsigned int nbytes_per_kernel = sizeof(double)*myN;
//memory allocation
CudaSafeCall(cudaMalloc((void**)&d_a, nbytes_per_kernel));
CudaSafeCall(cudaMalloc((void**)&d_c, nbytes_per_kernel));
CudaSafeCall(cudaMemset(d_a, 0, nbytes_per_kernel));
CudaSafeCall(cudaMemset(d_c, 0, nbytes_per_kernel));
//data transfer
CudaSafeCall(cudaMemcpy(d_a, a+my_start, nbytes_per_kernel, cudaMemcpyHostToDevice));
CudaSafeCall(cudaMemcpy(d_c, c+my_start, nbytes_per_kernel, cudaMemcpyHostToDevice));
//block and grid values
dim3 gpu_threads(128);
dim3 gpu_blocks(myN/gpu_threads.x);
if( myN % gpu_threads.x != 0 ) gpu_blocks.x+=1;
//execute kernel
kernel_0<<<gpu_blocks,gpu_threads>>>( d_a, d_c, myN);
//data transfer device to host
CudaSafeCall(cudaMemcpy(c+my_start, d_c, nbytes_per_kernel, cudaMemcpyDeviceToHost));
//synchronize device
CudaSafeCall(cudaDeviceSynchronize());
//free device memory
CudaSafeCall(cudaFree(d_a));
CudaSafeCall(cudaFree(d_c));
}
When I test this code with one GPU, it works fine. However, when I test it with two GPUs, it does not work. I have also tried using CUDA streams, but unfortunately could not get that to work either.
Any suggestions on what I am doing wrong, or on how I should solve this problem?
Here is the complete example:
#include <omp.h>
#include <stdio.h>
#include <vector>
#include <iostream>
#include <sys/time.h>
#include <float.h>
#include <limits.h>
using namespace std;
#define CPU 0
#define GPU 1
#define MIC 2
class PU
{
public:
PU(int puId, int puType)
{
id = puId;
type = puType;
}
int getId() {
return id;
}
void setId(int puId) {
id = puId;
}
int getType() {
return type;
}
char * getTypeAsString() {
if(type == CPU)
return (char *) "CPU";
else if (type == GPU)
return (char *) "GPU";
else
return (char *) "MIC";
}
void setType(int puType) {
type = puType;
}
int getChunkSize() {
return chunkSize;
}
void setChunkSize(int puChunkSize) {
chunkSize = puChunkSize;
}
private:
int id;
int type;
int chunkSize;
};
class System
{
public:
System() {
numOfPUs = 0;
//Adding PU0 of type GPU to the system
PU * pu0 = new PU(0, GPU);
pu0->setChunkSize(262144);
listOfPUs.push_back(*pu0);
numOfPUs ++;
//Adding PU1 of type GPU to the system
PU * pu1 = new PU(1, GPU);
pu1->setChunkSize(262144);
listOfPUs.push_back(*pu1);
numOfPUs ++;
}
vector<PU> getPUs() {
return listOfPUs;
}
int getPUCount() {
return numOfPUs;
}
private:
vector<PU> listOfPUs;
int numOfPUs;
};
#define N 2097152
//********************** CUDA Error checker **********************
#define CUDA_ERROR_CHECK
#define CudaSafeCall( err ) __cudaSafeCall( err, __FILE__, __LINE__ )
#define CudaCheckError() __cudaCheckError( __FILE__, __LINE__ )
inline void __cudaSafeCall( cudaError err, const char *file, const int line )
{
#ifdef CUDA_ERROR_CHECK
if ( cudaSuccess != err )
{
fprintf( stderr, "cudaSafeCall() failed at %s:%i : %s\n",
file, line, cudaGetErrorString( err ) );
exit( -1 );
}
#endif
return;
}
inline void __cudaCheckError( const char *file, const int line )
{
#ifdef CUDA_ERROR_CHECK
cudaError err = cudaGetLastError();
if ( cudaSuccess != err )
{
fprintf( stderr, "cudaCheckError() failed at %s:%i : %s\n",
file, line, cudaGetErrorString( err ) );
exit( -1 );
}
// More careful checking. However, this will affect performance.
// Comment away if needed.
err = cudaDeviceSynchronize();
if( cudaSuccess != err )
{
fprintf( stderr, "cudaCheckError() with sync failed at %s:%i : %s\n",
file, line, cudaGetErrorString( err ) );
exit( -1 );
}
#endif
return;
}
//********************** CUDA Error checker **********************
__global__ void kernel_0(double * a, double * c, int len)
{
int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx < len)
{
c[idx] = a[idx];
}
}
/*
* Start of Generated Code
* This code enables execution on both host CPUs and accelerating devices
*/
void hybrid_function_0 (double *a, double *c)
{
System * system = new System();
//device variable declarations
double *d_a;
double *d_c;
//start and finish determine the chunk size of a device
int start_0 = 0;
int finish_0 = 0;
vector<PU> listOfPUs = system->getPUs();
printf("Num devices = %d\n", system->getPUCount());
omp_set_num_threads(system->getPUCount());
//one thread of the CPU controls one GPU device. The rest of CPU threads will be used to process data.
#pragma omp parallel
{
unsigned int cpu_thread_id = omp_get_thread_num();
unsigned int num_cpu_threads = omp_get_num_threads();
PU pu = listOfPUs[cpu_thread_id];
//threads are active until all data is processed
while (finish_0 < N) {
//the my_start and my_finish are private to a specific device.
int my_start = 0;
int my_finish = 0;
//the determination of chunks should be performed sequentially, in order to avoid two or more devices processing the same data.
#pragma omp critical (chunkdetermination_0)
{
start_0 = finish_0;
finish_0 = start_0 + pu.getChunkSize();
if(finish_0 > N)
finish_0 = N;
my_start = start_0;
my_finish = finish_0;
}
//devices with id less than nDevices are GPU devices. The host CPU has id = nDevices
if(pu.getType() == GPU) {
int myN = my_finish-my_start;
printf("device_id\t%d\tpu_id\t%d\ttype\t%s\tprocessing\t%d-%d (%lu KB)\n", cpu_thread_id, pu.getId(), pu.getTypeAsString(), my_start, my_finish, sizeof(double)*myN/1000);
CudaSafeCall(cudaSetDevice(pu.getId()));
unsigned int nbytes_per_kernel = sizeof(double)*myN;
//memory allocation
CudaSafeCall(cudaMalloc((void**)&d_a, nbytes_per_kernel));
CudaSafeCall(cudaMalloc((void**)&d_c, nbytes_per_kernel));
CudaSafeCall(cudaMemset(d_a, 0, nbytes_per_kernel));
CudaSafeCall(cudaMemset(d_c, 0, nbytes_per_kernel));
//data transfer
CudaSafeCall(cudaMemcpy(d_a, a+my_start, nbytes_per_kernel, cudaMemcpyHostToDevice));
CudaSafeCall(cudaMemcpy(d_c, c+my_start, nbytes_per_kernel, cudaMemcpyHostToDevice));
//block and grid values
dim3 gpu_threads(128);
dim3 gpu_blocks(myN/gpu_threads.x);
if( myN % gpu_threads.x != 0 ) gpu_blocks.x+=1;
//execute kernel
kernel_0<<<gpu_blocks,gpu_threads>>>( d_a, d_c, myN);
//data transfer device to host
CudaSafeCall(cudaMemcpy(c+my_start, d_c, nbytes_per_kernel, cudaMemcpyDeviceToHost));
//synchronize device
CudaSafeCall(cudaDeviceSynchronize());
//free device memory
CudaSafeCall(cudaFree(d_a));
CudaSafeCall(cudaFree(d_c));
}
//execute on host
else if (pu.getType() == CPU) {
omp_set_num_threads(omp_get_max_threads());
#pragma omp parallel for
for (int i = my_start; i < my_finish; i++)
{
c[i] = a[i];
}
}
//execute on MIC
else if (pu.getType() == MIC) {
#pragma offload target(mic: cpu_thread_id) in(a[my_start:my_finish]) in(c[my_start:my_finish]) out(c[my_start:my_finish])
{
#pragma omp parallel for
for (int i = my_start; i < my_finish; i++)
{
c[i] = a[i];
}
}
}
}
}
}
/*
* End of Generated Code
*/
int main()
{
double *a, *b, *c;
double scalar;
/* Allocate memory on host */
a = (double*)malloc(sizeof(double)*N);
b = (double*)malloc(sizeof(double)*N);
c = (double*)malloc(sizeof(double)*N);
// omp_set_num_threads(omp_get_max_threads());
printf("OMP Max threads %d\n", omp_get_max_threads());
#pragma omp parallel
{
#pragma omp master
printf("OMP Num threads %d\n", omp_get_num_threads());
}
// #pragma omp parallel for
// for(int i = 0; i < 10; i++) {
// printf("I am thread %d\n", omp_get_thread_num());
// }
//initialization of variables
#pragma omp parallel for
for (int j=0; j<N; j++) {
a[j] = 1.0;
b[j] = 2.0;
c[j] = 0.0;
}
#pragma omp parallel for
for (int j = 0; j < N; j++)
a[j] = 2.0E0 * a[j];
scalar=3.0f;
printf("%s\n", "COPY Started");
hybrid_function_0(a, c);
printf("%s\n", "COPY Finished");
return 0;
}
You can compile it with:
nvcc mini.cu -o mini -Xcompiler "-fopenmp"
When assigning two or more GPUs to run this code, I receive different error messages, and sometimes it just hangs and does nothing. When I check the GPU status through nvidia-smi, it shows that the GPUs are executing this program, but the utilization is 0%.
Some of the errors include:
cudaSafeCall() failed at mini.cu:221 : invalid argument
cudaSafeCall() failed at mini.cu:221 : driver shutting down
I would imagine that the device variable declarations within hybrid_function_0 should be moved to within the main OpenMP parallel section, something like this:
#pragma omp parallel
{
unsigned int cpu_thread_id = omp_get_thread_num();
unsigned int num_cpu_threads = omp_get_num_threads();
//device variable declarations
double *d_a;
double *d_c;
...
As it stands, there is a memory race between threads: the shared device pointer values can be overwritten by allocations another thread makes on a different device, which leads to the sort of invalid argument errors you report in API calls that use those device pointers as arguments.
I would expect that the error could occur at any of the cudaMemset, cudaMemcpy, or cudaFree calls, or at the kernel launch in that function, and that the error might move from place to place depending on the behaviour of the competing threads.
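As a rough, untested sketch (reusing the names from your code, keeping the chunking loop as posted, and omitting the memsets and the host-to-device copy of c for brevity), the parallel region with thread-private device pointers could look something like this; the only additions of mine are the pointer declarations and a small guard for the case where another thread grabs the last chunk first:

#pragma omp parallel
{
    unsigned int cpu_thread_id = omp_get_thread_num();
    PU pu = listOfPUs[cpu_thread_id];
    //thread-private device pointers: each thread only ever sees the
    //allocations it made on the device it selected with cudaSetDevice
    double *d_a = NULL;
    double *d_c = NULL;
    while (finish_0 < N) {
        int my_start = 0;
        int my_finish = 0;
        #pragma omp critical (chunkdetermination_0)
        {
            start_0 = finish_0;
            finish_0 = start_0 + pu.getChunkSize();
            if(finish_0 > N)
                finish_0 = N;
            my_start = start_0;
            my_finish = finish_0;
        }
        //another thread may have taken the last chunk between the while
        //check and the critical section, so skip empty chunks
        if (my_finish <= my_start)
            break;
        if(pu.getType() == GPU) {
            int myN = my_finish - my_start;
            CudaSafeCall(cudaSetDevice(pu.getId()));
            unsigned int nbytes_per_kernel = sizeof(double)*myN;
            //these allocations now write into this thread's own pointers
            CudaSafeCall(cudaMalloc((void**)&d_a, nbytes_per_kernel));
            CudaSafeCall(cudaMalloc((void**)&d_c, nbytes_per_kernel));
            CudaSafeCall(cudaMemcpy(d_a, a+my_start, nbytes_per_kernel, cudaMemcpyHostToDevice));
            dim3 gpu_threads(128);
            dim3 gpu_blocks(myN/gpu_threads.x);
            if( myN % gpu_threads.x != 0 ) gpu_blocks.x+=1;
            kernel_0<<<gpu_blocks,gpu_threads>>>( d_a, d_c, myN);
            CudaSafeCall(cudaMemcpy(c+my_start, d_c, nbytes_per_kernel, cudaMemcpyDeviceToHost));
            CudaSafeCall(cudaDeviceSynchronize());
            CudaSafeCall(cudaFree(d_a));
            CudaSafeCall(cudaFree(d_c));
        }
    }
}

The only substantive change is where d_a and d_c are declared; with one pair of pointers per thread, a cudaMalloc performed by one thread can no longer clobber the pointer another thread is about to pass to a call on a different device.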