I have a function in which I am calling a CUDA kernel serially in a loop. This function is executed in parallel across threads using OpenMP. Through each iteration, I update a variable currentTime
with:
cudaMemcpyFromSymbolAsync(&currentTime, minChangeTime, sizeof(currentTime), 0, cudaMemcpyDeviceToHost, stream_id);
where minChangeTime
is computed in the kernel. Somehow, the update of this variable currentTime
is not done properly when calling several kernels in parallel using OpenMP. I have provided a reproducible code at the end. The results I am expecting are:
0 65 186
1 130 251
2 195 316
3 260 381
4 325 446
...
But when enabling OpenMP, I do not get this difference of 121:
7 325 641
3 325 381
3 325 381
6 325 576
4 390 446
8 390 706
7 390 641
4 3063 446
What am I doing wrong or misunderstanding ? If device memory variables are inappropriate here, what would then be a better variable type ?
// Portability shims: under nvcc (__CUDACC__ defined) these expand to the real
// CUDA function/variable qualifiers; under a plain host compiler they expand
// to nothing so the same source still parses.
#ifdef __CUDACC__
#define CUDA_HOSTDEV __host__ __device__
#define CUDA_DEVICE __device__
#define CUDA_GLOBAL __global__
#define CUDA_CONST __constant__
#else
#define CUDA_HOSTDEV
#define CUDA_DEVICE
#define CUDA_GLOBAL
#define CUDA_CONST
#endif
#include <iostream>

#include <cuda.h>
#include <cuda_runtime.h>
#include <omp.h>

#include "helper_cuda.h"
#include "helper_functions.h"
// File-scope __device__ variables: exactly ONE instance exists on the device,
// shared by every kernel launch on every stream. Concurrent launches (e.g.
// from the OpenMP-parallelized loop in main) therefore race on minChangeTime —
// this is the bug the surrounding Q&A is about. Per-invocation state should be
// cudaMalloc'd and passed to the kernel instead.
CUDA_DEVICE int minChangeTime;
// NOTE(review): foundMinimum is declared but never referenced in this file.
CUDA_DEVICE bool foundMinimum;
// Folds a block-wide minimum of cu_adjustment_time[] into the file-scope
// __device__ variable minChangeTime via atomicMin.
// Launch shape assumed by the indexing: one block, blockDim.x == num_nets
// (see the <<<1, num_nets>>> launch in iteration_function).
// NOTE(review): minChangeTime is shared device-wide, so concurrently running
// instances of this kernel on different streams race on it.
// NOTE(review): only indices tid+s (s >= 1) are ever read — element 0 of
// cu_adjustment_time never participates in the minimum.
// NOTE(review): since every contribution goes through atomicMin on the same
// address anyway, the tree-shaped loop adds nothing over each thread doing a
// single atomicMin of its own element.
CUDA_GLOBAL void reduction(
int* cu_adjustment_time
){
unsigned int tid = threadIdx.x;
// NOTE(review): i is computed but never used.
unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
// Barrier with nothing preceding it — effectively a no-op here.
__syncthreads();
// Classic interleaved-pairs sweep: s doubles each pass, threads whose tid is a
// multiple of 2*s contribute the element s slots away.
for (unsigned int s=1; s < blockDim.x; s *= 2) {
if (tid % (2*s) == 0){
atomicMin(&minChangeTime, cu_adjustment_time[tid+s]);
}
__syncthreads();
}
}
// Fills cu_adjustment_time so that slot k holds currentTime + k, one slot per
// thread of the flattened grid (launched <<<num_nets, 1>>> in this file).
CUDA_GLOBAL void wh(int* cu_adjustment_time, int currentTime){
    const int idx = blockDim.x * blockIdx.x + threadIdx.x;
    cu_adjustment_time[idx] = currentTime + idx;
}
// Repeatedly launches wh + reduction on stream_id, reading the device-side
// minimum back into the host variable currentTime each pass, until
// currentTime reaches *iRows - 1. Prints "loop currentTime time_data_index"
// once per pass.
//
// NOTE(review): minChangeTime is a single file-scope __device__ variable, so
// concurrent invocations of this function (e.g. from an OpenMP parallel loop)
// race on it regardless of the per-call stream. Each invocation needs its own
// cudaMalloc'd int passed to the kernels instead.
void iteration_function(int *iRows, int time_data_index, int num_nets, cudaStream_t stream_id){
    int currentTime = 0;
    const int limit = *iRows - 1;   // was computed twice in the original

    int* cu_adjustment_time = nullptr;
    cudaMalloc((void **)&cu_adjustment_time, sizeof(int) * num_nets);
    cudaStreamSynchronize(stream_id);

    int loop = 0;
    while (currentTime < limit) {
        // Re-arm the device-side minimum with an upper bound before each pass.
        cudaMemcpyToSymbolAsync(minChangeTime, &limit, sizeof(limit), 0, cudaMemcpyHostToDevice, stream_id);
        wh<<<num_nets, 1, 0, stream_id>>>(cu_adjustment_time, currentTime);
        cudaStreamSynchronize(stream_id);
        reduction<<<1, num_nets, 0, stream_id>>>(cu_adjustment_time);
        cudaStreamSynchronize(stream_id);
        // Fixed mojibake: "¤tTime" was an HTML-entity-mangled "&currentTime".
        cudaMemcpyFromSymbolAsync(&currentTime, minChangeTime, sizeof(currentTime), 0, cudaMemcpyDeviceToHost, stream_id);
        cudaStreamSynchronize(stream_id);

        currentTime += num_nets;
        time_data_index += num_nets + 1;
        std::cout << loop << " " << currentTime << " " << time_data_index << std::endl;
        loop++;
    }

    cudaFree(cu_adjustment_time);   // was leaked in the original
    std::cout << "finished" << std::endl;
}
// Driver for the reproducible example: runs iteration_function `iter` times,
// each on its own freshly created stream.
int main(){
    //compiled with: nvcc no_fun.cu -Xcompiler=-fopenmp -o no_fun
    int iRows = 3000;              // non-const: its address is passed below
    const int iter = 300;          // const so streams[] is a real array, not a VLA
    int time_data_index = 121;
    int num_nets = 64;
    cudaStream_t streams[iter];
    // Enabling this pragma runs the iterations concurrently; because every
    // iteration shares the single __device__ minChangeTime symbol, that
    // produces the racy, undefined results described above.
    //#pragma omp parallel for simd schedule(dynamic) -> including this part causes undefined results
    for (int j = 0; j < iter; j++){   // int j: avoids signed/unsigned comparison
        cudaStreamCreate(&streams[j]);
        iteration_function(&iRows, time_data_index, num_nets, streams[j]);
        cudaStreamSynchronize(streams[j]);
        cudaStreamDestroy(streams[j]);
    }
    return 0;
}
When multiple reduction
kernels run simultaneously, there is a race condition on the global variable minChangeTime
.
You need to have separate device memory for each kernel that should run in parallel. The simplest approach would be to just cudaMalloc minChangeTime
in each thread instead of declaring it a global variable, and pass it to the kernel.