I have a function in which I am calling a CUDA kernel serially in a loop. This function is executed in parallel across threads using OpenMP. Through each iteration, I update a variable currentTime
with:
cudaMemcpyFromSymbolAsync(&currentTime, minChangeTime, sizeof(currentTime), 0, cudaMemcpyDeviceToHost, stream_id);
where minChangeTime
is computed in the kernel. Somehow, the update of this variable currentTime
is not done properly when calling several kernels in parallel using OpenMP. I have provided a reproducible code at the end. The results I am expecting are:
0 65 186
1 130 251
2 195 316
3 260 381
4 325 446
...
But when enabling OpenMP, I do not get this difference of 121:
7 325 641
3 325 381
3 325 381
6 325 576
4 390 446
8 390 706
7 390 641
4 3063 446
What am I doing wrong or misunderstanding ? If device memory variables are inappropriate here, what would then be a better variable type ?
// Portability shims: under nvcc (__CUDACC__ defined) these expand to the real
// CUDA function/variable qualifiers; under a plain host compiler they expand
// to nothing so the same source still parses.
#ifdef __CUDACC__
#define CUDA_HOSTDEV __host__ __device__
#define CUDA_DEVICE __device__
#define CUDA_GLOBAL __global__
#define CUDA_CONST __constant__
#else
#define CUDA_HOSTDEV
#define CUDA_DEVICE
#define CUDA_GLOBAL
#define CUDA_CONST
#endif
#include <iostream>

#include <cuda.h>
#include <cuda_runtime.h>
#include <omp.h>

#include "helper_cuda.h"
#include "helper_functions.h"
// File-scope __device__ variables: exactly ONE instance exists on the device,
// shared by every kernel launch on every stream. Concurrent launches (e.g.
// from the OpenMP-parallelized loop in main) therefore race on minChangeTime —
// this is the bug the surrounding Q&A is about. Per-invocation state should be
// cudaMalloc'd and passed to the kernel instead.
CUDA_DEVICE int minChangeTime;
// NOTE(review): foundMinimum is declared but never referenced in this file.
CUDA_DEVICE bool foundMinimum;
// Folds a block-wide minimum of cu_adjustment_time[] into the file-scope
// __device__ variable minChangeTime via atomicMin.
// Launch shape assumed by the indexing: one block, blockDim.x == num_nets
// (see the <<<1, num_nets>>> launch in iteration_function).
// NOTE(review): minChangeTime is shared device-wide, so concurrently running
// instances of this kernel on different streams race on it.
// NOTE(review): only indices tid+s (s >= 1) are ever read — element 0 of
// cu_adjustment_time never participates in the minimum.
// NOTE(review): since every contribution goes through atomicMin on the same
// address anyway, the tree-shaped loop adds nothing over each thread doing a
// single atomicMin of its own element.
CUDA_GLOBAL void reduction(
int* cu_adjustment_time
){
unsigned int tid = threadIdx.x;
// NOTE(review): i is computed but never used.
unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
// Barrier with nothing preceding it — effectively a no-op here.
__syncthreads();
// Classic interleaved-pairs sweep: s doubles each pass, threads whose tid is a
// multiple of 2*s contribute the element s slots away.
for (unsigned int s=1; s < blockDim.x; s *= 2) {
if (tid % (2*s) == 0){
atomicMin(&minChangeTime, cu_adjustment_time[tid+s]);
}
__syncthreads();
}
}
// Fills cu_adjustment_time so that slot k holds currentTime + k, one slot per
// thread of the flattened grid (launched <<<num_nets, 1>>> in this file).
CUDA_GLOBAL void wh(int* cu_adjustment_time, int currentTime){
    const int idx = blockDim.x * blockIdx.x + threadIdx.x;
    cu_adjustment_time[idx] = currentTime + idx;
}
// Repeatedly launches wh + reduction on stream_id, reading the device-side
// minimum back into the host variable currentTime each pass, until
// currentTime reaches *iRows - 1. Prints "loop currentTime time_data_index"
// once per pass.
//
// NOTE(review): minChangeTime is a single file-scope __device__ variable, so
// concurrent invocations of this function (e.g. from an OpenMP parallel loop)
// race on it regardless of the per-call stream. Each invocation needs its own
// cudaMalloc'd int passed to the kernels instead.
void iteration_function(int *iRows, int time_data_index, int num_nets, cudaStream_t stream_id){
    int currentTime = 0;
    const int limit = *iRows - 1;   // was computed twice in the original

    int* cu_adjustment_time = nullptr;
    cudaMalloc((void **)&cu_adjustment_time, sizeof(int) * num_nets);
    cudaStreamSynchronize(stream_id);

    int loop = 0;
    while (currentTime < limit) {
        // Re-arm the device-side minimum with an upper bound before each pass.
        cudaMemcpyToSymbolAsync(minChangeTime, &limit, sizeof(limit), 0, cudaMemcpyHostToDevice, stream_id);
        wh<<<num_nets, 1, 0, stream_id>>>(cu_adjustment_time, currentTime);
        cudaStreamSynchronize(stream_id);
        reduction<<<1, num_nets, 0, stream_id>>>(cu_adjustment_time);
        cudaStreamSynchronize(stream_id);
        // Fixed mojibake: "¤tTime" was an HTML-entity-mangled "&currentTime".
        cudaMemcpyFromSymbolAsync(&currentTime, minChangeTime, sizeof(currentTime), 0, cudaMemcpyDeviceToHost, stream_id);
        cudaStreamSynchronize(stream_id);

        currentTime += num_nets;
        time_data_index += num_nets + 1;
        std::cout << loop << " " << currentTime << " " << time_data_index << std::endl;
        loop++;
    }

    cudaFree(cu_adjustment_time);   // was leaked in the original
    std::cout << "finished" << std::endl;
}
// Driver for the reproducible example: runs iteration_function `iter` times,
// each on its own freshly created stream.
int main(){
    //compiled with: nvcc no_fun.cu -Xcompiler=-fopenmp -o no_fun
    int iRows = 3000;              // non-const: its address is passed below
    const int iter = 300;          // const so streams[] is a real array, not a VLA
    int time_data_index = 121;
    int num_nets = 64;
    cudaStream_t streams[iter];
    // Enabling this pragma runs the iterations concurrently; because every
    // iteration shares the single __device__ minChangeTime symbol, that
    // produces the racy, undefined results described above.
    //#pragma omp parallel for simd schedule(dynamic) -> including this part causes undefined results
    for (int j = 0; j < iter; j++){   // int j: avoids signed/unsigned comparison
        cudaStreamCreate(&streams[j]);
        iteration_function(&iRows, time_data_index, num_nets, streams[j]);
        cudaStreamSynchronize(streams[j]);
        cudaStreamDestroy(streams[j]);
    }
    return 0;
}
When multiple reduction
kernels run simultaneously, there is a race condition on the global variable minChangeTime
.
You need to have separate device memory for each kernel that should run in parallel. The simplest approach would be to just cudaMalloc minChangeTime
in each thread instead of declaring it a global variable, and pass it to the kernel.