Search code examples
cuda, dynamic-parallelism

Dynamic Parallelism in CUDA not working


I wrote a simple program to understand Dynamic Parallelism. From the values being printed, I see that the child kernel has executed correctly, but when control returns to the parent kernel, the temp array — which was updated correctly in the child kernel — contains wrong values. When I try to update the d_cin array, it receives wrong values as well. These are the compilation flags being used:

nvcc -m64 -dc  -gencode arch=compute_35,code=sm_35  -I/opt/apps/cuda/5.5/include -I. -I.. -I../../common/inc -o simple.o -c simple.cu
nvcc -m64 -gencode arch=compute_35,code=sm_35  -o simple simple.o -L/opt/apps/cuda/5.5/lib64 -lcudadevrt

Can someone help me ? Here is the code.

#include <stdio.h>
#include "cuPrintf.cu"
#include "cuPrintf.cuh"

// Child kernel (launched from the parent `kernel` below with <<<1,3>>>):
// each of the 3 child threads copies the parent's input value I[parentIndex]
// into one slot of the parent's 3-wide stripe d_temp[parentIndex*3 + 0..2].
__global__ void innerKernel(double *I,double *d_temp,int parentIndex){
    int index=threadIdx.x+blockIdx.x*blockDim.x;  // 0..2 for the <<<1,3>>> launch
    d_temp[parentIndex*3+index]=I[parentIndex];

}

// Parent kernel: each of the first 30 threads launches a 3-thread child grid
// to fill its stripe of d_temp, then sums that stripe into d_cin[index].
//
// BUG (explained in the answer below): the child launch is asynchronous and
// nothing here waits for it to complete before d_temp is read. __syncthreads()
// is a barrier among the PARENT block's threads only -- it does not wait for
// child grids. A device-side cudaDeviceSynchronize() is needed after the
// child launch.
__global__ void kernel(double *d_I,double *d_temp,double *d_cin){

    int index=threadIdx.x+blockIdx.x*blockDim.x;
    int i;
    double res=0.0;
        if(index<30){

    // NOTE(review): stream `s` is created but never passed to the child
    // launch below, so it does nothing useful (also pointed out in the answer).
    cudaStream_t s;
        cudaStreamCreateWithFlags( &s, cudaStreamNonBlocking );
    dim3 dimBlock(3,1,1);
    dim3 dimGrid(1,1,1);
    // Asynchronous child launch: control returns immediately; d_temp is not
    // guaranteed to be filled when the reads below execute.
    innerKernel<<<dimGrid,dimBlock>>>(d_I,d_temp,index);
        __syncthreads();

    if(index==0){
        for(i=0;i<90;i++)
            cuPrintf("temp[%d]: %f\n",i,d_temp[i]);
    }   
    // Sum this thread's 3-slot stripe (may read stale data -- see BUG above).
    for (i=0;i<3;i++){
            res=res+d_temp[index*3+i];
    }
    // NOTE(review): both __syncthreads() calls sit inside if(index<30), which
    // threads 30-31 of the 32 launched skip -- a divergent barrier is
    // formally undefined behavior.
        __syncthreads();
    d_cin[index]=res;
        cudaStreamDestroy(s);
    }
}

// Host driver: copies 30 input values to the device, launches the parent
// kernel with 32 threads (4 blocks x 8; the kernel guards index<30), and
// reads back the 30 per-element sums.
// NOTE(review): no CUDA API call here is error-checked; the answer below
// recommends proper cuda error checking and running under cuda-memcheck.
int main(int argc,char **argv){

    double I[30]={1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30};
    double *d_I;        // device copy of I (30 doubles)
    double *d_temp;     // device scratch: 3 slots per input element (90 doubles)
    double *d_cin;      // device output: one sum per input element
    double cout[30];    // host buffer for the results

    cudaMalloc(&d_I,30*sizeof(double));
    cudaMemcpy(d_I,I,30*sizeof(double),cudaMemcpyHostToDevice);

    cudaMalloc(&d_temp,3*30*sizeof(double));

    cudaMalloc(&d_cin,30*sizeof(double));

    dim3 dimBlock(8,1,1);
    dim3 dimGrid(4,1,1);
    /*LAUNCH THE KERNEL*/
    printf("Before the kernel\n");
    cudaPrintfInit();
    kernel<<<dimGrid,dimBlock>>>(d_I,d_temp,d_cin);
    //cudaThreadSynchronize();
    cudaPrintfDisplay(stdout,true);
    cudaPrintfEnd();
    printf("After the kernel\n");
    // Blocking copy: cudaMemcpy implicitly waits for the kernel to finish.
    cudaMemcpy(cout,d_cin,30*sizeof(double),cudaMemcpyDeviceToHost);

    int i;
    for(i=0;i<30;i++)
       printf("%f\n",cout[i]);

}

Solution

  • Any time you are having trouble with a CUDA code, I would recommend that you do proper cuda error checking (you can use this methodology in kernel code for dynamic parallelism as well). You can also try running your code with cuda-memcheck. Finally, cuPrintf is not necessary. Any cc 3.5 GPU supports printf directly from kernel code.

    The problem in your code is that kernel launches (including child kernel launches) are asynchronous, meaning that control is returned immediately to the calling thread, before the launched kernel completes.

    You may think that __syncthreads() is an appropriate barrier to force the child kernels to complete. It is not. The correct barrier to use in this case is the same as what you would use on the host: cudaDeviceSynchronize()

    When I add a cudaDeviceSynchronize() before the first __syncthreads() in your kernel, I get the results that I think you are expecting.

    Also note that the stream creation in your main kernel is not doing anything useful. You are not explicitly using that stream anywhere.

    Here's a slightly simplified version of your code, with the above fix, and a test run:

    $ cat t68.cu
    #include <stdio.h>
    
    // Child kernel: each of the 3 child threads copies I[parentIndex] into
    // one slot of the parent's stripe d_temp[parentIndex*3 + 0..2].
    __global__ void innerKernel(double *I,double *d_temp,int parentIndex){
        int index=threadIdx.x+blockIdx.x*blockDim.x;  // 0..2 for the <<<1,3>>> launch
        d_temp[parentIndex*3+index]=I[parentIndex];
    }
    
    // Fixed parent kernel: each of the first 30 threads launches a 3-thread
    // child grid, WAITS for it to complete, then sums its 3-slot stripe of
    // d_temp into d_cin[index].
    __global__ void kernel(double *d_I,double *d_temp,double *d_cin){
    
        int index=threadIdx.x+blockIdx.x*blockDim.x;
        int i;
        double res=0.0;
        if(index<30){
          dim3 dimBlock(3,1,1);
          dim3 dimGrid(1,1,1);
          innerKernel<<<dimGrid,dimBlock>>>(d_I,d_temp,index);
          // THE FIX: child launches are asynchronous; the parent thread must
          // wait for its child grid before reading d_temp. __syncthreads()
          // alone is not a parent/child completion barrier.
          // NOTE(review): device-side cudaDeviceSynchronize() was deprecated
          // in later CUDA toolkits -- confirm against the toolkit in use.
          cudaDeviceSynchronize();
          // NOTE(review): this barrier is inside if(index<30), so threads
          // 30-31 of the 32 launched skip it -- a divergent __syncthreads()
          // is formally undefined behavior (it happens to work here).
          __syncthreads();
          if(index==0){
            for(i=0;i<90;i++)
                printf("temp[%d]: %f\n",i,d_temp[i]);
          }
          // Sum this thread's stripe; each slot holds I[index], so the
          // result is 3*I[index] (matches the output shown below).
          for (i=0;i<3;i++){
                res=res+d_temp[index*3+i];
          }
          __syncthreads();
          d_cin[index]=res;
        }
    }
    
    // Host driver: copies 30 input values to the device, launches the parent
    // kernel with 32 threads (4 blocks x 8; kernel guards index<30), and
    // reads back the 30 sums via a blocking cudaMemcpy (which implicitly
    // waits for the kernel to finish).
    // NOTE(review): CUDA API calls are still unchecked here; in real code,
    // wrap each in error checking as the answer recommends.
    int main(int argc,char **argv){
    
        double I[30]={1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30};
        double *d_I;        // device copy of I (30 doubles)
        double *d_temp;     // device scratch: 3 slots per input element
        double *d_cin;      // device output: one sum per input element
        double cout[30];    // host buffer for results
    
        cudaMalloc(&d_I,30*sizeof(double));
        cudaMemcpy(d_I,I,30*sizeof(double),cudaMemcpyHostToDevice);
    
        cudaMalloc(&d_temp,3*30*sizeof(double));
    
        cudaMalloc(&d_cin,30*sizeof(double));
    
        dim3 dimBlock(8,1,1);
        dim3 dimGrid(4,1,1);
        /*LAUNCH THE KERNEL*/
        printf("Before the kernel\n");
        kernel<<<dimGrid,dimBlock>>>(d_I,d_temp,d_cin);
        //cudaThreadSynchronize();
        cudaMemcpy(cout,d_cin,30*sizeof(double),cudaMemcpyDeviceToHost);
        printf("After the kernel\n");
    
        int i;
        for(i=0;i<30;i++)
           printf("%f\n",cout[i]);
    
    }
    $ nvcc -rdc=true -arch=sm_35 -o t68 t68.cu -lcudadevrt
    $ ./t68
    Before the kernel
    temp[0]: 1.000000
    temp[1]: 1.000000
    temp[2]: 1.000000
    temp[3]: 2.000000
    temp[4]: 2.000000
    temp[5]: 2.000000
    temp[6]: 3.000000
    temp[7]: 3.000000
    temp[8]: 3.000000
    temp[9]: 4.000000
    temp[10]: 4.000000
    temp[11]: 4.000000
    temp[12]: 5.000000
    temp[13]: 5.000000
    temp[14]: 5.000000
    temp[15]: 6.000000
    temp[16]: 6.000000
    temp[17]: 6.000000
    temp[18]: 7.000000
    temp[19]: 7.000000
    temp[20]: 7.000000
    temp[21]: 8.000000
    temp[22]: 8.000000
    temp[23]: 8.000000
    temp[24]: 9.000000
    temp[25]: 9.000000
    temp[26]: 9.000000
    temp[27]: 10.000000
    temp[28]: 10.000000
    temp[29]: 10.000000
    temp[30]: 11.000000
    temp[31]: 11.000000
    temp[32]: 11.000000
    temp[33]: 12.000000
    temp[34]: 12.000000
    temp[35]: 12.000000
    temp[36]: 13.000000
    temp[37]: 13.000000
    temp[38]: 13.000000
    temp[39]: 14.000000
    temp[40]: 14.000000
    temp[41]: 14.000000
    temp[42]: 15.000000
    temp[43]: 15.000000
    temp[44]: 15.000000
    temp[45]: 16.000000
    temp[46]: 16.000000
    temp[47]: 16.000000
    temp[48]: 17.000000
    temp[49]: 17.000000
    temp[50]: 17.000000
    temp[51]: 18.000000
    temp[52]: 18.000000
    temp[53]: 18.000000
    temp[54]: 19.000000
    temp[55]: 19.000000
    temp[56]: 19.000000
    temp[57]: 20.000000
    temp[58]: 20.000000
    temp[59]: 20.000000
    temp[60]: 21.000000
    temp[61]: 21.000000
    temp[62]: 21.000000
    temp[63]: 22.000000
    temp[64]: 22.000000
    temp[65]: 22.000000
    temp[66]: 23.000000
    temp[67]: 23.000000
    temp[68]: 23.000000
    temp[69]: 24.000000
    temp[70]: 24.000000
    temp[71]: 24.000000
    temp[72]: 25.000000
    temp[73]: 25.000000
    temp[74]: 25.000000
    temp[75]: 26.000000
    temp[76]: 26.000000
    temp[77]: 26.000000
    temp[78]: 27.000000
    temp[79]: 27.000000
    temp[80]: 27.000000
    temp[81]: 28.000000
    temp[82]: 28.000000
    temp[83]: 28.000000
    temp[84]: 29.000000
    temp[85]: 29.000000
    temp[86]: 29.000000
    temp[87]: 30.000000
    temp[88]: 30.000000
    temp[89]: 30.000000
    After the kernel
    3.000000
    6.000000
    9.000000
    12.000000
    15.000000
    18.000000
    21.000000
    24.000000
    27.000000
    30.000000
    33.000000
    36.000000
    39.000000
    42.000000
    45.000000
    48.000000
    51.000000
    54.000000
    57.000000
    60.000000
    63.000000
    66.000000
    69.000000
    72.000000
    75.000000
    78.000000
    81.000000
    84.000000
    87.000000
    90.000000
    $