Search code examples
cuda, dynamic-parallelism

Dynamic Parallelism in CUDA not working


I wrote a simple program to understand Dynamic Parallelism. From the values being printed, I see that the child kernel has executed correctly, but when control returns to the parent kernel, the temp array — which was updated correctly in the child kernel — contains wrong values. When I try to update the d_cin array, it receives wrong values as well. These are the compilation flags being used:

nvcc -m64 -dc  -gencode arch=compute_35,code=sm_35  -I/opt/apps/cuda/5.5/include -I. -I.. -I../../common/inc -o simple.o -c simple.cu
nvcc -m64 -gencode arch=compute_35,code=sm_35  -o simple simple.o -L/opt/apps/cuda/5.5/lib64 -lcudadevrt

Can someone help me ? Here is the code.

#include <stdio.h>
#include "cuPrintf.cu"
#include "cuPrintf.cuh"

// Child kernel (launched from the parent `kernel` below with <<<1,3>>>):
// each of the 3 child threads copies the parent's input value I[parentIndex]
// into one slot of the parent's 3-wide stripe d_temp[parentIndex*3 + 0..2].
__global__ void innerKernel(double *I,double *d_temp,int parentIndex){
    int index=threadIdx.x+blockIdx.x*blockDim.x;  // 0..2 for the <<<1,3>>> launch
    d_temp[parentIndex*3+index]=I[parentIndex];

}

// Parent kernel: each of the first 30 threads launches a 3-thread child grid
// to fill its stripe of d_temp, then sums that stripe into d_cin[index].
//
// BUG (explained in the answer below): the child launch is asynchronous and
// nothing here waits for it to complete before d_temp is read. __syncthreads()
// is a barrier among the PARENT block's threads only -- it does not wait for
// child grids. A device-side cudaDeviceSynchronize() is needed after the
// child launch.
__global__ void kernel(double *d_I,double *d_temp,double *d_cin){

    int index=threadIdx.x+blockIdx.x*blockDim.x;
    int i;
    double res=0.0;
        if(index<30){

    // NOTE(review): stream `s` is created but never passed to the child
    // launch below, so it does nothing useful (also pointed out in the answer).
    cudaStream_t s;
        cudaStreamCreateWithFlags( &s, cudaStreamNonBlocking );
    dim3 dimBlock(3,1,1);
    dim3 dimGrid(1,1,1);
    // Asynchronous child launch: control returns immediately; d_temp is not
    // guaranteed to be filled when the reads below execute.
    innerKernel<<<dimGrid,dimBlock>>>(d_I,d_temp,index);
        __syncthreads();

    if(index==0){
        for(i=0;i<90;i++)
            cuPrintf("temp[%d]: %f\n",i,d_temp[i]);
    }   
    // Sum this thread's 3-slot stripe (may read stale data -- see BUG above).
    for (i=0;i<3;i++){
            res=res+d_temp[index*3+i];
    }
    // NOTE(review): both __syncthreads() calls sit inside if(index<30), which
    // threads 30-31 of the 32 launched skip -- a divergent barrier is
    // formally undefined behavior.
        __syncthreads();
    d_cin[index]=res;
        cudaStreamDestroy(s);
    }
}

// Host driver: copies 30 input values to the device, launches the parent
// kernel with 32 threads (4 blocks x 8; the kernel guards index<30), and
// reads back the 30 per-element sums.
// NOTE(review): no CUDA API call here is error-checked; the answer below
// recommends proper cuda error checking and running under cuda-memcheck.
int main(int argc,char **argv){

    double I[30]={1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30};
    double *d_I;        // device copy of I (30 doubles)
    double *d_temp;     // device scratch: 3 slots per input element (90 doubles)
    double *d_cin;      // device output: one sum per input element
    double cout[30];    // host buffer for the results

    cudaMalloc(&d_I,30*sizeof(double));
    cudaMemcpy(d_I,I,30*sizeof(double),cudaMemcpyHostToDevice);

    cudaMalloc(&d_temp,3*30*sizeof(double));

    cudaMalloc(&d_cin,30*sizeof(double));

    dim3 dimBlock(8,1,1);
    dim3 dimGrid(4,1,1);
    /*LAUNCH THE KERNEL*/
    printf("Before the kernel\n");
    cudaPrintfInit();
    kernel<<<dimGrid,dimBlock>>>(d_I,d_temp,d_cin);
    //cudaThreadSynchronize();
    cudaPrintfDisplay(stdout,true);
    cudaPrintfEnd();
    printf("After the kernel\n");
    // Blocking copy: cudaMemcpy implicitly waits for the kernel to finish.
    cudaMemcpy(cout,d_cin,30*sizeof(double),cudaMemcpyDeviceToHost);

    int i;
    for(i=0;i<30;i++)
       printf("%f\n",cout[i]);

}

Solution

  • Any time you are having trouble with a CUDA code, I would recommend that you do proper cuda error checking (you can use this methodology in kernel code for dynamic parallelism as well). You can also try running your code with cuda-memcheck. Finally, cuPrintf is not necessary. Any cc 3.5 GPU supports printf directly from kernel code.

    The problem in your code is that kernel launches (including child kernel launches) are asynchronous, meaning that control is returned immediately to the calling thread, before the launched kernel completes.

    You may think that __syncthreads() is an appropriate barrier to force the child kernels to complete. It is not. The correct barrier to use in this case is the same as what you would use on the host: cudaDeviceSynchronize()

    When I add a cudaDeviceSynchronize() before the first __syncthreads() in your kernel, I get the results that I think you are expecting.

    Also note that the stream creation in your main kernel is not doing anything useful. You are not explicitly using that stream anywhere.

    Here's a slightly simplified version of your code, with the above fix, and a test run:

    $ cat t68.cu
    #include <stdio.h>
    
    // Child kernel: each of the 3 child threads copies I[parentIndex] into
    // one slot of the parent's stripe d_temp[parentIndex*3 + 0..2].
    __global__ void innerKernel(double *I,double *d_temp,int parentIndex){
        int index=threadIdx.x+blockIdx.x*blockDim.x;  // 0..2 for the <<<1,3>>> launch
        d_temp[parentIndex*3+index]=I[parentIndex];
    }
    
    // Fixed parent kernel: each of the first 30 threads launches a 3-thread
    // child grid, WAITS for it to complete, then sums its 3-slot stripe of
    // d_temp into d_cin[index].
    __global__ void kernel(double *d_I,double *d_temp,double *d_cin){
    
        int index=threadIdx.x+blockIdx.x*blockDim.x;
        int i;
        double res=0.0;
        if(index<30){
          dim3 dimBlock(3,1,1);
          dim3 dimGrid(1,1,1);
          innerKernel<<<dimGrid,dimBlock>>>(d_I,d_temp,index);
          // THE FIX: child launches are asynchronous; the parent thread must
          // wait for its child grid before reading d_temp. __syncthreads()
          // alone is not a parent/child completion barrier.
          // NOTE(review): device-side cudaDeviceSynchronize() was deprecated
          // in later CUDA toolkits -- confirm against the toolkit in use.
          cudaDeviceSynchronize();
          // NOTE(review): this barrier is inside if(index<30), so threads
          // 30-31 of the 32 launched skip it -- a divergent __syncthreads()
          // is formally undefined behavior (it happens to work here).
          __syncthreads();
          if(index==0){
            for(i=0;i<90;i++)
                printf("temp[%d]: %f\n",i,d_temp[i]);
          }
          // Sum this thread's stripe; each slot holds I[index], so the
          // result is 3*I[index] (matches the output shown below).
          for (i=0;i<3;i++){
                res=res+d_temp[index*3+i];
          }
          __syncthreads();
          d_cin[index]=res;
        }
    }
    
    // Host driver: copies 30 input values to the device, launches the parent
    // kernel with 32 threads (4 blocks x 8; kernel guards index<30), and
    // reads back the 30 sums via a blocking cudaMemcpy (which implicitly
    // waits for the kernel to finish).
    // NOTE(review): CUDA API calls are still unchecked here; in real code,
    // wrap each in error checking as the answer recommends.
    int main(int argc,char **argv){
    
        double I[30]={1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30};
        double *d_I;        // device copy of I (30 doubles)
        double *d_temp;     // device scratch: 3 slots per input element
        double *d_cin;      // device output: one sum per input element
        double cout[30];    // host buffer for results
    
        cudaMalloc(&d_I,30*sizeof(double));
        cudaMemcpy(d_I,I,30*sizeof(double),cudaMemcpyHostToDevice);
    
        cudaMalloc(&d_temp,3*30*sizeof(double));
    
        cudaMalloc(&d_cin,30*sizeof(double));
    
        dim3 dimBlock(8,1,1);
        dim3 dimGrid(4,1,1);
        /*LAUNCH THE KERNEL*/
        printf("Before the kernel\n");
        kernel<<<dimGrid,dimBlock>>>(d_I,d_temp,d_cin);
        //cudaThreadSynchronize();
        cudaMemcpy(cout,d_cin,30*sizeof(double),cudaMemcpyDeviceToHost);
        printf("After the kernel\n");
    
        int i;
        for(i=0;i<30;i++)
           printf("%f\n",cout[i]);
    
    }
    $ nvcc -rdc=true -arch=sm_35 -o t68 t68.cu -lcudadevrt
    $ ./t68
    Before the kernel
    temp[0]: 1.000000
    temp[1]: 1.000000
    temp[2]: 1.000000
    temp[3]: 2.000000
    temp[4]: 2.000000
    temp[5]: 2.000000
    temp[6]: 3.000000
    temp[7]: 3.000000
    temp[8]: 3.000000
    temp[9]: 4.000000
    temp[10]: 4.000000
    temp[11]: 4.000000
    temp[12]: 5.000000
    temp[13]: 5.000000
    temp[14]: 5.000000
    temp[15]: 6.000000
    temp[16]: 6.000000
    temp[17]: 6.000000
    temp[18]: 7.000000
    temp[19]: 7.000000
    temp[20]: 7.000000
    temp[21]: 8.000000
    temp[22]: 8.000000
    temp[23]: 8.000000
    temp[24]: 9.000000
    temp[25]: 9.000000
    temp[26]: 9.000000
    temp[27]: 10.000000
    temp[28]: 10.000000
    temp[29]: 10.000000
    temp[30]: 11.000000
    temp[31]: 11.000000
    temp[32]: 11.000000
    temp[33]: 12.000000
    temp[34]: 12.000000
    temp[35]: 12.000000
    temp[36]: 13.000000
    temp[37]: 13.000000
    temp[38]: 13.000000
    temp[39]: 14.000000
    temp[40]: 14.000000
    temp[41]: 14.000000
    temp[42]: 15.000000
    temp[43]: 15.000000
    temp[44]: 15.000000
    temp[45]: 16.000000
    temp[46]: 16.000000
    temp[47]: 16.000000
    temp[48]: 17.000000
    temp[49]: 17.000000
    temp[50]: 17.000000
    temp[51]: 18.000000
    temp[52]: 18.000000
    temp[53]: 18.000000
    temp[54]: 19.000000
    temp[55]: 19.000000
    temp[56]: 19.000000
    temp[57]: 20.000000
    temp[58]: 20.000000
    temp[59]: 20.000000
    temp[60]: 21.000000
    temp[61]: 21.000000
    temp[62]: 21.000000
    temp[63]: 22.000000
    temp[64]: 22.000000
    temp[65]: 22.000000
    temp[66]: 23.000000
    temp[67]: 23.000000
    temp[68]: 23.000000
    temp[69]: 24.000000
    temp[70]: 24.000000
    temp[71]: 24.000000
    temp[72]: 25.000000
    temp[73]: 25.000000
    temp[74]: 25.000000
    temp[75]: 26.000000
    temp[76]: 26.000000
    temp[77]: 26.000000
    temp[78]: 27.000000
    temp[79]: 27.000000
    temp[80]: 27.000000
    temp[81]: 28.000000
    temp[82]: 28.000000
    temp[83]: 28.000000
    temp[84]: 29.000000
    temp[85]: 29.000000
    temp[86]: 29.000000
    temp[87]: 30.000000
    temp[88]: 30.000000
    temp[89]: 30.000000
    After the kernel
    3.000000
    6.000000
    9.000000
    12.000000
    15.000000
    18.000000
    21.000000
    24.000000
    27.000000
    30.000000
    33.000000
    36.000000
    39.000000
    42.000000
    45.000000
    48.000000
    51.000000
    54.000000
    57.000000
    60.000000
    63.000000
    66.000000
    69.000000
    72.000000
    75.000000
    78.000000
    81.000000
    84.000000
    87.000000
    90.000000
    $