Issue with printf on CUDA GPU

#include <iostream>
#include <cuda_runtime.h>
#include <cuda_runtime_api.h>

#include <iostream>
#include <cmath>
#include <vector>

using namespace std;

#define HOST __host__
#define DEVICE __device__

// Generic kernel entry point: each launched thread calls the supplied
// callable once, with no arguments. The callable is passed by value.
template <typename Callable>
__global__ void launch_global(Callable f0)
{
    f0();
}

// Minimal wrapper around a raw element pointer that adds call-style indexing.
// The struct neither allocates nor releases `data`; whoever fills in the
// pointer is responsible for the storage behind it.
template<typename T>
struct Array{
    T* __restrict__ data;

    // Returns a mutable reference to element `i`. No bounds checking is done.
    // Callable from both host and device code.
    HOST DEVICE inline
    T& operator()(int i) const noexcept {
        return *(data + i);
    }
};


// Reproduction case from the question: launches one block of 256 threads in
// which the first nx threads write through `vec1` and then print a line.
int main(){

    int nx = 5;

    // Array whose element storage comes from host-side `new`. `vec` is not
    // referenced by the kernel lambda below.
    auto vec = Array<double>{new double[nx]};

    // Managed allocation of nx * sizeof(double) bytes, addressed through a
    // pointer to Array<double>. Nothing in this function assigns vec1->data.
    // The return code of cudaMallocManaged is not checked.
    Array<double> *vec1;
    cudaMallocManaged((void**)&vec1, nx * sizeof(double));

    // The lambda captures nx and vec1 by value.
    launch_global<<<1,256>>>([=] DEVICE () noexcept{
        int i = blockDim.x*blockIdx.x+threadIdx.x;
        if(i < nx){
             // Expands to vec1->data[i] = 1.0, i.e. a store through the
             // `data` pointer that was never set above.
             (*vec1)(i) = 1.0;
             printf("Printing here %d %g\n", i,1.0);
        }
    });

    // Waits for the kernel to finish; the returned error code is discarded,
    // so a failure during kernel execution is not reported here.
    cudaDeviceSynchronize();

return 0;
}

Compilation:

nvcc -x cu --expt-extended-lambda --expt-relaxed-constexpr kernel_test.cpp -o out

Issue:

The issue is that the print statement, printf("Printing here %d %g\n", i, 1.0), is not getting executed. But if I comment out the previous line, (*vec1)(i) = 1.0, then the print statement executes. Could someone help me figure out why?

I also tried running with cuda-memcheck. There are no errors.

Solution

There are various problems.

First of all, this:

auto vec = Array<double>{new double[nx]};

allocates host memory with new. No matter what you subsequently do, that pointer will never be usable in device code. This isn't a proper strategy for using unified memory (UM).

Next, your allocation size here makes no sense:

cudaMallocManaged((void**)&vec1, nx * sizeof(double));

vec1 is a pointer to something of type Array<double>. Allocating a size based on sizeof(double) makes no sense.

If you want to do this using UM (managed memory), you will need to handle the allocation of the class object as well as the allocation of the embedded pointer in the object.

You appear to be confused about the fact that you are allocating an array of Array<double> objects, each of which has an embedded pointer to double items. The following code fixes these issues and seems to run correctly for me:

$ cat t2239.cu
#include <iostream>
#include <cuda_runtime.h>
#include <cuda_runtime_api.h>

#include <iostream>
#include <cmath>
#include <vector>

using namespace std;

#define HOST __host__
#define DEVICE __device__

// Generic kernel entry point: each launched thread calls the supplied
// callable once, with no arguments. The callable is passed by value.
template <typename Callable>
__global__ void launch_global(Callable f0)
{
    f0();
}

// Minimal wrapper around a raw element pointer that adds call-style indexing.
// The struct neither allocates nor releases `data`; whoever fills in the
// pointer is responsible for the storage behind it.
template<typename T>
struct Array{
    T* __restrict__ data;

    // Returns a mutable reference to element `i`. No bounds checking is done.
    // Callable from both host and device code.
    HOST DEVICE inline
    T& operator()(int i) const noexcept {
        return *(data + i);
    }
};


// Allocates one Array<double> object and its nx-element buffer in managed
// memory, launches a kernel that sets every element to 1.0 and prints a line
// per element, then releases both allocations.
// Returns 0 on success and 1 if any CUDA runtime call reports an error.
int main(){

    // Reports a failed CUDA runtime call on stderr; yields true on success.
    auto ok = [](cudaError_t status, const char* what) {
        if (status == cudaSuccess) return true;
        std::cerr << what << " failed: " << cudaGetErrorString(status) << std::endl;
        return false;
    };

    int nx = 5;

    // The Array object is dereferenced inside the kernel, so the object
    // itself (not only its element buffer) is placed in managed memory.
    Array<double> *vec = nullptr;
    if (!ok(cudaMallocManaged((void**)&vec, sizeof(Array<double>)),
            "cudaMallocManaged(Array object)"))
        return 1;

    // Element storage behind the embedded pointer. A local copy of the pointer
    // is kept so cleanup does not have to read `vec` again after the kernel.
    double *elems = nullptr;
    if (!ok(cudaMallocManaged((void**)&elems, nx*sizeof(double)),
            "cudaMallocManaged(element buffer)")) {
        cudaFree(vec);
        return 1;
    }
    vec->data = elems;

    launch_global<<<1,256>>>([=] DEVICE () noexcept{
        // Start at this thread's global index and advance by the total number
        // of launched threads, so all nx elements are covered for any launch
        // configuration.
        for(int i = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x;
             i < nx; i += stride){
             vec[0](i) = 1.0;
             printf("Printing here %d %g\n", i,1.0);
        }
    });

    // A kernel launch returns no status itself: launch failures are picked up
    // with cudaGetLastError(), execution failures at the synchronizing call.
    bool success = ok(cudaGetLastError(), "kernel launch");
    success = ok(cudaDeviceSynchronize(), "cudaDeviceSynchronize") && success;

    // Release the buffer first, then the object that pointed to it.
    success = ok(cudaFree(elems), "cudaFree(element buffer)") && success;
    success = ok(cudaFree(vec), "cudaFree(Array object)") && success;

    return success ? 0 : 1;
}
$ nvcc -o t2239 t2239.cu  --extended-lambda -lineinfo
$ compute-sanitizer ./t2239
========= COMPUTE-SANITIZER
Printing here 0 1
Printing here 1 1
Printing here 2 1
Printing here 3 1
Printing here 4 1
========= ERROR SUMMARY: 0 errors
$