c++cmake cuda thrust windows-subsystem-for-linux

Using Cuda on WSL2 gives me "no kernel image is available for execution on the device."

I am trying to use Cuda and Thrust in a C++ program on WSL2. I followed the instructions in here to enable Cuda on WSL2. Here is a small sample program:

first, I define:

export CUDA_LIBRARY_DIRECTORY=/usr/local/cuda-11.0/lib64
export CUDA_INCLUDE_DIRECTORY=/usr/local/cuda-11.0/include
export CUDACXX=/usr/local/cuda-11.0/bin/nvcc

CMakeLists.txt

cmake_minimum_required(VERSION 2.8)
project(proj LANGUAGES CXX CUDA)

set (CMAKE_CXX_STANDARD 14)

#### use cuda ####
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-gencode arch=compute_50,code=sm_50;-lineinfo; -cudart=static; -Xptxas; -v)

include_directories($ENV{CUDA_INCLUDE_DIRECTORY})
link_directories($ENV{CUDA_LIBRARY_DIRECTORY})

ADD_EXECUTABLE(
proj 
src/cudafile.cu
src/main.cpp)

main.cpp

#include<thrust/host_vector.h>
#include<thrust/device_vector.h>
#include<thrust/device_ptr.h>

void func(int size, int* a1, int* a2, int* a3);
void FillWithValue(int* arr, int size, int val);

int main()
{

    int size=1000;
    int *arr1, *arr2, *arr3;
    
    cudaMalloc((void**)&arr1, size * sizeof(int));
    FillWithValue(arr1,size,1);

    cudaMalloc((void**)&arr2, size * sizeof(int));
    FillWithValue(arr2,size,2);

    cudaMalloc((void**)&arr3, size * sizeof(int));

    int* harr = new int [size];
    cudaMemcpy(harr,arr1,size*sizeof(int),cudaMemcpyDeviceToHost);
    fprintf(stdout, "%d\n",harr[0]);


    func(size, arr1, arr2, arr3);
    cudaError_t err = cudaGetLastError();
    if (cudaSuccess != err)
        fprintf(stderr, "Cuda error: %s.\n", cudaGetErrorString(err));
    

    return 1;

}

cudafile.cu

#include<thrust/host_vector.h>
#include<thrust/device_vector.h>
#include<thrust/device_ptr.h>

#define blocksize 512
#define maxblocks 65535

__global__ void funcKernel(int size, int* a1, int* a2, int* a3)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;

    while (i < size)
    {
        a3[i]=a1[i]+a2[i];
    }
}

void func(int size, int* a1, int* a2, int* a3)
{
    int gridsize = size / blocksize + 1;
    if (gridsize > maxblocks) gridsize = maxblocks;

    funcKernel << <gridsize, blocksize >> > (size, a1, a2, a3);
}

void FillWithValue(int* arr, int size, int val)
{

    thrust::device_ptr<int> d = thrust::device_pointer_cast(arr);
    thrust::fill(d, d + size, val);
}

output

0
Cuda error: no kernel image is available for execution on the device.

Now, the output of the first fprintf proves that the Thrust fill function fails to fill the arrays, and cudaGetLastError() catches an error, proving that the kernel also fails.

This is the verbose cmake build:

cmake ..

-- The CXX compiler identification is GNU 9.3.0
-- The CUDA compiler identification is NVIDIA 11.0.221
-- Check for working CXX compiler: /usr/bin/c++
-- Check for working CXX compiler: /usr/bin/c++ -- works
-- Detecting CXX compiler ABI info
-- Detecting CXX compiler ABI info - done
-- Detecting CXX compile features
-- Detecting CXX compile features - done
-- Check for working CUDA compiler: /usr/local/cuda-11.0/bin/nvcc
-- Check for working CUDA compiler: /usr/local/cuda-11.0/bin/nvcc -- works
-- Detecting CUDA compiler ABI info
-- Detecting CUDA compiler ABI info - done
-- Configuring done
-- Generating done
-- Build files have been written to: /mnt/d/work/wsl2-projects/tests/kernels/build

make

/usr/bin/cmake -S/mnt/d/work/wsl2-projects/tests/kernels -B/mnt/d/work/wsl2-projects/tests/kernels/build --check-build-system CMakeFiles/Makefile.cmake 0
/usr/bin/cmake -E cmake_progress_start /mnt/d/work/wsl2-projects/tests/kernels/build/CMakeFiles /mnt/d/work/wsl2-projects/tests/kernels/build/CMakeFiles/progress.marks
make -f CMakeFiles/Makefile2 all
make[1]: Entering directory '/mnt/d/work/wsl2-projects/tests/kernels/build'
make -f CMakeFiles/proj.dir/build.make CMakeFiles/proj.dir/depend
make[2]: Entering directory '/mnt/d/work/wsl2-projects/tests/kernels/build'
cd /mnt/d/work/wsl2-projects/tests/kernels/build && /usr/bin/cmake -E cmake_depends "Unix Makefiles" /mnt/d/work/wsl2-projects/tests/kernels /mnt/d/work/wsl2-projects/tests/kernels /mnt/d/work/wsl2-projects/tests/kernels/build /mnt/d/work/wsl2-projects/tests/kernels/build /mnt/d/work/wsl2-projects/tests/kernels/build/CMakeFiles/proj.dir/DependInfo.cmake --color=
Scanning dependencies of target proj
make[2]: Leaving directory '/mnt/d/work/wsl2-projects/tests/kernels/build'
make -f CMakeFiles/proj.dir/build.make CMakeFiles/proj.dir/build
make[2]: Entering directory '/mnt/d/work/wsl2-projects/tests/kernels/build'
[ 33%] Building CUDA object CMakeFiles/proj.dir/src/cudafile.cu.o
/usr/local/cuda-11.0/bin/nvcc     -x cu -c /mnt/d/work/wsl2-projects/tests/kernels/src/cudafile.cu -o CMakeFiles/proj.dir/src/cudafile.cu.o
[ 66%] Building CXX object CMakeFiles/proj.dir/src/main.cpp.o
/usr/bin/c++   -I/usr/local/cuda-11.0/include  -std=gnu++14 -o CMakeFiles/proj.dir/src/main.cpp.o -c /mnt/d/work/wsl2-projects/tests/kernels/src/main.cpp
[100%] Linking CXX executable proj
/usr/bin/cmake -E cmake_link_script CMakeFiles/proj.dir/link.txt --verbose=1
/usr/bin/c++    -rdynamic CMakeFiles/proj.dir/src/cudafile.cu.o CMakeFiles/proj.dir/src/main.cpp.o  -o proj   -L/usr/local/cuda-11.0/lib64  -L/usr/local/cuda-11.0/targets/x86_64-linux/lib/stubs  -L/usr/local/cuda-11.0/targets/x86_64-linux/lib  -lcudadevrt -lcudart_static -lrt -lpthread -ldl
make[2]: Leaving directory '/mnt/d/work/wsl2-projects/tests/kernels/build'
[100%] Built target proj
make[1]: Leaving directory '/mnt/d/work/wsl2-projects/tests/kernels/build'
/usr/bin/cmake -E cmake_progress_start /mnt/d/work/wsl2-projects/tests/kernels/build/CMakeFiles 0

Is it something related to my GPU unmatching the Cuda version? I thought of downgrading to Cuda 10 or 9, but I don't know how to install it exactly like here, so that it does not replace the driver with another Nvidia driver.

Additional Info:

GeForce GTX 950M
Windows 11 Home. build 22000.51.
WSL2: Ubuntu-20.04
Cuda compilation tools, release 9.1, V9.1.85

Solution

Based on the comment of Robert Crovella, I managed to get the program to run properly with the right output and no errors.

In CMakeLists.txt, I used

set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode arch=compute_50,code=sm_50 -lineinfo -cudart=static -Xptxas -v")

instead of

set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-gencode arch=compute_50,code=sm_50;-lineinfo; -cudart=static; -Xptxas; -v)

and now the output is