This is the standard Hello World CUDA file:
#include <stdio.h>
#include "hello.h"
const int N = 7;
const int blocksize = 7;
__global__ void hello_kernel(char *a, int *b) {
a[threadIdx.x] += b[threadIdx.x];
}
#define cudaCheckError() { \
cudaError_t e=cudaGetLastError(); \
if(e!=cudaSuccess) { \
printf("Cuda failure %s:%d: '%s'\n",__FILE__,__LINE__,cudaGetErrorString(e)); \
exit(0); \
} \
}
void hello() {
char a[N] = "Hello ";
int b[N] = {15, 10, 6, 0, -11, 1, 0};
char *ad;
int *bd;
const int csize = N*sizeof(char);
const int isize = N*sizeof(int);
printf("%s", a);
cudaMalloc( (void**)&ad, csize );
cudaMemcpy( ad, a, csize, cudaMemcpyHostToDevice );
cudaCheckError();
cudaMalloc( (void**)&bd, isize );
cudaMemcpy( bd, b, isize, cudaMemcpyHostToDevice );
cudaCheckError();
dim3 dimBlock( blocksize, 1 );
dim3 dimGrid( 1, 1 );
hello_kernel<<<dimGrid, dimBlock>>>(ad, bd);
cudaMemcpy( a, ad, csize, cudaMemcpyDeviceToHost );
cudaCheckError();
cudaFree( ad );
cudaCheckError();
printf("%s\n", a);
}
And its header:
-- hello.h
extern "C"
void hello();
That's a Haskell file that calls such function:
-- test.hs
{-# LANGUAGE ForeignFunctionInterface #-}
import Foreign.C
import Foreign.Ptr (Ptr,nullPtr)
foreign import ccall "hello" hello :: IO ()
main = hello
I'm compiling it with:
nvcc hello.c -c -o hello.o
ghc test.hs -o test hello.o -L/usr/local/cuda/lib -optl-lcudart
Running that program with ./test
results in:
Hello Cuda failure hello.cu:32: 'no CUDA-capable device is detected'
Running the same program with a C main()
that just calls hello
produces Hello World
, as expected.
How do I make Haskell detect the device correctly?
Maybe unrelated, but I was able to reproduce your error on a Mac with separate on-board and discrete graphics cards. When "Automatic graphics switching" is enabled in System Preferences (and no 3D graphics applications are running), I get the same "no CUDA-capable device is detected" error.
When I turn off automatic graphics switching, it forces the Mac to use the discrete graphics card, and then the program runs as expected.
The purely C/CUDA-based version of the code doesn't seem to be affected by this preference and always works whether automatic switching is enabled or not.