How can I create global variables in CUDA? Could you please give me an example?
How can I create arrays inside a CUDA function? For example:
// Per-thread dynamic allocation on the device heap
// (requires compute capability >= 2.0 and CUDA >= 4.0).
__global__ void test()
{
    int *a = new int[10];
    // ... use a ...
    delete [] a; // device-heap allocations are NOT freed at kernel exit; leaking here
                 // exhausts the (small, fixed-size) device heap across launches
}
Or how can I create a global array and access it from this function? For example:
// Device-global pointer *symbol*. NOTE(review): this declares only the
// pointer itself — it points nowhere until host code allocates device
// memory and copies that allocation's address onto the symbol (e.g. with
// cudaMalloc + cudaMemcpyToSymbol). Dereferencing it before that is
// undefined behavior.
__device__ int *a;
__global__ void test()
{
    // Assumes the host initialized `a` before this launch (see note above).
    a[0] = 2;
}
Or how can I use something like the following?
// Fills a per-thread scratch buffer via a device helper.
// NB: the original mixed two spellings ("neighborhood" declared,
// "neighbourhood" used), which does not compile — one spelling is used
// consistently here.
__global__ void ProcessData(int img)
{
    int *neighbourhood = new int[8];
    getNeighbourhood(img, neighbourhood);
    // ... use neighbourhood ...
    delete [] neighbourhood; // device-heap memory is not reclaimed at kernel exit
}
I still have a problem with this. I found that, compared to a plain
__device__
variable, declaring the variable
"__constant__" (read only)
improves memory access. But my problem is that I have an array in host memory, say
float *arr = new float[sizeOfTheArray];
I want to make it a variable array in device memory, modify it on the device, and then copy it back to the host. How can I do that?
The C++ new
operator is supported on compute capability 2.0 and 2.1 (i.e. Fermi) devices with CUDA 4.0, so you could use new
to allocate global memory onto a device symbol, although neither of your first two code snippets is how it would be done in practice.
On older hardware, and/or with pre CUDA 4.0 toolkits, the standard approach is to use the cudaMemcpyToSymbol
API in host code:
// Device-global pointer symbol; host code below points it at a
// dynamically allocated device buffer.
__device__ float *a;

int main()
{
    const size_t sz = 10 * sizeof(float);

    // Allocate device memory to hang off the symbol.
    float *dptr = 0;
    if (cudaMalloc((void **)&dptr, sz) != cudaSuccess) {
        return 1;
    }

    // Copy the *pointer value* onto the device symbol. Pass the symbol
    // itself, not the string "a": string symbol names were deprecated and
    // removed in CUDA 5.0.
    if (cudaMemcpyToSymbol(a, &dptr, sizeof(float *), size_t(0),
                           cudaMemcpyHostToDevice) != cudaSuccess) {
        cudaFree(dptr);
        return 1;
    }

    // ... launch kernels that use `a` here ...

    cudaFree(dptr); // release the device allocation before exit
    return 0;
}
which copies a dynamically allocated device pointer onto a symbol which can be used directly in device code.
EDIT: Answering this question is a bit like hitting a moving target. For the constant memory case you now seem interested in, here is a complete working example:
#include <cstdio>
#define nn (10)
__constant__ float a[nn];
// Copies the __constant__ array `a` into `out`, one element per thread.
// Expects a 1-D launch with at least nn threads; extra threads exit early.
__global__ void kernel(float *out)
{
    const int tid = threadIdx.x;
    if (tid >= nn)
        return; // guard: launch may use more threads than elements
    out[tid] = a[tid];
}
int main()
{
    const size_t sz = size_t(nn) * sizeof(float);
    // Float literals with an f suffix — avoids a double-to-float narrowing.
    const float avals[nn] = { 1.f, 2.f, 3.f, 4.f, 5.f,
                              6.f, 7.f, 8.f, 9.f, 10.f };
    float ah[nn];

    // Copy the host data onto the __constant__ symbol. Pass the symbol
    // itself, not the string "a": string symbol names were deprecated and
    // removed in CUDA 5.0.
    if (cudaMemcpyToSymbol(a, &avals[0], sz, size_t(0),
                           cudaMemcpyHostToDevice) != cudaSuccess) {
        fprintf(stderr, "cudaMemcpyToSymbol failed\n");
        return 1;
    }

    float *ad = 0;
    if (cudaMalloc((void **)&ad, sz) != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed\n");
        return 1;
    }

    kernel<<<dim3(1),dim3(16)>>>(ad);

    // Kernel launches return no status directly; check for launch errors,
    // then let the blocking cudaMemcpy surface any execution errors.
    if (cudaGetLastError() != cudaSuccess) {
        fprintf(stderr, "kernel launch failed\n");
        cudaFree(ad);
        return 1;
    }
    if (cudaMemcpy(&ah[0], ad, sz, cudaMemcpyDeviceToHost) != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed\n");
        cudaFree(ad);
        return 1;
    }

    for(int i=0; i<nn; i++) {
        printf("%d %f\n", i, ah[i]);
    }

    cudaFree(ad); // release the device buffer before exit
    return 0;
}
This shows copying data onto a constant memory symbol, and using that data inside a kernel.