
Creating global variables in CUDA


How can I create global variables in CUDA? Could you please give me an example?

How can I create arrays inside a CUDA function? For example:

__global__ void test()
{
  int *a = new int[10];
}

Or how can I create a global array and access it from this function? For example:

__device__ int *a;
__global__ void test()
{
  a[0] = 2;
}

Or how can I use something like the following?

__global__ void ProcessData(int img)
{
   int *neighbourhood = new int[8];
   getNeighbourhood(img, neighbourhood);
}

Still I have some problem with this. I found that, compared to

__device__

declaring a variable as

"__device__ __constant__" (read only)

will improve memory access. But my problem is that I have an array in host memory, say

 float *arr = new float[sizeOfTheArray]; 

I want to make it a writable array on the device, modify it in device memory, and then copy it back to the host. How can I do that?


Solution

  • The C++ new operator is supported on compute capability 2.0 and 2.1 (i.e. Fermi) devices with CUDA 4.0, so you could use new to allocate global memory onto a device symbol, although neither of your first two code snippets is how it would be done in practice.
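
    As a sketch of that in-kernel route (assuming a Fermi-class device; the 32 MB heap size below is an arbitrary illustration, not a requirement), each new must be paired with a delete, and the device heap can be enlarged with cudaDeviceSetLimit before the first launch:

    __global__ void test()
    {
        // Allocated per thread from the device heap, not host-visible memory
        int *a = new int[10];
        if (a != NULL)
        {
            for (int i = 0; i < 10; i++)
                a[i] = i;
            delete [] a;   // otherwise the allocation leaks for the context lifetime
        }
    }

    int main()
    {
        // The default device heap is small, so raise it if kernels allocate heavily
        cudaDeviceSetLimit(cudaLimitMallocHeapSize, 32 * 1024 * 1024);
        test<<<1,1>>>();
        cudaDeviceSynchronize();
    }

    Note that memory allocated this way lives in a separate device heap and cannot be passed to cudaMemcpy on the host.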

    On older hardware, and/or with pre-CUDA 4.0 toolkits, the standard approach is to use the cudaMemcpyToSymbol API in host code:

    __device__ float *a;
    
    int main()
    {
        const size_t sz = 10 * sizeof(float);
    
        float *ah;
        cudaMalloc((void **)&ah, sz);
        cudaMemcpyToSymbol(a, &ah, sizeof(float *), size_t(0), cudaMemcpyHostToDevice);
    }
    

    This copies a dynamically allocated device pointer onto a symbol that can then be used directly in device code. (Pass the symbol itself rather than its name as a string; string-based symbol lookup was removed in later CUDA releases.)
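
    For completeness, a kernel can then use that symbol without it being passed as an argument. A minimal sketch (it assumes the snippet above has already run, so a points at valid device memory, and the 10-thread launch matches the allocation size):

    __global__ void fill()
    {
        // "a" is the file-scope __device__ pointer set up via cudaMemcpyToSymbol
        a[threadIdx.x] = 2.0f * float(threadIdx.x);
    }

    Launched from host code after the copy, e.g. fill<<<1, 10>>>();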


    EDIT: Answering this question is a bit like hitting a moving target. For the constant memory case you now seem interested in, here is a complete working example:

    #include <cstdio>
    
    #define nn (10)
    
    __constant__ float a[nn];
    
    __global__ void kernel(float *out)
    {
        if (threadIdx.x < nn)
            out[threadIdx.x] = a[threadIdx.x];
    
    }
    
    int main()
    {
        const size_t sz = size_t(nn) * sizeof(float);
        const float avals[nn]={ 1., 2., 3., 4., 5., 6., 7., 8., 9., 10. };
        float ah[nn];
    
        cudaMemcpyToSymbol(a, &avals[0], sz, size_t(0), cudaMemcpyHostToDevice);
    
        float *ad;
        cudaMalloc((void **)&ad, sz);
    
        kernel<<<dim3(1),dim3(16)>>>(ad);
    
        cudaMemcpy(&ah[0], ad, sz, cudaMemcpyDeviceToHost);
        cudaFree(ad);
    
        for(int i=0; i<nn; i++) {
            printf("%d %f\n", i, ah[i]);
        }
    }
    

    This shows copying data onto a constant memory symbol, and using that data inside a kernel.
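
    For the read-write case in your question (a host array that must be modified on the device and copied back), constant memory is not suitable, because __constant__ data is read-only in device code. The usual pattern is a plain cudaMalloc/cudaMemcpy round trip. In this sketch the array size and the doubling kernel are placeholders for whatever processing you actually do:

    // Hypothetical kernel that modifies the array in place
    __global__ void scale(float *d_arr, int n)
    {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n)
            d_arr[i] *= 2.0f;
    }

    int main()
    {
        const int sizeOfTheArray = 1024;        // assumed size for illustration
        float *arr = new float[sizeOfTheArray];
        for (int i = 0; i < sizeOfTheArray; i++)
            arr[i] = float(i);

        const size_t sz = sizeOfTheArray * sizeof(float);
        float *d_arr;
        cudaMalloc((void **)&d_arr, sz);
        cudaMemcpy(d_arr, arr, sz, cudaMemcpyHostToDevice);    // host -> device

        scale<<<(sizeOfTheArray + 255) / 256, 256>>>(d_arr, sizeOfTheArray);

        cudaMemcpy(arr, d_arr, sz, cudaMemcpyDeviceToHost);    // device -> host
        cudaFree(d_arr);
        delete [] arr;
    }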