Synchronizing Statically Allocated Struct Instances between CPU and GPU

I have a struct that contains an array, and I want to copy the contents from an instance of that struct in CPU memory to another instance in GPU memory.

My question is similar to this one. There are two big differences between this question and the one from the link:

I'm not using an array of structs. I just need one.
All instances of the struct are statically allocated.

In an attempt to answer my own question, I tried modifying the code from that answer as follows:

#include <stdio.h>
#include <stdlib.h>

#define cudaCheckError() { /* print the last CUDA runtime error (if any) with file and line, then exit(1) */ \
    cudaError_t err = cudaGetLastError(); \
    if(err != cudaSuccess) { \
      printf("Cuda error: %s:%d: %s\n", __FILE__, __LINE__, cudaGetErrorString(err)); \
      exit(1); \
    } \
  }

struct Test { // plain struct wrapping a fixed-size char array; main copies it as a whole (sizeof(Test) bytes)
    char array[5]; // five characters; the kernel prints them one at a time with %c
};

__global__ void kernel(Test *dev_test) { // prints the five chars of the Test that dev_test points to
    for(int i=0; i < 5; i++) { // 5 matches the declared length of Test::array
        printf("Kernel[0][i]: %c \n", dev_test[0].array[i]); // dev_test[0] is the same object as *dev_test
    }
}

__device__ Test dev_test; //dev_test is a single file-scope Test instance declared with __device__ (static storage on the GPU)

int main(void) {
    // Fills a host-side Test, attempts to copy it into dev_test, then launches the kernel to print it.
    int size = 5;
    Test test; //test is now statically allocated and one instance of the struct

    char temp[] = { 'a', 'b', 'c', 'd' , 'e' };
    memcpy(test.array, temp, size * sizeof(char)); // host-side copy of the five chars into the struct

    cudaCheckError();
    cudaMemcpy(&dev_test, &test, sizeof(Test), cudaMemcpyHostToDevice); // destination is the address of a __device__ symbol taken in host code
    cudaCheckError(); // this is the check that reports "invalid argument" (line 34 of the .cu file)
    kernel<<<1, 1>>>(&dev_test); // one block of one thread; again passes &dev_test taken in host code
    cudaCheckError();
    cudaDeviceSynchronize(); // block the host until the kernel has finished
    cudaCheckError();

    //  memory free
    return 0;
}

But this code throws a runtime error:

Cuda error: HelloCUDA.cu:34: invalid argument

Is there a way to copy test into dev_test?

Solution

When using a statically allocated __device__ variable:

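We don't use the cudaMemcpy API. We use the cudaMemcpyToSymbol (or cudaMemcpyFromSymbol) API, because in host code &dev_test is not a device address that cudaMemcpy can use (hence the "invalid argument" error).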
We don't pass __device__ variables as kernel arguments. They are at global scope. You just use them in your kernel code.

The following code has these issues addressed:

$ cat t10.cu
#include <stdio.h>

// Checks the CUDA runtime's last recorded error and terminates the program
// with exit code 1 if it is anything other than cudaSuccess.
//
// cudaGetLastError() also resets the recorded error, so each invocation only
// reports errors raised since the previous check. The diagnostic carries the
// file and line of the invocation site and is written to stderr so that it is
// not mixed into (or lost with) regular program output.
//
// The body is wrapped in do { ... } while (0) so that `cudaCheckError();`
// behaves as exactly one statement, including inside an un-braced if/else.
// The local uses a trailing underscore to avoid shadowing a caller's `err`.
#define cudaCheckError() do { \
    cudaError_t err_ = cudaGetLastError(); \
    if (err_ != cudaSuccess) { \
      fprintf(stderr, "Cuda error: %s:%d: %s\n", __FILE__, __LINE__, cudaGetErrorString(err_)); \
      exit(1); \
    } \
  } while (0)

// Plain struct wrapping a fixed-size character array. main() copies one
// instance as a whole (sizeof(Test) bytes) from host memory into dev_test.
struct Test {
    char array[5]; // five characters; the kernel prints them one at a time with %c
};

// Single file-scope Test instance declared with __device__ (static storage on
// the GPU). The kernel reads it by name; the host fills it with cudaMemcpyToSymbol.
__device__ Test dev_test;

// Prints every character stored in the global __device__ variable dev_test.
//
// Takes no arguments: dev_test is at file scope and is read directly by name.
// The loop is not partitioned across threads, so every launched thread prints
// the whole array; main() launches it as <<<1, 1>>>.
__global__ void kernel() {
    // array holds chars, so its sizeof equals its element count; deriving the
    // bound here keeps the loop in step with the declaration in Test.
    const int count = static_cast<int>(sizeof(dev_test.array));
    for (int i = 0; i < count; ++i) {
        printf("Kernel[0][i]: %c \n", dev_test.array[i]);
    }
}


// Host entry point: builds one Test on the host, copies it into the
// __device__ variable dev_test, and launches the kernel that prints it.
// Returns 0 on success; cudaCheckError() exits with status 1 on any CUDA error.
int main(void) {
    // One host-side instance with automatic storage (a local variable, not a
    // heap allocation), initialised in place so no separate memcpy is needed.
    Test test = { { 'a', 'b', 'c', 'd', 'e' } };

    // dev_test is a __device__ symbol, so it is written through the symbol
    // API instead of cudaMemcpy with an address taken in host code.
    cudaMemcpyToSymbol(dev_test, &test, sizeof(Test));
    cudaCheckError();

    // The kernel takes no arguments; it reads dev_test directly.
    kernel<<<1, 1>>>();
    cudaCheckError();   // errors from the launch itself

    // Wait for the kernel to finish so that errors raised while it ran are
    // picked up by the check below before the program exits.
    cudaDeviceSynchronize();
    cudaCheckError();

    // Nothing to free: neither test nor dev_test was dynamically allocated.
    return 0;
}
$ nvcc -o t10 t10.cu
$ cuda-memcheck ./t10
========= CUDA-MEMCHECK
Kernel[0][i]: a
Kernel[0][i]: b
Kernel[0][i]: c
Kernel[0][i]: d
Kernel[0][i]: e
========= ERROR SUMMARY: 0 errors
$

(Your indexing in the kernel code also has to change: once the kernel reads the global variable directly, dev_test is a single struct rather than a pointer or an array, so you cannot index into it with dev_test[0]....)