CUDA deep copy with other data

I'm trying to copy my struct Test to the GPU, change the data there, and copy it back to the CPU. This is what I've tried so far; note that my code crashes on the last, commented-out line:

// Test payload: two scalar members plus a raw pointer to an int array that
// lives outside the struct.
struct Test {
    int x;
    int y;
    int* data;  // raw address only; a byte copy of the struct copies this value, not the array
};

// Test kernel
// Increments x, y and the first three elements of data in the Test object
// that d points to. Both d and d->data are dereferenced in device code, so
// both must be addresses the GPU can access. Every thread performs the same
// plain (non-atomic) increments on the same locations; the snippet below
// launches the kernel with a single thread (<<< 1, 1 >>>).
static __global__  void TestKernel(Test* d) {
    // increment some values
    ++d->data[0]; 
    ++d->data[1];
    ++d->data[2];

    ++d->x;
    ++d->y;
}

// Test snippet:
// Host-side object: x = 10, y = 20, data -> host array {1, 2, 3}.
Test* host = new Test{ 10, 20,new int[3]{1, 2, 3} };
Test* device = nullptr;
int* deviceData;

// Device storage for the struct itself and, separately, for the 3-int array.
COMPUTE_SAFE(cudaMalloc(&device, sizeof(Test)));
COMPUTE_SAFE(cudaMalloc(&deviceData, 3 * sizeof(int)));

// 1) Byte-copy the struct (device->data now holds the *host* array address).
// 2) Copy the array contents into deviceData.
// 3) Overwrite device->data with the address stored in deviceData.
COMPUTE_SAFE(cudaMemcpy(device, host, sizeof(Test), cudaMemcpyHostToDevice));
COMPUTE_SAFE(cudaMemcpy(deviceData, host->data, 3 * sizeof(int), cudaMemcpyHostToDevice));
COMPUTE_SAFE(cudaMemcpy(&(device->data), &deviceData, sizeof(float*), cudaMemcpyHostToDevice));

// Single-thread launch, then wait for the kernel to finish.
TestKernel <<< 1, 1 >>> (device);
COMPUTE_SAFE(cudaDeviceSynchronize());

// NOTE: this copies the whole struct back, including its data member, so
// host->data is replaced by the device address written in step 3 above and
// the original host array address is no longer stored in *host.
COMPUTE_SAFE(cudaMemcpy(host, device, sizeof(Test), cudaMemcpyDeviceToHost));
// host->data is read here *after* the overwrite above, so the destination of
// this copy is no longer the original host array.
COMPUTE_SAFE(cudaMemcpy(host->data, deviceData, 3 * sizeof(float), cudaMemcpyDeviceToHost));


printf("\nhost:\n");
printf("%d %d\n", host->x, host->y); // works
// Dereferences host->data on the CPU.
// printf("%d %d %d\n", host->data[0], host->data[1], host->data[2]); // crashes

Note that I've seen multiple related questions, but none of them also copy other struct members alongside the deep-copied data pointer.

My error message:

Exception thrown at 0x00007FF7A2C5297D in VFD.exe: 0xC0000005: Access violation reading location 0x0000000B01600208.

Note that I'm probably copying the memory incorrectly, or something along those lines. If I remove the COMPUTE_SAFE(cudaMemcpy(host, device, sizeof(Test), cudaMemcpyDeviceToHost)); line I'm able to access the host->data array, but the x and y values stay unincremented for obvious reasons.

Solution

When you cudaMemcpy the struct back from the GPU into the host struct, you overwrite its data pointer with the GPU data pointer, which is not valid to dereference on the host.

In order to fix it you need to restore the original data pointer (and then copy the actual data).

Working version:

// Test payload: two scalar members plus a raw pointer to an int array that
// lives outside the struct.
struct Test 
{
    int x;
    int y;
    int* data;  // raw address only; a byte copy of the struct copies this value, not the array
};


// Adds one to the first three elements reached through d->data and to the
// scalar members x and y of the object d points to. d and d->data are both
// dereferenced in device code.
static __global__  void TestKernel(Test* d) 
{
    // Array elements, reached through the pointer member.
    for (int i = 0; i < 3; ++i)
    {
        ++(d->data[i]);
    }

    // Scalars stored directly in the struct.
    ++(d->x);
    ++(d->y);
}


// Round-trips a Test object (two ints plus a pointer to three ints) through
// the GPU: uploads it, runs TestKernel on it with a single thread, copies the
// results back to the host, prints them, and releases all allocations.
// Returns 0.
int main() 
{
    // Keep the host array address in its own variable: the struct copy back
    // from the device overwrites host->data, and this is what restores it.
    int* hostData = new int[3]{ 1, 2, 3 };
    Test* host = new Test{ 10, 20, hostData };
    int* deviceData = nullptr;
    Test* device = nullptr;

    // Device storage for the struct and, separately, for the array.
    COMPUTE_SAFE(cudaMalloc(&device, sizeof(Test)));
    COMPUTE_SAFE(cudaMalloc(&deviceData, 3 * sizeof(int)));

    // Upload the struct bytes, upload the array contents, then patch the
    // device-side data member so it holds the device array address.
    COMPUTE_SAFE(cudaMemcpy(device, host, sizeof(Test), cudaMemcpyHostToDevice));
    COMPUTE_SAFE(cudaMemcpy(deviceData, host->data, 3 * sizeof(int), cudaMemcpyHostToDevice));
    COMPUTE_SAFE(cudaMemcpy(&(device->data), &deviceData, sizeof(int*), cudaMemcpyHostToDevice));

    TestKernel <<< 1, 1 >>> (device);
    COMPUTE_SAFE(cudaGetLastError());       // a launch does not return an error code itself
    COMPUTE_SAFE(cudaDeviceSynchronize());  // wait for the kernel; reports errors raised while it ran

    // Copy the struct back (this also brings back the device address in
    // host->data), restore the host array address, then copy the array.
    COMPUTE_SAFE(cudaMemcpy(host, device, sizeof(Test), cudaMemcpyDeviceToHost));
    host->data = hostData;  // Restore host data pointer
    COMPUTE_SAFE(cudaMemcpy(host->data, deviceData, 3 * sizeof(int), cudaMemcpyDeviceToHost));

    printf("\nhost:\n");
    printf("%d %d\n", host->x, host->y);
    printf("%d %d %d\n", host->data[0], host->data[1], host->data[2]);

    // Release device and host allocations.
    COMPUTE_SAFE(cudaFree(deviceData));
    COMPUTE_SAFE(cudaFree(device));
    delete[] hostData;
    delete host;
    return 0;
}

Output:

host:
11 21
2 3 4

Some notes:

For clarity I added () in the kernel increment statements.
You used sizeof(float*) for the data pointer and 3 * sizeof(float) for the array, although they are an int* and ints (the sizes happen to be the same here, so this is not the cause of the crash).