How to allocate memory in structure in CUDA?

I am new in CUDA and I want to build a simple CUDA project that uses a structure that contains a size_t field and an array field of int.

Here is the code. The allocation fails on field "bits".

main.cu

#include "z_cu_t.cuh"

int main()
{
    z_t z = z_init(2);
    z_cu_t* z_cu;
    cudaMalloc(&z_cu, sizeof(z_cu_t));

    z_cu_from_z(z_cu, &z); //Segmentation fault here

    z_free(&z);
    z_cu_free<<<1, 1>>>(z_cu);
    cudaFree(z_cu);
    
    return 0;
}

z_cu_t.cu

#include <stdio.h>

#include <cuda.h>

#include "z_cu_t.cuh"

__global__ void z_cu_free(z_cu_t* z)
{
    cudaFree(z->bits);
    z->bits = 0;
    z->size = 0;
}

void z_cu_from_z(z_cu_t* to, z_t const* from)
{
    cudaMemcpy(&to->size, &from->size, sizeof(size_t), cudaMemcpyHostToDevice);
    printf("OK\n");
    cudaMalloc(&to->bits, sizeof(int) * from->size); //Segmentation fault here
    printf("KO\n");
    cudaMemcpy(to->bits, from->bits, sizeof(int) * from->size, cudaMemcpyHostToDevice);
}

void z_from_z_cu(z_t* to, z_cu_t const* from)
{
    cudaMemcpy(&to->size, &from->size, sizeof(size_t), cudaMemcpyDeviceToHost);
    cudaMalloc(&to->bits, sizeof(int) * to->size);
    cudaMemcpy(to->bits, from->bits, sizeof(int) * to->size, cudaMemcpyDeviceToHost);
}

z_cu_t.cuh

#ifndef Z_CU_T_H
#define Z_CU_T_H

struct z_cu_t_struct
{
    int* bits;
    size_t size;
};

typedef struct z_cu_t_struct z_cu_t;

__global__ void z_cu_free(z_cu_t* z);

extern "C"
{
#include "z_t.h"
}

void z_cu_from_z(z_cu_t* to, z_t const* from);
void z_from_z_cu(z_t* to, z_cu_t const* from);

#endif // Z_CU_T_H

z_t.c

#include <stdlib.h>
#include <string.h>

#include "z_t.h"

z_t z_init(size_t size)
{
    z_t z;

    z.size = size;
    z.bits = malloc(sizeof(int) * size);
    memset(z.bits, 0, sizeof(int) * size);

    return z;
}

void z_free(z_t* z)
{
    free(z->bits);
    z->bits = 0;
    z->size = 0;
}

z_t.h

#ifndef Z_T_H
#define Z_T_H

struct z_t_struct
{
    int* bits;
    size_t size;
};

typedef struct z_t_struct z_t;

z_t z_init(size_t size);
void z_free(z_t* z);

#endif // Z_T_H

Then I compile and link project:

nvcc -c main.cu
nvcc -rdc=true -c z_cu_t.cu
nvcc z_t.o main.o z_cu_t.o -o main

The output is:

OK
Segmentation error (core dumped)

It crashes on the line 18 of "z_cu_t.cu":

    cudaMalloc(&to->bits, sizeof(int) * from->size); //Segmentation fault here

I don't know what is wrong.

Solution

Now that you have fixed the pointer ordering (host vs. device, source vs. destination) the main issue giving rise to the seg fault is the need for a deep-copy pattern.

We will need to implement a deep copy pattern, to make your z_cu_from_z work. You cannot directly do a cudaMalloc operation on an embedded pointer in another structure allocated with cudaMalloc. The pointer that cudaMalloc stores its allocated value in must exist in host memory. Unfixed, this also creates a seg fault. cudaMemcpy also requires pointers that are stored in host memory (even though they refer to locations in device memory, in some cases).

To address that, you could change your z_cu_from_z function as follows:

void z_cu_from_z(z_cu_t* to, z_t const* from)
{
    cudaMemcpy(&to->size, &from->size, sizeof(size_t), cudaMemcpyHostToDevice);
    printf("OK\n");
    int *temp;  // storage in host memory for a device pointer
    cudaMalloc(&temp, sizeof(int) * from->size); //deep copy
    cudaMemcpy(&to->bits, &temp, sizeof(int *), cudaMemcpyHostToDevice);
    printf("KO\n");
    cudaMemcpy(temp, from->bits, sizeof(int) * from->size, cudaMemcpyHostToDevice); // note change here also
}

Once you fix that you will run into trouble later in your code. I always suggest using proper CUDA error checking and run your codes with compute-sanitizer. A basic principle is that the host and device versions of cudaMalloc and cudaFree are not interoperable. A pointer allocated using host cudaMalloc cannot be freed with device cudaFree. Instead we will need to use (somehow) that temp pointer that I created above to facilitate the deep copy. If we don't have that conveniently available, we must recreate it:

void z_cu_free(z_cu_t* z){
  int *temp;
  cudaMemcpy(&temp, &z->bits, sizeof(int *), cudaMemcpyDeviceToHost);
  cudaFree(temp); // host code, host pointer storage of a device pointer
  cudaFree(z);
  }

Use that, or something like it, to replace your kernel call (and your last code line).

That should eliminate all the compute-sanitizer errors in your code you have posted, as well as all seg faults. Your posted code doesn't call z_from_z_cu so I haven't looked at that, but it may require reworking also.

The general pointer-to-pointer pattern here often involves a deep-copy, and is covered in numerous SO items on the cuda tag. Here is one.