I am new in CUDA and I want to build a simple CUDA project that uses a structure that contains a size_t field and an array field of int.
Here is the code. The allocation fails on field "bits".
main.cu
#include "z_cu_t.cuh"
int main()
{
z_t z = z_init(2);
z_cu_t* z_cu;
cudaMalloc(&z_cu, sizeof(z_cu_t));
z_cu_from_z(z_cu, &z); //Segmentation fault here
z_free(&z);
z_cu_free<<<1, 1>>>(z_cu);
cudaFree(z_cu);
return 0;
}
z_cu_t.cu
#include <stdio.h>
#include <cuda.h>
#include "z_cu_t.cuh"
__global__ void z_cu_free(z_cu_t* z)
{
cudaFree(z->bits);
z->bits = 0;
z->size = 0;
}
void z_cu_from_z(z_cu_t* to, z_t const* from)
{
cudaMemcpy(&to->size, &from->size, sizeof(size_t), cudaMemcpyHostToDevice);
printf("OK\n");
cudaMalloc(&to->bits, sizeof(int) * from->size); //Segmentation fault here
printf("KO\n");
cudaMemcpy(to->bits, from->bits, sizeof(int) * from->size, cudaMemcpyHostToDevice);
}
void z_from_z_cu(z_t* to, z_cu_t const* from)
{
cudaMemcpy(&to->size, &from->size, sizeof(size_t), cudaMemcpyDeviceToHost);
cudaMalloc(&to->bits, sizeof(int) * to->size);
cudaMemcpy(to->bits, from->bits, sizeof(int) * to->size, cudaMemcpyDeviceToHost);
}
z_cu_t.cuh
#ifndef Z_CU_T_H
#define Z_CU_T_H
struct z_cu_t_struct
{
int* bits;
size_t size;
};
typedef struct z_cu_t_struct z_cu_t;
__global__ void z_cu_free(z_cu_t* z);
extern "C"
{
#include "z_t.h"
}
void z_cu_from_z(z_cu_t* to, z_t const* from);
void z_from_z_cu(z_t* to, z_cu_t const* from);
#endif // Z_CU_T_H
z_t.c
#include <stdlib.h>
#include <string.h>
#include "z_t.h"
z_t z_init(size_t size)
{
z_t z;
z.size = size;
z.bits = malloc(sizeof(int) * size);
memset(z.bits, 0, sizeof(int) * size);
return z;
}
void z_free(z_t* z)
{
free(z->bits);
z->bits = 0;
z->size = 0;
}
z_t.h
#ifndef Z_T_H
#define Z_T_H
struct z_t_struct
{
int* bits;
size_t size;
};
typedef struct z_t_struct z_t;
z_t z_init(size_t size);
void z_free(z_t* z);
#endif // Z_T_H
Then I compile and link project:
nvcc -c main.cu
nvcc -rdc=true -c z_cu_t.cu
nvcc z_t.o main.o z_cu_t.o -o main
The output is:
OK
Segmentation error (core dumped)
It crashes on the line 18 of "z_cu_t.cu":
cudaMalloc(&to->bits, sizeof(int) * from->size); //Segmentation fault here
I don't know what is wrong.
Now that you have fixed the pointer ordering (host vs. device, source vs. destination) the main issue giving rise to the seg fault is the need for a deep-copy pattern.
We will need to implement a deep copy pattern, to make your z_cu_from_z
work. You cannot directly do a cudaMalloc
operation on an embedded pointer in another structure allocated with cudaMalloc
. The pointer that cudaMalloc
stores its allocated value in must exist in host memory. Unfixed, this also creates a seg fault. cudaMemcpy
also requires pointers that are stored in host memory (even though they refer to locations in device memory, in some cases).
To address that, you could change your z_cu_from_z
function as follows:
void z_cu_from_z(z_cu_t* to, z_t const* from)
{
cudaMemcpy(&to->size, &from->size, sizeof(size_t), cudaMemcpyHostToDevice);
printf("OK\n");
int *temp; // storage in host memory for a device pointer
cudaMalloc(&temp, sizeof(int) * from->size); //deep copy
cudaMemcpy(&to->bits, &temp, sizeof(int *), cudaMemcpyHostToDevice);
printf("KO\n");
cudaMemcpy(temp, from->bits, sizeof(int) * from->size, cudaMemcpyHostToDevice); // note change here also
}
Once you fix that you will run into trouble later in your code. I always suggest using proper CUDA error checking and run your codes with compute-sanitizer
. A basic principle is that the host and device versions of cudaMalloc
and cudaFree
are not interoperable. A pointer allocated using host cudaMalloc
cannot be freed with device cudaFree
. Instead we will need to use (somehow) that temp
pointer that I created above to facilitate the deep copy. If we don't have that conveniently available, we must recreate it:
void z_cu_free(z_cu_t* z){
int *temp;
cudaMemcpy(&temp, &z->bits, sizeof(int *), cudaMemcpyDeviceToHost);
cudaFree(temp); // host code, host pointer storage of a device pointer
cudaFree(z);
}
Use that, or something like it, to replace your kernel call (and your last code line).
That should eliminate all the compute-sanitizer
errors in your code you have posted, as well as all seg faults. Your posted code doesn't call z_from_z_cu so I haven't looked at that, but it may require reworking also.
The general pointer-to-pointer pattern here often involves a deep-copy, and is covered in numerous SO items on the cuda
tag. Here is one.