How do you allocate GPU memory in a separate CUDA function?

I'm new to CUDA and sure that I'm doing something that's simple enough to fix, but I'm also not sure what to exactly search to find an answer. I've tried looking around but to no avail.

I have a few functions in my code that I want to perform matrix operations with, so instead of writing the code to allocate the memory multiple times, I want to use a function to do that for me. My issue is that the memory location is not being passed back to the function calling my MatrixInitCUDA function.

If I directly allocate the memory in my matrix functions it works as expected, but the issue I'm running into is that my pointer to device memory is only being assigned to the pointer inside of the MatrixInitCUDA function.

Initially I thought that there might have been some kind of type conversion of the arguments, so I included the typeinfo header and printed out the type of the device argument before and after cudaMalloc (no change - not surprising). I've tried passing in double pointers for the device matrix arguments but that doesn't seem to work either, although I'm not I did it properly either.

// Compile using nvcc <file> -lcublas -o <output>
#include <cublas_v2.h>
#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>
#include <typeinfo>

// Define block size for thread allocation
#define BLOCK_DIM 32
#define N 10

typedef struct _matrixSize // Optional Command-line multiplier for matrix sizes
{
    unsigned int A_height, A_width, B_height, B_width, C_height, C_width;
} MatrixSize;

void SetMatrixSize(MatrixSize *matrixSize,
                   unsigned int widthA, unsigned int heightA,
                   unsigned int widthB, unsigned int heightB,
                   unsigned int widthC, unsigned int heightC)
{
    matrixSize->A_height = heightA;
    matrixSize->A_width = widthA;
    matrixSize->B_height = heightB;
    matrixSize->B_width = widthB;
    matrixSize->C_height = heightC;
    matrixSize->C_width = widthC;
}

void MatrixInitCUDA(int argc, char **argv, int &devID, MatrixSize *matrixSize,
                    float *host_matrixA, float *host_matrixB, float *host_matrixC,
                    float *dev_matrixA, float *dev_matrixB, float *dev_matrixC)
{
    // Assign CUDA variables
    devID = 0;
    cudaGetDevice(&devID);
    cudaError_t err;

    // Assign size variables
    size_t matrixA_size = matrixSize->A_height * matrixSize->A_width * sizeof(float);
    printf("Allocation size: %d\tMatrix Size: %d\n", (int) matrixA_size, matrixSize->A_height * matrixSize->A_width);
    size_t matrixB_size = matrixSize->B_height * matrixSize->B_width * sizeof(float);
    size_t matrixC_size = matrixSize->C_height * matrixSize->C_width * sizeof(float);
    printf("PRE ALLOC TYPE: %s\n", typeid(typeof(dev_matrixA)).name());
    // Allocate memory on GPU
    err = cudaMalloc((void **) &dev_matrixA, matrixA_size);
    printf("POST ALLOC TYPE: %s\n", typeid(typeof(dev_matrixA)).name());
    printf("DEV A POST ALLOC: %p\n", dev_matrixA);
    if (err != cudaSuccess) printf("Allocate matrix A: %s\n", cudaGetErrorString(err));
    err = cudaMalloc((void **) &dev_matrixB, matrixB_size);
    if (err != cudaSuccess) printf("Allocate matrix B: %s\n", cudaGetErrorString(err));
    err = cudaMalloc((void **) &dev_matrixC, matrixC_size);
    if (err != cudaSuccess) printf("Allocate matrix C: %s\n", cudaGetErrorString(err));

    // Copy data from host PC to GPU
    err = cudaMemcpy(dev_matrixA, host_matrixA, matrixA_size, cudaMemcpyHostToDevice);
    if (err != cudaSuccess) printf("Copy matrix A to GPU: %s\n", cudaGetErrorString(err));
    err =cudaMemcpy(dev_matrixB, host_matrixB, matrixB_size, cudaMemcpyHostToDevice);
    if (err != cudaSuccess) printf("Copy matrix B to GPU: %s\n", cudaGetErrorString(err));
    err =cudaMemcpy(dev_matrixC, host_matrixC, matrixC_size, cudaMemcpyHostToDevice);
    if (err != cudaSuccess) printf("Copy matrix C to GPU: %s\n", cudaGetErrorString(err));
}

int main(int argc, char **argv)
{
    // Create memory for Layer 1, Layer 2, Layer 3 vectors
    // float *layer1 = malloc(784*sizeof(floats)))
    // Create memory for Weight 1->2, Weight 2->3 matrices

    // Layer 1 will read from file for input (X) values
    // Layer 2 and 3 will be calculated
    int devID = 0;
    cudaGetDevice(&devID);

    // Testing hadamard product, init function, and set matrix size function
    float *host_A, *host_B, *host_C, *dev_A = NULL, *dev_B = NULL, *dev_C = NULL;
    MatrixSize *mallocTest = (MatrixSize *) calloc(sizeof(MatrixSize), 1);
    size_t calcSize = N * N * sizeof(float);
    host_A = (float *) calloc(calcSize, 1);
    host_B = (float *) calloc(calcSize, 1);
    host_C = (float *) calloc(calcSize, 1);
    SetMatrixSize(mallocTest, N, N, N, N, N, N);

    printf("DEV A PRE ALLOC: %p\n", dev_A);

    // Initialize memory on GPU
    MatrixInitCUDA(argc, argv, devID, mallocTest,
                   host_A, host_B, host_C,
                   dev_A, dev_B, dev_C);

    printf("DEV A POST INIT: %p\n", dev_A);
    return 0;
}

Here's the output I get if I compile and run this code:

DEV A PRE ALLOC: (nil)
Allocation size: 400    Matrix Size: 100
PRE ALLOC TYPE: Pf
POST ALLOC TYPE: Pf
DEV A POST ALLOC: 0x10208400000
DEV A POST INIT: (nil)

Solution

There are multiple ways using which the desired behavior can be achieved.

Method 1

One of the ways is to modify the MatrixInitCUDA arguments to accept double pointers (**) for device pointers and modify the code as follows:

Modify the function signature:

void MatrixInitCUDA(int argc, char **argv, int &devID, MatrixSize *matrixSize,
                    float *host_matrixA, float *host_matrixB, float *host_matrixC,
                    float **dev_matrixA, float **dev_matrixB, float **dev_matrixC)
{
}

Allocate device memory as follows inside MatrixInitCUDA:

err = cudaMalloc((void **) dev_matrixA, matrixA_size);

Call MatrixInitCUDA from main like this:

MatrixInitCUDA(argc, argv, devID, mallocTest,
                   host_A, host_B, host_C,
                   &dev_A, &dev_B, &dev_C);

Method 2

My personal favorite way is that don't do any of the above and just modify the function signature to accept references for device pointers as follows:

void MatrixInitCUDA(int argc, char **argv, int &devID, MatrixSize *matrixSize,
                    float *host_matrixA, float *host_matrixB, float *host_matrixC,
                    float *&dev_matrixA, float *&dev_matrixB, float *&dev_matrixC)
{
}