I wrote a function similar to the one described in this answer to a SO post about freeing with a variadic function in the C language but for CUDA vectors.
Here is the function:
/* Intended to free nAllocVec device pointers obtained from cudaMalloc.
 * NOTE(review): va_start(args, arg1) gives access only to the arguments
 * AFTER the named parameter arg1, so arg1 itself is never freed here, and
 * the loop performs nAllocVec reads from a pack that holds only
 * nAllocVec-1 pointers -- one read past the end (undefined behavior).
 * This is the leak the rest of this document discusses. */
void freeCudaVectors(size_t nAllocVec, void* arg1, ...)
{
va_list args;
void *vp;
va_start(args, arg1); /* positions args at the argument following arg1 */
for (int i = 0; i < nAllocVec; ++i){
vp = va_arg(args, void *);
cudaFree(vp);
}
va_end(args);
}
Here is an (almost) Minimal Working Example (MWE) showing that I get memory leaks with this function. In the example below, I randomly generate two double square matrices and multiply them together. I repeat this product with increasing sizes.
I compile with nvcc -o memory_leak_test memory_leak_test.cu -lcublas -lcurand.
#include<iostream>
#include<cstdarg>
#include<cuda.h>
#include<curand.h>
#include<cublas_v2.h>
#define CURAND_CALL(x) do { if((x)!=CURAND_STATUS_SUCCESS) { \
printf("Error at %s:%d\n",__FILE__,__LINE__);\
exit(EXIT_FAILURE);}} while(0)
#define HANDLE_ERROR(x) do { if((x)!=cudaSuccess) { \
printf("Error at %s:%d\n",__FILE__,__LINE__);\
exit(EXIT_FAILURE);}} while(0)
/* Frees device pointers passed as variadic arguments.
 * NOTE(review): the named parameter arg1 is never passed to cudaFree
 * (va_start only exposes the arguments after it), and the loop reads
 * nAllocVec values from a pack containing nAllocVec-1 -- the source of
 * the leak this MWE demonstrates. */
void freeCudaVectors(size_t nAllocVec, void* arg1, ...)
{
va_list args;
void *vp;
va_start(args, arg1);
for (int i = 0; i < nAllocVec; ++i){
vp = va_arg(args, void *);
cudaFree(vp);
}
va_end(args);
}
/* Allocate device buffers for the three GEMM operands:
 *   *a receives M*K doubles, *b receives K*N doubles, *c receives M*N doubles.
 * Any allocation failure is fatal: HANDLE_ERROR prints the location and
 * calls exit(EXIT_FAILURE). */
void allocMatricesForProduct(double** a, double** b, double** c, size_t M, size_t K, size_t N)
{
    const size_t bytesA = M * K * sizeof(double);
    const size_t bytesB = K * N * sizeof(double);
    const size_t bytesC = M * N * sizeof(double);

    HANDLE_ERROR(cudaMalloc(a, bytesA));
    HANDLE_ERROR(cudaMalloc(b, bytesB));
    HANDLE_ERROR(cudaMalloc(c, bytesC));
}
/* Repro driver: repeatedly allocates two random matrices, multiplies them
 * with cublasDgemm, and frees them via the variadic freeCudaVectors,
 * growing the sizes each iteration so a leak shows up as rising usage. */
int main() {
cublasStatus_t stat;
cublasHandle_t handle;
stat = cublasCreate(&handle);
curandGenerator_t prng;
curandCreateGenerator(&prng, CURAND_RNG_PSEUDO_DEFAULT);
/* Seed from the clock so each run gets different random matrices. */
curandSetPseudoRandomGeneratorSeed(prng, (unsigned long long) clock());
/* NOTE(review): the cuBLAS status is only checked here, after the cuRAND
 * setup calls above, whose return values are not checked at all. */
if (stat != CUBLAS_STATUS_SUCCESS)
{
fprintf(stderr, "handle creation error: %s", cublasGetStatusString(stat));
exit(EXIT_FAILURE);
}
/* NOTE(review): only d_c is initialized to NULL; d_a and d_b start
 * indeterminate (harmless here since they are assigned before use). */
double *d_a, *d_b, *d_c = NULL;
size_t m, k, n;
double one = 1.0; double *alpha = &one;
double zero = 0.0; double *beta = &zero;
m = 100; k = 100; n = 100;
/* m grows in the for-header; k and n grow at the bottom of the body,
 * so all three dimensions increase by 100 each iteration. */
for (m = 100; m <= 10000; m += 100) {
allocMatricesForProduct(&d_a, &d_b, &d_c, m, k, n);
CURAND_CALL(curandGenerateUniformDouble(prng, (double *)d_a, m*k));
CURAND_CALL(curandGenerateUniformDouble(prng, (double *)d_b, k*n));
/* C = 1.0 * A * B + 0.0 * C, with A m-by-k and B k-by-n. */
stat = cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
m, n, k,
alpha, d_a, m,
d_b, k,
beta, d_c, m);
freeCudaVectors(3, d_a, d_b, d_c); /* Memory leak is here */
/* cudaFree(d_a); */
/* cudaFree(d_b); */
/* cudaFree(d_c); */
k += 100;
n += 100;
}
cublasDestroy(handle);
curandDestroyGenerator(prng);
return 0;
}
The program displays more memory usage in nvtop than if I replace freeCudaVectors with the usual cudaFree
(as commented).
Why do I get memory leaks with this variadic function?
You're not freeing the first pointer you pass.
Note the description of va_start():
The va_start macro enables access to the variable arguments following the named argument parm_n.
(emphasis added)
So your first pointer is showing up in void* arg1, which you never free. Then you are trying to iterate three times over a varargs pack that contains only two items.
The following mod fixes things for me, I'm sure there are other fixes possible:
/* Frees nAllocVec device pointers previously obtained from cudaMalloc.
 * The first pointer arrives in the named parameter arg1 -- va_start only
 * exposes the arguments AFTER the named one -- and the remaining
 * nAllocVec-1 pointers are read from the varargs pack.
 * nAllocVec: total pointer count INCLUDING arg1. Passing 0 is a no-op
 * (the original `i < nAllocVec-1` would wrap to SIZE_MAX for 0 and also
 * compared a signed int against a size_t; both are fixed below). */
void freeCudaVectors(size_t nAllocVec, void* arg1, ...)
{
    va_list args;

    if (nAllocVec == 0)
        return;               /* guard: nAllocVec-1 would underflow */

    cudaFree(arg1);           /* the named argument is not in the pack */

    va_start(args, arg1);
    for (size_t i = 1; i < nAllocVec; ++i) {  /* start at 1: arg1 done */
        cudaFree(va_arg(args, void *));
    }
    va_end(args);
}
Here is my test case:
$ cat t42.cu
#include<iostream>
#include<cstdarg>
#include<cuda.h>
#include<curand.h>
#include<cublas_v2.h>
#define CURAND_CALL(x) do { if((x)!=CURAND_STATUS_SUCCESS) { \
printf("Error at %s:%d\n",__FILE__,__LINE__);\
exit(EXIT_FAILURE);}} while(0)
#define HANDLE_ERROR(x) do { if((x)!=cudaSuccess) { \
printf("Error at %s:%d\n",__FILE__,__LINE__);\
exit(EXIT_FAILURE);}} while(0)
/* Frees nAllocVec device pointers: the named arg1 explicitly, then the
 * remaining nAllocVec-1 pointers from the varargs pack (va_start exposes
 * only the arguments after the named parameter).
 * NOTE(review): if nAllocVec were 0, nAllocVec-1 (size_t) would wrap to
 * SIZE_MAX -- the caller below always passes 3, so it is not hit here. */
void freeCudaVectors(size_t nAllocVec, void *arg1, ...)
{
va_list args;
void *vp;
va_start(args, arg1);
cudaFree(arg1); /* the named argument is not part of the pack */
for (int i = 0; i < nAllocVec-1; ++i){
vp = va_arg(args, void *);
cudaFree(vp);
}
va_end(args);
}
/* Allocates the three device operands for the product: *a gets M*K
 * doubles, *b gets K*N, *c gets M*N. HANDLE_ERROR exits on failure. */
void allocMatricesForProduct(double** a, double** b, double** c, size_t M, size_t K, size_t N)
{
HANDLE_ERROR(cudaMalloc(a, M*K*sizeof(double)) );
HANDLE_ERROR(cudaMalloc(b, K*N*sizeof(double)) );
HANDLE_ERROR(cudaMalloc(c, M*N*sizeof(double)) );
}
/* Test driver: same loop as the question's MWE but capped at m = 500 and
 * with cudaMemGetInfo printed each iteration -- constant free-memory
 * output demonstrates the fixed freeCudaVectors no longer leaks. */
int main() {
cublasStatus_t stat;
cublasHandle_t handle;
stat = cublasCreate(&handle);
curandGenerator_t prng;
curandCreateGenerator(&prng, CURAND_RNG_PSEUDO_DEFAULT);
/* clock()-based seed: different random matrices each run */
curandSetPseudoRandomGeneratorSeed(prng, (unsigned long long) clock());
if (stat != CUBLAS_STATUS_SUCCESS)
{
fprintf(stderr, "handle creation error: %s", cublasGetStatusString(stat));
exit(EXIT_FAILURE);
}
double *d_a, *d_b, *d_c = NULL;
size_t m, k, n;
double one = 1.0; double *alpha = &one;
double zero = 0.0; double *beta = &zero;
m = 100; k = 100; n = 100;
/* all three dimensions grow by 100 per iteration: m in the header,
 * k and n at the bottom of the body */
for (m = 100; m <= 500; m += 100) {
allocMatricesForProduct(&d_a, &d_b, &d_c, m, k, n);
CURAND_CALL(curandGenerateUniformDouble(prng, (double *)d_a, m*k));
CURAND_CALL(curandGenerateUniformDouble(prng, (double *)d_b, k*n));
/* C = 1.0 * A * B + 0.0 * C, A m-by-k, B k-by-n */
stat = cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
m, n, k,
alpha, d_a, m,
d_b, k,
beta, d_c, m);
freeCudaVectors(3, d_a, d_b, d_c); /* Memory leak is here */
/* cudaFree(d_a); */
/* cudaFree(d_b); */
/* cudaFree(d_c); */
k += 100;
n += 100;
/* report free device memory so a leak would be visible per-iteration */
size_t total_mem, free_mem;
cudaMemGetInfo(&free_mem, &total_mem);
std::cout << "free mem: " << free_mem << std::endl;
}
cublasDestroy(handle);
curandDestroyGenerator(prng);
return 0;
}
$ nvcc -o t42 t42.cu -lcurand -lcublas
$ compute-sanitizer ./t42
========= COMPUTE-SANITIZER
free mem: 4778885120
free mem: 4778885120
free mem: 4778885120
free mem: 4778885120
free mem: 4778885120
========= ERROR SUMMARY: 0 errors
$
(CUDA 12.1, GTX 1660 Super)
Any time you are having trouble with a CUDA code, I strongly encourage the use of proper CUDA error checking and run your code with compute-sanitizer
as I have demonstrated above. If you had done that, the tool would have indicated to you that you are attempting to free a pointer that the CUDA runtime does not recognize. While that isn't a complete description of the problem here, it is certainly a useful clue.
(I also note in the answer you linked they are freeing the named pointer explicitly, then iterating through the pack.)