Search code examples
memory-leaks · cuda · nvidia · variadic-functions

Freeing memory with a variadic function in CUDA


I wrote a function — similar to the one described in this answer to a SO post about freeing memory with a variadic function in the C language — but for CUDA device vectors.

Here is the function:

/* Free nAllocVec device allocations passed as a variadic list of pointers.
 *
 * BUG: va_start(args, arg1) exposes only the arguments that FOLLOW the
 * named parameter arg1, so arg1 itself is never handed to cudaFree here.
 * In addition, the loop reads nAllocVec values from a va pack that only
 * contains nAllocVec - 1 entries, so the final va_arg read is undefined
 * behaviour. */
void freeCudaVectors(size_t nAllocVec, void* arg1, ...)
{
  va_list args;
  void *vp;
  va_start(args, arg1);
  for (int i = 0; i < nAllocVec; ++i){  /* reads one past the end of the pack */
    vp = va_arg(args, void *);
    cudaFree(vp);                       /* arg1 is never freed */
  }
  va_end(args);
}

Here is an (almost) Minimal Working Example (MWE) showing that I get memory leaks with this function. In the example below, I randomly generate two double square matrices and multiply them together. I repeat this product with increasing sizes.

I compile with nvcc -o memory_leak_test memory_leak_test.cu -lcublas -lcurand.

#include<iostream>
#include<cstdarg>
#include<cuda.h>
#include<curand.h>
#include<cublas_v2.h>

#define CURAND_CALL(x) do { if((x)!=CURAND_STATUS_SUCCESS) { \
    printf("Error at %s:%d\n",__FILE__,__LINE__);\
    exit(EXIT_FAILURE);}} while(0)

#define HANDLE_ERROR(x) do { if((x)!=cudaSuccess) { \
    printf("Error at %s:%d\n",__FILE__,__LINE__);\
    exit(EXIT_FAILURE);}} while(0)

/* Free nAllocVec device allocations passed as a variadic list of pointers.
 *
 * BUG (deliberately reproduced for this MWE): va_start(args, arg1) makes
 * va_arg yield only the arguments after arg1, so arg1 is never freed, and
 * the loop pulls nAllocVec pointers from a pack holding nAllocVec - 1 --
 * the last va_arg read is undefined behaviour. */
void freeCudaVectors(size_t nAllocVec, void* arg1, ...)
{
  va_list args;
  void *vp;
  va_start(args, arg1);
  for (int i = 0; i < nAllocVec; ++i){  /* over-reads the va pack by one */
    vp = va_arg(args, void *);
    cudaFree(vp);                       /* arg1 is never freed */
  }
  va_end(args);
}

/* Allocate three device matrices for the product C = A*B:
 * A is M x K, B is K x N, C is M x N (doubles).
 * Each cudaMalloc is wrapped in HANDLE_ERROR, which exits on failure. */
void allocMatricesForProduct(double** a, double** b, double** c, size_t M, size_t K, size_t N)
{
  HANDLE_ERROR(cudaMalloc(a, M*K*sizeof(double)) );
  HANDLE_ERROR(cudaMalloc(b, K*N*sizeof(double)) );
  HANDLE_ERROR(cudaMalloc(c, M*N*sizeof(double)) );
}

/* MWE driver: repeatedly allocates growing matrices, fills A and B with
 * uniform random doubles (cuRAND), multiplies them with cublasDgemm, then
 * frees them through the buggy variadic helper -- demonstrating the leak. */
int main() {
  cublasStatus_t stat;
  cublasHandle_t handle;
  stat = cublasCreate(&handle);

  curandGenerator_t prng;
  /* NOTE(review): these two curand calls are not error-checked, although
   * the CURAND_CALL macro is available and used below. */
  curandCreateGenerator(&prng, CURAND_RNG_PSEUDO_DEFAULT);
  curandSetPseudoRandomGeneratorSeed(prng, (unsigned long long) clock());

  /* NOTE(review): the cublasCreate status is only checked after the
   * curand setup above, so a failed handle is used for seeding first. */
  if (stat != CUBLAS_STATUS_SUCCESS)
  {
    fprintf(stderr, "handle creation error: %s", cublasGetStatusString(stat));
    exit(EXIT_FAILURE);
  }

  double *d_a, *d_b, *d_c = NULL;  /* only d_c is initialized to NULL here */
  size_t m, k, n;

  double one = 1.0; double *alpha = &one;
  double zero = 0.0; double *beta = &zero;

  m = 100; k = 100; n = 100;
  for (m = 100; m <= 10000; m += 100) {
    /* Fresh device allocations each iteration; sizes grow with m, k, n. */
    allocMatricesForProduct(&d_a, &d_b, &d_c, m, k, n);
    CURAND_CALL(curandGenerateUniformDouble(prng, (double *)d_a, m*k));
    CURAND_CALL(curandGenerateUniformDouble(prng, (double *)d_b, k*n));
    /* C(m x n) = alpha * A(m x k) * B(k x n) + beta * C; column-major,
     * leading dimensions m, k, m. NOTE(review): stat is not checked. */
    stat = cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
        m, n, k,
        alpha, d_a, m,
        d_b, k,
        beta, d_c, m);

    freeCudaVectors(3, d_a, d_b, d_c); /* Memory leak is here */
    /* cudaFree(d_a); */
    /* cudaFree(d_b); */
    /* cudaFree(d_c); */
    k += 100;
    n += 100;
  }

  cublasDestroy(handle);
  curandDestroyGenerator(prng);
  return 0;
}

The program displays more memory usage in nvtop than if I replace freeCudaVectors with the usual cudaFree (as commented).

Why do I get memory leaks with this variadic function?


Solution

  • You're not freeing the first pointer you pass.

    Note the description of va_start():

    The va_start macro enables access to the variable arguments following the named argument parm_n.

    (emphasis added)

    So your first pointer is showing up in `void* arg1`, which you never free. Then you are trying to iterate three times into a varargs pack of two items.

    The following mod fixes things for me, I'm sure there are other fixes possible:

    /* Free nAllocVec device allocations passed by value as a variadic
     * pointer list.
     *
     * va_start(args, arg1) exposes only the arguments AFTER the named
     * parameter, so arg1 must be freed explicitly and the va_arg loop must
     * consume exactly nAllocVec - 1 further pointers.
     *
     * nAllocVec == 0 is tolerated (nothing is freed). Guarding it matters:
     * an `int i < nAllocVec - 1` comparison against the unsigned size_t
     * would underflow to SIZE_MAX for nAllocVec == 0 and over-read the
     * argument pack (undefined behaviour).
     *
     * NULL entries are safe: cudaFree(0) performs no operation. */
    void freeCudaVectors(size_t nAllocVec, void* arg1, ...)
    {
      va_list args;
      va_start(args, arg1);
      if (nAllocVec > 0) {
        cudaFree(arg1);  /* the named argument is not part of the va pack */
        for (size_t i = 1; i < nAllocVec; ++i) {
          void *vp = va_arg(args, void *);
          cudaFree(vp);
        }
      }
      va_end(args);
    }
    

    Here is my test case:

    $ cat t42.cu
    #include<iostream>
    #include<cstdarg>
    #include<cuda.h>
    #include<curand.h>
    #include<cublas_v2.h>
    
    #define CURAND_CALL(x) do { if((x)!=CURAND_STATUS_SUCCESS) { \
        printf("Error at %s:%d\n",__FILE__,__LINE__);\
        exit(EXIT_FAILURE);}} while(0)
    
    #define HANDLE_ERROR(x) do { if((x)!=cudaSuccess) { \
        printf("Error at %s:%d\n",__FILE__,__LINE__);\
        exit(EXIT_FAILURE);}} while(0)
    
    /* Corrected helper: frees the named arg1 explicitly (va_start exposes
     * only the arguments after it), then iterates the remaining
     * nAllocVec - 1 pointers out of the va pack. */
    void freeCudaVectors(size_t nAllocVec, void *arg1, ...)
    {
      va_list args;
      void *vp;
      va_start(args, arg1);
      cudaFree(arg1);  /* the named argument is not in the pack */
      /* NOTE(review): int vs size_t -- nAllocVec-1 underflows to SIZE_MAX
       * if nAllocVec == 0; callers must pass nAllocVec >= 1. */
      for (int i = 0; i < nAllocVec-1; ++i){
        vp = va_arg(args, void *);
        cudaFree(vp);
      }
      va_end(args);
    }
    
    /* Allocate three device matrices for C = A*B:
     * A is M x K, B is K x N, C is M x N (doubles).
     * Each cudaMalloc is wrapped in HANDLE_ERROR, which exits on failure. */
    void allocMatricesForProduct(double** a, double** b, double** c, size_t M, size_t K, size_t N)
    {
      HANDLE_ERROR(cudaMalloc(a, M*K*sizeof(double)) );
      HANDLE_ERROR(cudaMalloc(b, K*N*sizeof(double)) );
      HANDLE_ERROR(cudaMalloc(c, M*N*sizeof(double)) );
    }
    
    /* Test-case driver: same structure as the question's MWE but with a
     * shorter loop (m up to 500) and a cudaMemGetInfo printout per
     * iteration, showing free device memory stays constant once the
     * variadic free is fixed. */
    int main() {
      cublasStatus_t stat;
      cublasHandle_t handle;
      stat = cublasCreate(&handle);
    
      curandGenerator_t prng;
      /* NOTE(review): curand setup calls are not error-checked here. */
      curandCreateGenerator(&prng, CURAND_RNG_PSEUDO_DEFAULT);
      curandSetPseudoRandomGeneratorSeed(prng, (unsigned long long) clock());
    
      if (stat != CUBLAS_STATUS_SUCCESS)
      {
        fprintf(stderr, "handle creation error: %s", cublasGetStatusString(stat));
        exit(EXIT_FAILURE);
      }
    
      double *d_a, *d_b, *d_c = NULL;  /* only d_c is initialized to NULL */
      size_t m, k, n;
    
      double one = 1.0; double *alpha = &one;
      double zero = 0.0; double *beta = &zero;
    
      m = 100; k = 100; n = 100;
      for (m = 100; m <= 500; m += 100) {
        allocMatricesForProduct(&d_a, &d_b, &d_c, m, k, n);
        CURAND_CALL(curandGenerateUniformDouble(prng, (double *)d_a, m*k));
        CURAND_CALL(curandGenerateUniformDouble(prng, (double *)d_b, k*n));
        /* C(m x n) = alpha * A * B + beta * C; column-major layout. */
        stat = cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
            m, n, k,
            alpha, d_a, m,
            d_b, k,
            beta, d_c, m);
    
        freeCudaVectors(3, d_a, d_b, d_c); /* Memory leak is here */
        /* cudaFree(d_a); */
        /* cudaFree(d_b); */
        /* cudaFree(d_c); */
        k += 100;
        n += 100;
        size_t total_mem, free_mem;
        /* Report free device memory; constant output across iterations
         * indicates all allocations were released. */
        cudaMemGetInfo(&free_mem, &total_mem);
        std::cout << "free mem: " << free_mem << std::endl;
      }
    
      cublasDestroy(handle);
      curandDestroyGenerator(prng);
      return 0;
    }
    $ nvcc -o t42 t42.cu -lcurand -lcublas
    $ compute-sanitizer ./t42
    ========= COMPUTE-SANITIZER
    free mem: 4778885120
    free mem: 4778885120
    free mem: 4778885120
    free mem: 4778885120
    free mem: 4778885120
    ========= ERROR SUMMARY: 0 errors
    $
    

    (CUDA 12.1, GTX 1660 Super)

    Any time you are having trouble with a CUDA code, I strongly encourage the use of proper CUDA error checking and run your code with compute-sanitizer as I have demonstrated above. If you had done that, the tool would have indicated to you that you are attempting to free a pointer that the CUDA runtime does not recognize. While that isn't a complete description of the problem here, it is certainly a useful clue.

    (I also note in the answer you linked they are freeing the named pointer explicitly, then iterating through the pack.)