I wrote a function similar to the one described in this answer to a SO post about freeing with a variadic function in the C language but for CUDA vectors.
Here is the function:
/* Intended to free nAllocVec device pointers obtained from cudaMalloc.
 * NOTE(review): va_start(args, arg1) gives access only to the arguments
 * AFTER the named parameter arg1, so arg1 itself is never freed here, and
 * the loop performs nAllocVec reads from a pack that holds only
 * nAllocVec-1 pointers -- one read past the end (undefined behavior).
 * This is the leak the rest of this document discusses. */
void freeCudaVectors(size_t nAllocVec, void* arg1, ...)
{
va_list args;
void *vp;
va_start(args, arg1); /* positions args at the argument following arg1 */
for (int i = 0; i < nAllocVec; ++i){
vp = va_arg(args, void *);
cudaFree(vp);
}
va_end(args);
}
Here is an (almost) Minimal Working Example (MWE) showing that I get memory leaks with this function. In the example below, I randomly generate two double square matrices and multiply them together. I repeat this product with increasing sizes.
I compile with nvcc -o memory_leak_test memory_leak_test.cu -lcublas -lcurand.
#include<iostream>
#include<cstdarg>
#include<cuda.h>
#include<curand.h>
#include<cublas_v2.h>
#define CURAND_CALL(x) do { if((x)!=CURAND_STATUS_SUCCESS) { \
printf("Error at %s:%d\n",__FILE__,__LINE__);\
exit(EXIT_FAILURE);}} while(0)
#define HANDLE_ERROR(x) do { if((x)!=cudaSuccess) { \
printf("Error at %s:%d\n",__FILE__,__LINE__);\
exit(EXIT_FAILURE);}} while(0)
/* Frees device pointers passed as variadic arguments.
 * NOTE(review): the named parameter arg1 is never passed to cudaFree
 * (va_start only exposes the arguments after it), and the loop reads
 * nAllocVec values from a pack containing nAllocVec-1 -- the source of
 * the leak this MWE demonstrates. */
void freeCudaVectors(size_t nAllocVec, void* arg1, ...)
{
va_list args;
void *vp;
va_start(args, arg1);
for (int i = 0; i < nAllocVec; ++i){
vp = va_arg(args, void *);
cudaFree(vp);
}
va_end(args);
}
/* Allocate device buffers for the three GEMM operands:
 *   *a receives M*K doubles, *b receives K*N doubles, *c receives M*N doubles.
 * Any allocation failure is fatal: HANDLE_ERROR prints the location and
 * calls exit(EXIT_FAILURE). */
void allocMatricesForProduct(double** a, double** b, double** c, size_t M, size_t K, size_t N)
{
    const size_t bytesA = M * K * sizeof(double);
    const size_t bytesB = K * N * sizeof(double);
    const size_t bytesC = M * N * sizeof(double);

    HANDLE_ERROR(cudaMalloc(a, bytesA));
    HANDLE_ERROR(cudaMalloc(b, bytesB));
    HANDLE_ERROR(cudaMalloc(c, bytesC));
}
/* Repro driver: repeatedly allocates two random matrices, multiplies them
 * with cublasDgemm, and frees them via the variadic freeCudaVectors,
 * growing the sizes each iteration so a leak shows up as rising usage. */
int main() {
cublasStatus_t stat;
cublasHandle_t handle;
stat = cublasCreate(&handle);
curandGenerator_t prng;
curandCreateGenerator(&prng, CURAND_RNG_PSEUDO_DEFAULT);
/* Seed from the clock so each run gets different random matrices. */
curandSetPseudoRandomGeneratorSeed(prng, (unsigned long long) clock());
/* NOTE(review): the cuBLAS status is only checked here, after the cuRAND
 * setup calls above, whose return values are not checked at all. */
if (stat != CUBLAS_STATUS_SUCCESS)
{
fprintf(stderr, "handle creation error: %s", cublasGetStatusString(stat));
exit(EXIT_FAILURE);
}
/* NOTE(review): only d_c is initialized to NULL; d_a and d_b start
 * indeterminate (harmless here since they are assigned before use). */
double *d_a, *d_b, *d_c = NULL;
size_t m, k, n;
double one = 1.0; double *alpha = &one;
double zero = 0.0; double *beta = &zero;
m = 100; k = 100; n = 100;
/* m grows in the for-header; k and n grow at the bottom of the body,
 * so all three dimensions increase by 100 each iteration. */
for (m = 100; m <= 10000; m += 100) {
allocMatricesForProduct(&d_a, &d_b, &d_c, m, k, n);
CURAND_CALL(curandGenerateUniformDouble(prng, (double *)d_a, m*k));
CURAND_CALL(curandGenerateUniformDouble(prng, (double *)d_b, k*n));
/* C = 1.0 * A * B + 0.0 * C, with A m-by-k and B k-by-n. */
stat = cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
m, n, k,
alpha, d_a, m,
d_b, k,
beta, d_c, m);
freeCudaVectors(3, d_a, d_b, d_c); /* Memory leak is here */
/* cudaFree(d_a); */
/* cudaFree(d_b); */
/* cudaFree(d_c); */
k += 100;
n += 100;
}
cublasDestroy(handle);
curandDestroyGenerator(prng);
return 0;
}
The program displays more memory usage in nvtop than if I replace freeCudaVectors with the usual cudaFree
(as commented).
Why do I get memory leaks with this variadic function?
You're not freeing the first pointer you pass.
Note the description of va_start():
The va_start macro enables access to the variable arguments following the named argument parm_n.
(emphasis added)
So your first pointer is showing up in void* arg1, which you never free. Then you are trying to iterate three times over a varargs pack that contains only two items.
The following mod fixes things for me, I'm sure there are other fixes possible:
/* Frees nAllocVec device pointers previously obtained from cudaMalloc.
 * The first pointer arrives in the named parameter arg1 -- va_start only
 * exposes the arguments AFTER the named one -- and the remaining
 * nAllocVec-1 pointers are read from the varargs pack.
 * nAllocVec: total pointer count INCLUDING arg1. Passing 0 is a no-op
 * (the original `i < nAllocVec-1` would wrap to SIZE_MAX for 0 and also
 * compared a signed int against a size_t; both are fixed below). */
void freeCudaVectors(size_t nAllocVec, void* arg1, ...)
{
    va_list args;

    if (nAllocVec == 0)
        return;               /* guard: nAllocVec-1 would underflow */

    cudaFree(arg1);           /* the named argument is not in the pack */

    va_start(args, arg1);
    for (size_t i = 1; i < nAllocVec; ++i) {  /* start at 1: arg1 done */
        cudaFree(va_arg(args, void *));
    }
    va_end(args);
}
Here is my test case:
$ cat t42.cu
#include<iostream>
#include<cstdarg>
#include<cuda.h>
#include<curand.h>
#include<cublas_v2.h>
#define CURAND_CALL(x) do { if((x)!=CURAND_STATUS_SUCCESS) { \
printf("Error at %s:%d\n",__FILE__,__LINE__);\
exit(EXIT_FAILURE);}} while(0)
#define HANDLE_ERROR(x) do { if((x)!=cudaSuccess) { \
printf("Error at %s:%d\n",__FILE__,__LINE__);\
exit(EXIT_FAILURE);}} while(0)
/* Frees nAllocVec device pointers: the named arg1 explicitly, then the
 * remaining nAllocVec-1 pointers from the varargs pack (va_start exposes
 * only the arguments after the named parameter).
 * NOTE(review): if nAllocVec were 0, nAllocVec-1 (size_t) would wrap to
 * SIZE_MAX -- the caller below always passes 3, so it is not hit here. */
void freeCudaVectors(size_t nAllocVec, void *arg1, ...)
{
va_list args;
void *vp;
va_start(args, arg1);
cudaFree(arg1); /* the named argument is not part of the pack */
for (int i = 0; i < nAllocVec-1; ++i){
vp = va_arg(args, void *);
cudaFree(vp);
}
va_end(args);
}
/* Allocates the three device operands for the product: *a gets M*K
 * doubles, *b gets K*N, *c gets M*N. HANDLE_ERROR exits on failure. */
void allocMatricesForProduct(double** a, double** b, double** c, size_t M, size_t K, size_t N)
{
HANDLE_ERROR(cudaMalloc(a, M*K*sizeof(double)) );
HANDLE_ERROR(cudaMalloc(b, K*N*sizeof(double)) );
HANDLE_ERROR(cudaMalloc(c, M*N*sizeof(double)) );
}
/* Test driver: same loop as the question's MWE but capped at m = 500 and
 * with cudaMemGetInfo printed each iteration -- constant free-memory
 * output demonstrates the fixed freeCudaVectors no longer leaks. */
int main() {
cublasStatus_t stat;
cublasHandle_t handle;
stat = cublasCreate(&handle);
curandGenerator_t prng;
curandCreateGenerator(&prng, CURAND_RNG_PSEUDO_DEFAULT);
/* clock()-based seed: different random matrices each run */
curandSetPseudoRandomGeneratorSeed(prng, (unsigned long long) clock());
if (stat != CUBLAS_STATUS_SUCCESS)
{
fprintf(stderr, "handle creation error: %s", cublasGetStatusString(stat));
exit(EXIT_FAILURE);
}
double *d_a, *d_b, *d_c = NULL;
size_t m, k, n;
double one = 1.0; double *alpha = &one;
double zero = 0.0; double *beta = &zero;
m = 100; k = 100; n = 100;
/* all three dimensions grow by 100 per iteration: m in the header,
 * k and n at the bottom of the body */
for (m = 100; m <= 500; m += 100) {
allocMatricesForProduct(&d_a, &d_b, &d_c, m, k, n);
CURAND_CALL(curandGenerateUniformDouble(prng, (double *)d_a, m*k));
CURAND_CALL(curandGenerateUniformDouble(prng, (double *)d_b, k*n));
/* C = 1.0 * A * B + 0.0 * C, A m-by-k, B k-by-n */
stat = cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
m, n, k,
alpha, d_a, m,
d_b, k,
beta, d_c, m);
freeCudaVectors(3, d_a, d_b, d_c); /* Memory leak is here */
/* cudaFree(d_a); */
/* cudaFree(d_b); */
/* cudaFree(d_c); */
k += 100;
n += 100;
/* report free device memory so a leak would be visible per-iteration */
size_t total_mem, free_mem;
cudaMemGetInfo(&free_mem, &total_mem);
std::cout << "free mem: " << free_mem << std::endl;
}
cublasDestroy(handle);
curandDestroyGenerator(prng);
return 0;
}
$ nvcc -o t42 t42.cu -lcurand -lcublas
$ compute-sanitizer ./t42
========= COMPUTE-SANITIZER
free mem: 4778885120
free mem: 4778885120
free mem: 4778885120
free mem: 4778885120
free mem: 4778885120
========= ERROR SUMMARY: 0 errors
$
(CUDA 12.1, GTX 1660 Super)
Any time you are having trouble with a CUDA code, I strongly encourage the use of proper CUDA error checking and run your code with compute-sanitizer
as I have demonstrated above. If you had done that, the tool would have indicated to you that you are attempting to free a pointer that the CUDA runtime does not recognize. While that isn't a complete description of the problem here, it is certainly a useful clue.
(I also note in the answer you linked they are freeing the named pointer explicitly, then iterating through the pack.)