cuda · cuda-gdb · nvprof

Checking which entities currently reside in GPU memory


What would be the easiest way of checking which entities allocated with cudaMalloc() currently reside on a GPU device, and how large they are? I want to find a memory leak inside a function: if it is called just once and the program exits, there is no memory leak (checked via cuda-memcheck), but if it is called multiple times the memory footprint gets bigger and bigger.

Nsight Visual Profiler seems too complex for what I'm asking, and cuda-memcheck finds no leak!
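
For a coarse first check, the overall device footprint can be watched between calls with cudaMemGetInfo. Below is a minimal sketch of that idea; leaky_function is a made-up stand-in for the function under suspicion, and the free-memory figure is only a rough indicator of the real allocation pattern:

    #include <iostream>
    #include <cuda_runtime.h>

    // Stand-in for the function suspected of leaking: it allocates device
    // memory and never frees it, so the free-memory figure keeps shrinking.
    void leaky_function()
    {
        void* p = nullptr;
        cudaMalloc(&p, 1 << 20);    // 1 MiB that is never released
    }

    // Report how much device memory is currently free.
    void report_free_memory(const char* label)
    {
        size_t free_bytes = 0, total_bytes = 0;
        cudaMemGetInfo(&free_bytes, &total_bytes);
        std::cout << label << ": " << free_bytes << " bytes free of "
                  << total_bytes << std::endl;
    }

    int main(void)
    {
        report_free_memory("before");
        for (int i = 0; i < 5; ++i) {
            leaky_function();
            report_free_memory("after call");
        }
        return 0;
    }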


Solution

  • There is no way to do this with the CUDA APIs. If you want this information, you will need to build your own instrumentation which wraps the CUDA memory allocation/deallocation APIs that your code calls. The simplest implementation could look something like this:

    #include <iostream>
    #include <vector>
    #include <algorithm>
    #include <utility>
    
    // Each record pairs a device pointer with the size that was requested for it
    typedef std::pair<void*, size_t> mrecord;

    // Thin wrapper around cudaMalloc/cudaFree that keeps a list of live allocations
    struct mymemory
    {
        std::vector<mrecord> mstack;
    
        mymemory() {};
    
        cudaError_t cudaMalloc(void** p, size_t sz);
        cudaError_t cudaFree(void* p);
        void print_stack();
    
    };
    
    // Forward to the real cudaMalloc and record the pointer and size on success
    cudaError_t mymemory::cudaMalloc(void** p, size_t sz)
    {
        cudaError_t ret = ::cudaMalloc(p, sz);
    
        if (ret == cudaSuccess) {
           mstack.push_back(mrecord(*p,sz));
        }
        return ret;
    };
    
    
    // Forward to the real cudaFree and drop the matching record on success
    cudaError_t mymemory::cudaFree(void* p)
    {
        cudaError_t ret = ::cudaFree(p);
    
        if (ret == cudaSuccess) {
            auto rit = std::find_if( mstack.begin(), mstack.end(),
                                     [&](const mrecord& r){ return r.first == p; } );
            if (rit != mstack.end()) {
                mstack.erase(rit);
            }
        }
        return ret;
    };
    
    // Print every allocation that is still outstanding (pointer : size in bytes)
    void mymemory::print_stack()
    {
        auto it = mstack.begin();
        for(; it != mstack.end(); ++it) {
            mrecord rec = *it;
            std::cout << rec.first << " : " << rec.second << std::endl;
        }
    }
    
    
    int main(void) 
    {
        const int nallocs = 10;
        void* pointers[nallocs];
    
        mymemory mdebug;
        for(int i=0; i<nallocs; ++i) {
            mdebug.cudaMalloc(&pointers[i], 4<<i);
        }
        std::cout << "After Allocation" << std::endl;
        mdebug.print_stack();
    
        mdebug.cudaFree(pointers[1]);
        mdebug.cudaFree(pointers[7]);
        mdebug.cudaFree(pointers[8]);
        mdebug.cudaFree(0);
    
        std::cout << "After Deallocation" << std::endl;
        mdebug.print_stack();
    
        return 0;
    }
    

    [Warning: only very lightly tested; requires C++11 compiler support]

    which would do this:

    ~/SO$ nvcc -std=c++11 -g -arch=sm_52 instrumentation.cu 
    ~/SO$ ./a.out 
    After Allocation
    0x705e40000 : 4
    0x705e40200 : 8
    0x705e40400 : 16
    0x705e40600 : 32
    0x705e40800 : 64
    0x705e40a00 : 128
    0x705e40c00 : 256
    0x705e40e00 : 512
    0x705e41000 : 1024
    0x705f40000 : 2048
    After Deallocation
    0x705e40000 : 4
    0x705e40400 : 16
    0x705e40600 : 32
    0x705e40800 : 64
    0x705e40a00 : 128
    0x705e40c00 : 256
    0x705f40000 : 2048
    

    This might be enough to understand which memory allocations are leaking. But be aware that memory management on the GPU isn't as predictable as you might believe, and you need to be careful when diagnosing a memory leak based solely on the amount of free memory the device reports at any given instant. See this question for some more details.
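
    As a rough sketch of how the wrapper might be applied to the original problem (do_work is a placeholder for the function suspected of leaking, and it would replace the demonstration main() above), route the function's allocations through the tracker and print the stack after each call; records that keep accumulating are the leaked allocations:

    // Placeholder for the suspect function: it routes its allocations through
    // the tracker instead of calling the raw CUDA APIs directly.
    void do_work(mymemory& tracker)
    {
        void* scratch = nullptr;
        tracker.cudaMalloc(&scratch, 1024);
        // ... kernels operating on scratch would go here ...
        // A missing tracker.cudaFree(scratch) is exactly the kind of leak
        // that shows up as an accumulating record below.
    }

    int main(void)
    {
        mymemory tracker;
        for (int i = 0; i < 5; ++i) {
            do_work(tracker);
            size_t total = 0;
            for (const mrecord& rec : tracker.mstack) total += rec.second;
            std::cout << "After call " << i << " (" << total
                      << " bytes outstanding)" << std::endl;
            tracker.print_stack();
        }
        return 0;
    }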