Search code examples
c++c++-cli

Intel OneAPI Video decoding memory leak when using C++ CLI


I am trying to use Intel OneAPI/OneVPL to decode a stream I receive from an RTSP Camera in C#. But when I run the code I get an enormous memory leak. Around 1-200MB per run, which is around once every second. When I've collected a GoP from the camera where I know the first data is a keyframe I pass it as a byte array to my CLI and C++ code. Here I expect it to decode all the frames and return decoded images. It receives 30 frames and returns 16 decoded images but has a memory leak.

I've tried the Visual Studio memory profiler, and all I can tell from it is that it's unmanaged memory that's my problem. I've also tried overriding the "new" and "delete" operators inside videoHandler.cpp to track and compare all allocations and deallocations, and as far as I can tell everything is handled correctly in there. I cannot see any classes that get instantiated and are not cleaned up. I think my issue is in the CLI class videoHandlerWrapper.cpp. Am I missing something obvious?

videoHandlerWrapper.cpp

// Decodes one encoded byte stream (the caller passes a GoP that starts on a keyframe) and
// returns the decoded frames as managed objects.
//
// byteArray : managed buffer holding the encoded stream.
// returns   : an array with 30 slots. Slot i holds the i-th decoded frame; slots for which
//             the native decoder produced no data are left as nullptr.
//
// The native decoder's status code is deliberately not surfaced: whatever frames were
// produced before a failure are still handed back (best effort, as before).
array<imgFrameWrapper^>^ videoHandlerWrapper::decode(array<System::Byte>^ byteArray)
{
    const int maxFrames = 30;
    array<imgFrameWrapper^>^ returnFrames = gcnew array<imgFrameWrapper^>(maxFrames);

    // Nothing to decode. This also avoids taking the address of element 0 of an empty
    // std::vector below, which is undefined behaviour.
    if (byteArray->Length == 0)
        return returnFrames;

    {
        // Output of the decoding process. Each imgFrame frees its own pixel buffer in its
        // destructor, so all native frame memory is gone when this scope ends.
        std::vector<imgFrame> frames(maxFrames);

        // Input of the decoding process: copy from managed (C#) to unmanaged (C++) memory.
        std::vector<unsigned char> bytes(byteArray->Length);
        Marshal::Copy(byteArray, 0, IntPtr(bytes.data()), byteArray->Length);

        _pVideoHandler->decode(bytes, frames);

        // Never index past the managed array, even if the native vector ends up with a
        // different length than the managed result.
        const int frameCount = frames.size() < static_cast<size_t>(returnFrames->Length)
            ? static_cast<int>(frames.size())
            : returnFrames->Length;

        for (int i = 0; i < frameCount; i++)
        {
            const imgFrame& frame = frames[i];
            if (frame.size > 0 && frame.bytes != nullptr)
                returnFrames[i] = gcnew imgFrameWrapper(static_cast<size_t>(frame.size), frame.bytes);
        }
    }
    //PrintMemoryUsage();
    return returnFrames;
}

videoHandler.cpp

#define BITSTREAM_BUFFER_SIZE 2000000 //TODO Maybe higher or lower bitstream buffer. Thorough testing has been done at 2000000
int videoHandler::decode(std::vector<unsigned char> bytes, std::vector<imgFrame> &frameData)
{
    int result = -1;
    bool isStillGoing = true;
    mfxBitstream bitstream = { 0 };
    mfxSession session = NULL;
    mfxStatus sts = MFX_ERR_NONE;
    mfxSurfaceArray* outSurfaces = nullptr;
    mfxU32 framenum = 0;
    mfxU32 numVPPCh = 0;
    mfxVideoChannelParam* mfxVPPChParams = nullptr;
    void* accelHandle = NULL;
    mfxVideoParam mfxDecParams = {};
    mfxVersion version = { 0, 1 };

    //variables used only in 2.x version
    mfxConfig cfg = NULL;
    mfxLoader loader = NULL;
    mfxVariant inCodec = {};
    std::vector<mfxU8> input_buffer;

    // Initialize VPL session for any implementation of HEVC/H265 decode
    loader = MFXLoad();
    VERIFY(NULL != loader, "MFXLoad failed -- is implementation in path?");

    cfg = MFXCreateConfig(loader);
    VERIFY(NULL != cfg, "MFXCreateConfig failed")

        inCodec.Type = MFX_VARIANT_TYPE_U32;
    inCodec.Data.U32 = MFX_CODEC_AVC;
    sts = MFXSetConfigFilterProperty(
        cfg,
        (mfxU8*)"mfxImplDescription.mfxDecoderDescription.decoder.CodecID",
        inCodec);
    
    VERIFY(MFX_ERR_NONE == sts, "MFXSetConfigFilterProperty failed for decoder CodecID");

    sts = MFXCreateSession(loader, 0, &session);
    VERIFY(MFX_ERR_NONE == sts, "Not able to create VPL session");
    
    // Print info about implementation loaded
    version = ShowImplInfo(session);
    //VERIFY(version.Major > 1, "Sample requires 2.x API implementation, exiting");
    if (version.Major == 1) {
        mfxVariant ImplValueSW;
        ImplValueSW.Type = MFX_VARIANT_TYPE_U32;
        ImplValueSW.Data.U32 = MFX_IMPL_TYPE_SOFTWARE;
        MFXSetConfigFilterProperty(cfg, (mfxU8*)"mfxImplDescription.Impl", ImplValueSW);
        sts = MFXCreateSession(loader, 0, &session);
        VERIFY(MFX_ERR_NONE == sts, "Not able to create VPL session");
    }
    // Convenience function to initialize available accelerator(s)
    accelHandle = InitAcceleratorHandle(session);

    bitstream.MaxLength = BITSTREAM_BUFFER_SIZE;

    bitstream.Data = (mfxU8*)calloc(bytes.size(), sizeof(mfxU8));
    VERIFY(bitstream.Data, "Not able to allocate input buffer");

    bitstream.CodecId = MFX_CODEC_AVC;

    std::copy(bytes.begin(), bytes.end(), bitstream.Data);

    bitstream.DataLength = static_cast<mfxU32>(bytes.size());

    memset(&mfxDecParams, 0, sizeof(mfxDecParams));

    mfxDecParams.mfx.CodecId = MFX_CODEC_AVC;
    mfxDecParams.IOPattern = MFX_IOPATTERN_OUT_SYSTEM_MEMORY;
    sts = MFXVideoDECODE_DecodeHeader(session, &bitstream, &mfxDecParams);
    VERIFY(MFX_ERR_NONE == sts, "Error decoding header\n");

    numVPPCh = 1;
    mfxVPPChParams = new mfxVideoChannelParam[numVPPCh];
    for (mfxU32 i = 0; i < numVPPCh; i++) {
        mfxVPPChParams[i] = {};
    }

    //mfxVPPChParams[0].VPP.FourCC = mfxDecParams.mfx.FrameInfo.FourCC;
    mfxVPPChParams[0].VPP.FourCC = MFX_FOURCC_BGRA;
    mfxVPPChParams[0].VPP.ChromaFormat = MFX_CHROMAFORMAT_YUV420;
    mfxVPPChParams[0].VPP.PicStruct = MFX_PICSTRUCT_PROGRESSIVE;
    mfxVPPChParams[0].VPP.FrameRateExtN = 30;
    mfxVPPChParams[0].VPP.FrameRateExtD = 1;
    mfxVPPChParams[0].VPP.CropW = 1920;
    mfxVPPChParams[0].VPP.CropH = 1080;
    //Set value directly if input and output is the same.
    mfxVPPChParams[0].VPP.Width = 1920;
    mfxVPPChParams[0].VPP.Height = 1080;
    //// USED TO RESIZE. IF INPUT IS THE SAME AS OUTPUT THIS WILL MAKE IT SHIFT A BIT. 1920x1080 becomes 1920x1088.
    //mfxVPPChParams[0].VPP.Width = ALIGN16(mfxVPPChParams[0].VPP.CropW);
    //mfxVPPChParams[0].VPP.Height = ALIGN16(mfxVPPChParams[0].VPP.CropH);  
    mfxVPPChParams[0].VPP.ChannelId = 1;
    mfxVPPChParams[0].Protected = 0;
    mfxVPPChParams[0].IOPattern = MFX_IOPATTERN_IN_SYSTEM_MEMORY | MFX_IOPATTERN_OUT_SYSTEM_MEMORY;
    mfxVPPChParams[0].ExtParam = NULL;
    mfxVPPChParams[0].NumExtParam = 0;

    sts = MFXVideoDECODE_VPP_Init(session, &mfxDecParams, &mfxVPPChParams, numVPPCh); //This causes a MINOR memory leak! 
    
    outSurfaces = new mfxSurfaceArray;

    while (isStillGoing == true) {
        sts = MFXVideoDECODE_VPP_DecodeFrameAsync(session,
            &bitstream,
            NULL,
            0,
            &outSurfaces); //Big memory leak. 100MB pr run in the while loop.

        switch (sts) {
        case MFX_ERR_NONE:
            // decode output
            if (framenum >= 30)
            {
                isStillGoing = false;
                break;
            }

            sts = WriteRawFrameToByte(outSurfaces->Surfaces[1], &frameData[framenum]);
            VERIFY(MFX_ERR_NONE == sts, "Could not write 1st vpp output");

            framenum++;
            break;
        case MFX_ERR_MORE_DATA:
            // The function requires more bitstream at input before decoding can proceed           
            isStillGoing = false;
            break;
        case MFX_ERR_MORE_SURFACE:
            // The function requires more frame surface at output before decoding can proceed.
            // This applies to external memory allocations and should not be expected for
            // a simple internal allocation case like this
            break;
        case MFX_ERR_DEVICE_LOST:
            // For non-CPU implementations,
            // Cleanup if device is lost
            break;
        case MFX_WRN_DEVICE_BUSY:
            // For non-CPU implementations,
            // Wait a few milliseconds then try again
            break;
        case MFX_WRN_VIDEO_PARAM_CHANGED:
            // The decoder detected a new sequence header in the bitstream.
            // Video parameters may have changed.
            // In external memory allocation case, might need to reallocate the output surface
            break;
        case MFX_ERR_INCOMPATIBLE_VIDEO_PARAM:
            // The function detected that video parameters provided by the application
            // are incompatible with initialization parameters.
            // The application should close the component and then reinitialize it
            break;
        case MFX_ERR_REALLOC_SURFACE:
            // Bigger surface_work required. May be returned only if
            // mfxInfoMFX::EnableReallocRequest was set to ON during initialization.
            // This applies to external memory allocations and should not be expected for
            // a simple internal allocation case like this
            break;
        default:
            printf("unknown status %d\n", sts);
            isStillGoing = false;
            break;
        }
    }

    sts = MFXVideoDECODE_VPP_Close(session);  // Helps massively! Halves the memory leak speed. Closes internal structures and tables.
    VERIFY(MFX_ERR_NONE == sts, "Error closing VPP session\n");
    
    result = 0;
end:
    printf("Decode and VPP processed %d frames\n", framenum);

    // Clean up resources - It is recommended to close components first, before
    // releasing allocated surfaces, since some surfaces may still be locked by
    // internal resources.        

    if (mfxVPPChParams)
        delete[] mfxVPPChParams;

    if (outSurfaces)
        delete outSurfaces;

    if (bitstream.Data)
        free(bitstream.Data);

    if (accelHandle)
        FreeAcceleratorHandle(accelHandle);

    if (loader)
        MFXUnload(loader);

    return result;
} 

imgFrameWrapper.h

// Managed (C++/CLI) holder for one decoded frame: a byte count plus a managed copy of
// the frame's bytes.
public ref class imgFrameWrapper
{
public:
    // Builds the wrapper from a native buffer of u_size bytes starting at u_bytes.
    imgFrameWrapper(size_t u_size, unsigned char* u_bytes);

    // Destructor and finalizer.
    ~imgFrameWrapper();
    !imgFrameWrapper();

    // Number of bytes in the frame.
    size_t get_size();

    // The frame's bytes as a managed array.
    array<System::Byte>^ get_bytes();

private:
    size_t size;
    array<System::Byte>^ bytes;
};

imgFrameWrapper.cpp

// Allocates a managed array of u_size bytes and fills it with a copy of the native
// buffer u_bytes. The native buffer is only read.
imgFrameWrapper::imgFrameWrapper(size_t u_size, unsigned char* u_bytes)
{
    this->size = u_size;
    this->bytes = gcnew array<System::Byte>(this->size);
    Marshal::Copy(IntPtr(u_bytes), this->bytes, 0, this->size);
}

// Destructor: empty. The only buffer this class holds is a managed array.
imgFrameWrapper::~imgFrameWrapper()
{
}

// Finalizer: empty, for the same reason as the destructor.
imgFrameWrapper::!imgFrameWrapper()
{
}

// Returns the number of bytes stored for this frame.
size_t imgFrameWrapper::get_size()
{
    return this->size;
}

// Returns the managed array holding the frame's bytes (the array itself, not a copy).
array<System::Byte>^ imgFrameWrapper::get_bytes()
{
    return this->bytes;
}

imgFrame.h

// One decoded frame held in native memory.
//
// The object owns `bytes`: the buffer must have been allocated with new[] and is freed
// with delete[] when the frame is destroyed. Because of that ownership the type is
// move-only; copying would leave two frames freeing the same buffer.
struct imgFrame
{
    int size = 0;                   // number of bytes in `bytes`; 0 means "no frame"
    unsigned char* bytes = nullptr; // owned new[] buffer, or nullptr when empty

    // An empty frame. The members are always initialized, so destroying a frame that was
    // never filled is safe.
    imgFrame() noexcept = default;

    // Takes ownership of frameBytes (a new[] buffer of frameSize bytes). Keeps
    // `imgFrame{n, p}` style initialization working now that the type has constructors.
    imgFrame(int frameSize, unsigned char* frameBytes) noexcept
        : size(frameSize), bytes(frameBytes)
    {
    }

    imgFrame(const imgFrame&) = delete;
    imgFrame& operator=(const imgFrame&) = delete;

    // Moving transfers the buffer and leaves the source empty. noexcept so that
    // std::vector moves frames (instead of requiring a copy) when it reallocates.
    imgFrame(imgFrame&& other) noexcept
        : size(other.size), bytes(other.bytes)
    {
        other.size = 0;
        other.bytes = nullptr;
    }

    imgFrame& operator=(imgFrame&& other) noexcept
    {
        if (this != &other)
        {
            delete[] bytes;
            size = other.size;
            bytes = other.bytes;
            other.size = 0;
            other.bytes = nullptr;
        }
        return *this;
    }

    ~imgFrame()
    {
        delete[] bytes; // delete[] on nullptr is a no-op
    }
};

Solution

  • The MFXVideoDECODE_VPP_DecodeFrameAsync() function allocates internal memory surfaces for the decoded output, and it is your responsibility to release them once you have finished with them. The following link describes this.

    https://spec.oneapi.com/onevpl/latest/API_ref/VPL_structs_decode_vpp.html#_CPPv415mfxSurfaceArray

       mfxStatus (*Release)(struct mfxSurfaceArray *surface_array)¶
       Decrements the internal reference counter of the surface. (*Release) should be 
       called after using the (*AddRef) function to add a surface or when allocation 
       logic requires it.
    

    And please check this sample. https://github.com/oneapi-src/oneVPL/blob/master/examples/hello-decvpp/src/hello-decvpp.cpp

    Especially, WriteRawFrame_InternalMem() function in https://github.com/oneapi-src/oneVPL/blob/17968d8d2299352f5a9e09388d24e81064c81c87/examples/util/util/util.h

    It shows how to release surfaces.