As part of a larger project I want to be able to capture and encode the desktop frame by frame in real time. I have the following test code to reproduce the issue shown in the screenshot:
#include <stdlib.h>
#include <stdio.h>
#include <iostream>
#include <fstream>
#include <string>
#include <string.h>
#include <math.h>
extern "C"
{
#include "libavdevice/avdevice.h"
#include "libavutil/channel_layout.h"
#include "libavutil/mathematics.h"
#include "libavutil/opt.h"
#include "libavformat/avformat.h"
#include "libswscale/swscale.h"
}
/* 5 seconds stream duration */
#define STREAM_DURATION 5.0
#define STREAM_FRAME_RATE 25 /* 25 images/s */
#define STREAM_NB_FRAMES ((int)(STREAM_DURATION * STREAM_FRAME_RATE))
#define STREAM_PIX_FMT AV_PIX_FMT_YUV420P /* default pix_fmt */
int videoStreamIndx;
int framerate = 30;
int width = 1920;
int height = 1080;
int encPacketCounter;
AVFormatContext* ifmtCtx;
AVCodecContext* avcodecContx;
AVFormatContext* ofmtCtx;
AVStream* videoStream;
AVCodecContext* avCntxOut;
AVPacket* avPkt;
AVFrame* avFrame;
AVFrame* outFrame;
SwsContext* swsCtx;
std::ofstream fs;
AVDictionary* ConfigureScreenCapture()
{
AVDictionary* options = NULL;
//Try adding "-rtbufsize 100M" as in https://stackoverflow.com/questions/6766333/capture-windows-screen-with-ffmpeg
av_dict_set(&options, "rtbufsize", "100M", 0);
av_dict_set(&options, "framerate", std::to_string(framerate).c_str(), 0);
char buffer[16];
snprintf(buffer, sizeof(buffer), "%dx%d", width, height);
av_dict_set(&options, "video_size", buffer, 0);
return options;
}
AVCodecParameters* ConfigureAvCodec()
{
AVCodecParameters* av_codec_par_out = avcodec_parameters_alloc();
av_codec_par_out->width = width;
av_codec_par_out->height = height;
av_codec_par_out->bit_rate = 40000;
av_codec_par_out->codec_id = AV_CODEC_ID_H264; //AV_CODEC_ID_MPEG4; //Try H.264 instead of MPEG4
av_codec_par_out->codec_type = AVMEDIA_TYPE_VIDEO;
av_codec_par_out->format = STREAM_PIX_FMT; //0 = AV_PIX_FMT_YUV420P
return av_codec_par_out;
}
int GetVideoStreamIndex()
{
int VideoStreamIndx = -1;
avformat_find_stream_info(ifmtCtx, NULL);
/* Find the first video stream index (av_find_best_stream() could be used instead). */
for (int i = 0; i < (int)ifmtCtx->nb_streams; i++) // find video stream position/index.
{
if (ifmtCtx->streams[i]->codecpar->codec_type == AVMEDIA_TYPE_VIDEO)
{
VideoStreamIndx = i;
break;
}
}
if (VideoStreamIndx == -1)
{
std::cout << "Unable to find a video stream in the input \n";
}
return VideoStreamIndx;
}
void CreateFrames(AVCodecParameters* av_codec_par_in, AVCodecParameters* av_codec_par_out)
{
avFrame = av_frame_alloc();
avFrame->width = avcodecContx->width;
avFrame->height = avcodecContx->height;
avFrame->format = av_codec_par_in->format;
av_frame_get_buffer(avFrame, 0);
outFrame = av_frame_alloc();
outFrame->width = avCntxOut->width;
outFrame->height = avCntxOut->height;
outFrame->format = av_codec_par_out->format;
av_frame_get_buffer(outFrame, 0);
}
bool Init()
{
AVCodecParameters* avCodecParOut = ConfigureAvCodec();
AVDictionary* options = ConfigureScreenCapture();
AVInputFormat* ifmt = av_find_input_format("gdigrab");
auto ifmtCtxLocal = avformat_alloc_context();
if (avformat_open_input(&ifmtCtxLocal, "desktop", ifmt, &options) < 0)
{
return false;
}
ifmtCtx = ifmtCtxLocal;
videoStreamIndx = GetVideoStreamIndex();
AVCodecParameters* avCodecParIn = ifmtCtx->streams[videoStreamIndx]->codecpar; //No need to allocate - just reference the input stream parameters.
AVCodec* avCodec = avcodec_find_decoder(avCodecParIn->codec_id);
if (avCodec == NULL)
{
return false;
}
avcodecContx = avcodec_alloc_context3(avCodec);
if (avcodec_parameters_to_context(avcodecContx, avCodecParIn) < 0)
{
return false;
}
//av_dict_set
int value = avcodec_open2(avcodecContx, avCodec, NULL); //Initialize the AVCodecContext to use the given AVCodec.
if (value < 0)
{
return false;
}
AVOutputFormat* ofmt = av_guess_format("h264", NULL, NULL);
if (ofmt == NULL)
{
return false;
}
AVFormatContext* ofmtCtxLocal = NULL;
avformat_alloc_output_context2(&ofmtCtxLocal, ofmt, NULL, NULL); //Allocates the output context itself.
if (ofmtCtxLocal == NULL)
{
return false;
}
ofmtCtx = ofmtCtxLocal;
AVCodec* avCodecOut = avcodec_find_encoder(avCodecParOut->codec_id);
if (avCodecOut == NULL)
{
return false;
}
videoStream = avformat_new_stream(ofmtCtx, avCodecOut);
if (videoStream == NULL)
{
return false;
}
avCntxOut = avcodec_alloc_context3(avCodecOut);
if (avCntxOut == NULL)
{
return false;
}
if (avcodec_parameters_copy(videoStream->codecpar, avCodecParOut) < 0)
{
return false;
}
if (avcodec_parameters_to_context(avCntxOut, avCodecParOut) < 0)
{
return false;
}
avCntxOut->gop_size = 30; //Insert an I-frame every 30 frames.
avCntxOut->max_b_frames = 0;
avCntxOut->time_base.num = 1;
avCntxOut->time_base.den = framerate;
//avio_open(&ofmtCtx->pb, "", AVIO_FLAG_READ_WRITE);
if (avCntxOut->codec_id == AV_CODEC_ID_H264)
{
//Private encoder options must be set before avcodec_open2(); for libx264, low latency is selected with tune=zerolatency.
av_opt_set(avCntxOut->priv_data, "preset", "ultrafast", 0);
av_opt_set(avCntxOut->priv_data, "tune", "zerolatency", 0);
}
if ((ofmtCtx->oformat->flags & AVFMT_GLOBALHEADER) != 0)
{
avCntxOut->flags |= AV_CODEC_FLAG_GLOBAL_HEADER;
}
value = avcodec_open2(avCntxOut, avCodecOut, NULL); //Initialize the AVCodecContext to use the given AVCodec.
if (value < 0)
{
return false;
}
if (avformat_write_header(ofmtCtx, NULL) < 0)
{
return false;
}
CreateFrames(avCodecParIn, avCodecParOut);
//sws_getContext allocates and initializes the context - no separate sws_alloc_context/sws_init_context is needed.
swsCtx = sws_getContext(avcodecContx->width, avcodecContx->height, avcodecContx->pix_fmt,
avCntxOut->width, avCntxOut->height, avCntxOut->pix_fmt, SWS_FAST_BILINEAR,
NULL, NULL, NULL);
if (swsCtx == NULL)
{
return false;
}
return true;
}
void Encode(AVCodecContext* enc_ctx, AVFrame* frame, AVPacket* pkt)
{
int ret;
/* send the frame to the encoder */
ret = avcodec_send_frame(enc_ctx, frame);
if (ret < 0)
{
return;
}
while (ret >= 0)
{
ret = avcodec_receive_packet(enc_ctx, pkt);
if (ret == AVERROR(EAGAIN) || ret == AVERROR_EOF)
return;
if (ret < 0)
{
return;
}
fs.write((char*)pkt->data, pkt->size);
av_packet_unref(pkt);
}
}
void EncodeFrames(int noFrames)
{
int frameCount = 0;
avPkt = av_packet_alloc();
AVPacket* outPacket = av_packet_alloc();
encPacketCounter = 0;
while (av_read_frame(ifmtCtx, avPkt) >= 0)
{
if (frameCount++ == noFrames)
break;
if (avPkt->stream_index != videoStreamIndx) continue;
avcodec_send_packet(avcodecContx, avPkt);
if (avcodec_receive_frame(avcodecContx, avFrame) >= 0) // Frame successfully decoded :)
{
outPacket->data = NULL; // packet data will be allocated by the encoder
outPacket->size = 0;
outPacket->pts = av_rescale_q(encPacketCounter, avCntxOut->time_base, videoStream->time_base);
outPacket->dts = av_rescale_q(encPacketCounter, avCntxOut->time_base, videoStream->time_base);
outPacket->duration = av_rescale_q(1, avCntxOut->time_base, videoStream->time_base);
outFrame->pts = av_rescale_q(encPacketCounter, avCntxOut->time_base, videoStream->time_base);
outFrame->pkt_duration = av_rescale_q(1, avCntxOut->time_base, videoStream->time_base); //Duration of a single frame.
encPacketCounter++;
/* make sure the frame data is writable before scaling into it */
auto ret = av_frame_make_writable(outFrame);
if (ret < 0)
break;
int sts = sws_scale(swsCtx,
avFrame->data, avFrame->linesize, 0, avFrame->height,
outFrame->data, outFrame->linesize);
Encode(avCntxOut, outFrame, outPacket);
}
av_frame_unref(avFrame);
av_packet_unref(avPkt);
}
}
void Dispose()
{
fs.close();
avformat_close_input(&ifmtCtx); //Also frees the input context.
avcodec_free_context(&avcodecContx);
avcodec_free_context(&avCntxOut);
sws_freeContext(swsCtx);
}
int main(int argc, char** argv)
{
avdevice_register_all();
fs.open("out.h264");
if (Init())
{
EncodeFrames(300);
}
else
{
std::cout << "Failed to Init \n";
}
Dispose();
return 0;
}
As far as I can tell, the setup of the encoding process is correct, since it is largely unchanged from the working example in the official documentation: https://libav.org/documentation/doxygen/master/encode__video_8c_source.html
However, there is limited documentation about desktop capture online, so I am not sure whether I have set that part up correctly.
We have to open out.h264 as a binary file: replace fs.open("out.h264"); with fs.open("out.h264", std::ios::binary);.
The default file mode in Windows is "text mode". That means that each \n byte is converted to \r\n when writing, and the encoded stream gets "messed up". It took me quite a long time to figure out the problem...
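As a minimal, self-contained illustration of the text-mode translation (the file names here are made up for the demo), writing a single \n byte in each mode shows the difference:
#include <fstream>
int main()
{
//Text mode (the default): on Windows, every '\n' byte is expanded to "\r\n" on write.
std::ofstream text("text_mode.dat");
text.put('\n');
text.close();
//Binary mode: bytes are written verbatim.
std::ofstream bin("binary_mode.dat", std::ios::binary);
bin.put('\n');
bin.close();
//On Windows, text_mode.dat ends up 2 bytes long, while binary_mode.dat is 1 byte.
return 0;
}
Any 0x0A byte inside the H.264 bitstream gets the same expansion, which is why the decoder sees a corrupted stream.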
There is another small issue: a loop that flushes the remaining encoded packets is missing at the end. The encoder buffers a number of frames internally, and the delayed packets are only emitted once the encoder is flushed. We can use FFprobe for counting the number of encoded frames:
ffprobe -v error -select_streams v:0 -count_frames -show_entries stream=nb_read_frames -print_format csv out.h264
The result is 263 instead of 300. The solution is adding the following loop at the end of the EncodeFrames(int noFrames) function:
int ret = 0;
avcodec_send_frame(avCntxOut, NULL); //Sending NULL enters "draining mode" and flushes the encoder.
do
{
av_packet_unref(outPacket);
ret = avcodec_receive_packet(avCntxOut, outPacket);
if (!ret)
{
fs.write((char*)outPacket->data, outPacket->size);
}
} while (!ret); //avcodec_receive_packet returns AVERROR_EOF once the encoder is fully drained.
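After adding the loop, re-running the FFprobe command above should report all 300 frames.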