Tags: c++, ffmpeg

Adding unregistered SEI data to every frame (ffmpeg / C++ / Windows)


I am working with FFmpeg 5.2, using it from C++ in Visual Studio. What I need to do is add a SEI unregistered user data message (payload type 5) to every frame of a stream. To do that, I demux an MP4 container, take the video stream, decode every packet to get a frame, add the SEI message to each frame, then encode and remux a new (video-only) stream and save it to a separate container.

To add the SEI data I use this specific code:

            const char* sideDataMsg = "139FB1A9446A4DEC8CBF65B1E12D2CFDHola";
            size_t sideDataSize = strlen(sideDataMsg);                   // length of the payload, not sizeof(pointer)
            AVBufferRef* sideDataBuffer = av_buffer_alloc(sideDataSize);
            memcpy(sideDataBuffer->data, sideDataMsg, sideDataSize);     // copy into the buffer instead of repointing it

            AVFrameSideData* sideData = av_frame_new_side_data_from_buf(frame, AV_FRAME_DATA_SEI_UNREGISTERED, sideDataBuffer);
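
For clarity, this is my understanding of the layout FFmpeg expects for AV_FRAME_DATA_SEI_UNREGISTERED (the comments in libavutil/frame.h say the first 16 bytes are the UUID and the remaining bytes are the user payload), so a minimal sketch of the attachment step, under that assumption, would be:

            // Sketch (assumption): raw 16-byte UUID first, then the user payload.
            static const uint8_t uuid[16] = {
                0x13, 0x9F, 0xB1, 0xA9, 0x44, 0x6A, 0x4D, 0xEC,
                0x8C, 0xBF, 0x65, 0xB1, 0xE1, 0x2D, 0x2C, 0xFD
            };
            const char* payload = "Hola";
            size_t payloadSize = strlen(payload);

            AVBufferRef* buf = av_buffer_alloc(16 + payloadSize);
            if (!buf)
                return AVERROR(ENOMEM);
            memcpy(buf->data, uuid, 16);
            memcpy(buf->data + 16, payload, payloadSize);

            // The frame takes ownership of buf only on success.
            AVFrameSideData* sd = av_frame_new_side_data_from_buf(frame, AV_FRAME_DATA_SEI_UNREGISTERED, buf);
            if (!sd)
                av_buffer_unref(&buf);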

Regarding the format of sideDataMsg, I have tried several approaches, including setting it to "139FB1A9-446A-4DEC-8CBF65B1E12D2CFD+Hola!", which is the format indicated in the H.264 spec. However, even though in memory I can see the SEI data attached to every frame, as shown below:

(screenshot: the debugger shows the SEI side data attached to each decoded AVFrame)

the resulting stream/container does not show the expected data. Below is my entire code; it is mostly taken/adapted from the doc/examples folder of the FFmpeg sources.

BTW: I also tried setting AVCodecContext->export_side_data to different bit values (0x00 to 0xFF), understanding that this could tell the encoder to export the SEI data with every encoded frame, but no luck.
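
For reference, the defined flags for that field (as I read them in libavcodec/avcodec.h) are the AV_CODEC_EXPORT_DATA_* values, so turning them all on explicitly would look like this (my understanding is that this field controls which side data the codec exports, e.g. motion vectors, rather than what the encoder writes as SEI):

    c->export_side_data = AV_CODEC_EXPORT_DATA_MVS |
                          AV_CODEC_EXPORT_DATA_PRFT |
                          AV_CODEC_EXPORT_DATA_VIDEO_ENC_PARAMS |
                          AV_CODEC_EXPORT_DATA_FILM_GRAIN;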

I appreciate any help in advance!

// FfmpegTests.cpp : This file contains the 'main' function. Program execution begins and ends there.
//
#pragma warning(disable : 4996)
extern "C"
{
#include "libavformat/avformat.h"
#include "libavcodec/avcodec.h"
#include "libavfilter/avfilter.h"
#include "libavutil/opt.h"
#include "libavutil/avutil.h"
#include "libavutil/error.h"
#include "libavfilter/buffersrc.h"
#include "libavfilter/buffersink.h"
#include "libswscale/swscale.h"
}

#pragma comment(lib, "avcodec.lib")
#pragma comment(lib, "avformat.lib")
#pragma comment(lib, "avfilter.lib")
#pragma comment(lib, "avutil.lib")
#pragma comment(lib, "swscale.lib")

#include <cstdio>
#include <cstring>
#include <iostream>
#include <chrono>
#include <thread>


static AVFormatContext* fmt_ctx;
static AVCodecContext* dec_ctx;
AVFilterGraph* filter_graph;
AVFilterContext* buffersrc_ctx;
AVFilterContext* buffersink_ctx;
static int video_stream_index = -1;

const char* filter_descr = "scale=78:24,transpose=cclock";
static int64_t last_pts = AV_NOPTS_VALUE;


// FOR SEI NAL INSERTION
const AVOutputFormat* ofmt = NULL;
AVFormatContext* ofmt_ctx = NULL;
int stream_index = 0;
int* stream_mapping = NULL;
int stream_mapping_size = 0;
int FRAMES_COUNT = 0;
const AVCodec* codec_enc;
AVCodecContext* c = NULL;

static int open_input_file(const char* filename)
{
    const AVCodec* dec;
    int ret;

    if ((ret = avformat_open_input(&fmt_ctx, filename, NULL, NULL)) < 0) {
        av_log(NULL, AV_LOG_ERROR, "Cannot open input file\n");
        return ret;
    }

    if ((ret = avformat_find_stream_info(fmt_ctx, NULL)) < 0) {
        av_log(NULL, AV_LOG_ERROR, "Cannot find stream information\n");
        return ret;
    }

    /* select the video stream */
    ret = av_find_best_stream(fmt_ctx, AVMEDIA_TYPE_VIDEO, -1, -1, &dec, 0);
    if (ret < 0) {
        av_log(NULL, AV_LOG_ERROR, "Cannot find a video stream in the input file\n");
        return ret;
    }
    video_stream_index = ret;

    /* create decoding context */
    dec_ctx = avcodec_alloc_context3(dec);
    if (!dec_ctx)
        return AVERROR(ENOMEM);
    avcodec_parameters_to_context(dec_ctx, fmt_ctx->streams[video_stream_index]->codecpar);

    FRAMES_COUNT = fmt_ctx->streams[video_stream_index]->nb_frames;

    /* init the video decoder */
    if ((ret = avcodec_open2(dec_ctx, dec, NULL)) < 0) {
        av_log(NULL, AV_LOG_ERROR, "Cannot open video decoder\n");
        return ret;
    }

    return 0;
}

static int init_filters(const char* filters_descr)
{
    char args[512];
    int ret = 0;
    const AVFilter* buffersrc = avfilter_get_by_name("buffer");
    const AVFilter* buffersink = avfilter_get_by_name("buffersink");
    AVFilterInOut* outputs = avfilter_inout_alloc();
    AVFilterInOut* inputs = avfilter_inout_alloc();
    AVRational time_base = fmt_ctx->streams[video_stream_index]->time_base;
    enum AVPixelFormat pix_fmts[] = { AV_PIX_FMT_GRAY8, AV_PIX_FMT_NONE };

    filter_graph = avfilter_graph_alloc();
    if (!outputs || !inputs || !filter_graph) {
        ret = AVERROR(ENOMEM);
        goto end;
    }

    /* buffer video source: the decoded frames from the decoder will be inserted here. */
    snprintf(args, sizeof(args),
        "video_size=%dx%d:pix_fmt=%d:time_base=%d/%d:pixel_aspect=%d/%d",
        dec_ctx->width, dec_ctx->height, dec_ctx->pix_fmt,
        time_base.num, time_base.den,
        dec_ctx->sample_aspect_ratio.num, dec_ctx->sample_aspect_ratio.den);

    ret = avfilter_graph_create_filter(&buffersrc_ctx, buffersrc, "in",
        args, NULL, filter_graph);
    if (ret < 0) {
        av_log(NULL, AV_LOG_ERROR, "Cannot create buffer source\n");
        goto end;
    }

    /* buffer video sink: to terminate the filter chain. */
    ret = avfilter_graph_create_filter(&buffersink_ctx, buffersink, "out",
        NULL, NULL, filter_graph);
    if (ret < 0) {
        av_log(NULL, AV_LOG_ERROR, "Cannot create buffer sink\n");
        goto end;
    }

    ret = av_opt_set_int_list(buffersink_ctx, "pix_fmts", pix_fmts, AV_PIX_FMT_NONE, AV_OPT_SEARCH_CHILDREN);
    if (ret < 0) {
        av_log(NULL, AV_LOG_ERROR, "Cannot set output pixel format\n");
        goto end;
    }

    outputs->name = av_strdup("in");
    outputs->filter_ctx = buffersrc_ctx;
    outputs->pad_idx = 0;
    outputs->next = NULL;

    inputs->name = av_strdup("out");
    inputs->filter_ctx = buffersink_ctx;
    inputs->pad_idx = 0;
    inputs->next = NULL;

    if ((ret = avfilter_graph_parse_ptr(filter_graph, filters_descr,
        &inputs, &outputs, NULL)) < 0)
        goto end;

    if ((ret = avfilter_graph_config(filter_graph, NULL)) < 0)
        goto end;

end:
    avfilter_inout_free(&inputs);
    avfilter_inout_free(&outputs);

    return ret;
}

static void display_frame(const AVFrame* frame, AVRational time_base)
{
    int x, y;
    uint8_t* p0, * p;
    int64_t delay;

    if (frame->pts != AV_NOPTS_VALUE) {
        if (last_pts != AV_NOPTS_VALUE) {
            /* sleep roughly the right amount of time;
             * usleep is in microseconds, just like AV_TIME_BASE. */
            AVRational timeBaseQ;
            timeBaseQ.num = 1;
            timeBaseQ.den = AV_TIME_BASE;

            delay = av_rescale_q(frame->pts - last_pts, time_base, timeBaseQ);
            if (delay > 0 && delay < 1000000)
                std::this_thread::sleep_for(std::chrono::microseconds(delay));
        }
        last_pts = frame->pts;
    }

    /* Trivial ASCII grayscale display. */
    p0 = frame->data[0];
    puts("\033c");
    for (y = 0; y < frame->height; y++) {
        p = p0;
        for (x = 0; x < frame->width; x++)
            putchar(" .-+#"[*(p++) / 52]);
        putchar('\n');
        p0 += frame->linesize[0];
    }
    fflush(stdout);
}

int save_frame_as_jpeg(AVCodecContext* pCodecCtx, AVFrame* pFrame, int FrameNo) {
    int ret = 0;

    const AVCodec* jpegCodec = avcodec_find_encoder(AV_CODEC_ID_MJPEG);
    if (!jpegCodec) {
        return -1;
    }
    AVCodecContext* jpegContext = avcodec_alloc_context3(jpegCodec);
    if (!jpegContext) {
        return -1;
    }

    jpegContext->pix_fmt = pCodecCtx->pix_fmt;
    jpegContext->height = pFrame->height;
    jpegContext->width = pFrame->width;
    jpegContext->time_base = AVRational{ 1,10 };
    jpegContext->strict_std_compliance = FF_COMPLIANCE_UNOFFICIAL;

    ret = avcodec_open2(jpegContext, jpegCodec, NULL);
    if (ret < 0) {
        return ret;
    }
    FILE* JPEGFile;
    char JPEGFName[256];

    AVPacket packet;
    packet.data = NULL;
    packet.size = 0;
    av_init_packet(&packet);

    int gotFrame;

    ret = avcodec_send_frame(jpegContext, pFrame);
    if (ret < 0) {
        return ret;
    }

    ret = avcodec_receive_packet(jpegContext, &packet);
    if (ret < 0) {
        return ret;
    }

    sprintf(JPEGFName, "c:\\folder\\dvr-%06d.jpg", FrameNo);
    JPEGFile = fopen(JPEGFName, "wb");
    fwrite(packet.data, 1, packet.size, JPEGFile);
    fclose(JPEGFile);

    av_packet_unref(&packet);
    avcodec_free_context(&jpegContext);
    return 0;
}

int initialize_output_stream(AVFormatContext* input_fctx, const char* out_filename) {
    int ret = 0;

    avformat_alloc_output_context2(&ofmt_ctx, NULL, NULL, out_filename);
    if (!ofmt_ctx) {
        fprintf(stderr, "Could not create output context\n");
        return -1;
    }

    stream_mapping_size = input_fctx->nb_streams;
    stream_mapping = (int*)av_calloc(stream_mapping_size, sizeof(*stream_mapping));
    if (!stream_mapping) {
        ret = AVERROR(ENOMEM);
        return -1;
    }

    for (int i = 0; i < input_fctx->nb_streams; i++) {
        AVStream* out_stream;
        AVStream* in_stream = input_fctx->streams[i];
        AVCodecParameters* in_codecpar = in_stream->codecpar;

        if (in_codecpar->codec_type != AVMEDIA_TYPE_AUDIO &&
            in_codecpar->codec_type != AVMEDIA_TYPE_VIDEO &&
            in_codecpar->codec_type != AVMEDIA_TYPE_SUBTITLE) {
            stream_mapping[i] = -1;
            continue;
        }

        stream_mapping[i] = stream_index++;

        out_stream = avformat_new_stream(ofmt_ctx, NULL);
        if (!out_stream) {
            fprintf(stderr, "Failed allocating output stream\n");
            ret = AVERROR_UNKNOWN;
            return ret;
        }

        ret = avcodec_parameters_copy(out_stream->codecpar, in_codecpar);
        if (ret < 0) {
            fprintf(stderr, "Failed to copy codec parameters\n");
            return -1;
        }
        out_stream->codecpar->codec_tag = 0;
    }

    ret = avio_open(&ofmt_ctx->pb, out_filename, AVIO_FLAG_WRITE);
    if (ret < 0) {
        fprintf(stderr, "Could not open output file '%s'", out_filename);
        return -1;
    }

    ret = avformat_write_header(ofmt_ctx, NULL);
    if (ret < 0) {
        fprintf(stderr, "Error occurred when opening output file\n");
        return -1;
    }

    // ENCODER
    codec_enc = avcodec_find_encoder_by_name("libx264");
    if (!codec_enc) {
        fprintf(stderr, "Codec '%s' not found\n", "libx264");
        return -1;
    }

    c = avcodec_alloc_context3(codec_enc);
    if (!c) {
        fprintf(stderr, "Could not allocate video codec context\n");
        exit(1);
    }

    c->bit_rate = dec_ctx->bit_rate;
    c->width = dec_ctx->width;
    c->height = dec_ctx->height;
    c->time_base = dec_ctx->time_base;
    c->framerate = dec_ctx->framerate;
    c->gop_size = dec_ctx->gop_size;
    c->max_b_frames = dec_ctx->max_b_frames;
    c->pix_fmt = dec_ctx->pix_fmt;
    c->time_base = AVRational{ 1,1 };
    c->export_side_data = 255;

    if (codec_enc->id == AV_CODEC_ID_H264)
        av_opt_set(c->priv_data, "preset", "slow", 0);

    ret = avcodec_open2(c, codec_enc, NULL);
    if (ret < 0) {
        fprintf(stderr, "Could not open codec\n");
        return ret;
    }
    return 0;
}

int add_frame_output_stream(AVFrame* frame) {
    int ret;
    AVPacket* pkt;
    pkt = av_packet_alloc();

    ret = avcodec_send_frame(c, frame);
    if (ret < 0) {
        fprintf(stderr, "Error sending a frame to the encoder\n");
        av_packet_free(&pkt);
        return ret;
    }

    while (ret >= 0) {
        ret = avcodec_receive_packet(c, pkt);
        if (ret == AVERROR(EAGAIN) || ret == AVERROR_EOF) {
            av_packet_free(&pkt);
            return 0;
        }
        else if (ret < 0) {
            fprintf(stderr, "Error during encoding\n");
            av_packet_free(&pkt);
            return -1;
        }

        pkt->stream_index = stream_mapping[pkt->stream_index];
        ret = av_interleaved_write_frame(ofmt_ctx, pkt);

        av_packet_unref(pkt);
    }

    av_packet_free(&pkt);
    return 0;
}

int main(int argc, char** argv)
{
    AVFrame* frame;
    AVFrame* filt_frame;
    AVPacket* packet;
    int ret, count = 0;

    // FOR SEI NAL INSERTION
    const char* out_filename;

    if (argc < 3) {
        fprintf(stderr, "Usage: %s input_file output_file\n", argv[0]);
        exit(1);
    }

    frame = av_frame_alloc();
    filt_frame = av_frame_alloc();
    packet = av_packet_alloc();

    if (!frame || !filt_frame || !packet) {
        fprintf(stderr, "Could not allocate frame or packet\n");
        exit(1);
    }

    if ((ret = open_input_file(argv[1])) < 0)
        goto end;
    if ((ret = init_filters(filter_descr)) < 0)
        goto end;

    out_filename = argv[2];
    initialize_output_stream(fmt_ctx, out_filename);

    while (count < FRAMES_COUNT)
    {
        if ((ret = av_read_frame(fmt_ctx, packet)) < 0)
            break;

        if (packet->stream_index == video_stream_index) {
            ret = avcodec_send_packet(dec_ctx, packet);
            if (ret < 0) {
                av_log(NULL, AV_LOG_ERROR, "Error while sending a packet to the decoder\n");
                break;
            }

            while (ret >= 0)
            {
                ret = avcodec_receive_frame(dec_ctx, frame);
                if (ret == AVERROR(EAGAIN) || ret == AVERROR_EOF) {
                    break;
                }
                else if (ret < 0) {
                    av_log(NULL, AV_LOG_ERROR, "Error while receiving a frame from the decoder\n");
                    goto end;
                }

                frame->pts = frame->best_effort_timestamp;

                /* push the decoded frame into the filtergraph */
                if (av_buffersrc_add_frame_flags(buffersrc_ctx, frame, AV_BUFFERSRC_FLAG_KEEP_REF) < 0) {
                    av_log(NULL, AV_LOG_ERROR, "Error while feeding the filtergraph\n");
                    break;
                }

                /* pull filtered frames from the filtergraph */
                while (1) {
                    ret = av_buffersink_get_frame(buffersink_ctx, filt_frame);
                    if (ret == AVERROR(EAGAIN) || ret == AVERROR_EOF)
                        break;
                    if (ret < 0)
                        goto end;
                    // display_frame(filt_frame, buffersink_ctx->inputs[0]->time_base);
                    av_frame_unref(filt_frame);

                    /* ret = save_frame_as_jpeg(dec_ctx, frame, dec_ctx->frame_number);
                    if (ret < 0)
                        goto end; */
                    // 2. Add SEI metadata to the frame

                    ret = av_frame_make_writable(frame);
                    if (ret < 0)
                        exit(1);

                    const char* sideDataMsg = "139FB1A9446A4DEC8CBF65B1E12D2CFDHola";
                    size_t sideDataSize = strlen(sideDataMsg);                   // length of the payload, not sizeof(pointer)
                    AVBufferRef* sideDataBuffer = av_buffer_alloc(sideDataSize);
                    memcpy(sideDataBuffer->data, sideDataMsg, sideDataSize);     // copy into the buffer instead of repointing it

                    AVFrameSideData* sideData = av_frame_new_side_data_from_buf(frame, AV_FRAME_DATA_SEI_UNREGISTERED, sideDataBuffer);

                    ret = add_frame_output_stream(frame);
                    if (ret < 0)
                        goto end;
                }
                av_frame_unref(frame);
                count++;
            }
        }
        av_packet_unref(packet);
    }

    av_write_trailer(ofmt_ctx);

end:
    avfilter_graph_free(&filter_graph);
    avcodec_free_context(&dec_ctx);
    avformat_close_input(&fmt_ctx);
    av_frame_free(&frame);
    av_frame_free(&filt_frame);
    av_packet_free(&packet);

    if (ret < 0 && ret != AVERROR_EOF) {
        char errBuf[AV_ERROR_MAX_STRING_SIZE]{ 0 };
        int res = av_strerror(ret, errBuf, AV_ERROR_MAX_STRING_SIZE);
        fprintf(stderr, "Error:  %s\n", errBuf);
        exit(1);
    }

    exit(0);
}

Solution

  • Well, a friend pointed me to the solution; I just had to add:

    av_opt_set_int(c->priv_data, "udu_sei", 1, 0);
    

    in the function initialize_output_stream, after all the parameters are set on the AVCodecContext (c) used for encoding the output stream, and before avcodec_open2 is called.

    Hope this helps someone!
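
    For anyone wiring this in, a minimal sketch of where the call goes (same variable names as the code above; "udu_sei" is a private option of the libx264 wrapper, so it should be set before avcodec_open2):

    // Inside initialize_output_stream(), after the AVCodecContext fields are filled in:
    if (codec_enc->id == AV_CODEC_ID_H264) {
        av_opt_set(c->priv_data, "preset", "slow", 0);
        // Ask libx264 to emit the user data unregistered SEI carried in frame side data.
        av_opt_set_int(c->priv_data, "udu_sei", 1, 0);
    }

    ret = avcodec_open2(c, codec_enc, NULL);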