Tags: c++, qt, ffmpeg

Audio Concatenation using the FFmpeg library


I'm working on implementing audio concatenation using the FFmpeg library. However, I've run into an issue: the output audio consists of the first 3 seconds of audio one followed by the last 2 seconds of audio two, rather than the expected total of 3 + 5 = 8 seconds.

void AudioConcat::concatenateAudio(const char* input1Path, const char* input2Path, const char* outputPath) {

    // Open input files
    AVPacket* avPacket = nullptr;
    AVFormatContext* avInputFormatContext1 = NULL;
    AVFormatContext* avInputFormatContext2 = NULL;
    AVFormatContext* avOutputFormatContext = nullptr;

    avPacket = av_packet_alloc();
    if (!avPacket) {
        std::cerr << "Failed to allocate AVPacket." << std::endl;
        qCritical("Failed to allocate AVPacket.");
        return;
    }

    try {

        if (avformat_open_input(&avInputFormatContext1, input1Path, 0, 0) < 0 ||
            avformat_open_input(&avInputFormatContext2, input2Path, 0, 0) < 0) {
            std::cerr << "Error opening input files." << std::endl;
            return;
        }

        if (avformat_find_stream_info(avInputFormatContext1, 0) < 0 ||
            avformat_find_stream_info(avInputFormatContext2, 0) < 0) {
            qCritical("%s", QString("Failed to retrieve the input stream information.").toStdString().c_str());
            return;
        }

        // Open output file
        if (avformat_alloc_output_context2(&avOutputFormatContext, nullptr, nullptr, outputPath) < 0) {
            std::cerr << "Error creating output format context." << std::endl;
            return;
        }

        // Find audio streams in input files
        AVStream* input1AudioStream = nullptr;
        AVStream* input2AudioStream = nullptr;

        for (unsigned int i = 0; i < avInputFormatContext1->nb_streams; ++i) {
            if (avInputFormatContext1->streams[i]->codecpar->codec_type == AVMEDIA_TYPE_AUDIO) {
                input1AudioStream = avInputFormatContext1->streams[i];
                break;
            }
        }

        for (unsigned int i = 0; i < avInputFormatContext2->nb_streams; ++i) {
            if (avInputFormatContext2->streams[i]->codecpar->codec_type == AVMEDIA_TYPE_AUDIO) {
                input2AudioStream = avInputFormatContext2->streams[i];
                break;
            }
        }

        if (!input1AudioStream || !input2AudioStream) {
            std::cerr << "Error finding audio streams in input files." << std::endl;
            return;
        }

        // Create new audio stream in the output file
        AVStream* outputAudioStream = avformat_new_stream(avOutputFormatContext, nullptr);
        if (!outputAudioStream) {
            std::cerr << "Error creating new audio stream in the output file." << std::endl;
            return;
        }

        // Copy codec parameters from input streams to output stream
        avcodec_parameters_copy(outputAudioStream->codecpar, input1AudioStream->codecpar);

        // Write the output file header
        if (!(avOutputFormatContext->oformat->flags & AVFMT_NOFILE)) {
            int operationResult = avio_open(&avOutputFormatContext->pb, outputPath, AVIO_FLAG_WRITE);
            if (operationResult < 0) {
                qCritical(
                    "%s", QString("Failed to open the output file '%1'.").arg(outputPath).toStdString().c_str());
                return;
            }
        }
        if (avformat_write_header(avOutputFormatContext, NULL) < 0) {
            std::cerr << "Error writing output file header." << std::endl;
            return;
        }

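        // Copy audio packets from the first input, rescaling timestamps into the output time base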
        while (av_read_frame(avInputFormatContext1, avPacket) == 0) {
            if (avPacket->stream_index == input1AudioStream->index) {
                avPacket->stream_index = outputAudioStream->index;
                avPacket->pts = av_rescale_q(avPacket->pts, input1AudioStream->time_base, outputAudioStream->time_base);
                avPacket->dts = av_rescale_q(avPacket->dts, input1AudioStream->time_base, outputAudioStream->time_base);
                avPacket->duration = av_rescale_q(avPacket->duration, input1AudioStream->time_base, outputAudioStream->time_base);
                av_interleaved_write_frame(avOutputFormatContext, avPacket);
            }

            av_packet_unref(avPacket);
        }
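        // Copy audio packets from the second input the same way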
        while (av_read_frame(avInputFormatContext2, avPacket) == 0) {
            if (avPacket->stream_index == input2AudioStream->index) {
                avPacket->stream_index = outputAudioStream->index;
                avPacket->pts = av_rescale_q(avPacket->pts, input2AudioStream->time_base, outputAudioStream->time_base);
                avPacket->dts = av_rescale_q(avPacket->dts, input2AudioStream->time_base, outputAudioStream->time_base);
                avPacket->duration = av_rescale_q(avPacket->duration, input2AudioStream->time_base, outputAudioStream->time_base);
                av_interleaved_write_frame(avOutputFormatContext, avPacket);
            }

            av_packet_unref(avPacket);
        }

        // Write the output file trailer
        if (av_write_trailer(avOutputFormatContext) < 0) {
            std::cerr << "Error writing output file trailer." << std::endl;
            return;
        }

        av_packet_free(&avPacket);
        avformat_close_input(&avInputFormatContext1);
        avformat_close_input(&avInputFormatContext2);
        avformat_free_context(avOutputFormatContext);
    }
    catch (...) {
        std::exception_ptr p = std::current_exception();
        std::cerr << (p ? p.__cxa_exception_type()->name() : "null") << std::endl;
    }

}

I suspect the problem is related to how the packet timestamps are mapped to the output stream while writing frames from the second input. Any insights?


Solution

  • The pts field stands for "presentation timestamp": the idea is that every packet encodes sound from pts to pts + duration. If you want to concatenate audio, you thus need to shift all packets of the second sound by the pts + duration of the last packet in your first sound. With your inputs, that means shifting every packet of the second file by roughly 3 seconds, expressed in the output time base. Concretely:

    int64_t nextPts = 0;
    while (av_read_frame(avInputFormatContext1, avPacket) == 0) {
            if (avPacket->stream_index == input1AudioStream->index) {
                    avPacket->stream_index = outputAudioStream->index;
                    avPacket->pts = av_rescale_q(avPacket->pts, input1AudioStream->time_base, outputAudioStream->time_base);
                    avPacket->dts = av_rescale_q(avPacket->dts, input1AudioStream->time_base, outputAudioStream->time_base);
                    avPacket->duration = av_rescale_q(avPacket->duration, input1AudioStream->time_base, outputAudioStream->time_base);
                    nextPts = avPacket->pts + avPacket->duration;
                    av_interleaved_write_frame(avOutputFormatContext, avPacket);
            }
    
            av_packet_unref(avPacket);
    }
    while (av_read_frame(avInputFormatContext2, avPacket) == 0) {
            if (avPacket->stream_index == input2AudioStream->index) {
                    avPacket->stream_index = outputAudioStream->index;
                    // Shift by nextPts (the end of the first input) so the second input starts where the first ended
                    avPacket->pts = nextPts + av_rescale_q(avPacket->pts, input2AudioStream->time_base, outputAudioStream->time_base);
                    avPacket->dts = nextPts + av_rescale_q(avPacket->dts, input2AudioStream->time_base, outputAudioStream->time_base);
                    avPacket->duration = av_rescale_q(avPacket->duration, input2AudioStream->time_base, outputAudioStream->time_base);
                    av_interleaved_write_frame(avOutputFormatContext, avPacket);
            }
    
            av_packet_unref(avPacket);
    }
    

    Note that pts and dts might be a special value called AV_NOPTS_VALUE, in which case you have to preserve that AV_NOPTS_VALUE. That is left as an exercise for you :)
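
    For reference, here is a minimal sketch of that exercise. It assumes a small helper (the name rescaleWithOffset is hypothetical, not an FFmpeg API) that rescales a timestamp between time bases and applies the offset only when the value is a real timestamp:

    extern "C" {
    #include <libavutil/avutil.h>        // AV_NOPTS_VALUE
    #include <libavutil/mathematics.h>   // av_rescale_q
    }

    // Hypothetical helper: rescale ts from the src to the dst time base and
    // add an offset, passing AV_NOPTS_VALUE through untouched.
    static int64_t rescaleWithOffset(int64_t ts, AVRational src, AVRational dst, int64_t offset) {
        if (ts == AV_NOPTS_VALUE)
            return AV_NOPTS_VALUE;  // "no timestamp" must stay "no timestamp"
        return offset + av_rescale_q(ts, src, dst);
    }

    In the loops above you would then write, for example, avPacket->pts = rescaleWithOffset(avPacket->pts, input2AudioStream->time_base, outputAudioStream->time_base, nextPts);, and update nextPts only when both avPacket->pts and avPacket->duration are real values.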