Tags: c++, qt, ffmpeg

Audio Concatenation using the FFmpeg library


I'm working on implementing audio concatenation using the FFmpeg library. However, I've run into an issue: the output audio consists of the first 3 seconds of audio one followed by the last 2 seconds of audio two, rather than the expected total of 3 + 5 = 8 seconds.

void AudioConcat::concatenateAudio(const char* input1Path, const char* input2Path, const char* outputPath) {

    // Open input files
    AVPacket* avPacket = nullptr;
    AVFormatContext* avInputFormatContext1 = NULL;
    AVFormatContext* avInputFormatContext2 = NULL;
    AVFormatContext* avOutputFormatContext = nullptr;

    avPacket = av_packet_alloc();
    if (!avPacket) {
        std::cerr << "Failed to allocate AVPacket." << std::endl;
        qCritical("Failed to allocate AVPacket.");
        return;
    }

    try {

        if (avformat_open_input(&avInputFormatContext1, input1Path, 0, 0) < 0 ||
            avformat_open_input(&avInputFormatContext2, input2Path, 0, 0) < 0) {
            std::cerr << "Error opening input files." << std::endl;
            return;
        }

        if (avformat_find_stream_info(avInputFormatContext1, 0) < 0 ||
            avformat_find_stream_info(avInputFormatContext2, 0) < 0) {
            qCritical("%s", QString("Failed to retrieve the input stream information.").toStdString().c_str());
            return;
        }

        // Open output file
        if (avformat_alloc_output_context2(&avOutputFormatContext, nullptr, nullptr, outputPath) < 0) {
            std::cerr << "Error creating output format context." << std::endl;
            return;
        }

        // Find audio streams in input files
        AVStream* input1AudioStream = nullptr;
        AVStream* input2AudioStream = nullptr;

        for (unsigned int i = 0; i < avInputFormatContext1->nb_streams; ++i) {
            if (avInputFormatContext1->streams[i]->codecpar->codec_type == AVMEDIA_TYPE_AUDIO) {
                input1AudioStream = avInputFormatContext1->streams[i];
                break;
            }
        }

        for (unsigned int i = 0; i < avInputFormatContext2->nb_streams; ++i) {
            if (avInputFormatContext2->streams[i]->codecpar->codec_type == AVMEDIA_TYPE_AUDIO) {
                input2AudioStream = avInputFormatContext2->streams[i];
                break;
            }
        }

        if (!input1AudioStream || !input2AudioStream) {
            std::cerr << "Error finding audio streams in input files." << std::endl;
            return;
        }

        // Create new audio stream in the output file
        AVStream* outputAudioStream = avformat_new_stream(avOutputFormatContext, nullptr);
        if (!outputAudioStream) {
            std::cerr << "Error creating new audio stream in the output file." << std::endl;
            return;
        }

        // Copy codec parameters from input streams to output stream
        avcodec_parameters_copy(outputAudioStream->codecpar, input1AudioStream->codecpar);

        // Write the output file header
        if (!(avOutputFormatContext->oformat->flags & AVFMT_NOFILE)) {
            int operationResult = avio_open(&avOutputFormatContext->pb, outputPath, AVIO_FLAG_WRITE);
            if (operationResult < 0) {
                qCritical(
                    "%s", QString("Failed to open the output file '%1'.").arg(outputPath).toStdString().c_str());
                return;
            }
        }
        if (avformat_write_header(avOutputFormatContext, NULL) < 0) {
            std::cerr << "Error writing output file header." << std::endl;
            return;
        }

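        // Copy audio packets from the first input, rescaling timestamps into the output time base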
        while (av_read_frame(avInputFormatContext1, avPacket) == 0) {
            if (avPacket->stream_index == input1AudioStream->index) {
                avPacket->stream_index = outputAudioStream->index;
                avPacket->pts = av_rescale_q(avPacket->pts, input1AudioStream->time_base, outputAudioStream->time_base);
                avPacket->dts = av_rescale_q(avPacket->dts, input1AudioStream->time_base, outputAudioStream->time_base);
                avPacket->duration = av_rescale_q(avPacket->duration, input1AudioStream->time_base, outputAudioStream->time_base);
                av_interleaved_write_frame(avOutputFormatContext, avPacket);
            }

            av_packet_unref(avPacket);
        }
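        // Copy audio packets from the second input the same way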
        while (av_read_frame(avInputFormatContext2, avPacket) == 0) {
            if (avPacket->stream_index == input2AudioStream->index) {
                avPacket->stream_index = outputAudioStream->index;
                avPacket->pts = av_rescale_q(avPacket->pts, input2AudioStream->time_base, outputAudioStream->time_base);
                avPacket->dts = av_rescale_q(avPacket->dts, input2AudioStream->time_base, outputAudioStream->time_base);
                avPacket->duration = av_rescale_q(avPacket->duration, input2AudioStream->time_base, outputAudioStream->time_base);
                av_interleaved_write_frame(avOutputFormatContext, avPacket);
            }

            av_packet_unref(avPacket);
        }

        // Write the output file trailer
        if (av_write_trailer(avOutputFormatContext) < 0) {
            std::cerr << "Error writing output file trailer." << std::endl;
            return;
        }

        av_packet_free(&avPacket);
        avformat_close_input(&avInputFormatContext1);
        avformat_close_input(&avInputFormatContext2);
        avformat_free_context(avOutputFormatContext);
    }
    catch (...) {
        std::exception_ptr p = std::current_exception();
        std::cerr << (p ? p.__cxa_exception_type()->name() : "null") << std::endl;
    }

}

I suspect the problem is related to how the packet timestamps are mapped to the output stream while writing frames from the second input. Any insights?


Solution

  • The pts field stands for "presentation timestamp": the idea is that every packet encodes sound from pts to pts + duration. If you want to concatenate audio, you thus need to shift all packets of the second sound by the pts + duration of the last packet in your first sound. With your inputs, that means shifting every packet of the second file by roughly 3 seconds, expressed in the output time base. Concretely:

    int64_t nextPts = 0;
    while (av_read_frame(avInputFormatContext1, avPacket) == 0) {
            if (avPacket->stream_index == input1AudioStream->index) {
                    avPacket->stream_index = outputAudioStream->index;
                    avPacket->pts = av_rescale_q(avPacket->pts, input1AudioStream->time_base, outputAudioStream->time_base);
                    avPacket->dts = av_rescale_q(avPacket->dts, input1AudioStream->time_base, outputAudioStream->time_base);
                    avPacket->duration = av_rescale_q(avPacket->duration, input1AudioStream->time_base, outputAudioStream->time_base);
                    nextPts = avPacket->pts + avPacket->duration;
                    av_interleaved_write_frame(avOutputFormatContext, avPacket);
            }
    
            av_packet_unref(avPacket);
    }
    while (av_read_frame(avInputFormatContext2, avPacket) == 0) {
            if (avPacket->stream_index == input2AudioStream->index) {
                    avPacket->stream_index = outputAudioStream->index;
                    // Shift by nextPts (the end of the first input) so the second input starts where the first ended
                    avPacket->pts = nextPts + av_rescale_q(avPacket->pts, input2AudioStream->time_base, outputAudioStream->time_base);
                    avPacket->dts = nextPts + av_rescale_q(avPacket->dts, input2AudioStream->time_base, outputAudioStream->time_base);
                    avPacket->duration = av_rescale_q(avPacket->duration, input2AudioStream->time_base, outputAudioStream->time_base);
                    av_interleaved_write_frame(avOutputFormatContext, avPacket);
            }
    
            av_packet_unref(avPacket);
    }
    

    Note that pts and dts might be a special value called AV_NOPTS_VALUE, in which case you have to preserve that AV_NOPTS_VALUE. That is left as an exercise for you :)
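
    For reference, here is a minimal sketch of that exercise. It assumes a small helper (the name rescaleWithOffset is hypothetical, not an FFmpeg API) that rescales a timestamp between time bases and applies the offset only when the value is a real timestamp:

    extern "C" {
    #include <libavutil/avutil.h>        // AV_NOPTS_VALUE
    #include <libavutil/mathematics.h>   // av_rescale_q
    }

    // Hypothetical helper: rescale ts from the src to the dst time base and
    // add an offset, passing AV_NOPTS_VALUE through untouched.
    static int64_t rescaleWithOffset(int64_t ts, AVRational src, AVRational dst, int64_t offset) {
        if (ts == AV_NOPTS_VALUE)
            return AV_NOPTS_VALUE;  // "no timestamp" must stay "no timestamp"
        return offset + av_rescale_q(ts, src, dst);
    }

    In the loops above you would then write, for example, avPacket->pts = rescaleWithOffset(avPacket->pts, input2AudioStream->time_base, outputAudioStream->time_base, nextPts);, and update nextPts only when both avPacket->pts and avPacket->duration are real values.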