I'm working on implementing audio concatenation using the FFmpeg library. However, I've run into an issue where the output consists of the first 3 seconds of the first audio file followed by the last 2 seconds of the second, rather than the expected total of 3 + 5 = 8 seconds.
void AudioConcat::concatenateAudio(const char* input1Path, const char* input2Path, const char* outputPath) {
// Open input files
AVPacket* avPacket = nullptr;
AVFormatContext* avInputFormatContext1 = nullptr;
AVFormatContext* avInputFormatContext2 = nullptr;
AVFormatContext* avOutputFormatContext = nullptr;
avPacket = av_packet_alloc();
if (!avPacket) {
std::cerr << "Failed to allocate AVPacket." << std::endl;
qCritical("Failed to allocate AVPacket.");
return;
}
try {
if (avformat_open_input(&avInputFormatContext1, input1Path, nullptr, nullptr) < 0 ||
avformat_open_input(&avInputFormatContext2, input2Path, nullptr, nullptr) < 0) {
std::cerr << "Error opening input files." << std::endl;
return;
}
if (avformat_find_stream_info(avInputFormatContext1, nullptr) < 0 ||
avformat_find_stream_info(avInputFormatContext2, nullptr) < 0) {
std::cerr << "Failed to retrieve the input stream information." << std::endl;
return;
}
// Open output file
if (avformat_alloc_output_context2(&avOutputFormatContext, nullptr, nullptr, outputPath) < 0) {
std::cerr << "Error creating output format context." << std::endl;
return;
}
// Find audio streams in input files
AVStream* input1AudioStream = nullptr;
AVStream* input2AudioStream = nullptr;
for (unsigned int i = 0; i < avInputFormatContext1->nb_streams; ++i) {
if (avInputFormatContext1->streams[i]->codecpar->codec_type == AVMEDIA_TYPE_AUDIO) {
input1AudioStream = avInputFormatContext1->streams[i];
break;
}
}
for (unsigned int i = 0; i < avInputFormatContext2->nb_streams; ++i) {
if (avInputFormatContext2->streams[i]->codecpar->codec_type == AVMEDIA_TYPE_AUDIO) {
input2AudioStream = avInputFormatContext2->streams[i];
break;
}
}
if (!input1AudioStream || !input2AudioStream) {
std::cerr << "Error finding audio streams in input files." << std::endl;
return;
}
// Create new audio stream in the output file
AVStream* outputAudioStream = avformat_new_stream(avOutputFormatContext, nullptr);
if (!outputAudioStream) {
std::cerr << "Error creating new audio stream in the output file." << std::endl;
return;
}
// Copy codec parameters from the first input to the output stream
// (this assumes both inputs share the same codec, sample rate, and channel layout)
if (avcodec_parameters_copy(outputAudioStream->codecpar, input1AudioStream->codecpar) < 0) {
std::cerr << "Error copying codec parameters." << std::endl;
return;
}
// Write the output file header
if (!(avOutputFormatContext->oformat->flags & AVFMT_NOFILE)) {
int operationResult = avio_open(&avOutputFormatContext->pb, outputPath, AVIO_FLAG_WRITE);
if (operationResult < 0) {
std::cerr << "Failed to open the output file '" << outputPath << "'." << std::endl;
return;
}
}
if (avformat_write_header(avOutputFormatContext, nullptr) < 0) {
std::cerr << "Error writing output file header." << std::endl;
return;
}
while (av_read_frame(avInputFormatContext1, avPacket) == 0) {
if (avPacket->stream_index == input1AudioStream->index) {
avPacket->stream_index = outputAudioStream->index;
avPacket->pts = av_rescale_q(avPacket->pts, input1AudioStream->time_base, outputAudioStream->time_base);
avPacket->dts = av_rescale_q(avPacket->dts, input1AudioStream->time_base, outputAudioStream->time_base);
avPacket->duration = av_rescale_q(avPacket->duration, input1AudioStream->time_base, outputAudioStream->time_base);
av_interleaved_write_frame(avOutputFormatContext, avPacket);
}
av_packet_unref(avPacket);
}
while (av_read_frame(avInputFormatContext2, avPacket) == 0) {
if (avPacket->stream_index == input2AudioStream->index) {
avPacket->stream_index = outputAudioStream->index;
avPacket->pts = av_rescale_q(avPacket->pts, input2AudioStream->time_base, outputAudioStream->time_base);
avPacket->dts = av_rescale_q(avPacket->dts, input2AudioStream->time_base, outputAudioStream->time_base);
avPacket->duration = av_rescale_q(avPacket->duration, input2AudioStream->time_base, outputAudioStream->time_base);
av_interleaved_write_frame(avOutputFormatContext, avPacket);
}
av_packet_unref(avPacket);
}
// Write the output file trailer
if (av_write_trailer(avOutputFormatContext) < 0) {
std::cerr << "Error writing output file trailer." << std::endl;
return;
}
av_packet_free(&avPacket);
avformat_close_input(&avInputFormatContext1);
avformat_close_input(&avInputFormatContext2);
if (!(avOutputFormatContext->oformat->flags & AVFMT_NOFILE))
avio_closep(&avOutputFormatContext->pb);
avformat_free_context(avOutputFormatContext);
}
catch (const std::exception& e) {
std::cerr << e.what() << std::endl;
}
catch (...) {
std::cerr << "Unknown exception was thrown." << std::endl;
}
}
I suspect this problem is related to how the packet timestamps from the second input are written against the output stream's time base. Any insights?
The pts field stands for "presentation timestamp": every packet encodes sound from pts to pts + duration. If you want to concatenate audio, you thus need to shift all packets of the second sound by the pts + duration of the last packet in your first sound. Concretely:
int64_t nextPts = 0;
while (av_read_frame(avInputFormatContext1, avPacket) == 0) {
if (avPacket->stream_index == input1AudioStream->index) {
avPacket->stream_index = outputAudioStream->index;
avPacket->pts = av_rescale_q(avPacket->pts, input1AudioStream->time_base, outputAudioStream->time_base);
avPacket->dts = av_rescale_q(avPacket->dts, input1AudioStream->time_base, outputAudioStream->time_base);
avPacket->duration = av_rescale_q(avPacket->duration, input1AudioStream->time_base, outputAudioStream->time_base);
// Remember where the first input ends, in the output time base.
nextPts = avPacket->pts + avPacket->duration;
av_interleaved_write_frame(avOutputFormatContext, avPacket);
}
av_packet_unref(avPacket);
}
while (av_read_frame(avInputFormatContext2, avPacket) == 0) {
if (avPacket->stream_index == input2AudioStream->index) {
avPacket->stream_index = outputAudioStream->index;
// Shift by nextPts, the end of the first input computed above.
avPacket->pts = nextPts + av_rescale_q(avPacket->pts, input2AudioStream->time_base, outputAudioStream->time_base);
avPacket->dts = nextPts + av_rescale_q(avPacket->dts, input2AudioStream->time_base, outputAudioStream->time_base);
avPacket->duration = av_rescale_q(avPacket->duration, input2AudioStream->time_base, outputAudioStream->time_base);
av_interleaved_write_frame(avOutputFormatContext, avPacket);
}
av_packet_unref(avPacket);
}
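As an aside: if you also update nextPts inside the second loop, the copy logic becomes identical for every input, and you can factor it into a helper to chain any number of files. A minimal sketch under the same setup as above (appendInput is a hypothetical name, not an FFmpeg API):

static int64_t appendInput(AVFormatContext* inCtx, AVStream* inStream,
                           AVFormatContext* outCtx, AVStream* outStream,
                           AVPacket* pkt, int64_t nextPts) {
    // Remux every audio packet, shifted so this input starts at nextPts.
    while (av_read_frame(inCtx, pkt) == 0) {
        if (pkt->stream_index == inStream->index) {
            pkt->stream_index = outStream->index;
            pkt->pts = nextPts + av_rescale_q(pkt->pts, inStream->time_base, outStream->time_base);
            pkt->dts = nextPts + av_rescale_q(pkt->dts, inStream->time_base, outStream->time_base);
            pkt->duration = av_rescale_q(pkt->duration, inStream->time_base, outStream->time_base);
            // The end of this packet becomes the offset for whatever comes next.
            nextPts = pkt->pts + pkt->duration;
            av_interleaved_write_frame(outCtx, pkt);
        }
        av_packet_unref(pkt);
    }
    return nextPts;
}

With that, both loops collapse into two calls: nextPts = appendInput(avInputFormatContext1, input1AudioStream, avOutputFormatContext, outputAudioStream, avPacket, 0); and then the same call again with the second input and the returned nextPts.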
Note that pts and dts might be a special value called AV_NOPTS_VALUE, in which case you have to preserve that AV_NOPTS_VALUE. That is left as an exercise for you :)
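If you want a starting point for that exercise: wrap the rescaling in a small guard. A sketch (shiftAndRescale is a hypothetical helper, not part of FFmpeg):

static int64_t shiftAndRescale(int64_t ts, int64_t offset,
                               AVRational from, AVRational to) {
    // An unset timestamp must stay unset; rescaling AV_NOPTS_VALUE
    // would turn it into a bogus concrete value.
    if (ts == AV_NOPTS_VALUE)
        return AV_NOPTS_VALUE;
    return offset + av_rescale_q(ts, from, to);
}

Then avPacket->pts = shiftAndRescale(avPacket->pts, nextPts, input2AudioStream->time_base, outputAudioStream->time_base); and likewise for dts. Also only update nextPts from packets whose pts is valid, since an unset pts tells you nothing about where the stream ends.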