I am trying to create a function that combines an audio file and a video file and muxes them into an mp4. I've managed to do so successfully, except that the output is not at the correct frame rate. It's a very slight difference from the original: 30.13 fps whereas it should be exactly 30. When I combine the same files with the ffmpeg command-line program, the result is exactly 30 as it should be.
I'm confident it has something to do with the dts/pts correction applied when packets arrive out of order, but the ffmpeg program does this too, in a similar manner, so I'm not sure where to go from here. I've looked at the ffmpeg source code and copied some of its dts correction logic, and still no luck. What am I doing wrong here?
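(For reference, the command-line combine I'm comparing against is just a plain stream copy, something along the lines of the following, with placeholder filenames:)

ffmpeg -i audio.m4a -i video.mp4 -c copy output.mp4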
extern "C" {
#include <libavformat/avformat.h>
}

bool mux_audio_video(const char* audio_filename, const char* video_filename, const char* output_filename){
    av_register_all();
    AVOutputFormat* out_format = NULL;
    AVFormatContext* audio_context = NULL, *video_context = NULL, *output_context = NULL;
    int video_index_in = -1, audio_index_in = -1;
    int video_index_out = -1, audio_index_out = -1;
    if(avformat_open_input(&audio_context, audio_filename, 0, 0) < 0)
        return false;
    if(avformat_find_stream_info(audio_context, 0) < 0){
        avformat_close_input(&audio_context);
        return false;
    }
    if(avformat_open_input(&video_context, video_filename, 0, 0) < 0){
        avformat_close_input(&audio_context);
        return false;
    }
    if(avformat_find_stream_info(video_context, 0) < 0){
        avformat_close_input(&audio_context);
        avformat_close_input(&video_context);
        return false;
    }
    if(avformat_alloc_output_context2(&output_context, av_guess_format("mp4", NULL, NULL), NULL, output_filename) < 0){
        avformat_close_input(&audio_context);
        avformat_close_input(&video_context);
        return false;
    }
    out_format = output_context->oformat;
    //find first audio stream in the audio file input
    for(size_t i = 0;i < audio_context->nb_streams;++i){
        if(audio_context->streams[i]->codecpar->codec_type == AVMEDIA_TYPE_AUDIO){
            audio_index_in = i;
            AVStream* in_stream = audio_context->streams[i];
            AVCodec* codec = avcodec_find_encoder(in_stream->codecpar->codec_id);
            AVCodecContext* tmp = avcodec_alloc_context3(codec);
            avcodec_parameters_to_context(tmp, in_stream->codecpar);
            AVStream* out_stream = avformat_new_stream(output_context, codec);
            audio_index_out = out_stream->index;
            if(output_context->oformat->flags & AVFMT_GLOBALHEADER){
                tmp->flags |= AV_CODEC_FLAG_GLOBAL_HEADER;
            }
            tmp->codec_tag = 0;
            avcodec_parameters_from_context(out_stream->codecpar, tmp);
            avcodec_free_context(&tmp);
            break;
        }
    }
    //find first video stream in the video file input
    for(size_t i = 0;i < video_context->nb_streams;++i){
        if(video_context->streams[i]->codecpar->codec_type == AVMEDIA_TYPE_VIDEO){
            video_index_in = i;
            AVStream* in_stream = video_context->streams[i];
            AVCodec* codec = avcodec_find_encoder(in_stream->codecpar->codec_id);
            AVCodecContext* tmp = avcodec_alloc_context3(codec);
            avcodec_parameters_to_context(tmp, in_stream->codecpar);
            AVStream* out_stream = avformat_new_stream(output_context, codec);
            video_index_out = out_stream->index;
            if(output_context->oformat->flags & AVFMT_GLOBALHEADER){
                tmp->flags |= AV_CODEC_FLAG_GLOBAL_HEADER;
            }
            tmp->codec_tag = 0;
            avcodec_parameters_from_context(out_stream->codecpar, tmp);
            avcodec_free_context(&tmp);
            break;
        }
    }
    //setup output
    if(!(out_format->flags & AVFMT_NOFILE)){
        if(avio_open(&output_context->pb, output_filename, AVIO_FLAG_WRITE) < 0){
            avformat_free_context(output_context);
            avformat_close_input(&audio_context);
            avformat_close_input(&video_context);
            return false;
        }
    }
    if(avformat_write_header(output_context, NULL) < 0){
        if(!(out_format->flags & AVFMT_NOFILE)){
            avio_close(output_context->pb);
        }
        avformat_free_context(output_context);
        avformat_close_input(&audio_context);
        avformat_close_input(&video_context);
        return false;
    }
    int64_t video_pts = 0, audio_pts = 0;
    int64_t last_video_dts = 0, last_audio_dts = 0;
    while(true){
        AVPacket packet;
        av_init_packet(&packet);
        packet.data = NULL;
        packet.size = 0;
        int64_t* last_dts;
        AVFormatContext* in_context;
        int stream_index = 0;
        AVStream* in_stream, *out_stream;
        //Read in a frame from the next stream
        if(av_compare_ts(video_pts, video_context->streams[video_index_in]->time_base,
                         audio_pts, audio_context->streams[audio_index_in]->time_base) <= 0)
        {
            //video
            last_dts = &last_video_dts;
            in_context = video_context;
            stream_index = video_index_out;
            if(av_read_frame(in_context, &packet) >= 0){
                do{
                    if(packet.stream_index == video_index_in){
                        video_pts = packet.pts;
                        break;
                    }
                    av_packet_unref(&packet);
                }while(av_read_frame(in_context, &packet) >= 0);
            }else{
                break;
            }
        }else{
            //audio
            last_dts = &last_audio_dts;
            in_context = audio_context;
            stream_index = audio_index_out;
            if(av_read_frame(in_context, &packet) >= 0){
                do{
                    if(packet.stream_index == audio_index_in){
                        audio_pts = packet.pts;
                        break;
                    }
                    av_packet_unref(&packet);
                }while(av_read_frame(in_context, &packet) >= 0);
            }else{
                break;
            }
        }
        in_stream = in_context->streams[packet.stream_index];
        out_stream = output_context->streams[stream_index];
        av_packet_rescale_ts(&packet, in_stream->time_base, out_stream->time_base);
        //if dts is out of order, ffmpeg throws an error. So manually fix. Similar to what ffmpeg does in ffmpeg.c
        if(packet.dts < (*last_dts + !(output_context->oformat->flags & AVFMT_TS_NONSTRICT)) && packet.dts != AV_NOPTS_VALUE && (*last_dts) != AV_NOPTS_VALUE){
            int64_t next_dts = (*last_dts)+1;
            if(packet.pts >= packet.dts && packet.pts != AV_NOPTS_VALUE){
                packet.pts = FFMAX(packet.pts, next_dts);
            }
            if(packet.pts == AV_NOPTS_VALUE){
                packet.pts = next_dts;
            }
            packet.dts = next_dts;
        }
        (*last_dts) = packet.dts;
        packet.pos = -1;
        packet.stream_index = stream_index;
        //output packet
        if(av_interleaved_write_frame(output_context, &packet) < 0){
            break;
        }
        av_packet_unref(&packet);
    }
    av_write_trailer(output_context);
    //cleanup
    if(!(out_format->flags & AVFMT_NOFILE)){
        avio_close(output_context->pb);
    }
    avformat_free_context(output_context);
    avformat_close_input(&audio_context);
    avformat_close_input(&video_context);
    return true;
}
I found the issue. I just needed to initialize last_video_dts and last_audio_dts to the minimum value for int64_t instead of 0. With them starting at 0, the first packet(s) of each stream, whose rescaled dts is at or below 0, hit the out-of-order correction branch and get their timestamps bumped forward, which is what slightly skewed the reported frame rate.

int64_t last_video_dts, last_audio_dts;
last_video_dts = last_audio_dts = std::numeric_limits<int64_t>::lowest();

Now the output is essentially identical to that of the ffmpeg program.
Edit:
As mentioned by kamilz, it is better and more portable to use AV_NOPTS_VALUE (which libavutil defines as that same INT64_MIN value):

int64_t last_video_dts, last_audio_dts;
last_video_dts = last_audio_dts = AV_NOPTS_VALUE;
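For completeness, here is the same correction pulled out into a small standalone helper. This is just a sketch of my own (the helper name and signature are not from FFmpeg); it shows why the sentinel matters: because *last_dts starts out as AV_NOPTS_VALUE, the whole correction branch is skipped for the very first packet of each stream, and only genuinely out-of-order packets get bumped.

extern "C" {
#include <libavformat/avformat.h>
}

//Hypothetical refactoring of the inline correction in mux_audio_video above.
//*last_dts must be initialized to AV_NOPTS_VALUE before the first call so the
//first packet of the stream passes through with its timestamps untouched.
static void fix_out_of_order_dts(AVPacket* packet, int64_t* last_dts, const AVOutputFormat* oformat)
{
    //require strictly increasing dts unless the muxer allows equal timestamps
    int64_t min_step = !(oformat->flags & AVFMT_TS_NONSTRICT);
    if(packet->dts != AV_NOPTS_VALUE && (*last_dts) != AV_NOPTS_VALUE &&
       packet->dts < (*last_dts) + min_step){
        int64_t next_dts = (*last_dts) + 1;
        if(packet->pts != AV_NOPTS_VALUE && packet->pts >= packet->dts){
            packet->pts = FFMAX(packet->pts, next_dts);
        }
        if(packet->pts == AV_NOPTS_VALUE){
            packet->pts = next_dts;
        }
        packet->dts = next_dts;
    }
    (*last_dts) = packet->dts;
}

It would be called right after av_packet_rescale_ts and before av_interleaved_write_frame, e.g. fix_out_of_order_dts(&packet, last_dts, output_context->oformat);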