Tags: ffmpeg, h.264, libavcodec, libavformat

Transcoded video stream unplayable in QuickTime player


Currently I'm writing software for transcoding media files using the ffmpeg libraries. The problem is that, in the case of H.264, QuickTime cannot play the resulting stream and shows a black screen. Audio streams work as expected. I have read that QuickTime can only deal with the yuv420p pixel format, and that is indeed the format of the encoded video.
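
That is easy to double-check from the command line; for example, ffprobe can print the pixel format of the first video stream (here 123.mp4 is the transcoded file from the dump further below):

ffprobe -v error -select_streams v:0 -show_entries stream=pix_fmt -of default=noprint_wrappers=1 123.mp4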

I looked through the ffmpeg examples and the ffmpeg source code but could not find any clues as to where the problem might be. I would really appreciate any help.

The only thing I managed to get out of QuickTime is a "SeqAndPicParamSetFromCFDictionaryRef, bad config record" message in the console. The same message is logged by AVPlayer from AVFoundation.
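
From what I understand, the "bad config record" part refers to the avcC configuration record, i.e. the H.264 SPS/PPS parameter sets that the mov/mp4 muxer builds from the encoder's extradata, so one sanity check is to dump each output stream's extradata just before avformat_write_header(). A minimal sketch (the helper name dumpExtradata is mine, not part of the project code):

extern "C" {
#include <libavformat/avformat.h>
}
#include <cstdio>

// Print the global header bytes (SPS/PPS for H.264) attached to a stream.
// If extradata is empty, the mov/mp4 muxer cannot write a valid avcC box
// and players will reject the configuration record.
static void dumpExtradata(const AVStream *stream) {
    const AVCodecParameters *par = stream->codecpar;
    if (par->extradata_size == 0) {
        fprintf(stderr, "stream %d: extradata is empty\n", stream->index);
        return;
    }
    fprintf(stderr, "stream %d: %d bytes of extradata:", stream->index, par->extradata_size);
    for (int i = 0; i < par->extradata_size; ++i) {
        fprintf(stderr, " %02x", par->extradata[i]);
    }
    fprintf(stderr, "\n");
}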

Here is the initialization of output streams and encoders.

int status;

// avformat_alloc_output_context2()
if ((status = formatContext.open(destFilename)) < 0) {
    return status;
}

AVDictionary *fmtOptions = nullptr;
av_dict_set(&fmtOptions, "movflags", "faststart", 0);
av_dict_set(&fmtOptions, "brand", "mp42", 0);

streams.resize(input->getStreamsCount());
for (int i = 0; i < input->getStreamsCount(); ++i) {
    AVStream *inputStream = input->getStreamAtIndex(i);
    CodecContext &decoderContext = input->getDecoderAtIndex(i);

    // retrieve output codec by codec id
    auto encoderCodecId = decoderContext.getCodecID();
    if (decoderContext.getCodecType() == AVMEDIA_TYPE_VIDEO || decoderContext.getCodecType() == AVMEDIA_TYPE_AUDIO) {
        int codecIdKey = decoderContext.getCodecType() == AVMEDIA_TYPE_AUDIO ? IPROC_KEY_INT(TargetAudioCodecID) : IPROC_KEY_INT(TargetVideoCodecID);
        auto codecIdParam = static_cast<AVCodecID>(params[codecIdKey]);
        if (codecIdParam != AV_CODEC_ID_NONE) {
            encoderCodecId = codecIdParam;
        }
    }
    AVCodec *encoder = nullptr;
    if ((encoder = avcodec_find_encoder(encoderCodecId)) == nullptr) {
        status = AVERROR_ENCODER_NOT_FOUND;
        return status;
    }

    // create stream with specific codec and format
    AVStream *outputStream = nullptr;
    // avformat_new_stream()
    if ((outputStream = formatContext.newStream(encoder)) == nullptr) {
        return AVERROR(ENOMEM);
    }


    CodecContext encoderContext;
    // avcodec_alloc_context3()
    if ((status = encoderContext.init(encoder)) < 0) {
        return status;
    }

    outputStream->disposition = inputStream->disposition;
    encoderContext.getRawCtx()->chroma_sample_location = decoderContext.getRawCtx()->chroma_sample_location;

    if (encoderContext.getCodecType() == AVMEDIA_TYPE_VIDEO) {
        auto lang = av_dict_get(input->getStreamAtIndex(i)->metadata, "language", nullptr, 0);
        if (lang) {
            av_dict_set(&outputStream->metadata, "language", lang->value, 0);
        }

        // prepare encoder context
        int targetWidth = params[IPROC_KEY_INT(TargetVideoWidth)];
        int targetHeight = params[IPROC_KEY_INT(TargetVideHeight)];



        encoderContext.width() = targetWidth > 0 ? targetWidth : decoderContext.width();
        encoderContext.height() = targetHeight > 0 ? targetHeight : decoderContext.height();
        encoderContext.pixelFormat() = encoder->pix_fmts ? encoder->pix_fmts[0] : decoderContext.pixelFormat();
        encoderContext.timeBase() = decoderContext.timeBase();
        encoderContext.getRawCtx()->level = 31;
        encoderContext.getRawCtx()->gop_size = 25;

        // derive the output sample aspect ratio so that the display aspect
        // ratio of the source is preserved at the new frame size: SAR = DAR / FAR
        double far = static_cast<double>(encoderContext.getRawCtx()->width) / encoderContext.getRawCtx()->height;
        double dar = static_cast<double>(decoderContext.width()) / decoderContext.height();
        encoderContext.sampleAspectRatio() = av_d2q(dar / far, 255);


        encoderContext.getRawCtx()->bits_per_raw_sample = FFMIN(decoderContext.getRawCtx()->bits_per_raw_sample,
                                                                av_pix_fmt_desc_get(encoderContext.pixelFormat())->comp[0].depth);
        encoderContext.getRawCtx()->framerate = inputStream->r_frame_rate;
        outputStream->avg_frame_rate = encoderContext.getRawCtx()->framerate;

        VideoFilterGraphParameters params;
        params.height = encoderContext.height();
        params.width = encoderContext.width();
        params.pixelFormat = encoderContext.pixelFormat();
        if ((status = generateGraph(decoderContext, encoderContext, params, streams[i].filterGraph)) < 0) {
            return status;
        }

    } else if (encoderContext.getCodecType() == AVMEDIA_TYPE_AUDIO) {
        auto lang = av_dict_get(input->getStreamAtIndex(i)->metadata, "language", nullptr, 0);
        if (lang) {
            av_dict_set(&outputStream->metadata, "language", lang->value, 0);
        }

        encoderContext.sampleRate() = params[IPROC_KEY_INT(TargetAudioSampleRate)] ? : decoderContext.sampleRate();
        encoderContext.channels() = params[IPROC_KEY_INT(TargetAudioChannels)] ? : decoderContext.channels();
        auto paramChannelLayout = params[IPROC_KEY_INT(TargetAudioChannelLayout)];
        if (paramChannelLayout) {
            encoderContext.channelLayout() = paramChannelLayout;
        } else {
            encoderContext.channelLayout() = av_get_default_channel_layout(encoderContext.channels());
        }

        AVSampleFormat sampleFormatParam = static_cast<AVSampleFormat>(params[IPROC_KEY_INT(TargetAudioSampleFormat)]);
        if (sampleFormatParam != AV_SAMPLE_FMT_NONE) {
            encoderContext.sampleFormat() = sampleFormatParam;
        } else if (encoder->sample_fmts) {
            encoderContext.sampleFormat() = encoder->sample_fmts[0];
        } else {
            encoderContext.sampleFormat() = decoderContext.sampleFormat();
        }

        encoderContext.timeBase().num = 1;
        encoderContext.timeBase().den = encoderContext.sampleRate();

        AudioFilterGraphParameters params;
        params.channelLayout = encoderContext.channelLayout();
        params.channels = encoderContext.channels();
        params.format = encoderContext.sampleFormat();
        params.sampleRate = encoderContext.sampleRate();
        if ((status = generateGraph(decoderContext, encoderContext, params, streams[i].filterGraph)) < 0) {
            return status;
        }
    }

    // before using encoder, we should open it and update its parameters
    printf("Codec bits per sample %d\n", av_get_bits_per_sample(encoderCodecId));
    AVDictionary *options = nullptr;
    // avcodec_open2()
    if ((status = encoderContext.open(encoder, &options)) < 0) {
        return status;
    }
    if (streams[i].filterGraph) {
        streams[i].filterGraph.setOutputFrameSize(encoderContext.getFrameSize());
    }
    // avcodec_parameters_from_context()
    if ((status = encoderContext.fillParamters(outputStream->codecpar)) < 0) {
        return status;
    }
    // avcodec_parameters_from_context() already copied the format; only
    // override it for video so audio streams keep their sample format
    if (encoderContext.getCodecType() == AVMEDIA_TYPE_VIDEO) {
        outputStream->codecpar->format = encoderContext.getRawCtx()->pix_fmt;
    }

    if (formatContext.getRawCtx()->oformat->flags & AVFMT_GLOBALHEADER) {
        encoderContext.getRawCtx()->flags |= CODEC_FLAG_GLOBAL_HEADER;
    }

    if (encoderContext.getRawCtx()->nb_coded_side_data) {
        int i;

        for (i = 0; i < encoderContext.getRawCtx()->nb_coded_side_data; i++) {
            const AVPacketSideData *sd_src = &encoderContext.getRawCtx()->coded_side_data[i];
            uint8_t *dst_data;

            dst_data = av_stream_new_side_data(outputStream, sd_src->type, sd_src->size);
            if (!dst_data)
                return AVERROR(ENOMEM);
            memcpy(dst_data, sd_src->data, sd_src->size);
        }
    }

    /*
     * Add global input side data. For now this is naive, and copies it
     * from the input stream's global side data. All side data should
     * really be funneled over AVFrame and libavfilter, then added back to
     * packet side data, and then potentially using the first packet for
     * global side data.
     */
    for (int i = 0; i < inputStream->nb_side_data; i++) {
        AVPacketSideData *sd = &inputStream->side_data[i];
        uint8_t *dst = av_stream_new_side_data(outputStream, sd->type, sd->size);
        if (!dst)
            return AVERROR(ENOMEM);
        memcpy(dst, sd->data, sd->size);
    }

    // copy timebase while removing common factors
    if (outputStream->time_base.num <= 0 || outputStream->time_base.den <= 0) {
        outputStream->time_base = av_add_q(encoderContext.timeBase(), (AVRational){0, 1});
    }

    // copy estimated duration as a hint to the muxer
    if (outputStream->duration <= 0 && inputStream->duration > 0) {
        outputStream->duration = av_rescale_q(inputStream->duration, inputStream->time_base, outputStream->time_base);
    }

    streams[i].codecType = encoderContext.getRawCtx()->codec_type;
    streams[i].codec = std::move(encoderContext);
    streams[i].streamIndex = i;
}

// avio_open() and avformat_write_header()
if ((status = formatContext.writeHeader(fmtOptions)) < 0) {
    return status;
}

formatContext.dumpFormat();

Reading from the input stream.

int InputProcessor::performStep() {
    int status;

    Packet nextPacket;
    if ((status = input->getFormatContext().readFrame(nextPacket)) < 0) {
        return status;
    }
    int streamIndex = nextPacket.getStreamIndex();
    ++streams[streamIndex].readPackets;
    CodecContext &decoder = input->getDecoderAtIndex(streamIndex);
    AVStream *inputStream = input->getStreamAtIndex(streamIndex);

    if (streams[streamIndex].readPackets == 1) {
        for (int i = 0; i < inputStream->nb_side_data; ++i) {
            AVPacketSideData *src_sd = &inputStream->side_data[i];
            uint8_t *dst_data;

            if (src_sd->type == AV_PKT_DATA_DISPLAYMATRIX) {
                continue;
            }
            if (av_packet_get_side_data(nextPacket.getRawPtr(), src_sd->type, nullptr)) {
                continue;
            }
            dst_data = av_packet_new_side_data(nextPacket.getRawPtr(), src_sd->type, src_sd->size);
            if (!dst_data) {
                return AVERROR(ENOMEM);
            }
            memcpy(dst_data, src_sd->data, src_sd->size);
        }
    }

    nextPacket.rescaleTimestamps(inputStream->time_base, decoder.timeBase());

    status = decodePacket(&nextPacket, streamIndex);
    if (status < 0 && status != AVERROR(EAGAIN)) {
        return status;
    }
    return 0;
}

Here is the decoding/encoding code.

int InputProcessor::decodePacket(Packet *packet, int streamIndex) {
    int status;
    int sendStatus;

    auto &decoder = input->getDecoderAtIndex(streamIndex);

    do {
        if (packet == nullptr) {
            sendStatus = decoder.flushDecodedFrames();
        } else {
            sendStatus = decoder.sendPacket(*packet);
        }

        if (sendStatus < 0 && sendStatus != AVERROR(EAGAIN) && sendStatus != AVERROR_EOF) {
            return sendStatus;
        }
        if (sendStatus == 0 && packet) {
            ++streams[streamIndex].decodedPackets;
        }

        Frame decodedFrame;
        while (true) {
            if ((status = decoder.receiveFrame(decodedFrame)) < 0) {
                break;
            }
            ++streams[streamIndex].decodedFrames;
            if ((status = filterAndWriteFrame(&decodedFrame, streamIndex)) < 0) {
                break;
            }
            decodedFrame.unref();
        }
    } while (sendStatus == AVERROR(EAGAIN));

    return status;
}

int InputProcessor::encodeAndWriteFrame(Frame *frame, int streamIndex) {
    assert(input->isValid());
    assert(formatContext);

    int status = 0;
    int sendStatus;

    Packet packet;

    CodecContext &encoderContext = streams[streamIndex].codec;

    do {
        if (frame) {
            sendStatus = encoderContext.sendFrame(*frame);
        } else {
            sendStatus = encoderContext.flushEncodedPackets();
        }
        if (sendStatus < 0 && sendStatus != AVERROR(EAGAIN) && sendStatus != AVERROR_EOF) {
            return sendStatus;
        }
        if (sendStatus == 0 && frame) {
            ++streams[streamIndex].encodedFrames;
        }

        while (true) {
            if ((status = encoderContext.receivePacket(packet)) < 0) {
                break;
            }
            ++streams[streamIndex].encodedPackets;
            packet.setStreamIndex(streamIndex);
            auto sourceTimebase = encoderContext.timeBase();
            auto dstTimebase = formatContext.getStreams()[streamIndex]->time_base;
            packet.rescaleTimestamps(sourceTimebase, dstTimebase);
            if ((status = formatContext.writeFrameInterleaved(packet)) < 0) {
                return status;
            }
            packet.unref();
        }
    } while (sendStatus == AVERROR(EAGAIN));

    if (status != AVERROR(EAGAIN)) {
        return status;
    }

    return 0;
}

FFprobe output for the original video:

Input #0, matroska,webm, from 'testvideo':
  Metadata:
    title           : TestVideo
    encoder         : libebml v1.3.0 + libmatroska v1.4.0
    creation_time   : 2014-12-23T03:38:05.000000Z
  Duration: 00:02:29.25, start: 0.000000, bitrate: 79549 kb/s
    Stream #0:0(rus): Video: h264 (High 4:4:4 Predictive), yuv444p10le(pc, bt709, progressive), 2048x858 [SAR 1:1 DAR 1024:429], 24 fps, 24 tbr, 1k tbn, 48 tbc (default)
    Stream #0:1(rus): Audio: pcm_s24le, 48000 Hz, 6 channels, s32 (24 bit), 6912 kb/s (default)

Transcoded:

Input #0, mov,mp4,m4a,3gp,3g2,mj2, from '123.mp4':
  Metadata:
    major_brand     : mp42
    minor_version   : 512
    compatible_brands: isomiso2avc1mp41
    encoder         : Lavf57.71.100
  Duration: 00:02:29.27, start: 0.000000, bitrate: 4282 kb/s
    Stream #0:0(rus): Video: h264 (High) (avc1 / 0x31637661), yuv420p, 1280x720 [SAR 192:143 DAR 1024:429], 3940 kb/s, 24.01 fps, 24 tbr, 12288 tbn, 96 tbc (default)
    Metadata:
      handler_name    : VideoHandler
    Stream #0:1(rus): Audio: aac (LC) (mp4a / 0x6134706D), 48000 Hz, 5.1, fltp, 336 kb/s (default)
    Metadata:
      handler_name    : SoundHandler

Solution

  • The problem was the wrong order of steps when initializing the encoder. In the transcoding.c example, CODEC_FLAG_GLOBAL_HEADER is assigned to the AVCodecContext.flags property after avcodec_open2() has been called. I assumed that was correct and did the same thing in my code. That left the extradata field uninitialized, so QuickTime was unable to parse the resulting stream. Setting the flag before opening the codec solved the problem (a plain-API sketch of the same ordering follows the result code below).

    Result code:

            // should be placed before avcodec_open2
            if (formatContext.getRawCtx()->oformat->flags & AVFMT_GLOBALHEADER) {
                encoderContext.getRawCtx()->flags |= CODEC_FLAG_GLOBAL_HEADER;
            }
    
            // before using encoder, we should open it and update its parameters
            printf("Codec bits per sample %d\n", av_get_bits_per_sample(encoderCodecId));
            AVDictionary *options = nullptr;
            if ((status = encoderContext.open(encoder, &options)) < 0) {
                return status;
            }
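
    For completeness, the corrected ordering against the plain ffmpeg API (without the wrapper classes) looks roughly like this. This is a sketch, assuming oc is the output AVFormatContext, enc is the encoder's AVCodecContext, and encoder is the AVCodec found earlier; CODEC_FLAG_GLOBAL_HEADER is spelled AV_CODEC_FLAG_GLOBAL_HEADER in newer ffmpeg releases.

            // The flag must be set before avcodec_open2() so the encoder puts
            // its SPS/PPS into AVCodecContext.extradata rather than into the
            // bitstream itself.
            if (oc->oformat->flags & AVFMT_GLOBALHEADER) {
                enc->flags |= CODEC_FLAG_GLOBAL_HEADER;
            }

            if ((status = avcodec_open2(enc, encoder, nullptr)) < 0) {
                return status;
            }

            // extradata is populated now; copying it into the stream parameters
            // lets the mov/mp4 muxer write a valid avcC configuration record.
            if ((status = avcodec_parameters_from_context(outputStream->codecpar, enc)) < 0) {
                return status;
            }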