Search code examples
c++ · ffmpeg · cpython

Extracting Audio Samples using libavcodec


I'm confused about how to extract double values from the data in an AVFrame while extracting frames. I have examined the source behind the av module, written in CPython — especially AudioFrame — to try and understand where it decodes samples from: https://github.com/PyAV-Org/PyAV/blob/main/av/audio/frame.pyx. The Python script below works fine, but the C++ program at the bottom produces nonsense.

import av  

input_file = 'C:\\path\\to\\sound.mp3'
container = av.open(input_file)

# Find the first audio stream in the container.
audio_stream = None
for stream in container.streams:
    if stream.type == 'audio':
        audio_stream = stream
        break

if audio_stream is None:
    print("No audio streams detected")
    container.close()
    exit(1)

# Access audio samples.
# NOTE: frame.to_ndarray() returns one row per channel for planar formats;
# this indexing assumes at least two channels (stereo) — TODO confirm input.
sampleListL = []
sampleListR = []
for packet in container.demux(audio_stream):
    for frame in packet.decode():
        # BUGFIX: a leftover debug `exit(0)` here terminated the script
        # before any sample was collected; it has been removed.
        samples = frame.to_ndarray()  # convert once, not once per channel
        sampleListL += samples[0].tolist()
        sampleListR += samples[1].tolist()

print("Channel L,Channel R")
for s in zip(sampleListL, sampleListR):
    print(str.format("{},{}",s[0],s[1]))

# Cleanup
container.close()



// avcodec-simple-frame-extraction.cpp : Defines the entry point for the application.

#include "avcodec-simple-frame-extraction.h"
#include <array>
#include <limits>

using namespace std;

using AudioFrameSample = std::array<double, 8>;

static int ReadFramesForAudioFile(const char* filepath)
{
    AVCodecContext* avCodecCtx = avcodec_alloc_context3(nullptr);
    AVFormatContext* avFmtCtx = avformat_alloc_context();
    AVStream* avFirstAudioStream = nullptr;
    
    AVCodecParameters* avCodecParams = nullptr;
    if (avformat_open_input(&avFmtCtx, filepath, nullptr, nullptr) != 0) {
        fprintf(stderr, "Couldn't open file with avformat_open_input\n");
        return 1;
    }
    if (avformat_find_stream_info(avFmtCtx, nullptr) < 0) {
        fprintf(stderr, "Couldn't get stream info with avformat_find_stream_info\n");
        return 1;
    }
    avCodecCtx->request_sample_fmt = AV_SAMPLE_FMT_DBL;
    //find the index of the first audio stream
    int streamIndex = -1;
    for (int si = 0; si < avFmtCtx->nb_streams; si++) {
        if (avFmtCtx->streams[si]->codecpar->codec_type == AVMEDIA_TYPE_AUDIO) {
            avCodecParams = avFmtCtx->streams[si]->codecpar;
            streamIndex = si;
            break;
        }
    }
    //find decoder codec for local stream
    const AVCodec* avCodec = avcodec_find_decoder(avCodecParams->codec_id);
    avCodecCtx = avcodec_alloc_context3(avCodec);
    avcodec_parameters_to_context(avCodecCtx, avCodecParams);
    avcodec_open2(avCodecCtx, avCodec, nullptr);
    if (streamIndex == -1) {
        fprintf(stderr, "No audio streams detected\n");
    }
    avFirstAudioStream = avFmtCtx->streams[streamIndex];
    //no resampling, just use the same sampling rate as the original codec
    //prepare reading data
    AVPacket* avPacket = av_packet_alloc();
    AVFrame* avFrame = av_frame_alloc();
    if (avFrame == nullptr) {
        fprintf(stderr, "Error allocating the frame\n");
        return 1;
    }
    std::vector<AudioFrameSample> mySamples;
    while (av_read_frame(avFmtCtx, avPacket) >= 0) {
        avcodec_send_packet(avCodecCtx, avPacket);
        avcodec_receive_frame(avCodecCtx, avFrame);
        //TODO: Study and use decoding technique in PyAV:
            //https://github.com/PyAV-Org/PyAV/blob/main/av/audio/frame.pyx

        for (int ch = 0; ch < avFrame->ch_layout.nb_channels; ch++) {
            for (int s = 0; s < avFrame->linesize[0]; s+=sizeof(double)) {
                double value;
                memcpy(&value, &avFrame->data[ch][s], sizeof(double));
                mySamples.push_back(AudioFrameSample{ value });
            }
        }
    }
    for (auto& f : mySamples) {
        for (int ch = 0; ch < 6; ch++) {
            printf("%0.8f,", f[ch]);
        }
        printf("%0.8f\n", f[7]);
    }
    avcodec_free_context(&avCodecCtx);
    av_packet_free(&avPacket);
    av_frame_free(&avFrame);
    return 0;
}

// Entry point: expects exactly one argument, the path of the audio file.
int main(int argc, char* argv[])
{
    if (argc != 2) {
        fprintf(stderr, "Usage: avcodec-simple-frame-extraction [path]\n");
        return 1;
    }
    ReadFramesForAudioFile(argv[1]);
    return 0;
}

Here's my program in C++, using AVFrame's data directly, trying to convert into double values:

// avcodec-simple-frame-extraction.cpp : Defines the entry point for the application.

#include "avcodec-simple-frame-extraction.h"
#include <array>
#include <limits>

using namespace std;

using AudioFrameSample = std::array<double, 8>;

static int ReadFramesForAudioFile(const char* filepath)
{
    AVCodecContext* avCodecCtx = avcodec_alloc_context3(nullptr);
    AVFormatContext* avFmtCtx = avformat_alloc_context();
    AVStream* avFirstAudioStream = nullptr;
    
    AVCodecParameters* avCodecParams = nullptr;
    if (avformat_open_input(&avFmtCtx, filepath, nullptr, nullptr) != 0) {
        fprintf(stderr, "Couldn't open file with avformat_open_input\n");
        return 1;
    }
    if (avformat_find_stream_info(avFmtCtx, nullptr) < 0) {
        fprintf(stderr, "Couldn't get stream info with avformat_find_stream_info\n");
        return 1;
    }
    avCodecCtx->request_sample_fmt = AV_SAMPLE_FMT_DBL;
    //find the index of the first audio stream
    int streamIndex = -1;
    for (int si = 0; si < avFmtCtx->nb_streams; si++) {
        if (avFmtCtx->streams[si]->codecpar->codec_type == AVMEDIA_TYPE_AUDIO) {
            avCodecParams = avFmtCtx->streams[si]->codecpar;
            streamIndex = si;
            break;
        }
    }
    //find decoder codec for local stream
    const AVCodec* avCodec = avcodec_find_decoder(avCodecParams->codec_id);
    avCodecCtx = avcodec_alloc_context3(avCodec);
    avcodec_parameters_to_context(avCodecCtx, avCodecParams);
    avcodec_open2(avCodecCtx, avCodec, nullptr);
    if (streamIndex == -1) {
        fprintf(stderr, "No audio streams detected\n");
    }
    avFirstAudioStream = avFmtCtx->streams[streamIndex];
    //no resampling, just use the same sampling rate as the original codec
    //prepare reading data
    AVPacket* avPacket = av_packet_alloc();
    AVFrame* avFrame = av_frame_alloc();
    if (avFrame == nullptr) {
        fprintf(stderr, "Error allocating the frame\n");
        return 1;
    }
    std::vector<AudioFrameSample> mySamples;
    while (av_read_frame(avFmtCtx, avPacket) >= 0) {
        avcodec_send_packet(avCodecCtx, avPacket);
        avcodec_receive_frame(avCodecCtx, avFrame);
        //TODO: Study and use decoding technique in PyAV:
            //https://github.com/PyAV-Org/PyAV/blob/main/av/audio/frame.pyx

        for (int ch = 0; ch < avFrame->ch_layout.nb_channels; ch++) {
            for (int s = 0; s < avFrame->linesize[0]; s+=sizeof(double)) {
                double value;
                memcpy(&value, &avFrame->data[ch][s], sizeof(double));
                mySamples.push_back(AudioFrameSample{ value });
            }
        }
        av_packet_unref(avPacket);
    }
    for (auto& f : mySamples) {
        for (int ch = 0; ch < 6; ch++) {
            printf("%0.8f,", f[ch]);
        }
        printf("%0.8f\n", f[7]);
    }
    avcodec_free_context(&avCodecCtx);
    av_packet_free(&avPacket);
    av_frame_free(&avFrame);
    return 0;
}

// Entry point: expects exactly one argument, the path of the audio file.
int main(int argc, char* argv[])
{
    if (argc != 2) {
        fprintf(stderr, "Usage: avcodec-simple-frame-extraction [path]\n");
        return 1;
    }
    ReadFramesForAudioFile(argv[1]);
    return 0;
}

Solution

  • According to my experiments, avCodecCtx->request_sample_fmt = AV_SAMPLE_FMT_DBL has no effect.
    There are all kinds of references saying that it does not work.

    Using av_get_packed_sample_fmt(avCodecCtx->sample_fmt), we can see that the sample format is AV_SAMPLE_FMT_FLT (float).
    Casting from float to double is not an issue (in case the format is AV_SAMPLE_FMT_S16 for example, scaling is required).


    The loop that stores the samples into mySamples is incorrect.
    There are a few solutions; here is a suggested one (assuming the data is float):

    AudioFrameSample my_sample{0, 0, 0, 0, 0, 0, 0, 0}; //Single sample
    
    for (int s = 0; s < avFrame->linesize[0]; s+=sizeof(float)) {
        for (int ch = 0; ch < avFrame->ch_layout.nb_channels; ch++) { //Channels is the inner loop
            float value = *((float*)(&avFrame->data[ch][s])); //We better use "load operation" than using memcpy.
            my_sample[ch] = value; //Fill all channels
        }
    
        mySamples.push_back(my_sample);
    }
    

    The inner loop iterates the channels.
    After storing all channels in my_sample, we are storing my_sample in mySamples.


    Updated code sample:

    extern "C"
    {
    #include <libavcodec/avcodec.h>
    #include <libavformat/avformat.h>
    #include <libavdevice/avdevice.h>
    #include <libavutil/imgutils.h>
    }
    
    
    #include <array>
    #include <limits>
    #include <vector>
    
    using namespace std;
    
    using AudioFrameSample = std::array<double, 8>;
    
    
    static int ReadFramesForAudioFile(const char* filepath)
    {
        //AVCodecContext* avCodecCtx = avcodec_alloc_context3(nullptr);
        AVFormatContext* avFmtCtx = avformat_alloc_context();
        AVStream* avFirstAudioStream = nullptr;
        
        AVCodecParameters* avCodecParams = nullptr;
        if (avformat_open_input(&avFmtCtx, filepath, nullptr, nullptr) != 0) {
            fprintf(stderr, "Couldn't open file with avformat_open_input\n");
            return 1;
        }
        if (avformat_find_stream_info(avFmtCtx, nullptr) < 0) {
            fprintf(stderr, "Couldn't get stream info with avformat_find_stream_info\n");
            return 1;
        }
    
        //avCodecCtx->request_sample_fmt = AV_SAMPLE_FMT_DBL;
    
        //find the index of the first audio stream
        int streamIndex = -1;
        for (int si = 0; si < (int)avFmtCtx->nb_streams; si++) {
            if (avFmtCtx->streams[si]->codecpar->codec_type == AVMEDIA_TYPE_AUDIO) {
                avCodecParams = avFmtCtx->streams[si]->codecpar;
                streamIndex = si;
                break;
            }
        }
    
    
        //find decoder codec for local stream
        const AVCodec* avCodec = avcodec_find_decoder(avCodecParams->codec_id);
        AVCodecContext* avCodecCtx = avcodec_alloc_context3(avCodec);
        avcodec_parameters_to_context(avCodecCtx, avCodecParams);
    
        avCodecCtx->request_sample_fmt = AV_SAMPLE_FMT_DBL; //<-- Has no effect.
    
        avcodec_open2(avCodecCtx, avCodec, nullptr);
        if (streamIndex == -1) {
            fprintf(stderr, "No audio streams detected\n");
        }
    
        avFirstAudioStream = avFmtCtx->streams[streamIndex];
        //no resampling, just use the same sampling rate as the original codec
        //prepare reading data
        AVPacket* avPacket = av_packet_alloc();
        AVFrame* avFrame = av_frame_alloc();
        if (avFrame == nullptr) {
            fprintf(stderr, "Error allocating the frame\n");
            return 1;
        }
    
        std::vector<AudioFrameSample> mySamples;
    
        while (av_read_frame(avFmtCtx, avPacket) >= 0) {
            avcodec_send_packet(avCodecCtx, avPacket);
            avcodec_receive_frame(avCodecCtx, avFrame);
            //TODO: Study and use decoding technique in PyAV:
                //https://github.com/PyAV-Org/PyAV/blob/main/av/audio/frame.pyx
    
            AVSampleFormat sample_fmt = av_get_packed_sample_fmt(avCodecCtx->sample_fmt);   //AV_SAMPLE_FMT_FLT
    
            AudioFrameSample my_sample{0, 0, 0, 0, 0, 0, 0, 0}; //Single sample
    
            if (sample_fmt == AV_SAMPLE_FMT_DBL)
            {
                for (int s = 0; s < avFrame->linesize[0]; s+=sizeof(double)) {
                    for (int ch = 0; ch < avFrame->ch_layout.nb_channels; ch++) { //Channels is the inner loop
                        double value;
                        memcpy(&value, &avFrame->data[ch][s], sizeof(double));
                        //mySamples.push_back(AudioFrameSample{ (double)value });
                        my_sample[ch] = value; //Fill all channels
                    }
    
                    mySamples.push_back(my_sample);
                }
            }
            else if (sample_fmt == AV_SAMPLE_FMT_FLT)
            {
                for (int s = 0; s < avFrame->linesize[0]; s+=sizeof(float)) {
                    for (int ch = 0; ch < avFrame->ch_layout.nb_channels; ch++) { //Channels is the inner loop
                        //memcpy(&value, &avFrame->data[ch][s], sizeof(float));
                        float value = *((float*)(&avFrame->data[ch][s])); //We better use "load operation" than using memcpy.
                        //mySamples.push_back(AudioFrameSample{ (double)value });
                        my_sample[ch] = value; //Fill all channels
                    }
    
                    mySamples.push_back(my_sample);
                }
            }
            else
            {
                fprintf(stderr, "sample_fmt is not yet supported by the current implementation\n");
                return 1;
            }
    
            av_packet_unref(avPacket);
        }
    
        //Example for playing the raw audio file using FFplay (my input audio file is stereo):
        //ffplay -f f64le -ar 44100 -ac 2 -channel_layout stereo raw_audio.raw
        FILE *ff = fopen("raw_audio.raw", "wb"); //Open binary file for storing the audio samples in raw binary format.
    
        int nb_channels = avFrame->ch_layout.nb_channels;
    
        for (auto& f : mySamples) {
            //for (int ch = 0; ch < 6; ch++) {
            for (int ch = 0; ch < nb_channels-1; ch++) {
                //printf("%0.8f,", f[ch]);S
                fwrite(&f[ch], 1, sizeof(f[ch]), ff);    //Write audio sample to binary file
            }        
            //printf("%0.8f\n", f[nb_channels-1]);  //printf("%0.8f\n", f[7]);        
            fwrite(&f[nb_channels-1], 1, sizeof(f[nb_channels-1]), ff); //Write audio sample to binary file
        }
    
        fclose(ff);
    
        avcodec_free_context(&avCodecCtx);
        av_packet_free(&avPacket);
        av_frame_free(&avFrame);
        return 0;
    }
    
    
    //Entry point: decodes the hard-coded test file.
    int main()
    {
        static const char kInputFile[] = "song.mp3";
        ReadFramesForAudioFile(kInputFile);
        return 0;
    }
    

    The above code sample stores the samples to raw_audio.raw file.

    With my stereo input file, I could play the audio using FFplay (adjust the arguments as needed):

    ffplay -f f64le -ar 44100 -ac 2 -channel_layout stereo raw_audio.raw

    For testing, I suggest you to start with a stereo MP3 input file (with 44100 sample rate).