I'm confused about how to extract double values from the data in an AVFrame. I'm trying to extract audio frames. I have tried to examine the source behind the PyAV module, written in Cython, especially AudioFrame, to understand where it decodes samples from: https://github.com/PyAV-Org/PyAV/blob/main/av/audio/frame.pyx. The Python script below works fine, but the C++ program at the bottom produces nonsense.
import av
input_file = 'C:\\path\\to\\sound.mp3'
container = av.open(input_file)
# Find audio stream
audio_stream = None
for stream in container.streams:
if stream.type == 'audio':
audio_stream = stream
break
if audio_stream is None:
print("No audio streams detected")
container.close()
exit(1)
# Access audio samples
sampleListL = []
sampleListR = []
for packet in container.demux(audio_stream):
for frame in packet.decode():
exit(0)
sampleListL += frame.to_ndarray()[0].tolist()
sampleListR += frame.to_ndarray()[1].tolist()
print("Channel L,Channel R")
for s in zip(sampleListL, sampleListR):
print(str.format("{},{}",s[0],s[1]))
# Cleanup
container.close()
// avcodec-simple-frame-extraction.cpp : Defines the entry point for the application.
#include "avcodec-simple-frame-extraction.h"
#include <array>
#include <limits>
using namespace std;
using AudioFrameSample = std::array<double, 8>;
static int ReadFramesForAudioFile(const char* filepath)
{
AVCodecContext* avCodecCtx = avcodec_alloc_context3(nullptr);
AVFormatContext* avFmtCtx = avformat_alloc_context();
AVStream* avFirstAudioStream = nullptr;
AVCodecParameters* avCodecParams = nullptr;
if (avformat_open_input(&avFmtCtx, filepath, nullptr, nullptr) != 0) {
fprintf(stderr, "Couldn't open file with avformat_open_input\n");
return 1;
}
if (avformat_find_stream_info(avFmtCtx, nullptr) < 0) {
fprintf(stderr, "Couldn't get stream info with avformat_find_stream_info\n");
return 1;
}
avCodecCtx->request_sample_fmt = AV_SAMPLE_FMT_DBL;
//find the index of the first audio stream
int streamIndex = -1;
for (int si = 0; si < avFmtCtx->nb_streams; si++) {
if (avFmtCtx->streams[si]->codecpar->codec_type == AVMEDIA_TYPE_AUDIO) {
avCodecParams = avFmtCtx->streams[si]->codecpar;
streamIndex = si;
break;
}
}
//find decoder codec for local stream
const AVCodec* avCodec = avcodec_find_decoder(avCodecParams->codec_id);
avCodecCtx = avcodec_alloc_context3(avCodec);
avcodec_parameters_to_context(avCodecCtx, avCodecParams);
avcodec_open2(avCodecCtx, avCodec, nullptr);
if (streamIndex == -1) {
fprintf(stderr, "No audio streams detected\n");
}
avFirstAudioStream = avFmtCtx->streams[streamIndex];
//no resampling, just use the same sampling rate as the original codec
//prepare reading data
AVPacket* avPacket = av_packet_alloc();
AVFrame* avFrame = av_frame_alloc();
if (avFrame == nullptr) {
fprintf(stderr, "Error allocating the frame\n");
return 1;
}
std::vector<AudioFrameSample> mySamples;
while (av_read_frame(avFmtCtx, avPacket) >= 0) {
avcodec_send_packet(avCodecCtx, avPacket);
avcodec_receive_frame(avCodecCtx, avFrame);
//TODO: Study and use decoding technique in PyAV:
//https://github.com/PyAV-Org/PyAV/blob/main/av/audio/frame.pyx
for (int ch = 0; ch < avFrame->ch_layout.nb_channels; ch++) {
for (int s = 0; s < avFrame->linesize[0]; s+=sizeof(double)) {
double value;
memcpy(&value, &avFrame->data[ch][s], sizeof(double));
mySamples.push_back(AudioFrameSample{ value });
}
}
}
for (auto& f : mySamples) {
for (int ch = 0; ch < 6; ch++) {
printf("%0.8f,", f[ch]);
}
printf("%0.8f\n", f[7]);
}
avcodec_free_context(&avCodecCtx);
av_packet_free(&avPacket);
av_frame_free(&avFrame);
return 0;
}
int main(int argc, char* argv[])
{
if (argc == 2) {
ReadFramesForAudioFile(argv[1]);
}
else {
fprintf(stderr, "Usage: avcodec-simple-frame-extraction [path]\n");
return 1;
}
return 0;
}
Here's my program in C++, using AVFrame's data directly, trying to convert into double values:
// avcodec-simple-frame-extraction.cpp : Defines the entry point for the application.
#include "avcodec-simple-frame-extraction.h"
#include <array>
#include <limits>
using namespace std;
using AudioFrameSample = std::array<double, 8>;
static int ReadFramesForAudioFile(const char* filepath)
{
AVCodecContext* avCodecCtx = avcodec_alloc_context3(nullptr);
AVFormatContext* avFmtCtx = avformat_alloc_context();
AVStream* avFirstAudioStream = nullptr;
AVCodecParameters* avCodecParams = nullptr;
if (avformat_open_input(&avFmtCtx, filepath, nullptr, nullptr) != 0) {
fprintf(stderr, "Couldn't open file with avformat_open_input\n");
return 1;
}
if (avformat_find_stream_info(avFmtCtx, nullptr) < 0) {
fprintf(stderr, "Couldn't get stream info with avformat_find_stream_info\n");
return 1;
}
avCodecCtx->request_sample_fmt = AV_SAMPLE_FMT_DBL;
//find the index of the first audio stream
int streamIndex = -1;
for (int si = 0; si < avFmtCtx->nb_streams; si++) {
if (avFmtCtx->streams[si]->codecpar->codec_type == AVMEDIA_TYPE_AUDIO) {
avCodecParams = avFmtCtx->streams[si]->codecpar;
streamIndex = si;
break;
}
}
//find decoder codec for local stream
const AVCodec* avCodec = avcodec_find_decoder(avCodecParams->codec_id);
avCodecCtx = avcodec_alloc_context3(avCodec);
avcodec_parameters_to_context(avCodecCtx, avCodecParams);
avcodec_open2(avCodecCtx, avCodec, nullptr);
if (streamIndex == -1) {
fprintf(stderr, "No audio streams detected\n");
}
avFirstAudioStream = avFmtCtx->streams[streamIndex];
//no resampling, just use the same sampling rate as the original codec
//prepare reading data
AVPacket* avPacket = av_packet_alloc();
AVFrame* avFrame = av_frame_alloc();
if (avFrame == nullptr) {
fprintf(stderr, "Error allocating the frame\n");
return 1;
}
std::vector<AudioFrameSample> mySamples;
while (av_read_frame(avFmtCtx, avPacket) >= 0) {
avcodec_send_packet(avCodecCtx, avPacket);
avcodec_receive_frame(avCodecCtx, avFrame);
//TODO: Study and use decoding technique in PyAV:
//https://github.com/PyAV-Org/PyAV/blob/main/av/audio/frame.pyx
for (int ch = 0; ch < avFrame->ch_layout.nb_channels; ch++) {
for (int s = 0; s < avFrame->linesize[0]; s+=sizeof(double)) {
double value;
memcpy(&value, &avFrame->data[ch][s], sizeof(double));
mySamples.push_back(AudioFrameSample{ value });
}
}
av_packet_unref(avPacket);
}
for (auto& f : mySamples) {
for (int ch = 0; ch < 6; ch++) {
printf("%0.8f,", f[ch]);
}
printf("%0.8f\n", f[7]);
}
avcodec_free_context(&avCodecCtx);
av_packet_free(&avPacket);
av_frame_free(&avFrame);
return 0;
}
int main(int argc, char* argv[])
{
if (argc == 2) {
ReadFramesForAudioFile(argv[1]);
}
else {
fprintf(stderr, "Usage: avcodec-simple-frame-extraction [path]\n");
return 1;
}
return 0;
}
According to my experiments, avCodecCtx->request_sample_fmt = AV_SAMPLE_FMT_DBL
has no effect.
There are all kinds of references reporting that it doesn't work...
Using av_get_packed_sample_fmt(avCodecCtx->sample_fmt)
, we can see that the sample format is AV_SAMPLE_FMT_FLT
(float).
Casting from float
to double
is not an issue (in case the format is AV_SAMPLE_FMT_S16
for example, scaling is required).
The loop that stores the samples into mySamples
is incorrect.
There are few solutions, here is a suggested solution (assume the data is float):
AudioFrameSample my_sample{0, 0, 0, 0, 0, 0, 0, 0}; //Single sample
for (int s = 0; s < avFrame->linesize[0]; s+=sizeof(float)) {
for (int ch = 0; ch < avFrame->ch_layout.nb_channels; ch++) { //Channels is the inner loop
float value = *((float*)(&avFrame->data[ch][s])); //We better use "load operation" than using memcpy.
my_sample[ch] = value; //Fill all channels
}
mySamples.push_back(my_sample);
}
The inner loop iterates the channels.
After storing all channels in my_sample
, we are storing my_sample
in mySamples
.
Updated code sample:
extern "C"
{
#include <libavcodec/avcodec.h>
#include <libavdevice/avdevice.h>
#include <libavformat/avformat.h>
#include <libavutil/imgutils.h>
#include <libavutil/samplefmt.h>
}
#include <array>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <limits>
#include <vector>
using namespace std;
using AudioFrameSample = std::array<double, 8>;
static int ReadFramesForAudioFile(const char* filepath)
{
//AVCodecContext* avCodecCtx = avcodec_alloc_context3(nullptr);
AVFormatContext* avFmtCtx = avformat_alloc_context();
AVStream* avFirstAudioStream = nullptr;
AVCodecParameters* avCodecParams = nullptr;
if (avformat_open_input(&avFmtCtx, filepath, nullptr, nullptr) != 0) {
fprintf(stderr, "Couldn't open file with avformat_open_input\n");
return 1;
}
if (avformat_find_stream_info(avFmtCtx, nullptr) < 0) {
fprintf(stderr, "Couldn't get stream info with avformat_find_stream_info\n");
return 1;
}
//avCodecCtx->request_sample_fmt = AV_SAMPLE_FMT_DBL;
//find the index of the first audio stream
int streamIndex = -1;
for (int si = 0; si < (int)avFmtCtx->nb_streams; si++) {
if (avFmtCtx->streams[si]->codecpar->codec_type == AVMEDIA_TYPE_AUDIO) {
avCodecParams = avFmtCtx->streams[si]->codecpar;
streamIndex = si;
break;
}
}
//find decoder codec for local stream
const AVCodec* avCodec = avcodec_find_decoder(avCodecParams->codec_id);
AVCodecContext* avCodecCtx = avcodec_alloc_context3(avCodec);
avcodec_parameters_to_context(avCodecCtx, avCodecParams);
avCodecCtx->request_sample_fmt = AV_SAMPLE_FMT_DBL; //<-- Has no effect.
avcodec_open2(avCodecCtx, avCodec, nullptr);
if (streamIndex == -1) {
fprintf(stderr, "No audio streams detected\n");
}
avFirstAudioStream = avFmtCtx->streams[streamIndex];
//no resampling, just use the same sampling rate as the original codec
//prepare reading data
AVPacket* avPacket = av_packet_alloc();
AVFrame* avFrame = av_frame_alloc();
if (avFrame == nullptr) {
fprintf(stderr, "Error allocating the frame\n");
return 1;
}
std::vector<AudioFrameSample> mySamples;
while (av_read_frame(avFmtCtx, avPacket) >= 0) {
avcodec_send_packet(avCodecCtx, avPacket);
avcodec_receive_frame(avCodecCtx, avFrame);
//TODO: Study and use decoding technique in PyAV:
//https://github.com/PyAV-Org/PyAV/blob/main/av/audio/frame.pyx
AVSampleFormat sample_fmt = av_get_packed_sample_fmt(avCodecCtx->sample_fmt); //AV_SAMPLE_FMT_FLT
AudioFrameSample my_sample{0, 0, 0, 0, 0, 0, 0, 0}; //Single sample
if (sample_fmt == AV_SAMPLE_FMT_DBL)
{
for (int s = 0; s < avFrame->linesize[0]; s+=sizeof(double)) {
for (int ch = 0; ch < avFrame->ch_layout.nb_channels; ch++) { //Channels is the inner loop
double value;
memcpy(&value, &avFrame->data[ch][s], sizeof(double));
//mySamples.push_back(AudioFrameSample{ (double)value });
my_sample[ch] = value; //Fill all channels
}
mySamples.push_back(my_sample);
}
}
else if (sample_fmt == AV_SAMPLE_FMT_FLT)
{
for (int s = 0; s < avFrame->linesize[0]; s+=sizeof(float)) {
for (int ch = 0; ch < avFrame->ch_layout.nb_channels; ch++) { //Channels is the inner loop
//memcpy(&value, &avFrame->data[ch][s], sizeof(float));
float value = *((float*)(&avFrame->data[ch][s])); //We better use "load operation" than using memcpy.
//mySamples.push_back(AudioFrameSample{ (double)value });
my_sample[ch] = value; //Fill all channels
}
mySamples.push_back(my_sample);
}
}
else
{
fprintf(stderr, "sample_fmt is not yet supported by the current implementation\n");
return 1;
}
av_packet_unref(avPacket);
}
//Example for playing the raw audio file using FFplay (my input audio file is stereo):
//ffplay -f f64le -ar 44100 -ac 2 -channel_layout stereo raw_audio.raw
FILE *ff = fopen("raw_audio.raw", "wb"); //Open binary file for storing the audio samples in raw binary format.
int nb_channels = avFrame->ch_layout.nb_channels;
for (auto& f : mySamples) {
//for (int ch = 0; ch < 6; ch++) {
for (int ch = 0; ch < nb_channels-1; ch++) {
//printf("%0.8f,", f[ch]);S
fwrite(&f[ch], 1, sizeof(f[ch]), ff); //Write audio sample to binary file
}
//printf("%0.8f\n", f[nb_channels-1]); //printf("%0.8f\n", f[7]);
fwrite(&f[nb_channels-1], 1, sizeof(f[nb_channels-1]), ff); //Write audio sample to binary file
}
fclose(ff);
avcodec_free_context(&avCodecCtx);
av_packet_free(&avPacket);
av_frame_free(&avFrame);
return 0;
}
int main()
{
const char *input_file = "song.mp3";
ReadFramesForAudioFile(input_file);
return 0;
}
The above code sample stores the samples to raw_audio.raw
file.
With my stereo input file, I could play the audio using FFplay (adjust the arguments as needed):
ffplay -f f64le -ar 44100 -ac 2 -channel_layout stereo raw_audio.raw
For testing, I suggest you to start with a stereo MP3 input file (with 44100 sample rate).