I would like to detect features in audio files such as note positions and pitch. Essentia.js is a library which can do this. My code is:
const Essentia = require('essentia.js');
const fs = require('fs');
const glob = require('glob');
const path = require('path');
const wav = require('node-wav');
const essentia = new Essentia.Essentia(Essentia.EssentiaWASM);
// glob patterns must use forward slashes on every platform, so the pattern is
// built with path.posix instead of the OS-specific separator.
const audioDir = path.posix.join('test', 'audio', '**', '*.wav');
const audioPaths = glob.globSync(audioDir);
const results = [];

/**
 * Copy the contents of an Essentia vector into a Float32Array.
 *
 * NOTE(review): vectorToArray appears to reject empty vectors - confirm.
 * Guarding on size() keeps a file with no detected notes from aborting the run.
 *
 * @param {object} vector - Essentia vector exposing size().
 * @returns {Float32Array} A copy of the vector's values (empty if the vector is empty).
 */
function copyVector(vector) {
  return vector.size() === 0 ? new Float32Array(0) : essentia.vectorToArray(vector);
}

/**
 * Free every WASM-backed value held by an algorithm's output object.
 * These handles are not garbage collected, so they must be deleted explicitly.
 *
 * @param {object} outputs - Object returned by an Essentia algorithm call.
 */
function releaseVectors(outputs) {
  for (const value of Object.values(outputs)) {
    if (value && typeof value.delete === 'function') {
      value.delete();
    }
  }
}

// Loop through each file in the folder and detect audio features.
for (const audioPath of audioPaths) {
  console.log(`Analyzing ${audioPath}`);
  const fileBuffer = fs.readFileSync(audioPath);
  const audioBuffer = wav.decode(fileBuffer);
  // Only the first channel is analyzed.
  // NOTE(review): both algorithms run with their default sample rate here;
  // confirm it matches audioBuffer.sampleRate for the files being analyzed.
  const audioVector = essentia.arrayToVector(audioBuffer.channelData[0]);
  const melodia = essentia.PredominantPitchMelodia(audioVector);
  const segments = essentia.PitchContourSegmentation(melodia.pitch, audioVector);

  // The inputs are no longer needed once segmentation has run.
  audioVector.delete();
  releaseVectors(melodia);

  results.push({
    audioPath,
    durations: copyVector(segments.duration),
    onsets: copyVector(segments.onset),
    pitches: copyVector(segments.MIDIpitch),
  });
  // The values were copied into typed arrays above, so the vectors can go.
  releaseVectors(segments);
}
// Print one attribute at a time across all files, so the values of the same
// attribute for different files appear next to each other in the output.
for (const attribute of ['durations', 'onsets', 'pitches']) {
  for (const result of results) {
    console.log(attribute, result.audioPath, result[attribute]);
  }
}
I'm running this code using these test files: https://github.com/kmturley/sfz-tools-core/tree/main/test/audio
However, the results are inconsistent: a different number of notes is detected from file to file, even though the audio files all contain the same number of notes. For example:
Durations
durations test/audio/velocity-sin.wav Float32Array(10) [
0.5224489569664001,
0.5369614362716675,
0.5369614362716675,
0.5369614362716675,
0.5369614362716675,
0.10158730298280716,
0.1160997748374939,
0.14222222566604614,
0.09868481010198593,
0.10739228874444962
]
durations test/audio/velocity-saw.wav Float32Array(7) [
0.5195465087890625,
0.528253972530365,
0.528253972530365,
0.528253972530365,
0.528253972530365,
0.12480725347995758,
0.15673469007015228
]
durations test/audio/velocity-piano.wav Float32Array(3) [
2.983764171600342,
2.002721071243286,
3.0040817260742188
]
Onsets
onsets test/audio/scale-square.wav Float32Array(12) [
0,
0.9839455485343933,
1.9824036359786987,
2.9808616638183594,
3.9851248264312744,
4.983582973480225,
5.604716777801514,
5.979138374328613,
6.65541934967041,
6.759909152984619,
6.977596282958984,
7.7380499839782715
]
onsets test/audio/scale-sin.wav Float32Array(11) [
0,
0.9839455485343933,
1.9824036359786987,
2.977959156036377,
3.982222318649292,
4.980680465698242,
5.784671306610107,
5.979138374328613,
6.977596282958984,
7.5290703773498535,
7.679999828338623
]
onsets test/audio/scale-saw.wav Float32Array(10) [
0,
0.9868480563163757,
1.9853061437606812,
2.983764171600342,
3.558458089828491,
3.689070224761963,
3.9851248264312744,
4.97777795791626,
5.982040882110596,
6.980498790740967
]
Pitches
pitches test/audio/scale2-sin.wav Float32Array(21) [
60, 61, 62, 63, 64, 65, 66,
67, 68, 69, 70, 71, 86, 72,
92, 90, 91, 88, 93, 93, 90
]
pitches test/audio/scale2-saw.wav Float32Array(19) [
60, 61, 62, 63, 64, 65, 66,
67, 68, 69, 70, 71, 72, 92,
91, 89, 92, 88, 92
]
pitches test/audio/scale2-piano.wav Float32Array(11) [
60, 61, 62, 63, 64,
65, 66, 67, 68, 69,
70
]
What is going on? Is this a bug or an issue with my implementation?
This demo has a more effective solution:
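It uses two additional modules, PolarFFT and Onsets, which are imported below from ./lib/polarFFT.module.js and ./lib/onsets.module.js.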
A simple demo:
import * as wav from 'node-wav';
import { readFileSync } from 'fs';
import { Essentia, EssentiaWASM } from 'essentia.js';
import PolarFFTWASM from './lib/polarFFT.module.js';
import OnsetsWASM from './lib/onsets.module.js';
/**
 * Detect note onset positions in the first channel of a decoded audio buffer.
 *
 * Pipeline: split the signal into frames, window each frame, compute its polar
 * FFT, derive one onset detection function (ODF) per configured method, then
 * let the Onsets module combine the weighted ODFs into onset positions.
 *
 * @param {{sampleRate: number, channelData: Float32Array[]}} buffer - Decoded
 *   audio; only channelData[0] is analyzed.
 * @param {object} [options] - Overrides for the analysis defaults.
 * @param {number} [options.frameSize=1024] - Frame length in samples.
 * @param {number} [options.hopSize=512] - Step between frames in samples.
 * @param {string[]} [options.odfs=['hfc', 'complex']] - ODF method names.
 * @param {number[]} [options.odfsWeights=[0.5, 0.5]] - One weight per ODF method.
 * @param {number} [options.sensitivity=0.65] - Passed to Onsets as alpha = 1 - sensitivity.
 * @returns {Float32Array} Onset positions reported by the Onsets module
 *   (presumably in seconds - confirm); empty when nothing is detected.
 * @throws {RangeError} If odfs and odfsWeights differ in length.
 */
function analyzeOnsets(buffer, options = {}) {
  const params = {
    frameSize: 1024,
    hopSize: 512,
    odfs: ['hfc', 'complex'],
    odfsWeights: [0.5, 0.5],
    sensitivity: 0.65,
    ...options,
  };
  if (params.odfs.length !== params.odfsWeights.length) {
    throw new RangeError(
      `odfs (${params.odfs.length}) and odfsWeights (${params.odfsWeights.length}) must have the same length`,
    );
  }

  const signal = buffer.channelData[0];
  if (!signal || signal.length === 0) {
    // Nothing to analyze, so report "no onsets" instead of calling into WASM.
    return new Float32Array(0);
  }

  // Calculate polar frames.
  const polarFrames = [];
  const polarFFT = new PolarFFTWASM.PolarFFT(params.frameSize);
  let frames;
  try {
    frames = essentia.FrameGenerator(signal, params.frameSize, params.hopSize);
    for (let i = 0; i < frames.size(); i++) {
      const windowed = essentia.Windowing(frames.get(i)).frame;
      try {
        polarFrames.push(polarFFT.compute(essentia.vectorToArray(windowed)));
      } finally {
        // The windowed frame is only needed for this one FFT.
        windowed.delete();
      }
    }
  } finally {
    // Release the WASM-side resources even if a frame fails to process.
    if (frames) {
      frames.delete();
    }
    polarFFT.shutdown();
  }

  // Calculate onsets.
  const alpha = 1 - params.sensitivity;
  // NOTE(review): 5 and 0.02 look like the delay and silence threshold of the
  // Onsets algorithm - confirm against the onsets module.
  const onsets = new OnsetsWASM.Onsets(alpha, 5, buffer.sampleRate / params.hopSize, 0.02);
  let onsetPositions;
  try {
    const odfMatrix = params.odfs.map((method) => {
      const odfValues = polarFrames.map((frame) => {
        // Round-trip through a typed array to obtain vectors owned by `essentia`.
        const magnitude = essentia.arrayToVector(essentia.vectorToArray(frame.magnitude));
        const phase = essentia.arrayToVector(essentia.vectorToArray(frame.phase));
        try {
          return essentia.OnsetDetection(magnitude, phase, method, buffer.sampleRate).onsetDetection;
        } finally {
          magnitude.delete();
          phase.delete();
        }
      });
      return Float32Array.from(odfValues);
    });
    onsetPositions = onsets.compute(odfMatrix, params.odfsWeights).positions;
  } finally {
    onsets.shutdown();
  }

  // An empty result is returned directly rather than passed to vectorToArray.
  if (onsetPositions.size() === 0) {
    return new Float32Array(0);
  }
  return essentia.vectorToArray(onsetPositions);
}
// analyzeOnsets reads a module-level `essentia` instance, so create it from
// the imported Essentia class and WASM backend before running the analysis.
const essentia = new Essentia(EssentiaWASM);
const audioPath = './velocity-saw.wav';
const fileBuffer = readFileSync(audioPath);
const audioBuffer = wav.decode(fileBuffer);
// Print the detected onsets; without this the result would be discarded.
const onsets = analyzeOnsets(audioBuffer);
console.log('onsets', audioPath, onsets);
I also had to modify the last lines of both modules to make them compatible with CommonJS:
./lib/onsets.module.js
// Original ES-module named export, disabled in favor of the assignment below:
// export { Module as OnsetsWASM };
// Expose the module object as the `default` property of the CommonJS exports.
exports.default = Module;
./lib/polarFFT.module.js
// Original ES-module named export, disabled in favor of the assignment below:
// export { Module as PolarFFTWASM };
// Expose the module object as the `default` property of the CommonJS exports.
exports.default = Module;
When running this I get more consistent results:
onsets test/audio/velocity-triangle.wav Float32Array(8) [
0.01160997711122036,
0.9984580278396606,
1.9969160556793213,
2.9953742027282715,
4.005442142486572,
5.003900051116943,
6.013968467712402,
7.999274253845215
]
onsets test/audio/velocity-square.wav Float32Array(8) [
0.01160997711122036,
0.9984580278396606,
1.9969160556793213,
2.9953742027282715,
4.005442142486572,
5.003900051116943,
6.002358436584473,
7.999274253845215
]
onsets test/audio/velocity-sin.wav Float32Array(8) [
0.01160997711122036,
0.9984580278396606,
1.9969160556793213,
2.9953742027282715,
4.005442142486572,
5.003900051116943,
6.025578022003174,
7.999274253845215
]
onsets test/audio/velocity-saw.wav Float32Array(8) [
0.01160997711122036,
0.9984580278396606,
1.9969160556793213,
2.9953742027282715,
4.005442142486572,
5.003900051116943,
6.013968467712402,
7.999274253845215
]
onsets test/audio/velocity-piano.wav Float32Array(8) [
0.01160997711122036,
1.0100680589675903,
2.008526086807251,
3.006984233856201,
4.017052173614502,
5.027120113372803,
6.0371880531311035,
7.058866024017334
]
onsets test/audio/scale2-triangle.wav Float32Array(14) [
0.01160997711122036,
0.4992290139198303,
0.9984580278396606,
1.4976871013641357,
1.9969160556793213,
2.496145009994507,
2.9953742027282715,
3.494603157043457,
3.9938321113586426,
4.493061065673828,
4.992290019989014,
5.491519451141357,
5.990748405456543,
7.999274253845215
]
onsets test/audio/scale2-square.wav Float32Array(14) [
0.01160997711122036,
0.4992290139198303,
0.9984580278396606,
1.4976871013641357,
1.9969160556793213,
2.496145009994507,
2.9953742027282715,
3.494603157043457,
3.9938321113586426,
4.493061065673828,
4.992290019989014,
5.491519451141357,
5.990748405456543,
7.999274253845215
]
onsets test/audio/scale2-sin.wav Float32Array(14) [
0.01160997711122036,
0.4992290139198303,
0.9984580278396606,
1.4976871013641357,
1.9969160556793213,
2.496145009994507,
2.9953742027282715,
3.494603157043457,
3.9938321113586426,
4.493061065673828,
4.992290019989014,
5.491519451141357,
5.990748405456543,
7.999274253845215
]
onsets test/audio/scale2-saw.wav Float32Array(14) [
0.01160997711122036,
0.4992290139198303,
0.9984580278396606,
1.4976871013641357,
1.9969160556793213,
2.496145009994507,
2.9953742027282715,
3.494603157043457,
3.9938321113586426,
4.493061065673828,
4.992290019989014,
5.491519451141357,
5.990748405456543,
7.999274253845215
]
onsets test/audio/scale2-piano.wav Float32Array(13) [
0.01160997711122036,
0.4992290139198303,
0.9984580278396606,
1.4976871013641357,
1.9969160556793213,
2.496145009994507,
2.9953742027282715,
3.5062131881713867,
4.005442142486572,
4.504671096801758,
5.003900051116943,
5.503129482269287,
6.002358436584473
]
onsets test/audio/scale-triangle.wav Float32Array(9) [
0.01160997711122036,
0.9984580278396606,
1.9969160556793213,
2.9953742027282715,
3.9938321113586426,
4.992290019989014,
5.990748405456543,
6.989206314086914,
7.999274253845215
]
onsets test/audio/scale-square.wav Float32Array(9) [
0.01160997711122036,
0.9984580278396606,
1.9969160556793213,
2.9953742027282715,
3.9938321113586426,
4.992290019989014,
5.990748405456543,
6.989206314086914,
7.999274253845215
]
onsets test/audio/scale-sin.wav Float32Array(9) [
0.01160997711122036,
0.9984580278396606,
1.9969160556793213,
2.9953742027282715,
3.9938321113586426,
4.992290019989014,
5.990748405456543,
6.989206314086914,
7.999274253845215
]
onsets test/audio/scale-saw.wav Float32Array(9) [
0.01160997711122036,
0.9984580278396606,
1.9969160556793213,
2.9953742027282715,
3.9938321113586426,
4.992290019989014,
5.990748405456543,
6.989206314086914,
7.999274253845215
]
onsets test/audio/scale-piano.wav Float32Array(8) [
0.01160997711122036,
0.9984580278396606,
1.9969160556793213,
3.006984233856201,
4.005442142486572,
5.003900051116943,
6.002358436584473,
7.000816345214844
]