javascript audio webassembly audio-analysis

essentia.js inconsistent results detecting features in test audio files

I would like to detect features in audio files such as note positions and pitch. Essentia.js is a library which can do this. My code is:

const Essentia = require('essentia.js');
const fs = require('fs');
const glob = require('glob');
const path = require('path');
const wav = require('node-wav');

// Essentia instance backed by the bundled WASM build, shared by every file.
const essentia = new Essentia.Essentia(Essentia.EssentiaWASM);
// Glob pattern matching every WAV file below test/audio.
const audioDir = path.join('test', 'audio', '**', '*.wav');
const audioPaths = glob.globSync(audioDir);
const results = [];

// Analyse each file in turn: decode it, estimate the predominant pitch of the
// first channel, then segment that pitch contour into notes.
for (const audioPath of audioPaths) {
  console.log(`Analyzing ${audioPath}`);
  const { channelData } = wav.decode(fs.readFileSync(audioPath));
  const signal = essentia.arrayToVector(channelData[0]);
  const { pitch } = essentia.PredominantPitchMelodia(signal);
  const notes = essentia.PitchContourSegmentation(pitch, signal);
  const durations = essentia.vectorToArray(notes.duration);
  const onsets = essentia.vectorToArray(notes.onset);
  const pitches = essentia.vectorToArray(notes.MIDIpitch);
  results.push({ audioPath, durations, onsets, pitches });
}

// Print one attribute at a time across all files so that the same attribute
// can be compared side by side.
for (const attribute of ['durations', 'onsets', 'pitches']) {
  for (const result of results) {
    console.log(attribute, result.audioPath, result[attribute]);
  }
}

I'm running this code using these test files: https://github.com/kmturley/sfz-tools-core/tree/main/test/audio

However, I am getting inconsistent results: different numbers of notes are detected, even though the audio files contain the same number of notes:

Durations

durations test/audio/velocity-sin.wav Float32Array(10) [
  0.5224489569664001,
  0.5369614362716675,
  0.5369614362716675,
  0.5369614362716675,
  0.5369614362716675,
  0.10158730298280716,
  0.1160997748374939,
  0.14222222566604614,
  0.09868481010198593,
  0.10739228874444962
]
durations test/audio/velocity-saw.wav Float32Array(7) [
  0.5195465087890625,
  0.528253972530365,
  0.528253972530365,
  0.528253972530365,
  0.528253972530365,
  0.12480725347995758,
  0.15673469007015228
]
durations test/audio/velocity-piano.wav Float32Array(3) [
  2.983764171600342,
  2.002721071243286,
  3.0040817260742188
]

Onsets

onsets test/audio/scale-square.wav Float32Array(12) [
  0,
  0.9839455485343933,
  1.9824036359786987,
  2.9808616638183594,
  3.9851248264312744,
  4.983582973480225,
  5.604716777801514,
  5.979138374328613,
  6.65541934967041,
  6.759909152984619,
  6.977596282958984,
  7.7380499839782715
]
onsets test/audio/scale-sin.wav Float32Array(11) [
  0,
  0.9839455485343933,
  1.9824036359786987,
  2.977959156036377,
  3.982222318649292,
  4.980680465698242,
  5.784671306610107,
  5.979138374328613,
  6.977596282958984,
  7.5290703773498535,
  7.679999828338623
]
onsets test/audio/scale-saw.wav Float32Array(10) [
  0,
  0.9868480563163757,
  1.9853061437606812,
  2.983764171600342,
  3.558458089828491,
  3.689070224761963,
  3.9851248264312744,
  4.97777795791626,
  5.982040882110596,
  6.980498790740967
]

Pitches:

pitches test/audio/scale2-sin.wav Float32Array(21) [
  60, 61, 62, 63, 64, 65, 66,
  67, 68, 69, 70, 71, 86, 72,
  92, 90, 91, 88, 93, 93, 90
]
pitches test/audio/scale2-saw.wav Float32Array(19) [
  60, 61, 62, 63, 64, 65, 66,
  67, 68, 69, 70, 71, 72, 92,
  91, 89, 92, 88, 92
]
pitches test/audio/scale2-piano.wav Float32Array(11) [
  60, 61, 62, 63, 64,
  65, 66, 67, 68, 69,
  70
]

What is going on? Is this a bug or an issue with my implementation?

Solution

The onset-detection demo from the essentia.js project has a more effective solution:

It uses two additional modules, `polarFFT.module.js` and `onsets.module.js`:

A simple demo:

import * as wav from 'node-wav';
import { readFileSync } from 'fs';
import { Essentia, EssentiaWASM } from 'essentia.js';
import PolarFFTWASM from './lib/polarFFT.module.js';
import OnsetsWASM from './lib/onsets.module.js';

/**
 * Detect note onsets in a decoded audio buffer.
 *
 * The first channel is cut into overlapping frames; every frame is windowed
 * and passed through PolarFFT. For each method listed in `params.odfs` one
 * onset detection function (ODF) value is computed per frame, and the
 * weighted ODFs are then handed to the Onsets algorithm.
 *
 * Relies on the module-level `essentia` instance and on the `PolarFFTWASM`
 * and `OnsetsWASM` modules being in scope.
 *
 * @param {{channelData: Float32Array[], sampleRate: number}} buffer - Decoded
 *   audio, e.g. the value returned by `wav.decode()`. Only `channelData[0]`
 *   is analysed.
 * @returns {Float32Array} Detected onset positions (presumably in seconds —
 *   confirm against the Onsets algorithm documentation); an empty array when
 *   no onset is found.
 */
function analyzeOnsets(buffer) {
  const params = {
    frameSize: 1024,
    hopSize: 512,
    odfs: ['hfc', 'complex'],
    odfsWeights: [0.5, 0.5],
    sensitivity: 0.65,
  };

  // Calculate polar (magnitude/phase) frames.
  const polarFrames = [];
  const polarFFT = new PolarFFTWASM.PolarFFT(params.frameSize);
  const frames = essentia.FrameGenerator(buffer.channelData[0], params.frameSize, params.hopSize);
  try {
    for (let i = 0; i < frames.size(); i++) {
      const windowed = essentia.Windowing(frames.get(i)).frame;
      polarFrames.push(polarFFT.compute(essentia.vectorToArray(windowed)));
    }
  } finally {
    // Release the WASM-side objects even when a frame fails to process.
    frames.delete();
    polarFFT.shutdown();
  }

  // Calculate onsets.
  const alpha = 1 - params.sensitivity;
  // NOTE(review): the remaining arguments look like Essentia's Onsets
  // parameters (delay, frameRate, silenceThreshold) — confirm.
  const onsetDetector = new OnsetsWASM.Onsets(alpha, 5, buffer.sampleRate / params.hopSize, 0.02);
  try {
    const odfMatrix = params.odfs.map((method) => {
      const odfValues = polarFrames.map((frame) => {
        // The PolarFFT output is round-tripped through a JS array to build
        // vectors with the `essentia` instance (presumably because PolarFFT
        // lives in a separate WASM module — confirm). These temporaries are
        // created once per frame and per method, so free them right away.
        const magnitude = essentia.arrayToVector(essentia.vectorToArray(frame.magnitude));
        const phase = essentia.arrayToVector(essentia.vectorToArray(frame.phase));
        try {
          return essentia.OnsetDetection(magnitude, phase, method, buffer.sampleRate).onsetDetection;
        } finally {
          magnitude.delete();
          phase.delete();
        }
      });
      return Float32Array.from(odfValues);
    });

    const onsetPositions = onsetDetector.compute(odfMatrix, params.odfsWeights).positions;
    // An empty result is returned directly instead of being converted.
    if (onsetPositions.size() === 0) {
      return new Float32Array(0);
    }
    return essentia.vectorToArray(onsetPositions);
  } finally {
    onsetDetector.shutdown();
  }
}

// analyzeOnsets() reads the module-level `essentia` instance, so it has to be
// created before the function is called.
const essentia = new Essentia(EssentiaWASM);

// Decode one test file and print the onsets detected in it.
const audioPath = './velocity-saw.wav';
const fileBuffer = readFileSync(audioPath);
const audioBuffer = wav.decode(fileBuffer);
const onsets = analyzeOnsets(audioBuffer);
console.log('onsets', audioPath, onsets);

I also had to modify the last lines of both modules to make them compatible with CommonJS:

./lib/onsets.module.js

// Original ES-module export, disabled so the file can be loaded as CommonJS:
// export { Module as OnsetsWASM };
// Expose the module object as the CommonJS `default` export instead.
exports.default = Module;

./lib/polarFFT.module.js

// Original ES-module export, disabled so the file can be loaded as CommonJS:
// export { Module as PolarFFTWASM };
// Expose the module object as the CommonJS `default` export instead.
exports.default = Module;

When running this I get more consistent results:

onsets test/audio/velocity-triangle.wav Float32Array(8) [
  0.01160997711122036,
  0.9984580278396606,
  1.9969160556793213,
  2.9953742027282715,
  4.005442142486572,
  5.003900051116943,
  6.013968467712402,
  7.999274253845215
]
onsets test/audio/velocity-square.wav Float32Array(8) [
  0.01160997711122036,
  0.9984580278396606,
  1.9969160556793213,
  2.9953742027282715,
  4.005442142486572,
  5.003900051116943,
  6.002358436584473,
  7.999274253845215
]
onsets test/audio/velocity-sin.wav Float32Array(8) [
  0.01160997711122036,
  0.9984580278396606,
  1.9969160556793213,
  2.9953742027282715,
  4.005442142486572,
  5.003900051116943,
  6.025578022003174,
  7.999274253845215
]
onsets test/audio/velocity-saw.wav Float32Array(8) [
  0.01160997711122036,
  0.9984580278396606,
  1.9969160556793213,
  2.9953742027282715,
  4.005442142486572,
  5.003900051116943,
  6.013968467712402,
  7.999274253845215
]
onsets test/audio/velocity-piano.wav Float32Array(8) [
  0.01160997711122036,
  1.0100680589675903,
  2.008526086807251,
  3.006984233856201,
  4.017052173614502,
  5.027120113372803,
  6.0371880531311035,
  7.058866024017334
]
onsets test/audio/scale2-triangle.wav Float32Array(14) [
  0.01160997711122036,
  0.4992290139198303,
  0.9984580278396606,
  1.4976871013641357,
  1.9969160556793213,
  2.496145009994507,
  2.9953742027282715,
  3.494603157043457,
  3.9938321113586426,
  4.493061065673828,
  4.992290019989014,
  5.491519451141357,
  5.990748405456543,
  7.999274253845215
]
onsets test/audio/scale2-square.wav Float32Array(14) [
  0.01160997711122036,
  0.4992290139198303,
  0.9984580278396606,
  1.4976871013641357,
  1.9969160556793213,
  2.496145009994507,
  2.9953742027282715,
  3.494603157043457,
  3.9938321113586426,
  4.493061065673828,
  4.992290019989014,
  5.491519451141357,
  5.990748405456543,
  7.999274253845215
]
onsets test/audio/scale2-sin.wav Float32Array(14) [
  0.01160997711122036,
  0.4992290139198303,
  0.9984580278396606,
  1.4976871013641357,
  1.9969160556793213,
  2.496145009994507,
  2.9953742027282715,
  3.494603157043457,
  3.9938321113586426,
  4.493061065673828,
  4.992290019989014,
  5.491519451141357,
  5.990748405456543,
  7.999274253845215
]
onsets test/audio/scale2-saw.wav Float32Array(14) [
  0.01160997711122036,
  0.4992290139198303,
  0.9984580278396606,
  1.4976871013641357,
  1.9969160556793213,
  2.496145009994507,
  2.9953742027282715,
  3.494603157043457,
  3.9938321113586426,
  4.493061065673828,
  4.992290019989014,
  5.491519451141357,
  5.990748405456543,
  7.999274253845215
]
onsets test/audio/scale2-piano.wav Float32Array(13) [
  0.01160997711122036,
  0.4992290139198303,
  0.9984580278396606,
  1.4976871013641357,
  1.9969160556793213,
  2.496145009994507,
  2.9953742027282715,
  3.5062131881713867,
  4.005442142486572,
  4.504671096801758,
  5.003900051116943,
  5.503129482269287,
  6.002358436584473
]
onsets test/audio/scale-triangle.wav Float32Array(9) [
  0.01160997711122036,
  0.9984580278396606,
  1.9969160556793213,
  2.9953742027282715,
  3.9938321113586426,
  4.992290019989014,
  5.990748405456543,
  6.989206314086914,
  7.999274253845215
]
onsets test/audio/scale-square.wav Float32Array(9) [
  0.01160997711122036,
  0.9984580278396606,
  1.9969160556793213,
  2.9953742027282715,
  3.9938321113586426,
  4.992290019989014,
  5.990748405456543,
  6.989206314086914,
  7.999274253845215
]
onsets test/audio/scale-sin.wav Float32Array(9) [
  0.01160997711122036,
  0.9984580278396606,
  1.9969160556793213,
  2.9953742027282715,
  3.9938321113586426,
  4.992290019989014,
  5.990748405456543,
  6.989206314086914,
  7.999274253845215
]
onsets test/audio/scale-saw.wav Float32Array(9) [
  0.01160997711122036,
  0.9984580278396606,
  1.9969160556793213,
  2.9953742027282715,
  3.9938321113586426,
  4.992290019989014,
  5.990748405456543,
  6.989206314086914,
  7.999274253845215
]
onsets test/audio/scale-piano.wav Float32Array(8) [
  0.01160997711122036,
  0.9984580278396606,
  1.9969160556793213,
  3.006984233856201,
  4.005442142486572,
  5.003900051116943,
  6.002358436584473,
  7.000816345214844
]