Search code examples
javascriptaudiowebassemblyaudio-analysis

essentia.js inconsistent results detecting features in test audio files


I would like to detect features in audio files such as note positions and pitch. Essentia.js is a library which can do this. My code is:

const Essentia = require('essentia.js');
const fs = require('fs');
const glob = require('glob');
const path = require('path');
const wav = require('node-wav');

const essentia = new Essentia.Essentia(Essentia.EssentiaWASM);
const audioDir = path.join('test', 'audio', '**', '*.wav');
const audioPaths = glob.globSync(audioDir);
const results = [];

// Run pitch-contour segmentation on the first channel of every matching WAV
// file and collect the per-note attributes.
for (const audioPath of audioPaths) {
  console.log(`Analyzing ${audioPath}`);
  const decoded = wav.decode(fs.readFileSync(audioPath));
  const signal = essentia.arrayToVector(decoded.channelData[0]);
  const pitchContour = essentia.PredominantPitchMelodia(signal).pitch;
  const notes = essentia.PitchContourSegmentation(pitchContour, signal);
  const entry = { audioPath };
  entry.durations = essentia.vectorToArray(notes.duration);
  entry.onsets = essentia.vectorToArray(notes.onset);
  entry.pitches = essentia.vectorToArray(notes.MIDIpitch);
  results.push(entry);
}

// Print one attribute at a time across all files so they can be compared
// side by side.
for (const attribute of ['durations', 'onsets', 'pitches']) {
  for (const result of results) {
    console.log(attribute, result.audioPath, result[attribute]);
  }
}

I'm running this code using these test files: https://github.com/kmturley/sfz-tools-core/tree/main/test/audio

However, I am getting inconsistent results: different numbers of notes are detected, even though the audio files contain the same number of notes:

Durations

durations test/audio/velocity-sin.wav Float32Array(10) [
  0.5224489569664001,
  0.5369614362716675,
  0.5369614362716675,
  0.5369614362716675,
  0.5369614362716675,
  0.10158730298280716,
  0.1160997748374939,
  0.14222222566604614,
  0.09868481010198593,
  0.10739228874444962
]
durations test/audio/velocity-saw.wav Float32Array(7) [
  0.5195465087890625,
  0.528253972530365,
  0.528253972530365,
  0.528253972530365,
  0.528253972530365,
  0.12480725347995758,
  0.15673469007015228
]
durations test/audio/velocity-piano.wav Float32Array(3) [
  2.983764171600342,
  2.002721071243286,
  3.0040817260742188
]

Onsets

onsets test/audio/scale-square.wav Float32Array(12) [
  0,
  0.9839455485343933,
  1.9824036359786987,
  2.9808616638183594,
  3.9851248264312744,
  4.983582973480225,
  5.604716777801514,
  5.979138374328613,
  6.65541934967041,
  6.759909152984619,
  6.977596282958984,
  7.7380499839782715
]
onsets test/audio/scale-sin.wav Float32Array(11) [
  0,
  0.9839455485343933,
  1.9824036359786987,
  2.977959156036377,
  3.982222318649292,
  4.980680465698242,
  5.784671306610107,
  5.979138374328613,
  6.977596282958984,
  7.5290703773498535,
  7.679999828338623
]
onsets test/audio/scale-saw.wav Float32Array(10) [
  0,
  0.9868480563163757,
  1.9853061437606812,
  2.983764171600342,
  3.558458089828491,
  3.689070224761963,
  3.9851248264312744,
  4.97777795791626,
  5.982040882110596,
  6.980498790740967
]

Pitches:

pitches test/audio/scale2-sin.wav Float32Array(21) [
  60, 61, 62, 63, 64, 65, 66,
  67, 68, 69, 70, 71, 86, 72,
  92, 90, 91, 88, 93, 93, 90
]
pitches test/audio/scale2-saw.wav Float32Array(19) [
  60, 61, 62, 63, 64, 65, 66,
  67, 68, 69, 70, 71, 72, 92,
  91, 89, 92, 88, 92
]
pitches test/audio/scale2-piano.wav Float32Array(11) [
  60, 61, 62, 63, 64,
  65, 66, 67, 68, 69,
  70
]

What is going on? Is this a bug or an issue with my implementation?


Solution

  • This demo has a more effective solution:

    It uses two additional modules, `polarFFT.module.js` and `onsets.module.js` (imported from `./lib` in the demo below):

    A simple demo:

    import * as wav from 'node-wav';
    import { readFileSync } from 'fs';
    import { Essentia, EssentiaWASM } from 'essentia.js';
    import PolarFFTWASM from './lib/polarFFT.module.js';
    import OnsetsWASM from './lib/onsets.module.js';
    
    /**
     * Detect onsets in decoded audio by combining several onset detection
     * functions (ODFs) computed on windowed polar-FFT frames.
     *
     * Expects an `essentia` instance and the `PolarFFTWASM` / `OnsetsWASM`
     * modules to be available in the enclosing scope.
     *
     * @param {{channelData: Float32Array[], sampleRate: number}} buffer - Decoded
     *   audio; only the first channel is analysed.
     * @returns {Float32Array} Onset positions reported by the Onsets algorithm
     *   (presumably in seconds - confirm), or an empty array when none are found.
     */
    function analyzeOnsets(buffer) {
      const params = {
        frameSize: 1024,
        hopSize: 512,
        odfs: ['hfc', 'complex'],
        odfsWeights: [0.5, 0.5],
        sensitivity: 0.65,
      };
      const signal = buffer.channelData[0];
      const { sampleRate } = buffer;

      // Calculate polar frames.
      const polarFrames = [];
      const polarFFT = new PolarFFTWASM.PolarFFT(params.frameSize);
      const frames = essentia.FrameGenerator(signal, params.frameSize, params.hopSize);
      for (let i = 0; i < frames.size(); i++) {
        const windowed = essentia.Windowing(frames.get(i)).frame;
        polarFrames.push(polarFFT.compute(essentia.vectorToArray(windowed)));
      }
      frames.delete();
      polarFFT.shutdown();

      // Calculate onsets.
      const alpha = 1 - params.sensitivity;
      const onsetsAlgo = new OnsetsWASM.Onsets(alpha, 5, sampleRate / params.hopSize, 0.02);
      const odfMatrix = params.odfs.map((func) => {
        const odfValues = polarFrames.map((frame) => {
          // The polar data is round-tripped through a typed array into fresh
          // vectors (presumably because PolarFFT vectors belong to a different
          // WASM module - confirm). Those copies are owned here and must be
          // released explicitly, otherwise two vectors leak per frame per ODF.
          const magnitude = essentia.arrayToVector(essentia.vectorToArray(frame.magnitude));
          const phase = essentia.arrayToVector(essentia.vectorToArray(frame.phase));
          try {
            return essentia.OnsetDetection(magnitude, phase, func, sampleRate).onsetDetection;
          } finally {
            magnitude.delete();
            phase.delete();
          }
        });
        return Float32Array.from(odfValues);
      });

      const onsetPositions = onsetsAlgo.compute(odfMatrix, params.odfsWeights).positions;
      onsetsAlgo.shutdown();

      // An empty result is returned directly instead of going through
      // vectorToArray (presumably it rejects empty vectors - confirm).
      if (onsetPositions.size() === 0) {
        return new Float32Array(0);
      }
      return essentia.vectorToArray(onsetPositions);
    }
    
    // analyzeOnsets relies on an `essentia` instance in scope, so build it from
    // the imported class and WASM backend before running the analysis.
    const essentia = new Essentia(EssentiaWASM);

    // Decode the sample file and print the detected onsets.
    const fileBuffer = readFileSync('./velocity-saw.wav');
    const audioBuffer = wav.decode(fileBuffer);
    const onsets = analyzeOnsets(audioBuffer);
    console.log('onsets', './velocity-saw.wav', onsets);
    

    I also had to modify the last lines of both modules to make them compatible with CommonJS:

    ./lib/onsets.module.js

    // Original ES module export, commented out for CommonJS compatibility:
    // export { Module as OnsetsWASM };
    // Expose the module object as the `default` property of the CommonJS exports.
    exports.default = Module;
    

    ./lib/polarFFT.module.js

    // Original ES module export, commented out for CommonJS compatibility:
    // export { Module as PolarFFTWASM };
    // Expose the module object as the `default` property of the CommonJS exports.
    exports.default = Module;
    

    When running this I get more consistent results:

    onsets test/audio/velocity-triangle.wav Float32Array(8) [
      0.01160997711122036,
      0.9984580278396606,
      1.9969160556793213,
      2.9953742027282715,
      4.005442142486572,
      5.003900051116943,
      6.013968467712402,
      7.999274253845215
    ]
    onsets test/audio/velocity-square.wav Float32Array(8) [
      0.01160997711122036,
      0.9984580278396606,
      1.9969160556793213,
      2.9953742027282715,
      4.005442142486572,
      5.003900051116943,
      6.002358436584473,
      7.999274253845215
    ]
    onsets test/audio/velocity-sin.wav Float32Array(8) [
      0.01160997711122036,
      0.9984580278396606,
      1.9969160556793213,
      2.9953742027282715,
      4.005442142486572,
      5.003900051116943,
      6.025578022003174,
      7.999274253845215
    ]
    onsets test/audio/velocity-saw.wav Float32Array(8) [
      0.01160997711122036,
      0.9984580278396606,
      1.9969160556793213,
      2.9953742027282715,
      4.005442142486572,
      5.003900051116943,
      6.013968467712402,
      7.999274253845215
    ]
    onsets test/audio/velocity-piano.wav Float32Array(8) [
      0.01160997711122036,
      1.0100680589675903,
      2.008526086807251,
      3.006984233856201,
      4.017052173614502,
      5.027120113372803,
      6.0371880531311035,
      7.058866024017334
    ]
    onsets test/audio/scale2-triangle.wav Float32Array(14) [
      0.01160997711122036,
      0.4992290139198303,
      0.9984580278396606,
      1.4976871013641357,
      1.9969160556793213,
      2.496145009994507,
      2.9953742027282715,
      3.494603157043457,
      3.9938321113586426,
      4.493061065673828,
      4.992290019989014,
      5.491519451141357,
      5.990748405456543,
      7.999274253845215
    ]
    onsets test/audio/scale2-square.wav Float32Array(14) [
      0.01160997711122036,
      0.4992290139198303,
      0.9984580278396606,
      1.4976871013641357,
      1.9969160556793213,
      2.496145009994507,
      2.9953742027282715,
      3.494603157043457,
      3.9938321113586426,
      4.493061065673828,
      4.992290019989014,
      5.491519451141357,
      5.990748405456543,
      7.999274253845215
    ]
    onsets test/audio/scale2-sin.wav Float32Array(14) [
      0.01160997711122036,
      0.4992290139198303,
      0.9984580278396606,
      1.4976871013641357,
      1.9969160556793213,
      2.496145009994507,
      2.9953742027282715,
      3.494603157043457,
      3.9938321113586426,
      4.493061065673828,
      4.992290019989014,
      5.491519451141357,
      5.990748405456543,
      7.999274253845215
    ]
    onsets test/audio/scale2-saw.wav Float32Array(14) [
      0.01160997711122036,
      0.4992290139198303,
      0.9984580278396606,
      1.4976871013641357,
      1.9969160556793213,
      2.496145009994507,
      2.9953742027282715,
      3.494603157043457,
      3.9938321113586426,
      4.493061065673828,
      4.992290019989014,
      5.491519451141357,
      5.990748405456543,
      7.999274253845215
    ]
    onsets test/audio/scale2-piano.wav Float32Array(13) [
      0.01160997711122036,
      0.4992290139198303,
      0.9984580278396606,
      1.4976871013641357,
      1.9969160556793213,
      2.496145009994507,
      2.9953742027282715,
      3.5062131881713867,
      4.005442142486572,
      4.504671096801758,
      5.003900051116943,
      5.503129482269287,
      6.002358436584473
    ]
    onsets test/audio/scale-triangle.wav Float32Array(9) [
      0.01160997711122036,
      0.9984580278396606,
      1.9969160556793213,
      2.9953742027282715,
      3.9938321113586426,
      4.992290019989014,
      5.990748405456543,
      6.989206314086914,
      7.999274253845215
    ]
    onsets test/audio/scale-square.wav Float32Array(9) [
      0.01160997711122036,
      0.9984580278396606,
      1.9969160556793213,
      2.9953742027282715,
      3.9938321113586426,
      4.992290019989014,
      5.990748405456543,
      6.989206314086914,
      7.999274253845215
    ]
    onsets test/audio/scale-sin.wav Float32Array(9) [
      0.01160997711122036,
      0.9984580278396606,
      1.9969160556793213,
      2.9953742027282715,
      3.9938321113586426,
      4.992290019989014,
      5.990748405456543,
      6.989206314086914,
      7.999274253845215
    ]
    onsets test/audio/scale-saw.wav Float32Array(9) [
      0.01160997711122036,
      0.9984580278396606,
      1.9969160556793213,
      2.9953742027282715,
      3.9938321113586426,
      4.992290019989014,
      5.990748405456543,
      6.989206314086914,
      7.999274253845215
    ]
    onsets test/audio/scale-piano.wav Float32Array(8) [
      0.01160997711122036,
      0.9984580278396606,
      1.9969160556793213,
      3.006984233856201,
      4.005442142486572,
      5.003900051116943,
      6.002358436584473,
      7.000816345214844
    ]