I've been following the TensorFlow.js audio recognition tutorial here: https://codelabs.developers.google.com/codelabs/tensorflowjs-audio-codelab/index.html?index=..%2F..index#5. I changed the commands, removed the slider and its moveSlider() function, and simply made the predicted label appear in the "console" div. You can find my code here: https://codepen.io/willrd123/pen/abvQbyG?editors=0010.
<html>
  <head>
    <script src="https://cdn.jsdelivr.net/npm/@tensorflow/tfjs"></script>
    <script src="https://cdn.jsdelivr.net/npm/@tensorflow-models/speech-commands"></script>
  </head>
  <body>
    <button id="start" onmousedown="collect(0)">Start</button>
    <button id="forward" onmousedown="collect(1)">Forward</button>
    <button id="back" onmousedown="collect(2)">Back</button>
    <button id="left" onmousedown="collect(3)">Left</button>
    <button id="right" onmousedown="collect(4)">Right</button>
    <button id="up" onmousedown="collect(5)">Up</button>
    <button id="down" onmousedown="collect(6)">Down</button>
    <button id="stop" onmousedown="collect(7)">Stop</button>
    <button id="takeOff" onmousedown="collect(8)">Take Off</button>
    <button id="land" onmousedown="collect(9)">Land</button>
    <button id="flip" onmousedown="collect(10)">Flip</button>
    <button id="switchView" onmousedown="collect(11)">Switch View</button>
    <button id="noise" onmousedown="collect(12)">Noise</button>
    <br/><br/>
    <button id="train" onclick="train()">Train</button>
    <button id="listen" onclick="listen()">Listen</button>
    <button id="save" onclick="save()">Save</button>
    <br/><br/>
    <div id="console"></div>
    <script src="index.js"></script>
  </body>
</html>
let recognizer;
async function app() {
  recognizer = speechCommands.create('BROWSER_FFT');
  await recognizer.ensureModelLoaded();
  // Build the transfer-learning model once the base recognizer has loaded.
  buildModel();
}
app();
// One frame is ~23ms of audio.
const NUM_FRAMES = 6;
let examples = [];
function collect(label) {
  if (recognizer.isListening()) {
    return recognizer.stopListening();
  }
  if (label == null) {
    return;
  }
  recognizer.listen(async ({spectrogram: {frameSize, data}}) => {
    let vals = normalize(data.subarray(-frameSize * NUM_FRAMES));
    examples.push({vals, label});
    document.querySelector('#console').textContent =
        `${examples.length} examples collected`;
  }, {
    overlapFactor: 0.999,
    includeSpectrogram: true,
    invokeCallbackOnNoiseAndUnknown: true
  });
}
function normalize(x) {
  const mean = -100;
  const std = 10;
  return x.map(x => (x - mean) / std);
}
const INPUT_SHAPE = [NUM_FRAMES, 232, 1];
let model;
async function train() {
  toggleButtons(false);
  const ys = tf.oneHot(examples.map(e => e.label), 3);
  const xsShape = [examples.length, ...INPUT_SHAPE];
  const xs = tf.tensor(flatten(examples.map(e => e.vals)), xsShape);

  await model.fit(xs, ys, {
    batchSize: 16,
    epochs: 10,
    callbacks: {
      onEpochEnd: (epoch, logs) => {
        document.querySelector('#console').textContent =
            `Accuracy: ${(logs.acc * 100).toFixed(1)}% Epoch: ${epoch + 1}`;
      }
    }
  });
  tf.dispose([xs, ys]);
  toggleButtons(true);
}
function buildModel() {
  model = tf.sequential();
  model.add(tf.layers.depthwiseConv2d({
    depthMultiplier: 8,
    kernelSize: [NUM_FRAMES, 3],
    activation: 'relu',
    inputShape: INPUT_SHAPE
  }));
  model.add(tf.layers.maxPooling2d({poolSize: [1, 2], strides: [2, 2]}));
  model.add(tf.layers.flatten());
  model.add(tf.layers.dense({units: 3, activation: 'softmax'}));
  const optimizer = tf.train.adam(0.01);
  model.compile({
    optimizer,
    loss: 'categoricalCrossentropy',
    metrics: ['accuracy']
  });
}
function toggleButtons(enable) {
  document.querySelectorAll('button').forEach(b => b.disabled = !enable);
}
function flatten(tensors) {
  const size = tensors[0].length;
  const result = new Float32Array(tensors.length * size);
  tensors.forEach((arr, i) => result.set(arr, i * size));
  return result;
}
var labels = ["Forward", "Back", "Left", "Right", "Up", "Down", "Take Off", "Land", "Switch View", "Noise"];
async function finish(labelTensor) {
  const label = (await labelTensor.data())[0];
  document.getElementById('console').textContent = labels[label];
}
function listen() {
  if (recognizer.isListening()) {
    recognizer.stopListening();
    toggleButtons(true);
    document.getElementById('listen').textContent = 'Listen';
    return;
  }
  toggleButtons(false);
  document.getElementById('listen').textContent = 'Stop';
  document.getElementById('listen').disabled = false;

  recognizer.listen(async ({spectrogram: {frameSize, data}}) => {
    const vals = normalize(data.subarray(-frameSize * NUM_FRAMES));
    const input = tf.tensor(vals, [1, ...INPUT_SHAPE]);
    const probs = model.predict(input);
    const predLabel = probs.argMax(1);
    await finish(predLabel);
    tf.dispose([input, probs, predLabel]);
  }, {
    overlapFactor: 0.999,
    includeSpectrogram: true,
    invokeCallbackOnNoiseAndUnknown: true
  });
}
async function save() {
  const model = await tf.loadLayersModel('HTTP-Server/dronemodel.json');
}
However, when I adapted this code for 13 different commands, the model only ever returned the first 3 of them (Start, Forward, and Back), even when the audio I gave it was for a command outside those 3. Is there any way to fix this?
The model is classifying into only three classes because the last dense layer is built with units: 3, and the tf.oneHot call in train() hard-codes the same depth of 3. With a depth of 3, tf.oneHot turns every label from 3 through 12 into an all-zero vector, so the softmax over three outputs can only ever pick one of the first three commands. The number of units (and the one-hot depth) has to be changed to the number of commands expected (13), and the model needs to be retrained accordingly.
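A minimal sketch of the two changes, assuming the thirteen collect() indices above (0 through 12) are the full label set; the NUM_CLASSES constant is introduced here for illustration and is not part of the original code:

// Hypothetical constant: one class per command, Start (0) through Noise (12).
const NUM_CLASSES = 13;

// In train(): one-hot encode with the full class count, so labels 3-12
// no longer collapse into all-zero rows.
const ys = tf.oneHot(examples.map(e => e.label), NUM_CLASSES);

// In buildModel(): the output layer needs one softmax unit per command.
model.add(tf.layers.dense({units: NUM_CLASSES, activation: 'softmax'}));

After retraining with examples for every class, the argMax in listen() can return any of the 13 indices. Note also that the labels array used by finish() currently lists only 10 names; it needs all 13, in the same order as the collect() indices, or the text shown in the console div will not match the prediction.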