I needed a deep learning algorithm for sound classification which uses the pre-trained model to make predictions on new audio files. Since I am new to python I was working on this sound classification algorithm that I found but which works great but currently, it outputs the real class of each audio file in the data set and their predicted classes. Instead, I need it to predict the class of new audio files, which are not labeled at all but I was unsuccessful at modifying the code to work as I wanted.
The code I am currently using:
from tensorflow.keras.models import load_model
from clean import downsample_mono, envelope
from kapre.time_frequency import STFT, Magnitude, ApplyFilterbank, MagnitudeToDecibel
from sklearn.preprocessing import LabelEncoder
import numpy as np
from glob import glob
import argparse
import os
import pandas as pd
from tqdm import tqdm
def make_prediction(args):
model = load_model(args.model_fn,
custom_objects={'STFT':STFT,
'Magnitude':Magnitude,
'ApplyFilterbank':ApplyFilterbank,
'MagnitudeToDecibel':MagnitudeToDecibel})
wav_paths = glob('{}/**'.format(args.src_dir), recursive=True)
wav_paths = sorted([x.replace(os.sep, '/') for x in wav_paths if '.wav' in x])
classes = sorted(os.listdir(args.src_dir))
labels = [os.path.split(x)[0].split('/')[-1] for x in wav_paths]
le = LabelEncoder()
y_true = le.fit_transform(labels)
results = []
for z, wav_fn in tqdm(enumerate(wav_paths), total=len(wav_paths)):
rate, wav = downsample_mono(wav_fn, args.sr)
mask, env = envelope(wav, rate, threshold=args.threshold)
clean_wav = wav[mask]
step = int(args.sr*args.dt)
batch = []
for i in range(0, clean_wav.shape[0], step):
sample = clean_wav[i:i+step]
sample = sample.reshape(-1, 1)
if sample.shape[0] < step:
tmp = np.zeros(shape=(step, 1), dtype=np.float32)
tmp[:sample.shape[0],:] = sample.flatten().reshape(-1, 1)
sample = tmp
batch.append(sample)
X_batch = np.array(batch, dtype=np.float32)
y_pred = model.predict(X_batch)
y_mean = np.mean(y_pred, axis=0)
y_pred = np.argmax(y_mean)
real_class = os.path.dirname(wav_fn).split('/')[-1]
print('Actual class: {}, Predicted class: {}'.format(real_class, classes[y_pred]))
results.append(y_mean)
np.save(os.path.join('logs', args.pred_fn), np.array(results))
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Audio Classification Training')
parser.add_argument('--model_fn', type=str, default='models/lstm.h5',
help='model file to make predictions')
parser.add_argument('--pred_fn', type=str, default='y_pred',
help='fn to write predictions in logs dir')
parser.add_argument('--src_dir', type=str, default='wavfiles',
help='directory containing wavfiles to predict')
parser.add_argument('--dt', type=float, default=1.0,
help='time in seconds to sample audio')
parser.add_argument('--sr', type=int, default=16000,
help='sample rate of clean audio')
parser.add_argument('--threshold', type=str, default=20,
help='threshold magnitude for np.int16 dtype')
args, _ = parser.parse_known_args()
make_prediction(args)
In the section of code
wav_paths = sorted([x.replace(os.sep, '/') for x in wav_paths if '.wav' in x])
classes = sorted(os.listdir(args.src_dir))
labels = [os.path.split(x)[0].split('/')[-1] for x in wav_paths]
le = LabelEncoder() # delete
y_true = le.fit_transform(labels) # delete
results = []
Delete the 2 lines that are marked with '# delete
Then in this section
X_batch = np.array(batch, dtype=np.float32)
y_pred = model.predict(X_batch)
y_mean = np.mean(y_pred, axis=0)
y_pred = np.argmax(y_mean)
real_class = os.path.dirname(wav_fn).split('/')[-1]
print('Actual class: {}, Predicted class: {}'.format(real_class, classes[y_pred]))
results.append(y_mean)
np.save(os.path.join('logs', args.pred_fn), np.array(results))
Rewrite it as follows
X_batch = np.array(batch, dtype=np.float32)
y_pred = model.predict(X_batch)
y_mean = np.mean(y_pred, axis=0)
y_pred = np.argmax(y_mean)
print('Predicted class: {}'.format(classes[y_pred]))
results.append(y_mean)
# np.save(os.path.join('logs', args.pred_fn), np.array(results))
That should do it