
Human voice detection on wav file


Let's say I have a short mp3 file, and my task is to detect whether there is a human voice in a selected range of samples of the audio signal.

My first idea was:

  1. Extract only the vocal (a cappella) part of the mp3 file into a wav file with librosa (a visual check of the result is sketched right after the code):
import numpy as np
import librosa
import librosa.display
import soundfile as sf
import matplotlib.pyplot as plt

y, sr = librosa.load('one_test.mp3')
output_file_path = "one_test.wav"

# Magnitude and phase of the full spectrogram
S_full, phase = librosa.magphase(librosa.stft(y))

# Nearest-neighbor filtering: each frame is replaced by the median of its
# most similar frames, which keeps the repeating (background) structure
S_filter = librosa.decompose.nn_filter(S_full,
                                       aggregate=np.median,
                                       metric='cosine',
                                       width=int(librosa.time_to_frames(2, sr=sr)))
# The filtered output should not exceed the input magnitude
S_filter = np.minimum(S_full, S_filter)

# Margins reduce bleed between the two masks
margin_i, margin_v = 2, 10
power = 2

mask_i = librosa.util.softmask(S_filter,
                               margin_i * (S_full - S_filter),
                               power=power)

mask_v = librosa.util.softmask(S_full - S_filter,
                               margin_v * S_filter,
                               power=power)

# Once we have the masks, simply multiply them with the input spectrum
# to separate the components
S_foreground = mask_v * S_full
S_background = mask_i * S_full

# Re-attach the phase and invert back to a time-domain signal
D_foreground = S_foreground * phase
y_foreground = librosa.istft(D_foreground)
sf.write(output_file_path, y_foreground, samplerate=sr)
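
To sanity-check the separation before writing the wav, the two components can be compared visually. This is a minimal sketch of mine (the panel titles are my own labels; everything else uses the variables and imports from the block above):

fig, ax = plt.subplots(2, 1, figsize=(10, 6), sharex=True)
librosa.display.specshow(librosa.amplitude_to_db(S_foreground, ref=np.max),
                         sr=sr, y_axis='log', x_axis='time', ax=ax[0])
ax[0].set(title='Foreground (vocals)')  # my label, not from the original code
librosa.display.specshow(librosa.amplitude_to_db(S_background, ref=np.max),
                         sr=sr, y_axis='log', x_axis='time', ax=ax[1])
ax[1].set(title='Background (accompaniment)')
plt.show()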
  2. Read the a cappella wav file with scipy and take the min/max values over a selected range of samples (a rolling version of this check is sketched after the snippet):
from scipy.io import wavfile
samplerate, data = wavfile.read('one_test.wav')

max(data[0:929])
>>> 3376
min(data[0:929])
>>> -5134
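
The same range check can be rolled over the whole file to see where the signal is active. This is a rough sketch; the one-second window length is my choice, not part of the original idea:

# Peak-to-peak amplitude per one-second window over the whole file
# (the window length is an assumed parameter; tune it to your ROI size)
win = samplerate  # one second of samples
for start in range(0, len(data) - win + 1, win):
    chunk = data[start:start + win]
    print(f"{start / samplerate:.1f}s: range = {chunk.max() - chunk.min()}")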

But at this point I am not sure that this is the correct approach for my task. I still don't know which parts of the samples contain a human voice. Please suggest an approach or point me to where I can find the answer. My task is not hard: I only need to detect voice in my ROI; no voice segmentation or speech-to-text is needed.


Solution

  • After some research, I found the option that suits my task best. For voice separation I used the spleeter library, which returned a very good result, and I also enhanced the signal with scipy.signal to reduce noise, so I ended up with a track containing only the voice.


    Code implementation:

    !pip install spleeter

    from spleeter.separator import Separator
    import numpy as np
    import scipy.signal as sg
    import soundfile as sf
    from IPython.display import Audio, display
    import matplotlib.pyplot as plt
    %matplotlib inline


    # Using the embedded 2-stem configuration (vocals / accompaniment)
    separator = Separator('spleeter:2stems')

    separator.separate_to_file('/content/one_test.mp3', '/content/')

    def load_audio(filepath, sr=None, mono=True, dtype='float32'):
        """Load an audio file, optionally downmixing to mono and resampling."""
        x, fs = sf.read(filepath)

        if mono and len(x.shape) > 1:
            x = np.mean(x, axis=1)  # average the channels down to mono
        if sr:
            x = sg.resample_poly(x, sr, fs)  # resample to the requested rate
            fs = sr
        x = x.astype(dtype)

        return x, fs

    def play(x, fr, autoplay=False):
        display(Audio(x, rate=fr, autoplay=autoplay))


    # Vocals track produced by spleeter
    x, fr = load_audio('/content/one_test/vocals.wav')
    play(x, fr)
    fig, ax = plt.subplots(1, 1, figsize=(8, 4))
    t = np.linspace(0., len(x) / fr, len(x))
    ax.plot(t, x, lw=1)
    

    Original signal after spleeter processing

    # First case: 4th-order low-pass Butterworth filter with a 900 Hz cutoff
    # (the cutoff is normalized by the Nyquist frequency fr / 2)
    b, a = sg.butter(4, 900. / (fr / 2.), 'low')
    x_fil = sg.filtfilt(b, a, x)  # zero-phase filtering, no time shift

    play(x_fil, fr)
    fig, ax = plt.subplots(1, 1, figsize=(8, 4))
    ax.plot(t, x, lw=1)
    ax.plot(t, x_fil, lw=1)
    
    

    First case: original vs. low-pass filtered signal
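
    As a side note, at higher filter orders the (b, a) transfer-function form can get numerically fragile; scipy's second-order-sections form is a drop-in alternative. A minimal sketch of the same low-pass in that form (same 900 Hz cutoff, nothing else changed):

    # Same 4th-order low-pass as second-order sections, which is
    # numerically more robust than the (b, a) form at higher orders
    sos = sg.butter(4, 900., 'low', fs=fr, output='sos')
    x_fil_sos = sg.sosfiltfilt(sos, x)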

    # Second case: 4th-order high-pass Butterworth filter with a 300 Hz cutoff
    b, a = sg.butter(4, 300. / (fr / 2.), 'high')
    x_fil = sg.filtfilt(b, a, x)

    play(x_fil, fr)
    fig, ax = plt.subplots(1, 1, figsize=(6, 3))
    ax.plot(t, x, lw=1)
    ax.plot(t, x_fil, lw=1)
    

    Second case: original vs. high-pass filtered signal
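
    The two cases can also be combined into a single band-pass over the typical speech band. The 300-3400 Hz range below is the classic telephony speech band, my assumption rather than something from the original answer:

    # Band-pass over the (assumed) 300-3400 Hz speech band in one step
    sos = sg.butter(4, [300., 3400.], 'bandpass', fs=fr, output='sos')
    x_band = sg.sosfiltfilt(sos, x)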

    And finally, the idea for detecting whether there is voice in a selected range or not: I take the peak-to-peak range of the samples in the selection, and if it is lower than a critical threshold there is no voice, otherwise there is. (A helper that applies this check over the whole track is sketched after the snippet below.)

    # Using the filtered signal from the first case:
    # the first slice (~44100 samples, about one second at 44.1 kHz)
    selection = x_fil[0:44099]
    inter_range = max(selection) - min(selection)
    print(inter_range)
    >>> 0.008289359637006036
    # The range is quite small, so we can assume
    # there is no voice in this slice

    # Another slice: the next second
    selection = x_fil[44099:44099+44099]
    inter_range = max(selection) - min(selection)
    print(inter_range)
    >>> 1.0232146011987286
    # The range of this slice is large compared to the previous one
    # Conclusion: there is voice in this slice
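
    To run the same check across the whole track, the test can be wrapped in a small helper. This is a sketch of mine; the 0.1 threshold is an assumed value that you would tune on your own material:

    # Hypothetical helper: flag one-second windows whose peak-to-peak
    # range exceeds a (tunable, assumed) threshold as "voiced"
    def has_voice(signal, start, length, threshold=0.1):
        selection = signal[start:start + length]
        return (selection.max() - selection.min()) > threshold

    win = fr  # one second of samples
    for start in range(0, len(x_fil) - win + 1, win):
        voiced = has_voice(x_fil, start, win)
        print(f"{start / fr:5.1f}s: {'voice' if voiced else 'no voice'}")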