
Human voice detection on wav file


Let's say I have a short mp3 file, and my task is to detect whether there is a human voice in a selected range of samples of the audio signal.

My first idea was:

  1. Extract only the vocal (a cappella) part of the mp3 file into a wav file with librosa (a visual check of the result is sketched right after the code):
import numpy as np
import librosa
import librosa.display
import soundfile as sf
import matplotlib.pyplot as plt

y, sr = librosa.load('one_test.mp3')
output_file_path = "one_test.wav"

# Magnitude and phase of the full spectrogram
S_full, phase = librosa.magphase(librosa.stft(y))

# Nearest-neighbor filtering: each frame is replaced by the median of its
# most similar frames, which keeps the repeating (background) structure
S_filter = librosa.decompose.nn_filter(S_full,
                                       aggregate=np.median,
                                       metric='cosine',
                                       width=int(librosa.time_to_frames(2, sr=sr)))
# The filtered output should not exceed the input magnitude
S_filter = np.minimum(S_full, S_filter)

# Margins reduce bleed between the two masks
margin_i, margin_v = 2, 10
power = 2

mask_i = librosa.util.softmask(S_filter,
                               margin_i * (S_full - S_filter),
                               power=power)

mask_v = librosa.util.softmask(S_full - S_filter,
                               margin_v * S_filter,
                               power=power)

# Once we have the masks, simply multiply them with the input spectrum
# to separate the components
S_foreground = mask_v * S_full
S_background = mask_i * S_full

# Re-attach the phase and invert back to a time-domain signal
D_foreground = S_foreground * phase
y_foreground = librosa.istft(D_foreground)
sf.write(output_file_path, y_foreground, samplerate=sr)
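
To sanity-check the separation before writing the wav, the two components can be compared visually. This is a minimal sketch of mine (the panel titles are my own labels; everything else uses the variables and imports from the block above):

fig, ax = plt.subplots(2, 1, figsize=(10, 6), sharex=True)
librosa.display.specshow(librosa.amplitude_to_db(S_foreground, ref=np.max),
                         sr=sr, y_axis='log', x_axis='time', ax=ax[0])
ax[0].set(title='Foreground (vocals)')  # my label, not from the original code
librosa.display.specshow(librosa.amplitude_to_db(S_background, ref=np.max),
                         sr=sr, y_axis='log', x_axis='time', ax=ax[1])
ax[1].set(title='Background (accompaniment)')
plt.show()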
  2. Read the a cappella wav file with scipy and take the min/max values over a selected range of samples (a rolling version of this check is sketched after the snippet):
from scipy.io import wavfile
samplerate, data = wavfile.read('one_test.wav')

max(data[0:929])
>>> 3376
min(data[0:929])
>>> -5134
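
The same range check can be rolled over the whole file to see where the signal is active. This is a rough sketch; the one-second window length is my choice, not part of the original idea:

# Peak-to-peak amplitude per one-second window over the whole file
# (the window length is an assumed parameter; tune it to your ROI size)
win = samplerate  # one second of samples
for start in range(0, len(data) - win + 1, win):
    chunk = data[start:start + win]
    print(f"{start / samplerate:.1f}s: range = {chunk.max() - chunk.min()}")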

But at this point I am not sure that this is the correct approach for my task. I still don't know which parts of the samples contain a human voice. Please suggest an approach or point me to where I can find the answer. My task is not hard: I only need to detect voice in my ROI; no voice segmentation or speech-to-text is needed.


Solution

  • After some research, I found the option that suits my task best. For voice separation I used the spleeter library, which returned a very good result, and I also enhanced the signal with scipy.signal to reduce noise, so I ended up with a track containing only the voice.


    Code implementation:

    !pip install spleeter

    from spleeter.separator import Separator
    import numpy as np
    import scipy.signal as sg
    import soundfile as sf
    from IPython.display import Audio, display
    import matplotlib.pyplot as plt
    %matplotlib inline


    # Using the embedded 2-stem configuration (vocals / accompaniment)
    separator = Separator('spleeter:2stems')

    separator.separate_to_file('/content/one_test.mp3', '/content/')

    def load_audio(filepath, sr=None, mono=True, dtype='float32'):
        """Load an audio file, optionally downmixing to mono and resampling."""
        x, fs = sf.read(filepath)

        if mono and len(x.shape) > 1:
            x = np.mean(x, axis=1)  # average the channels down to mono
        if sr:
            x = sg.resample_poly(x, sr, fs)  # resample to the requested rate
            fs = sr
        x = x.astype(dtype)

        return x, fs

    def play(x, fr, autoplay=False):
        display(Audio(x, rate=fr, autoplay=autoplay))


    # Vocals track produced by spleeter
    x, fr = load_audio('/content/one_test/vocals.wav')
    play(x, fr)
    fig, ax = plt.subplots(1, 1, figsize=(8, 4))
    t = np.linspace(0., len(x) / fr, len(x))
    ax.plot(t, x, lw=1)
    

    Original signal after spleeter processing

    # First case: 4th-order low-pass Butterworth filter with a 900 Hz cutoff
    # (the cutoff is normalized by the Nyquist frequency fr / 2)
    b, a = sg.butter(4, 900. / (fr / 2.), 'low')
    x_fil = sg.filtfilt(b, a, x)  # zero-phase filtering, no time shift

    play(x_fil, fr)
    fig, ax = plt.subplots(1, 1, figsize=(8, 4))
    ax.plot(t, x, lw=1)
    ax.plot(t, x_fil, lw=1)
    
    

    First case: original vs. low-pass filtered signal
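
    As a side note, at higher filter orders the (b, a) transfer-function form can get numerically fragile; scipy's second-order-sections form is a drop-in alternative. A minimal sketch of the same low-pass in that form (same 900 Hz cutoff, nothing else changed):

    # Same 4th-order low-pass as second-order sections, which is
    # numerically more robust than the (b, a) form at higher orders
    sos = sg.butter(4, 900., 'low', fs=fr, output='sos')
    x_fil_sos = sg.sosfiltfilt(sos, x)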

    # Second case: 4th-order high-pass Butterworth filter with a 300 Hz cutoff
    b, a = sg.butter(4, 300. / (fr / 2.), 'high')
    x_fil = sg.filtfilt(b, a, x)

    play(x_fil, fr)
    fig, ax = plt.subplots(1, 1, figsize=(6, 3))
    ax.plot(t, x, lw=1)
    ax.plot(t, x_fil, lw=1)
    

    Second case: original vs. high-pass filtered signal
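
    The two cases can also be combined into a single band-pass over the typical speech band. The 300-3400 Hz range below is the classic telephony speech band, my assumption rather than something from the original answer:

    # Band-pass over the (assumed) 300-3400 Hz speech band in one step
    sos = sg.butter(4, [300., 3400.], 'bandpass', fs=fr, output='sos')
    x_band = sg.sosfiltfilt(sos, x)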

    And finally, the idea for detecting whether there is voice in a selected range or not: I take the peak-to-peak range of the samples in the selection, and if it is lower than a critical threshold there is no voice, otherwise there is. (A helper that applies this check over the whole track is sketched after the snippet below.)

    # Using the filtered signal from the first case:
    # the first slice (~44100 samples, about one second at 44.1 kHz)
    selection = x_fil[0:44099]
    inter_range = max(selection) - min(selection)
    print(inter_range)
    >>> 0.008289359637006036
    # The range is quite small, so we can assume
    # there is no voice in this slice

    # Another slice: the next second
    selection = x_fil[44099:44099+44099]
    inter_range = max(selection) - min(selection)
    print(inter_range)
    >>> 1.0232146011987286
    # The range of this slice is large compared to the previous one
    # Conclusion: there is voice in this slice
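
    To run the same check across the whole track, the test can be wrapped in a small helper. This is a sketch of mine; the 0.1 threshold is an assumed value that you would tune on your own material:

    # Hypothetical helper: flag one-second windows whose peak-to-peak
    # range exceeds a (tunable, assumed) threshold as "voiced"
    def has_voice(signal, start, length, threshold=0.1):
        selection = signal[start:start + length]
        return (selection.max() - selection.min()) > threshold

    win = fr  # one second of samples
    for start in range(0, len(x_fil) - win + 1, win):
        voiced = has_voice(x_fil, start, win)
        print(f"{start / fr:5.1f}s: {'voice' if voiced else 'no voice'}")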