python multithreading python-multithreading speech-to-text

Multi-threading chunks of audio within a loop (Python)

I have a large audio file that I would like to transcribe. To do this, I opted for silence-based conversion: splitting the audio file into chunks at the silences between sentences. However, this takes longer than expected, even for a short audio file.

from pydub import AudioSegment
from pydub.silence import split_on_silence
# Load the WAV recording; `path` must already hold the audio file's location.
voice = AudioSegment.from_wav(path)

# Cut the recording wherever the level stays 14 dB under the clip's average
# loudness (voice.dBFS) for at least 500 ms, keeping up to 500 ms of that
# silence on the pieces.
chunks = split_on_silence(
    voice,
    min_silence_len=500,
    silence_thresh=voice.dBFS - 14,
    keep_silence=500,
)

To try and process these chunks faster, I tried using a multi-threaded loop as shown

# Launch one worker thread per chunk, then block until every worker is done.
# Each thread receives a single chunk as its only argument.
n_threads = len(chunks)
thread_list = []
for chunk in chunks:
    thread = Thread(target=threaded_process, args=(chunk,))
    thread_list.append(thread)
    thread.start()

# Wait for all workers before continuing.
for thread in thread_list:
    thread.join()

The function 'threaded_process' is supposed to perform the Speech-to-Text conversion

def threaded_process(chunks): 
    """Transcribe each item of `chunks` and write the text to recognized.txt.

    Every item is padded with 10 ms of silence on both sides, exported to a
    temporary ``chunk<i>.wav`` file in the current directory, transcribed
    with ``audio_to_text`` and then deleted.

    Returns "Error5487E" as soon as ``audio_to_text`` reports "Error5487";
    otherwise returns None once every item has been processed.
    """
    # NOTE(review): "w+" truncates the file and the name is fixed, so every
    # call (and therefore every thread) writes to the same recognized.txt.
    fh = open("recognized.txt", "w+") 
    # NOTE(review): the counter restarts at 0 on every call, so concurrent
    # calls all export to, read from and delete chunk0.wav, chunk1.wav, ...
    i = 0
    # NOTE(review): the thread launcher passes one chunk per call, so this
    # loop walks over a single segment rather than a list of chunks —
    # confirm what iterating it yields.
    for chunk in chunks: 
        # Pad both ends with a short (10 ms) stretch of silence.
        chunk_silent = AudioSegment.silent(duration = 10)  
        audio_chunk = chunk_silent + chunk + chunk_silent 
        print("saving chunk{0}.wav".format(i)) 
        audio_chunk.export("./chunk{0}.wav".format(i), bitrate ='192k', format ="wav") 
        file = 'chunk'+str(i)+'.wav'
        print("Processing chunk "+str(i)) 
        rec = audio_to_text(file) # Separate function that performs the actual speech-to-text conversion (IBM Watson SpeechToText API)
        # NOTE(review): this early return skips both os.remove(file) and
        # fh.close(), leaving the temporary WAV on disk and the handle open.
        if rec == "Error5487":
            return "Error5487E"
        fh.write(rec+" ")
        os.remove(file)
        i += 1
    fh.close()

However, the conversion still appears to run the way it did with the earlier method, rather than in multiple threads. I also get this message: [WinError 32] The process cannot access the file because it is being used by another process: 'chunk0.wav'. Why is this happening?

Solution

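In this case multithreading is faster because the audio transcription is done in the cloud: each thread spends most of its time waiting on a network request, so the requests for different chunks can overlap.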

Uses

pydub (audio package)
speech_recognition (Google Speech Recognition API for audio to text)

Code

import concurrent.futures      # thread execution manager
import os
from time import time

import wget                    # save url data to file

from pydub import AudioSegment # process speech
from pydub.playback import play
from pydub.silence import split_on_silence

import speech_recognition as sr # speech recognizer

#########################################################
# Related to Data Acquisition
#########################################################
def get_sound_file(url):
    """Download the resource at `url` with wget and return the local file name.

    The returned value is whatever ``wget.download`` reports as the name of
    the file the data was saved into.
    """
    return wget.download(url)

def get_nonexistant_path(fname_path):
    """
    Return the next unused file name derived from `fname_path`.

    If nothing exists at `fname_path`, it is returned unchanged. Otherwise a
    counter is inserted before the extension ("-1", "-2", ...) and the first
    candidate that does not exist on disk is returned.

    Examples
    --------
    With '/etc/issue' present and '/etc/issue-1' absent:
        get_nonexistant_path('/etc/issue')          -> '/etc/issue-1'
    With 'whatever/1337bla.py' absent:
        get_nonexistant_path('whatever/1337bla.py') -> 'whatever/1337bla.py'

    Source: https://stackoverflow.com/questions/17984809/how-do-i-create-a-incrementing-filename-in-python
    """
    if os.path.exists(fname_path):
        stem, extension = os.path.splitext(fname_path)
        counter = 1
        # Probe name-1.ext, name-2.ext, ... until a free slot turns up.
        while os.path.exists(f"{stem}-{counter}{extension}"):
            counter += 1
        return f"{stem}-{counter}{extension}"
    return fname_path

def create_files(source_file):
    """Split a WAV file into one file per spoken segment, based upon silence.

    Parameters
    ----------
    source_file : path of the WAV file to split.

    Returns
    -------
    list of the file names created, in segment order. Each name is the next
    unused variant of `source_file` (file-1.wav, file-2.wav, ...).
    """
    sound = AudioSegment.from_wav(source_file)

    # Break into segments based upon silence: anything 14 dB below the
    # clip's average loudness counts as silent.
    segments = split_on_silence(sound, silence_thresh=sound.dBFS - 14)

    # Bitrate of the source, so the chunks are exported at the same rate.
    # https://stackoverflow.com/questions/33747728/how-can-i-get-the-same-bitrate-of-input-and-output-file-in-pydub
    # https://wiki.audacityteam.org/wiki/WAV
    # frame_width is the byte size of one frame across ALL channels
    # (channels * sample_width), so it must not be multiplied by the channel
    # count again. The value is passed in the "<n>k" form used for bitrates.
    bits_per_second = sound.frame_rate * sound.frame_width * 8
    original_bitrate = f"{bits_per_second // 1000}k"

    file_list = []
    for audio_chunk in segments:
        # File whose enumeration number has not been used yet,
        # i.e. file-1.wav, file-2.wav, ...
        chunk_path = get_nonexistant_path(source_file)
        # export() returns the file handle it opened; close it explicitly so
        # the chunk is not left open when it is read back for transcription.
        audio_chunk.export(chunk_path, format="wav", bitrate=original_bitrate).close()
        file_list.append(chunk_path)

    return file_list  # list of files created


#########################################################
# Speech to text
#########################################################
def audio_to_text(filename):
    '''
        Converts the speech in an audio file to text.

        Based upon blog: https://www.geeksforgeeks.org/audio-processing-using-pydub-and-google-speechrecognition-api/

        Parameters
        ----------
        filename : path of an audio file that sr.AudioFile can open.

        Returns
        -------
        The transcription as a string, or None when the audio could not be
        understood or the recognition request failed (a message is printed
        in both cases).
    '''
    # Get recognizer
    r = sr.Recognizer()

    # record() reads the source through to its end. listen() captures a
    # single phrase and stops at the first pause it detects, which can cut
    # a file short.
    with sr.AudioFile(filename) as source:
        audio_recorded = r.record(source)

    # The audio is held in memory now, so the file does not have to stay
    # open while the network request is in flight.
    try:
        return r.recognize_google(audio_recorded)

    # If google could not understand the audio
    except sr.UnknownValueError:
        print("Could not understand audio")

    # If the results cannot be requested from Google.
    # Probably an internet connection error.
    except sr.RequestError:
        print("Could not request results.")

    return None
      
def process(file):
    '''
        Transcribe a single audio file and store the text in 'result.txt'.

        The output file is (re)created on every call; a line is written only
        when audio_to_text returns a non-empty transcription.
    '''
    with open('result.txt', 'w') as fout:
        text = audio_to_text(file)
        if not text:
            # Nothing recognised: leave the output file empty.
            return
        fout.write(text + '\n')
            
def process_single(files):
    '''
        Transcribe several audio files one after another into
        'result-single.txt', one line per file that produced text.
    '''
    with open('result-single.txt', 'w') as fout:
        # map() is lazy, so each file is transcribed and written in turn.
        for text in map(audio_to_text, files):
            if not text:
                continue  # skip files that yielded no transcription
            fout.write(text + '\n')
                
def process_threads(files):
    '''
        Transcribe several audio files concurrently with a thread pool and
        store the text in 'result_thread.txt', one line per file that
        produced text.
    '''
    # max_workers=None lets the executor pick its default pool size, which
    # depends on the Python version in use.
    with open('result_thread.txt', 'w') as fout, \
            concurrent.futures.ThreadPoolExecutor(max_workers=None) as executor:
        # executor.map yields results in the order of `files`, whichever
        # request completes first; filter(None, ...) drops empty results.
        for text in filter(None, executor.map(audio_to_text, files)):
            fout.write(text + '\n')

Test Code

# Benchmark driver: downloads a sample recording, splits it on silence, then
# times three ways of transcribing it (whole file, chunks one by one, chunks
# in a thread pool). Each timing is wall-clock time taken with time().
if __name__ == "__main__":
    # url of data used for testing
    url = 'http://www.voiptroubleshooter.com/open_speech/american/OSR_us_000_0010_8k.wav'
    
    # download data to local file
    data_file_name = get_sound_file(url)
    
    # place data into chunks based upon silence (one new WAV file per chunk)
    chunk_file_names = create_files(data_file_name)

    # Process single file without partitioning into chunks
    t0 = time()
    process(data_file_name)
    print(f'Running entire audio file elapsed time: {time() - t0:.4f}')
    
    # Single threaded version: the chunk files are transcribed one at a time
    t0 = time()
    process_single(chunk_file_names)
    print(f'Running chunked audio files elapsed time: {time() - t0:.4f}')
        
    # Multiple threaded version: the same chunk files, transcribed concurrently
    t0 = time()
    process_threads(chunk_file_names)
    print(f'Running chunked audio files using multiple threads elapsed time: {time() - t0:.4f}')

Timing

Running entire audio file elapsed time: 13.0020
Running chunked audio files elapsed time: 17.8850
Running chunked audio files using multiple threads elapsed time: 3.6400