I have a large audio file that I would like to have transcribed. For this, I opted for silence-based conversion, splitting the audio file into chunks based on the silence between sentences. However, this takes longer than expected, even for a short audio file.
from pydub import AudioSegment
from pydub.silence import split_on_silence
# Load the recording; ``path`` is the path to the audio (WAV) file.
voice = AudioSegment.from_wav(path)

# Cut the recording into chunks wherever a silent stretch is found.
# The silence threshold is taken relative to the recording's own dBFS value.
chunks = split_on_silence(
    voice,
    keep_silence=500,
    silence_thresh=voice.dBFS - 14,
    min_silence_len=500,
)
To try to process these chunks faster, I tried using a multi-threaded loop, as shown below:
# Launch one worker thread per chunk; each thread is started right after it
# has been created and receives exactly one element of ``chunks``.
thread_list = []
for segment in chunks:
    worker = Thread(target=threaded_process, args=(segment,))
    thread_list.append(worker)
    worker.start()
n_threads = len(thread_list)

# Wait until every worker has finished before continuing.
for worker in thread_list:
    worker.join()
The function 'threaded_process' is supposed to perform the Speech-to-Text conversion:
def threaded_process(chunks):
    """Transcribe every audio chunk and write the text to ``recognized.txt``.

    Each chunk is padded with 10 ms of silence on both sides, exported to a
    temporary WAV file, handed to ``audio_to_text`` and the temporary file is
    removed again, whether or not the transcription succeeded.

    Parameters
    ----------
    chunks : iterable of audio segments
        Segments to transcribe, in order.
        NOTE(review): the function loops over this argument, so the caller is
        expected to pass a collection of segments rather than a single
        segment -- confirm against the code that starts the threads.

    Returns
    -------
    str or None
        ``"Error5487E"`` as soon as ``audio_to_text`` returns ``"Error5487"``;
        ``None`` after all chunks have been processed.
    """
    import threading  # local import: only needed to build per-thread file names

    # Temporary file names carry the id of the calling thread.  With plain
    # ``chunk0.wav``, ``chunk1.wav``, ... every thread exports to, reads and
    # deletes the very same files, so concurrent calls collide on them.
    prefix = "chunk{0}_".format(threading.get_ident())

    # The padding is identical for every chunk, so build it once.
    chunk_silent = AudioSegment.silent(duration=10)

    # NOTE(review): "w+" truncates the file on every call, so calls running in
    # parallel still overwrite each other's text.  Collecting the results in
    # the caller and writing them once would avoid that.
    with open("recognized.txt", "w+") as fh:
        for i, chunk in enumerate(chunks):
            audio_chunk = chunk_silent + chunk + chunk_silent
            file = "{0}{1}.wav".format(prefix, i)
            print("saving " + file)
            audio_chunk.export(file, bitrate="192k", format="wav")
            try:
                print("Processing chunk " + str(i))
                # Speech-to-text conversion (IBM Watson SpeechToText API)
                rec = audio_to_text(file)
            finally:
                # Always clean up, including on the error return below and
                # when audio_to_text raises.
                os.remove(file)
            if rec == "Error5487":
                # Leaving the ``with`` block closes recognized.txt.
                return "Error5487E"
            fh.write(rec + " ")
But the conversion is still done using the earlier method and not using multithreading. I also get this message: [WinError 32] The process cannot access the file because it is being used by another process: 'chunk0.wav'. Why is this happening?
In this case multithreading is faster since audio transcription is done in the cloud.
Uses
Code
import concurrent.futures # thread execution manager
import os
from time import time
import wget # save url data to file
from pydub import AudioSegment # process speech
from pydub.playback import play
from pydub.silence import split_on_silence
import speech_recognition as sr # speech recognizer
#########################################################
# Related to Data Acquisition
#########################################################
def get_sound_file(url):
    """Download the data behind ``url`` and return the name of the local file.

    The download is delegated to ``wget.download``; whatever file name it
    reports is returned unchanged.
    """
    return wget.download(url)
def get_nonexistant_path(fname_path):
    """
    Return a path derived from ``fname_path`` at which nothing exists yet.

    If ``fname_path`` itself is unused it is returned unchanged.  Otherwise
    ``-1``, ``-2``, ... is inserted between the stem and the extension until
    an unused name is found.

    Examples (the results depend on what exists on disk)
    --------
    get_nonexistant_path('/etc/issue')          -> '/etc/issue-1'
        when '/etc/issue' exists and '/etc/issue-1' does not
    get_nonexistant_path('whatever/1337bla.py') -> 'whatever/1337bla.py'
        when that path does not exist

    Source: https://stackoverflow.com/questions/17984809/how-do-i-create-a-incrementing-filename-in-python
    """
    if os.path.exists(fname_path):
        stem, extension = os.path.splitext(fname_path)
        counter = 1
        while True:
            candidate = "{}-{}{}".format(stem, counter, extension)
            if not os.path.exists(candidate):
                return candidate
            counter += 1
    # Nothing at this path yet: the name can be used as is.
    return fname_path
def create_files(source_file):
    """Split a WAV file on silence and store every segment as its own file.

    The segment files are named after ``source_file`` with a running number
    (as produced by ``get_nonexistant_path``).

    Returns the list of file names that were written, in segment order.
    """
    recording = AudioSegment.from_wav(source_file)

    # Silence threshold: 14 below the recording's own dBFS value.
    pieces = split_on_silence(recording, silence_thresh=recording.dBFS - 14)

    # Bitrate passed to export, derived from the input recording, see
    # https://stackoverflow.com/questions/33747728/how-can-i-get-the-same-bitrate-of-input-and-output-file-in-pydub
    # https://wiki.audacityteam.org/wiki/WAV
    bits_per_second = (
        recording.frame_rate * recording.frame_width * 8 * recording.channels
    )
    original_bitrate = str(bits_per_second / 1000)

    file_list = []
    for piece in pieces:
        # The name must be requested right before the export: the next call
        # only yields a new number once this file exists on disk.
        target = get_nonexistant_path(source_file)
        file_list.append(target)
        piece.export(target, format="wav", bitrate=original_bitrate)
    return file_list
#########################################################
# Speech to text
#########################################################
def audio_to_text(filename):
    """
    Convert the speech in an audio file to text.

    Returns the text reported by ``recognize_google``, or ``None`` (after
    printing a short message) when the audio could not be understood or the
    request could not be made.

    Based upon blog: https://www.geeksforgeeks.org/audio-processing-using-pydub-and-google-speechrecognition-api/
    """
    recognizer = sr.Recognizer()

    with sr.AudioFile(filename) as source:
        captured = recognizer.listen(source)

    try:
        text = recognizer.recognize_google(captured)
    except sr.UnknownValueError:
        # Google could not understand the audio.
        print("Could not understand audio")
        text = None
    except sr.RequestError:
        # The results could not be requested from Google,
        # probably an internet connection error.
        print("Could not request results.")
        text = None
    return text
def process(file):
    """
    Transcribe one audio file and store the text in 'result.txt'.

    'result.txt' is (re)created in any case; a line is only written when
    ``audio_to_text`` returned a non-empty transcription.
    """
    with open('result.txt', 'w') as fout:
        text = audio_to_text(file)
        if not text:
            return
        fout.write(text + '\n')
def process_single(files):
    """
    Transcribe several audio files one after another into 'result-single.txt'.

    One line is written per file that yielded a non-empty transcription,
    in the order of ``files``.
    """
    with open('result-single.txt', 'w') as fout:
        # Lazy pipeline: each file is transcribed and written before the
        # next one is touched.
        texts = (audio_to_text(name) for name in files)
        fout.writelines(text + '\n' for text in texts if text)
def process_threads(files):
    """
    Transcribe several audio files into 'result_thread.txt' using a thread pool.

    ``audio_to_text`` is mapped over ``files`` by a ThreadPoolExecutor; one
    line is written per non-empty transcription, in the order in which
    ``executor.map`` yields the results.
    """
    # max_workers=None lets the executor choose its default pool size
    # (the exact number depends on the Python version).
    with open('result_thread.txt', 'w') as fout, \
            concurrent.futures.ThreadPoolExecutor(max_workers=None) as executor:
        # filter(None, ...) skips files for which no text was returned.
        for text in filter(None, executor.map(audio_to_text, files)):
            fout.write(text + '\n')
Test Code
if __name__ == "__main__":
    # Sample recording used for the comparison
    url = 'http://www.voiptroubleshooter.com/open_speech/american/OSR_us_000_0010_8k.wav'

    # Fetch the recording, then cut it into one file per silence-separated chunk
    data_file_name = get_sound_file(url)
    chunk_file_names = create_files(data_file_name)

    # 1) The complete recording, not partitioned into chunks
    t0 = time()
    process(data_file_name)
    elapsed = time() - t0
    print(f'Running entire audio file elapsed time: {elapsed:.4f}')

    # 2) The chunks, one after another in a single thread
    t0 = time()
    process_single(chunk_file_names)
    elapsed = time() - t0
    print(f'Running chunked audio files elapsed time: {elapsed:.4f}')

    # 3) The chunks, transcribed by multiple threads
    t0 = time()
    process_threads(chunk_file_names)
    elapsed = time() - t0
    print(f'Running chunked audio files using multiple threads elapsed time: {elapsed:.4f}')
Timing
Running entire audio file elapsed time: 13.0020
Running chunked audio files elapsed time: 17.8850
Running chunked audio files using multiple threads elapsed time: 3.6400