I'm trying to generate a transcription from an audio file using the `pydub` and `speech_recognition` libraries. I'm doing this through a GUI made with Tkinter, in which I would like to show the transcription asynchronously. However, the GUI keeps freezing while the transcription is being generated.
Here is the code:
import customtkinter
import asyncio
from tkinter import filedialog
from async_tkinter_loop import async_handler, async_mainloop
import speech_recognition as sr
import os
from pathlib import Path
from pydub import AudioSegment
from pydub.silence import split_on_silence
class App(customtkinter.CTk):
    """Window with a file picker, a "Generate text" button and a text box.

    The "Generate text" button is wired to the ``get_transcription``
    coroutine through ``async_handler``.
    """

    def __init__(self):
        super().__init__()

        # State shared between the handlers.
        self.filepath = None
        self.transcription = None

        # Row 2 holds the text box and absorbs any extra vertical space.
        self.grid_rowconfigure(2, weight=1)

        self.btn_select_file = customtkinter.CTkButton(
            self,
            text="Select audio file",
            command=self.open_file,
        )
        self.btn_select_file.grid(row=0, column=0, padx=20, pady=30)

        self.btn_generate_text = customtkinter.CTkButton(
            self,
            fg_color="green",
            text="Generate text",
            command=async_handler(self.get_transcription),
        )
        self.btn_generate_text.grid(row=1, column=0, padx=20, pady=30)

        self.tbx_transcription = customtkinter.CTkTextbox(self, wrap="word")
        self.tbx_transcription.grid(
            row=2, column=0, padx=20, pady=20, sticky="nsew"
        )

    def open_file(self):
        """Ask the user for an audio file and remember the chosen path.

        ``self.filepath`` is left unchanged when the dialog is cancelled.
        """
        audio_patterns = ["*.mp3", "*.wav", "*.ogg", "*.opus", "*.mpeg"]
        chosen = filedialog.askopenfilename(
            initialdir="/",
            title="Select a file",
            filetypes=[("Audio files", audio_patterns)],
        )
        if chosen:
            self.filepath = chosen

    async def get_transcription(self):
        """Transcribe the selected file and put the result in the text box.

        Shows an error message in the text box when no file was selected.
        """
        if not self.filepath:
            self.tbx_transcription.insert(
                "0.0",
                "Error: No audio file selected, please select one before generating text."
            )
            return

        # Wrap the transcription coroutine in a task and wait for it to end.
        # NOTE: generate_transcription contains no ``await`` of its own, so
        # once the task starts it runs to completion without yielding.
        jobs = [asyncio.create_task(self.generate_transcription(self.filepath))]
        finished, _unfinished = await asyncio.wait(jobs)

        # One task was submitted, so this is a one-element list.
        self.transcription = [job.result() for job in finished]

        # Display the transcription.
        self.tbx_transcription.insert("0.0", self.transcription)

    @staticmethod
    async def generate_transcription(filepath):
        """
        Split the audio file at ``filepath`` into chunks on silence, run
        speech recognition on every chunk and return the concatenated text.

        Each chunk is exported as ``audio-chunks/chunk<i>.wav`` (relative to
        the current working directory) before it is recognized. Chunks that
        the recognizer cannot understand are reported with ``print`` and
        skipped.
        """
        recognizer = sr.Recognizer()

        # Choose the pydub loader from the file extension; the first entry
        # with a marker contained in the extension wins.
        extension = Path(filepath).suffix
        loaders = (
            (("wav",), AudioSegment.from_wav),
            (("ogg", "opus"), AudioSegment.from_ogg),
            (("mp3", "mpeg"), AudioSegment.from_mp3),
        )
        for markers, load in loaders:
            if any(marker in extension for marker in markers):
                sound = load(filepath)
                break

        # Cut the recording wherever there is a long enough silence.
        chunks = split_on_silence(
            sound,
            # minimum silence length in ms; experiment per audio file
            min_silence_len=500,
            # anything this far below the average loudness counts as silence
            silence_thresh=sound.dBFS - 14,
            # ms of silence kept around each chunk
            keep_silence=500,
        )

        # Directory that receives the exported chunks.
        chunk_dir = Path("audio-chunks")
        if not chunk_dir.is_dir():
            chunk_dir.mkdir()

        sentences = []
        for index, segment in enumerate(chunks, start=1):
            # Export the chunk so the recognizer can read it back from disk.
            chunk_path = str(chunk_dir / f"chunk{index}.wav")
            segment.export(chunk_path, format="wav")

            with sr.AudioFile(chunk_path) as source:
                recorded = recognizer.record(source)
                try:
                    recognized = recognizer.recognize_google(recorded, language="es")
                except sr.UnknownValueError as err:
                    print("Error:", str(err))
                    continue
                sentences.append(f"{recognized.capitalize()}. ")

        # Text of every recognized chunk, in order.
        return "".join(sentences)
if __name__ == "__main__":
    # Build the window and hand it to async_tkinter_loop's main loop.
    async_mainloop(App())
I tried the `async_tkinter_loop` library out of desperation, but using it is not mandatory.
EDIT: I've tried httpSteve's solution, but the GUI keeps freezing, just as it does with the code I provided above. Here is a GIF that shows the undesired behaviour of the app.
It may not be noticeable in the GIF, but I am trying to move the window and click the buttons without getting any response. The GUI does not respond until the transcription has been generated.
I've finally managed to prevent the GUI from freezing thanks to `threading`. The key here is to run the coroutine on a separate thread that has its own event loop:
threading.Thread(
target=lambda loop: loop.run_until_complete(self.async_get_transcription()),
args=(asyncio.new_event_loop(),)
).start()
and to pass `command=lambda: self.get_transcription()` when creating the `self.btn_generate_text` object.
Here is the fixed code:
import asyncio
import customtkinter
import threading
from tkinter import filedialog
import speech_recognition as sr
import os
from pathlib import Path
from pydub import AudioSegment
from pydub.silence import split_on_silence
class App(customtkinter.CTk):
    """Window that transcribes a chosen audio file without blocking the GUI.

    The transcription runs on a worker thread that owns a private asyncio
    event loop. The Tk main thread polls that worker with ``after`` and is
    the only thread that ever touches widgets.
    """

    # Delay, in milliseconds, between two checks on the worker thread.
    POLL_INTERVAL_MS = 100

    def __init__(self):
        super().__init__()
        self.filepath = None
        self.transcription = None
        # Worker bookkeeping: the running thread (if any) and the exception
        # it died with (if any).
        self._worker = None
        self._worker_error = None

        self.grid_rowconfigure(2, weight=1)

        self.btn_select_file = customtkinter.CTkButton(
            self,
            text="Select audio file",
            command=self.open_file
        )
        self.btn_select_file.grid(row=0, column=0, padx=20, pady=30)

        self.btn_generate_text = customtkinter.CTkButton(
            self,
            fg_color="green",
            text="Generate text",
            command=lambda: self.get_transcription()
        )
        self.btn_generate_text.grid(row=1, column=0, padx=20, pady=30)

        self.tbx_transcription = customtkinter.CTkTextbox(self, wrap="word")
        self.tbx_transcription.grid(row=2, column=0, padx=20, pady=20, sticky="nsew")

        # Created once and reused for every run; it is only placed on the
        # grid while a transcription is in progress.
        self.progressbar_1 = customtkinter.CTkProgressBar(self)
        self.progressbar_1.configure(mode="indeterminate")

    def open_file(self):
        """Ask the user for an audio file and remember the chosen path.

        ``self.filepath`` is left unchanged when the dialog is cancelled.
        """
        # Open the file dialog
        filepath = filedialog.askopenfilename(
            initialdir="/",
            title="Select a file",
            filetypes=[("Audio files", ["*.mp3", "*.wav", "*.ogg", "*.opus", "*.mpeg"])]
        )
        if filepath:
            self.filepath = filepath

    def get_transcription(self):
        """Start transcribing the selected file on a worker thread.

        Shows an error in the text box when no file has been selected and
        does nothing when a transcription is already running.
        """
        if not self.filepath:
            self.tbx_transcription.insert(
                "0.0",
                "Error: No audio file selected, please select one before generating text."
            )
            return
        if self._worker is not None and self._worker.is_alive():
            # A run is already in progress; do not start a second one.
            return

        self._worker_error = None
        self.btn_generate_text.configure(state="disabled")

        # Show the progress bar *before* starting the thread so the worker
        # can never finish while the GUI is still being set up.
        self.progressbar_1.grid(row=2, column=0, padx=40, pady=0, sticky="ew")
        self.progressbar_1.start()

        # daemon=True: closing the window should not leave the process
        # hanging on an unfinished transcription.
        self._worker = threading.Thread(target=self._run_transcription, daemon=True)
        self._worker.start()
        self.after(self.POLL_INTERVAL_MS, self._poll_worker)

    def _run_transcription(self):
        """Worker-thread entry point: run the coroutine on a private loop."""
        loop = asyncio.new_event_loop()
        try:
            loop.run_until_complete(self.async_get_transcription())
        except Exception as exc:
            # Thread boundary: keep the error so the GUI thread can report
            # it instead of letting the thread die silently.
            self._worker_error = exc
        finally:
            loop.close()

    def _poll_worker(self):
        """Main-thread callback: update the widgets once the worker is done."""
        if self._worker.is_alive():
            self.after(self.POLL_INTERVAL_MS, self._poll_worker)
            return

        self.progressbar_1.stop()
        self.progressbar_1.grid_forget()
        self.btn_generate_text.configure(state="normal")

        if self._worker_error is not None:
            self.tbx_transcription.insert("0.0", f"Error: {self._worker_error}\n")
        else:
            self.tbx_transcription.insert("0.0", self.transcription)

    async def async_get_transcription(self):
        """Transcribe ``self.filepath`` and store the text in ``self.transcription``.

        This coroutine is run on the worker thread, so it deliberately does
        not touch any widget; ``_poll_worker`` displays the result.
        """
        self.transcription = await self.generate_transcription(self.filepath)

    @staticmethod
    async def generate_transcription(filepath, language="es"):
        """
        Split a large audio file into chunks on silence and apply speech
        recognition to each of these chunks.

        Although declared ``async``, this coroutine never awaits, so it
        blocks whoever runs it until the whole file has been processed.

        Args:
            filepath: Path of the audio file to transcribe.
            language: Language code passed to the recognizer.

        Returns:
            The recognized text of all chunks, each capitalized and followed
            by ``". "``. Chunks that could not be understood are skipped.

        Side effects:
            Writes ``audio-chunks/chunk<i>.wav`` files relative to the
            current working directory.
        """
        # create a speech recognition object
        recognizer = sr.Recognizer()

        # open the audio file using pydub; the extension is compared in
        # lower case so that e.g. ".WAV" is handled too
        suffix = Path(filepath).suffix.lower()
        if suffix == ".wav":
            sound = AudioSegment.from_wav(filepath)
        elif suffix in (".ogg", ".opus"):
            sound = AudioSegment.from_ogg(filepath)
        elif suffix in (".mp3", ".mpeg"):
            sound = AudioSegment.from_mp3(filepath)
        else:
            # Unknown extension: use pydub's generic loader rather than
            # failing later on an unbound variable.
            sound = AudioSegment.from_file(filepath)

        # split the audio where silence lasts 500 milliseconds or more
        chunks = split_on_silence(
            sound,
            # experiment with this value for your target audio file
            min_silence_len=500,
            # adjust this per requirement
            silence_thresh=sound.dBFS - 14,
            # keep 500 ms of silence around each chunk, adjustable as well
            keep_silence=500,
        )

        # create a directory to store the audio chunks
        folder_name = "audio-chunks"
        os.makedirs(folder_name, exist_ok=True)

        sentences = []
        # process each chunk
        for i, audio_chunk in enumerate(chunks, start=1):
            # export audio chunk and save it in the `folder_name` directory.
            chunk_filename = os.path.join(folder_name, f"chunk{i}.wav")
            audio_chunk.export(chunk_filename, format="wav")
            # recognize the chunk
            with sr.AudioFile(chunk_filename) as source:
                audio_listened = recognizer.record(source)
            # try converting it to text
            try:
                text = recognizer.recognize_google(audio_listened, language=language)
            except sr.UnknownValueError as e:
                # Unintelligible chunk: report it and carry on with the rest.
                print("Error:", str(e))
            else:
                sentences.append(f"{text.capitalize()}. ")

        # return the text for all chunks detected
        return "".join(sentences)
if __name__ == "__main__":
    # Create the window and enter the Tk main loop.
    App().mainloop()