I've been trying to figure out how to make subtitles with the Microsoft Azure Speech Recognition service in Python, but I can't work it out. I've followed the tips from another answer here on getting the individual words, but even formatting those into .srt or .vtt seems convoluted. Here's the code:
import azure.cognitiveservices.speech as speechsdk


def speech_recognize_continuous_from_file():
    """Performs continuous speech recognition with input from an audio file.

    Collects the display transcript and per-word timings (needed for
    subtitles) from the detailed JSON payload of each final result, prints
    them when the session ends, and writes the transcript to Azure_Raw.txt.
    """
    import time
    import json

    # <SpeechContinuousRecognitionWithFile>
    # BUG FIX: placeholder typo "serive-region" -> "service-region".
    speech_key, service_region = "{api-key}", "{service-region}"
    speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)
    audio_filename = "{for example: video.wav}"
    audio_config = speechsdk.audio.AudioConfig(filename=audio_filename)

    speech_config.speech_recognition_language = "en-US"
    # Word-level timestamps + detailed output are required so evt.result.json
    # carries the NBest list with per-word Offset/Duration values.
    speech_config.request_word_level_timestamps()
    speech_config.enable_dictation()
    speech_config.output_format = speechsdk.OutputFormat(1)

    speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_config)

    done = False
    transcript = []
    words = []

    def handle_final_result(evt):
        """Store the display text and best-hypothesis word timings of a final result."""
        payload = json.loads(evt.result.json)
        transcript.append(payload['DisplayText'])
        confidences = [item.get('Confidence') for item in payload['NBest']]
        best_index = confidences.index(max(confidences))
        words.extend(payload['NBest'][best_index]['Words'])

    def stop_cb(evt):
        """callback that stops continuous recognition upon receiving an event `evt`"""
        print('CLOSING on {}'.format(evt))
        speech_recognizer.stop_continuous_recognition()
        nonlocal done
        done = True
        print("Transcript display list:\n")
        print(transcript)
        print("\nWords\n")
        print(words)
        print("\n")

    speech_recognizer.recognized.connect(handle_final_result)
    # Connect callbacks to the events fired by the speech recognizer
    speech_recognizer.recognizing.connect(lambda evt: format(evt))
    speech_recognizer.recognized.connect(lambda evt: format(evt))
    speech_recognizer.session_started.connect(lambda evt: format(evt))
    speech_recognizer.session_stopped.connect(lambda evt: format(evt))
    speech_recognizer.canceled.connect(lambda evt: format(evt))
    # stop continuous recognition on either session stopped or canceled events
    speech_recognizer.session_stopped.connect(stop_cb)
    speech_recognizer.canceled.connect(stop_cb)

    # Start continuous speech recognition; recognition runs on background
    # threads, so poll `done` until stop_cb fires.  (BUG FIX: the original
    # used time.sleep without importing time.)
    speech_recognizer.start_continuous_recognition()
    while not done:
        time.sleep(.5)

    # BUG FIX: the original wrote `results`, a list that was never populated
    # (handle_final_result bound a *local* `results`); write the collected
    # transcript instead.
    with open('Azure_Raw.txt', 'w') as f:
        f.write('\n'.join(transcript))


# BUG FIX: the original called sample_long_running_recognize(storage_uri),
# which is undefined here; invoke the function defined above instead.
speech_recognize_continuous_from_file()
The only other tutorial I found on subtitles was a Google Cloud one, and it gives the results I am looking for (yes, I've tested it myself), but Azure obviously doesn't work at all like Google Cloud: https://medium.com/searce/generate-srt-file-subtitles-using-google-clouds-speech-to-text-api-402b2f1da3bd
So basically: How do I get like 3 seconds of speech-text into an .srt format, like this:
1
00:00:00,000 --> 00:00:03,000
This is the first sentence that
2
00:00:03,000 --> 00:00:06,000
continues after 3 seconds or so
If you look closely at the JSON output of the Azure Speech service, it is slightly different from what the output of other services looks like.
For the mentioned configuration, the output looks like the below after you take the best match:
[{'Duration': 3900000, 'Offset': 500000, 'Word': "what's"},
{'Duration': 1300000, 'Offset': 4500000, 'Word': 'the'},
{'Duration': 2900000, 'Offset': 5900000, 'Word': 'weather'},
{'Duration': 4800000, 'Offset': 8900000, 'Word': 'like'}]
There are three fields per word — Word, Duration and Offset.
You will have to make use of these in order to frame your timeline.
import azure.cognitiveservices.speech as speechsdk
import os
import time
import pprint
import json
import srt
import datetime
path = os.getcwd()
# Creates an instance of a speech config with specified subscription key and service region.
# Replace with your own subscription key and region identifier from here: https://aka.ms/speech/sdkregion
speech_key, service_region = "<>", "<>"
speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)
# Creates an audio configuration that points to an audio file.
# Replace with your own audio filename.
audio_filename = "sample.wav"
audio_input = speechsdk.audio.AudioConfig(filename=audio_filename)
# Creates a recognizer with the given settings
speech_config.speech_recognition_language="en-US"
# Word-level timestamps are required: the subtitle code below reads the
# per-word Offset/Duration values out of each result's JSON payload.
speech_config.request_word_level_timestamps()
speech_config.enable_dictation()
# OutputFormat(1) selects the detailed output format, which is what puts the
# NBest list (with Confidence and Words) into evt.result.json below.
speech_config.output_format = speechsdk.OutputFormat(1)
speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_input)
#result = speech_recognizer.recognize_once()  # single-shot alternative; continuous recognition is used instead
# Shared accumulators filled by the recognizer callbacks (module-level state).
all_results = []  # plain recognized text (evt.result.text) of every final result
results = []      # unused; kept so any external references keep working
transcript = []   # DisplayText of every final result
words = []        # word dicts {'Word','Offset','Duration'} from the best hypothesis

#https://learn.microsoft.com/en-us/python/api/azure-cognitiveservices-speech/azure.cognitiveservices.speech.recognitionresult?view=azure-python
def handle_final_result(evt):
    """Record a final recognition result.

    Parses the detailed JSON payload of `evt.result` (available because the
    recognizer was configured with word-level timestamps and detailed output)
    and appends the raw text, the display transcript, and the per-word
    timings of the highest-confidence NBest hypothesis to the accumulators.
    """
    all_results.append(evt.result.text)
    # json is imported at the top of the file; no need to re-import per call.
    payload = json.loads(evt.result.json)
    # A NoMatch/empty result carries no NBest list; skip it instead of
    # raising KeyError (which would kill the callback thread).
    nbest = payload.get('NBest')
    if not nbest:
        return
    transcript.append(payload['DisplayText'])
    # Keep the words of the highest-confidence hypothesis.
    best = max(nbest, key=lambda item: item.get('Confidence') or 0)
    words.extend(best['Words'])
# Completion flag: set by stop_cb, polled by the wait loop below.
done = False

def stop_cb(evt):
    """Callback that stops continuous recognition once the session stops or is canceled."""
    print('CLOSING on {}'.format(evt))
    speech_recognizer.stop_continuous_recognition()
    global done
    done= True

# Capture the detailed JSON of every final result (text, transcript, word timings).
speech_recognizer.recognized.connect(handle_final_result)
#Connect callbacks to the events fired by the speech recognizer
speech_recognizer.recognizing.connect(lambda evt: print('RECOGNIZING: {}'.format(evt)))
speech_recognizer.recognized.connect(lambda evt: print('RECOGNIZED: {}'.format(evt)))
speech_recognizer.session_started.connect(lambda evt: print('SESSION STARTED: {}'.format(evt)))
speech_recognizer.session_stopped.connect(lambda evt: print('SESSION STOPPED {}'.format(evt)))
speech_recognizer.canceled.connect(lambda evt: print('CANCELED {}'.format(evt)))
# stop continuous recognition on either session stopped or canceled events
speech_recognizer.session_stopped.connect(stop_cb)
speech_recognizer.canceled.connect(stop_cb)
speech_recognizer.start_continuous_recognition()
# Recognition runs on background threads; block here until stop_cb flips `done`.
while not done:
    time.sleep(.5)
print("Printing all results:")
print(all_results)
# `words` (collected by handle_final_result) is the per-word timing list:
# [{'Word': str, 'Offset': ticks, 'Duration': ticks}, ...], one tick = 100 ns.
speech_to_text_response = words

def convertduration(t):
    """Convert an Azure ticks value (100-nanosecond units) to a
    (seconds, microseconds) pair suitable for datetime.timedelta."""
    millis = t / 10000  # ticks -> milliseconds
    # BUG FIX: the remainder is in *milliseconds*; scale to microseconds so
    # timedelta(0, seconds, microseconds) gets the unit it expects.
    return int(millis / 1000), int((millis % 1000) * 1000)

##-- Code to Create Subtitle --#
# Maximum caption length in seconds (renamed from `bin`, which shadowed the builtin).
BIN_SECONDS = 3.0

transcriptions = []
index = 0
caption_text = ""
caption_start = None  # (sec, usec) of the first word in the current caption

for word_info in speech_to_text_response:
    start = convertduration(word_info['Offset'])
    if caption_start is None:
        caption_start = start
    # Close the caption once this word starts BIN_SECONDS after the caption began.
    # (BUG FIX: the original tracked the *last* word's offset as the caption
    # start and accumulated offset deltas, which drifted the timeline.)
    elapsed = (start[0] + start[1] / 1e6) - (caption_start[0] + caption_start[1] / 1e6)
    if elapsed >= BIN_SECONDS:
        index += 1
        transcriptions.append(srt.Subtitle(
            index,
            datetime.timedelta(0, caption_start[0], caption_start[1]),
            datetime.timedelta(0, caption_start[0] + int(BIN_SECONDS), caption_start[1]),
            caption_text.strip()))
        caption_text = ""
        caption_start = start
    caption_text = caption_text + " " + word_info['Word']

# Flush the final (possibly shorter) caption.
# (BUG FIX: the original reused the previous caption's index and a stale
# start offset; end the last caption at the last word's actual end time.)
if caption_text:
    index += 1
    last = speech_to_text_response[-1]
    end = convertduration(last['Offset'] + last['Duration'])
    transcriptions.append(srt.Subtitle(
        index,
        datetime.timedelta(0, caption_start[0], caption_start[1]),
        datetime.timedelta(0, end[0], end[1]),
        caption_text.strip()))

subtitles = srt.compose(transcriptions)
with open("subtitle.srt", "w") as f:
    f.write(subtitles)
I've attached the output for your reference.
Hope this helps :)