Search code examples
pythonazurespeech-recognitionspeech-to-textsubtitle

Subtitles/captions with Microsoft Azure Speech-to-text in Python


I've been trying to figure out how to make subtitles with Microsoft Azure Speech Recognition service in Python, but can't figure it out. I've followed the tips someone else has answered here on getting the individual words, but even formatting those to .srt or .vtt seems convoluted. Here's the code:

import azure.cognitiveservices.speech as speechsdk


def speech_recognize_continuous_from_file():
    """Perform continuous speech recognition on an audio file and collect
    word-level timing data.

    Populates two lists while recognition runs:
      - transcript: the DisplayText of each finalized utterance
      - words: word dicts ({'Word', 'Offset', 'Duration'}) taken from the
        highest-confidence NBest alternative of each utterance

    Writes the recognized utterances (one per line) to 'Azure_Raw.txt'
    when recognition stops.
    """
    import json
    import time

    # <SpeechContinuousRecognitionWithFile>
    speech_key, service_region = "{api-key}", "{service-region}"
    speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)

    audio_filename = "{for example: video.wav}"
    audio_config = speechsdk.audio.AudioConfig(filename=audio_filename)

    speech_config.speech_recognition_language = "en-US"
    # Word-level timestamps are required to get per-word Offset/Duration
    # in the JSON result.
    speech_config.request_word_level_timestamps()

    speech_config.enable_dictation()
    # OutputFormat(1) selects the detailed format, which carries the NBest list.
    speech_config.output_format = speechsdk.OutputFormat(1)

    speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_config)

    done = False
    transcript = []
    words = []

    def handle_final_result(evt):
        """Collect display text and best-alternative word timings from one final result."""
        result = json.loads(evt.result.json)
        transcript.append(result['DisplayText'])
        # Pick the NBest alternative with the highest confidence score.
        confidences = [item.get('Confidence') for item in result['NBest']]
        best_index = confidences.index(max(confidences))
        words.extend(result['NBest'][best_index]['Words'])

    def stop_cb(evt):
        """callback that stops continuous recognition upon receiving an event `evt`"""
        print('CLOSING on {}'.format(evt))
        speech_recognizer.stop_continuous_recognition()
        nonlocal done
        done = True
        print("Transcript display list:\n")
        print(transcript)
        print("\nWords\n")
        print(words)
        print("\n")

    speech_recognizer.recognized.connect(handle_final_result)
    # Connect callbacks to the events fired by the speech recognizer
    speech_recognizer.recognizing.connect(lambda evt: format(evt))
    speech_recognizer.recognized.connect(lambda evt: format(evt))
    speech_recognizer.session_started.connect(lambda evt: format(evt))
    speech_recognizer.session_stopped.connect(lambda evt: format(evt))
    speech_recognizer.canceled.connect(lambda evt: format(evt))
    # stop continuous recognition on either session stopped or canceled events
    speech_recognizer.session_stopped.connect(stop_cb)
    speech_recognizer.canceled.connect(stop_cb)

    # Start continuous speech recognition and poll until a stop event fires.
    speech_recognizer.start_continuous_recognition()
    while not done:
        time.sleep(.5)

    # Write the recognized utterances (was: '\n'.join(results), which joined
    # an always-empty list because the callback shadowed the outer name).
    with open('Azure_Raw.txt', 'w') as f:
        f.write('\n'.join(transcript))

# Run the recognizer defined above (the original called
# sample_long_running_recognize(storage_uri), a name that doesn't exist here).
speech_recognize_continuous_from_file()

Only other "tutorial" I found on subtitles was a Google Cloud one, and that gives the results I am looking for (Yes, I've tested it myself), but Azure obviously doesn't work at all like G-cloud: https://medium.com/searce/generate-srt-file-subtitles-using-google-clouds-speech-to-text-api-402b2f1da3bd

So basically: How do I get like 3 seconds of speech-text into an .srt format, like this:

1
00:00:00,000 --> 00:00:03,000
This is the first sentence that

2
00:00:03,000 --> 00:00:06,000
continues after 3 seconds or so

Solution

  • If you look closely at the JSON output of the Azure Speech service, you will see that it is structured slightly differently from the output of other services.

    for the mentioned configuration the output looks like below after you take the best match

    [{'Duration': 3900000, 'Offset': 500000, 'Word': "what's"},
     {'Duration': 1300000, 'Offset': 4500000, 'Word': 'the'},
     {'Duration': 2900000, 'Offset': 5900000, 'Word': 'weather'},
     {'Duration': 4800000, 'Offset': 8900000, 'Word': 'like'}]
    

    Each word entry has three fields - Word, Duration & Offset

    • Duration - the length of time for which the word is spoken, measured in 100-nanosecond units
    • Offset - the time from the start of the audio to the start of the word, measured in 100-nanosecond units

    You will have to make use of this in order to frame your timeline

    import azure.cognitiveservices.speech as speechsdk
    import os
    import time
    import pprint
    import json
    import srt
    import datetime
    
     
    path = os.getcwd()
    # Creates an instance of a speech config with specified subscription key and service region.
    # Replace with your own subscription key and region identifier from here: https://aka.ms/speech/sdkregion
    speech_key, service_region = "<>", "<>"
    speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)
    
    # Creates an audio configuration that points to an audio file.
    # Replace with your own audio filename.
    audio_filename = "sample.wav"
    audio_input = speechsdk.audio.AudioConfig(filename=audio_filename)
    
    # Creates a recognizer with the given settings
    speech_config.speech_recognition_language="en-US"
    # Word-level timestamps are required so each recognized word carries
    # its own Offset/Duration in the JSON result.
    speech_config.request_word_level_timestamps()
    
    
    
    speech_config.enable_dictation()
    # OutputFormat(1) selects the detailed output format, which includes
    # the NBest list used below.
    speech_config.output_format = speechsdk.OutputFormat(1)
    
    speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_input)
    
    #result = speech_recognizer.recognize_once()
    # Accumulators filled by the recognized-event callback below.
    all_results = []
    results = []
    transcript = []
    words = []
    
    
    #https://learn.microsoft.com/en-us/python/api/azure-cognitiveservices-speech/azure.cognitiveservices.speech.recognitionresult?view=azure-python
    def handle_final_result(evt):
        """Collect the text and highest-confidence word timings of one final result."""
        import json
        all_results.append(evt.result.text) 
        results = json.loads(evt.result.json)
        transcript.append(results['DisplayText'])
        # Index of the NBest alternative with the highest confidence score.
        confidence_list_temp = [item.get('Confidence') for item in results['NBest']]
        max_confidence_index = confidence_list_temp.index(max(confidence_list_temp))
        words.extend(results['NBest'][max_confidence_index]['Words'])
    
    
    
    done = False
    
    def stop_cb(evt):
        """Stop continuous recognition when a session-stopped/canceled event arrives."""
        print('CLOSING on {}'.format(evt))
        speech_recognizer.stop_continuous_recognition()
        global done
        done= True
        
    speech_recognizer.recognized.connect(handle_final_result) 
    #Connect callbacks to the events fired by the speech recognizer    
    speech_recognizer.recognizing.connect(lambda evt: print('RECOGNIZING: {}'.format(evt)))
    speech_recognizer.recognized.connect(lambda evt: print('RECOGNIZED: {}'.format(evt)))
    speech_recognizer.session_started.connect(lambda evt: print('SESSION STARTED: {}'.format(evt)))
    speech_recognizer.session_stopped.connect(lambda evt: print('SESSION STOPPED {}'.format(evt)))
    speech_recognizer.canceled.connect(lambda evt: print('CANCELED {}'.format(evt)))
    # stop continuous recognition on either session stopped or canceled events
    speech_recognizer.session_stopped.connect(stop_cb)
    speech_recognizer.canceled.connect(stop_cb)
    
    speech_recognizer.start_continuous_recognition()
    
    # Poll until a stop/cancel event flips `done`.
    while not done:
        time.sleep(.5)
        
    print("Printing all results:")
    print(all_results)
    
    # `words` now holds the per-word timing dicts used to build the subtitles.
    speech_to_text_response = words
    
    def convertduration(t):
        """Convert `t`, a tick count in 100-nanosecond units (the unit the
        Azure Speech service uses for word Offset/Duration), into a
        (whole_seconds, remainder_microseconds) tuple suitable for
        datetime.timedelta(0, seconds, microseconds).
        """
        milliseconds = t / 10000
        seconds = int(milliseconds / 1000)
        # The original returned the remainder in *milliseconds*, but both
        # call sites pass it as the `microseconds` argument of timedelta,
        # making sub-second timings 1000x too small; scale to microseconds.
        microseconds = int((milliseconds % 1000) * 1000)
        return seconds, microseconds
    
    
    ##-- Code to Create Subtitle --#
    
    # Caption window length in seconds.
    # NOTE(review): `bin` shadows the builtin of the same name.
    #3 Seconds
    bin = 3.0
    duration = 0 
    transcriptions = []
    transcript = ""
    index,prev=0,0
    wordstartsec,wordstartmicrosec=0,0
    # Walk the word list, appending words to `transcript` until roughly
    # `bin` seconds of audio have elapsed, then emit one srt.Subtitle
    # covering that window.
    for i in range(len(speech_to_text_response)):
        #Forms the sentence until the bin size condition is met
        transcript = transcript + " " + speech_to_text_response[i]["Word"]
        #Checks whether the elapsed duration is less than the bin size
        # `duration` is kept in 100-ns ticks; divide by 1e7 for seconds.
        if(int((duration / 10000000)) < bin): 
            wordstartsec,wordstartmicrosec=convertduration(speech_to_text_response[i]["Offset"])
            # Accumulate the offset delta between consecutive words as elapsed time.
            duration= duration+speech_to_text_response[i]["Offset"]-prev
            prev=speech_to_text_response[i]["Offset"]
                    #transcript = transcript + " " + speech_to_text_response[i]["Word"]
        else : 
            index=index+1
            #transcript = transcript + " " + speech_to_text_response[i]["Word"]
            # End time is approximated as start + bin rather than derived from
            # the last word's Offset+Duration — TODO confirm this is acceptable.
            transcriptions.append(srt.Subtitle(index, datetime.timedelta(0, wordstartsec, wordstartmicrosec), datetime.timedelta(0, wordstartsec+bin, 0), transcript))
            duration = 0 
            #print(transcript)
            transcript=""
    
    
    
    # Flush the trailing partial caption that never reached the bin-size branch.
    # NOTE(review): reuses the last `index` value; presumably srt.compose
    # renumbers entries so a duplicate index is harmless — verify.
    transcriptions.append(srt.Subtitle(index, datetime.timedelta(0, wordstartsec, wordstartmicrosec), datetime.timedelta(0, wordstartsec+bin, 0), transcript))
    subtitles = srt.compose(transcriptions)
    with open("subtitle.srt", "w") as f:
        f.write(subtitles)
    

    Attached the Output for your reference :

    Output


    Hope this Helps :)