Search code examples
pythonazurespeech-recognitionspeech-to-textsubtitle

Subtitles/captions with Microsoft Azure Speech-to-text in Python


I've been trying to figure out how to make subtitles with Microsoft Azure Speech Recognition service in Python, but can't figure it out. I've followed the tips someone else has answered here on getting the individual words, but even formatting those to .srt or .vtt seems convoluted. Here's the code:

import azure.cognitiveservices.speech as speechsdk


def speech_recognize_continuous_from_file():
    """Perform continuous speech recognition on an audio file and collect
    word-level timing data.

    Populates two lists while recognition runs:
      - transcript: the DisplayText of each finalized utterance
      - words: word dicts ({'Word', 'Offset', 'Duration'}) taken from the
        highest-confidence NBest alternative of each utterance

    Writes the recognized utterances (one per line) to 'Azure_Raw.txt'
    when recognition stops.
    """
    import json
    import time

    # <SpeechContinuousRecognitionWithFile>
    speech_key, service_region = "{api-key}", "{service-region}"
    speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)

    audio_filename = "{for example: video.wav}"
    audio_config = speechsdk.audio.AudioConfig(filename=audio_filename)

    speech_config.speech_recognition_language = "en-US"
    # Word-level timestamps are required to get per-word Offset/Duration
    # in the JSON result.
    speech_config.request_word_level_timestamps()

    speech_config.enable_dictation()
    # OutputFormat(1) selects the detailed format, which carries the NBest list.
    speech_config.output_format = speechsdk.OutputFormat(1)

    speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_config)

    done = False
    transcript = []
    words = []

    def handle_final_result(evt):
        """Collect display text and best-alternative word timings from one final result."""
        result = json.loads(evt.result.json)
        transcript.append(result['DisplayText'])
        # Pick the NBest alternative with the highest confidence score.
        confidences = [item.get('Confidence') for item in result['NBest']]
        best_index = confidences.index(max(confidences))
        words.extend(result['NBest'][best_index]['Words'])

    def stop_cb(evt):
        """callback that stops continuous recognition upon receiving an event `evt`"""
        print('CLOSING on {}'.format(evt))
        speech_recognizer.stop_continuous_recognition()
        nonlocal done
        done = True
        print("Transcript display list:\n")
        print(transcript)
        print("\nWords\n")
        print(words)
        print("\n")

    speech_recognizer.recognized.connect(handle_final_result)
    # Connect callbacks to the events fired by the speech recognizer
    speech_recognizer.recognizing.connect(lambda evt: format(evt))
    speech_recognizer.recognized.connect(lambda evt: format(evt))
    speech_recognizer.session_started.connect(lambda evt: format(evt))
    speech_recognizer.session_stopped.connect(lambda evt: format(evt))
    speech_recognizer.canceled.connect(lambda evt: format(evt))
    # stop continuous recognition on either session stopped or canceled events
    speech_recognizer.session_stopped.connect(stop_cb)
    speech_recognizer.canceled.connect(stop_cb)

    # Start continuous speech recognition and poll until a stop event fires.
    speech_recognizer.start_continuous_recognition()
    while not done:
        time.sleep(.5)

    # Write the recognized utterances (was: '\n'.join(results), which joined
    # an always-empty list because the callback shadowed the outer name).
    with open('Azure_Raw.txt', 'w') as f:
        f.write('\n'.join(transcript))

# Run the recognizer defined above (the original called
# sample_long_running_recognize(storage_uri), a name that doesn't exist here).
speech_recognize_continuous_from_file()

Only other "tutorial" I found on subtitles was a Google Cloud one, and that gives the results I am looking for (Yes, I've tested it myself), but Azure obviously doesn't work at all like G-cloud: https://medium.com/searce/generate-srt-file-subtitles-using-google-clouds-speech-to-text-api-402b2f1da3bd

So basically: How do I get like 3 seconds of speech-text into an .srt format, like this:

1
00:00:00,000 --> 00:00:03,000
This is the first sentence that

2
00:00:03,000 --> 00:00:06,000
continues after 3 seconds or so

Solution

  • If you look closely at the JSON output of the Azure Speech service, you will see that it is structured slightly differently from the output of other services.

    for the mentioned configuration the output looks like below after you take the best match

    [{'Duration': 3900000, 'Offset': 500000, 'Word': "what's"},
     {'Duration': 1300000, 'Offset': 4500000, 'Word': 'the'},
     {'Duration': 2900000, 'Offset': 5900000, 'Word': 'weather'},
     {'Duration': 4800000, 'Offset': 8900000, 'Word': 'like'}]
    

    Each word entry has three fields - Word, Duration & Offset

    • Duration - the length of time for which the word is spoken, measured in 100-nanosecond units
    • Offset - the time from the start of the audio to the start of the word, measured in 100-nanosecond units

    You will have to make use of this in order to frame your timeline

    import azure.cognitiveservices.speech as speechsdk
    import os
    import time
    import pprint
    import json
    import srt
    import datetime
    
     
    path = os.getcwd()
    # Creates an instance of a speech config with specified subscription key and service region.
    # Replace with your own subscription key and region identifier from here: https://aka.ms/speech/sdkregion
    speech_key, service_region = "<>", "<>"
    speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)
    
    # Creates an audio configuration that points to an audio file.
    # Replace with your own audio filename.
    audio_filename = "sample.wav"
    audio_input = speechsdk.audio.AudioConfig(filename=audio_filename)
    
    # Creates a recognizer with the given settings
    speech_config.speech_recognition_language="en-US"
    # Word-level timestamps are required so each recognized word carries
    # its own Offset/Duration in the JSON result.
    speech_config.request_word_level_timestamps()
    
    
    
    speech_config.enable_dictation()
    # OutputFormat(1) selects the detailed output format, which includes
    # the NBest list used below.
    speech_config.output_format = speechsdk.OutputFormat(1)
    
    speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_input)
    
    #result = speech_recognizer.recognize_once()
    # Accumulators filled by the recognized-event callback below.
    all_results = []
    results = []
    transcript = []
    words = []
    
    
    #https://learn.microsoft.com/en-us/python/api/azure-cognitiveservices-speech/azure.cognitiveservices.speech.recognitionresult?view=azure-python
    def handle_final_result(evt):
        """Collect the text and highest-confidence word timings of one final result."""
        import json
        all_results.append(evt.result.text) 
        results = json.loads(evt.result.json)
        transcript.append(results['DisplayText'])
        # Index of the NBest alternative with the highest confidence score.
        confidence_list_temp = [item.get('Confidence') for item in results['NBest']]
        max_confidence_index = confidence_list_temp.index(max(confidence_list_temp))
        words.extend(results['NBest'][max_confidence_index]['Words'])
    
    
    
    done = False
    
    def stop_cb(evt):
        """Stop continuous recognition when a session-stopped/canceled event arrives."""
        print('CLOSING on {}'.format(evt))
        speech_recognizer.stop_continuous_recognition()
        global done
        done= True
        
    speech_recognizer.recognized.connect(handle_final_result) 
    #Connect callbacks to the events fired by the speech recognizer    
    speech_recognizer.recognizing.connect(lambda evt: print('RECOGNIZING: {}'.format(evt)))
    speech_recognizer.recognized.connect(lambda evt: print('RECOGNIZED: {}'.format(evt)))
    speech_recognizer.session_started.connect(lambda evt: print('SESSION STARTED: {}'.format(evt)))
    speech_recognizer.session_stopped.connect(lambda evt: print('SESSION STOPPED {}'.format(evt)))
    speech_recognizer.canceled.connect(lambda evt: print('CANCELED {}'.format(evt)))
    # stop continuous recognition on either session stopped or canceled events
    speech_recognizer.session_stopped.connect(stop_cb)
    speech_recognizer.canceled.connect(stop_cb)
    
    speech_recognizer.start_continuous_recognition()
    
    # Poll until a stop/cancel event flips `done`.
    while not done:
        time.sleep(.5)
        
    print("Printing all results:")
    print(all_results)
    
    # `words` now holds the per-word timing dicts used to build the subtitles.
    speech_to_text_response = words
    
    def convertduration(t):
        """Convert `t`, a tick count in 100-nanosecond units (the unit the
        Azure Speech service uses for word Offset/Duration), into a
        (whole_seconds, remainder_microseconds) tuple suitable for
        datetime.timedelta(0, seconds, microseconds).
        """
        milliseconds = t / 10000
        seconds = int(milliseconds / 1000)
        # The original returned the remainder in *milliseconds*, but both
        # call sites pass it as the `microseconds` argument of timedelta,
        # making sub-second timings 1000x too small; scale to microseconds.
        microseconds = int((milliseconds % 1000) * 1000)
        return seconds, microseconds
    
    
    ##-- Code to Create Subtitle --#
    
    # Caption window length in seconds.
    # NOTE(review): `bin` shadows the builtin of the same name.
    #3 Seconds
    bin = 3.0
    duration = 0 
    transcriptions = []
    transcript = ""
    index,prev=0,0
    wordstartsec,wordstartmicrosec=0,0
    # Walk the word list, appending words to `transcript` until roughly
    # `bin` seconds of audio have elapsed, then emit one srt.Subtitle
    # covering that window.
    for i in range(len(speech_to_text_response)):
        #Forms the sentence until the bin size condition is met
        transcript = transcript + " " + speech_to_text_response[i]["Word"]
        #Checks whether the elapsed duration is less than the bin size
        # `duration` is kept in 100-ns ticks; divide by 1e7 for seconds.
        if(int((duration / 10000000)) < bin): 
            wordstartsec,wordstartmicrosec=convertduration(speech_to_text_response[i]["Offset"])
            # Accumulate the offset delta between consecutive words as elapsed time.
            duration= duration+speech_to_text_response[i]["Offset"]-prev
            prev=speech_to_text_response[i]["Offset"]
                    #transcript = transcript + " " + speech_to_text_response[i]["Word"]
        else : 
            index=index+1
            #transcript = transcript + " " + speech_to_text_response[i]["Word"]
            # End time is approximated as start + bin rather than derived from
            # the last word's Offset+Duration — TODO confirm this is acceptable.
            transcriptions.append(srt.Subtitle(index, datetime.timedelta(0, wordstartsec, wordstartmicrosec), datetime.timedelta(0, wordstartsec+bin, 0), transcript))
            duration = 0 
            #print(transcript)
            transcript=""
    
    
    
    # Flush the trailing partial caption that never reached the bin-size branch.
    # NOTE(review): reuses the last `index` value; presumably srt.compose
    # renumbers entries so a duplicate index is harmless — verify.
    transcriptions.append(srt.Subtitle(index, datetime.timedelta(0, wordstartsec, wordstartmicrosec), datetime.timedelta(0, wordstartsec+bin, 0), transcript))
    subtitles = srt.compose(transcriptions)
    with open("subtitle.srt", "w") as f:
        f.write(subtitles)
    

    Attached the Output for your reference :

    Output


    Hope this Helps :)