I'm trying to transcribe a conversation audio file into text with Azure's Speech-to-Text service. I got it working with the SDK and also tried the API (following these instructions: https://github.com/Azure-Samples/cognitive-services-speech-sdk/blob/master/samples/batch/python/python-client/main.py), but I also want to split the resulting text by the different voices. Is that possible?
I know the conversation service is available in beta, but since my audio files are in Spanish, I can't use it. Is there a configuration option to split the result by speaker?
This is the call with SDK:
# Module-level list of results. No visible line in this excerpt reads or
# writes it; presumably `handle_final_result` appends to it — TODO confirm.
all_results = []
def speech_recognize_continuous_from_file(file_to_transcript):
"""performs continuous speech recognition with input from an audio file"""
# NOTE(review): this excerpt has lost its indentation and several lines, so it
# does not parse as shown. The notes below describe only what is visible.
# <SpeechContinuousRecognitionWithFile>
# NOTE(review): the SpeechConfig(...) call below is never closed here; its
# remaining arguments are missing from the excerpt.
# NOTE(review): the parameter is named `file_to_transcript`, but AudioConfig
# below reads `file_to_transcribe` — as written that name is not the
# parameter; confirm which spelling is intended.
speech_config = speechsdk.SpeechConfig(subscription=speech_key,
audio_config = speechsdk.audio.AudioConfig(filename=file_to_transcribe)
speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_config)
# Completion flag: set to True by stop_cb and polled by the `while not done`
# loop further down.
done = False
def stop_cb(evt):
"""callback that stops continuous recognition upon receiving an event `evt`"""
# NOTE(review): the visible body only prints the event and sets `done`; no
# call that stops the recognizer appears in this excerpt.
print('CLOSING on {}'.format(evt))
# `nonlocal` makes the assignment below rebind the enclosing function's `done`.
nonlocal done
done = True
# Connect callbacks to the events fired by the speech recognizer
# Each handler below only prints the event it receives.
speech_recognizer.recognized.connect(lambda evt: print('RECOGNIZED: {}'.format(evt)))
speech_recognizer.session_started.connect(lambda evt: print('SESSION STARTED: {}'.format(evt)))
speech_recognizer.session_stopped.connect(lambda evt: print('SESSION STOPPED {}'.format(evt)))
speech_recognizer.canceled.connect(lambda evt: print('CANCELED {}'.format(evt)))
# stop continuous recognition on either session stopped or canceled events
# NOTE(review): no visible line connects `stop_cb` to any event, so nothing
# shown here ever sets `done` to True.
def handle_final_result(evt):
# NOTE(review): the body of `handle_final_result` is missing from this
# excerpt, and no visible line connects it to an event.
# Start continuous speech recognition
# NOTE(review): the call that starts recognition is missing from this excerpt.
while not done:
# NOTE(review): the loop body is missing from this excerpt.
# </SpeechContinuousRecognitionWithFile>
And this with the API:
from __future__ import print_function
from typing import List
import logging
import sys
import requests
import time
import swagger_client as cris_client
# Send log records to stdout at DEBUG level, printing only the message text.
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG, format="%(message)s")
# NOTE(review): `subscription_key` is not defined in the visible lines; it
# must be defined before this assignment runs.
SUBSCRIPTION_KEY = subscription_key
# NOTE(review): HOST_NAME and PORT are not referenced by any visible line —
# confirm they are still used.
HOST_NAME = "westeurope.cris.ai"
PORT = 443
# Name, description and locale passed to TranscriptionDefinition in transcribe().
NAME = "Simple transcription"
DESCRIPTION = "Simple transcription description"
LOCALE = "es-ES"
# ADAPTED_ACOUSTIC_ID = None # guid of a custom acoustic model
# ADAPTED_LANGUAGE_ID = None # guid of a custom language model
def transcribe():
# Creates one transcription through the swagger-generated `cris_client` API
# and then checks transcription statuses until at least one is counted as
# completed.
# NOTE(review): this excerpt has lost its indentation and several lines, so it
# does not parse as shown. The notes below describe only what is visible.
logging.info("Starting transcription client...")
# configure API key authorization: subscription_key
configuration = cris_client.Configuration()
configuration.api_key['Ocp-Apim-Subscription-Key'] = SUBSCRIPTION_KEY
# create the client object and authenticate
client = cris_client.ApiClient(configuration)
# create an instance of the transcription api class
transcription_api = cris_client.CustomSpeechTranscriptionsApi(api_client=client)
# get all transcriptions for the subscription
transcriptions: List[cris_client.Transcription] = transcription_api.get_transcriptions()
logging.info("Deleting all existing completed transcriptions.")
# delete all pre-existing completed transcriptions
# if transcriptions are still running or not started, they will not be deleted
# NOTE(review): the body of this loop (the actual deletion) is missing from
# the excerpt.
for transcription in transcriptions:
logging.info("Creating transcriptions.")
# transcription definition using custom models
# transcription_definition = cris_client.TranscriptionDefinition(
# name=NAME, description=DESCRIPTION, locale=LOCALE, recordings_url=RECORDINGS_BLOB_URI,
# models=[cris_client.ModelIdentity(ADAPTED_ACOUSTIC_ID), cris_client.ModelIdentity(ADAPTED_LANGUAGE_ID)]
# )
# comment out the previous statement and uncomment the following to use base models for transcription
# NOTE(review): `RECORDINGS_BLOB_URI` is not defined in the visible lines, and
# the TranscriptionDefinition(...) call below is never closed in this excerpt.
transcription_definition = cris_client.TranscriptionDefinition(
name=NAME, description=DESCRIPTION, locale=LOCALE, recordings_url=RECORDINGS_BLOB_URI
# Only the response headers are used below; `data` and `status` are not read
# in the visible lines.
data, status, headers = transcription_api.create_transcription_with_http_info(transcription_definition)
# extract transcription location from the headers
transcription_location: str = headers["location"]
# get the transcription Id from the location URI
# NOTE(review): no visible line derives an id from `transcription_location`
# or adds anything to `created_transcriptions`, so the list stays empty here.
created_transcriptions = list()
logging.info("Checking status.")
completed, running, not_started = 0, 0, 0
# Loop until at least one transcription has been counted as completed.
while completed < 1:
# get all transcriptions for the user
transcriptions: List[cris_client.Transcription] = transcription_api.get_transcriptions()
# for each transcription in the list we check the status
for transcription in transcriptions:
if transcription.status == "Failed" or transcription.status == "Succeeded":
# we check to see if it was one of the transcriptions we created from this client
# NOTE(review): the body of the `not in` check below is missing from the
# excerpt, so how foreign transcriptions are handled cannot be seen here.
if transcription.id not in created_transcriptions:
completed += 1
if transcription.status == "Succeeded":
# Only the "channel_0" result URL is read; other entries of `results_urls`
# are not looked at in the visible lines.
results_uri = transcription.results_urls["channel_0"]
results = requests.get(results_uri)
# NOTE(review): `results` is fetched but never logged or returned in the
# visible lines.
logging.info("Transcription succeeded. Results: ")
elif transcription.status == "Running":
running += 1
elif transcription.status == "NotStarted":
not_started += 1
logging.info(f"Transcriptions status: {completed} completed, {running} running, {not_started} not started yet")
# wait for 5 seconds
# NOTE(review): no sleep call is visible after the comment above; the next
# statement is input(), which blocks until a line is entered.
input("Press any key...")
# NOTE(review): the bodies of `main` and of the `__main__` guard are missing
# from this excerpt; presumably `main` calls transcribe() and the guard calls
# main() — TODO confirm against the original script.
def main():
if __name__ == "__main__":
I also want to split the result text by the different voices.
The transcript you receive does not contain any notion of speaker. Here you are just calling an endpoint that performs transcription; there is no speaker recognition feature in it.
Two things:
- … channels)
- Speaker Recognition API (doc here) to do this identification, but: …

As you mentioned, the Speech SDK's ConversationTranscriber API (doc here) is currently limited to en-US and zh-CN.