Search code examples
c#azurespeech-to-text

Azure timestamps not appearing in speech to text model?


Timestamps are not appearing in my results when I run my speech-to-text Azure model. I'm not getting any errors, but also not getting timestamped results. My code is:

using System;
using System.Threading.Tasks;
using Microsoft.CognitiveServices.Speech;
using Microsoft.CognitiveServices.Speech.Audio;

namespace SUPRA
{
internal class NewBaseType
{
    static async Task Main(string[] args)

    {
        // Creates an instance of a speech config with specified subscription key and region.
        // Replace with your own subscription key and service region (e.g., "westus").
        var config = SpeechConfig.FromSubscription("8ec6730993d54cf9a9cec0f5d08b8e8b", "eastus");

        // Generates timestamps
        config.OutputFormat = OutputFormat.Detailed;
        config.RequestWordLevelTimestamps();

        //calls the audio file
        using (var audioInput = AudioConfig.FromWavFileInput("C:/Users/MichaelSchwartz/source/repos/AI-102-Process-Speech-master/transcribe_speech_to_text/media/narration.wav"))

        // Creates a speech recognizer from microphone.
        using (var recognizer = new SpeechRecognizer(config, audioInput))
        {

            recognizer.Recognized += (s, e) =>
            {
                var result = e.Result;
                if (result.Reason == ResultReason.RecognizedSpeech)
                {
                    Console.WriteLine(result.Text);
                }
            };

            recognizer.Recognized += (s, e) => 
            { 
                var j = e.Result.Properties.GetProperty(PropertyId.SpeechServiceResponse_JsonResult);
            };

            recognizer.Canceled += (s, e) =>
            {
                Console.WriteLine($"\n    Canceled. Reason: {e.Reason.ToString()}, CanceledReason: {e.Reason}");
            };

            recognizer.SessionStarted += (s, e) =>
            {
                Console.WriteLine("\n    Session started event.");
            };

            recognizer.SessionStopped += (s, e) =>
            {
                Console.WriteLine("\n    Session stopped event.");
            };

            // Starts continuous recognition. 
            // Uses StopContinuousRecognitionAsync() to stop recognition.
            await recognizer.StartContinuousRecognitionAsync().ConfigureAwait(false);

            do
            {
                Console.WriteLine("Press Enter to stop");
            } while (Console.ReadKey().Key != ConsoleKey.Enter);

            // Stops recognition.
            await recognizer.StopContinuousRecognitionAsync().ConfigureAwait(false);
        }
    }
}

}

No errors are returned and the results are accurate but without timestamps. I've included the code to produce timestamps in lines 37-40. How do I get timestamps to generate? Thanks.


Solution

  • You configured correctly but seems you haven't print the result in the console. Just try the code below:

    using Microsoft.CognitiveServices.Speech;
    using Microsoft.CognitiveServices.Speech.Audio;
    using System;
    using System.Threading.Tasks;
    
    namespace STTwithTime
    {
        class Program
        {
    
    
            static void Main(string[] args)
            {
                var key = "";
                var region = "";
                var audioFilePath = @"";
                var speechConfig = SpeechConfig.FromSubscription(key, region);
    
                // Generates timestamps
                speechConfig.RequestWordLevelTimestamps();
                speechConfig.OutputFormat = OutputFormat.Detailed;
    
                var stopRecognition = new TaskCompletionSource<int>();
    
                var audioConfig = AudioConfig.FromWavFileInput(audioFilePath);
                var recognizer = new SpeechRecognizer(speechConfig, audioConfig);
    
                //Display Recognizing
                recognizer.Recognizing += (s, e) =>
                {
                    Console.WriteLine($"RECOGNIZING:{e.Result.Properties.GetProperty(PropertyId.SpeechServiceResponse_JsonResult)}");
                };
    
                //Display Recognized
                recognizer.Recognized += (s, e) =>
                {
                    if (e.Result.Reason == ResultReason.RecognizedSpeech)
                    {
                        Console.WriteLine($"RECOGNIZED :{e.Result.Properties.GetProperty(PropertyId.SpeechServiceResponse_JsonResult)}");
                    }
                    else if (e.Result.Reason == ResultReason.NoMatch)
                    {
                        Console.WriteLine($"NOMATCH: Speech could not be recognized.");
                    }
                };
    
                recognizer.Canceled += (s, e) =>
                {
                    Console.WriteLine($"CANCELED: Reason={e.Reason}");
    
                    if (e.Reason == CancellationReason.Error)
                    {
                        Console.WriteLine($"CANCELED: ErrorCode={e.ErrorCode}");
                        Console.WriteLine($"CANCELED: ErrorDetails={e.ErrorDetails}");
                        Console.WriteLine($"CANCELED: Did you update the subscription info?");
                    }
    
                    stopRecognition.TrySetResult(0);
                };
    
                recognizer.SessionStopped += (s, e) =>
                {
                    Console.WriteLine("\n    Session stopped event.");
                    stopRecognition.TrySetResult(0);
                };
    
                recognizer.StartContinuousRecognitionAsync().GetAwaiter().GetResult();
    
                // Waits for completion. Use Task.WaitAny to keep the task rooted.
                Task.WaitAny(new[] { stopRecognition.Task });
    
    
            }
        }
    }
    

    Result

    Display recognizing:

    enter image description here

    Display recognized:

    enter image description here