Search code examples
c#winformsspeech-recognitionspeech-to-text

Can I retrieve the confidence of the recognized text from Azure Cognitive Services?


I am making a speech-to-text app as a C# Windows Forms application. It was working fine and running in Visual Studio.

I'm using the code below to recognize speech using Microsoft Azure Cognitive Services. Once the whole utterance has been recognized, can I get a confidence score in my C# Windows Forms app?

How can I solve this?

My code:

using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using System.Windows.Forms;
using Microsoft.CognitiveServices.Speech;
using Microsoft.CognitiveServices.Speech.Audio;
using System.IO;
using System.Threading;

namespace WindowsFormsApp2
{
    /// <summary>
    /// Main form: continuously recognizes speech via the Azure Cognitive Services
    /// Speech SDK and types the recognized text into the focused window with SendKeys.
    /// Minimizes to the system tray instead of closing.
    /// </summary>
    public partial class Form1 : Form
    {
        // True while continuous recognition is running (toggled by Startstop).
        private bool isRecognizing = false;
        private SpeechRecognizer recognizer;


        public Form1()
        {
            InitializeComponent();
        }

        /// <summary>
        /// (Re)creates the speech recognizer from the current settings.
        /// Called on load and again whenever the settings form closes.
        /// </summary>
        private void initRecognizer()
        {
            // Release the previous recognizer before replacing it; otherwise each
            // settings change leaks the old instance and its event subscription.
            if (recognizer != null)
            {
                recognizer.Recognized -= SpeechRecognizer_Recognized;
                recognizer.Dispose();
                recognizer = null;
            }

            SpeechConfig config = SpeechConfig.FromSubscription("key", "region");
            if (Properties.Settings.Default.Punctuation)
            {
                // Ask the service to honor explicitly spoken punctuation ("comma", "period", ...).
                config.SetServiceProperty("punctuation", "explicit", ServicePropertyChannel.UriQueryParameter);
            }
            //AudioConfig audioConfig = AudioConfig.FromMicrophoneInput();
            recognizer = new SpeechRecognizer(config/*, audioConfig*/);
            recognizer.Recognized += SpeechRecognizer_Recognized;
        }

        private void Form1_Load(object sender, EventArgs e)
        {
            initRecognizer();
        }

        /// <summary>
        /// Fired for each final recognition result. Types the text into the active
        /// window; the phrases "new line"/"newline" are translated to a newline.
        /// </summary>
        private void SpeechRecognizer_Recognized(object sender, SpeechRecognitionEventArgs e)
        {
            if (e.Result.Reason == ResultReason.RecognizedSpeech)
            {
                string text = e.Result.Text;
                // Ordinal, case-insensitive compare instead of ToLower().Equals():
                // culture-independent and allocation-free.
                if (text.Equals("new line", StringComparison.OrdinalIgnoreCase)
                    || text.Equals("newline", StringComparison.OrdinalIgnoreCase))
                {
                    SendKeys.SendWait(Environment.NewLine);
                }
                else
                {
                    SendKeys.SendWait(text);
                }
            }
        }

        /// <summary>
        /// Toggles continuous recognition and updates the start/stop UI
        /// (button image, menu text, blink timer).
        /// </summary>
        private void Startstop()
        {
            if (isRecognizing)
            {
                // Fire-and-forget: the SDK call is async, but the UI state is
                // updated immediately so the button stays responsive.
                recognizer.StopContinuousRecognitionAsync();
                picture_btn.Image = Properties.Resources.green;
                startToolStripMenuItem.Text = "Start";
                pictureBox1.Enabled = true;
                isRecognizing = false;
                timer1.Stop();
                timer1.Enabled = false;
            }
            else
            {
                picture_btn.Image = Properties.Resources.red;
                startToolStripMenuItem.Text = "Stop";
                pictureBox1.Enabled = false;
                recognizer.StartContinuousRecognitionAsync();
                isRecognizing = true;
                timer1.Interval = 600;   // blink period in ms
                timer1.Start();
                timer1.Enabled = true;
            }
        }

        private void pictureBox1_Click(object sender, EventArgs e)
        {
            Startstop();
        }

        private void Form1_Move(object sender, EventArgs e)
        {
            // NOTE(review): this hides the form on every move while it is in the
            // Normal state. The usual minimize-to-tray pattern checks
            // FormWindowState.Minimized here — confirm the intended behavior.
            if (this.WindowState == FormWindowState.Normal)
            {
                ShowInTaskbar = true;
                notifyIcon1.Visible = true;
                this.Hide();
                notifyIcon1.ShowBalloonTip(1000);
            }
        }

        private void Form1_MouseDoubleClick(object sender, MouseEventArgs e)
        {
            // Restore from the tray (duplicated assignments removed).
            ShowInTaskbar = true;
            notifyIcon1.Visible = false;
            WindowState = FormWindowState.Normal;
        }

        private void exitToolStripMenuItem_Click(object sender, EventArgs e)
        {
            Application.Exit();
        }

        /// <summary>
        /// Rebuilds the recognizer when the settings form closes so new
        /// settings (e.g. punctuation) take effect.
        /// </summary>
        void SettingFormClosed(object sender, FormClosedEventArgs e)
        {
            initRecognizer();
        }

        private void startToolStripMenuItem_Click(object sender, EventArgs e)
        {
            Startstop();
        }

        /// <summary>
        /// Blinks the record indicator between red and grey while recognizing.
        /// </summary>
        private void timer1_Tick(object sender, EventArgs e)
        {
            // "red".Equals(tag) is null-safe: Tag may be null until first assigned
            // here, whereas picture_btn.Tag.Equals("red") would throw.
            if ("red".Equals(picture_btn.Tag))
            {
                picture_btn.Image = Properties.Resources.grey;
                picture_btn.Tag = "grey";
            }
            else
            {
                picture_btn.Image = Properties.Resources.red;
                picture_btn.Tag = "red";
            }
        }

        private void pictureBox1_Click_1(object sender, EventArgs e)
        {
            var myForm = new Form2();
            myForm.FormClosed += SettingFormClosed;
            myForm.Show();
        }

        private void notifyIcon1_MouseDoubleClick(object sender, MouseEventArgs e)
        {
            this.Show();
        }

        private void Form1_FormClosing(object sender, FormClosingEventArgs e)
        {
            // Closing the window hides to the tray instead of exiting; the app
            // still exits via the Exit menu item (Application.Exit bypasses this
            // because CloseReason is not UserClosing).
            if (e.CloseReason == CloseReason.UserClosing)
            {
                notifyIcon1.Visible = true;
                this.Hide();
                e.Cancel = true;
            }
        }
    }
}

Solution

  • You should always read through the documentation of a service before deep diving into it. The documentation covers important configuration aspects and details the service's limitations.

    Azure - Cognitive Services - Speech Service - Speech-to-text documentation
    This is the landing page for all the standard resources for the speech-to-text service, read through all these to understand how the service was designed to work and to gain code examples for common scenarios.

    Step 1 - Config

    You need to configure your SpeechRecognizer instance to return Detailed output:

    // Build the speech config and request the Detailed output format, which
    // makes per-result confidence scores available via e.Result.Best().
    SpeechConfig config = SpeechConfig.FromSubscription("key", "region");
    // Detailed output will include confidence factor
    config.OutputFormat = OutputFormat.Detailed;
    
    if (Properties.Settings.Default.Punctuation)
    {
        config.SetServiceProperty("punctuation", "explicit", ServicePropertyChannel.UriQueryParameter);
    }
    
    

    Step 2 - Access Details from the Best() extension method

    This is documented in this section: Recognized offset and duration however the general idea is that we call the e.Result.Best() extension method to retrieve the details about the results.

    Make sure you have the following using statement to make the Best() method available:

     using Microsoft.CognitiveServices.Speech;
    
    // Fired for each final recognition result. Types the text and logs the
    // detailed (confidence-bearing) results. Requires OutputFormat.Detailed.
    private void SpeechRecognizer_Recognized(object sender, SpeechRecognitionEventArgs e)
    {
        if (e.Result.Reason == ResultReason.RecognizedSpeech)
        {
            if (e.Result.Text.ToLower().Equals("new line") || e.Result.Text.ToLower().Equals("newline"))
                SendKeys.SendWait(Environment.NewLine);
            else
                SendKeys.SendWait(e.Result.Text);

            // Get the detailed results
            var detailedResults = e.Result.Best();
            if (detailedResults != null && detailedResults.Any())
            {
                // The first item in detailedResults corresponds to the recognized text.
                // This is not necessarily the item with the highest confidence number.
                // (First() is safe here: Any() has already been checked, so the
                // redundant "?.ToList()[0]" from the original is unnecessary.)
                var bestResults = detailedResults.First();
                Console.WriteLine(String.Format("\tConfidence: {0}\n\tText: {1}\n\tLexicalForm: {2}\n\tNormalizedForm: {3}\n\tMaskedNormalizedForm: {4}",
                    bestResults.Confidence,
                    bestResults.Text,
                    bestResults.LexicalForm,
                    bestResults.NormalizedForm,
                    bestResults.MaskedNormalizedForm));
            }
        }   // <- this closing brace was missing in the original snippet
    }
    

    As an example, you can get this level of information back from the service, I have also enabled the config.RequestWordLevelTimestamps() to get word-level timestamps:

    example results from speech to text service