javascript, azure, speech-recognition, speech-to-text, azure-speech

Azure Pronunciation Assessment Could not deserialize speech context error


I am trying to implement a pronunciation assessment system using Azure's JS SDK (see doc).

I get the following error in console:

"Could not deserialize speech context. websocket error code: 1007"

Here is my implementation:

assessPronunciation(fileUrl) {
    const speechConfig = window.SpeechSDK.SpeechConfig.fromSubscription("xxx", "westeurope");
    speechConfig.speechRecognitionLanguage = "en-GB";

    // Fetch the WAV file and create an AudioConfig
    fetch(fileUrl)
      .then(response => response.blob())
      .then(blob => {
        // Convert the blob to a File object
        const file = new File([blob], "audio.wav", { type: "audio/wav" });

        // Create an AudioConfig using the File object
        const audioConfig = window.SpeechSDK.AudioConfig.fromWavFileInput(file);

        var pronunciationAssessmentConfig = new window.SpeechSDK.PronunciationAssessmentConfig({
          referenceText: "Hello this is a test",
          gradingSystem: "HundredMark",
          granularity: "Phoneme"
        });

        var speechRecognizer = new window.SpeechSDK.SpeechRecognizer(speechConfig, audioConfig);

        pronunciationAssessmentConfig.applyTo(speechRecognizer);

        speechRecognizer.sessionStarted = (s, e) => {
          console.log(`SESSION ID: ${e.sessionId}`);
        };
        
        speechRecognizer.recognizeOnceAsync(
          function(speechRecognitionResult) {
            if (speechRecognitionResult.reason === window.SpeechSDK.ResultReason.RecognizedSpeech) {
              // The pronunciation assessment result as a Speech SDK object
              var pronunciationAssessmentResult = window.SpeechSDK.PronunciationAssessmentResult.fromResult(speechRecognitionResult);
              console.log("pronunciationAssessmentResult", pronunciationAssessmentResult);
          
              // The pronunciation assessment result as a JSON string
              var pronunciationAssessmentResultJson = speechRecognitionResult.properties.getProperty(window.SpeechSDK.PropertyId.SpeechServiceResponse_JsonResult);
              console.log("pronunciationAssessmentResultJson", pronunciationAssessmentResultJson);
            } else {
              console.error("Speech not recognized. Reason:", speechRecognitionResult);
            }
          },
          function(error) {
            console.error("Error during recognition:", error);
            if (error instanceof window.SpeechSDK.SpeechRecognitionCanceledEventArgs) {
              console.error("Recognition canceled. Reason:", error.reason);
              console.error("Error details:", error.errorDetails);
            }
          }
        );
      })
      .catch(error => {
        console.error("Error fetching WAV file:", error);
      });
  }

I checked the recording (fileUrl) and it is a valid WAV file, as expected.

Recording configuration:

startRecording(event) {
    event.preventDefault();
    if (navigator.mediaDevices && navigator.mediaDevices.getUserMedia) {
      navigator.mediaDevices.getUserMedia({ audio: true }).then(stream => {
        this.recorder = new RecordRTC(stream, {
          type: 'audio',
          mimeType: 'audio/wav',
          recorderType: RecordRTC.StereoAudioRecorder,
          desiredSampRate: 16000,
          numberOfAudioChannels: 1,
          audioBitsPerSecond: 128000
        });
        this.startRecorder(event);
      }).catch((error) => {
        console.log("The following error occurred: " + error);
        alert("Please grant permission for microphone access");
      });
    } else {
      alert("Your browser does not support audio recording, please use a different browser or update your current browser");
    }
  }
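For context, fileUrl comes from stopping the recorder and wrapping the resulting blob in an object URL, roughly like this (a simplified sketch using RecordRTC's standard stopRecording/getBlob API; the actual stop handler is not shown above):

stopRecording(event) {
    event.preventDefault();
    // stopRecording() flushes the recorder, then getBlob() returns
    // the finished 16 kHz mono WAV per the configuration above
    this.recorder.stopRecording(() => {
      const blob = this.recorder.getBlob();
      const fileUrl = URL.createObjectURL(blob);
      this.assessPronunciation(fileUrl);
    });
  }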

Any idea what the issue is? Thanks.

SOLUTION

Two changes fixed it: feed the audio in through a push stream, and pass the pronunciation assessment options as individual positional parameters (using the SDK enums) rather than a JSON object:

var audioConfig = window.SpeechSDK.AudioConfig.fromStreamInput(pushStream);

var pronunciationAssessmentConfig = new window.SpeechSDK.PronunciationAssessmentConfig(
    "My voice is my passport, verify me.",
    window.SpeechSDK.PronunciationAssessmentGradingSystem.HundredMark,
    window.SpeechSDK.PronunciationAssessmentGranularity.Phoneme
);
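Applied to the original browser code, the fix looks roughly like this (a sketch reusing speechConfig and fileUrl from the question; the key and region are placeholders):

fetch(fileUrl)
  .then(response => response.arrayBuffer())
  .then(arrayBuffer => {
    // Feed the raw WAV bytes through a push stream instead of a File object
    const pushStream = window.SpeechSDK.AudioInputStream.createPushStream();
    pushStream.write(arrayBuffer);
    pushStream.close();
    const audioConfig = window.SpeechSDK.AudioConfig.fromStreamInput(pushStream);

    // Individual positional parameters with the SDK enums, not a JSON object
    const pronunciationAssessmentConfig = new window.SpeechSDK.PronunciationAssessmentConfig(
      "Hello this is a test",
      window.SpeechSDK.PronunciationAssessmentGradingSystem.HundredMark,
      window.SpeechSDK.PronunciationAssessmentGranularity.Phoneme
    );

    const speechRecognizer = new window.SpeechSDK.SpeechRecognizer(speechConfig, audioConfig);
    pronunciationAssessmentConfig.applyTo(speechRecognizer);

    speechRecognizer.recognizeOnceAsync(
      result => console.log(result.properties.getProperty(window.SpeechSDK.PropertyId.SpeechServiceResponse_JsonResult)),
      error => console.error("Error during recognition:", error)
    );
  });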

Solution

  • Try this code block out:

    var sdk = require("microsoft-cognitiveservices-speech-sdk");
    var fs = require("fs");
    
    // not supported in node
    // const audioConfig = sdk.AudioConfig.fromWavFileInput('myVoiceIsMyPassportVerifyMe01.wav');
    
    // workaround
    var filename = "myVoiceIsMyPassportVerifyMe01.wav"; // 16000 Hz, Mono
    var pushStream = sdk.AudioInputStream.createPushStream();
    fs.createReadStream(filename).on('data', function (arrayBuffer) {
        pushStream.write(arrayBuffer.slice());
    }).on('end', function () {
        pushStream.close();
    });
    var audioConfig = sdk.AudioConfig.fromStreamInput(pushStream);
    
    
    const conf = sdk.SpeechConfig.fromSubscription(
        'xxxx',
        'eastus'
    );
    conf.speechRecognitionLanguage = "en-GB";
    
    
    var speechRecognizer = new sdk.SpeechRecognizer(conf, audioConfig);
    var pronunciationAssessmentConfig = new sdk.PronunciationAssessmentConfig(
        "My voice is my passport, verify me.",
        sdk.PronunciationAssessmentGradingSystem.HundredMark,
        sdk.PronunciationAssessmentGranularity.Phoneme
    );
    pronunciationAssessmentConfig.applyTo(speechRecognizer);
    
    speechRecognizer.sessionStarted = (s, e) => {
        console.log('SESSION ID:'+ e.sessionId);
    };
    
    speechRecognizer.recognizeOnceAsync(
        function (speechRecognitionResult) {
            // console.log("speechRecognitionResult:", speechRecognitionResult);
            if (speechRecognitionResult.reason === sdk.ResultReason.RecognizedSpeech) {
                // The pronunciation assessment result as a Speech SDK object
                var pronunciationAssessmentResult = sdk.PronunciationAssessmentResult.fromResult(speechRecognitionResult);
                console.log("pronunciationAssessmentResult", pronunciationAssessmentResult);
    
                // The pronunciation assessment result as a JSON string
                var pronunciationAssessmentResultJson = speechRecognitionResult.properties.getProperty(sdk.PropertyId.SpeechServiceResponse_JsonResult);
                console.log("pronunciationAssessmentResultJson", pronunciationAssessmentResultJson);
            } else {
                console.error("Speech not recognized. Reason:", speechRecognitionResult);
            }
        },
        function (error) {
            console.error("Error during recognition:", error);
            if (error instanceof sdk.SpeechRecognitionCanceledEventArgs) {
                console.error("Recognition canceled. Reason:", error.reason);
                console.error("Error details:", error.errorDetails);
            }
        }
    );
    

    A few catches:

    1. AudioConfig.fromWavFileInput might not be supported in Node. I just used the workaround mentioned in this link, and it worked: https://github.com/Azure-Samples/cognitive-services-speech-sdk/issues/813

    2. The PronunciationAssessmentConfig options need to be passed as individual parameter values, not a JSON object.

    3. I used a sample WAV from here; you can swap in your own: https://github.com/Azure-Samples/cognitive-services-speech-sdk/blob/master/sampledata/audiofiles/myVoiceIsMyPassportVerifyMe01.wav
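
    Once recognition succeeds, the individual scores can be read straight off the result object. A minimal sketch (property names per the JS SDK's PronunciationAssessmentResult; pronunciationAssessmentResult comes from the success callback above):

    var pa = pronunciationAssessmentResult;
    console.log("Accuracy:", pa.accuracyScore);
    console.log("Fluency:", pa.fluencyScore);
    console.log("Completeness:", pa.completenessScore);
    console.log("Overall pronunciation:", pa.pronunciationScore);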
