javascript, azure, speech-recognition, speech-to-text, azure-speech

Azure Pronunciation Assessment Could not deserialize speech context error


I am trying to implement a pronunciation assessment system using Azure's JS SDK (see doc).

I get the following error in console:

"Could not deserialize speech context. websocket error code: 1007"

Here is my implementation:

assessPronunciation(fileUrl) {
    const speechConfig = window.SpeechSDK.SpeechConfig.fromSubscription("xxx", "westeurope");
    speechConfig.speechRecognitionLanguage = "en-GB";

    // Fetch the WAV file and create an AudioConfig
    fetch(fileUrl)
      .then(response => response.blob())
      .then(blob => {
        // Convert the blob to a File object
        const file = new File([blob], "audio.wav", { type: "audio/wav" });

        // Create an AudioConfig using the File object
        const audioConfig = window.SpeechSDK.AudioConfig.fromWavFileInput(file);

        var pronunciationAssessmentConfig = new window.SpeechSDK.PronunciationAssessmentConfig({
          referenceText: "Hello this is a test",
          gradingSystem: "HundredMark",
          granularity: "Phoneme"
        });

        var speechRecognizer = new window.SpeechSDK.SpeechRecognizer(speechConfig, audioConfig);

        pronunciationAssessmentConfig.applyTo(speechRecognizer);

        speechRecognizer.sessionStarted = (s, e) => {
          console.log(`SESSION ID: ${e.sessionId}`);
        };
        
        speechRecognizer.recognizeOnceAsync(
          function(speechRecognitionResult) {
            if (speechRecognitionResult.reason === window.SpeechSDK.ResultReason.RecognizedSpeech) {
              // The pronunciation assessment result as a Speech SDK object
              var pronunciationAssessmentResult = window.SpeechSDK.PronunciationAssessmentResult.fromResult(speechRecognitionResult);
              console.log("pronunciationAssessmentResult", pronunciationAssessmentResult);
          
              // The pronunciation assessment result as a JSON string
              var pronunciationAssessmentResultJson = speechRecognitionResult.properties.getProperty(window.SpeechSDK.PropertyId.SpeechServiceResponse_JsonResult);
              console.log("pronunciationAssessmentResultJson", pronunciationAssessmentResultJson);
            } else {
              console.error("Speech not recognized. Reason:", speechRecognitionResult);
            }
          },
          function(error) {
            console.error("Error during recognition:", error);
            if (error instanceof window.SpeechSDK.SpeechRecognitionCanceledEventArgs) {
              console.error("Recognition canceled. Reason:", error.reason);
              console.error("Error details:", error.errorDetails);
            }
          }
        );
      })
      .catch(error => {
        console.error("Error fetching WAV file:", error);
      });
  }

I checked the recording (fileUrl) and it is a valid WAV file, as expected.

Recording configuration:

startRecording(event) {
    event.preventDefault();
    if (navigator.mediaDevices && navigator.mediaDevices.getUserMedia) {
      navigator.mediaDevices.getUserMedia({ audio: true }).then(stream => {
        this.recorder = new RecordRTC(stream, {
          type: 'audio',
          mimeType: 'audio/wav',
          recorderType: RecordRTC.StereoAudioRecorder,
          desiredSampRate: 16000,
          numberOfAudioChannels: 1,
          audioBitsPerSecond: 128000
        });
        this.startRecorder(event);
      }).catch((error) => {
        console.log("The following error occurred: " + error);
        alert("Please grant permission for microphone access");
      });
    } else {
      alert("Your browser does not support audio recording, please use a different browser or update your current browser");
    }
  }
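For context, fileUrl comes from stopping the recorder and wrapping the resulting blob in an object URL, roughly like this (a simplified sketch using RecordRTC's standard stopRecording/getBlob API; the actual stop handler is not shown above):

stopRecording(event) {
    event.preventDefault();
    // stopRecording() flushes the recorder, then getBlob() returns
    // the finished 16 kHz mono WAV per the configuration above
    this.recorder.stopRecording(() => {
      const blob = this.recorder.getBlob();
      const fileUrl = URL.createObjectURL(blob);
      this.assessPronunciation(fileUrl);
    });
  }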

Any idea what the issue is? Thanks.

SOLUTION

Two changes fixed it: feed the audio in through a push stream, and pass the pronunciation assessment options as individual positional parameters (using the SDK enums) rather than a JSON object:

var audioConfig = window.SpeechSDK.AudioConfig.fromStreamInput(pushStream);

var pronunciationAssessmentConfig = new window.SpeechSDK.PronunciationAssessmentConfig(
    "My voice is my passport, verify me.",
    window.SpeechSDK.PronunciationAssessmentGradingSystem.HundredMark,
    window.SpeechSDK.PronunciationAssessmentGranularity.Phoneme
);
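Applied to the original browser code, the fix looks roughly like this (a sketch reusing speechConfig and fileUrl from the question; the key and region are placeholders):

fetch(fileUrl)
  .then(response => response.arrayBuffer())
  .then(arrayBuffer => {
    // Feed the raw WAV bytes through a push stream instead of a File object
    const pushStream = window.SpeechSDK.AudioInputStream.createPushStream();
    pushStream.write(arrayBuffer);
    pushStream.close();
    const audioConfig = window.SpeechSDK.AudioConfig.fromStreamInput(pushStream);

    // Individual positional parameters with the SDK enums, not a JSON object
    const pronunciationAssessmentConfig = new window.SpeechSDK.PronunciationAssessmentConfig(
      "Hello this is a test",
      window.SpeechSDK.PronunciationAssessmentGradingSystem.HundredMark,
      window.SpeechSDK.PronunciationAssessmentGranularity.Phoneme
    );

    const speechRecognizer = new window.SpeechSDK.SpeechRecognizer(speechConfig, audioConfig);
    pronunciationAssessmentConfig.applyTo(speechRecognizer);

    speechRecognizer.recognizeOnceAsync(
      result => console.log(result.properties.getProperty(window.SpeechSDK.PropertyId.SpeechServiceResponse_JsonResult)),
      error => console.error("Error during recognition:", error)
    );
  });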

Solution

  • Try this code block out:

    var sdk = require("microsoft-cognitiveservices-speech-sdk");
    var fs = require("fs");
    
    // not supported in node
    // const audioConfig = sdk.AudioConfig.fromWavFileInput('myVoiceIsMyPassportVerifyMe01.wav');
    
    // workaround
    var filename = "myVoiceIsMyPassportVerifyMe01.wav"; // 16000 Hz, Mono
    var pushStream = sdk.AudioInputStream.createPushStream();
    fs.createReadStream(filename).on('data', function (arrayBuffer) {
        pushStream.write(arrayBuffer.slice());
    }).on('end', function () {
        pushStream.close();
    });
    var audioConfig = sdk.AudioConfig.fromStreamInput(pushStream);
    
    
    const conf = sdk.SpeechConfig.fromSubscription(
        'xxxx',
        'eastus'
    );
    conf.speechRecognitionLanguage = "en-GB";
    
    
    var speechRecognizer = new sdk.SpeechRecognizer(conf, audioConfig);
    var pronunciationAssessmentConfig = new sdk.PronunciationAssessmentConfig(
        "My voice is my passport, verify me.",
        sdk.PronunciationAssessmentGradingSystem.HundredMark,
        sdk.PronunciationAssessmentGranularity.Phoneme
    );
    pronunciationAssessmentConfig.applyTo(speechRecognizer);
    
    speechRecognizer.sessionStarted = (s, e) => {
        console.log('SESSION ID:'+ e.sessionId);
    };
    
    speechRecognizer.recognizeOnceAsync(
        function (speechRecognitionResult) {
            // console.log("speechRecognitionResult:", speechRecognitionResult);
            if (speechRecognitionResult.reason === sdk.ResultReason.RecognizedSpeech) {
                // The pronunciation assessment result as a Speech SDK object
                var pronunciationAssessmentResult = sdk.PronunciationAssessmentResult.fromResult(speechRecognitionResult);
                console.log("pronunciationAssessmentResult", pronunciationAssessmentResult);
    
                // The pronunciation assessment result as a JSON string
                var pronunciationAssessmentResultJson = speechRecognitionResult.properties.getProperty(sdk.PropertyId.SpeechServiceResponse_JsonResult);
                console.log("pronunciationAssessmentResultJson", pronunciationAssessmentResultJson);
            } else {
                console.error("Speech not recognized. Reason:", speechRecognitionResult);
            }
        },
        function (error) {
            console.error("Error during recognition:", error);
            if (error instanceof sdk.SpeechRecognitionCanceledEventArgs) {
                console.error("Recognition canceled. Reason:", error.reason);
                console.error("Error details:", error.errorDetails);
            }
        }
    );
    

    A few catches:

    1. AudioConfig.fromWavFileInput might not be supported in Node. I just used the workaround mentioned in this link, and it worked: https://github.com/Azure-Samples/cognitive-services-speech-sdk/issues/813

    2. The PronunciationAssessmentConfig options need to be passed as individual parameter values, not a JSON object.

    3. I used a sample WAV from here; you can swap in your own: https://github.com/Azure-Samples/cognitive-services-speech-sdk/blob/master/sampledata/audiofiles/myVoiceIsMyPassportVerifyMe01.wav
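
    Once recognition succeeds, the individual scores can be read straight off the result object. A minimal sketch (property names per the JS SDK's PronunciationAssessmentResult; pronunciationAssessmentResult comes from the success callback above):

    var pa = pronunciationAssessmentResult;
    console.log("Accuracy:", pa.accuracyScore);
    console.log("Fluency:", pa.fluencyScore);
    console.log("Completeness:", pa.completenessScore);
    console.log("Overall pronunciation:", pa.pronunciationScore);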
