Search code examples
node.jsbrowserspeech-to-textazure-cognitive-services

Stream audio to Azure speech api by node.js on browser


I'm making a demo of speech-to-text using the Azure Speech API in the browser with Node.js. According to the API documentation here, it specifies that it needs .wav or .ogg files. But the example below makes the API call by sending byte data to the API.

So I've already got my data from the microphone in byte-array form. Is converting it to bytes and sending it to the API the right approach? Or is it better for me to save it as a .wav file and then send that to the API?

So below is my code.

This is stream from microphone part.


navigator.mediaDevices.getUserMedia({ audio: true })
    .then(stream => { handlerFunction(stream) })

// Records microphone audio via MediaRecorder; once the recorder stops
// (state === "inactive"), reads the collected chunks into a byte array
// and forwards it to the speech-to-text call.
// NOTE(review): assumes `rec`, `audioChunks`, `recordedAudio`, and
// `getText` are declared elsewhere on the page — confirm.
function handlerFunction(stream) {
    rec = new MediaRecorder(stream);
    rec.ondataavailable = e => {
        audioChunks.push(e.data);
        if (rec.state == "inactive") {
            // The Blob `type` string is only a label; it does NOT transcode
            // the recording — Chrome still records audio/webm.
            let blob = new Blob(audioChunks, { type: 'audio/wav; codec=audio/pcm; samplerate=16000' });
            recordedAudio.src = URL.createObjectURL(blob);
            recordedAudio.controls = true;
            recordedAudio.autoplay = true;
            console.log(blob);
            var reader = new FileReader();
            reader.readAsArrayBuffer(blob);
            reader.onloadend = function () {
                var byteArray = new Uint8Array(reader.result);
                console.log("reader result" + reader.result)
                // BUG FIX: was `etTimeout(...)` — a typo for `setTimeout`
                // that throws a ReferenceError and silently prevents the
                // API call from ever firing.
                setTimeout(() => getText(byteArray), 1000);
            }
        }
    }
}

This is api call part


// POSTs raw audio bytes to the Azure Speech-to-Text REST endpoint
// (westus region, conversation mode, en-US) and parses the JSON reply.
// NOTE(review): `time` and `YOUR_API_KEY` are assumed to be declared
// elsewhere on the page — confirm; `callback` is currently unused
// because the invocation below is commented out.
function getText(audio, callback) {
    console.log("in function audio " + audio);
    console.log("how many byte?: " + audio.byteLength)
    // Timestamp of this request, used below to discard stale responses
    // that arrive out of order.
    const sendTime = Date.now();
    fetch('https://westus.stt.speech.microsoft.com/speech/recognition/conversation/cognitiveservices/v1?language=en-US', {
        method: "POST",
        headers: {
            'Accept': 'application/json',
            'Ocp-Apim-Subscription-Key': YOUR_API_KEY,
            // 'Transfer-Encoding': 'chunked',
            // 'Expect': '100-continue',
            // The service rejects anything that is not actually WAV/OGG —
            // this header alone does not convert the payload.
            'Content-type': 'audio/wav; codec=audio/pcm; samplerate=16000'
        },
        body: audio
    })
        .then(function (r) {
            return r.json();
        })
        .then(function (response) {
            // Drop this response if a newer request has already completed.
            if (sendTime < time) {
                return
            }
            time = sendTime
            //callback(response)
        }).catch(e => {
            console.log("Error", e)
        })
}

It returns with 400 (Bad Request) and says :

{Message: "Unsupported audio format"}


Solution

  • Reason:

    Note that you're not actually creating a MediaRecorder with an audio/wav mimeType by writing

    new Blob(audioChunks,{type:'audio/wav; codec=audio/pcm; samplerate=16000'})
    

    This statement only labels the Blob; it does not transcode the audio. I tested my Chrome (v71) with isTypeSupported:

    MediaRecorder.isTypeSupported("audio/wav")  // return false
    MediaRecorder.isTypeSupported("audio/ogg")  // return false
    MediaRecorder.isTypeSupported("audio/webm") // return true
    

    It seems that the MediaRecorder will only record the audio as audio/webm. Also, when I run the following code on Chrome, the default rec.mimeType is audio/webm;codecs=opus

    rec = new MediaRecorder(stream);
    

    According to the audio format requirements, audio/webm is not supported yet.

    Approach:

    Before calling getText() we need to convert the webm to wav first. There are quite a lot of libraries that can help us do that. I just copied Jam3's script in before your code to convert webm to wav:

        // add Jam3's script between Line 2 and Line 94 or import that module as you like
    
        // create a audioContext that helps us decode the webm audio
        // AudioContext is used to decode the recorded webm/opus blob into
        // raw PCM so it can be re-encoded as WAV.
        var audioCtx = new (window.AudioContext || window.webkitAudioContext)();
    
        // FIX: `codecs` is not a valid MediaRecorderOptions key and is
        // ignored by the constructor — the codec must be part of the
        // mimeType string itself.
        rec = new MediaRecorder(stream, {
            mimeType: 'audio/webm;codecs=opus',
        });
    
        // ...
    
        rec.ondataavailable = e => {
            audioChunks.push(e.data);
            if (rec.state == "inactive") {
                var blob = new Blob(audioChunks, { 'type': 'audio/webm; codecs=opus' });
                var fileReader = new FileReader();
                fileReader.readAsArrayBuffer(blob);
                // onloadend fires once the whole blob has been read; decode
                // it to an AudioBuffer, re-encode as WAV (Jam3's
                // audioBufferToWav), then send it to the STT endpoint.
                // (The redundant onload handler that stashed the result in
                // an unused `arrayBuffer` variable has been removed.)
                fileReader.onloadend = function () {
                    audioCtx.decodeAudioData(
                        fileReader.result,
                        function (buffer) {
                            var wav = audioBufferToWav(buffer);
                            setTimeout(() => getText(wav), 1000);
                        },
                        function (e) { console.log(e); }
                    );
                };
            }
        }
    

    And it works fine for me :

    enter image description here


    As a side note, I suggest you should use your backend to invoke the speech-to-text services. Never invoke azure stt service in a browser. That's because exposing your subscription key to front end is really dangerous. Anyone could inspect the network and steal your key.