azure text-to-speech azure-cognitive-services

Using Microsoft Azure Pronunciation Assessment on Browser, /gettoken not found

Cloned and set up the Pronunciation Assessment from MS Azure Speech as instructed here: https://github.com/Azure-Samples/Cognitive-Speech-TTS/tree/master/PronunciationAssessment/

No errors.

The key and region are set correctly: in another test on the same server, I can successfully recognize text from a sample WAV file from the terminal.

No matter what I do, the error is always: POST http://XX.XX.XX.XX/gettoken 404 (Not Found)

I understand it can't be found because it's not there, but there is nothing in the documentation addressing this.

Setting /gettoken to the Azure endpoint returns "unauthorized".

microsoft.cognitiveservices.speech.sdk.bundle.js is located on /static folder

SDK is installed globally and locally.

There was no bundle previously since I followed the instructions to the letter, so I created one. IP omitted.

/**
 * Front-end script for the pronunciation-assessment page.
 *
 * Wires up the page controls (reference text box, "hear" button, random
 * tongue-twister button, microphone button) and talks to the server through
 * same-origin POST requests: /gettoken, /gettts, /getttsforword, /ackaud and
 * /gettonguetwister.
 *
 * `Recorder` is not defined in this script; it is expected to be provided as a
 * global by another script on the page.
 */
(() => {
  // Inclusive lower bounds used to colour a phoneme cell by its accuracy score.
  const PHONEME_GOOD = 80;
  const PHONEME_FAIR = 60;
  const PHONEME_POOR = 40;

  const accuracyScoreEl = document.getElementById("accuracyscore");
  const fluencyScoreEl = document.getElementById("fluencyscore");
  const completenessScoreEl = document.getElementById("completenessscore");
  const pronScoreEl = document.getElementById("pronscore");
  const wordsOmittedEl = document.getElementById("wordsomitted");
  const wordsInsertedEl = document.getElementById("wordsinserted");
  const insertedHeadingEl = document.getElementById("wih");

  // The "inserted words" section stays hidden until an assessment reports any.
  wordsInsertedEl.style.display = "none";
  insertedHeadingEl.style.display = "none";

  const wordRow = document.getElementById("wordrow");
  const phonemeRow = document.getElementById("phonemerow");
  const scoreRow = document.getElementById("scorerow");
  const referenceInput = document.getElementById("reftext");
  const randomTwisterButton = document.getElementById("randomtt");
  const hearButton = document.getElementById("buttonhear");
  const recordingsList = document.getElementById("recordingsList");
  const ttsList = document.getElementById("ttsList");
  const micButton = document.getElementById("buttonmic");

  const AudioContextClass = window.AudioContext || window.webkitAudioContext;

  // Per-word TTS clips: { word, objectUrl }.
  const wordAudioClips = [];

  let omittedWords = "";
  let insertedWords = "";
  let lastTtsText; // reference text the current sentence audio was generated for
  let ttsObjectUrl; // object URL of the full-sentence TTS audio
  let referenceText;
  let audioContext;
  let micStream;
  let recorder;
  let isRecording = false;
  let isFinished = false;
  let isMicAllowed = false;
  // Value of the `at` field returned by /gettoken. Nothing else in this script
  // reads it; it is kept so the response is not silently thrown away.
  let authToken;

  /**
   * Plays the cached clip for a single word at reduced speed, then points the
   * shared TTS player back at the full-sentence audio once playback ends.
   * @param {string} word - Word whose clip should be played.
   */
  function playWordClip(word) {
    const player = document.getElementById("ttsaudio");
    if (!player) {
      // The player only exists after sentence audio has been generated.
      return;
    }
    player.playbackRate = 0.5;

    const clip = wordAudioClips.find((entry) => entry.word === word);
    if (clip) {
      player.src = clip.objectUrl;
      player.playbackRate = 0.7;
      player.play();
    }

    const restoreSentenceAudio = () => {
      player.src = ttsObjectUrl;
      player.playbackRate = 0.9;
      player.autoplay = false;
      player.removeEventListener("ended", restoreSentenceAudio);
    };
    player.addEventListener("ended", restoreSentenceAudio);
  }

  /**
   * Requests TTS audio for one word and caches the resulting object URL.
   * @param {string} word - Word to synthesise.
   */
  function fetchWordClip(word) {
    const request = new XMLHttpRequest();
    request.open("POST", "/getttsforword", true);
    request.responseType = "blob";
    request.onload = () => {
      const objectUrl = URL.createObjectURL(request.response);
      wordAudioClips.push({ word, objectUrl });
    };

    const form = new FormData();
    form.append("word", word);
    request.send(form);
  }

  /**
   * Maps a phoneme accuracy score to the background colour of its cell.
   * @param {number} score - Phoneme accuracy score.
   * @returns {string} CSS colour name.
   */
  function phonemeColour(score) {
    if (score >= PHONEME_GOOD) {
      return "green";
    }
    if (score >= PHONEME_FAIR) {
      return "lightgreen";
    }
    if (score >= PHONEME_POOR) {
      return "yellow";
    }
    return "red";
  }

  /** Appends a new <td> with the given text to a table row and returns it. */
  function appendCell(row, text) {
    const cell = document.createElement("td");
    cell.innerText = text;
    row.appendChild(cell);
    return cell;
  }

  /**
   * Renders the per-word and per-phoneme result rows and collects the omitted
   * and inserted words.
   * @param {Array<Object>} words - `Words` array of the best recognition result.
   */
  function renderWords(words) {
    for (const word of words || []) {
      if (word.ErrorType === "Omission") {
        omittedWords += `${word.Word}, `;
        appendCell(phonemeRow, "-");
        appendCell(scoreRow, "-");
        appendCell(wordRow, word.Word).style.backgroundColor = "orange";
      } else if (word.ErrorType === "Insertion") {
        insertedWords += `${word.Word}, `;
      } else if (word.ErrorType === "None" || word.ErrorType === "Mispronunciation") {
        const phonemes = word.Phonemes || [];
        for (const phoneme of phonemes) {
          const phonemeCell = appendCell(phonemeRow, phoneme.Phoneme);
          phonemeCell.style.backgroundColor = phonemeColour(phoneme.AccuracyScore);
          appendCell(scoreRow, phoneme.AccuracyScore);
        }

        const wordCell = document.createElement("td");
        wordCell.innerText = word.Word;
        const scoreSup = document.createElement("SUP");
        scoreSup.appendChild(document.createTextNode(word.AccuracyScore));
        wordCell.appendChild(scoreSup);
        // The word cell spans all of its phoneme columns.
        wordCell.colSpan = phonemes.length;
        wordCell.style.backgroundColor = word.ErrorType === "None" ? "lightgreen" : "red";
        wordRow.appendChild(wordCell);
      }
    }
  }

  /**
   * Fills the summary table and the word/phoneme rows from one result.
   * @param {Object} best - First entry of the response's `NBest` array.
   */
  function renderAssessment(best) {
    document.getElementById("summarytable").style.display = "flex";
    accuracyScoreEl.innerText = best.AccuracyScore;
    fluencyScoreEl.innerText = best.FluencyScore;
    completenessScoreEl.innerText = best.CompletenessScore;
    pronScoreEl.innerText = parseInt(best.PronScore, 10);

    renderWords(best.Words);

    wordsOmittedEl.innerText = omittedWords;
    if (insertedWords !== "") {
      insertedHeadingEl.style.display = "block";
      wordsInsertedEl.style.display = "block";
      wordsInsertedEl.innerText = insertedWords;
    }
  }

  /**
   * Adds the recording to the page as a playable clip and posts it, together
   * with the reference text, to /ackaud for assessment.
   * @param {Blob} blob - Recorded audio handed over by the recorder.
   * @returns {boolean} Always false.
   */
  function submitRecording(blob) {
    document.getElementById("recordloader").style.display = "block";
    document.getElementById("footeralert").style.display = "none";

    const playback = document.createElement("audio");
    playback.controls = true;
    playback.src = URL.createObjectURL(blob);
    const wrapper = document.createElement("p");
    wrapper.appendChild(playback);
    recordingsList.appendChild(wrapper);

    // The ISO timestamp is sent as the uploaded file's name.
    const fileName = new Date().toISOString();

    const request = new XMLHttpRequest();
    request.open("POST", "/ackaud", true);
    request.onload = () => {
      const result = JSON.parse(request.responseText);
      if (result.RecognitionStatus === "Success") {
        renderAssessment(result.NBest[0]);
        document.getElementById("recordloader").style.display = "none";
        document.getElementById("metrics").style.display = "block";
      } else {
        alert("Did not catch audio properly! Please try again.");
        console.log("Server returned: Error");
        console.log(result.RecognitionStatus);
      }
    };

    const form = new FormData();
    form.append("audio_data", blob, fileName);
    form.append("reftext", referenceText);
    request.send(form);
    return false;
  }

  /** Starts recording once microphone access has been granted. */
  function onMicGranted(stream) {
    isMicAllowed = true;
    audioContext = new AudioContextClass();
    micStream = stream;
    const input = audioContext.createMediaStreamSource(stream);
    recorder = new Recorder(input, { numChannels: 1 });
    recorder.record();
  }

  /**
   * Reports a failed microphone request. The previous code wrote to an
   * undeclared `h` variable here, which raised a ReferenceError instead of
   * informing the user.
   */
  function onMicDenied(error) {
    alert("You must allow your microphone.");
    console.log(error);
  }

  // Request the token with a relative URL, like every other endpoint, so the
  // call follows the host and port the page was actually served from.
  window.onload = () => {
    const request = new XMLHttpRequest();
    request.open("POST", "/gettoken", true);
    request.onload = () => {
      if (request.status !== 200) {
        // An error page is not JSON; report it instead of throwing in JSON.parse.
        console.error(`Token request failed with HTTP status ${request.status}`);
        return;
      }
      authToken = JSON.parse(request.responseText).at;
    };
    request.send();
  };

  // Clicking inside the reference text plays the clip of the word at the caret.
  referenceInput.onclick = () => {
    const caret = document.activeElement.selectionStart;
    referenceText = referenceInput.value;

    let consumed = 0;
    for (const word of referenceText.split(" ")) {
      consumed += word.length;
      if (consumed >= caret) {
        playWordClip(word);
        break;
      }
      consumed += 1; // account for the separating space
    }
  };

  // "Hear" button: fetch sentence audio plus one clip per word.
  hearButton.onclick = () => {
    referenceText = referenceInput.value;
    if (referenceText === lastTtsText) {
      console.log("TTS Audio for given text already exists. You may change ref text");
      return false;
    }

    document.getElementById("ttsloader").style.display = "block";

    const request = new XMLHttpRequest();
    request.open("POST", "/gettts", true);
    request.responseType = "blob";
    request.onload = () => {
      ttsObjectUrl = URL.createObjectURL(request.response);

      const player = document.createElement("audio");
      player.controls = true;
      player.autoplay = true;
      player.id = "ttsaudio";
      player.src = ttsObjectUrl;
      const wrapper = document.createElement("p");
      wrapper.appendChild(player);

      // Only one sentence player is kept in the list at a time.
      if (ttsList.hasChildNodes()) {
        ttsList.lastChild.remove();
      }
      ttsList.appendChild(wrapper);
      document.getElementById("ttsloader").style.display = "none";
    };

    const form = new FormData();
    form.append("reftext", referenceText);
    request.send(form);

    lastTtsText = referenceText;
    for (const word of referenceText.split(" ")) {
      fetchWordClip(word);
    }
    return false;
  };

  // Random tongue-twister button: replace the reference text with a server pick.
  randomTwisterButton.onclick = () => {
    const request = new XMLHttpRequest();
    request.open("POST", "/gettonguetwister", true);
    request.onload = () => {
      referenceText = JSON.parse(request.responseText).tt;
      referenceInput.value = referenceText;
      referenceInput.innerText = referenceText;
    };
    request.send();
    return false;
  };

  // Microphone button cycles through: start recording -> stop -> refresh page.
  micButton.onclick = () => {
    if (referenceInput.value.length === 0) {
      alert("Reference Text cannot be empty!");
      return;
    }
    if (isFinished) {
      window.location.reload();
      return;
    }
    if (isRecording) {
      isRecording = false;
      isFinished = true;
      micButton.innerHTML = "<span class='fa fa-refresh'></span>Refresh";
      micButton.className = "green-button";
      // The recorder is missing when microphone access was denied or has not
      // been granted yet; in that case there is nothing to stop or export.
      if (recorder && micStream) {
        recorder.stop();
        micStream.getAudioTracks()[0].stop();
        recorder.exportWAV(submitRecording);
      }
      return;
    }

    if (!isMicAllowed) {
      navigator.mediaDevices
        .getUserMedia({ audio: true })
        .then(onMicGranted)
        .catch(onMicDenied);
    }
    isRecording = true;
    referenceInput.readOnly = true;
    referenceInput.disabled = true;
    randomTwisterButton.disabled = true;
    randomTwisterButton.className = "btn";
    referenceText = referenceInput.value;
    micButton.innerHTML = "<span class='fa fa-stop'></span>Stop";
    micButton.className = "red-button";
  };
})();

Solution

In "application.py", replace the <SPEECH_SERVICE_SUBSCRIPTION_KEY> and <SPEECH_SERVICE_REGION> placeholders with your own Speech resource key and region.
The code below is taken from the GitHub sample:

# Speech resource credentials: replace both placeholders with your own values.
subscription_key = '<SPEECH_SERVICE_SUBSCRIPTION_KEY>'
region = "<SPEECH_SERVICE_REGION>"
# Locale and voice name settings; how they are consumed is not shown in this excerpt.
language = "en-US"
voice = "Microsoft Server Speech Text to Speech Voice (en-US, JennyNeural)"

@app.route("/")

To create a virtual environment, I use this command:

python -m venv venv

To activate the virtual environment, run the activation script with the & operator (the & operator is used to run scripts):

& venv\Scripts\Activate

enter image description here

Install the packages listed in requirements.txt and set the environment: run pip install -r requirements.txt, then set FLASK_ENV=development.
Specify the application file with the FLASK_APP environment variable. If your Flask application is in a file called application.py, set FLASK_APP like this:

$env:FLASK_APP = "application"

flask run

enter image description here

Output: enter image description here

enter image description here