I would like to use the Google Speech Recognition REST API from C#. The reason I want to use the REST API instead of a client library is that I would like to use it in Unity3D, which doesn't currently support the client library.
I use HttpClient to connect and Newtonsoft Json to serialize the JSON (I'm working with Windows Forms right now and will move to Unity once it works in WinForms).
I always get a Bad Request response from Google. It doesn't give any more detail, but I have noticed that if I change my API key to an invalid value I get the same result.
Here is my code:
The classes:
/// <summary>
/// Request body for the recognize call. Json.NET serializes it as
/// <c>{"config": {...}, "audio": {...}}</c>, which is then posted over HTTP.
/// </summary>
class Speech
{
    /// <summary>Recognition settings; serialized under the JSON key "config".</summary>
    public RecognitionConfig config { get; set; }

    /// <summary>Audio payload; serialized under the JSON key "audio".</summary>
    public RecognitionAudio audio { get; set; }

    /// <summary>
    /// Serializes this object to JSON and POSTs it to <paramref name="url"/>
    /// resolved against <paramref name="baseUri"/>. The call blocks until the
    /// response has been read.
    /// </summary>
    /// <param name="baseUri">Absolute base address of the service.</param>
    /// <param name="url">Request URI, relative to <paramref name="baseUri"/>.</param>
    /// <param name="apiResponse">
    /// Receives the response body on success; on a non-success status code,
    /// "ERROR " followed by the status and the response body; on an exception,
    /// the exception message.
    /// </param>
    /// <returns>
    /// <c>true</c> when the HTTP exchange completed (whatever the status code),
    /// <c>false</c> when an exception was thrown.
    /// </returns>
    public bool sendToApi(string baseUri, string url, ref string apiResponse)
    {
        try
        {
            // Dispose the client, request content and response so the connection
            // and buffers are released even when an exception is thrown.
            using (HttpClient client = new HttpClient())
            {
                client.BaseAddress = new Uri(baseUri);
                client.DefaultRequestHeaders.Accept.Clear();
                client.DefaultRequestHeaders.Accept.Add(new System.Net.Http.Headers.MediaTypeWithQualityHeaderValue("application/json"));

                string speechJson = JsonConvert.SerializeObject(this);
                using (var contenido = new StringContent(speechJson, Encoding.UTF8, "application/json"))
                // GetAwaiter().GetResult() rethrows the original exception instead of an
                // AggregateException, so the catch block below reports a meaningful message.
                using (HttpResponseMessage response = client.PostAsync(url, contenido).GetAwaiter().GetResult())
                {
                    string responseBody = response.Content == null
                        ? string.Empty
                        : response.Content.ReadAsStringAsync().GetAwaiter().GetResult();

                    if (response.IsSuccessStatusCode)
                    {
                        apiResponse = responseBody;
                    }
                    else
                    {
                        // The response body carries the server's explanation of the failure;
                        // dumping the HttpResponseMessage object only shows headers and status.
                        apiResponse = "ERROR " + (int)response.StatusCode + " " + response.ReasonPhrase + ": " + responseBody;
                    }
                }
            }
            return true;
        }
        catch (Exception e)
        {
            apiResponse = e.Message;
            return false;
        }
    }
}
/// <summary>
/// Recognition settings sent as the "config" object of the request body.
/// Property names are kept in lower camel case so that they serialize to the
/// JSON keys unchanged.
/// </summary>
class RecognitionConfig
{
    /// <summary>Name of the audio encoding, e.g. "FLAC".</summary>
    public string encoding { get; set; }

    /// <summary>Sample rate of the audio in hertz, e.g. 44100.</summary>
    public int sampleRateHertz { get; set; }

    /// <summary>Language tag of the spoken audio, e.g. "en-US".</summary>
    public string languageCode { get; set; }

    // Additional fields, currently disabled and therefore not sent:
    // public int maxAlternatives { get; set; }
    // public bool profanityFilter { get; set; }
    // public List<SpeechContext> speechContexts { get; set; }
    // public bool enableWordTimeOffsets { get; set; }
}
/// <summary>
/// Holder for a list of phrases; serialized as <c>{"phrases": [...]}</c>.
/// </summary>
class SpeechContext
{
    /// <summary>The phrases carried by this context; null until assigned.</summary>
    public List<string> phrases { get; set; }
}
/// <summary>
/// Audio payload sent as the "audio" object of the request body.
/// </summary>
class RecognitionAudio
{
    /// <summary>Base64 text of the audio file bytes; serialized under the JSON key "content".</summary>
    public string content { get; set; }
    // public string uri { get; set; }

    /// <summary>
    /// Reads the file at <paramref name="path"/> and stores its bytes,
    /// base64-encoded, in <see cref="content"/>.
    /// </summary>
    /// <param name="path">Path of the audio file to load.</param>
    /// <returns>
    /// <c>true</c> when the file was read and encoded; <c>false</c> when any
    /// exception occurred, in which case <see cref="content"/> is left unchanged.
    /// </returns>
    public bool setContentBase64FromAudio(string path)
    {
        try
        {
            // File.ReadAllBytes opens, fully reads and closes the file, and returns
            // exactly the file's bytes. The previous MemoryStream.GetBuffer() approach
            // leaked the file handle and, for files shorter than 256 bytes, encoded
            // the unused tail of the stream's backing array as trailing zero bytes.
            byte[] audioBytes = File.ReadAllBytes(path);
            this.content = System.Convert.ToBase64String(audioBytes);
            return true;
        }
        catch (Exception)
        {
            // Best effort by contract: callers are told about the failure via the return value.
            return false;
        }
    }
}
The call:
/// <summary>
/// Click handler: loads the sample FLAC file, posts it for recognition and
/// shows the outcome (result or error text) in <c>textBox1</c>.
/// </summary>
private void button1_Click(object sender, EventArgs e)
{
    // Load the audio first; a request without a payload cannot succeed.
    RecognitionAudio audio = new RecognitionAudio();
    if (!audio.setContentBase64FromAudio("C:\\Users\\Manena\\Downloads\\good-morning-google.flac"))
    {
        textBox1.Text = "ERROR could not read the audio file";
        return;
    }

    Speech speech = new Speech
    {
        config = new RecognitionConfig
        {
            encoding = "FLAC",
            sampleRateHertz = 44100,
            languageCode = "en-US"
        },
        audio = audio
    };

    string response = "";
    // sendToApi writes its error text into the ref argument too, so the
    // text box shows something useful whether or not the call succeeded.
    speech.sendToApi("https://speech.googleapis.com/", "v1/speech:recognize?key=<mykey>", ref response);
    textBox1.Text = response;
}
}
Edit: Here is the Json I send:
{
"config":{
"encoding":"FLAC",
"sampleRateHertz":44100,
"languageCode":"en-US"
},
"audio":{
"content":"base64 audio"
}
}
And what I receive:
{
"Version": {
"Major": 1,
"Minor": 1,
"Build": -1,
"Revision": -1,
"MajorRevision": -1,
"MinorRevision": -1
},
"Content": {
"Headers": [
{
"Key": "Content-Type",
"Value": [
"application/json; charset=UTF-8"
]
}
]
},
"StatusCode": 400,
"ReasonPhrase": "Bad Request",
"Headers": [
{
"Key": "Vary",
"Value": [
"X-Origin",
"Referer",
"Origin",
"Accept-Encoding"
]
},
{
"Key": "X-XSS-Protection",
"Value": [
"1; mode=block"
]
},
{
"Key": "X-Frame-Options",
"Value": [
"SAMEORIGIN"
]
},
{
"Key": "X-Content-Type-Options",
"Value": [
"nosniff"
]
},
{
"Key": "Alt-Svc",
"Value": [
"hq=\":443\"; ma=2592000; quic=51303431; quic=51303339; quic=51303338; quic=51303337; quic=51303335,quic=\":443\"; ma=2592000; v=\"41,39,38,37,35\""
]
},
{
"Key": "Transfer-Encoding",
"Value": [
"chunked"
]
},
{
"Key": "Accept-Ranges",
"Value": [
"none"
]
},
{
"Key": "Cache-Control",
"Value": [
"private"
]
},
{
"Key": "Date",
"Value": [
"Sat, 30 Dec 2017 09:06:19 GMT"
]
},
{
"Key": "Server",
"Value": [
"ESF"
]
}
],
"RequestMessage": {
"Version": {
"Major": 1,
"Minor": 1,
"Build": -1,
"Revision": -1,
"MajorRevision": -1,
"MinorRevision": -1
},
"Content": {
"Headers": [
{
"Key": "Content-Type",
"Value": [
"application/json; charset=utf-8"
]
},
{
"Key": "Content-Length",
"Value": [
"106"
]
}
]
},
"Method": {
"Method": "POST"
},
"RequestUri": "https://speech.googleapis.com/v1/speech:recognize?key=mykey",
"Headers": [
{
"Key": "Accept",
"Value": [
"application/json"
]
}
],
"Properties": {}
},
"IsSuccessStatusCode": false
}
I know my code is maybe not the most elegant, but right now I'm only interested in getting a successful response from the Google API. Any clue?
I have solved the issue.
The problem was that I was using a 2-channel (stereo) audio file, and the Google Speech API currently accepts only mono audio.
So the code in the question works for 1-channel audio; it could be useful for someone.
Thanks