I am using the sample code provided here and have implemented it as follows:
# [START import_libraries]
import argparse
import base64
import json
import time
from oauth2client.service_account import ServiceAccountCredentials
import googleapiclient.discovery
import googleapiclient as gac
# [END import_libraries]
# [START authenticating]
# Application default credentials provided by env variable
# GOOGLE_APPLICATION_CREDENTIALS
def get_speech_service(credentials):
    """Build and return a client for the 'speech' API, version 'v1beta1'.

    Args:
        credentials: credentials object forwarded to the discovery builder.
    """
    speech_client = googleapiclient.discovery.build(
        'speech', 'v1beta1', credentials=credentials)
    return speech_client
def main(speech_file):
    """Transcribe the given audio file asynchronously.

    The request config below is hard-coded to LINEAR16 encoding, a 16000 Hz
    sample rate and the en-US language, so ``speech_file`` must contain audio
    in that format; the file extension is not inspected.

    Args:
        speech_file: the name of the audio file.

    Returns:
        The final operation dict returned by ``operations().get`` once its
        ``done`` field is true.
    """
    # [START construct_request]
    with open(speech_file, 'rb') as speech:
        # Base64 encode the binary audio file for inclusion in the request.
        speech_content = base64.b64encode(speech.read())

    scopes = ['https://www.googleapis.com/auth/cloud-platform']
    credentials = ServiceAccountCredentials.from_json_keyfile_name(
        '/Users/user/Documents/google_cloud/myjson.json', scopes)
    service = get_speech_service(credentials)
    service_request = service.speech().asyncrecognize(
        body={
            'config': {
                # There are a bunch of config options you can specify. See
                # https://cloud.google.com/speech/reference/rest/v1beta1/RecognitionConfig for the full list.
                'encoding': 'LINEAR16',  # raw 16-bit signed LE samples
                'sampleRate': 16000,  # 16 khz
                # See http://g.co/cloud/speech/docs/languages for a list of
                # supported languages.
                'languageCode': 'en-US',  # a BCP-47 language tag
            },
            'audio': {
                # b64encode returns bytes; the JSON body needs text.
                'content': speech_content.decode('UTF-8')
            }
        })
    # [END construct_request]
    # [START send_request]
    response = service_request.execute()
    print(json.dumps(response))
    # [END send_request]

    name = response['name']
    # Construct a GetOperation request; it is re-executed on every poll.
    service_request = service.operations().get(name=name)

    while True:
        # Give the server a few seconds to process.
        print('Waiting for server processing...')
        time.sleep(1)
        # Get the long running operation with response.
        response = service_request.execute()
        if response.get('done'):
            break

    # First print the raw json response
    print(json.dumps(response['response'], indent=2))

    # Now print the actual transcriptions. 'results' may be absent from the
    # response, in which case nothing is printed here.
    for result in response['response'].get('results', []):
        print('Result:')
        for alternative in result['alternatives']:
            print(u' Alternative: {}'.format(alternative['transcript']))
    return response
# NOTE(review): main() sends 'encoding': 'LINEAR16' in its request config,
# while this path points at a .flac file -- confirm the audio format matches.
r = main("/Users/user/Downloads/brooklyn.flac")
Yet my printed output is the following:
{"name": "3202776140236290963"}
Waiting for server processing...
Waiting for server processing...
{
"@type": "type.googleapis.com/google.cloud.speech.v1beta1.AsyncRecognizeResponse"
}
And my returned object is:
{u'done': True,
u'metadata': {u'@type': u'type.googleapis.com/google.cloud.speech.v1beta1.AsyncRecognizeMetadata',
u'lastUpdateTime': u'2017-03-25T15:54:46.136925Z',
u'progressPercent': 100,
u'startTime': u'2017-03-25T15:54:44.514614Z'},
u'name': u'2024312474309214820',
u'response': {u'@type': u'type.googleapis.com/google.cloud.speech.v1beta1.AsyncRecognizeResponse'}}
On my console screen I see the requests coming through via:
Unsure why I am not getting the proper transcription back from the sample file.
Any input is appreciated!
Well, your config options contain the following:
'config': {
# There are a bunch of config options you can specify. See
# https://cloud.google.com/speech/reference/rest/v1beta1/RecognitionConfig for the full list.
'encoding': 'LINEAR16', # raw 16-bit signed LE samples
'sampleRate': 16000, # 16 khz
# See http://g.co/cloud/speech/docs/languages for a list of
# supported languages.
'languageCode': 'en-US', # a BCP-47 language tag
},
However, you are using a FLAC file:
r = main("/Users/user/Downloads/brooklyn.flac")
Quoting https://cloud.google.com/speech/reference/rest/v1beta1/RecognitionConfig:
`LINEAR16`: Uncompressed 16-bit signed little-endian samples (Linear PCM). This is the only encoding that may be used by `speech.asyncrecognize`.
`FLAC`: This is the recommended encoding for `speech.syncrecognize` and `StreamingRecognize` because it uses lossless compression; therefore recognition accuracy is not compromised by a lossy codec.
In other words, you can't use FLAC with `speech.asyncrecognize`; you may need to transcode your sample to Linear PCM first, or use `speech.syncrecognize` with the FLAC encoding option.