I am currently using Microsoft Azure's Text to Speech Unity SDK in my small app. With the C# Unity script below, the audio plays right after a button is pressed. However, instead of playing the audio, I'd like to save it as an audio file that can be played later. I know you can save TTS audio to a .wav file with speechSynthesizer.SetOutputToWaveFile(), but I can't seem to find that method in Azure's Unity SDK. Here's the sample code that produces live TTS.
using UnityEngine;
using UnityEngine.UI;
using Microsoft.CognitiveServices.Speech;

public class TextToSpeech : MonoBehaviour
{
    // Hook up the three properties below with a Text, InputField and Button object in your UI.
    public Text outputText;
    public InputField inputField;
    public Button speakButton;
    public AudioSource audioSource;

    private object threadLocker = new object();
    private bool waitingForSpeak;
    private string message;

    public void ButtonClick()
    {
        // Creates an instance of a speech config with specified subscription key and service region.
        // Replace with your own subscription key and service region (e.g., "westus").
        var config = SpeechConfig.FromSubscription("YourSubscriptionKey", "YourServiceRegion");

        // Creates a speech synthesizer.
        // Make sure to dispose the synthesizer after use!
        using (var synthesizer = new SpeechSynthesizer(config, null))
        {
            lock (threadLocker)
            {
                waitingForSpeak = true;
            }

            // Starts speech synthesis, and returns after a single utterance is synthesized.
            // Note: .Result blocks the calling (main) thread until synthesis completes.
            var result = synthesizer.SpeakTextAsync(inputField.text).Result;

            // Checks result.
            string newMessage = string.Empty;
            if (result.Reason == ResultReason.SynthesizingAudioCompleted)
            {
                // Native playback is not yet supported on Unity (currently only on Windows/Linux desktop),
                // so use the Unity API to play audio here as a short-term solution.
                // Native playback support will be added in a future release.

                // Convert the raw 16-bit little-endian PCM bytes into floats in [-1, 1] for the AudioClip.
                var sampleCount = result.AudioData.Length / 2;
                var audioData = new float[sampleCount];
                for (var i = 0; i < sampleCount; ++i)
                {
                    audioData[i] = (short)(result.AudioData[i * 2 + 1] << 8 | result.AudioData[i * 2]) / 32768.0F;
                }

                // The default output audio format is 16kHz, 16-bit, mono.
                var audioClip = AudioClip.Create("SynthesizedAudio", sampleCount, 1, 16000, false);
                audioClip.SetData(audioData, 0);
                audioSource.clip = audioClip;
                audioSource.Play();

                newMessage = "Speech synthesis succeeded!";
            }
            else if (result.Reason == ResultReason.Canceled)
            {
                var cancellation = SpeechSynthesisCancellationDetails.FromResult(result);
                newMessage = $"CANCELED:\nReason=[{cancellation.Reason}]\nErrorDetails=[{cancellation.ErrorDetails}]\nDid you update the subscription info?";
            }

            lock (threadLocker)
            {
                message = newMessage;
                waitingForSpeak = false;
            }
        }
    }

    void Start()
    {
        if (outputText == null)
        {
            UnityEngine.Debug.LogError("outputText property is null! Assign a UI Text element to it.");
        }
        else if (inputField == null)
        {
            message = "inputField property is null! Assign a UI InputField element to it.";
            UnityEngine.Debug.LogError(message);
        }
        else if (speakButton == null)
        {
            message = "speakButton property is null! Assign a UI Button to it.";
            UnityEngine.Debug.LogError(message);
        }
        else
        {
            // Continue with normal initialization; Text, InputField and Button objects are present.
            inputField.text = "Enter text you wish spoken here.";
            message = "Click button to synthesize speech";
            speakButton.onClick.AddListener(ButtonClick);
        }
    }

    void Update()
    {
        lock (threadLocker)
        {
            if (speakButton != null)
            {
                speakButton.interactable = !waitingForSpeak;
            }

            if (outputText != null)
            {
                outputText.text = message;
            }
        }
    }
}
As you can see, this code plays the text input as sound as soon as the button is pressed, but ideally I'd like to save the audio output to a file first, so it can be played back later without re-synthesizing. Any help would be appreciated.
You can use the REST API to implement this requirement. Here is a C# demo that calls the REST API and saves the audio to a local file:
using System;
using System.Net.Http;
using System.Text;
using System.IO;
using System.Threading.Tasks;
using System.Xml.Linq;

namespace TTSSample
{
    public class Authentication
    {
        private string subscriptionKey;
        private string tokenFetchUri;

        public Authentication(string tokenFetchUri, string subscriptionKey)
        {
            if (string.IsNullOrWhiteSpace(tokenFetchUri))
            {
                throw new ArgumentNullException(nameof(tokenFetchUri));
            }
            if (string.IsNullOrWhiteSpace(subscriptionKey))
            {
                throw new ArgumentNullException(nameof(subscriptionKey));
            }
            this.tokenFetchUri = tokenFetchUri;
            this.subscriptionKey = subscriptionKey;
        }

        public async Task<string> FetchTokenAsync()
        {
            using (HttpClient client = new HttpClient())
            {
                client.DefaultRequestHeaders.Add("Ocp-Apim-Subscription-Key", this.subscriptionKey);
                UriBuilder uriBuilder = new UriBuilder(this.tokenFetchUri);
                HttpResponseMessage result = await client.PostAsync(uriBuilder.Uri.AbsoluteUri, null).ConfigureAwait(false);
                return await result.Content.ReadAsStringAsync().ConfigureAwait(false);
            }
        }
    }
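
    // Note: tokens issued by the STS endpoint are valid for about 10 minutes,
    // so for long-running sessions you should cache the token and re-fetch it
    // before it expires rather than requesting a new one per synthesis call.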
    class Program
    {
        static async Task Main(string[] args)
        {
            // Prompts the user to input text for TTS conversion
            Console.Write("What would you like to convert to speech? ");
            string text = Console.ReadLine();

            // Gets an access token
            string accessToken;
            Console.WriteLine("Attempting token exchange. Please wait...\n");

            // Add your subscription key here
            // If your resource isn't in WEST US, change the endpoint
            Authentication auth = new Authentication("https://westus.api.cognitive.microsoft.com/sts/v1.0/issueToken", "REPLACE_WITH_YOUR_KEY");
            try
            {
                accessToken = await auth.FetchTokenAsync().ConfigureAwait(false);
                Console.WriteLine("Successfully obtained an access token. \n");
            }
            catch (Exception ex)
            {
                Console.WriteLine("Failed to obtain an access token.");
                Console.WriteLine(ex.ToString());
                Console.WriteLine(ex.Message);
                return;
            }

            string host = "https://westus.tts.speech.microsoft.com/cognitiveservices/v1";

            // Create SSML document.
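            // For reference, the document below serializes to SSML like this
            // (with the user's text in place of "Hello"):
            //   <speak version="1.0" xml:lang="en-US">
            //     <voice xml:lang="en-US" xml:gender="Female" name="en-US-Jessa24kRUS">Hello</voice>
            //   </speak>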
            XDocument body = new XDocument(
                new XElement("speak",
                    new XAttribute("version", "1.0"),
                    new XAttribute(XNamespace.Xml + "lang", "en-US"),
                    new XElement("voice",
                        new XAttribute(XNamespace.Xml + "lang", "en-US"),
                        new XAttribute(XNamespace.Xml + "gender", "Female"),
                        // Short name for "Microsoft Server Speech Text to Speech Voice (en-US, Jessa24KRUS)".
                        // If this standard voice isn't available on your resource, substitute a current
                        // voice short name from the service's voice list.
                        new XAttribute("name", "en-US-Jessa24kRUS"),
                        text)));

            using (HttpClient client = new HttpClient())
            {
                using (HttpRequestMessage request = new HttpRequestMessage())
                {
                    // Set the HTTP method
                    request.Method = HttpMethod.Post;
                    // Construct the URI
                    request.RequestUri = new Uri(host);
                    // Set the content type header
                    request.Content = new StringContent(body.ToString(), Encoding.UTF8, "application/ssml+xml");
                    // Set additional headers, such as Authorization and User-Agent
                    request.Headers.Add("Authorization", "Bearer " + accessToken);
                    request.Headers.Add("Connection", "Keep-Alive");
                    // Update your resource name
                    request.Headers.Add("User-Agent", "YOUR_RESOURCE_NAME");
                    // Audio output format. See the API reference for the full list.
                    request.Headers.Add("X-Microsoft-OutputFormat", "riff-24khz-16bit-mono-pcm");

                    // Send the request
                    Console.WriteLine("Calling the TTS service. Please wait... \n");
                    using (HttpResponseMessage response = await client.SendAsync(request).ConfigureAwait(false))
                    {
                        response.EnsureSuccessStatusCode();
                        // Asynchronously read the response and stream the audio bytes to disk
                        using (Stream dataStream = await response.Content.ReadAsStreamAsync().ConfigureAwait(false))
                        {
                            Console.WriteLine("Your speech file is being written to file...");
                            using (FileStream fileStream = new FileStream(@"sample.wav", FileMode.Create, FileAccess.Write, FileShare.Write))
                            {
                                await dataStream.CopyToAsync(fileStream).ConfigureAwait(false);
                            }
                            Console.WriteLine("\nYour file is ready. Press any key to exit.");
                            Console.ReadLine();
                        }
                    }
                }
            }
        }
    }
}
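To cover the "can later be played" part on the Unity side: once the .wav is on disk, you can load it back into an AudioClip with UnityWebRequestMultimedia. Here is a minimal sketch; the class name, the sample.wav file name, and the Application.persistentDataPath location are just illustrative assumptions, so point it at wherever your app actually writes the synthesized audio:

using System.Collections;
using System.IO;
using UnityEngine;
using UnityEngine.Networking;

public class SavedClipPlayer : MonoBehaviour
{
    public AudioSource audioSource;

    // Loads a previously saved .wav from disk and plays it.
    public IEnumerator PlaySavedClip()
    {
        string uri = "file://" + Path.Combine(Application.persistentDataPath, "sample.wav");
        using (UnityWebRequest www = UnityWebRequestMultimedia.GetAudioClip(uri, AudioType.WAV))
        {
            yield return www.SendWebRequest();

            // UnityWebRequest.Result requires Unity 2020.1+; on older versions
            // check www.isNetworkError || www.isHttpError instead.
            if (www.result != UnityWebRequest.Result.Success)
            {
                Debug.LogError(www.error);
            }
            else
            {
                audioSource.clip = DownloadHandlerAudioClip.GetContent(www);
                audioSource.Play();
            }
        }
    }
}

Kick it off with StartCoroutine(GetComponent<SavedClipPlayer>().PlaySavedClip()); this way the synthesis (and its network round trip) happens once, and later playback is just reading a local file.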