swift azure audio-streaming azure-cognitive-services

How to stream text-to-speech on iOS using the SDK?

I am trying to stream the audio I get from the Speech SDK using SPXPushAudioOutputStream. I get all data without an issue and can write it into a wav or mp3 and then play it back with the code below.

/// Demo screen that synthesizes the entered text with the Azure Speech SDK into
/// `pushStream.mp3` in the app's Documents directory and plays that file back
/// with `AVAudioPlayer`.
struct ContentView: View {
    /// Forwards `AVAudioPlayer`'s end-of-playback callback to a closure so the
    /// view can reset its Play/Stop button when the audio runs out on its own.
    private final class PlaybackObserver: NSObject, AVAudioPlayerDelegate {
        var onFinish: (() -> Void)?

        func audioPlayerDidFinishPlaying(_ player: AVAudioPlayer, successfully flag: Bool) {
            onFinish?()
        }
    }

    @State private var inputText = """
    Die Gesundheitspolitik bleibt ein hartes Pflaster für Reformen. Bundesrätin Elisabeth Baume-Schneider forderte alle Akteure am Sonntag «nachdrücklich» auf, ihren Teil der Verantwortung zu übernehmen und «konkrete, mehrheitsfähige Sparvorschläge» vorzulegen. Mit Blick auf die vergangenen Jahrzehnte kann man darüber nur schmunzeln.
    Solange besagte Akteure ihren Besitzstand eisern verteidigen und solange die politischen Kräfte aus allen Lagern ihrem Lobbydruck nachgeben, wird sich nichts ändern. Auch in den Kantonen überwiegen die Hemmungen, Spitäler zu schliessen und über die Grenzen hinweg die Zusammenarbeit zu verstärken. Ausnahmen bestätigen die Regel.
    Das sagen die Ökonomen
    Deshalb stellt sich die Frage, ob man nicht das zunehmend absurde Kopfprämiensystem abschaffen und auf ein durch Steuergelder finanziertes Gesundheitswesen umstellen sollte, wie in anderen Ländern. watson hat diese Frage den Gesundheitsökonomen Heinz Locher und Willy Oggier gestellt – und interessante Antworten erhalten.
    """
    @State private var resultText = ""
    @State private var isPlaying = false
    /// True while a synthesis run is in flight; prevents two runs from writing the same file.
    @State private var isSynthesizing = false
    @State private var audioPlayer: AVAudioPlayer?
    @State private var playbackObserver = PlaybackObserver()
    @State private var synthesisCompleted = false

    // NOTE(review): a subscription key embedded in source ships inside the app
    // binary; prefer loading it from configuration or a backend-issued token.
    let speechKey = "censored"
    let serviceRegion = "switzerlandnorth"

    /// Location of the synthesized audio; shared by synthesis and playback.
    private var outputFileURL: URL {
        FileManager.default
            .urls(for: .documentDirectory, in: .userDomainMask)[0]
            .appendingPathComponent("pushStream.mp3")
    }

    var body: some View {
        VStack {
            TextField("Enter text to synthesize", text: $inputText)
                .textFieldStyle(RoundedBorderTextFieldStyle())
                .padding()

            Button(action: synthesisToPushAudioOutputStream) {
                Text("Synthesize Speech")
            }
            .padding()
            .disabled(isSynthesizing)

            Button(action: playAudio) {
                Text(isPlaying ? "Stop" : "Play")
            }
            .padding()
            .disabled(!synthesisCompleted)

            Text(resultText)
                .padding()
        }
        .onChange(of: resultText) { newValue in
            debug("Result text changed to: \(newValue)", function: "body.onChange")
            synthesisCompleted = newValue.contains("Speech synthesis completed")
            debug("Synthesis completed: \(synthesisCompleted)", function: "body.onChange")
        }
    }

    /// Starts synthesizing the current `inputText` into the output file.
    ///
    /// `speakText` blocks until synthesis finishes, so the work is pushed to a
    /// background queue; the text is snapshotted first because it is view state.
    private func synthesisToPushAudioOutputStream() {
        guard !isSynthesizing else { return }
        isSynthesizing = true

        let startTime = Date()
        let text = inputText
        debug("Starting speech synthesis...", function: #function)

        // A running player would be reading the file that is about to be rewritten.
        audioPlayer?.stop()
        isPlaying = false

        updateResultText("Synthesizing...")
        DispatchQueue.global(qos: .userInitiated).async {
            self.performSynthesis(of: text, startedAt: startTime)
        }
    }

    /// Runs one blocking synthesis of `text` and reports the outcome through `updateResultText`.
    private func performSynthesis(of text: String, startedAt startTime: Date) {
        // Re-enable the Synthesize button on every exit path.
        defer { DispatchQueue.main.async { self.isSynthesizing = false } }

        debug("Configuring audio and speech...", function: #function)
        guard let config = try? SPXSpeechConfiguration(subscription: speechKey, region: serviceRegion) else {
            debug("Failed to create speech configuration", function: #function)
            updateResultText("Speech Config Error")
            return
        }
        config.setSpeechSynthesisOutputFormat(.audio24Khz160KBitRateMonoMp3)
        debug("Set output format to MP3", function: #function)

        let fileURL = outputFileURL
        guard let stream = makeFileOutputStream(writingTo: fileURL) else { return }

        guard let audio = try? SPXAudioConfiguration(streamOutput: stream) else {
            debug("Failed to create audio configuration", function: #function)
            updateResultText("Speech Config Error")
            return
        }

        debug("Creating speech synthesizer...", function: #function)
        guard let synth = try? SPXSpeechSynthesizer(speechConfiguration: config, audioConfiguration: audio) else {
            debug("Failed to create speech synthesizer", function: #function)
            updateResultText("Speech Synthesis Error")
            return
        }

        debug("Starting text-to-speech...", function: #function)
        let result: SPXSpeechSynthesisResult
        do {
            result = try synth.speakText(text)
        } catch {
            debug("Speech synthesis failed (no result): \(error)", function: #function)
            updateResultText("Speech synthesis error.")
            return
        }

        switch result.reason {
        case .canceled:
            // The details lookup can itself fail; fall back to a generic message instead of crashing.
            let details = try? SPXSpeechSynthesisCancellationDetails(fromCanceledSynthesisResult: result)
            let reason = details?.errorDetails ?? "Unknown error"
            debug("Speech synthesis canceled: \(reason)", function: #function)
            updateResultText("Canceled: \(reason)")

        case .synthesizingAudioCompleted:
            let synthesisTime = Date().timeIntervalSince(startTime)
            let elapsed = String(format: "%.2f", synthesisTime)
            debug("Speech synthesis completed successfully in \(elapsed) seconds", function: #function)
            updateResultText("Speech synthesis completed in \(elapsed) seconds.")

            // Add a small delay to ensure file writing is complete
            DispatchQueue.main.asyncAfter(deadline: .now() + 0.5) {
                do {
                    let attributes = try FileManager.default.attributesOfItem(atPath: fileURL.path)
                    let fileSize = attributes[.size] as? Int64 ?? 0
                    self.debug("File size: \(fileSize) bytes", function: "DispatchQueue.asyncAfter")
                } catch {
                    self.debug("Error getting file size: \(error)", function: "DispatchQueue.asyncAfter")
                }

                let durationSeconds = CMTimeGetSeconds(AVAsset(url: fileURL).duration)
                self.debug("Audio duration: \(durationSeconds) seconds", function: "DispatchQueue.asyncAfter")
                self.updateResultText("Speech synthesis completed in \(elapsed) seconds. Audio Duration: \(String(format: "%.2f", durationSeconds)) seconds, Size: \(FileManager.default.sizeFormatted(ofPath: fileURL.path) ?? "Unknown")")
            }

        default:
            debug("Speech synthesis failed with reason: \(result.reason)", function: #function)
            updateResultText("Speech synthesis error.")
        }
    }

    /// Creates a push stream that appends every synthesized chunk to a freshly emptied file.
    ///
    /// - Returns: The stream, or `nil` after reporting the failure via `updateResultText`.
    private func makeFileOutputStream(writingTo fileURL: URL) -> SPXPushAudioOutputStream? {
        debug("File path: \(fileURL.path)", function: #function)

        // createFile(atPath:) replaces the contents of an existing file. Opening an
        // old file for writing without truncating it would leave stale audio after
        // the new data whenever the new synthesis is shorter than the previous one.
        guard FileManager.default.createFile(atPath: fileURL.path, contents: nil, attributes: nil),
              let fileHandle = try? FileHandle(forWritingTo: fileURL) else {
            debug("Failed to open file handle", function: #function)
            updateResultText("Failed to open file at \(fileURL.path)")
            return nil
        }
        debug("File handle opened successfully", function: #function)

        var totalBytesWritten: UInt = 0
        let stream = SPXPushAudioOutputStream(writeHandler: { data -> UInt in
            do {
                // The throwing variant reports I/O failures as Swift errors instead
                // of raising an Objective-C exception that Swift cannot catch.
                try fileHandle.write(contentsOf: data)
            } catch {
                self.debug("Write failed: \(error)", function: "SPXPushAudioOutputStream.writeHandler")
                // NOTE(review): assumes the SDK treats the return value as "bytes consumed" — confirm.
                return 0
            }
            totalBytesWritten += UInt(data.count)
            self.debug("Wrote \(data.count) bytes. Total: \(totalBytesWritten) bytes", function: "SPXPushAudioOutputStream.writeHandler")
            return UInt(data.count)
        }, closeHandler: {
            try? fileHandle.close()
            self.debug("File closed. Total bytes written: \(totalBytesWritten)", function: "SPXPushAudioOutputStream.closeHandler")
        })

        if stream == nil {
            try? fileHandle.close()
            debug("Failed to create push audio output stream", function: #function)
            updateResultText("Speech Config Error")
        }
        return stream
    }

    /// Publishes `text` as the status line on the main queue and derives
    /// `synthesisCompleted` from it.
    private func updateResultText(_ text: String) {
        DispatchQueue.main.async {
            self.resultText = text
            debug("Updated result text: \(text)", function: #function)
            self.synthesisCompleted = text.contains("Speech synthesis completed")
            debug("Synthesis completed: \(self.synthesisCompleted)", function: #function)
        }
    }

    /// Toggles playback of the synthesized file.
    private func playAudio() {
        if isPlaying {
            audioPlayer?.stop()
            isPlaying = false
            debug("Audio playback stopped", function: #function)
            return
        }

        let fileURL = outputFileURL
        debug("Attempting to play audio from: \(fileURL.path)", function: #function)

        do {
            let player = try AVAudioPlayer(contentsOf: fileURL)
            // Without a delegate nothing resets `isPlaying` when the audio ends,
            // leaving the button stuck on "Stop".
            playbackObserver.onFinish = {
                DispatchQueue.main.async { self.isPlaying = false }
            }
            player.delegate = playbackObserver
            audioPlayer = player

            guard player.play() else {
                updateResultText("Error playing audio: playback could not be started")
                debug("AVAudioPlayer.play() returned false", function: #function)
                return
            }
            isPlaying = true
            debug("Audio playback started", function: #function)
            debug("Audio duration: \(player.duration) seconds", function: #function)
        } catch {
            updateResultText("Error playing audio: \(error.localizedDescription)")
            debug("Detailed error playing audio: \(error)", function: #function)
        }
    }

    /// Prints `message` prefixed with a medium-style timestamp and the reporting function's name.
    private func debug(_ message: String, function: String) {
        let timestamp = DateFormatter.localizedString(from: Date(), dateStyle: .none, timeStyle: .medium)
        print("[\(timestamp)] [\(function)] \(message)")
    }
}

// Add this extension for formatting file size
extension FileManager {
    /// Returns the size of the item at `path` as a human-readable string in the
    /// `.file` count style, or `nil` when the item's attributes cannot be read.
    /// A missing or non-integer size attribute is formatted as zero bytes.
    func sizeFormatted(ofPath path: String) -> String? {
        let itemAttributes: [FileAttributeKey: Any]
        do {
            itemAttributes = try attributesOfItem(atPath: path)
        } catch {
            return nil
        }

        let byteCount: Int64
        if let recordedSize = itemAttributes[.size] as? Int64 {
            byteCount = recordedSize
        } else {
            byteCount = 0
        }
        return ByteCountFormatter.string(fromByteCount: byteCount, countStyle: .file)
    }
}

However I cannot for the life of me figure out how I would go about streaming it. I have very little knowledge of AVPlayer, so that obviously doesn't help, but I tried using every approach I could find while browsing the net... any pointers to potential solutions would be highly appreciated!

Solution

To stream audio generated from the Speech SDK using SPXPushAudioOutputStream, you can modify your existing code to play the audio as it is being streamed.

I have configured the SPXPushAudioOutputStream to feed its data into an AVAudioEngine for real-time playback.

/// Synthesizes `inputText` and plays the audio through `audioEngine` while it
/// is still arriving from the Speech SDK.
///
/// The SDK is asked for headerless 16 kHz, 16-bit mono PCM. Each chunk delivered
/// to the push stream is converted with `convertDataToPCMBuffer(data:format:)`
/// and queued on `audioPlayerNode`. The blocking `speakText` call runs on a
/// background queue so the caller is not stalled for the whole synthesis.
private func synthesisToPushAudioOutputStream() {
    let startTime = Date()
    let text = inputText // snapshot before leaving the calling thread
    debug("Starting speech synthesis...", function: #function)

    guard let audioEngine = audioEngine else {
        debug("Audio engine is not initialized", function: #function)
        updateResultText("Audio Engine Error")
        return
    }
    let playerNode = audioPlayerNode

    // The player node must be connected with the format of the buffers it is fed,
    // not the mixer's hardware format: Float32 at the stream's 16 kHz / 1 channel.
    // The main mixer converts to the output sample rate and channel count.
    guard let format = AVAudioFormat(commonFormat: .pcmFormatFloat32,
                                     sampleRate: 16_000,
                                     channels: 1,
                                     interleaved: false) else {
        debug("Failed to create playback format", function: #function)
        updateResultText("Audio Engine Error")
        return
    }

    // Prepare audio engine and player node (attach only once across repeated runs).
    if !audioEngine.attachedNodes.contains(playerNode) {
        audioEngine.attach(playerNode)
    }
    audioEngine.connect(playerNode, to: audioEngine.mainMixerNode, format: format)

    guard let stream = SPXPushAudioOutputStream(writeHandler: { data -> UInt in
        if let pcmBuffer = self.convertDataToPCMBuffer(data: data, format: format) {
            playerNode.scheduleBuffer(pcmBuffer, completionHandler: nil)
        }
        return UInt(data.count)
    }, closeHandler: {
        // The stream closing means synthesis is done, not playback: buffers are
        // usually still queued. Queue a short silent sentinel and stop the engine
        // only once that sentinel has actually been played back.
        // NOTE(review): a new run started before the sentinel plays will be cut
        // short by this stop — serialize runs in the caller if that matters.
        let stopEngine = {
            DispatchQueue.main.async {
                audioEngine.stop()
                self.debug("Audio engine stopped", function: "SPXPushAudioOutputStream.closeHandler")
            }
        }
        let sentinelFrames: AVAudioFrameCount = 1_600 // 0.1 s at 16 kHz
        guard let sentinel = AVAudioPCMBuffer(pcmFormat: format, frameCapacity: sentinelFrames),
              let samples = sentinel.floatChannelData?[0] else {
            stopEngine()
            return
        }
        sentinel.frameLength = sentinelFrames
        samples.initialize(repeating: 0, count: Int(sentinelFrames))
        playerNode.scheduleBuffer(sentinel, at: nil, options: [], completionCallbackType: .dataPlayedBack) { _ in
            stopEngine()
        }
    }) else {
        debug("Failed to create push audio output stream", function: #function)
        updateResultText("Speech Config Error")
        return
    }

    let audioConfig = try? SPXAudioConfiguration(streamOutput: stream)
    let speechConfig = try? SPXSpeechConfiguration(subscription: speechKey, region: serviceRegion)

    guard let config = speechConfig, let audio = audioConfig else {
        debug("Failed to create speech or audio configuration", function: #function)
        updateResultText("Speech Config Error")
        return
    }

    // Headerless PCM, so every chunk handed to the write handler is pure sample data.
    config.setSpeechSynthesisOutputFormat(.raw16Khz16BitMonoPcm)
    debug("Set output format to PCM", function: #function)

    guard let synth = try? SPXSpeechSynthesizer(speechConfiguration: config, audioConfiguration: audio) else {
        debug("Failed to create speech synthesizer", function: #function)
        updateResultText("Speech Synthesis Error")
        return
    }

    // Nothing is audible unless the engine is running and the node is playing.
    do {
        try audioEngine.start()
    } catch {
        debug("Failed to start audio engine: \(error)", function: #function)
        updateResultText("Audio Engine Error")
        return
    }
    playerNode.play()

    updateResultText("Synthesizing...")
    debug("Starting text-to-speech...", function: #function)

    DispatchQueue.global(qos: .userInitiated).async {
        guard let result = try? synth.speakText(text) else {
            self.debug("Speech synthesis failed (no result)", function: "synthesisToPushAudioOutputStream()")
            self.updateResultText("Speech synthesis error.")
            return
        }

        if result.reason == SPXResultReason.canceled {
            // The details lookup can fail too; do not crash while reporting an error.
            let details = try? SPXSpeechSynthesisCancellationDetails(fromCanceledSynthesisResult: result)
            let reason = details?.errorDetails ?? "Unknown error"
            self.debug("Speech synthesis canceled: \(reason)", function: "synthesisToPushAudioOutputStream()")
            self.updateResultText("Canceled: \(reason)")
        } else if result.reason == SPXResultReason.synthesizingAudioCompleted {
            let synthesisTime = Date().timeIntervalSince(startTime)
            let elapsed = String(format: "%.2f", synthesisTime)
            self.debug("Speech synthesis completed successfully in \(elapsed) seconds", function: "synthesisToPushAudioOutputStream()")
            self.updateResultText("Speech synthesis completed in \(elapsed) seconds.")
            DispatchQueue.main.async {
                self.synthesisCompleted = true
            }
        } else {
            self.debug("Speech synthesis failed with reason: \(result.reason)", function: "synthesisToPushAudioOutputStream()")
            self.updateResultText("Speech synthesis error.")
        }
    }
}

/// Converts one chunk of raw synthesized audio into a buffer in `format`.
///
/// The chunk is interpreted as headerless, interleaved, signed 16-bit mono PCM
/// in native byte order, which is what the caller requests from the Speech SDK.
/// Copying those bytes straight into Float32 channel memory would reinterpret
/// integers as floats and play as noise, so the samples go through
/// `AVAudioConverter`, which also handles a differing sample rate or channel
/// count in `format`.
///
/// - Parameters:
///   - data: Raw PCM bytes; a trailing odd byte is ignored.
///   - format: Format the returned buffer must have (the format the player node is connected with).
///   - sourceSampleRate: Sample rate of `data` in Hz. Defaults to 16 kHz.
/// - Returns: The converted buffer, or `nil` when `data` holds no complete sample
///   or the conversion fails.
/// - Note: A fresh converter is created per chunk, so resampler state is not
///   carried across chunks. Pass a `format` with `sourceSampleRate` as its
///   sample rate to avoid resampling here.
private func convertDataToPCMBuffer(data: Data,
                                    format: AVAudioFormat,
                                    sourceSampleRate: Double = 16_000) -> AVAudioPCMBuffer? {
    let sampleCount = data.count / MemoryLayout<Int16>.size
    guard sampleCount > 0,
          let sourceFormat = AVAudioFormat(commonFormat: .pcmFormatInt16,
                                           sampleRate: sourceSampleRate,
                                           channels: 1,
                                           interleaved: true),
          let sourceBuffer = AVAudioPCMBuffer(pcmFormat: sourceFormat,
                                              frameCapacity: AVAudioFrameCount(sampleCount)),
          let sourceSamples = sourceBuffer.int16ChannelData?[0] else {
        return nil
    }
    sourceBuffer.frameLength = AVAudioFrameCount(sampleCount)
    _ = data.copyBytes(to: UnsafeMutableBufferPointer(start: sourceSamples, count: sampleCount))

    guard let converter = AVAudioConverter(from: sourceFormat, to: format) else {
        return nil
    }

    // Size the output for the resampling ratio, with slack for rounding.
    let ratio = format.sampleRate / sourceSampleRate
    let outputCapacity = AVAudioFrameCount((Double(sampleCount) * ratio).rounded(.up)) + 32
    guard let outputBuffer = AVAudioPCMBuffer(pcmFormat: format, frameCapacity: outputCapacity) else {
        return nil
    }

    // Hand the source buffer over exactly once, then signal end of stream so the
    // converter flushes everything it has buffered for this chunk.
    var hasSuppliedInput = false
    var conversionError: NSError?
    let status = converter.convert(to: outputBuffer, error: &conversionError) { _, inputStatus in
        if hasSuppliedInput {
            inputStatus.pointee = .endOfStream
            return nil
        }
        hasSuppliedInput = true
        inputStatus.pointee = .haveData
        return sourceBuffer
    }

    guard status != .error, conversionError == nil, outputBuffer.frameLength > 0 else {
        return nil
    }
    return outputBuffer
}

import UIKit
import MicrosoftCognitiveServicesSpeech

// Name of the bundle resource that ViewController resolves to an absolute path
// and loads the embedded speech configuration from.
let EmbeddedSpeechSynthesisVoicesFolderName = "TTS"
// Voice name handed to setSpeechSynthesisVoice(_:key:).
// NOTE(review): looks like a placeholder that must be replaced with a real voice name — confirm.
let EmbeddedSpeechSynthesisVoiceName = "YourEmbeddedSpeechSynthesisVoiceName"
// Key handed to setSpeechSynthesisVoice(_:key:) together with the voice name.
// NOTE(review): looks like a placeholder; avoid committing the real key to source control.
let EmbeddedSpeechSynthesisVoiceKey = "YourEmbeddedSpeechSynthesisVoiceKey"

/// Minimal screen with a text field and a button that synthesizes the typed
/// text with an embedded (on-device) voice and saves the audio as `output.wav`
/// in the app's Documents directory.
class ViewController: UIViewController, UITextFieldDelegate {

    var textField: UITextField!
    var synthButton: UIButton!

    /// Mirror of the text field's content, kept current by the text field delegate.
    var inputText: String!
    /// Embedded speech configuration; stays `nil` when the voice could not be located or loaded.
    var embeddedSpeechConfig: SPXEmbeddedSpeechConfiguration?

    override func viewDidLoad() {
        super.viewDidLoad()

        let bundle = Bundle(for: type(of: self))
        if let absoluteModelPath = bundle.path(forResource: EmbeddedSpeechSynthesisVoicesFolderName, ofType: nil) {
            do {
                embeddedSpeechConfig = try SPXEmbeddedSpeechConfiguration(fromPath: absoluteModelPath)
                embeddedSpeechConfig?.setSpeechSynthesisVoice(EmbeddedSpeechSynthesisVoiceName, key: EmbeddedSpeechSynthesisVoiceKey)
            } catch {
                print("Error: \(error) in initializing embedded speech configuration.")
                embeddedSpeechConfig = nil
            }
        } else {
            print("Error: Unable to locate the specified embedded speech synthesis voice.")
        }

        setupUI()
    }

    /// Builds the text field and the Synthesize button with fixed frames and adds them to the view.
    func setupUI() {
        textField = UITextField(frame: CGRect(x: 100, y: 250, width: 200, height: 50))
        textField.textColor = UIColor.black
        textField.borderStyle = UITextField.BorderStyle.roundedRect
        textField.placeholder = "Type something to synthesize."
        textField.delegate = self

        inputText = ""

        synthButton = UIButton(frame: CGRect(x: 100, y: 400, width: 200, height: 50))
        synthButton.setTitle("Synthesize", for: .normal)
        synthButton.addTarget(self, action: #selector(synthesisButtonClicked), for: .touchUpInside)
        synthButton.setTitleColor(UIColor.black, for: .normal)

        self.view.addSubview(textField)
        self.view.addSubview(synthButton)
    }

    /// Keeps `inputText` in sync by applying the pending edit to the field's current text.
    func textField(_ textField: UITextField, shouldChangeCharactersIn range: NSRange, replacementString string: String) -> Bool {
        // The NSRange must be converted against the same string it is applied to.
        if let demotext = textField.text, let textRange = Range(range, in: demotext) {
            self.inputText = demotext.replacingCharacters(in: textRange, with: string)
        }
        return true
    }

    /// Button action: snapshots the text on the main thread and synthesizes it in the background.
    @objc func synthesisButtonClicked() {
        // `inputText` is mutated on the main thread, so read it here rather than on the worker queue.
        let text = inputText ?? ""
        DispatchQueue.global(qos: .userInitiated).async { [weak self] in
            self?.synthesisToWAV(text: text)
        }
    }

    /// Synthesizes text with the embedded voice and writes the resulting audio to
    /// `output.wav` in the Documents directory.
    ///
    /// - Parameter text: Text to speak; when `nil`, the current `inputText` is used.
    ///
    /// Does nothing for empty text. All failures (missing configuration,
    /// synthesizer creation, cancellation, file write) are printed, not thrown.
    /// NOTE(review): assumes the synthesizer's default output format is a RIFF/WAV
    /// container so `audioData` is a valid .wav file — confirm for embedded voices.
    func synthesisToWAV(text: String? = nil) {
        let utterance = text ?? inputText ?? ""
        guard !utterance.isEmpty else {
            return
        }
        // viewDidLoad leaves the configuration nil when the voice failed to load.
        guard let embeddedSpeechConfig = embeddedSpeechConfig else {
            print("Error: Embedded speech configuration is not available.")
            return
        }

        do {
            let synthesizer = try SPXSpeechSynthesizer(embeddedSpeechConfiguration: embeddedSpeechConfig)
            let result = try synthesizer.speakText(utterance)

            if result.reason == SPXResultReason.canceled {
                let details = try? SPXSpeechSynthesisCancellationDetails(fromCanceledSynthesisResult: result)
                print("Error: Speech synthesis canceled: \(details?.errorDetails ?? "Unknown error")")
                return
            }

            guard let audioData = result.audioData, !audioData.isEmpty else {
                print("Error: Audio data is nil.")
                return
            }

            let fileURL = FileManager.default
                .urls(for: .documentDirectory, in: .userDomainMask)[0]
                .appendingPathComponent("output.wav")

            do {
                try audioData.write(to: fileURL, options: .atomic)
                print("Speech synthesized and saved to: \(fileURL)")
            } catch {
                print("Error writing file: \(error)")
            }
        } catch {
            print("Error synthesizing speech: \(error)")
        }
    }
}

enter image description here

The synthesisToWAV() method synthesizes text to speech using SPXSpeechSynthesizer and saves the synthesized audio as a .wav file.

enter image description here

samples_swift_ios.wav:

enter image description here

I based this code on this Git repository's Swift text-to-speech sample.