Search code examples
iosswiftavfoundationopus

Invalid Frame Size Error when Encoding Audio Stream to Opus on iOS


I'm writing logic to encode an audio stream to opus (to send over a web socket); I keep getting an Invalid frame size: 4800. Must be one of [120, 240, 480, 960, 1920, 2880] error:

//
//  AudioManager.swift
//
//

import Foundation
import AVFoundation
import Opus

class AudioManager: ObservableObject {
    private var audioEngine: AVAudioEngine?
    private var audioInputNode: AVAudioInputNode?
    private weak var appManager: AppManager?
    private var canSendData: Bool = false  // Flag to control data sending
    private var audioBufferQueue: AVAudioPCMBuffer?
    private var opusEncoder: Opus.Encoder?

    init(appManager: AppManager) {
        self.appManager = appManager
        setupOpusEncoder()
        setupAudioSession()
        setupAudioEngine()
    }

    private func setupAudioSession() {
        let session = AVAudioSession.sharedInstance()
        do {
            try session.setPreferredSampleRate(48000)
            try session.setPreferredInputNumberOfChannels(1)
            try session.setCategory(.playAndRecord, mode: .default, options: [.defaultToSpeaker, .allowBluetooth])
            try session.setActive(true)
            print("Audio session setup complete with sample rate 48000 Hz and mono channel.")
        } catch {
            print("Failed to set up audio session: \(error)")
        }
    }

    func checkMicrophonePermission() -> Bool {
        return AVAudioSession.sharedInstance().recordPermission == .granted
    }

    func requestMicrophoneAccess(completion: @escaping (Bool) -> Void) {
        let audioSession = AVAudioSession.sharedInstance()
        
        switch audioSession.recordPermission {
        case .granted:
            completion(true)
        case .denied:
            completion(false)
        case .undetermined:
            audioSession.requestRecordPermission { granted in
                DispatchQueue.main.async {
                    completion(granted)
                }
            }
        @unknown default:
            completion(false)
        }
    }
    
    private func setupOpusEncoder() {
        // Define the parameters for the audio format
        let sampleRate = 48000.0  // Sample rate in Hz
        let channels = 1          // Number of audio channels

        // Attempt to create an AVAudioFormat instance
        guard let opusFormat = AVAudioFormat(opusPCMFormat: .float32, sampleRate: sampleRate, channels: AVAudioChannelCount(channels)) else {
            print("Invalid audio format parameters")
            return
        }

        do {
            // Create the Opus encoder with the valid audio format
            opusEncoder = try Opus.Encoder(format: opusFormat)
            print("Opus encoder successfully created")
        } catch {
            // Handle any errors that might occur during the encoder initialization
            print("Failed to create Opus encoder: \(error)")
        }
    }

    func setupAudioEngine() {
        audioEngine = AVAudioEngine()
        guard let audioEngine = audioEngine else {
            print("Audio engine could not be initialized")
            return
        }

        let inputNode = audioEngine.inputNode
        let mixerNode = AVAudioMixerNode()
        audioEngine.attach(mixerNode)

        // Choose an Opus-compatible buffer size
        let opusCompatibleBufferSize: AVAudioFrameCount = 960 // Choose based on your latency and quality requirements

        let desiredFormat = AVAudioFormat(standardFormatWithSampleRate: 48000, channels: 1)
        audioEngine.connect(inputNode, to: mixerNode, format: inputNode.inputFormat(forBus: 0))
        audioEngine.connect(mixerNode, to: audioEngine.mainMixerNode, format: desiredFormat)

        mixerNode.installTap(onBus: 0, bufferSize: opusCompatibleBufferSize, format: desiredFormat) { [weak self] (buffer, when) in
            self?.bufferAudioData(buffer)
        }

        do {
            try audioEngine.start()
            print("Audio engine started with desired format.")
        } catch {
            print("Failed to start audio engine: \(error)")
        }
    }

    private func bufferAudioData(_ buffer: AVAudioPCMBuffer) {
        guard let encoder = opusEncoder else {
            print("Opus encoder not initialized")
            return
        }

        // Validate buffer format again before attempting to encode
        if buffer.format.sampleRate != 48000 || buffer.format.channelCount != 1 {
            print("Buffer format mismatch: Expected 48000 Hz, 1 channel, but got \(buffer.format.sampleRate) Hz, \(buffer.format.channelCount) channels")
            return
        }

        // Ensure the buffer frame size is a valid Opus frame size
        let validFrameSizes = [120, 240, 480, 960, 1920, 2880] // Frame sizes for 48000 Hz
        guard validFrameSizes.contains(Int(buffer.frameLength)) else {
            print("Invalid frame size: \(buffer.frameLength). Must be one of \(validFrameSizes)")
            return
        }

        var opusData = Data() // Initialize an empty Data object to hold the encoded data.

        do {
            // Attempt to encode and capture the number of bytes encoded
            let bytesEncoded = try encoder.encode(buffer, to: &opusData)
            print("Encoded \(bytesEncoded) bytes of data.")

            if !opusData.isEmpty && canSendData {
                appManager?.webSocketManager.send(data: opusData) {
                    print("Opus encoded audio data sent.")
                }
            }
        } catch let error as Opus.Error {
            // Print the Opus error with its raw value and a possible interpretation
            print("Failed to encode audio: Opus Error \(error.rawValue) - \(interpretOpusError(error))")
        } catch {
            // This catches non-Opus errors
            print("Failed to encode audio: \(error)")
        }
    }

    /// Interprets Opus error codes into human-readable descriptions
    private func interpretOpusError(_ error: Opus.Error) -> String {
        switch error {
        case .ok:
            return "No error."
        case .badArgument:
            return "One or more invalid/out of range arguments."
        case .bufferTooSmall:
            return "The mode struct passed is invalid."
        case .internalError:
            return "An internal error was detected."
        case .invalidPacket:
            return "The compressed data passed is corrupted."
        case .unimplemented:
            return "Invalid/unsupported request number."
        case .invalidState:
            return "An encoder or decoder structure is invalid or already freed."
        case .allocationFailure:
            return "Memory allocation has failed."
        default:
            return "Unknown error."
        }
    }

    func startRecording() {
        print("Starting recording...")
        canSendData = true
        appManager?.webSocketManager.send(string: "{\"command\": \"aq_start\"}") {
            print("Sent start recording command.")
        }
    }

    func stopRecording() {
        print("Stopping recording...")
        canSendData = false
        appManager?.webSocketManager.send(string: "{\"command\": \"aq_stop\"}") {
            print("Sent stop recording command.")
        }
        audioEngine?.stop()
        print("Recording stopped.")
    }
}

Solution

  • Your AVAudioNode tap bufferSize is being ignored and you're getting 10ms chunks of audio, while the opus encoder wants smaller chunks.

    You need to break up the overly large audio buffers into Opus sized chunks.

    I have never seen an AVAudioNode tap honour its bufferSize parameter, but a header file comment says

    Supported range is [100, 400] ms

    so it's probable that I too have simply been choosing buffer sizes that were too small.

    But then the online documentation says

    The implementation may choose another size.

    which may explain why you didn't notice and my growing disenchantment with bufferSize during these last 10 years of AVAudioEngine.

    At 48kHz, the tap allegedly supports a bufferSize range of [4800, 19200] samples. If you were feeling lazy or just wanted a quick result, 19200 samples, which is a multiple of the Opus 1920 frame size, would mean you get no pesky remainder buffers, however as the doco says, the API is free to ignore bufferSize so for code that other people might see you really should do it right.