Search code examples
python, twilio, audio-streaming, twilio-api, openai-whisper

How to stream data from Twilio to openai Whisper


With the code below, the result always ends up being the same: "Thank you." Any ideas what could be going wrong? For reference, I used the load_audio function in the whisper package and an article about Twilio and Vosk.

@sock.route('/stream')
def stream(ws):
    """Receive stream messages on ``/stream`` and forward audio to ``add_audio``.

    Each message is expected to be a JSON document with an ``event`` field.
    For ``media`` events the base64-encoded ``media.payload`` is decoded and
    handed to ``add_audio``; every other event is ignored.

    Args:
        ws: The WebSocket connection object supplied by the route decorator.
    """
    while True:
        message = ws.receive()
        packet = json.loads(message)
        # This check must live inside the loop: placed after it, the branch is
        # unreachable because ``while True`` never falls through.
        if packet['event'] == 'media':
            # The payload carries the raw audio bytes (u-law) as base64 text.
            audio = base64.b64decode(packet['media']['payload'])
            add_audio(audio)

# Accumulated audio samples (float32); starts out empty and is extended by
# add_audio as packets arrive.
buffer = np.zeros(0, dtype=np.float32)

def add_audio(audio):
    """Decode one mu-law audio packet, append it to ``buffer`` and transcribe.

    Twilio Media Streams deliver 8-bit G.711 mu-law audio sampled at 8 kHz,
    one byte per sample. Reinterpreting those bytes as 16-bit PCM produces
    noise, so the packet is first expanded to linear PCM, scaled to float32
    in [-1, 1), and upsampled to the 16 kHz rate Whisper works with.

    Args:
        audio: Raw (already base64-decoded) mu-law bytes of one media packet.
    """
    global buffer
    # G.711 stores every mu-law byte bit-inverted; undo that first, then widen
    # so the shifts below cannot overflow.
    ulaw = (~np.frombuffer(audio, dtype=np.uint8)).astype(np.int32)
    mantissa = ulaw & 0x0F
    exponent = (ulaw >> 4) & 0x07
    # Standard mu-law expansion: restore the bias (0x84), scale by the segment
    # exponent, then remove the bias again.
    magnitude = (((mantissa << 3) + 0x84) << exponent) - 0x84
    pcm = np.where(ulaw & 0x80, -magnitude, magnitude)
    samples = pcm.astype(np.float32) / 32768.0

    # Upsample 8 kHz -> 16 kHz by linear interpolation (two output samples per
    # input sample). np.interp rejects empty inputs, hence the guard.
    if samples.size:
        source_positions = np.arange(samples.size)
        target_positions = np.arange(2 * samples.size) / 2.0
        samples = np.interp(target_positions, source_positions, samples)
        samples = samples.astype(np.float32)

    buffer = np.concatenate((buffer, samples))
    process_audio()


def process_audio():
    """Run Whisper on the audio accumulated in the module-level ``buffer``.

    Returns:
        The result of ``whisper.decode`` for the current buffer contents, so
        the caller can actually use the transcription instead of it being
        computed and thrown away.
    """
    # The buffer is only read here, so no ``global`` declaration is needed.
    # pad_or_trim brings the audio to the fixed length the model expects.
    audio = whisper.pad_or_trim(buffer)
    mel = whisper.log_mel_spectrogram(audio).to(model.device)
    # Guard the decoder against NaNs in the spectrogram.
    mel = torch.nan_to_num(mel)
    return whisper.decode(model, mel, options)

Solution

  • Andreas' answer below is absolutely correct. Unfortunately, it was downvoted because it is not in English. Here is the updated code snippet that consumes a Twilio Media Stream and generates transcriptions with the OpenAI API.

    # Module-level state that websocket_endpoint updates through ``global``.
    # PCM bytes collected since the last transcription request.
    audio_data2 = b""
    # Most recent transcription, used to avoid printing the same one twice.
    transcription0 = ""
    # Running count of media messages; also used to name the temporary WAV file.
    i = 0
    
    @app.websocket("/v1/twilio/stream")
    async def websocket_endpoint(websocket: WebSocket):
        """Transcribe a Twilio Media Stream with the OpenAI transcription API.

        Accepts the WebSocket, then reads JSON messages until the stream ends.
        Audio from ``media`` events is converted from 8 kHz mu-law to 16 kHz
        16-bit mono PCM and accumulated in the global ``audio_data2``. Once
        more than 299999 bytes (a little over nine seconds) are buffered, the
        audio is written to a temporary WAV file, sent to the ``whisper-1``
        model, and the text is printed if it differs from the previous one.

        NOTE: audio still below the threshold when the stream ends is not
        transcribed.

        Args:
            websocket: The incoming WebSocket connection.
        """
        import base64
        import json
        import audioop
        import wave
        import os
        from openai import OpenAI
        from os import environ as env
        client = OpenAI(api_key=env["OPENAI_API_KEY"])

        global audio_data2
        global transcription0
        global i

        await websocket.accept()
        message_count = 0
        # ratecv returns its converter state; feeding it back in on the next
        # packet keeps the resampled audio continuous across packet borders.
        ratecv_state = None
        print("Connected to WebSocket")

        try:
            while True:
                # receive_text() always yields a str, so it can be parsed directly.
                message = await websocket.receive_text()
                data = json.loads(message)
                event = data['event']

                if event == "connected":
                    print("Connected Message received: {}".format(message))
                elif event == "start":
                    print("Start Message received: {}".format(message))
                elif event in ("stop", "closed"):
                    # Twilio announces the end of a stream with "stop";
                    # "closed" is still accepted for backward compatibility.
                    print("Closed Message received: {}".format(message))
                    break
                elif event == "media":
                    i = i + 1
                    payload = data['media']['payload']
                    # mu-law bytes -> 16-bit linear PCM -> 16 kHz.
                    pcm = audioop.ulaw2lin(base64.b64decode(payload), 2)
                    pcm, ratecv_state = audioop.ratecv(
                        pcm, 2, 1, 8000, 16000, ratecv_state)
                    audio_data2 = audio_data2 + pcm
                    if len(audio_data2) > 299999:
                        sondosiero = 'sono' + str(i) + '.wav'
                        # The context manager closes the WAV file on exit.
                        with wave.open(sondosiero, 'w') as wavfile:
                            wavfile.setnchannels(1)
                            wavfile.setsampwidth(2)
                            wavfile.setframerate(16000)
                            wavfile.writeframes(audio_data2)
                        try:
                            with open(sondosiero, "rb") as audio_file3:
                                transcription = client.audio.transcriptions.create(model="whisper-1", file=audio_file3)
                        except Exception as exc:
                            # Best effort: one failed chunk must not end the
                            # call, but the failure should not be silent.
                            print("Transcription failed: {}".format(exc))
                        else:
                            # Keep the plain text so transcription0 stays a str
                            # and the final summary shows readable text.
                            if transcription.text != transcription0:
                                print(transcription.text + ' ')
                                transcription0 = transcription.text
                        finally:
                            # Remove the temporary file whether or not the
                            # request succeeded.
                            os.remove(sondosiero)
                        audio_data2 = b''

                message_count += 1
        except WebSocketDisconnect:
            # The peer hung up; treat it like a normal end of stream.
            pass

        print(f"Connection closed. Transcription is {transcription0}. Received a total of {message_count} messages")
    

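    If you are using this in production, you will also need to handle the final portion of the audio: whatever is still in audio_data2 when the stream ends is shorter than the len(audio_data2) threshold and is therefore never transcribed.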