Search code examples
twilio · twilio-api

Dial Out during a Twilio Bidirectional Media Stream


At a high level, I would like to leverage Twilio's Dial functionality from within a Twilio bidirectional media stream, but it seems like Twilio's Media Streams functionality only supports the ability to send raw audio back to the caller.

Here's a bit more detail:

I currently have an app that uses a bidirectional media stream to use the OpenAI Realtime API to answer questions and concerns over a phone call (inspiration for my code here).

Here's the main bidirectional media stream code that receives audio and sends audio back:

@app.websocket("/media-stream")
async def handle_media_stream(websocket: WebSocket):
    """Handle WebSocket connections between Twilio and OpenAI."""
    print("Client connected")
    await websocket.accept()

    async with websockets.connect(
        "wss://api.openai.com/v1/realtime?model=gpt-4o-realtime-preview-2024-10-01",
        extra_headers={
            "Authorization": f"Bearer {CONFIG.api_key}",
            "OpenAI-Beta": "realtime=v1",
        },
    ) as openai_ws:
        await initialize_session(openai_ws)

        # Connection specific state
        stream_sid = None
        latest_media_timestamp = 0
        last_assistant_item = None
        mark_queue = []
        response_start_timestamp_twilio = None

        async def receive_from_twilio():
            """Receive audio data from Twilio and send it to the OpenAI Realtime API."""
            nonlocal stream_sid, latest_media_timestamp
            try:
                async for message in websocket.iter_text():
                    data = json.loads(message)
                    if data["event"] == "media" and openai_ws.open:
                        latest_media_timestamp = int(data["media"]["timestamp"])
                        audio_append = {
                            "type": "input_audio_buffer.append",
                            "audio": data["media"]["payload"],
                        }
                        await openai_ws.send(json.dumps(audio_append))
                    elif data["event"] == "start":
                        stream_sid = data["start"]["streamSid"]
                        print(f"Incoming stream has started {stream_sid}")
                        response_start_timestamp_twilio = None  # noqa: F841
                        latest_media_timestamp = 0
                        last_assistant_item = None  # noqa: F841
                    elif data["event"] == "mark":
                        if mark_queue:
                            mark_queue.pop(0)
            except WebSocketDisconnect:
                print("Client disconnected.")
                if openai_ws.open:
                    await openai_ws.close()

        async def send_to_twilio():
            """Receive events from the OpenAI Realtime API, send audio back to Twilio."""
            nonlocal stream_sid, last_assistant_item, response_start_timestamp_twilio
            try:
                async for openai_message in openai_ws:
                    response = json.loads(openai_message)
                    response_type = response.get("type")
                    if response_type in CONFIG.log_event_types:
                        # print(f"Received event: {response['type']}", response)
                        logging.info(f"Received event: {response['type']}")

                    match response_type:
                        case "response.audio.delta":
                            if "delta" not in response:
                                continue

                            audio_payload = base64.b64encode(
                                base64.b64decode(response["delta"])
                            ).decode("utf-8")
                            audio_delta = {
                                "event": "media",
                                "streamSid": stream_sid,
                                "media": {"payload": audio_payload},
                            }
                            await websocket.send_json(audio_delta)

                            if response_start_timestamp_twilio is None:
                                response_start_timestamp_twilio = latest_media_timestamp
                                if CONFIG.show_timing_math:
                                    print(
                                        f"Setting start timestamp for new response: {response_start_timestamp_twilio}ms"
                                    )

                            # Update last_assistant_item safely
                            if response.get("item_id"):
                                last_assistant_item = response["item_id"]

                            await send_mark(websocket, stream_sid)

                        # Trigger an interruption. Your use case might work better using `input_audio_buffer.speech_stopped`, or combining the two.
                        case "input_audio_buffer.speech_started":
                            print("Speech started detected.")
                            if last_assistant_item:
                                print(
                                    f"Interrupting response with id: {last_assistant_item}"
                                )
                                await handle_speech_started_event()

                        case "response.function_call_arguments.done":
                            # https://platform.openai.com/docs/api-reference/realtime-server-events/response/function_call_arguments/done
                            # TODO: eventually migrate domain model to voice/
                            event = FunctionCallArgumentsEvent(**response)
                            logging.info(
                                f"Calling {event.name=} with {event.arguments=}"
                            )
                            await call_tool(
                                event.call_id,
                                event.name,
                                json.loads(event.arguments),
                                openai_ws,
                            )

            except Exception as e:
                traceback.print_exc()
                print(f"Error in send_to_twilio: {e}")

The problem is that at some point if the caller is frustrated I would like to be able to dial them out to a phone number with a human. So basically something akin to

from twilio.twiml.voice_response import Dial, VoiceResponse, Say

response = VoiceResponse()
response.dial("111-111-1111")  # dial out to a human agent (placeholder number)
But the latter, which is TwiML, is — as I understand it — only usable within the webhook for an incoming call, not within the webhook for a bidirectional media stream. Does anyone know of a workaround, i.e. a way to programmatically dial out from within a bidirectional media stream?

Thank you for taking the time to read this.


Solution

  • If I understand this right, you want to opt out of the media stream entirely when handing over to a human agent, right?

    In this case, you can override the original TwiML of the call (that initiated the web socket stream) with your new TwiML. For this, you need to know the Call SID of the active call.

    # Download the helper library from https://www.twilio.com/docs/python/install
    import os
    from twilio.rest import Client
    
    # Find your Account SID and Auth Token at twilio.com/console
    # and set the environment variables. See http://twil.io/secure
    account_sid = os.environ["TWILIO_ACCOUNT_SID"]
    auth_token = os.environ["TWILIO_AUTH_TOKEN"]
    client = Client(account_sid, auth_token)
    
    # Replacing the TwiML of the in-progress call (identified by its Call SID)
    # ends the active media stream; put a <Dial> here to hand off to a human.
    call = client.calls("CAaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa").update(
        twiml="<Response><Say>Ahoy there</Say></Response>"
    )
    
    # Confirm which call was updated.
    print(call.sid)
    

    You can extract the Call SID (`start.callSid`) from the `start` message that is sent once the stream has been initiated.