At a high level, I would like to leverage Twilio's Dial functionality from within a Twilio bidirectional media stream, but it seems like Twilio's Media Streams functionality only supports the ability to send raw audio back to the caller.
Here's a bit more detail:
I currently have an app that connects a Twilio bidirectional media stream to the OpenAI Realtime API in order to answer questions and concerns over a phone call (inspiration for my code here).
Here's the main bidirectional media stream code that receives audio and sends audio back:
@app.websocket("/media-stream")
async def handle_media_stream(websocket: WebSocket):
"""Handle WebSocket connections between Twilio and OpenAI."""
print("Client connected")
await websocket.accept()
async with websockets.connect(
"wss://api.openai.com/v1/realtime?model=gpt-4o-realtime-preview-2024-10-01",
extra_headers={
"Authorization": f"Bearer {CONFIG.api_key}",
"OpenAI-Beta": "realtime=v1",
},
) as openai_ws:
await initialize_session(openai_ws)
# Connection specific state
stream_sid = None
latest_media_timestamp = 0
last_assistant_item = None
mark_queue = []
response_start_timestamp_twilio = None
async def receive_from_twilio():
"""Receive audio data from Twilio and send it to the OpenAI Realtime API."""
nonlocal stream_sid, latest_media_timestamp
try:
async for message in websocket.iter_text():
data = json.loads(message)
if data["event"] == "media" and openai_ws.open:
latest_media_timestamp = int(data["media"]["timestamp"])
audio_append = {
"type": "input_audio_buffer.append",
"audio": data["media"]["payload"],
}
await openai_ws.send(json.dumps(audio_append))
elif data["event"] == "start":
stream_sid = data["start"]["streamSid"]
print(f"Incoming stream has started {stream_sid}")
response_start_timestamp_twilio = None # noqa: F841
latest_media_timestamp = 0
last_assistant_item = None # noqa: F841
elif data["event"] == "mark":
if mark_queue:
mark_queue.pop(0)
except WebSocketDisconnect:
print("Client disconnected.")
if openai_ws.open:
await openai_ws.close()
async def send_to_twilio():
"""Receive events from the OpenAI Realtime API, send audio back to Twilio."""
nonlocal stream_sid, last_assistant_item, response_start_timestamp_twilio
try:
async for openai_message in openai_ws:
response = json.loads(openai_message)
response_type = response.get("type")
if response_type in CONFIG.log_event_types:
# print(f"Received event: {response['type']}", response)
logging.info(f"Received event: {response['type']}")
match response_type:
case "response.audio.delta":
if "delta" not in response:
continue
audio_payload = base64.b64encode(
base64.b64decode(response["delta"])
).decode("utf-8")
audio_delta = {
"event": "media",
"streamSid": stream_sid,
"media": {"payload": audio_payload},
}
await websocket.send_json(audio_delta)
if response_start_timestamp_twilio is None:
response_start_timestamp_twilio = latest_media_timestamp
if CONFIG.show_timing_math:
print(
f"Setting start timestamp for new response: {response_start_timestamp_twilio}ms"
)
# Update last_assistant_item safely
if response.get("item_id"):
last_assistant_item = response["item_id"]
await send_mark(websocket, stream_sid)
# Trigger an interruption. Your use case might work better using `input_audio_buffer.speech_stopped`, or combining the two.
case "input_audio_buffer.speech_started":
print("Speech started detected.")
if last_assistant_item:
print(
f"Interrupting response with id: {last_assistant_item}"
)
await handle_speech_started_event()
case "response.function_call_arguments.done":
# https://platform.openai.com/docs/api-reference/realtime-server-events/response/function_call_arguments/done
# TODO: eventually migrate domain model to voice/
event = FunctionCallArgumentsEvent(**response)
logging.info(
f"Calling {event.name=} with {event.arguments=}"
)
await call_tool(
event.call_id,
event.name,
json.loads(event.arguments),
openai_ws,
)
except Exception as e:
traceback.print_exc()
print(f"Error in send_to_twilio: {e}")
The problem is that at some point, if the caller is frustrated, I would like to be able to transfer them to a phone number answered by a human. So basically something akin to:
from twilio.twiml.voice_response import Dial, VoiceResponse, Say

# Build a TwiML voice response and add a dial instruction for the human's number.
response = VoiceResponse()
response.dial("111-111-1111")  # dial out to human
But the latter, which is TwiML, is only usable, as I understand it, within the webhook for an incoming call, not within the webhook for a bidirectional media stream. Does anyone know of a workaround, or more generally a way to programmatically dial out from within a bidirectional media stream?
Thank you for taking the time to read this.
If I understand this right, you want to opt out of the media stream entirely when handing over to a human agent, right?
In this case, you can override the original TwiML of the call (that initiated the web socket stream) with your new TwiML. For this, you need to know the Call SID of the active call.
# Download the helper library from https://www.twilio.com/docs/python/install
import os
from twilio.rest import Client

# Find your Account SID and Auth Token at twilio.com/console
# and set the environment variables. See http://twil.io/secure
account_sid = os.environ["TWILIO_ACCOUNT_SID"]
auth_token = os.environ["TWILIO_AUTH_TOKEN"]
client = Client(account_sid, auth_token)

# Update the call identified by this Call SID with new TwiML.
# The SID below is a placeholder; substitute the SID of the active call.
call = client.calls("CAaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa").update(
    twiml="<Response><Say>Ahoy there</Say></Response>"
)

print(call.sid)
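You can extract the Call SID from the `start` message that is sent once the stream has been initiated; it arrives in the same payload as the stream SID (i.e. `data["start"]["callSid"]` next to `data["start"]["streamSid"]`).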