diff --git a/examples/realtime/demo.py b/examples/realtime/demo.py index 7b98850b2..3db051963 100644 --- a/examples/realtime/demo.py +++ b/examples/realtime/demo.py @@ -38,6 +38,12 @@ def get_weather(city: str) -> str: ) +def _truncate_str(s: str, max_length: int) -> str: + if len(s) > max_length: + return s[:max_length] + "..." + return s + + class Example: def __init__(self) -> None: self.ui = AppUI() @@ -70,33 +76,38 @@ async def on_audio_recorded(self, audio_bytes: bytes) -> None: await self.session.send_audio(audio_bytes) async def _on_event(self, event: RealtimeSessionEvent) -> None: - if event.type == "agent_start": - self.ui.add_transcript(f"Agent started: {event.agent.name}") - elif event.type == "agent_end": - self.ui.add_transcript(f"Agent ended: {event.agent.name}") - elif event.type == "handoff": - self.ui.add_transcript(f"Handoff from {event.from_agent.name} to {event.to_agent.name}") - elif event.type == "tool_start": - self.ui.add_transcript(f"Tool started: {event.tool.name}") - elif event.type == "tool_end": - self.ui.add_transcript(f"Tool ended: {event.tool.name}; output: {event.output}") - elif event.type == "audio_end": - self.ui.add_transcript("Audio ended") - elif event.type == "audio": - np_audio = np.frombuffer(event.audio.data, dtype=np.int16) - self.ui.play_audio(np_audio) - elif event.type == "audio_interrupted": - self.ui.add_transcript("Audio interrupted") - elif event.type == "error": - self.ui.add_transcript(f"Error: {event.error}") - elif event.type == "history_updated": - pass - elif event.type == "history_added": - pass - elif event.type == "raw_model_event": - self.ui.log_message(f"Raw model event: {event.data}") - else: - self.ui.log_message(f"Unknown event type: {event.type}") + try: + if event.type == "agent_start": + self.ui.add_transcript(f"Agent started: {event.agent.name}") + elif event.type == "agent_end": + self.ui.add_transcript(f"Agent ended: {event.agent.name}") + elif event.type == "handoff": + self.ui.add_transcript( + f"Handoff from {event.from_agent.name} to {event.to_agent.name}" + ) + elif event.type == "tool_start": + self.ui.add_transcript(f"Tool started: {event.tool.name}") + elif event.type == "tool_end": + self.ui.add_transcript(f"Tool ended: {event.tool.name}; output: {event.output}") + elif event.type == "audio_end": + self.ui.add_transcript("Audio ended") + elif event.type == "audio": + np_audio = np.frombuffer(event.audio.data, dtype=np.int16) + self.ui.play_audio(np_audio) + elif event.type == "audio_interrupted": + self.ui.add_transcript("Audio interrupted") + elif event.type == "error": + pass + elif event.type == "history_updated": + pass + elif event.type == "history_added": + pass + elif event.type == "raw_model_event": + self.ui.log_message(f"Raw model event: {_truncate_str(str(event.data), 50)}") + else: + self.ui.log_message(f"Unknown event type: {event.type}") + except Exception as e: + self.ui.log_message(f"Error processing event: {_truncate_str(str(e), 50)}") if __name__ == "__main__": diff --git a/src/agents/realtime/__init__.py b/src/agents/realtime/__init__.py index 912e310a8..0e3e12f75 100644 --- a/src/agents/realtime/__init__.py +++ b/src/agents/realtime/__init__.py @@ -31,7 +31,6 @@ RealtimeToolStart, ) from .items import ( - AssistantAudio, AssistantMessageItem, AssistantText, InputAudio, @@ -123,7 +122,6 @@ "RealtimeToolEnd", "RealtimeToolStart", # Items - "AssistantAudio", "AssistantMessageItem", "AssistantText", "InputAudio", diff --git a/src/agents/realtime/items.py b/src/agents/realtime/items.py index a027b2a39..a835e7a88 100644 --- a/src/agents/realtime/items.py +++ b/src/agents/realtime/items.py @@ -7,7 +7,7 @@ class InputText(BaseModel): type: Literal["input_text"] = "input_text" - text: str + text: str | None = None # Allow extra data model_config = ConfigDict(extra="allow") @@ -24,16 +24,7 @@ class InputAudio(BaseModel): class AssistantText(BaseModel): type: Literal["text"] = "text" - text: str - - # Allow extra data - model_config = ConfigDict(extra="allow") - - -class AssistantAudio(BaseModel): - type: Literal["audio"] = "audio" - audio: str | None = None - transcript: str | None = None + text: str | None = None # Allow extra data model_config = ConfigDict(extra="allow") @@ -55,7 +46,7 @@ class UserMessageItem(BaseModel): previous_item_id: str | None = None type: Literal["message"] = "message" role: Literal["user"] = "user" - content: list[InputText | InputAudio] + content: list[Annotated[InputText | InputAudio, Field(discriminator="type")]] # Allow extra data model_config = ConfigDict(extra="allow") @@ -67,7 +58,7 @@ class AssistantMessageItem(BaseModel): type: Literal["message"] = "message" role: Literal["assistant"] = "assistant" status: Literal["in_progress", "completed", "incomplete"] | None = None - content: list[AssistantText | AssistantAudio] + content: list[AssistantText] # Allow extra data model_config = ConfigDict(extra="allow") diff --git a/src/agents/realtime/openai_realtime.py b/src/agents/realtime/openai_realtime.py index b79227d2e..1c4a4de3c 100644 --- a/src/agents/realtime/openai_realtime.py +++ b/src/agents/realtime/openai_realtime.py @@ -388,15 +388,8 @@ async def _handle_conversation_item( self, item: ConversationItem, previous_item_id: str | None ) -> None: """Handle conversation item creation/retrieval events.""" - message_item: RealtimeMessageItem = TypeAdapter(RealtimeMessageItem).validate_python( - { - "item_id": item.id or "", - "previous_item_id": previous_item_id, - "type": item.type, - "role": item.role, - "content": item.content, - "status": "in_progress", - } + message_item = _ConversionHelper.conversation_item_to_realtime_message_item( + item, previous_item_id ) await self._emit_event(RealtimeModelItemUpdatedEvent(item=message_item)) @@ -418,6 +411,8 @@ async def _cancel_response(self) -> None: async def _handle_ws_event(self, event: dict[str, Any]): try: + if "previous_item_id" in event and event["previous_item_id"] is None: + event["previous_item_id"] = "" # TODO (rm) remove parsed: OpenAIRealtimeServerEvent = TypeAdapter( OpenAIRealtimeServerEvent ).validate_python(event) @@ -465,7 +460,8 @@ async def _handle_ws_event(self, event: dict[str, Any]): previous_item_id = ( parsed.previous_item_id if parsed.type == "conversation.item.created" else None ) - await self._handle_conversation_item(parsed.item, previous_item_id) + if parsed.item.type == "message": + await self._handle_conversation_item(parsed.item, previous_item_id) elif ( parsed.type == "conversation.item.input_audio_transcription.completed" or parsed.type == "conversation.item.truncated" @@ -567,3 +563,22 @@ def _tools_to_session_tools(self, tools: list[Tool]) -> list[OpenAISessionTool]: ) ) return converted_tools + + +class _ConversionHelper: + @classmethod + def conversation_item_to_realtime_message_item( + cls, item: ConversationItem, previous_item_id: str | None + ) -> RealtimeMessageItem: + return TypeAdapter(RealtimeMessageItem).validate_python( + { + "item_id": item.id or "", + "previous_item_id": previous_item_id, + "type": item.type, + "role": item.role, + "content": ( + [content.model_dump() for content in item.content] if item.content else [] + ), + "status": "in_progress", + }, + ) diff --git a/tests/realtime/test_item_parsing.py b/tests/realtime/test_item_parsing.py new file mode 100644 index 000000000..ba128f7fd --- /dev/null +++ b/tests/realtime/test_item_parsing.py @@ -0,0 +1,80 @@ +from openai.types.beta.realtime.conversation_item import ConversationItem +from openai.types.beta.realtime.conversation_item_content import ConversationItemContent + +from agents.realtime.items import ( + AssistantMessageItem, + RealtimeMessageItem, + SystemMessageItem, + UserMessageItem, +) +from agents.realtime.openai_realtime import _ConversionHelper + + +def test_user_message_conversion() -> None: + item = ConversationItem( + id="123", + type="message", + role="user", + content=[ + ConversationItemContent( + id=None, audio=None, text=None, transcript=None, type="input_text" + ) + ], + ) + + converted: RealtimeMessageItem = _ConversionHelper.conversation_item_to_realtime_message_item( + item, None + ) + + assert isinstance(converted, UserMessageItem) + + item = ConversationItem( + id="123", + type="message", + role="user", + content=[ + ConversationItemContent( + id=None, audio=None, text=None, transcript=None, type="input_audio" + ) + ], + ) + + converted = _ConversionHelper.conversation_item_to_realtime_message_item(item, None) + + assert isinstance(converted, UserMessageItem) + + +def test_assistant_message_conversion() -> None: + item = ConversationItem( + id="123", + type="message", + role="assistant", + content=[ + ConversationItemContent(id=None, audio=None, text=None, transcript=None, type="text") + ], + ) + + converted: RealtimeMessageItem = _ConversionHelper.conversation_item_to_realtime_message_item( + item, None + ) + + assert isinstance(converted, AssistantMessageItem) + + +def test_system_message_conversion() -> None: + item = ConversationItem( + id="123", + type="message", + role="system", + content=[ + ConversationItemContent( + id=None, audio=None, text=None, transcript=None, type="input_text" + ) + ], + ) + + converted: RealtimeMessageItem = _ConversionHelper.conversation_item_to_realtime_message_item( + item, None + ) + + assert isinstance(converted, SystemMessageItem) diff --git a/tests/realtime/test_session.py b/tests/realtime/test_session.py index b903ee951..4cc0dae6b 100644 --- a/tests/realtime/test_session.py +++ b/tests/realtime/test_session.py @@ -293,7 +293,7 @@ async def test_item_updated_event_updates_existing_item(self, mock_model, mock_a # Check that item was updated assert len(session._history) == 1 updated_item = cast(AssistantMessageItem, session._history[0]) - assert cast(AssistantText, updated_item.content[0]).text == "Updated" + assert updated_item.content[0].text == "Updated" # Should have 2 events: raw + history updated (not added) assert session._event_queue.qsize() == 2 @@ -524,7 +524,7 @@ def test_update_existing_item_by_id(self): # Item should be updated result_item = cast(AssistantMessageItem, new_history[0]) assert result_item.item_id == "item_1" - assert cast(AssistantText, result_item.content[0]).text == "Updated" + assert result_item.content[0].text == "Updated" def test_update_existing_item_preserves_order(self): """Test that updating existing item preserves its position in history""" @@ -557,13 +557,13 @@ def test_update_existing_item_preserves_order(self): # Middle item should be updated updated_result = cast(AssistantMessageItem, new_history[1]) - assert cast(AssistantText, updated_result.content[0]).text == "Updated Second" + assert updated_result.content[0].text == "Updated Second" # Other items should be unchanged item1_result = cast(AssistantMessageItem, new_history[0]) item3_result = cast(AssistantMessageItem, new_history[2]) - assert cast(AssistantText, item1_result.content[0]).text == "First" - assert cast(AssistantText, item3_result.content[0]).text == "Third" + assert item1_result.content[0].text == "First" + assert item3_result.content[0].text == "Third" def test_insert_new_item_after_previous_item(self): """Test inserting new item after specified previous_item_id""" @@ -598,7 +598,7 @@ def test_insert_new_item_after_previous_item(self): # Content should be correct item2_result = cast(AssistantMessageItem, new_history[1]) - assert cast(AssistantText, item2_result.content[0]).text == "Second" + assert item2_result.content[0].text == "Second" def test_insert_new_item_after_nonexistent_previous_item(self): """Test that item with nonexistent previous_item_id gets added to end""" @@ -701,7 +701,7 @@ def test_complex_insertion_scenario(self): assert len(history) == 4 assert [item.item_id for item in history] == ["A", "B", "D", "C"] itemB_result = cast(AssistantMessageItem, history[1]) - assert cast(AssistantText, itemB_result.content[0]).text == "Updated B" + assert itemB_result.content[0].text == "Updated B" # Test 3: Tool call execution flow (_handle_tool_call method)