64bit · 64bit · Jun 29, 2025 · Jun 23, 2025 · Jun 23, 2025 · Jun 26, 2025
diff --git a/async-openai/src/types/realtime/response_resource.rs b/async-openai/src/types/realtime/response_resource.rs
@@ -40,6 +40,8 @@ pub enum ResponseStatusDetail {
     Incomplete { reason: IncompleteReason },
     #[serde(rename = "failed")]
     Failed { error: Option<FailedError> },
+    #[serde(rename = "cancelled")]
+    Cancelled { reason: String },
 }
 
 #[derive(Debug, Serialize, Deserialize, Clone)]

diff --git a/async-openai/src/types/realtime/server_event.rs b/async-openai/src/types/realtime/server_event.rs
@@ -83,6 +83,17 @@ pub struct ConversationItemCreatedEvent {
     pub item: Item,
 }
 
+#[derive(Debug, Serialize, Deserialize, Clone)]
+/// Log probability information for a transcribed token.
+pub struct LogProb {
+    /// Raw UTF-8 bytes for the token.
+    pub bytes: Vec<u8>,
+    /// The log probability of the token.
+    pub logprob: f64,
+    /// The token string.
+    pub token: String,
+}
+
 #[derive(Debug, Serialize, Deserialize, Clone)]
 pub struct ConversationItemInputAudioTranscriptionCompletedEvent {
     /// The unique ID of the server event.
@@ -93,6 +104,22 @@ pub struct ConversationItemInputAudioTranscriptionCompletedEvent {
     pub content_index: u32,
     /// The transcribed text.
     pub transcript: String,
+    /// Optional per-token log probability data.
+    pub logprobs: Option<Vec<LogProb>>,
+}
+
+#[derive(Debug, Serialize, Deserialize, Clone)]
+pub struct ConversationItemInputAudioTranscriptionDeltaEvent {
+    /// The unique ID of the server event.
+    pub event_id: String,
+    /// The ID of the user message item.
+    pub item_id: String,
+    /// The index of the content part containing the audio.
+    pub content_index: u32,
+    /// The text delta.
+    pub delta: String,
+    /// Optional per-token log probability data.
+    pub logprobs: Option<Vec<LogProb>>,
 }
 
 #[derive(Debug, Serialize, Deserialize, Clone)]
@@ -378,6 +405,9 @@ pub enum ServerEvent {
         ConversationItemInputAudioTranscriptionCompletedEvent,
     ),
 
+    #[serde(rename = "conversation.item.input_audio_transcription.delta")]
+    ConversationItemInputAudioTranscriptionDelta(ConversationItemInputAudioTranscriptionDeltaEvent),
+
     /// Returned when input audio transcription is configured, and a transcription request for a user message failed.
     #[serde(rename = "conversation.item.input_audio_transcription.failed")]
     ConversationItemInputAudioTranscriptionFailed(

diff --git a/async-openai/src/types/realtime/session_resource.rs b/async-openai/src/types/realtime/session_resource.rs
@@ -4,18 +4,25 @@ use serde::{Deserialize, Serialize};
 pub enum AudioFormat {
     #[serde(rename = "pcm16")]
     PCM16,
-    #[serde(rename = "g711-ulaw")]
+    #[serde(rename = "g711_ulaw")]
     G711ULAW,
-    #[serde(rename = "g711-alaw")]
+    #[serde(rename = "g711_alaw")]
     G711ALAW,
 }
 
-#[derive(Debug, Serialize, Deserialize, Clone)]
+#[derive(Debug, Default, Serialize, Deserialize, Clone)]
 pub struct AudioTranscription {
-    /// Whether to enable input audio transcription.
-    pub enabled: bool,
-    /// The model to use for transcription (e.g., "whisper-1").
-    pub model: String,
+    /// The language of the input audio. Supplying the input language in ISO-639-1 format (e.g. `en`) will improve accuracy and latency.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub language: Option<String>,
+    /// The model to use for transcription; current options are `gpt-4o-transcribe`, `gpt-4o-mini-transcribe`, and `whisper-1`.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub model: Option<String>,
+    /// Optional text to guide the model's style or continue a previous audio segment.
+    /// For `whisper-1`, the prompt is a list of keywords. For `gpt-4o-transcribe` models,
+    /// the prompt is a free-text string, for example "expect words related to technology".
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub prompt: Option<String>,
 }
 
 #[derive(Debug, Serialize, Deserialize, Clone)]
@@ -30,6 +37,32 @@ pub enum TurnDetection {
         prefix_padding_ms: u32,
         /// Duration of silence to detect speech stop (in milliseconds).
         silence_duration_ms: u32,
+
+        /// Whether or not to automatically generate a response when a VAD stop event occurs.
+        #[serde(skip_serializing_if = "Option::is_none")]
+        create_response: Option<bool>,
+
+        /// Whether or not to automatically interrupt any ongoing response with output to
+        /// the default conversation (i.e. conversation of auto) when a VAD start event occurs.
+        #[serde(skip_serializing_if = "Option::is_none")]
+        interrupt_response: Option<bool>,
+    },
+
+    #[serde(rename = "semantic_vad")]
+    SemanticVAD {
+        /// The eagerness of the model to respond.
+        /// `low` will wait longer for the user to continue speaking,
+        /// `high` will respond more quickly. `auto` is the default and is equivalent to `medium`.
+        eagerness: String,
+
+        /// Whether or not to automatically generate a response when a VAD stop event occurs.
+        #[serde(skip_serializing_if = "Option::is_none", default)]
+        create_response: Option<bool>,
+
+        /// Whether or not to automatically interrupt any ongoing response with output to
+        /// the default conversation (i.e. conversation of auto) when a VAD start event occurs.
+        #[serde(skip_serializing_if = "Option::is_none", default)]
+        interrupt_response: Option<bool>,
     },
 }
 
@@ -78,8 +111,15 @@ pub enum ToolChoice {
 #[serde(rename_all = "lowercase")]
 pub enum RealtimeVoice {
     Alloy,
-    Shimmer,
+    Ash,
+    Ballad,
+    Coral,
     Echo,
+    Fable,
+    Onyx,
+    Nova,
+    Shimmer,
+    Verse,
 }
 
 #[derive(Debug, Serialize, Deserialize, Clone, Default)]