Skip to content

feat: enhance realtime response types and audio transcription options #391

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Jun 29, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions async-openai/src/types/realtime/response_resource.rs
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,8 @@ pub enum ResponseStatusDetail {
Incomplete { reason: IncompleteReason },
#[serde(rename = "failed")]
Failed { error: Option<FailedError> },
#[serde(rename = "cancelled")]
Cancelled { reason: String },
}

#[derive(Debug, Serialize, Deserialize, Clone)]
Expand Down
30 changes: 30 additions & 0 deletions async-openai/src/types/realtime/server_event.rs
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,17 @@ pub struct ConversationItemCreatedEvent {
pub item: Item,
}

/// Log probability information for a single transcribed token.
#[derive(Clone, Debug, Deserialize, Serialize)]
pub struct LogProb {
    /// The token's raw UTF-8 bytes.
    pub bytes: Vec<u8>,
    /// Log probability assigned to the token.
    pub logprob: f64,
    /// The token rendered as a string.
    pub token: String,
}

#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct ConversationItemInputAudioTranscriptionCompletedEvent {
/// The unique ID of the server event.
Expand All @@ -93,6 +104,22 @@ pub struct ConversationItemInputAudioTranscriptionCompletedEvent {
pub content_index: u32,
/// The transcribed text.
pub transcript: String,
/// Optional per-token log probability data.
pub logprobs: Option<Vec<LogProb>>,
}

/// Payload of an input-audio transcription delta event: one piece of
/// transcribed text (`delta`) for a content part of a user message item.
#[derive(Clone, Debug, Deserialize, Serialize)]
pub struct ConversationItemInputAudioTranscriptionDeltaEvent {
    /// Unique ID of this server event.
    pub event_id: String,
    /// ID of the user message item the transcription belongs to.
    pub item_id: String,
    /// Index of the content part that contains the audio.
    pub content_index: u32,
    /// The text delta.
    pub delta: String,
    /// Per-token log probability data, when present.
    pub logprobs: Option<Vec<LogProb>>,
}

#[derive(Debug, Serialize, Deserialize, Clone)]
Expand Down Expand Up @@ -378,6 +405,9 @@ pub enum ServerEvent {
ConversationItemInputAudioTranscriptionCompletedEvent,
),

#[serde(rename = "conversation.item.input_audio_transcription.delta")]
ConversationItemInputAudioTranscriptionDelta(ConversationItemInputAudioTranscriptionDeltaEvent),

/// Returned when input audio transcription is configured, and a transcription request for a user message failed.
#[serde(rename = "conversation.item.input_audio_transcription.failed")]
ConversationItemInputAudioTranscriptionFailed(
Expand Down
56 changes: 48 additions & 8 deletions async-openai/src/types/realtime/session_resource.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,18 +4,25 @@ use serde::{Deserialize, Serialize};
/// Audio encodings accepted for realtime input/output audio.
///
/// Each variant carries exactly one `rename`; serde rejects a variant with
/// two `rename` attributes, so the old hyphenated names are not kept
/// alongside the underscore names.
pub enum AudioFormat {
    /// 16-bit PCM audio.
    #[serde(rename = "pcm16")]
    PCM16,
    /// G.711 µ-law audio.
    ///
    /// The wire name is `g711_ulaw`. The earlier misspelling `g711_law` is
    /// still accepted on input so previously stored payloads keep loading.
    #[serde(rename = "g711_ulaw", alias = "g711_law")]
    G711ULAW,
    /// G.711 A-law audio.
    #[serde(rename = "g711_alaw")]
    G711ALAW,
}

/// Options for input audio transcription.
///
/// Every field is optional and is left out of the serialized form when it is
/// `None`; `Default` yields a value with all fields unset.
#[derive(Debug, Default, Serialize, Deserialize, Clone)]
pub struct AudioTranscription {
    /// The language of the input audio. Supplying the input language in
    /// ISO-639-1 (e.g. `en`) format will improve accuracy and latency.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub language: Option<String>,
    /// The model to use for transcription; current options are
    /// `gpt-4o-transcribe`, `gpt-4o-mini-transcribe`, and `whisper-1`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub model: Option<String>,
    /// An optional text to guide the model's style or continue a previous audio segment.
    /// For `whisper-1`, the prompt is a list of keywords. For `gpt-4o-transcribe` models,
    /// the prompt is a free text string, for example "expect words related to technology".
    #[serde(skip_serializing_if = "Option::is_none")]
    pub prompt: Option<String>,
}

#[derive(Debug, Serialize, Deserialize, Clone)]
Expand All @@ -30,6 +37,32 @@ pub enum TurnDetection {
prefix_padding_ms: u32,
/// Duration of silence to detect speech stop (in milliseconds).
silence_duration_ms: u32,

/// Whether or not to automatically generate a response when a VAD stop event occurs.
#[serde(skip_serializing_if = "Option::is_none")]
create_response: Option<bool>,

/// Whether or not to automatically interrupt any ongoing response with output to
/// the default conversation (i.e. conversation of auto) when a VAD start event occurs.
#[serde(skip_serializing_if = "Option::is_none")]
interrupt_response: Option<bool>,
},

/// Semantic voice-activity-detection variant, tagged `semantic_vad` on the wire.
#[serde(rename = "semantic_vad")]
SemanticVAD {
/// The eagerness of the model to respond.
/// `low` will wait longer for the user to continue speaking,
/// `high` will respond more quickly. `auto` is the default and is equivalent to `medium`.
eagerness: String,

/// Whether or not to automatically generate a response when a VAD stop event occurs.
/// Omitted from the serialized form when `None`.
#[serde(skip_serializing_if = "Option::is_none", default)]
create_response: Option<bool>,

/// Whether or not to automatically interrupt any ongoing response with output to
/// the default conversation (i.e. conversation of auto) when a VAD start event occurs.
/// Omitted from the serialized form when `None`.
#[serde(skip_serializing_if = "Option::is_none", default)]
interrupt_response: Option<bool>,
},
}

Expand Down Expand Up @@ -78,8 +111,15 @@ pub enum ToolChoice {
#[serde(rename_all = "lowercase")]
pub enum RealtimeVoice {
Alloy,
Shimmer,
Ash,
Ballad,
Coral,
Echo,
Fable,
Onyx,
Nova,
Shimmer,
Verse,
}

#[derive(Debug, Serialize, Deserialize, Clone, Default)]
Expand Down