Add realtime api

2025-12-03 02:58:20 +00:00 · 2024-10-09 07:36:44 +09:00
parent 78abf3ac83
commit 8d5daac3b4
12 changed files with 2420 additions and 0 deletions
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -32,3 +32,11 @@ version = "1"

 [dependencies.bytes]
 version = "1.7.1"
+
+[dependencies.tokio-tungstenite]
+version = "0.24.0"
+features = ["connect", "native-tls"]
+
+[dependencies.futures-util]
+version = "0.3.31"
+features = ["sink", "std"]
--- a/README.md
+++ b/README.md
@@ -94,6 +94,7 @@ Check out the [full API documentation](https://platform.openai.com/docs/api-refe
 - [x] [Function calling](https://platform.openai.com/docs/guides/gpt/function-calling)
 - [x] [Assistants](https://platform.openai.com/docs/assistants/overview)
 - [x] [Batch](https://platform.openai.com/docs/api-reference/batch)
+- [x] [Realtime](https://platform.openai.com/docs/api-reference/realtime)

 ## License
 This project is licensed under [MIT license](https://github.com/dongri/openai-api-rs/blob/main/LICENSE).
--- a/examples/realtime/.gitignore
+++ b/examples/realtime/.gitignore
@@ -0,0 +1 @@
+target
--- a/examples/realtime/Cargo.lock
+++ b/examples/realtime/Cargo.lock
--- a/examples/realtime/Cargo.toml
+++ b/examples/realtime/Cargo.toml
@@ -0,0 +1,13 @@
+[package]
+name = "realtime"
+version = "0.1.0"
+edition = "2021"
+
+[dependencies]
+# The library under test is the crate two directories above this example.
+# Pointing at it directly keeps the example buildable no matter what the
+# checkout directory is called.
+openai-api-rs = { path = "../.." }
+serde = { version = "1.0.210", features = ["derive"] }
+serde_json = "1.0.128"
+tokio = { version = "1.40.0", features = ["full"] }
+tokio-tungstenite = { version = "0.24.0", features = ["connect", "native-tls"] }
+futures-util = { version = "0.3.31", features = ["sink", "std"] }
+futures-channel = "0.3.31"
--- a/examples/realtime/src/main.rs
+++ b/examples/realtime/src/main.rs
@@ -0,0 +1,91 @@
+use std::process::exit;
+use std::env;
+
+use futures_util::{future, pin_mut, StreamExt};
+use openai_api_rs::realtime::api::RealtimeClient;
+use openai_api_rs::realtime::client_event::{ConversationItemCreate, ResponseCreate};
+use openai_api_rs::realtime::server_event::ServerEvent;
+use openai_api_rs::realtime::types::Item;
+use tokio::io::AsyncReadExt;
+use tokio_tungstenite::tungstenite::protocol::Message;
+
+/// Runs an interactive console session against the realtime API.
+///
+/// Text read from stdin is forwarded to the server as user messages, while
+/// transcript deltas and server errors are echoed to stderr. The function
+/// returns as soon as either direction finishes.
+///
+/// # Errors
+///
+/// Fails when `OPENAI_API_KEY` cannot be read or the WebSocket connection
+/// cannot be established.
+#[tokio::main]
+async fn main() -> Result<(), Box<dyn std::error::Error>> {
+    // `main` already returns a `Result`, so report setup failures through it
+    // instead of panicking.
+    let api_key = env::var("OPENAI_API_KEY")?;
+    let model = "gpt-4o-realtime-preview-2024-10-01".to_string();
+
+    let (stdin_tx, stdin_rx) = futures_channel::mpsc::unbounded();
+    tokio::spawn(read_stdin(stdin_tx));
+
+    let realtime_client = RealtimeClient::new(api_key, model);
+
+    let (write, read) = realtime_client.connect().await?;
+    println!("WebSocket handshake complete");
+
+    let stdin_to_ws = stdin_rx.map(Ok).forward(write);
+
+    let ws_to_stdout = read.for_each(|message| async move {
+        let text = match message {
+            Ok(Message::Text(text)) => text,
+            Ok(Message::Close(_)) => {
+                eprintln!("Close");
+                exit(0)
+            }
+            // Binary, ping and pong frames carry nothing to display.
+            Ok(_) => return,
+            Err(e) => {
+                eprintln!("WebSocket error: {e}");
+                return;
+            }
+        };
+        match serde_json::from_str::<ServerEvent>(&text) {
+            Ok(ServerEvent::ResponseOutputItemDone(_)) => {
+                eprintln!();
+            }
+            Ok(ServerEvent::ResponseAudioTranscriptDelta(event)) => {
+                // Print the delta untouched: trimming each fragment would drop
+                // any whitespace it carries between words.
+                eprint!("{}", event.delta);
+            }
+            Ok(ServerEvent::Error(e)) => {
+                eprint!("{e:?}");
+            }
+            Ok(_) => {}
+            // An event this client cannot decode should not end the session.
+            Err(e) => {
+                eprintln!("Skipping undecodable server event: {e}");
+            }
+        }
+    });
+
+    pin_mut!(stdin_to_ws, ws_to_stdout);
+    future::select(stdin_to_ws, ws_to_stdout).await;
+
+    Ok(())
+}
+
+/// Forwards text typed on stdin to the realtime session.
+///
+/// Each non-blank chunk read from stdin becomes a `conversation.item.create`
+/// user message followed by a `response.create` request, both pushed onto `tx`.
+/// The task ends on end-of-input, on a read error, or once the receiving half
+/// of `tx` has been dropped.
+async fn read_stdin(tx: futures_channel::mpsc::UnboundedSender<Message>) {
+    let mut stdin = tokio::io::stdin();
+    // One buffer serves every read; only the first `n` bytes are meaningful.
+    let mut buf = vec![0; 2048];
+    loop {
+        let n = match stdin.read(&mut buf).await {
+            Err(_) | Ok(0) => break,
+            Ok(n) => n,
+        };
+        let text = String::from_utf8_lossy(&buf[..n]);
+        let text = text.trim();
+        if text.is_empty() {
+            // A bare newline should not create an empty user message or
+            // trigger a response.
+            continue;
+        }
+        let item = Item::try_from(serde_json::json!({
+            "type": "message",
+            "role": "user",
+            "content": [
+                {
+                    "type": "input_text",
+                    "text": text
+                }
+            ]
+        }))
+        .expect("literal JSON matches the Item schema");
+        let event = ConversationItemCreate {
+            item,
+            ..Default::default()
+        };
+        // A send error means the receiver is gone, so stop quietly instead of
+        // panicking inside the spawned task.
+        if tx.unbounded_send(event.into()).is_err()
+            || tx.unbounded_send(ResponseCreate::default().into()).is_err()
+        {
+            break;
+        }
+    }
+}
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1 +1,2 @@
+pub mod realtime;
 pub mod v1;
--- a/src/realtime/api.rs
+++ b/src/realtime/api.rs
@@ -0,0 +1,54 @@
+use futures_util::stream::{SplitSink, SplitStream};
+use futures_util::StreamExt;
+use tokio::net::TcpStream;
+use tokio_tungstenite::{
+    connect_async,
+    tungstenite::{client::IntoClientRequest, protocol::Message},
+    MaybeTlsStream, WebSocketStream,
+};
+
+/// Endpoint used by [`RealtimeClient::new`] when the `WSS_URL` environment
+/// variable is not set.
+const WSS_URL: &str = "wss://api.openai.com/v1/realtime";
+
+/// Connection settings for a realtime WebSocket session.
+pub struct RealtimeClient {
+    /// Base WebSocket URL; `connect` appends `?model=<model>` to it.
+    pub wss_url: String,
+    /// Key sent as `Authorization: Bearer <api_key>` during the handshake.
+    pub api_key: String,
+    /// Model name passed in the `model` query parameter.
+    pub model: String,
+}
+
+impl RealtimeClient {
+    /// Creates a client for the endpoint named by the `WSS_URL` environment
+    /// variable, falling back to the built-in default when it is not set.
+    pub fn new(api_key: String, model: String) -> Self {
+        let endpoint = match std::env::var("WSS_URL") {
+            Ok(from_env) => from_env,
+            Err(_) => WSS_URL.to_owned(),
+        };
+        Self::new_with_endpoint(endpoint, api_key, model)
+    }
+
+    /// Creates a client that connects to an explicitly supplied endpoint.
+    pub fn new_with_endpoint(wss_url: String, api_key: String, model: String) -> Self {
+        Self {
+            wss_url,
+            api_key,
+            model,
+        }
+    }
+
+    /// Opens the WebSocket connection and returns its write and read halves.
+    ///
+    /// The handshake request targets `<wss_url>?model=<model>` and carries the
+    /// `Authorization` and `OpenAI-Beta: realtime=v1` headers.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error when the URL or a header value is invalid, or when the
+    /// connection attempt fails.
+    pub async fn connect(
+        &self,
+    ) -> Result<
+        (
+            SplitSink<WebSocketStream<MaybeTlsStream<TcpStream>>, Message>,
+            SplitStream<WebSocketStream<MaybeTlsStream<TcpStream>>>,
+        ),
+        Box<dyn std::error::Error>,
+    > {
+        let mut request =
+            format!("{}?model={}", self.wss_url, self.model).into_client_request()?;
+        let bearer = format!("Bearer {}", self.api_key);
+        let headers = request.headers_mut();
+        headers.insert("Authorization", bearer.parse()?);
+        headers.insert("OpenAI-Beta", "realtime=v1".parse()?);
+        let (ws_stream, _response) = connect_async(request).await?;
+        Ok(ws_stream.split())
+    }
+}
--- a/src/realtime/client_event.rs
+++ b/src/realtime/client_event.rs
@@ -0,0 +1,157 @@
+use serde::{Deserialize, Serialize};
+use tokio_tungstenite::tungstenite::Message;
+
+use crate::realtime::types::{Item, Session};
+
+/// Payload of the `session.update` client event
+/// (see [`ClientEvent::SessionUpdate`]).
+#[derive(Debug, Serialize, Deserialize, Clone, Default)]
+pub struct SessionUpdate {
+    /// Optional event id; left out of the JSON when `None`.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub event_id: Option<String>,
+    /// Session settings carried by the event.
+    pub session: Session,
+}
+
+/// Payload of the `input_audio_buffer.append` client event
+/// (see [`ClientEvent::InputAudioBufferAppend`]).
+#[derive(Debug, Serialize, Deserialize, Clone, Default)]
+pub struct InputAudioBufferAppend {
+    /// Optional event id; left out of the JSON when `None`.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub event_id: Option<String>,
+    /// Audio data as a string.
+    /// NOTE(review): presumably base64-encoded audio bytes — confirm against
+    /// the API reference.
+    pub audio: String,
+}
+
+/// Payload of the `input_audio_buffer.commit` client event
+/// (see [`ClientEvent::InputAudioBufferCommit`]).
+#[derive(Debug, Serialize, Deserialize, Clone, Default)]
+pub struct InputAudioBufferCommit {
+    /// Optional event id; left out of the JSON when `None`.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub event_id: Option<String>,
+}
+
+/// Payload of the `input_audio_buffer.clear` client event
+/// (see [`ClientEvent::InputAudioBufferClear`]).
+#[derive(Debug, Serialize, Deserialize, Clone, Default)]
+pub struct InputAudioBufferClear {
+    /// Optional event id; left out of the JSON when `None`.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub event_id: Option<String>,
+}
+
+/// Payload of the `conversation.item.create` client event
+/// (see [`ClientEvent::ConversationItemCreate`]).
+#[derive(Debug, Serialize, Deserialize, Clone, Default)]
+pub struct ConversationItemCreate {
+    /// Optional event id; left out of the JSON when `None`.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub event_id: Option<String>,
+    /// Optional id of an existing item; left out of the JSON when `None`.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub previous_item_id: Option<String>,
+    /// The item to add to the conversation.
+    pub item: Item,
+}
+
+/// Payload of the `conversation.item.truncate` client event
+/// (see [`ClientEvent::ConversationItemTruncate`]).
+#[derive(Debug, Serialize, Deserialize, Clone, Default)]
+pub struct ConversationItemTruncate {
+    /// Optional event id; left out of the JSON when `None`.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub event_id: Option<String>,
+    /// Id of the item being truncated.
+    pub item_id: String,
+    /// Index of the content part within the item.
+    pub content_index: u32,
+    /// Audio end position, in milliseconds as the field name states.
+    pub audio_end_ms: u32,
+}
+
+/// Payload of the `conversation.item.delete` client event
+/// (see [`ClientEvent::ConversationItemDelete`]).
+#[derive(Debug, Serialize, Deserialize, Clone, Default)]
+pub struct ConversationItemDelete {
+    /// Optional event id; left out of the JSON when `None`.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub event_id: Option<String>,
+    /// Id of the item to delete.
+    pub item_id: String,
+}
+
+/// Payload of the `response.create` client event
+/// (see [`ClientEvent::ResponseCreate`]).
+#[derive(Debug, Serialize, Deserialize, Clone, Default)]
+pub struct ResponseCreate {
+    /// Optional event id; left out of the JSON when `None`.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub event_id: Option<String>,
+    /// Optional per-response settings. Left out of the JSON when `None`, like
+    /// every other optional field in this module, so a default event does not
+    /// send `"response": null`.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub response: Option<Session>,
+}
+
+/// Payload of the `response.cancel` client event
+/// (see [`ClientEvent::ResponseCancel`]).
+#[derive(Debug, Serialize, Deserialize, Clone, Default)]
+pub struct ResponseCancel {
+    /// Optional event id; left out of the JSON when `None`.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub event_id: Option<String>,
+}
+
+/// Every event this client can send.
+///
+/// The enum is internally tagged: the variant's wire name is written to a
+/// `type` field alongside the fields of the wrapped payload struct.
+#[derive(Debug, Serialize, Deserialize)]
+#[serde(tag = "type")]
+pub enum ClientEvent {
+    /// `session.update`
+    #[serde(rename = "session.update")]
+    SessionUpdate(SessionUpdate),
+    /// `input_audio_buffer.append`
+    #[serde(rename = "input_audio_buffer.append")]
+    InputAudioBufferAppend(InputAudioBufferAppend),
+    /// `input_audio_buffer.commit`
+    #[serde(rename = "input_audio_buffer.commit")]
+    InputAudioBufferCommit(InputAudioBufferCommit),
+    /// `input_audio_buffer.clear`
+    #[serde(rename = "input_audio_buffer.clear")]
+    InputAudioBufferClear(InputAudioBufferClear),
+    /// `conversation.item.create`
+    #[serde(rename = "conversation.item.create")]
+    ConversationItemCreate(ConversationItemCreate),
+    /// `conversation.item.truncate`
+    #[serde(rename = "conversation.item.truncate")]
+    ConversationItemTruncate(ConversationItemTruncate),
+    /// `conversation.item.delete`
+    #[serde(rename = "conversation.item.delete")]
+    ConversationItemDelete(ConversationItemDelete),
+    /// `response.create`
+    #[serde(rename = "response.create")]
+    ResponseCreate(ResponseCreate),
+    /// `response.cancel`
+    #[serde(rename = "response.cancel")]
+    ResponseCancel(ResponseCancel),
+}
+
+impl From<ClientEvent> for Message {
+    /// Encodes the event as JSON and wraps the result in a text message.
+    fn from(event: ClientEvent) -> Self {
+        let json: String = (&event).into();
+        Message::Text(json)
+    }
+}
+
+impl From<&ClientEvent> for String {
+    /// Serializes the event to its JSON text.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `serde_json` reports a serialization error.
+    fn from(event: &ClientEvent) -> Self {
+        let encoded = serde_json::to_string(event);
+        encoded.unwrap()
+    }
+}
+
+impl From<ConversationItemCreate> for Message {
+    /// Wraps the payload in [`ClientEvent::ConversationItemCreate`] and
+    /// converts that event into a message.
+    fn from(event: ConversationItemCreate) -> Self {
+        ClientEvent::ConversationItemCreate(event).into()
+    }
+}
+
+impl From<InputAudioBufferAppend> for Message {
+    /// Wraps the payload in [`ClientEvent::InputAudioBufferAppend`] and
+    /// converts that event into a message.
+    fn from(event: InputAudioBufferAppend) -> Self {
+        ClientEvent::InputAudioBufferAppend(event).into()
+    }
+}
+
+impl From<InputAudioBufferCommit> for Message {
+    /// Wraps the payload in [`ClientEvent::InputAudioBufferCommit`] and
+    /// converts that event into a message.
+    fn from(event: InputAudioBufferCommit) -> Self {
+        ClientEvent::InputAudioBufferCommit(event).into()
+    }
+}
+
+impl From<InputAudioBufferClear> for Message {
+    /// Wraps the payload in [`ClientEvent::InputAudioBufferClear`] and
+    /// converts that event into a message.
+    fn from(event: InputAudioBufferClear) -> Self {
+        ClientEvent::InputAudioBufferClear(event).into()
+    }
+}
+
+impl From<SessionUpdate> for Message {
+    /// Wraps the payload in [`ClientEvent::SessionUpdate`] and converts that
+    /// event into a message.
+    fn from(event: SessionUpdate) -> Self {
+        ClientEvent::SessionUpdate(event).into()
+    }
+}
+
+impl From<ConversationItemTruncate> for Message {
+    /// Wraps the payload in [`ClientEvent::ConversationItemTruncate`] and
+    /// converts that event into a message.
+    fn from(event: ConversationItemTruncate) -> Self {
+        ClientEvent::ConversationItemTruncate(event).into()
+    }
+}
+
+impl From<ConversationItemDelete> for Message {
+    /// Wraps the payload in [`ClientEvent::ConversationItemDelete`] and
+    /// converts that event into a message.
+    fn from(event: ConversationItemDelete) -> Self {
+        ClientEvent::ConversationItemDelete(event).into()
+    }
+}
+
+impl From<ResponseCreate> for Message {
+    /// Wraps the payload in [`ClientEvent::ResponseCreate`] and converts that
+    /// event into a message.
+    fn from(event: ResponseCreate) -> Self {
+        ClientEvent::ResponseCreate(event).into()
+    }
+}
+
+impl From<ResponseCancel> for Message {
+    /// Wraps the payload in [`ClientEvent::ResponseCancel`] and converts that
+    /// event into a message.
+    fn from(event: ResponseCancel) -> Self {
+        ClientEvent::ResponseCancel(event).into()
+    }
+}
--- a/src/realtime/mod.rs
+++ b/src/realtime/mod.rs
@@ -0,0 +1,4 @@
+pub mod api;
+pub mod client_event;
+pub mod server_event;
+pub mod types;
--- a/src/realtime/server_event.rs
+++ b/src/realtime/server_event.rs
@@ -0,0 +1,288 @@
+use serde::{Deserialize, Serialize};
+
+use crate::realtime::types::{
+    APIError, ContentPart, Conversation, Item, RateLimit, Response, Session,
+};
+
+/// Payload of the `error` server event (see [`ServerEvent::Error`]).
+#[derive(Debug, Serialize, Deserialize, Clone)]
+pub struct Error {
+    /// Id of this server event.
+    pub event_id: String,
+    /// Details of the reported error.
+    pub error: APIError,
+}
+
+/// Payload of the `session.created` server event
+/// (see [`ServerEvent::SessionCreated`]).
+#[derive(Debug, Serialize, Deserialize, Clone)]
+pub struct SessionCreated {
+    /// Id of this server event.
+    pub event_id: String,
+    /// Session settings reported by the server.
+    pub session: Session,
+}
+
+/// Payload of the `session.updated` server event
+/// (see [`ServerEvent::SessionUpdated`]).
+#[derive(Debug, Serialize, Deserialize, Clone)]
+pub struct SessionUpdated {
+    /// Id of this server event.
+    pub event_id: String,
+    /// Session settings reported by the server.
+    pub session: Session,
+}
+
+/// Payload of the `conversation.created` server event
+/// (see [`ServerEvent::ConversationCreated`]).
+#[derive(Debug, Serialize, Deserialize, Clone)]
+pub struct ConversationCreated {
+    /// Id of this server event.
+    pub event_id: String,
+    /// The conversation described by the event.
+    pub conversation: Conversation,
+}
+
+/// Payload of the `input_audio_buffer.committed` server event
+/// (see [`ServerEvent::InputAudioBufferCommited`]).
+///
+/// NOTE(review): the type name is spelled `Commited` while the wire name is
+/// `committed`; renaming would change the public API.
+#[derive(Debug, Serialize, Deserialize, Clone)]
+pub struct InputAudioBufferCommited {
+    /// Id of this server event.
+    pub event_id: String,
+    /// NOTE(review): required here, whereas
+    /// `ConversationItemCreated::previous_item_id` is optional — confirm the
+    /// server never sends `null` for this field.
+    pub previous_item_id: String,
+    /// Id of the item the event refers to.
+    pub item_id: String,
+}
+
+/// Payload of the `input_audio_buffer.cleared` server event
+/// (see [`ServerEvent::InputAudioBufferCleared`]).
+#[derive(Debug, Serialize, Deserialize, Clone)]
+pub struct InputAudioBufferCleared {
+    /// Id of this server event.
+    pub event_id: String,
+}
+
+/// Payload of the `input_audio_buffer.speech_started` server event
+/// (see [`ServerEvent::InputAudioBufferSpeechStarted`]).
+#[derive(Debug, Serialize, Deserialize, Clone)]
+pub struct InputAudioBufferSpeechStarted {
+    /// Id of this server event.
+    pub event_id: String,
+    /// Audio start position, in milliseconds as the field name states.
+    pub audio_start_ms: u32,
+    /// Id of the item the event refers to.
+    pub item_id: String,
+}
+
+/// Payload of the `input_audio_buffer.speech_stopped` server event
+/// (see [`ServerEvent::InputAudioBufferSpeechStopped`]).
+#[derive(Debug, Serialize, Deserialize, Clone)]
+pub struct InputAudioBufferSpeechStopped {
+    /// Id of this server event.
+    pub event_id: String,
+    /// Audio end position, in milliseconds as the field name states.
+    pub audio_end_ms: u32,
+    /// Id of the item the event refers to.
+    pub item_id: String,
+}
+
+/// Payload of the `conversation.item.created` server event
+/// (see [`ServerEvent::ConversationItemCreated`]).
+#[derive(Debug, Serialize, Deserialize, Clone)]
+pub struct ConversationItemCreated {
+    /// Id of this server event.
+    pub event_id: String,
+    /// Id of another item, when the server supplies one.
+    pub previous_item_id: Option<String>,
+    /// The item that was created.
+    pub item: Item,
+}
+
+/// Payload of the `conversation.item.input_audio_transcription.completed`
+/// server event
+/// (see [`ServerEvent::ConversationItemInputAudioTranscriptionCompleted`]).
+#[derive(Debug, Serialize, Deserialize, Clone)]
+pub struct ConversationItemInputAudioTranscriptionCompleted {
+    /// Id of this server event.
+    pub event_id: String,
+    /// Id of the item the transcription belongs to.
+    pub item_id: String,
+    /// Index of the content part within the item.
+    pub content_index: u32,
+    /// The transcribed text.
+    pub transcript: String,
+}
+
+/// Payload of the `conversation.item.input_audio_transcription.failed`
+/// server event
+/// (see [`ServerEvent::ConversationItemInputAudioTranscriptionFailed`]).
+#[derive(Debug, Serialize, Deserialize, Clone)]
+pub struct ConversationItemInputAudioTranscriptionFailed {
+    /// Id of this server event.
+    pub event_id: String,
+    /// Id of the item the failure belongs to.
+    pub item_id: String,
+    /// Index of the content part within the item.
+    pub content_index: u32,
+    /// Details of the failure.
+    pub error: APIError,
+}
+
+/// Payload of the `conversation.item.truncated` server event
+/// (see [`ServerEvent::ConversationItemTruncated`]).
+#[derive(Debug, Serialize, Deserialize, Clone)]
+pub struct ConversationItemTruncated {
+    /// Id of this server event.
+    pub event_id: String,
+    /// Id of the truncated item.
+    pub item_id: String,
+    /// Index of the content part within the item.
+    pub content_index: u32,
+    /// Audio end position, in milliseconds as the field name states.
+    pub audio_end_ms: u32,
+}
+
+/// Payload of the `conversation.item.deleted` server event
+/// (see [`ServerEvent::ConversationItemDeleted`]).
+#[derive(Debug, Serialize, Deserialize, Clone)]
+pub struct ConversationItemDeleted {
+    /// Id of this server event.
+    pub event_id: String,
+    /// Id of the deleted item.
+    pub item_id: String,
+}
+
+/// Payload of the `response.created` server event
+/// (see [`ServerEvent::ResponseCreated`]).
+#[derive(Debug, Serialize, Deserialize, Clone)]
+pub struct ResponseCreated {
+    /// Id of this server event.
+    pub event_id: String,
+    /// The response described by the event.
+    pub response: Response,
+}
+
+/// Payload of the `response.done` server event
+/// (see [`ServerEvent::ResponseDone`]).
+#[derive(Debug, Serialize, Deserialize, Clone)]
+pub struct ResponseDone {
+    /// Id of this server event.
+    pub event_id: String,
+    /// The response described by the event.
+    pub response: Response,
+}
+
+/// Payload of the `response.output_item.added` server event
+/// (see [`ServerEvent::ResponseOutputItemAdded`]).
+#[derive(Debug, Serialize, Deserialize, Clone)]
+pub struct ResponseOutputItemAdded {
+    /// Id of this server event.
+    pub event_id: String,
+    /// Id of the response the item belongs to.
+    pub response_id: String,
+    /// Index of the item within the response output.
+    pub output_index: u32,
+    /// The output item.
+    pub item: Item,
+}
+
+/// Payload of the `response.output_item.done` server event
+/// (see [`ServerEvent::ResponseOutputItemDone`]).
+#[derive(Debug, Serialize, Deserialize, Clone)]
+pub struct ResponseOutputItemDone {
+    /// Id of this server event.
+    pub event_id: String,
+    /// Id of the response the item belongs to.
+    pub response_id: String,
+    /// Index of the item within the response output.
+    pub output_index: u32,
+    /// The output item.
+    pub item: Item,
+}
+
+/// Payload of the `response.content_part.added` server event
+/// (see [`ServerEvent::ResponseContentPartAdded`]).
+#[derive(Debug, Serialize, Deserialize, Clone)]
+pub struct ResponseContentPartAdded {
+    /// Id of this server event.
+    pub event_id: String,
+    /// Id of the response the part belongs to.
+    pub response_id: String,
+    /// Id of the item the part belongs to.
+    pub item_id: String,
+    /// Index of the item within the response output.
+    pub output_index: u32,
+    /// Index of the content part within the item.
+    pub content_index: u32,
+    /// The content part.
+    pub part: ContentPart,
+}
+
+/// Payload of the `response.content_part.done` server event
+/// (see [`ServerEvent::ResponseContentPartDone`]).
+#[derive(Debug, Serialize, Deserialize, Clone)]
+pub struct ResponseContentPartDone {
+    /// Id of this server event.
+    pub event_id: String,
+    /// Id of the response the part belongs to.
+    pub response_id: String,
+    /// Id of the item the part belongs to.
+    pub item_id: String,
+    /// Index of the item within the response output.
+    pub output_index: u32,
+    /// Index of the content part within the item.
+    pub content_index: u32,
+    /// The content part.
+    pub part: ContentPart,
+}
+
+/// Payload of the `response.text.delta` server event
+/// (see [`ServerEvent::ResponseTextDelta`]).
+#[derive(Debug, Serialize, Deserialize, Clone)]
+pub struct ResponseTextDelta {
+    /// Id of this server event.
+    pub event_id: String,
+    /// Id of the response the delta belongs to.
+    pub response_id: String,
+    /// Id of the item the delta belongs to.
+    pub item_id: String,
+    /// Index of the item within the response output.
+    pub output_index: u32,
+    /// Index of the content part within the item.
+    pub content_index: u32,
+    /// The text fragment carried by this event.
+    pub delta: String,
+}
+
+/// Payload of the `response.text.done` server event
+/// (see [`ServerEvent::ResponseTextDone`]).
+#[derive(Debug, Serialize, Deserialize, Clone)]
+pub struct ResponseTextDone {
+    /// Id of this server event.
+    pub event_id: String,
+    /// Id of the response the text belongs to.
+    pub response_id: String,
+    /// Id of the item the text belongs to.
+    pub item_id: String,
+    /// Index of the item within the response output.
+    pub output_index: u32,
+    /// Index of the content part within the item.
+    pub content_index: u32,
+    /// The text carried by this event.
+    pub text: String,
+}
+
+/// Payload of the `response.audio_transcript.delta` server event
+/// (see [`ServerEvent::ResponseAudioTranscriptDelta`]).
+#[derive(Debug, Serialize, Deserialize, Clone)]
+pub struct ResponseAudioTranscriptDelta {
+    /// Id of this server event.
+    pub event_id: String,
+    /// Id of the response the delta belongs to.
+    pub response_id: String,
+    /// Id of the item the delta belongs to.
+    pub item_id: String,
+    /// Index of the item within the response output.
+    pub output_index: u32,
+    /// Index of the content part within the item.
+    pub content_index: u32,
+    /// The transcript fragment carried by this event.
+    pub delta: String,
+}
+
+/// Payload of the `response.audio_transcript.done` server event
+/// (see [`ServerEvent::ResponseAudioTranscriptDone`]).
+#[derive(Debug, Serialize, Deserialize, Clone)]
+pub struct ResponseAudioTranscriptDone {
+    /// Id of this server event.
+    pub event_id: String,
+    /// Id of the response the transcript belongs to.
+    pub response_id: String,
+    /// Id of the item the transcript belongs to.
+    pub item_id: String,
+    /// Index of the item within the response output.
+    pub output_index: u32,
+    /// Index of the content part within the item.
+    pub content_index: u32,
+    /// The transcript carried by this event.
+    pub transcript: String,
+}
+
+/// Payload of the `response.audio.delta` server event
+/// (see [`ServerEvent::ResponseAudioDelta`]).
+#[derive(Debug, Serialize, Deserialize, Clone)]
+pub struct ResponseAudioDelta {
+    /// Id of this server event.
+    pub event_id: String,
+    /// Id of the response the delta belongs to.
+    pub response_id: String,
+    /// Id of the item the delta belongs to.
+    pub item_id: String,
+    /// Index of the item within the response output.
+    pub output_index: u32,
+    /// Index of the content part within the item.
+    pub content_index: u32,
+    /// The audio fragment carried by this event, as a string.
+    /// NOTE(review): presumably base64-encoded audio — confirm against the
+    /// API reference.
+    pub delta: String,
+}
+
+/// Payload of the `response.audio.done` server event
+/// (see [`ServerEvent::ResponseAudioDone`]).
+#[derive(Debug, Serialize, Deserialize, Clone)]
+pub struct ResponseAudioDone {
+    /// Id of this server event.
+    pub event_id: String,
+    /// Id of the response the audio belongs to.
+    pub response_id: String,
+    /// Id of the item the audio belongs to.
+    pub item_id: String,
+    /// Index of the item within the response output.
+    pub output_index: u32,
+    /// Index of the content part within the item.
+    pub content_index: u32,
+}
+
+/// Payload of the `response.function_call_arguments.delta` server event
+/// (see [`ServerEvent::ResponseFunctionCallArgumentsDelta`]).
+#[derive(Debug, Serialize, Deserialize, Clone)]
+pub struct ResponseFunctionCallArgumentsDelta {
+    /// Id of this server event.
+    pub event_id: String,
+    /// Id of the response the delta belongs to.
+    pub response_id: String,
+    /// Id of the item the delta belongs to.
+    pub item_id: String,
+    /// Index of the item within the response output.
+    pub output_index: u32,
+    /// Id of the function call.
+    pub call_id: String,
+    /// The arguments fragment carried by this event.
+    pub delta: String,
+}
+
+/// Payload of the `response.function_call_arguments.done` server event
+/// (see [`ServerEvent::ResponseFunctionCallArgumentsDone`]).
+#[derive(Debug, Serialize, Deserialize, Clone)]
+pub struct ResponseFunctionCallArgumentsDone {
+    /// Id of this server event.
+    pub event_id: String,
+    /// Id of the response the arguments belong to.
+    pub response_id: String,
+    /// Id of the item the arguments belong to.
+    pub item_id: String,
+    /// Index of the item within the response output.
+    pub output_index: u32,
+    /// Id of the function call.
+    pub call_id: String,
+    /// The function-call arguments, kept as a string.
+    pub arguments: String,
+}
+
+/// Payload of the `rate_limits.updated` server event
+/// (see [`ServerEvent::RateLimitsUpdated`]).
+#[derive(Debug, Serialize, Deserialize, Clone)]
+pub struct RateLimitsUpdated {
+    /// Id of this server event.
+    pub event_id: String,
+    /// The rate-limit entries reported by the server.
+    pub rate_limits: Vec<RateLimit>,
+}
+
+/// Every server event this crate can decode.
+///
+/// The enum is internally tagged: the `type` field of the incoming JSON picks
+/// the variant, and the remaining fields fill the wrapped payload struct.
+/// No fallback variant is declared, so a `type` value that is not listed here
+/// is expected to fail deserialization.
+#[derive(Debug, Serialize, Deserialize, Clone)]
+#[serde(tag = "type")]
+pub enum ServerEvent {
+    /// `error`
+    #[serde(rename = "error")]
+    Error(Error),
+    /// `session.created`
+    #[serde(rename = "session.created")]
+    SessionCreated(SessionCreated),
+    /// `session.updated`
+    #[serde(rename = "session.updated")]
+    SessionUpdated(SessionUpdated),
+    /// `conversation.created`
+    #[serde(rename = "conversation.created")]
+    ConversationCreated(ConversationCreated),
+    /// `input_audio_buffer.committed`
+    #[serde(rename = "input_audio_buffer.committed")]
+    InputAudioBufferCommited(InputAudioBufferCommited),
+    /// `input_audio_buffer.cleared`
+    #[serde(rename = "input_audio_buffer.cleared")]
+    InputAudioBufferCleared(InputAudioBufferCleared),
+    /// `input_audio_buffer.speech_started`
+    #[serde(rename = "input_audio_buffer.speech_started")]
+    InputAudioBufferSpeechStarted(InputAudioBufferSpeechStarted),
+    /// `input_audio_buffer.speech_stopped`
+    #[serde(rename = "input_audio_buffer.speech_stopped")]
+    InputAudioBufferSpeechStopped(InputAudioBufferSpeechStopped),
+    /// `conversation.item.created`
+    #[serde(rename = "conversation.item.created")]
+    ConversationItemCreated(ConversationItemCreated),
+    /// `conversation.item.input_audio_transcription.completed`
+    #[serde(rename = "conversation.item.input_audio_transcription.completed")]
+    ConversationItemInputAudioTranscriptionCompleted(
+        ConversationItemInputAudioTranscriptionCompleted,
+    ),
+    /// `conversation.item.input_audio_transcription.failed`
+    #[serde(rename = "conversation.item.input_audio_transcription.failed")]
+    ConversationItemInputAudioTranscriptionFailed(ConversationItemInputAudioTranscriptionFailed),
+    /// `conversation.item.truncated`
+    #[serde(rename = "conversation.item.truncated")]
+    ConversationItemTruncated(ConversationItemTruncated),
+    /// `conversation.item.deleted`
+    #[serde(rename = "conversation.item.deleted")]
+    ConversationItemDeleted(ConversationItemDeleted),
+    /// `response.created`
+    #[serde(rename = "response.created")]
+    ResponseCreated(ResponseCreated),
+    /// `response.done`
+    #[serde(rename = "response.done")]
+    ResponseDone(ResponseDone),
+    /// `response.output_item.added`
+    #[serde(rename = "response.output_item.added")]
+    ResponseOutputItemAdded(ResponseOutputItemAdded),
+    /// `response.output_item.done`
+    #[serde(rename = "response.output_item.done")]
+    ResponseOutputItemDone(ResponseOutputItemDone),
+    /// `response.content_part.added`
+    #[serde(rename = "response.content_part.added")]
+    ResponseContentPartAdded(ResponseContentPartAdded),
+    /// `response.content_part.done`
+    #[serde(rename = "response.content_part.done")]
+    ResponseContentPartDone(ResponseContentPartDone),
+    /// `response.text.delta`
+    #[serde(rename = "response.text.delta")]
+    ResponseTextDelta(ResponseTextDelta),
+    /// `response.text.done`
+    #[serde(rename = "response.text.done")]
+    ResponseTextDone(ResponseTextDone),
+    /// `response.audio_transcript.delta`
+    #[serde(rename = "response.audio_transcript.delta")]
+    ResponseAudioTranscriptDelta(ResponseAudioTranscriptDelta),
+    /// `response.audio_transcript.done`
+    #[serde(rename = "response.audio_transcript.done")]
+    ResponseAudioTranscriptDone(ResponseAudioTranscriptDone),
+    /// `response.audio.delta`
+    #[serde(rename = "response.audio.delta")]
+    ResponseAudioDelta(ResponseAudioDelta),
+    /// `response.audio.done`
+    #[serde(rename = "response.audio.done")]
+    ResponseAudioDone(ResponseAudioDone),
+    /// `response.function_call_arguments.delta`
+    #[serde(rename = "response.function_call_arguments.delta")]
+    ResponseFunctionCallArgumentsDelta(ResponseFunctionCallArgumentsDelta),
+    /// `response.function_call_arguments.done`
+    #[serde(rename = "response.function_call_arguments.done")]
+    ResponseFunctionCallArgumentsDone(ResponseFunctionCallArgumentsDone),
+    /// `rate_limits.updated`
+    #[serde(rename = "rate_limits.updated")]
+    RateLimitsUpdated(RateLimitsUpdated),
+}
--- a/src/realtime/types.rs
+++ b/src/realtime/types.rs
@@ -0,0 +1,259 @@
+use serde::{Deserialize, Serialize};
+
+/// Session settings.
+///
+/// Used as the payload of `session.update`, inside the `session.created` and
+/// `session.updated` server events, and as the optional `response` field of
+/// `response.create`. Every field is optional and is left out of the
+/// serialized JSON when `None`.
+#[derive(Debug, Serialize, Deserialize, Clone, Default)]
+pub struct Session {
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub modalities: Option<Vec<String>>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub instructions: Option<String>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub voice: Option<RealtimeVoice>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub input_audio_format: Option<AudioFormat>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub output_audio_format: Option<AudioFormat>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub input_audio_transcription: Option<AudioTranscription>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub turn_detection: Option<TurnDetection>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub tools: Option<Vec<ToolDefinition>>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub tool_choice: Option<ToolChoice>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub temperature: Option<f32>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub max_output_tokens: Option<MaxOutputTokens>,
+}
+
+/// Voice names accepted for [`Session::voice`].
+///
+/// Serialized in lowercase: `alloy`, `shimmer`, `echo`.
+#[derive(Debug, Serialize, Deserialize, Clone)]
+#[serde(rename_all = "lowercase")]
+pub enum RealtimeVoice {
+    Alloy,
+    Shimmer,
+    Echo,
+}
+
+/// Audio encodings for [`Session::input_audio_format`] and
+/// [`Session::output_audio_format`].
+///
+/// The G.711 variants serialize with the underscore spelling documented by the
+/// API (`g711_ulaw`, `g711_alaw`). The hyphenated spelling used previously is
+/// still accepted when deserializing.
+#[derive(Debug, Serialize, Deserialize, Clone)]
+pub enum AudioFormat {
+    /// `pcm16`
+    #[serde(rename = "pcm16")]
+    PCM16,
+    /// `g711_ulaw`
+    #[serde(rename = "g711_ulaw", alias = "g711-ulaw")]
+    G711ULAW,
+    /// `g711_alaw`
+    #[serde(rename = "g711_alaw", alias = "g711-alaw")]
+    G711ALAW,
+}
+
+/// Settings for [`Session::input_audio_transcription`].
+///
+/// Both fields are required when deserializing.
+#[derive(Debug, Serialize, Deserialize, Clone)]
+pub struct AudioTranscription {
+    pub enabled: bool,
+    /// Name of the transcription model.
+    pub model: String,
+}
+
+/// Settings for [`Session::turn_detection`], selected by the `type` field.
+#[derive(Debug, Serialize, Deserialize, Clone)]
+#[serde(tag = "type")]
+pub enum TurnDetection {
+    /// `type = "server_vad"`.
+    #[serde(rename = "server_vad")]
+    ServerVAD {
+        threshold: f32,
+        /// Padding in milliseconds, as the field name states.
+        prefix_padding_ms: u32,
+        /// Silence duration in milliseconds, as the field name states.
+        silence_duration_ms: u32,
+    },
+}
+
+/// A tool entry for [`Session::tools`], selected by the `type` field.
+#[derive(Debug, Serialize, Deserialize, Clone)]
+#[serde(tag = "type")]
+pub enum ToolDefinition {
+    /// `type = "function"`.
+    #[serde(rename = "function")]
+    Function {
+        name: String,
+        description: String,
+        /// Arbitrary JSON describing the parameters.
+        /// NOTE(review): presumably a JSON Schema object — confirm against the
+        /// API reference.
+        parameters: serde_json::Value,
+    },
+}
+
+/// Value for [`Session::tool_choice`].
+///
+/// The unit variants use their lowercase names (`auto`, `none`, `required`).
+/// `Function` is marked `untagged`, so it is represented by its own fields
+/// (`type`, `name`) without a variant wrapper.
+#[derive(Debug, Serialize, Deserialize, Clone)]
+#[serde(rename_all = "lowercase")]
+pub enum ToolChoice {
+    Auto,
+    None,
+    Required,
+    #[serde(untagged)]
+    Function {
+        r#type: FunctionType,
+        name: String,
+    },
+}
+
+/// The single allowed value (`function`) of the `type` field in
+/// [`ToolChoice::Function`].
+#[derive(Debug, Serialize, Deserialize, Clone)]
+#[serde(rename_all = "lowercase")]
+pub enum FunctionType {
+    Function,
+}
+
+/// Value for [`Session::max_output_tokens`]: either a number or the literal
+/// string `"inf"`.
+///
+/// `Inf` is a named unit variant, so it is written as the string `"inf"`.
+/// `Num` is marked `untagged` at the variant level and is written as a bare
+/// number. An enum-level `untagged` would instead turn `Inf` into `null` and
+/// ignore its `rename`.
+#[derive(Debug, Serialize, Deserialize, Clone)]
+pub enum MaxOutputTokens {
+    /// `"inf"`
+    #[serde(rename = "inf")]
+    Inf,
+    /// A numeric limit. Must stay last: serde requires untagged variants to
+    /// follow the tagged ones.
+    #[serde(untagged)]
+    Num(u16),
+}
+
+/// Kind of an [`Item`], serialized in snake_case
+/// (`message`, `function_call`, `function_call_output`).
+#[derive(Debug, Serialize, Deserialize, Clone)]
+#[serde(rename_all = "snake_case")]
+pub enum ItemType {
+    Message,
+    FunctionCall,
+    FunctionCallOutput,
+}
+
+/// Status of an [`Item`], serialized in snake_case
+/// (`completed`, `in_progress`, `incomplete`).
+#[derive(Debug, Serialize, Deserialize, Clone)]
+#[serde(rename_all = "snake_case")]
+pub enum ItemStatus {
+    Completed,
+    InProgress,
+    Incomplete,
+}
+
+/// Role of an [`Item`], serialized in lowercase
+/// (`user`, `assistant`, `system`).
+#[derive(Debug, Serialize, Deserialize, Clone)]
+#[serde(rename_all = "lowercase")]
+pub enum ItemRole {
+    User,
+    Assistant,
+    System,
+}
+
+/// Kind of an [`ItemContent`] entry, serialized in snake_case
+/// (`input_text`, `input_audio`, `text`, `audio`).
+#[derive(Debug, Serialize, Deserialize, Clone)]
+#[serde(rename_all = "snake_case")]
+pub enum ItemContentType {
+    InputText,
+    InputAudio,
+    Text,
+    Audio,
+}
+
+/// One entry of [`Item::content`].
+///
+/// `type` is always present; the remaining fields are optional and left out of
+/// the serialized JSON when `None`.
+#[derive(Debug, Serialize, Deserialize, Clone)]
+pub struct ItemContent {
+    pub r#type: ItemContentType,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub text: Option<String>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub audio: Option<String>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub transcript: Option<String>,
+}
+
+/// A conversation item.
+///
+/// Every field is optional and left out of the serialized JSON when `None`;
+/// the derived `Default` yields an item with all fields unset. An item can
+/// also be built from a `serde_json::Value` through `TryFrom`.
+#[derive(Debug, Serialize, Deserialize, Clone, Default)]
+pub struct Item {
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub id: Option<String>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub r#type: Option<ItemType>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub status: Option<ItemStatus>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub role: Option<ItemRole>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub content: Option<Vec<ItemContent>>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub call_id: Option<String>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub name: Option<String>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub arguments: Option<String>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub output: Option<String>,
+}
+
+impl TryFrom<serde_json::Value> for Item {
+    type Error = serde_json::Error;
+
+    /// Decodes an item from an already-parsed JSON value.
+    ///
+    /// # Errors
+    ///
+    /// Returns the `serde_json` error when the value does not match the
+    /// [`Item`] schema.
+    fn try_from(json: serde_json::Value) -> Result<Self, Self::Error> {
+        // `serde_json::Value` is itself a deserializer, so the derived
+        // `Deserialize` impl can consume it directly.
+        Self::deserialize(json)
+    }
+}
+
+/// Error details, used by the `error` server event and by
+/// `ConversationItemInputAudioTranscriptionFailed`.
+#[derive(Debug, Serialize, Deserialize, Clone)]
+pub struct APIError {
+    pub r#type: String,
+    pub code: Option<String>,
+    pub message: String,
+    pub param: Option<String>,
+    pub event_id: Option<String>,
+}
+
+/// Conversation descriptor carried by the `conversation.created` server event.
+#[derive(Debug, Serialize, Deserialize, Clone)]
+pub struct Conversation {
+    pub id: String,
+    pub object: String,
+}
+
+/// Response object carried by the `response.created` and `response.done`
+/// server events.
+#[derive(Debug, Serialize, Deserialize, Clone)]
+pub struct Response {
+    pub id: String,
+    pub object: String,
+    pub status: ResponseStatus,
+    /// Extra detail about the status, when present.
+    pub status_details: Option<ResponseStatusDetail>,
+    /// Items produced by the response.
+    pub output: Vec<Item>,
+    /// Token usage, when present.
+    pub usage: Option<Usage>,
+}
+
+/// Token counts attached to a [`Response`].
+#[derive(Debug, Serialize, Deserialize, Clone)]
+pub struct Usage {
+    pub total_tokens: u32,
+    pub input_tokens: u32,
+    pub output_tokens: u32,
+}
+
+/// Status of a [`Response`], serialized in snake_case
+/// (`in_progress`, `completed`, `cancelled`, `failed`, `incomplete`).
+#[derive(Debug, Serialize, Deserialize, Clone)]
+#[serde(rename_all = "snake_case")]
+pub enum ResponseStatus {
+    InProgress,
+    Completed,
+    Cancelled,
+    Failed,
+    Incomplete,
+}
+
+/// Extra detail for [`Response::status_details`], selected by the `type`
+/// field.
+///
+/// NOTE(review): only `incomplete` and `failed` are declared, while
+/// [`ResponseStatus`] has more states — confirm the server never sends
+/// details with another `type`.
+#[derive(Debug, Serialize, Deserialize, Clone)]
+#[serde(tag = "type")]
+pub enum ResponseStatusDetail {
+    /// `type = "incomplete"`.
+    #[serde(rename = "incomplete")]
+    Incomplete { reason: IncompleteReason },
+    /// `type = "failed"`.
+    #[serde(rename = "failed")]
+    Failed { error: Option<FailedError> },
+}
+
+/// Error attached to [`ResponseStatusDetail::Failed`].
+#[derive(Debug, Serialize, Deserialize, Clone)]
+pub struct FailedError {
+    pub code: String,
+    pub message: String,
+}
+
+/// Reason attached to [`ResponseStatusDetail::Incomplete`], serialized in
+/// snake_case (`interruption`, `max_output_tokens`, `content_filter`).
+#[derive(Debug, Serialize, Deserialize, Clone)]
+#[serde(rename_all = "snake_case")]
+pub enum IncompleteReason {
+    Interruption,
+    MaxOutputTokens,
+    ContentFilter,
+}
+
+/// Content part carried by the `response.content_part.added` and
+/// `response.content_part.done` server events, selected by the `type` field.
+#[derive(Debug, Serialize, Deserialize, Clone)]
+#[serde(tag = "type")]
+pub enum ContentPart {
+    /// `type = "text"`.
+    #[serde(rename = "text")]
+    Text { text: String },
+    /// `type = "audio"`; `audio` is optional, `transcript` is required.
+    #[serde(rename = "audio")]
+    Audio {
+        audio: Option<String>,
+        transcript: String,
+    },
+}
+
+/// One entry of the `rate_limits.updated` server event.
+#[derive(Debug, Serialize, Deserialize, Clone)]
+pub struct RateLimit {
+    pub name: String,
+    pub limit: u32,
+    pub remaining: u32,
+    /// Time until reset, in seconds as the field name states.
+    pub reset_seconds: f32,
+}