Initial image support

Fix image structure for OpenAI API

Implement base64 image support for OpenAI

Image Support: Some APIs (Gemini) require mime type for URL and Base64 format

Image Support: Update OpenAI and Anthropic API to support new image structure

Image Support: Add Gemini 2.0 Flash Experimental support and implement Image support

Image Support: Create example with Image support

Image Support: Fix rebase issue

Image Support: Fix example and make it runnable from cargo
Adam Strojek
2024-12-09 13:14:25 +01:00
committed by Jeremy Chone
parent 974489e69a
commit 59f0b149b0
7 changed files with 221 additions and 47 deletions

Cargo.toml

@@ -9,6 +9,10 @@ keywords = ["generative-ai","openai","chatgpt","gemini","ollama"]
 homepage = "https://github.com/jeremychone/rust-genai"
 repository = "https://github.com/jeremychone/rust-genai"
 
+[[example]]
+name = "images"
+path = "examples/c07-image.rs"
+
 [lints.rust]
 unsafe_code = "forbid"
 # unused = { level = "allow", priority = -1 } # For exploratory dev.

examples/c07-image.rs (new file, 35 lines)

@@ -0,0 +1,35 @@
//! This example demonstrates how to attach an image to a conversation.

use genai::chat::printer::print_chat_stream;
use genai::chat::{ChatMessage, ChatRequest, ContentPart, ImageSource};
use genai::Client;

const MODEL: &str = "gpt-4o-mini";

#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    let client = Client::default();

    let question = "What is in this picture?";

    let mut chat_req = ChatRequest::default().with_system("Answer in one sentence");
    // Append a user message made of multiple content parts: the text question and the image URL.
    chat_req = chat_req.append_message(ChatMessage::user(vec![
        ContentPart::Text(question.to_string()),
        ContentPart::Image {
            content: "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg".to_string(),
            content_type: "image/jpeg".to_string(), // the URL above points to a JPEG
            source: ImageSource::Url,
        },
    ]));

    println!("\n--- Question:\n{question}");
    let chat_res = client.exec_chat_stream(MODEL, chat_req.clone(), None).await?;

    println!("\n--- Answer: (streaming)");
    let _assistant_answer = print_chat_stream(chat_res, None).await?;

    Ok(())
}
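
With the [[example]] entry added to Cargo.toml above, the example can be run with `cargo run --example images`. For providers that do not accept remote image URLs (see the Anthropic adapter below), the same ContentPart::Image can carry Base64 data instead. A minimal sketch, assuming the `base64` crate is added as a dependency (it is not part of this commit); the helper name and file path are hypothetical:

use base64::Engine;
use genai::chat::{ContentPart, ImageSource};

// Hypothetical helper: read a local file and wrap it as a Base64 image part,
// since the library deliberately has no `Local` image source variant.
fn image_part_from_file(path: &str, mime: &str) -> std::io::Result<ContentPart> {
    let bytes = std::fs::read(path)?;
    let data = base64::engine::general_purpose::STANDARD.encode(bytes);
    Ok(ContentPart::Image {
        content: data,
        content_type: mime.to_string(),
        source: ImageSource::Base64,
    })
}

fn main() -> std::io::Result<()> {
    // "my-image.png" is a placeholder path.
    let part = image_part_from_file("my-image.png", "image/png")?;
    println!("Built part: {part:?}");
    Ok(())
}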

Anthropic adapter

@@ -3,7 +3,7 @@ use crate::adapter::anthropic::AnthropicStreamer;
 use crate::adapter::{Adapter, AdapterKind, ServiceType, WebRequestData};
 use crate::chat::{
     ChatOptionsSet, ChatRequest, ChatResponse, ChatRole, ChatStream, ChatStreamResponse, MessageContent, MetaUsage,
-    ToolCall,
+    ToolCall, ContentPart, ImageSource,
 };
 use crate::resolver::{AuthData, Endpoint};
 use crate::webc::WebResponse;
@@ -236,10 +236,35 @@ impl AnthropicAdapter {
     // TODO: Needs to trace/warn that other type are not supported
 }
 ChatRole::User => {
-    if let MessageContent::Text(content) = msg.content {
-        messages.push(json! ({"role": "user", "content": content}))
-    }
-    // TODO: Needs to trace/warn that other type are not supported
+    let content = match msg.content {
+        MessageContent::Text(content) => json!(content),
+        MessageContent::Parts(parts) => {
+            json!(parts.iter().map(|part| match part {
+                ContentPart::Text(text) => json!({"type": "text", "text": text.clone()}),
+                ContentPart::Image{content, content_type, source} => {
+                    match source {
+                        ImageSource::Url => todo!("Anthropic doesn't support images from URL; this needs to be handled gracefully"),
+                        ImageSource::Base64 => json!({
+                            "type": "image",
+                            "source": {
+                                "type": "base64",
+                                "media_type": content_type,
+                                "data": content,
+                            },
+                        }),
+                    }
+                },
+            }).collect::<Vec<Value>>())
+        },
+        // Use `match` instead of `if let` to future-proof this implementation:
+        // if a new MessageContent variant is added, the library will not compile
+        // until it is handled here. `continue` gracefully skips messages that
+        // cannot be serialized for this API.
+        // TODO: Probably need to warn if it is a ToolCalls type of content
+        MessageContent::ToolCalls(_) => continue,
+        MessageContent::ToolResponses(_) => continue,
+    };
+    messages.push(json! ({"role": "user", "content": content}));
 }
 ChatRole::Assistant => {
     //

@@ -266,6 +291,7 @@ impl AnthropicAdapter {
             }));
         }
         // TODO: Probably need to trace/warn that this will be ignored
+        MessageContent::Parts(_) => (),
         MessageContent::ToolResponses(_) => (),
     }
 }
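
For reference, the Base64 branch above produces the image content block documented for the Anthropic Messages API, while the `todo!` means a URL-sourced image currently panics with this adapter. A minimal sketch of the user-message payload the adapter builds (field names follow the adapter code above; values are illustrative):

use serde_json::json;

fn main() {
    // Illustrative only: a user turn with one text part and one Base64 image part.
    let message = json!({
        "role": "user",
        "content": [
            {"type": "text", "text": "What is in this picture?"},
            {
                "type": "image",
                "source": {
                    "type": "base64",
                    "media_type": "image/jpeg",
                    "data": "<BASE64 DATA>"
                }
            }
        ]
    });
    println!("{}", serde_json::to_string_pretty(&message).unwrap());
}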

Gemini adapter

@@ -3,7 +3,7 @@ use crate::adapter::gemini::GeminiStreamer;
 use crate::adapter::{Adapter, AdapterKind, ServiceType, WebRequestData};
 use crate::chat::{
     ChatOptionsSet, ChatRequest, ChatResponse, ChatResponseFormat, ChatRole, ChatStream, ChatStreamResponse,
-    MessageContent, MetaUsage,
+    MessageContent, MetaUsage, ContentPart, ImageSource,
 };
 use crate::resolver::{AuthData, Endpoint};
 use crate::webc::{WebResponse, WebStream};
@@ -21,6 +21,7 @@ const MODELS: &[&str] = &[
     "gemini-1.5-flash-8b",
     "gemini-1.0-pro",
     "gemini-1.5-flash-latest",
+    "gemini-2.0-flash-exp",
 ];
 
 // curl \
@@ -214,19 +215,61 @@ impl GeminiAdapter {
 // -- Build
 for msg in chat_req.messages {
-    // TODO: Needs to implement tool_calls
-    let MessageContent::Text(content) = msg.content else {
-        return Err(Error::MessageContentTypeNotSupported {
-            model_iden,
-            cause: "Only MessageContent::Text supported for this model (for now)",
-        });
-    };
-
     match msg.role {
         // For now, system goes as "user" (later, we might have adapter_config.system_to_user_impl)
-        ChatRole::System => systems.push(content),
-        ChatRole::User => contents.push(json! ({"role": "user", "parts": [{"text": content}]})),
-        ChatRole::Assistant => contents.push(json! ({"role": "model", "parts": [{"text": content}]})),
+        ChatRole::System => {
+            let MessageContent::Text(content) = msg.content else {
+                return Err(Error::MessageContentTypeNotSupported {
+                    model_iden,
+                    cause: "Only MessageContent::Text supported for this model (for now)",
+                });
+            };
+            systems.push(content)
+        },
+        ChatRole::User => {
+            let content = match msg.content {
+                MessageContent::Text(content) => json!([{"text": content}]),
+                MessageContent::Parts(parts) => {
+                    json!(parts.iter().map(|part| match part {
+                        ContentPart::Text(text) => json!({"text": text.clone()}),
+                        ContentPart::Image{content, content_type, source} => {
+                            match source {
+                                ImageSource::Url => json!({
+                                    "file_data": {
+                                        "mime_type": content_type,
+                                        "file_uri": content
+                                    }
+                                }),
+                                ImageSource::Base64 => json!({
+                                    "inline_data": {
+                                        "mime_type": content_type,
+                                        "data": content
+                                    }
+                                }),
+                            }
+                        },
+                    }).collect::<Vec<Value>>())
+                },
+                // Use `match` instead of `if let` to future-proof this implementation:
+                // if a new MessageContent variant is added, the library will not compile
+                // until it is handled here. `continue` gracefully skips messages that
+                // cannot be serialized for this API.
+                // TODO: Probably need to warn if it is a ToolCalls type of content
+                MessageContent::ToolCalls(_) => continue,
+                MessageContent::ToolResponses(_) => continue,
+            };
+            contents.push(json!({"role": "user", "parts": content}));
+        },
+        ChatRole::Assistant => {
+            let MessageContent::Text(content) = msg.content else {
+                return Err(Error::MessageContentTypeNotSupported {
+                    model_iden,
+                    cause: "Only MessageContent::Text supported for this model (for now)",
+                });
+            };
+            contents.push(json!({"role": "model", "parts": [{"text": content}]}))
+        },
         ChatRole::Tool => {
             return Err(Error::MessageRoleNotSupported {
                 model_iden,
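
For reference, the two image branches above map onto Gemini's `file_data` (URL plus mime type) and `inline_data` (Base64 plus mime type) part shapes, which is why the commit notes that Gemini requires a mime type for both sources. A minimal sketch of the user entry the adapter builds (field names follow the adapter code above; values are illustrative):

use serde_json::json;

fn main() {
    // Illustrative only: a user entry with a text part, a URL image, and a Base64 image.
    let user_entry = json!({
        "role": "user",
        "parts": [
            {"text": "What is in this picture?"},
            {"file_data": {"mime_type": "image/jpeg", "file_uri": "https://example.com/picture.jpg"}},
            {"inline_data": {"mime_type": "image/png", "data": "<BASE64 DATA>"}}
        ]
    });
    println!("{}", serde_json::to_string_pretty(&user_entry).unwrap());
}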

OpenAI adapter

@@ -3,7 +3,7 @@ use crate::adapter::openai::OpenAIStreamer;
 use crate::adapter::{Adapter, AdapterDispatcher, AdapterKind, ServiceType, WebRequestData};
 use crate::chat::{
     ChatOptionsSet, ChatRequest, ChatResponse, ChatResponseFormat, ChatRole, ChatStream, ChatStreamResponse,
-    MessageContent, MetaUsage, ToolCall,
+    MessageContent, MetaUsage, ToolCall, ContentPart, ImageSource,
 };
 use crate::resolver::{AuthData, Endpoint};
 use crate::webc::WebResponse;
@@ -250,10 +250,31 @@ impl OpenAIAdapter {
     // TODO: Probably need to warn if it is a ToolCalls type of content
 }
 ChatRole::User => {
-    if let MessageContent::Text(content) = msg.content {
-        messages.push(json! ({"role": "user", "content": content}));
-    }
-    // TODO: Probably need to warn if it is a ToolCalls type of content
+    let content = match msg.content {
+        MessageContent::Text(content) => json!(content),
+        MessageContent::Parts(parts) => {
+            json!(parts.iter().map(|part| match part {
+                ContentPart::Text(text) => json!({"type": "text", "text": text.clone()}),
+                ContentPart::Image{content, content_type, source} => {
+                    match source {
+                        ImageSource::Url => json!({"type": "image_url", "image_url": {"url": content}}),
+                        ImageSource::Base64 => {
+                            let image_url = format!("data:{content_type};base64,{content}");
+                            json!({"type": "image_url", "image_url": {"url": image_url}})
+                        },
+                    }
+                },
+            }).collect::<Vec<Value>>())
+        },
+        // Use `match` instead of `if let` to future-proof this implementation:
+        // if a new MessageContent variant is added, the library will not compile
+        // until it is handled here. `continue` gracefully skips messages that
+        // cannot be serialized for this API.
+        // TODO: Probably need to warn if it is a ToolCalls type of content
+        MessageContent::ToolCalls(_) => continue,
+        MessageContent::ToolResponses(_) => continue,
+    };
+    messages.push(json! ({"role": "user", "content": content}));
 }
 ChatRole::Assistant => match msg.content {

@@ -275,6 +296,7 @@ impl OpenAIAdapter {
         messages.push(json! ({"role": "assistant", "tool_calls": tool_calls}))
     }
     // TODO: Probably need to trace/warn that this will be ignored
+    MessageContent::Parts(_) => (),
     MessageContent::ToolResponses(_) => (),
 },
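
For reference, OpenAI receives images as `image_url` content parts; a Base64 image is wrapped in a data URL of the form `data:{content_type};base64,{content}`, exactly as the `format!` call above does. A minimal sketch of the resulting user content array (values are illustrative):

use serde_json::json;

fn main() {
    let content_type = "image/png";
    let data = "<BASE64 DATA>";
    let image_url = format!("data:{content_type};base64,{data}");
    // Illustrative only: the "content" array pushed for a user message with text + image.
    let content = json!([
        {"type": "text", "text": "What is in this picture?"},
        {"type": "image_url", "image_url": {"url": image_url}}
    ]);
    println!("{}", serde_json::to_string_pretty(&content).unwrap());
}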

MessageContent / ContentPart (chat content types)

@@ -2,13 +2,14 @@ use crate::chat::{ToolCall, ToolResponse};
 use derive_more::derive::From;
 use serde::{Deserialize, Serialize};
 
-/// Currently, it only supports Text,
-/// but the goal is to support multi-part message content (see below)
 #[derive(Debug, Clone, Serialize, Deserialize, From)]
 pub enum MessageContent {
     /// Text content
     Text(String),
 
+    /// Content parts
+    Parts(Vec<ContentPart>),
+
     /// Tool calls
     #[from]
     ToolCalls(Vec<ToolCall>),
@@ -25,6 +26,9 @@ impl MessageContent {
         MessageContent::Text(content.into())
     }
 
+    /// Create a new MessageContent from provided content parts
+    pub fn from_parts(parts: impl Into<Vec<ContentPart>>) -> Self { MessageContent::Parts(parts.into()) }
+
     /// Create a new MessageContent with the ToolCalls variant
     pub fn from_tool_calls(tool_calls: Vec<ToolCall>) -> Self {
         MessageContent::ToolCalls(tool_calls)
@@ -40,6 +44,12 @@ impl MessageContent {
     pub fn text_as_str(&self) -> Option<&str> {
         match self {
             MessageContent::Text(content) => Some(content.as_str()),
+            MessageContent::Parts(parts) => {
+                Some(parts.iter().filter_map(|part| match part {
+                    ContentPart::Text(content) => Some(content.clone()),
+                    _ => None,
+                }).collect::<Vec<String>>().join("\n").leak()) // TODO: revisit this, should we leak the &str?
+            },
             MessageContent::ToolCalls(_) => None,
             MessageContent::ToolResponses(_) => None,
         }
@@ -53,6 +63,12 @@ impl MessageContent {
     pub fn text_into_string(self) -> Option<String> {
         match self {
             MessageContent::Text(content) => Some(content),
+            MessageContent::Parts(parts) => {
+                Some(parts.into_iter().filter_map(|part| match part {
+                    ContentPart::Text(content) => Some(content),
+                    _ => None,
+                }).collect::<Vec<String>>().join("\n"))
+            },
             MessageContent::ToolCalls(_) => None,
             MessageContent::ToolResponses(_) => None,
         }
@@ -62,6 +78,7 @@ impl MessageContent {
     pub fn is_empty(&self) -> bool {
         match self {
             MessageContent::Text(content) => content.is_empty(),
+            MessageContent::Parts(parts) => parts.is_empty(),
             MessageContent::ToolCalls(tool_calls) => tool_calls.is_empty(),
             MessageContent::ToolResponses(tool_responses) => tool_responses.is_empty(),
         }
@@ -94,27 +111,39 @@ impl From<ToolResponse> for MessageContent {
     }
 }
 
+impl From<Vec<ContentPart>> for MessageContent {
+    fn from(parts: Vec<ContentPart>) -> Self { MessageContent::Parts(parts) }
+}
+
 // endregion: --- Froms
 
-// NOTE: The goal is to add a `Parts(Vec<ContentPart>)` variant to `MessageContent`
-//
-// ```
-// pub enum MessageContent {
-//   Text(String),
-//   Parts(Vec<ContentPart>)
-// }
-// ```
-//
-// With something like this:
-// ```
-// pub enum ContentPart {
-//   Text(String),
-//   Image(ImagePart)
-// }
-//
-// pub enum ImagePart {
-//   Local(PathBuf),
-//   Remote(Url),
-//   Base64(String)
-// }
-// ```
+#[derive(Debug, Clone, Serialize, Deserialize, From)]
+pub enum ContentPart {
+    Text(String),
+    Image {
+        content: String,
+        content_type: String,
+        source: ImageSource,
+    },
+}
+
+// region: --- Froms
+
+impl<'a> From<&'a str> for ContentPart {
+    fn from(s: &'a str) -> Self {
+        ContentPart::Text(s.to_string())
+    }
+}
+
+// endregion: --- Froms
+
+#[derive(Debug, Clone, Serialize, Deserialize, From)]
+pub enum ImageSource {
+    Url,
+    Base64,
+    // No `Local` variant: it would require handling errors like "file not found", etc.
+    // A local file can easily be provided by the user as Base64, and a convenient
+    // TryFrom<File> to Base64 conversion could be added later. All LLMs accept local
+    // images only as Base64.
+}
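
A short usage sketch of the new types, using only what this diff defines (the constructors, the From impls, and is_empty); it assumes these types are re-exported from genai::chat as in the example above:

use genai::chat::{ChatMessage, ContentPart, ImageSource, MessageContent};

fn main() {
    // Build multi-part content explicitly...
    let content = MessageContent::from_parts(vec![
        ContentPart::from("Describe this image in one sentence."),
        ContentPart::Image {
            content: "<BASE64 DATA>".to_string(),
            content_type: "image/png".to_string(),
            source: ImageSource::Base64,
        },
    ]);
    assert!(!content.is_empty());

    // ...or rely on From<Vec<ContentPart>>, which is what ChatMessage::user(vec![...]) uses.
    let _msg = ChatMessage::user(vec![ContentPart::from("What is in this picture?")]);
}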

Chat request seed functions (test support)

@@ -1,4 +1,4 @@
-use genai::chat::{ChatMessage, ChatRequest, Tool};
+use genai::chat::{ChatMessage, ChatRequest, ContentPart, ImageSource, Tool};
 use serde_json::json;
 
 pub fn seed_chat_req_simple() -> ChatRequest {

@@ -9,6 +9,21 @@ pub fn seed_chat_req_simple() -> ChatRequest {
     ])
 }
 
+pub fn seed_chat_req_with_image() -> ChatRequest {
+    ChatRequest::new(vec![
+        // -- Messages (deactivate to see the differences)
+        ChatMessage::system("Answer in one sentence"),
+        ChatMessage::user(vec![
+            ContentPart::from("What is in this image?"),
+            ContentPart::Image {
+                content: "BASE64 ENCODED IMAGE".to_string(),
+                content_type: "image/png".to_string(),
+                source: ImageSource::Base64,
+            },
+        ]),
+    ])
+}
+
 pub fn seed_chat_req_tool_simple() -> ChatRequest {
     ChatRequest::new(vec![
         // -- Messages (deactivate to see the differences)
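
This seed is presumably meant to be exercised by the adapter test suites. A hypothetical sketch of how it could be used from the same support module (the function name and assertion are not part of this diff; the `exec_chat` call and the `content_text_as_str` accessor are assumed to be the existing genai APIs):

// Hypothetical test sketch (not part of this commit): run the image seed against
// a vision-capable model and check that some answer text comes back.
use genai::Client;

async fn common_test_chat_image_ok(model: &str) -> Result<(), Box<dyn std::error::Error>> {
    let client = Client::default();
    let chat_req = seed_chat_req_with_image();
    let chat_res = client.exec_chat(model, chat_req, None).await?;
    assert!(chat_res.content_text_as_str().map(|t| !t.is_empty()).unwrap_or(false));
    Ok(())
}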