diff --git a/examples/clip/main.rs b/examples/clip/main.rs index 32aef96..e2bfc1f 100644 --- a/examples/clip/main.rs +++ b/examples/clip/main.rs @@ -39,7 +39,10 @@ fn main() -> Result<(), Box> { let feats_image = model.encode_images(&images).unwrap(); // use image to query texts - let matrix = feats_image.dot2(&feats_text)?; + let matrix = match feats_image.embedding() { + Some(x) => x.dot2(feats_text.embedding().unwrap())?, + None => continue, + }; // summary for i in 0..paths.len() { diff --git a/examples/dinov2/images/bus.jpg b/examples/dinov2/images/bus.jpg deleted file mode 100644 index 40eaaf5..0000000 Binary files a/examples/dinov2/images/bus.jpg and /dev/null differ diff --git a/examples/dinov2/main.rs b/examples/dinov2/main.rs index fd666c6..48373bc 100644 --- a/examples/dinov2/main.rs +++ b/examples/dinov2/main.rs @@ -1,4 +1,4 @@ -use usls::{models::Dinov2, Options}; +use usls::{models::Dinov2, DataLoader, Options}; fn main() -> Result<(), Box> { // build model @@ -7,8 +7,10 @@ fn main() -> Result<(), Box> { .with_i00((1, 1, 1).into()) .with_i02((224, 224, 224).into()) .with_i03((224, 224, 224).into()); - let _model = Dinov2::new(options)?; - println!("TODO..."); + let mut model = Dinov2::new(options)?; + let x = vec![DataLoader::try_read("./examples/dinov2/images/1.jpg")?]; + let y = model.run(&x)?; + println!("{y:?}"); // query from vector // let ys = model.query_from_vec( diff --git a/src/core/ts.rs b/src/core/ts.rs index 197844c..9866953 100644 --- a/src/core/ts.rs +++ b/src/core/ts.rs @@ -4,11 +4,6 @@ use std::time::Duration; pub struct Ts { n: usize, ts: Vec, - // pub t0: Duration, - // pub t1: Duration, - // pub t2: Duration, - // pub t3: Duration, - // pub t4: Duration, } impl Ts { diff --git a/src/models/blip.rs b/src/models/blip.rs index 0f1758e..be4f0c1 100644 --- a/src/models/blip.rs +++ b/src/models/blip.rs @@ -42,7 +42,7 @@ impl Blip { }) } - pub fn encode_images(&mut self, xs: &[DynamicImage]) -> Result { + pub fn encode_images(&mut self, xs: &[DynamicImage]) -> Result { let xs_ = ops::resize( xs, self.height.opt as u32, @@ -56,7 +56,7 @@ impl Blip { &[0.26862954, 0.2613026, 0.2757771], ); let ys: Vec> = self.visual.run(&[xs_])?; - Ok(Embedding::new(ys[0].to_owned())) + Ok(Y::default().with_embedding(Embedding::new(ys[0].to_owned()))) } pub fn caption( @@ -67,8 +67,9 @@ impl Blip { ) -> Result> { let mut ys: Vec = Vec::new(); let image_embeds = self.encode_images(x)?; + let image_embeds = image_embeds.embedding().unwrap(); let image_embeds_attn_mask: Array = - Array::ones((1, image_embeds.embedding().shape()[1])).into_dyn(); + Array::ones((1, image_embeds.data().shape()[1])).into_dyn(); let mut y_text = String::new(); // conditional @@ -104,7 +105,7 @@ impl Blip { let y = self.textual.run(&[ input_ids_nd, input_ids_attn_mask, - image_embeds.embedding().to_owned(), + image_embeds.data().to_owned(), image_embeds_attn_mask.to_owned(), ])?; // N, length, vocab_size let y = y[0].slice(s!(0, -1.., ..)); diff --git a/src/models/clip.rs b/src/models/clip.rs index 03eb633..a10e10e 100644 --- a/src/models/clip.rs +++ b/src/models/clip.rs @@ -1,4 +1,4 @@ -use crate::{ops, Embedding, MinOptMax, Options, OrtEngine}; +use crate::{ops, Embedding, MinOptMax, Options, OrtEngine, Y}; use anyhow::Result; use image::DynamicImage; use ndarray::{Array, Array2, IxDyn}; @@ -52,7 +52,7 @@ impl Clip { }) } - pub fn encode_images(&mut self, xs: &[DynamicImage]) -> Result { + pub fn encode_images(&mut self, xs: &[DynamicImage]) -> Result { let xs_ = ops::resize( xs, self.height.opt as u32, @@ -66,10 +66,10 @@ impl Clip { &[0.26862954, 0.2613026, 0.2757771], ); let ys: Vec> = self.visual.run(&[xs_])?; - Ok(Embedding::new(ys[0].to_owned())) + Ok(Y::default().with_embedding(Embedding::new(ys[0].to_owned()))) } - pub fn encode_texts(&mut self, texts: &[String]) -> Result { + pub fn encode_texts(&mut self, texts: &[String]) -> Result { let encodings = self .tokenizer .encode_batch(texts.to_owned(), false) @@ -80,7 +80,7 @@ impl Clip { .collect(); let xs = Array2::from_shape_vec((texts.len(), self.context_length), xs)?.into_dyn(); let ys = self.textual.run(&[xs])?; - Ok(Embedding::new(ys[0].to_owned())) + Ok(Y::default().with_embedding(Embedding::new(ys[0].to_owned()))) } pub fn batch_visual(&self) -> usize { diff --git a/src/models/dinov2.rs b/src/models/dinov2.rs index f6f7c18..0ea2a42 100644 --- a/src/models/dinov2.rs +++ b/src/models/dinov2.rs @@ -1,4 +1,4 @@ -use crate::{ops, MinOptMax, Options, OrtEngine}; +use crate::{ops, Embedding, MinOptMax, Options, OrtEngine, Y}; use anyhow::Result; use image::DynamicImage; use ndarray::{Array, IxDyn}; @@ -48,22 +48,21 @@ impl Dinov2 { }) } - pub fn run(&mut self, xs: &[DynamicImage]) -> Result> { + pub fn run(&mut self, xs: &[DynamicImage]) -> Result { let xs_ = ops::resize( xs, self.height.opt as u32, self.width.opt as u32, "lanczos3", )?; - let xs_ = ops::normalize(xs_, 0.0, 255.0); + let xs_ = ops::normalize(xs_, 0., 255.); let xs_ = ops::standardize( xs_, &[0.48145466, 0.4578275, 0.40821073], &[0.26862954, 0.2613026, 0.2757771], ); let ys: Vec> = self.engine.run(&[xs_])?; - let ys = ys[0].to_owned(); - Ok(ys) + Ok(Y::default().with_embedding(Embedding::new(ys[0].to_owned()))) } // pub fn build_index(&self, metric: Metric) -> Result { diff --git a/src/ys/embedding.rs b/src/ys/embedding.rs index 87ce35a..c615772 100644 --- a/src/ys/embedding.rs +++ b/src/ys/embedding.rs @@ -7,9 +7,7 @@ pub struct Embedding(Array); impl std::fmt::Debug for Embedding { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.debug_struct("Embedding") - .field("Shape", &self.0.shape()) - .finish() + f.debug_struct("").field("Shape", &self.0.shape()).finish() } } @@ -23,7 +21,7 @@ impl Embedding { self } - pub fn embedding(&self) -> &Array { + pub fn data(&self) -> &Array { &self.0 } diff --git a/src/ys/y.rs b/src/ys/y.rs index 24c5ec3..5007fb3 100644 --- a/src/ys/y.rs +++ b/src/ys/y.rs @@ -1,4 +1,4 @@ -use crate::{Bbox, Keypoint, Mask, Mbr, Polygon, Prob}; +use crate::{Bbox, Embedding, Keypoint, Mask, Mbr, Polygon, Prob}; #[derive(Clone, PartialEq, Default)] pub struct Y { @@ -9,6 +9,7 @@ pub struct Y { polygons: Option>, texts: Option>, masks: Option>, + embedding: Option, } impl std::fmt::Debug for Y { @@ -47,6 +48,9 @@ impl std::fmt::Debug for Y { f.field("Masks", &x); } } + if let Some(x) = &self.embedding { + f.field("Embedding", &x); + } f.finish() } } @@ -71,11 +75,17 @@ impl Y { self.mbrs = Some(mbrs.to_vec()); self } + pub fn with_bboxes(mut self, bboxes: &[Bbox]) -> Self { self.bboxes = Some(bboxes.to_vec()); self } + pub fn with_embedding(mut self, embedding: Embedding) -> Self { + self.embedding = Some(embedding); + self + } + pub fn with_keypoints(mut self, keypoints: &[Vec]) -> Self { self.keypoints = Some(keypoints.to_vec()); self @@ -114,6 +124,10 @@ impl Y { self.texts.as_ref() } + pub fn embedding(&self) -> Option<&Embedding> { + self.embedding.as_ref() + } + pub fn apply_bboxes_nms(mut self, iou_threshold: f32) -> Self { match &mut self.bboxes { None => self,