From a3a4bf47edee64b9ffcbb767c56f2f2bf6aa91ee Mon Sep 17 00:00:00 2001 From: Jamjamjon <51357717+jamjamjon@users.noreply.github.com> Date: Thu, 29 May 2025 23:33:16 +0800 Subject: [PATCH] Add mobileclip and jina-clip-v2 (#106) --- .github/workflows/rust-ci.yml | 10 +++--- README.md | 4 ++- examples/clip/README.md | 6 ++-- examples/clip/main.rs | 50 ++++++++++++++++---------- src/inference/x.rs | 67 +++++++++++++++++++++++++++++++++-- src/models/clip/README.md | 6 +++- src/models/clip/config.rs | 62 +++++++++++++++++++++++++++++--- src/models/clip/impl.rs | 3 +- src/utils/ops.rs | 19 ++++++++++ 9 files changed, 190 insertions(+), 37 deletions(-) diff --git a/.github/workflows/rust-ci.yml b/.github/workflows/rust-ci.yml index 273c163..55a0847 100644 --- a/.github/workflows/rust-ci.yml +++ b/.github/workflows/rust-ci.yml @@ -21,7 +21,7 @@ jobs: - name: Install dependencies run: | - DEBIAN_FRONTEND=noninteractive apt-get update + DEBIAN_FRONTEND=noninteractive apt-get update --fix-missing DEBIAN_FRONTEND=noninteractive apt-get install -y build-essential ca-certificates clang curl pkg-config protobuf-compiler - name: Setup Rust @@ -46,7 +46,7 @@ jobs: - name: Install dependencies run: | - DEBIAN_FRONTEND=noninteractive apt-get update + DEBIAN_FRONTEND=noninteractive apt-get update --fix-missing DEBIAN_FRONTEND=noninteractive apt-get install -y build-essential ca-certificates clang curl pkg-config protobuf-compiler - name: Setup Rust @@ -66,7 +66,7 @@ jobs: - name: Install dependencies run: | - DEBIAN_FRONTEND=noninteractive apt-get update + DEBIAN_FRONTEND=noninteractive apt-get update --fix-missing DEBIAN_FRONTEND=noninteractive apt-get install -y build-essential ca-certificates clang curl pkg-config protobuf-compiler - name: Setup Rust @@ -92,11 +92,11 @@ jobs: - name: Install dependencies run: | - DEBIAN_FRONTEND=noninteractive apt-get update + DEBIAN_FRONTEND=noninteractive apt-get update --fix-missing DEBIAN_FRONTEND=noninteractive apt-get install -y build-essential ca-certificates clang curl pkg-config protobuf-compiler - name: Setup Rust uses: dtolnay/rust-toolchain@stable - name: Build - run: cargo build --all-features \ No newline at end of file + run: cargo build --all-features diff --git a/README.md b/README.md index 5f7f086..90813d7 100644 --- a/README.md +++ b/README.md @@ -62,7 +62,7 @@ - **YOLO Models**: [YOLOv5](https://github.com/ultralytics/yolov5), [YOLOv6](https://github.com/meituan/YOLOv6), [YOLOv7](https://github.com/WongKinYiu/yolov7), [YOLOv8](https://github.com/ultralytics/ultralytics), [YOLOv9](https://github.com/WongKinYiu/yolov9), [YOLOv10](https://github.com/THU-MIG/yolov10), [YOLO11](https://github.com/ultralytics/ultralytics), [YOLOv12](https://github.com/sunsmarterjie/yolov12) - **SAM Models**: [SAM](https://github.com/facebookresearch/segment-anything), [SAM2](https://github.com/facebookresearch/segment-anything-2), [MobileSAM](https://github.com/ChaoningZhang/MobileSAM), [EdgeSAM](https://github.com/chongzhou96/EdgeSAM), [SAM-HQ](https://github.com/SysCV/sam-hq), [FastSAM](https://github.com/CASIA-IVA-Lab/FastSAM) - **Vision Models**: [RT-DETR](https://arxiv.org/abs/2304.08069), [RTMO](https://github.com/open-mmlab/mmpose/tree/main/projects/rtmo), [Depth-Anything](https://github.com/LiheYoung/Depth-Anything), [DINOv2](https://github.com/facebookresearch/dinov2), [MODNet](https://github.com/ZHKKKe/MODNet), [Sapiens](https://arxiv.org/abs/2408.12569), [DepthPro](https://github.com/apple/ml-depth-pro), [FastViT](https://github.com/apple/ml-fastvit), 
[BEiT](https://github.com/microsoft/unilm/tree/master/beit), [MobileOne](https://github.com/apple/ml-mobileone) -- **Vision-Language Models**: [CLIP](https://github.com/openai/CLIP), [jina-clip-v1](https://huggingface.co/jinaai/jina-clip-v1), [BLIP](https://arxiv.org/abs/2201.12086), [GroundingDINO](https://github.com/IDEA-Research/GroundingDINO), [YOLO-World](https://github.com/AILab-CVC/YOLO-World), [Florence2](https://arxiv.org/abs/2311.06242), [Moondream2](https://github.com/vikhyat/moondream/tree/main) +- **Vision-Language Models**: [CLIP](https://github.com/openai/CLIP), [jina-clip-v1-v2](https://huggingface.co/jinaai/jina-clip-v1), [BLIP](https://arxiv.org/abs/2201.12086), [GroundingDINO](https://github.com/IDEA-Research/GroundingDINO), [YOLO-World](https://github.com/AILab-CVC/YOLO-World), [Florence2](https://arxiv.org/abs/2311.06242), [Moondream2](https://github.com/vikhyat/moondream/tree/main) - **OCR-Related Models**: [FAST](https://github.com/czczup/FAST), [DB(PaddleOCR-Det)](https://arxiv.org/abs/1911.08947), [SVTR(PaddleOCR-Rec)](https://arxiv.org/abs/2205.00159), [SLANet](https://paddlepaddle.github.io/PaddleOCR/latest/algorithm/table_recognition/algorithm_table_slanet.html), [TrOCR](https://huggingface.co/microsoft/trocr-base-printed), [DocLayout-YOLO](https://github.com/opendatalab/DocLayout-YOLO)
@@ -100,6 +100,8 @@ | [GroundingDINO](https://github.com/IDEA-Research/GroundingDINO) | Open-Set Detection With Language | [demo](examples/grounding-dino) | ✅ | ✅ | ✅ | | | | [CLIP](https://github.com/openai/CLIP) | Vision-Language Embedding | [demo](examples/clip) | ✅ | ✅ | ✅ | ❌ | ❌ | | [jina-clip-v1](https://huggingface.co/jinaai/jina-clip-v1) | Vision-Language Embedding | [demo](examples/clip) | ✅ | ✅ | ✅ | ❌ | ❌ | +| [jina-clip-v2](https://huggingface.co/jinaai/jina-clip-v2) | Vision-Language Embedding | [demo](examples/clip) | ✅ | ✅ | ✅ | ❌ | ❌ | +| [mobileclip](https://github.com/apple/ml-mobileclip) | Vision-Language Embedding | [demo](examples/clip) | ✅ | ✅ | ✅ | ❌ | ❌ | | [BLIP](https://github.com/salesforce/BLIP) | Image Captioning | [demo](examples/blip) | ✅ | ✅ | ✅ | ❌ | ❌ | | [DB(PaddleOCR-Det)](https://arxiv.org/abs/1911.08947) | Text Detection | [demo](examples/db) | ✅ | ✅ | ✅ | ✅ | ✅ | | [FAST](https://github.com/czczup/FAST) | Text Detection | [demo](examples/fast) | ✅ | ✅ | ✅ | ✅ | ✅ | diff --git a/examples/clip/README.md b/examples/clip/README.md index 71fe94e..fc9c1cf 100644 --- a/examples/clip/README.md +++ b/examples/clip/README.md @@ -9,7 +9,7 @@ cargo run -r -F cuda --example clip -- --device cuda:0 ## Results ```shell -(99.9675%) ./examples/clip/images/carrot.jpg => Some carrots -(99.93718%) ./examples/clip/images/doll.jpg => There is a doll with red hair and a clock on a table -(100.0%) ./examples/clip/images/drink.jpg => Some people holding wine glasses in a restaurant +[99.999428%] (examples/clip/images/carrot.jpg) <=> (A picture of some carrots.) +[100.000000%] (examples/clip/images/doll.jpg) <=> (There is a doll with red hair and a clock on a table.) +[99.990738%] (examples/clip/images/drink.jpg) <=> (Some people holding wine glasses in a restaurant.) ``` diff --git a/examples/clip/main.rs b/examples/clip/main.rs index 90d1055..a4ceb98 100644 --- a/examples/clip/main.rs +++ b/examples/clip/main.rs @@ -1,5 +1,6 @@ use anyhow::Result; -use usls::{models::Clip, Config, DataLoader, Ops}; +use ndarray::Axis; +use usls::{models::Clip, Config, DataLoader}; #[derive(argh::FromArgs)] /// CLIP Example @@ -7,6 +8,10 @@ struct Args { /// device #[argh(option, default = "String::from(\"cpu:0\")")] device: String, + + /// dtype + #[argh(option, default = "String::from(\"fp16\")")] + dtype: String, } fn main() -> Result<()> { @@ -17,45 +22,52 @@ fn main() -> Result<()> { let args: Args = argh::from_env(); // build model - let config = Config::jina_clip_v1() + let config = Config::mobileclip_s0() + // mobileclip_blt() + // clip_vit_b16() + // clip_vit_l14() + // clip_vit_b32() + // jina_clip_v1() + // jina_clip_v2() + .with_dtype_all(args.dtype.as_str().try_into()?) .with_device_all(args.device.as_str().try_into()?) 
.commit()?; let mut model = Clip::new(config)?; // texts let texts = vec![ - "A photo of a dinosaur", - "A photo of a cat", - "A photo of a dog", - "Some carrots", - "There are some playing cards on a striped table cloth", - "There is a doll with red hair and a clock on a table", - "Some people holding wine glasses in a restaurant", + "A photo of a dinosaur.", + "A photo of a cat.", + "A photo of a dog.", + "A picture of some carrots.", + "There are some playing cards on a striped table cloth.", + "There is a doll with red hair and a clock on a table.", + "Some people holding wine glasses in a restaurant.", ]; - let feats_text = model.encode_texts(&texts)?; // [n, ndim] + let feats_text = model.encode_texts(&texts)?.norm(1)?; // load images let dl = DataLoader::new("./examples/clip/images")?.build()?; // run - for images in dl { - let feats_image = model.encode_images(&images)?; + for images in &dl { + let feats_image = model.encode_images(&images)?.norm(1)?; // use image to query texts - let matrix = Ops::dot2(&feats_image, &feats_text)?; + let matrix = (feats_image * 100.).dot2(&feats_text)?.softmax(1)?; - for i in 0..images.len() { - let probs = &matrix[i]; - let (id, &score) = probs + // Process each image's matching scores + for (i, row) in matrix.axis_iter(Axis(0)).enumerate() { + let (id, &score) = row .iter() .enumerate() - .reduce(|max, x| if x.1 > max.1 { x } else { max }) + .max_by(|a, b| a.1.partial_cmp(b.1).unwrap()) .unwrap(); println!( - "({:?}%) {:?} => {} ", + "[{:.6}%] ({}) <=> ({})", score * 100.0, - images[i].source(), + images[i].source().unwrap().display(), &texts[id] ); } diff --git a/src/inference/x.rs b/src/inference/x.rs index a176585..307cec8 100644 --- a/src/inference/x.rs +++ b/src/inference/x.rs @@ -1,6 +1,7 @@ use anyhow::Result; use image::DynamicImage; -use ndarray::{Array, Dim, IntoDimension, IxDyn, IxDynImpl}; +use ndarray::{Array, Dim, IntoDimension, Ix2, IxDyn, IxDynImpl}; +// use std::ops::Mul; use crate::{Ops, ResizeMode}; @@ -64,9 +65,39 @@ impl std::ops::Deref for X { } } -impl X { - // TODO: Add some slice and index method +impl std::ops::Mul<f32> for X { + type Output = Self; + fn mul(self, other: f32) -> Self::Output { + Self(self.0 * other) + } +} + +impl std::ops::Div<f32> for X { + type Output = Self; + + fn div(self, other: f32) -> Self::Output { + Self(self.0 / other) + } +} + +impl std::ops::Add<f32> for X { + type Output = Self; + + fn add(self, other: f32) -> Self::Output { + Self(self.0 + other) + } +} + +impl std::ops::Sub<f32> for X { + type Output = Self; + + fn sub(self, other: f32) -> Self::Output { + Self(self.0 - other) + } +} + +impl X { pub fn zeros(shape: &[usize]) -> Self { Self::from(Array::zeros(Dim(IxDynImpl::from(shape.to_vec())))) } @@ -187,6 +218,36 @@ impl X { Ok(self) } + pub fn dot2(&self, other: &Self) -> Result<Self> { + // Check dimensions + if self.ndim() != 2 || other.ndim() != 2 { + anyhow::bail!( + "dot2 requires 2D matrices, got {}D and {}D", + self.ndim(), + other.ndim() + ); + } + + let a = self.0.as_standard_layout().into_dimensionality::<Ix2>()?; + let b = other.0.as_standard_layout().into_dimensionality::<Ix2>()?; + + // Check compatibility + if a.shape()[1] != b.shape()[1] { + anyhow::bail!( + "Incompatible dimensions for dot2: {:?} and {:?}", + a.shape(), + b.shape() + ); + } + + Ok(a.dot(&b.t()).into_dyn().into()) + } + + pub fn softmax(mut self, d: usize) -> Result<Self> { + self.0 = Ops::softmax(self.0, d)?; + Ok(self) + } + pub fn unsigned(mut self) -> Self { self.0.par_mapv_inplace(|x| if x < 0.0 { 0.0 } else { x }); self diff --git 
a/src/models/clip/README.md b/src/models/clip/README.md index 8bc962e..e5a7845 100644 --- a/src/models/clip/README.md +++ b/src/models/clip/README.md @@ -4,7 +4,11 @@ CLIP (Contrastive Language-Image Pre-Training) is a neural network trained on a ## Official Repository -The official repository can be found on: [GitHub](https://github.com/openai/CLIP) +The official repositories can be found at: +- [CLIP](https://github.com/openai/CLIP) +- [jina-clip-v1](https://huggingface.co/jinaai/jina-clip-v1) +- [jina-clip-v2](https://huggingface.co/jinaai/jina-clip-v2) +- [mobileclip](https://github.com/apple/ml-mobileclip) ## Example diff --git a/src/models/clip/config.rs b/src/models/clip/config.rs index 8b1d69f..0d59776 100644 --- a/src/models/clip/config.rs +++ b/src/models/clip/config.rs @@ -36,22 +36,76 @@ impl crate::Config { pub fn jina_clip() -> Self { Self::default() - .with_name("jina-clip-v1") .with_batch_size_all(1) .with_visual_ixx(0, 1, 3.into()) .with_visual_ixx(0, 2, 224.into()) .with_visual_ixx(0, 3, 224.into()) .with_image_mean(&[0.48145466, 0.4578275, 0.40821073]) .with_image_std(&[0.26862954, 0.2613026, 0.2757771]) + .with_visual_file("visual.onnx") + .with_textual_file("textual.onnx") + } + + pub fn jina_clip_v1() -> Self { + Self::jina_clip() + .with_name("jina-clip-v1") .with_tokenizer_file("jina-clip-v1/tokenizer.json") .with_tokenizer_config_file("jina-clip-v1/tokenizer_config.json") .with_special_tokens_map_file("jina-clip-v1/special_tokens_map.json") .with_config_file("jina-clip-v1/config.json") } - pub fn jina_clip_v1() -> Self { + pub fn jina_clip_v2() -> Self { Self::jina_clip() - .with_visual_file("visual.onnx") - .with_textual_file("textual.onnx") + .with_name("jina-clip-v2") + .with_visual_ixx(0, 2, 512.into()) + .with_visual_ixx(0, 3, 512.into()) + .with_tokenizer_file("jina-clip-v2/tokenizer.json") + .with_tokenizer_config_file("jina-clip-v2/tokenizer_config.json") + .with_special_tokens_map_file("jina-clip-v2/special_tokens_map.json") + .with_config_file("jina-clip-v2/config.json") + } + + pub fn mobileclip() -> Self { + Self::default() + .with_name("mobileclip") + .with_batch_size_all(1) + .with_visual_ixx(0, 1, 3.into()) + .with_visual_ixx(0, 2, 224.into()) + .with_visual_ixx(0, 3, 224.into()) + .with_model_max_length(77) + .with_tokenizer_file("clip/tokenizer.json") + .with_tokenizer_config_file("clip/tokenizer_config.json") + .with_special_tokens_map_file("clip/special_tokens_map.json") + } + + pub fn mobileclip_s0() -> Self { + Self::mobileclip() + .with_textual_file("s0-textual.onnx") + .with_visual_file("s0-visual.onnx") + } + + pub fn mobileclip_s1() -> Self { + Self::mobileclip() + .with_textual_file("s1-textual.onnx") + .with_visual_file("s1-visual.onnx") + } + + pub fn mobileclip_s2() -> Self { + Self::mobileclip() + .with_textual_file("s2-textual.onnx") + .with_visual_file("s2-visual.onnx") + } + + pub fn mobileclip_b() -> Self { + Self::mobileclip() + .with_textual_file("b-textual.onnx") + .with_visual_file("b-visual.onnx") + } + + pub fn mobileclip_blt() -> Self { + Self::mobileclip() + .with_textual_file("blt-textual.onnx") + .with_visual_file("blt-visual.onnx") } } diff --git a/src/models/clip/impl.rs b/src/models/clip/impl.rs index 7d464c0..521426f 100644 --- a/src/models/clip/impl.rs +++ b/src/models/clip/impl.rs @@ -54,7 +54,7 @@ impl Clip { let xs = elapsed!("textual-preprocess", self.ts, { let encodings: Vec<f32> = self .processor - .encode_texts_ids(xs, false)? + .encode_texts_ids(xs, true)? 
.into_iter() .flatten() .collect(); @@ -62,6 +62,7 @@ impl Clip { let x: X = Array2::from_shape_vec((xs.len(), encodings.len() / xs.len()), encodings)? .into_dyn() .into(); + x }); let xs = elapsed!("textual-inference", self.ts, { diff --git a/src/utils/ops.rs b/src/utils/ops.rs index b29e35f..af0492c 100644 --- a/src/utils/ops.rs +++ b/src/utils/ops.rs @@ -189,6 +189,25 @@ impl Ops<'_> { Ok(xs / std_) } + pub fn softmax(xs: Array<f32, IxDyn>, d: usize) -> Result<Array<f32, IxDyn>> { + if xs.shape().len() <= d { + anyhow::bail!( + "`softmax`: Specified axis {} exceeds the maximum dimension length {}.", + d, + xs.shape().len() + ); + } + let max_vals = xs + .map_axis(Axis(d), |view| { + view.fold(f32::NEG_INFINITY, |a, &b| a.max(b)) + }) + .insert_axis(Axis(d)); + let exps = (&xs - &max_vals).mapv(f32::exp); + let sums = exps.sum_axis(Axis(d)).insert_axis(Axis(d)); + + Ok(exps / sums) + } + pub fn scale_wh(w0: f32, h0: f32, w1: f32, h1: f32) -> (f32, f32, f32) { let r = (w1 / w0).min(h1 / h0); (r, (w0 * r).round(), (h0 * r).round())
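
Note on the new similarity pipeline: the updated example L2-normalizes both embedding matrices (`.norm(1)?`), scales the image features by 100 (the conventional CLIP logit scale), takes the matrix product via `dot2`, and applies the new row-wise `Ops::softmax` to turn the scores into per-image probabilities. The sketch below is a standalone illustration of that softmax step using plain `ndarray`; the `softmax_rows` helper and the sample logits are hypothetical and not part of this patch.

```rust
// Standalone sketch of a numerically stable row-wise softmax, mirroring
// what `Ops::softmax` does along a chosen axis (here fixed to rows).
// `softmax_rows` and the sample logits below are illustrative only.
use ndarray::{Array2, Axis};

fn softmax_rows(xs: &Array2<f32>) -> Array2<f32> {
    // Per-row maximum, kept as an (n, 1) column so it broadcasts over columns.
    let max_vals = xs
        .map_axis(Axis(1), |row| row.fold(f32::NEG_INFINITY, |a, &b| a.max(b)))
        .insert_axis(Axis(1));
    // Subtract the row max before exponentiating to avoid overflow.
    let exps = (xs - &max_vals).mapv(f32::exp);
    let sums = exps.sum_axis(Axis(1)).insert_axis(Axis(1));
    exps / sums
}

fn main() {
    // Pretend these are 100 * cosine similarities for one image against three captions.
    let logits = Array2::from_shape_vec((1, 3), vec![21.3, 27.9, 14.2]).unwrap();
    let probs = softmax_rows(&logits);
    println!("{}", probs); // the second caption gets nearly all of the probability mass
}
```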