From a3a4bf47edee64b9ffcbb767c56f2f2bf6aa91ee Mon Sep 17 00:00:00 2001 From: Jamjamjon <51357717+jamjamjon@users.noreply.github.com> Date: Thu, 29 May 2025 23:33:16 +0800 Subject: [PATCH] Add mobileclip and jina-clip-v2 (#106) --- .github/workflows/rust-ci.yml | 10 +++--- README.md | 4 ++- examples/clip/README.md | 6 ++-- examples/clip/main.rs | 50 ++++++++++++++++---------- src/inference/x.rs | 67 +++++++++++++++++++++++++++++++++-- src/models/clip/README.md | 6 +++- src/models/clip/config.rs | 62 +++++++++++++++++++++++++++++--- src/models/clip/impl.rs | 3 +- src/utils/ops.rs | 19 ++++++++++ 9 files changed, 190 insertions(+), 37 deletions(-) diff --git a/.github/workflows/rust-ci.yml b/.github/workflows/rust-ci.yml index 273c163..55a0847 100644 --- a/.github/workflows/rust-ci.yml +++ b/.github/workflows/rust-ci.yml @@ -21,7 +21,7 @@ jobs: - name: Install dependencies run: | - DEBIAN_FRONTEND=noninteractive apt-get update + DEBIAN_FRONTEND=noninteractive apt-get update --fix-missing DEBIAN_FRONTEND=noninteractive apt-get install -y build-essential ca-certificates clang curl pkg-config protobuf-compiler - name: Setup Rust @@ -46,7 +46,7 @@ jobs: - name: Install dependencies run: | - DEBIAN_FRONTEND=noninteractive apt-get update + DEBIAN_FRONTEND=noninteractive apt-get update --fix-missing DEBIAN_FRONTEND=noninteractive apt-get install -y build-essential ca-certificates clang curl pkg-config protobuf-compiler - name: Setup Rust @@ -66,7 +66,7 @@ jobs: - name: Install dependencies run: | - DEBIAN_FRONTEND=noninteractive apt-get update + DEBIAN_FRONTEND=noninteractive apt-get update --fix-missing DEBIAN_FRONTEND=noninteractive apt-get install -y build-essential ca-certificates clang curl pkg-config protobuf-compiler - name: Setup Rust @@ -92,11 +92,11 @@ jobs: - name: Install dependencies run: | - DEBIAN_FRONTEND=noninteractive apt-get update + DEBIAN_FRONTEND=noninteractive apt-get update --fix-missing DEBIAN_FRONTEND=noninteractive apt-get install -y build-essential ca-certificates clang curl pkg-config protobuf-compiler - name: Setup Rust uses: dtolnay/rust-toolchain@stable - name: Build - run: cargo build --all-features \ No newline at end of file + run: cargo build --all-features diff --git a/README.md b/README.md index 5f7f086..90813d7 100644 --- a/README.md +++ b/README.md @@ -62,7 +62,7 @@ - **YOLO Models**: [YOLOv5](https://github.com/ultralytics/yolov5), [YOLOv6](https://github.com/meituan/YOLOv6), [YOLOv7](https://github.com/WongKinYiu/yolov7), [YOLOv8](https://github.com/ultralytics/ultralytics), [YOLOv9](https://github.com/WongKinYiu/yolov9), [YOLOv10](https://github.com/THU-MIG/yolov10), [YOLO11](https://github.com/ultralytics/ultralytics), [YOLOv12](https://github.com/sunsmarterjie/yolov12) - **SAM Models**: [SAM](https://github.com/facebookresearch/segment-anything), [SAM2](https://github.com/facebookresearch/segment-anything-2), [MobileSAM](https://github.com/ChaoningZhang/MobileSAM), [EdgeSAM](https://github.com/chongzhou96/EdgeSAM), [SAM-HQ](https://github.com/SysCV/sam-hq), [FastSAM](https://github.com/CASIA-IVA-Lab/FastSAM) - **Vision Models**: [RT-DETR](https://arxiv.org/abs/2304.08069), [RTMO](https://github.com/open-mmlab/mmpose/tree/main/projects/rtmo), [Depth-Anything](https://github.com/LiheYoung/Depth-Anything), [DINOv2](https://github.com/facebookresearch/dinov2), [MODNet](https://github.com/ZHKKKe/MODNet), [Sapiens](https://arxiv.org/abs/2408.12569), [DepthPro](https://github.com/apple/ml-depth-pro), [FastViT](https://github.com/apple/ml-fastvit), 
[BEiT](https://github.com/microsoft/unilm/tree/master/beit), [MobileOne](https://github.com/apple/ml-mobileone) -- **Vision-Language Models**: [CLIP](https://github.com/openai/CLIP), [jina-clip-v1](https://huggingface.co/jinaai/jina-clip-v1), [BLIP](https://arxiv.org/abs/2201.12086), [GroundingDINO](https://github.com/IDEA-Research/GroundingDINO), [YOLO-World](https://github.com/AILab-CVC/YOLO-World), [Florence2](https://arxiv.org/abs/2311.06242), [Moondream2](https://github.com/vikhyat/moondream/tree/main) +- **Vision-Language Models**: [CLIP](https://github.com/openai/CLIP), [jina-clip-v1-v2](https://huggingface.co/jinaai/jina-clip-v1), [BLIP](https://arxiv.org/abs/2201.12086), [GroundingDINO](https://github.com/IDEA-Research/GroundingDINO), [YOLO-World](https://github.com/AILab-CVC/YOLO-World), [Florence2](https://arxiv.org/abs/2311.06242), [Moondream2](https://github.com/vikhyat/moondream/tree/main) - **OCR-Related Models**: [FAST](https://github.com/czczup/FAST), [DB(PaddleOCR-Det)](https://arxiv.org/abs/1911.08947), [SVTR(PaddleOCR-Rec)](https://arxiv.org/abs/2205.00159), [SLANet](https://paddlepaddle.github.io/PaddleOCR/latest/algorithm/table_recognition/algorithm_table_slanet.html), [TrOCR](https://huggingface.co/microsoft/trocr-base-printed), [DocLayout-YOLO](https://github.com/opendatalab/DocLayout-YOLO)
@@ -100,6 +100,8 @@ | [GroundingDINO](https://github.com/IDEA-Research/GroundingDINO) | Open-Set Detection With Language | [demo](examples/grounding-dino) | ✅ | ✅ | ✅ | | | | [CLIP](https://github.com/openai/CLIP) | Vision-Language Embedding | [demo](examples/clip) | ✅ | ✅ | ✅ | ❌ | ❌ | | [jina-clip-v1](https://huggingface.co/jinaai/jina-clip-v1) | Vision-Language Embedding | [demo](examples/clip) | ✅ | ✅ | ✅ | ❌ | ❌ | +| [jina-clip-v2](https://huggingface.co/jinaai/jina-clip-v2) | Vision-Language Embedding | [demo](examples/clip) | ✅ | ✅ | ✅ | ❌ | ❌ | +| [mobileclip](https://github.com/apple/ml-mobileclip) | Vision-Language Embedding | [demo](examples/clip) | ✅ | ✅ | ✅ | ❌ | ❌ | | [BLIP](https://github.com/salesforce/BLIP) | Image Captioning | [demo](examples/blip) | ✅ | ✅ | ✅ | ❌ | ❌ | | [DB(PaddleOCR-Det)](https://arxiv.org/abs/1911.08947) | Text Detection | [demo](examples/db) | ✅ | ✅ | ✅ | ✅ | ✅ | | [FAST](https://github.com/czczup/FAST) | Text Detection | [demo](examples/fast) | ✅ | ✅ | ✅ | ✅ | ✅ | diff --git a/examples/clip/README.md b/examples/clip/README.md index 71fe94e..fc9c1cf 100644 --- a/examples/clip/README.md +++ b/examples/clip/README.md @@ -9,7 +9,7 @@ cargo run -r -F cuda --example clip -- --device cuda:0 ## Results ```shell -(99.9675%) ./examples/clip/images/carrot.jpg => Some carrots -(99.93718%) ./examples/clip/images/doll.jpg => There is a doll with red hair and a clock on a table -(100.0%) ./examples/clip/images/drink.jpg => Some people holding wine glasses in a restaurant +[99.999428%] (examples/clip/images/carrot.jpg) <=> (A picture of some carrots.) +[100.000000%] (examples/clip/images/doll.jpg) <=> (There is a doll with red hair and a clock on a table.) +[99.990738%] (examples/clip/images/drink.jpg) <=> (Some people holding wine glasses in a restaurant.) ``` diff --git a/examples/clip/main.rs b/examples/clip/main.rs index 90d1055..a4ceb98 100644 --- a/examples/clip/main.rs +++ b/examples/clip/main.rs @@ -1,5 +1,6 @@ use anyhow::Result; -use usls::{models::Clip, Config, DataLoader, Ops}; +use ndarray::Axis; +use usls::{models::Clip, Config, DataLoader}; #[derive(argh::FromArgs)] /// CLIP Example @@ -7,6 +8,10 @@ struct Args { /// device #[argh(option, default = "String::from(\"cpu:0\")")] device: String, + + /// dtype + #[argh(option, default = "String::from(\"fp16\")")] + dtype: String, } fn main() -> Result<()> { @@ -17,45 +22,52 @@ fn main() -> Result<()> { let args: Args = argh::from_env(); // build model - let config = Config::jina_clip_v1() + let config = Config::mobileclip_s0() + // mobileclip_blt() + // clip_vit_b16() + // clip_vit_l14() + // clip_vit_b32() + // jina_clip_v1() + // jina_clip_v2() + .with_dtype_all(args.dtype.as_str().try_into()?) .with_device_all(args.device.as_str().try_into()?) 
.commit()?; let mut model = Clip::new(config)?; // texts let texts = vec![ - "A photo of a dinosaur", - "A photo of a cat", - "A photo of a dog", - "Some carrots", - "There are some playing cards on a striped table cloth", - "There is a doll with red hair and a clock on a table", - "Some people holding wine glasses in a restaurant", + "A photo of a dinosaur.", + "A photo of a cat.", + "A photo of a dog.", + "A picture of some carrots.", + "There are some playing cards on a striped table cloth.", + "There is a doll with red hair and a clock on a table.", + "Some people holding wine glasses in a restaurant.", ]; - let feats_text = model.encode_texts(&texts)?; // [n, ndim] + let feats_text = model.encode_texts(&texts)?.norm(1)?; // load images let dl = DataLoader::new("./examples/clip/images")?.build()?; // run - for images in dl { - let feats_image = model.encode_images(&images)?; + for images in &dl { + let feats_image = model.encode_images(&images)?.norm(1)?; // use image to query texts - let matrix = Ops::dot2(&feats_image, &feats_text)?; + let matrix = (feats_image * 100.).dot2(&feats_text)?.softmax(1)?; - for i in 0..images.len() { - let probs = &matrix[i]; - let (id, &score) = probs + // Process each image's matching scores + for (i, row) in matrix.axis_iter(Axis(0)).enumerate() { + let (id, &score) = row .iter() .enumerate() - .reduce(|max, x| if x.1 > max.1 { x } else { max }) + .max_by(|a, b| a.1.partial_cmp(b.1).unwrap()) .unwrap(); println!( - "({:?}%) {:?} => {} ", + "[{:.6}%] ({}) <=> ({})", score * 100.0, - images[i].source(), + images[i].source().unwrap().display(), &texts[id] ); } diff --git a/src/inference/x.rs b/src/inference/x.rs index a176585..307cec8 100644 --- a/src/inference/x.rs +++ b/src/inference/x.rs @@ -1,6 +1,7 @@ use anyhow::Result; use image::DynamicImage; -use ndarray::{Array, Dim, IntoDimension, IxDyn, IxDynImpl}; +use ndarray::{Array, Dim, IntoDimension, Ix2, IxDyn, IxDynImpl}; +// use std::ops::Mul; use crate::{Ops, ResizeMode}; @@ -64,9 +65,39 @@ impl std::ops::Deref for X { } } -impl X { - // TODO: Add some slice and index method +impl std::ops::Mul<f32> for X { + type Output = Self; + fn mul(self, other: f32) -> Self::Output { + Self(self.0 * other) + } +} + +impl std::ops::Div<f32> for X { + type Output = Self; + + fn div(self, other: f32) -> Self::Output { + Self(self.0 / other) + } +} + +impl std::ops::Add<f32> for X { + type Output = Self; + + fn add(self, other: f32) -> Self::Output { + Self(self.0 + other) + } +} + +impl std::ops::Sub<f32> for X { + type Output = Self; + + fn sub(self, other: f32) -> Self::Output { + Self(self.0 - other) + } +} + +impl X { pub fn zeros(shape: &[usize]) -> Self { Self::from(Array::zeros(Dim(IxDynImpl::from(shape.to_vec())))) } @@ -187,6 +218,36 @@ impl X { Ok(self) } + pub fn dot2(&self, other: &Self) -> Result<Self> { + // Check dimensions + if self.ndim() != 2 || other.ndim() != 2 { + anyhow::bail!( + "dot2 requires 2D matrices, got {}D and {}D", + self.ndim(), + other.ndim() + ); + } + + let a = self.0.as_standard_layout().into_dimensionality::<Ix2>()?; + let b = other.0.as_standard_layout().into_dimensionality::<Ix2>()?; + + // Check compatibility + if a.shape()[1] != b.shape()[1] { + anyhow::bail!( + "Incompatible dimensions for dot2: {:?} and {:?}", + a.shape(), + b.shape() + ); + } + + Ok(a.dot(&b.t()).into_dyn().into()) + } + + pub fn softmax(mut self, d: usize) -> Result<Self> { + self.0 = Ops::softmax(self.0, d)?; + Ok(self) + } + pub fn unsigned(mut self) -> Self { self.0.par_mapv_inplace(|x| if x < 0.0 { 0.0 } else { x }); self diff --git 
a/src/models/clip/README.md b/src/models/clip/README.md index 8bc962e..e5a7845 100644 --- a/src/models/clip/README.md +++ b/src/models/clip/README.md @@ -4,7 +4,11 @@ CLIP (Contrastive Language-Image Pre-Training) is a neural network trained on a ## Official Repository -The official repository can be found on: [GitHub](https://github.com/openai/CLIP) +The official repositories can be found at: +- [CLIP](https://github.com/openai/CLIP) +- [jina-clip-v1](https://huggingface.co/jinaai/jina-clip-v1) +- [jina-clip-v2](https://huggingface.co/jinaai/jina-clip-v2) +- [mobileclip](https://github.com/apple/ml-mobileclip) ## Example diff --git a/src/models/clip/config.rs b/src/models/clip/config.rs index 8b1d69f..0d59776 100644 --- a/src/models/clip/config.rs +++ b/src/models/clip/config.rs @@ -36,22 +36,76 @@ impl crate::Config { pub fn jina_clip() -> Self { Self::default() - .with_name("jina-clip-v1") .with_batch_size_all(1) .with_visual_ixx(0, 1, 3.into()) .with_visual_ixx(0, 2, 224.into()) .with_visual_ixx(0, 3, 224.into()) .with_image_mean(&[0.48145466, 0.4578275, 0.40821073]) .with_image_std(&[0.26862954, 0.2613026, 0.2757771]) + .with_visual_file("visual.onnx") + .with_textual_file("textual.onnx") + } + + pub fn jina_clip_v1() -> Self { + Self::jina_clip() + .with_name("jina-clip-v1") .with_tokenizer_file("jina-clip-v1/tokenizer.json") .with_tokenizer_config_file("jina-clip-v1/tokenizer_config.json") .with_special_tokens_map_file("jina-clip-v1/special_tokens_map.json") .with_config_file("jina-clip-v1/config.json") } - pub fn jina_clip_v1() -> Self { + pub fn jina_clip_v2() -> Self { Self::jina_clip() - .with_visual_file("visual.onnx") - .with_textual_file("textual.onnx") + .with_name("jina-clip-v2") + .with_visual_ixx(0, 2, 512.into()) + .with_visual_ixx(0, 3, 512.into()) + .with_tokenizer_file("jina-clip-v2/tokenizer.json") + .with_tokenizer_config_file("jina-clip-v2/tokenizer_config.json") + .with_special_tokens_map_file("jina-clip-v2/special_tokens_map.json") + .with_config_file("jina-clip-v2/config.json") + } + + pub fn mobileclip() -> Self { + Self::default() + .with_name("mobileclip") + .with_batch_size_all(1) + .with_visual_ixx(0, 1, 3.into()) + .with_visual_ixx(0, 2, 224.into()) + .with_visual_ixx(0, 3, 224.into()) + .with_model_max_length(77) + .with_tokenizer_file("clip/tokenizer.json") + .with_tokenizer_config_file("clip/tokenizer_config.json") + .with_special_tokens_map_file("clip/special_tokens_map.json") + } + + pub fn mobileclip_s0() -> Self { + Self::mobileclip() + .with_textual_file("s0-textual.onnx") + .with_visual_file("s0-visual.onnx") + } + + pub fn mobileclip_s1() -> Self { + Self::mobileclip() + .with_textual_file("s1-textual.onnx") + .with_visual_file("s1-visual.onnx") + } + + pub fn mobileclip_s2() -> Self { + Self::mobileclip() + .with_textual_file("s2-textual.onnx") + .with_visual_file("s2-visual.onnx") + } + + pub fn mobileclip_b() -> Self { + Self::mobileclip() + .with_textual_file("b-textual.onnx") + .with_visual_file("b-visual.onnx") + } + + pub fn mobileclip_blt() -> Self { + Self::mobileclip() + .with_textual_file("blt-textual.onnx") + .with_visual_file("blt-visual.onnx") } } diff --git a/src/models/clip/impl.rs b/src/models/clip/impl.rs index 7d464c0..521426f 100644 --- a/src/models/clip/impl.rs +++ b/src/models/clip/impl.rs @@ -54,7 +54,7 @@ impl Clip { let xs = elapsed!("textual-preprocess", self.ts, { let encodings: Vec<f32> = self .processor - .encode_texts_ids(xs, false)? + .encode_texts_ids(xs, true)? 
.into_iter() .flatten() .collect(); @@ -62,6 +62,7 @@ impl Clip { let x: X = Array2::from_shape_vec((xs.len(), encodings.len() / xs.len()), encodings)? .into_dyn() .into(); + x }); let xs = elapsed!("textual-inference", self.ts, { diff --git a/src/utils/ops.rs b/src/utils/ops.rs index b29e35f..af0492c 100644 --- a/src/utils/ops.rs +++ b/src/utils/ops.rs @@ -189,6 +189,25 @@ impl Ops<'_> { Ok(xs / std_) } + pub fn softmax(xs: Array<f32, IxDyn>, d: usize) -> Result<Array<f32, IxDyn>> { + if xs.shape().len() <= d { + anyhow::bail!( + "`softmax`: Specified axis {} exceeds the maximum dimension length {}.", + d, + xs.shape().len() + ); + } + let max_vals = xs + .map_axis(Axis(d), |view| { + view.fold(f32::NEG_INFINITY, |a, &b| a.max(b)) + }) + .insert_axis(Axis(d)); + let exps = (&xs - &max_vals).mapv(f32::exp); + let sums = exps.sum_axis(Axis(d)).insert_axis(Axis(d)); + + Ok(exps / sums) + } + pub fn scale_wh(w0: f32, h0: f32, w1: f32, h1: f32) -> (f32, f32, f32) { let r = (w1 / w0).min(h1 / h0); (r, (w0 * r).round(), (h0 * r).round())
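
Note on the new similarity pipeline: the updated example L2-normalizes both embedding matrices (`.norm(1)?`), scales the image features by 100 (the conventional CLIP logit scale), takes the matrix product via `dot2`, and applies the new row-wise `Ops::softmax` to turn the scores into per-image probabilities. The sketch below is a standalone illustration of that softmax step using plain `ndarray`; the `softmax_rows` helper and the sample logits are hypothetical and not part of this patch.

```rust
// Standalone sketch of a numerically stable row-wise softmax, mirroring
// what `Ops::softmax` does along a chosen axis (here fixed to rows).
// `softmax_rows` and the sample logits below are illustrative only.
use ndarray::{Array2, Axis};

fn softmax_rows(xs: &Array2<f32>) -> Array2<f32> {
    // Per-row maximum, kept as an (n, 1) column so it broadcasts over columns.
    let max_vals = xs
        .map_axis(Axis(1), |row| row.fold(f32::NEG_INFINITY, |a, &b| a.max(b)))
        .insert_axis(Axis(1));
    // Subtract the row max before exponentiating to avoid overflow.
    let exps = (xs - &max_vals).mapv(f32::exp);
    let sums = exps.sum_axis(Axis(1)).insert_axis(Axis(1));
    exps / sums
}

fn main() {
    // Pretend these are 100 * cosine similarities for one image against three captions.
    let logits = Array2::from_shape_vec((1, 3), vec![21.3, 27.9, 14.2]).unwrap();
    let probs = softmax_rows(&logits);
    println!("{}", probs); // the second caption gets nearly all of the probability mass
}
```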