Add mobileclip and jina-clip-v2 (#106)

Author: Jamjamjon
Date: 2025-05-29 23:33:16 +08:00
Committed by: GitHub
Parent: 027c628b04
Commit: a3a4bf47ed
9 changed files with 190 additions and 37 deletions

View File

@@ -21,7 +21,7 @@ jobs:
       - name: Install dependencies
         run: |
-          DEBIAN_FRONTEND=noninteractive apt-get update
+          DEBIAN_FRONTEND=noninteractive apt-get update --fix-missing
           DEBIAN_FRONTEND=noninteractive apt-get install -y build-essential ca-certificates clang curl pkg-config protobuf-compiler
       - name: Setup Rust
@@ -46,7 +46,7 @@ jobs:
       - name: Install dependencies
         run: |
-          DEBIAN_FRONTEND=noninteractive apt-get update
+          DEBIAN_FRONTEND=noninteractive apt-get update --fix-missing
           DEBIAN_FRONTEND=noninteractive apt-get install -y build-essential ca-certificates clang curl pkg-config protobuf-compiler
       - name: Setup Rust
@@ -66,7 +66,7 @@ jobs:
       - name: Install dependencies
         run: |
-          DEBIAN_FRONTEND=noninteractive apt-get update
+          DEBIAN_FRONTEND=noninteractive apt-get update --fix-missing
           DEBIAN_FRONTEND=noninteractive apt-get install -y build-essential ca-certificates clang curl pkg-config protobuf-compiler
       - name: Setup Rust
@@ -92,11 +92,11 @@ jobs:
       - name: Install dependencies
         run: |
-          DEBIAN_FRONTEND=noninteractive apt-get update
+          DEBIAN_FRONTEND=noninteractive apt-get update --fix-missing
           DEBIAN_FRONTEND=noninteractive apt-get install -y build-essential ca-certificates clang curl pkg-config protobuf-compiler
       - name: Setup Rust
         uses: dtolnay/rust-toolchain@stable
       - name: Build
         run: cargo build --all-features

View File

@@ -62,7 +62,7 @@
 - **YOLO Models**: [YOLOv5](https://github.com/ultralytics/yolov5), [YOLOv6](https://github.com/meituan/YOLOv6), [YOLOv7](https://github.com/WongKinYiu/yolov7), [YOLOv8](https://github.com/ultralytics/ultralytics), [YOLOv9](https://github.com/WongKinYiu/yolov9), [YOLOv10](https://github.com/THU-MIG/yolov10), [YOLO11](https://github.com/ultralytics/ultralytics), [YOLOv12](https://github.com/sunsmarterjie/yolov12)
 - **SAM Models**: [SAM](https://github.com/facebookresearch/segment-anything), [SAM2](https://github.com/facebookresearch/segment-anything-2), [MobileSAM](https://github.com/ChaoningZhang/MobileSAM), [EdgeSAM](https://github.com/chongzhou96/EdgeSAM), [SAM-HQ](https://github.com/SysCV/sam-hq), [FastSAM](https://github.com/CASIA-IVA-Lab/FastSAM)
 - **Vision Models**: [RT-DETR](https://arxiv.org/abs/2304.08069), [RTMO](https://github.com/open-mmlab/mmpose/tree/main/projects/rtmo), [Depth-Anything](https://github.com/LiheYoung/Depth-Anything), [DINOv2](https://github.com/facebookresearch/dinov2), [MODNet](https://github.com/ZHKKKe/MODNet), [Sapiens](https://arxiv.org/abs/2408.12569), [DepthPro](https://github.com/apple/ml-depth-pro), [FastViT](https://github.com/apple/ml-fastvit), [BEiT](https://github.com/microsoft/unilm/tree/master/beit), [MobileOne](https://github.com/apple/ml-mobileone)
-- **Vision-Language Models**: [CLIP](https://github.com/openai/CLIP), [jina-clip-v1](https://huggingface.co/jinaai/jina-clip-v1), [BLIP](https://arxiv.org/abs/2201.12086), [GroundingDINO](https://github.com/IDEA-Research/GroundingDINO), [YOLO-World](https://github.com/AILab-CVC/YOLO-World), [Florence2](https://arxiv.org/abs/2311.06242), [Moondream2](https://github.com/vikhyat/moondream/tree/main)
+- **Vision-Language Models**: [CLIP](https://github.com/openai/CLIP), [jina-clip-v1-v2](https://huggingface.co/jinaai/jina-clip-v1), [BLIP](https://arxiv.org/abs/2201.12086), [GroundingDINO](https://github.com/IDEA-Research/GroundingDINO), [YOLO-World](https://github.com/AILab-CVC/YOLO-World), [Florence2](https://arxiv.org/abs/2311.06242), [Moondream2](https://github.com/vikhyat/moondream/tree/main)
 - **OCR-Related Models**: [FAST](https://github.com/czczup/FAST), [DB(PaddleOCR-Det)](https://arxiv.org/abs/1911.08947), [SVTR(PaddleOCR-Rec)](https://arxiv.org/abs/2205.00159), [SLANet](https://paddlepaddle.github.io/PaddleOCR/latest/algorithm/table_recognition/algorithm_table_slanet.html), [TrOCR](https://huggingface.co/microsoft/trocr-base-printed), [DocLayout-YOLO](https://github.com/opendatalab/DocLayout-YOLO)
 <details>
@@ -100,6 +100,8 @@
 | [GroundingDINO](https://github.com/IDEA-Research/GroundingDINO) | Open-Set Detection With Language | [demo](examples/grounding-dino) | ✅ | ✅ | ✅ | | |
 | [CLIP](https://github.com/openai/CLIP) | Vision-Language Embedding | [demo](examples/clip) | ✅ | ✅ | ✅ | ❌ | ❌ |
 | [jina-clip-v1](https://huggingface.co/jinaai/jina-clip-v1) | Vision-Language Embedding | [demo](examples/clip) | ✅ | ✅ | ✅ | ❌ | ❌ |
+| [jina-clip-v2](https://huggingface.co/jinaai/jina-clip-v2) | Vision-Language Embedding | [demo](examples/clip) | ✅ | ✅ | ✅ | ❌ | ❌ |
+| [mobileclip](https://github.com/apple/ml-mobileclip) | Vision-Language Embedding | [demo](examples/clip) | ✅ | ✅ | ✅ | ❌ | ❌ |
 | [BLIP](https://github.com/salesforce/BLIP) | Image Captioning | [demo](examples/blip) | ✅ | ✅ | ✅ | ❌ | ❌ |
 | [DB(PaddleOCR-Det)](https://arxiv.org/abs/1911.08947) | Text Detection | [demo](examples/db) | ✅ | ✅ | ✅ | ✅ | ✅ |
 | [FAST](https://github.com/czczup/FAST) | Text Detection | [demo](examples/fast) | ✅ | ✅ | ✅ | ✅ | ✅ |

View File

@@ -9,7 +9,7 @@ cargo run -r -F cuda --example clip -- --device cuda:0
 ## Results
 
 ```shell
-(99.9675%) ./examples/clip/images/carrot.jpg => Some carrots
-(99.93718%) ./examples/clip/images/doll.jpg => There is a doll with red hair and a clock on a table
-(100.0%) ./examples/clip/images/drink.jpg => Some people holding wine glasses in a restaurant
+[99.999428%] (examples/clip/images/carrot.jpg) <=> (A picture of some carrots.)
+[100.000000%] (examples/clip/images/doll.jpg) <=> (There is a doll with red hair and a clock on a table.)
+[99.990738%] (examples/clip/images/drink.jpg) <=> (Some people holding wine glasses in a restaurant.)
 ```

View File

@@ -1,5 +1,6 @@
 use anyhow::Result;
-use usls::{models::Clip, Config, DataLoader, Ops};
+use ndarray::Axis;
+use usls::{models::Clip, Config, DataLoader};
 
 #[derive(argh::FromArgs)]
 /// CLIP Example
@@ -7,6 +8,10 @@ struct Args {
     /// device
     #[argh(option, default = "String::from(\"cpu:0\")")]
     device: String,
+
+    /// dtype
+    #[argh(option, default = "String::from(\"fp16\")")]
+    dtype: String,
 }
 
 fn main() -> Result<()> {
@@ -17,45 +22,52 @@ fn main() -> Result<()> {
     let args: Args = argh::from_env();
 
     // build model
-    let config = Config::jina_clip_v1()
+    let config = Config::mobileclip_s0()
+        // mobileclip_blt()
+        // clip_vit_b16()
+        // clip_vit_l14()
+        // clip_vit_b32()
+        // jina_clip_v1()
+        // jina_clip_v2()
+        .with_dtype_all(args.dtype.as_str().try_into()?)
         .with_device_all(args.device.as_str().try_into()?)
         .commit()?;
     let mut model = Clip::new(config)?;
 
     // texts
     let texts = vec![
-        "A photo of a dinosaur",
-        "A photo of a cat",
-        "A photo of a dog",
-        "Some carrots",
-        "There are some playing cards on a striped table cloth",
-        "There is a doll with red hair and a clock on a table",
-        "Some people holding wine glasses in a restaurant",
+        "A photo of a dinosaur.",
+        "A photo of a cat.",
+        "A photo of a dog.",
+        "A picture of some carrots.",
+        "There are some playing cards on a striped table cloth.",
+        "There is a doll with red hair and a clock on a table.",
+        "Some people holding wine glasses in a restaurant.",
     ];
-    let feats_text = model.encode_texts(&texts)?; // [n, ndim]
+    let feats_text = model.encode_texts(&texts)?.norm(1)?;
 
     // load images
     let dl = DataLoader::new("./examples/clip/images")?.build()?;
 
     // run
-    for images in dl {
-        let feats_image = model.encode_images(&images)?;
+    for images in &dl {
+        let feats_image = model.encode_images(&images)?.norm(1)?;
 
         // use image to query texts
-        let matrix = Ops::dot2(&feats_image, &feats_text)?;
-        for i in 0..images.len() {
-            let probs = &matrix[i];
-            let (id, &score) = probs
+        let matrix = (feats_image * 100.).dot2(&feats_text)?.softmax(1)?;
+        // Process each image's matching scores
+        for (i, row) in matrix.axis_iter(Axis(0)).enumerate() {
+            let (id, &score) = row
                 .iter()
                 .enumerate()
-                .reduce(|max, x| if x.1 > max.1 { x } else { max })
+                .max_by(|a, b| a.1.partial_cmp(b.1).unwrap())
                 .unwrap();
             println!(
-                "({:?}%) {:?} => {} ",
+                "[{:.6}%] ({}) <=> ({})",
                 score * 100.0,
-                images[i].source(),
+                images[i].source().unwrap().display(),
                 &texts[id]
            );
        }
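For reference, the retrieval math behind the rewritten loop is normalized cosine similarity with CLIP's usual scale factor of 100; the row-wise softmax only turns those logits into the printed percentages and never changes which caption wins. Below is a minimal standalone sketch in plain ndarray, assuming `.norm(1)?` L2-normalizes each row of the feature matrices (the usls `X` wrapper is not used here):

```rust
use ndarray::{Array2, Axis};

/// Return (best_text_index, logit) for every image row.
fn best_captions(img: &Array2<f32>, txt: &Array2<f32>) -> Vec<(usize, f32)> {
    // L2-normalize rows so the dot product becomes cosine similarity.
    let norm = |m: &Array2<f32>| {
        let n = m.map_axis(Axis(1), |r| r.dot(&r).sqrt()).insert_axis(Axis(1));
        m / &n
    };
    let logits = norm(img).dot(&norm(txt).t()) * 100.0; // [n_images, n_texts]
    logits
        .axis_iter(Axis(0))
        .map(|row| {
            // argmax over the texts for this image
            row.iter().cloned().enumerate().fold(
                (0, f32::NEG_INFINITY),
                |best, (i, v)| if v > best.1 { (i, v) } else { best },
            )
        })
        .collect()
}
```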

View File

@@ -1,6 +1,7 @@
 use anyhow::Result;
 use image::DynamicImage;
-use ndarray::{Array, Dim, IntoDimension, IxDyn, IxDynImpl};
+use ndarray::{Array, Dim, IntoDimension, Ix2, IxDyn, IxDynImpl};
+// use std::ops::Mul;
 
 use crate::{Ops, ResizeMode};
@@ -64,9 +65,39 @@ impl std::ops::Deref for X {
     }
 }
 
-impl X {
-    // TODO: Add some slice and index method
+impl std::ops::Mul<f32> for X {
+    type Output = Self;
+
+    fn mul(self, other: f32) -> Self::Output {
+        Self(self.0 * other)
+    }
+}
+
+impl std::ops::Div<f32> for X {
+    type Output = Self;
+
+    fn div(self, other: f32) -> Self::Output {
+        Self(self.0 / other)
+    }
+}
+
+impl std::ops::Add<f32> for X {
+    type Output = Self;
+
+    fn add(self, other: f32) -> Self::Output {
+        Self(self.0 + other)
+    }
+}
+
+impl std::ops::Sub<f32> for X {
+    type Output = Self;
+
+    fn sub(self, other: f32) -> Self::Output {
+        Self(self.0 - other)
+    }
+}
+
+impl X {
     pub fn zeros(shape: &[usize]) -> Self {
         Self::from(Array::zeros(Dim(IxDynImpl::from(shape.to_vec()))))
     }
@@ -187,6 +218,36 @@ impl X {
         Ok(self)
     }
 
+    pub fn dot2(&self, other: &Self) -> Result<Self> {
+        // Check dimensions
+        if self.ndim() != 2 || other.ndim() != 2 {
+            anyhow::bail!(
+                "dot2 requires 2D matrices, got {}D and {}D",
+                self.ndim(),
+                other.ndim()
+            );
+        }
+
+        let a = self.0.as_standard_layout().into_dimensionality::<Ix2>()?;
+        let b = other.0.as_standard_layout().into_dimensionality::<Ix2>()?;
+
+        // Check compatibility
+        if a.shape()[1] != b.shape()[1] {
+            anyhow::bail!(
+                "Incompatible dimensions for dot2: {:?} and {:?}",
+                a.shape(),
+                b.shape()
+            );
+        }
+
+        Ok(a.dot(&b.t()).into_dyn().into())
+    }
+
+    pub fn softmax(mut self, d: usize) -> Result<Self> {
+        self.0 = Ops::softmax(self.0, d)?;
+        Ok(self)
+    }
+
     pub fn unsigned(mut self) -> Self {
         self.0.par_mapv_inplace(|x| if x < 0.0 { 0.0 } else { x });
         self
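The scalar-operator blocks above follow the usual newtype pattern: wrap the inner ndarray and forward `Mul`/`Div`/`Add`/`Sub` to it. A self-contained illustration of that pattern, with `Tensor` as a purely illustrative stand-in rather than anything from usls:

```rust
use ndarray::{ArrayD, IxDyn};

#[derive(Debug, Clone)]
struct Tensor(ArrayD<f32>);

impl std::ops::Mul<f32> for Tensor {
    type Output = Self;

    fn mul(self, rhs: f32) -> Self::Output {
        // Forward to ndarray's element-wise scalar multiplication.
        Tensor(self.0 * rhs)
    }
}

fn main() {
    let t = Tensor(ArrayD::<f32>::ones(IxDyn(&[2, 3])));
    let scaled = t * 100.0; // enables expressions like `(feats_image * 100.)` above
    assert_eq!(scaled.0[[0, 0]], 100.0);
}
```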

View File

@@ -4,7 +4,11 @@ CLIP (Contrastive Language-Image Pre-Training) is a neural network trained on a
 ## Official Repository
 
-The official repository can be found on: [GitHub](https://github.com/openai/CLIP)
+The official repository can be found on:
+- [CLIP](https://github.com/openai/CLIP)
+- [jina-clip-v1](https://huggingface.co/jinaai/jina-clip-v1)
+- [jina-clip-v2](https://huggingface.co/jinaai/jina-clip-v2)
+- [mobileclip](https://github.com/apple/ml-mobileclip)
 
 ## Example

View File

@@ -36,22 +36,76 @@ impl crate::Config {
     pub fn jina_clip() -> Self {
         Self::default()
-            .with_name("jina-clip-v1")
             .with_batch_size_all(1)
             .with_visual_ixx(0, 1, 3.into())
             .with_visual_ixx(0, 2, 224.into())
             .with_visual_ixx(0, 3, 224.into())
             .with_image_mean(&[0.48145466, 0.4578275, 0.40821073])
             .with_image_std(&[0.26862954, 0.2613026, 0.2757771])
+            .with_visual_file("visual.onnx")
+            .with_textual_file("textual.onnx")
+    }
+
+    pub fn jina_clip_v1() -> Self {
+        Self::jina_clip()
+            .with_name("jina-clip-v1")
             .with_tokenizer_file("jina-clip-v1/tokenizer.json")
             .with_tokenizer_config_file("jina-clip-v1/tokenizer_config.json")
             .with_special_tokens_map_file("jina-clip-v1/special_tokens_map.json")
             .with_config_file("jina-clip-v1/config.json")
     }
 
-    pub fn jina_clip_v1() -> Self {
+    pub fn jina_clip_v2() -> Self {
         Self::jina_clip()
-            .with_visual_file("visual.onnx")
-            .with_textual_file("textual.onnx")
+            .with_name("jina-clip-v2")
+            .with_visual_ixx(0, 2, 512.into())
+            .with_visual_ixx(0, 3, 512.into())
+            .with_tokenizer_file("jina-clip-v2/tokenizer.json")
+            .with_tokenizer_config_file("jina-clip-v2/tokenizer_config.json")
+            .with_special_tokens_map_file("jina-clip-v2/special_tokens_map.json")
+            .with_config_file("jina-clip-v2/config.json")
+    }
+
+    pub fn mobileclip() -> Self {
+        Self::default()
+            .with_name("mobileclip")
+            .with_batch_size_all(1)
+            .with_visual_ixx(0, 1, 3.into())
+            .with_visual_ixx(0, 2, 224.into())
+            .with_visual_ixx(0, 3, 224.into())
+            .with_model_max_length(77)
+            .with_tokenizer_file("clip/tokenizer.json")
+            .with_tokenizer_config_file("clip/tokenizer_config.json")
+            .with_special_tokens_map_file("clip/special_tokens_map.json")
+    }
+
+    pub fn mobileclip_s0() -> Self {
+        Self::mobileclip()
+            .with_textual_file("s0-textual.onnx")
+            .with_visual_file("s0-visual.onnx")
+    }
+
+    pub fn mobileclip_s1() -> Self {
+        Self::mobileclip()
+            .with_textual_file("s1-textual.onnx")
+            .with_visual_file("s1-visual.onnx")
+    }
+
+    pub fn mobileclip_s2() -> Self {
+        Self::mobileclip()
+            .with_textual_file("s2-textual.onnx")
+            .with_visual_file("s2-visual.onnx")
+    }
+
+    pub fn mobileclip_b() -> Self {
+        Self::mobileclip()
+            .with_textual_file("b-textual.onnx")
+            .with_visual_file("b-visual.onnx")
+    }
+
+    pub fn mobileclip_blt() -> Self {
+        Self::mobileclip()
+            .with_textual_file("blt-textual.onnx")
+            .with_visual_file("blt-visual.onnx")
     }
 }
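Using the new presets mirrors the updated example: pick a builder, optionally override dtype and device, then commit. A hedged sketch, assuming the string-to-dtype/device conversions behave as in the example above:

```rust
use anyhow::Result;
use usls::{models::Clip, Config};

fn build_jina_clip_v2() -> Result<Clip> {
    // Swap in any of the presets above, e.g. Config::mobileclip_blt().
    let config = Config::jina_clip_v2()
        .with_dtype_all("fp16".try_into()?)
        .with_device_all("cpu:0".try_into()?)
        .commit()?;
    Ok(Clip::new(config)?)
}
```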

View File

@@ -54,7 +54,7 @@ impl Clip {
         let xs = elapsed!("textual-preprocess", self.ts, {
             let encodings: Vec<f32> = self
                 .processor
-                .encode_texts_ids(xs, false)?
+                .encode_texts_ids(xs, true)?
                 .into_iter()
                 .flatten()
                 .collect();
@@ -62,6 +62,7 @@ impl Clip {
             let x: X = Array2::from_shape_vec((xs.len(), encodings.len() / xs.len()), encodings)?
                 .into_dyn()
                 .into();
+
             x
         });
         let xs = elapsed!("textual-inference", self.ts, {

View File

@@ -189,6 +189,25 @@ impl Ops<'_> {
         Ok(xs / std_)
     }
 
+    pub fn softmax(xs: Array<f32, IxDyn>, d: usize) -> Result<Array<f32, IxDyn>> {
+        if xs.shape().len() <= d {
+            anyhow::bail!(
+                "`softmax`: Specified axis {} exceeds the maximum dimension length {}.",
+                d,
+                xs.shape().len()
+            );
+        }
+
+        let max_vals = xs
+            .map_axis(Axis(d), |view| {
+                view.fold(f32::NEG_INFINITY, |a, &b| a.max(b))
+            })
+            .insert_axis(Axis(d));
+        let exps = (&xs - &max_vals).mapv(f32::exp);
+        let sums = exps.sum_axis(Axis(d)).insert_axis(Axis(d));
+        Ok(exps / sums)
+    }
+
     pub fn scale_wh(w0: f32, h0: f32, w1: f32, h1: f32) -> (f32, f32, f32) {
         let r = (w1 / w0).min(h1 / h0);
         (r, (w0 * r).round(), (h0 * r).round())
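The max-subtraction in the new `Ops::softmax` is the standard numerical-stability trick: softmax is shift-invariant, and shifting each slice by its maximum keeps `exp` from overflowing. A standalone sketch of the same idea on plain ndarray (the function name here is illustrative, not part of usls):

```rust
use ndarray::{array, Array2, Axis};

fn softmax_rows(x: &Array2<f32>) -> Array2<f32> {
    // Shift each row by its max before exponentiating; mathematically the result
    // is unchanged, but exp() no longer sees huge inputs.
    let max = x.map_axis(Axis(1), |r| r.fold(f32::NEG_INFINITY, |a, &b| a.max(b)));
    let e = (x - &max.insert_axis(Axis(1))).mapv(f32::exp);
    let s = e.sum_axis(Axis(1)).insert_axis(Axis(1));
    e / s
}

fn main() {
    // Without the shift, exp(1000.0_f32) overflows to infinity and the row turns into NaNs.
    let p = softmax_rows(&array![[1000.0_f32, 999.0, 998.0]]);
    assert!(p.iter().all(|v| v.is_finite()));
    assert!((p.sum() - 1.0).abs() < 1e-6);
}
```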