Add moondream2

* Add moondream2

* Update README.md
Author: Jamjamjon (committed by GitHub)
Date: 2025-01-20 21:37:54 +08:00
Parent: afd1deb1f4
Commit: 475a680703
18 changed files with 1019 additions and 23 deletions

View File

@ -14,7 +14,7 @@ exclude = ["assets/*", "examples/*", "runs/*", "benches/*"]
aksr = { version = "0.0.2" }
image = { version = "0.25.2" }
imageproc = { version = "0.24" }
ndarray = { version = "0.16.1", features = ["rayon"] }
ndarray = { version = "0.16.1", features = ["rayon", "serde"] }
rayon = { version = "1.10.0" }
anyhow = { version = "1.0.75" }
regex = { version = "1.5.4" }
@ -38,6 +38,7 @@ natord = "1.0.9"
video-rs = { version = "0.10.0", features = ["ndarray"], optional = true }
minifb = { version = "0.27.0", optional = true }
sha2 = "0.10.8"
ndarray-npy = "0.9.1"
[dev-dependencies]
argh = "0.1.13"

View File

@ -37,7 +37,7 @@
- **YOLO Models**: [YOLOv5](https://github.com/ultralytics/yolov5), [YOLOv6](https://github.com/meituan/YOLOv6), [YOLOv7](https://github.com/WongKinYiu/yolov7), [YOLOv8](https://github.com/ultralytics/ultralytics), [YOLOv9](https://github.com/WongKinYiu/yolov9), [YOLOv10](https://github.com/THU-MIG/yolov10), [YOLO11](https://github.com/ultralytics/ultralytics)
- **SAM Models**: [SAM](https://github.com/facebookresearch/segment-anything), [SAM2](https://github.com/facebookresearch/segment-anything-2), [MobileSAM](https://github.com/ChaoningZhang/MobileSAM), [EdgeSAM](https://github.com/chongzhou96/EdgeSAM), [SAM-HQ](https://github.com/SysCV/sam-hq), [FastSAM](https://github.com/CASIA-IVA-Lab/FastSAM)
- **Vision Models**: [RT-DETR](https://arxiv.org/abs/2304.08069), [RTMO](https://github.com/open-mmlab/mmpose/tree/main/projects/rtmo), [Depth-Anything](https://github.com/LiheYoung/Depth-Anything), [DINOv2](https://github.com/facebookresearch/dinov2), [MODNet](https://github.com/ZHKKKe/MODNet), [Sapiens](https://arxiv.org/abs/2408.12569), [DepthPro](https://github.com/apple/ml-depth-pro), [FastViT](https://github.com/apple/ml-fastvit), [BEiT](https://github.com/microsoft/unilm/tree/master/beit), [MobileOne](https://github.com/apple/ml-mobileone)
- **Vision-Language Models**: [CLIP](https://github.com/openai/CLIP), [jina-clip-v1](https://huggingface.co/jinaai/jina-clip-v1), [BLIP](https://arxiv.org/abs/2201.12086), [GroundingDINO](https://github.com/IDEA-Research/GroundingDINO), [YOLO-World](https://github.com/AILab-CVC/YOLO-World), [Florence2](https://arxiv.org/abs/2311.06242)
- **Vision-Language Models**: [CLIP](https://github.com/openai/CLIP), [jina-clip-v1](https://huggingface.co/jinaai/jina-clip-v1), [BLIP](https://arxiv.org/abs/2201.12086), [GroundingDINO](https://github.com/IDEA-Research/GroundingDINO), [YOLO-World](https://github.com/AILab-CVC/YOLO-World), [Florence2](https://arxiv.org/abs/2311.06242), [Moondream2](https://github.com/vikhyat/moondream/tree/main)
- **OCR Models**: [FAST](https://github.com/czczup/FAST), [DB(PaddleOCR-Det)](https://arxiv.org/abs/1911.08947), [SVTR(PaddleOCR-Rec)](https://arxiv.org/abs/2205.00159), [SLANet](https://paddlepaddle.github.io/PaddleOCR/latest/algorithm/table_recognition/algorithm_table_slanet.html), [TrOCR](https://huggingface.co/microsoft/trocr-base-printed), [DocLayout-YOLO](https://github.com/opendatalab/DocLayout-YOLO)
<details>
@ -86,6 +86,7 @@
| [MODNet](https://github.com/ZHKKKe/MODNet) | Image Matting | [demo](examples/modnet) | ✅ | ✅ | ✅ | ✅ | ✅ |
| [Sapiens](https://github.com/facebookresearch/sapiens/tree/main) | Foundation for Human Vision Models | [demo](examples/sapiens) | ✅ | ✅ | ✅ | | |
| [Florence2](https://arxiv.org/abs/2311.06242) | a Variety of Vision Tasks | [demo](examples/florence2) | ✅ | ✅ | ✅ | | |
| [Moondream2](https://github.com/vikhyat/moondream/tree/main) | Open-Set Detection<br />Open-Set Keypoints Detection<br />Image Caption<br />Visual Question Answering | [demo](examples/moondream2) | ✅ | ✅ | ✅ | | |
</details>

View File

@ -90,9 +90,11 @@ fn main() -> Result<()> {
Task::ObjectDetection,
Task::DenseRegionCaption,
// w/o inputs
Task::OpenSetDetection("a vehicle"),
Task::CaptionToPhraseGrounding("A vehicle with two wheels parked in front of a building."),
Task::ReferringExpressionSegmentation("a vehicle"),
Task::OpenSetDetection("a vehicle".into()),
Task::CaptionToPhraseGrounding(
"A vehicle with two wheels parked in front of a building.".into(),
),
Task::ReferringExpressionSegmentation("a vehicle".into()),
Task::RegionToSegmentation(
// 31, 156, 581, 373, // car
449, 270, 556, 372, // wheel

View File

@ -0,0 +1,10 @@
## Quick Start
```shell
cargo run -r -F cuda --example moondream2 -- --device 'cuda:0' --dtype i8 --scale 2b --task vqa:"What's in this image?"
cargo run -r -F cuda --example moondream2 -- --device 'cuda:0' --dtype i8 --scale 2b --task cap:0
cargo run -r -F cuda --example moondream2 -- --device 'cuda:0' --dtype i8 --scale 2b --task cap:1
cargo run -r -F cuda --example moondream2 -- --device 'cuda:0' --dtype i8 --scale 2b --task open-od:person
cargo run -r -F cuda --example moondream2 -- --device 'cuda:0' --dtype i8 --scale 2b --task open-kpt:person
```
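The `--task` values above are plain `task:content` strings. Below is a minimal sketch of how they parse into `Task` variants, assuming `Task` and its `TryFrom<&str>` impl (added in this commit) are exported from `usls`:

```rust
use usls::Task;

fn main() -> anyhow::Result<()> {
    // Strings of the form `task:content` are split on `:` into a task name and its payload.
    let vqa: Task = "vqa:What's in this image?".try_into()?;
    let cap: Task = "cap:0".try_into()?; // cap:0 / cap:1 select the caption variant
    let od: Task = "open-od:person".try_into()?; // open-set detection for "person"
    println!("{:?} | {:?} | {:?}", vqa, cap, od);
    Ok(())
}
```

Note that the parser lowercases the whole string before matching, so the free-text payload of a `vqa:` task is lowercased as well.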

examples/moondream2/main.rs (new file, 157 lines)
View File

@ -0,0 +1,157 @@
use anyhow::Result;
use usls::{models::Moondream2, Annotator, DataLoader, Options, Scale, Task};
#[derive(argh::FromArgs)]
/// Example
struct Args {
/// device
#[argh(option, default = "String::from(\"cpu:0\")")]
device: String,
/// source image
#[argh(
option,
default = "vec![
String::from(\"./assets/bus.jpg\"),
String::from(\"images/green-car.jpg\"),
]"
)]
source: Vec<String>,
/// dtype
#[argh(option, default = "String::from(\"int4\")")]
dtype: String,
/// scale
#[argh(option, default = "String::from(\"0.5b\")")]
scale: String,
/// task
#[argh(option, default = "String::from(\"Caption: 0\")")]
task: String,
}
fn main() -> Result<()> {
tracing_subscriber::fmt()
.with_env_filter(tracing_subscriber::EnvFilter::from_default_env())
.with_timer(tracing_subscriber::fmt::time::ChronoLocal::rfc_3339())
.init();
let args: Args = argh::from_env();
// build model
let (
options_vision_encoder,
options_vision_projection,
options_text_decoder,
options_text_encoder,
options_coord_decoder,
options_coord_encoder,
options_size_decoder,
options_size_encoder,
) = match args.scale.as_str().try_into()? {
Scale::Billion(2.) => (
Options::moondream2_2b_vision_encoder(),
Options::moondream2_2b_vision_projection(),
Options::moondream2_2b_text_decoder(),
Options::moondream2_2b_text_encoder(),
Options::moondream2_2b_coord_decoder(),
Options::moondream2_2b_coord_encoder(),
Options::moondream2_2b_size_decoder(),
Options::moondream2_2b_size_encoder(),
),
Scale::Billion(0.5) => (
Options::moondream2_0_5b_vision_encoder(),
Options::moondream2_0_5b_vision_projection(),
Options::moondream2_0_5b_text_decoder(),
Options::moondream2_0_5b_text_encoder(),
Options::moondream2_0_5b_coord_decoder(),
Options::moondream2_0_5b_coord_encoder(),
Options::moondream2_0_5b_size_decoder(),
Options::moondream2_0_5b_size_encoder(),
),
_ => unimplemented!(),
};
let mut model = Moondream2::new(
options_vision_encoder
.with_model_dtype(args.dtype.as_str().try_into()?)
.with_model_device(args.device.as_str().try_into()?)
.commit()?,
options_vision_projection
.with_model_dtype(args.dtype.as_str().try_into()?)
.with_model_device(args.device.as_str().try_into()?)
.commit()?,
options_text_encoder
.with_model_dtype(args.dtype.as_str().try_into()?)
.with_model_device(args.device.as_str().try_into()?)
.commit()?,
options_text_decoder
.with_model_dtype(args.dtype.as_str().try_into()?)
.with_model_device(args.device.as_str().try_into()?)
.commit()?,
Some(
options_coord_encoder
.with_model_dtype(args.dtype.as_str().try_into()?)
.with_model_device(args.device.as_str().try_into()?)
.commit()?,
),
Some(
options_coord_decoder
.with_model_dtype(args.dtype.as_str().try_into()?)
.with_model_device(args.device.as_str().try_into()?)
.commit()?,
),
Some(
options_size_encoder
.with_model_dtype(args.dtype.as_str().try_into()?)
.with_model_device(args.device.as_str().try_into()?)
.commit()?,
),
Some(
options_size_decoder
.with_model_dtype(args.dtype.as_str().try_into()?)
.with_model_device(args.device.as_str().try_into()?)
.commit()?,
),
)?;
// load images
let xs = DataLoader::try_read_batch(&args.source)?;
// run with task
let task: Task = args.task.as_str().try_into()?;
let ys = model.forward(&xs, &task)?;
// annotate
match task {
Task::Caption(_) => {
println!("{}:", task);
for (i, y) in ys.iter().enumerate() {
if let Some(texts) = y.texts() {
println!("Image {}: {:?}\n", i, texts[0]);
}
}
}
Task::Vqa(query) => {
println!("Question: {}", query);
for (i, y) in ys.iter().enumerate() {
if let Some(texts) = y.texts() {
println!("Image {}: {:?}\n", i, texts[0]);
}
}
}
Task::OpenSetDetection(_) | Task::OpenSetKeypointsDetection(_) => {
println!("{:?}", ys);
let annotator = Annotator::default()
.with_bboxes_thickness(4)
.without_bboxes_conf(true)
.with_keypoints_radius(6)
.with_keypoints_name(true)
.with_saveout("moondream2");
annotator.annotate(&xs, &ys);
}
_ => unimplemented!("Unsupported moondream2 task."),
}
Ok(())
}

View File

@ -33,8 +33,8 @@ impl TryFrom<&str> for Device {
// device and its id
let d_id: Vec<&str> = s.trim().split(':').collect();
let (d, id) = match d_id.len() {
1 => (d_id[0], 0),
2 => (d_id[0], d_id[1].parse::<usize>().unwrap_or(0)),
1 => (d_id[0].trim(), 0),
2 => (d_id[0].trim(), d_id[1].trim().parse::<usize>().unwrap_or(0)),
_ => anyhow::bail!(
"Fail to parse device string: {s}. Expect: `device:device_id` or `device`. e.g. `cuda:0` or `cuda`"
),

View File

@ -3,6 +3,7 @@ use ort::tensor::TensorElementType;
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub enum DType {
Auto,
Int4,
Int8,
Int16,
Int32,
@ -32,6 +33,7 @@ impl TryFrom<&str> for DType {
"u16" | "uint16" => Ok(Self::Uint16),
"u32" | "uint32" => Ok(Self::Uint32),
"u64" | "uint64" => Ok(Self::Uint64),
"i4" | "int4" => Ok(Self::Int4),
"i8" | "int8" => Ok(Self::Int8),
"i16" | "int=16" => Ok(Self::Int16),
"i32" | "int32" => Ok(Self::Int32),
@ -52,6 +54,7 @@ impl std::fmt::Display for DType {
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
let x = match self {
Self::Auto => "auto",
Self::Int4 => "int4",
Self::Int8 => "int8",
Self::Int16 => "int16",
Self::Int32 => "int32",

View File

@ -206,6 +206,7 @@ impl Engine {
x, dtype,
)?));
}
xs_
});
@ -223,6 +224,7 @@ impl Engine {
ys.push_kv(name.as_str(), X::from(y))?;
}
});
Ok(ys)
} else {
anyhow::bail!("Failed to run with ONNXRuntime. No model info found.");

View File

@ -68,6 +68,11 @@ pub struct Options {
pub text_confs_2: Vec<f32>,
pub text_confs_3: Vec<f32>,
// Files
pub file: Option<String>,
pub file_2: Option<String>,
pub file_3: Option<String>,
// For classification
pub apply_softmax: Option<bool>,
@ -149,6 +154,9 @@ impl Default for Options {
text_names: None,
text_names_2: None,
text_names_3: None,
file: None,
file_2: None,
file_3: None,
class_confs: vec![0.3f32],
class_confs_2: vec![0.3f32],
class_confs_3: vec![0.3f32],
@ -320,11 +328,6 @@ impl Options {
.try_fetch(&format!("{}/{}", self.model_name, self.model_file))?;
}
}
// let stem = crate::try_fetch_stem(&self.model_file)?;
// self.model_spec = format!("{}/{}", self.model_name, stem);
// self.model_file =
// Hub::default().try_fetch(&format!("{}/{}", self.model_name, self.model_file))?;
}
Ok(self)
@ -408,7 +411,7 @@ impl Options {
.unwrap_or(&format!("{}/tokenizer.json", self.model_name)),
)?,
)
.map_err(|_| anyhow::anyhow!("No `tokenizer.json` found"))?;
.map_err(|err| anyhow::anyhow!("Failed to build tokenizer: {err}"))?;
// TODO: padding
// if `max_length` specified: use `Fixed` strategy

View File

@ -13,6 +13,8 @@ pub enum Scale {
P,
A,
F,
Million(f32),
Billion(f32),
}
impl std::fmt::Display for Scale {
@ -31,6 +33,8 @@ impl std::fmt::Display for Scale {
Self::P => "p",
Self::A => "a",
Self::F => "f",
Self::Million(x) => &format!("{x}m"),
Self::Billion(x) => &format!("{x}b"), // x.0 -> x
};
write!(f, "{}", x)
}
@ -77,6 +81,20 @@ impl TryFrom<&str> for Scale {
"p" | "pico" => Ok(Self::P),
"a" | "atto" => Ok(Self::A),
"f" | "femto" => Ok(Self::F),
scale if scale.ends_with("b") => {
let num_str = &scale[..scale.len() - 1];
match num_str.parse::<f32>() {
Ok(x) => Ok(Self::Billion(x)),
Err(_) => anyhow::bail!("Invalid Billion format: {}", scale),
}
}
scale if scale.ends_with("m") => {
let num_str = &scale[..scale.len() - 1];
match num_str.parse::<f32>() {
Ok(x) => Ok(Self::Million(x)),
Err(_) => anyhow::bail!("Invalid Million format: {}", scale),
}
}
x => anyhow::bail!("Unsupported model scale: {:?}", x),
}
}
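A quick sanity check of the new suffix parsing, as a sketch (assumes `Scale` and these `TryFrom`/`Display` impls are exported from `usls`, as they are used in `examples/moondream2/main.rs`):

```rust
use usls::Scale;

fn main() -> anyhow::Result<()> {
    // "<x>b" parses to Billion(x) and "<x>m" to Million(x); other strings fall through
    // to the named letter scales or bail with an error.
    assert!(matches!(Scale::try_from("0.5b")?, Scale::Billion(x) if x == 0.5));
    assert!(matches!(Scale::try_from("2b")?, Scale::Billion(x) if x == 2.0));
    assert!(matches!(Scale::try_from("300m")?, Scale::Million(x) if x == 300.0));
    // Display keeps the same form: Billion(0.5) prints as "0.5b".
    assert_eq!(Scale::Billion(0.5).to_string(), "0.5b");
    Ok(())
}
```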

View File

@ -1,4 +1,4 @@
#[derive(Debug, Copy, Clone, Ord, Eq, PartialOrd, PartialEq)]
#[derive(Debug, Clone, Ord, Eq, PartialOrd, PartialEq)]
pub enum Task {
/// Image classification task.
/// Input: image
@ -32,7 +32,7 @@ pub enum Task {
/// Input: image
/// Output: bounding boxes, class labels (including an "unknown" category for unfamiliar objects), and detection scores
/// Open set detection task, with String query
OpenSetDetection(&'static str),
OpenSetDetection(String),
/// Task for generating brief descriptions of dense regions in the image.
/// Input: image
/// Output: bounding boxes (bboxes), brief phrase labels, and optional scores for detected regions
@ -44,6 +44,7 @@ pub enum Task {
/// Output: coordinates of detected keypoints
KeypointsDetection,
Pose,
OpenSetKeypointsDetection(String),
/// Semantic segmentation task, segmenting the image into different semantic regions.
/// Input: image
@ -97,12 +98,12 @@ pub enum Task {
/// Input: image and text
/// Output: image region and the corresponding phrase
/// caption to phrase grounding
CaptionToPhraseGrounding(&'static str),
CaptionToPhraseGrounding(String),
/// Referring expression segmentation task, segmenting objects in the image based on a text description.
/// Input: image and referring expression
/// Output: a segmentation mask for the object referred to by the text
ReferringExpressionSegmentation(&'static str),
ReferringExpressionSegmentation(String),
/// Region-to-segmentation task, similar to combining object detection with segmentation (e.g., YOLO + SAM).
/// Input: image and region proposals
@ -125,7 +126,7 @@ pub enum Task {
/// Visual question answering (VQA) task, answering questions related to an image.
/// Input: image and question text
/// Output: the answer to the question
Vqa(&'static str),
Vqa(String),
/// Optical character recognition (OCR) task, recognizing text in an image.
/// Input: image
@ -156,6 +157,7 @@ impl std::fmt::Display for Task {
Self::Ocr => "ocr",
Self::OcrWithRegion => "ocr-with-region",
Self::Vqa(_) => "vqa",
Self::OpenSetKeypointsDetection(_) => "open-set-keypoints-detection",
_ => todo!(),
};
write!(f, "{}", x)
@ -166,13 +168,33 @@ impl TryFrom<&str> for Task {
type Error = anyhow::Error;
fn try_from(s: &str) -> Result<Self, Self::Error> {
// TODO
match s.to_lowercase().as_str() {
"cls" | "classify" | "classification" => Ok(Self::ImageClassification),
"det" | "od" | "detect" => Ok(Self::ObjectDetection),
"kpt" | "pose" => Ok(Self::KeypointsDetection),
"seg" | "segment" => Ok(Self::InstanceSegmentation),
"obb" => Ok(Self::OrientedObjectDetection),
_ => todo!(), // x => anyhow::bail!("Unsupported model task: {}", x),
"cap" | "cap0" | "caption" => Ok(Self::Caption(0)),
"cap1" | "caption1" => Ok(Self::Caption(1)),
"cap2" | "caption2" => Ok(Self::Caption(2)),
x if x.contains(":") => {
let t_tt: Vec<&str> = x.trim().split(':').collect();
let (t, tt) = match t_tt.len() {
2 => (t_tt[0].trim(), t_tt[1].trim()),
_ => anyhow::bail!(
"Fail to parse task: {x}. Expect: `task:content`. e.g. `vqa:What's in this image?`"
),
};
match t {
"cap" | "caption" => Ok(Self::Caption(tt.parse::<usize>().unwrap_or(0) as u8)),
"vqa" => Ok(Self::Vqa(tt.into())),
"open-det" | "open-od" => Ok(Self::OpenSetDetection(tt.into())),
"open-kpt" | "open-pose" => Ok(Self::OpenSetKeypointsDetection(tt.into())),
_ => todo!(),
}
}
_ => todo!(),
}
}
}

View File

@ -88,7 +88,7 @@ impl Florence2 {
.quantize(&[*x0, *y0, *x1, *y1], (image_width, image_height));
Task::RegionToDescription(xyxy[0], xyxy[1], xyxy[2], xyxy[3])
}
_ => *task,
_ => task.clone(),
}
}

View File

@ -16,6 +16,7 @@ mod grounding_dino;
mod linknet;
mod mobileone;
mod modnet;
mod moondream2;
mod picodet;
mod pipeline;
mod rtdetr;
@ -37,6 +38,7 @@ pub use dinov2::*;
pub use florence2::*;
pub use grounding_dino::*;
pub use modnet::*;
pub use moondream2::*;
pub use picodet::*;
pub use pipeline::*;
pub use rtdetr::*;

View File

@ -0,0 +1,9 @@
# moondream: A tiny vision language model that kicks ass and runs anywhere
## Official Repository
The official repository can be found at [GitHub](https://github.com/vikhyat/moondream/tree/main).
## Example
Refer to the [example](../../../examples/moondream2)
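A minimal programmatic sketch mirroring `examples/moondream2/main.rs` (dtype/device overrides are omitted for brevity; in practice you may need to set them with `with_model_dtype` / `with_model_device` as the example does):

```rust
use usls::{models::Moondream2, DataLoader, Options, Task};

fn main() -> anyhow::Result<()> {
    // The 0.5B variant is wired from eight sub-models; the coord/size pairs are optional.
    let mut model = Moondream2::new(
        Options::moondream2_0_5b_vision_encoder().commit()?,
        Options::moondream2_0_5b_vision_projection().commit()?,
        Options::moondream2_0_5b_text_encoder().commit()?,
        Options::moondream2_0_5b_text_decoder().commit()?,
        Some(Options::moondream2_0_5b_coord_encoder().commit()?),
        Some(Options::moondream2_0_5b_coord_decoder().commit()?),
        Some(Options::moondream2_0_5b_size_encoder().commit()?),
        Some(Options::moondream2_0_5b_size_decoder().commit()?),
    )?;

    // Caption a single image; other tasks (vqa, open-od, open-kpt) work the same way.
    let paths = vec!["./assets/bus.jpg".to_string()];
    let xs = DataLoader::try_read_batch(&paths)?;
    let ys = model.forward(&xs, &Task::Caption(0))?;
    println!("{:?}", ys);
    Ok(())
}
```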

View File

@ -0,0 +1,117 @@
/// Model configuration for `moondream2`
impl crate::Options {
pub fn moondream2() -> Self {
Self::default()
.with_model_name("moondream2")
.with_model_num_dry_run(0)
}
pub fn moondream2_0_5b() -> Self {
Self::moondream2().with_model_scale(crate::Scale::Billion(0.5))
}
pub fn moondream2_0_5b_vision_encoder() -> Self {
Self::moondream2_0_5b()
.with_model_ixx(0, 0, (1, 3, 4).into()) // patch count
.with_model_kind(crate::Kind::Vision)
.with_image_mean(&[0.5, 0.5, 0.5])
.with_image_std(&[0.5, 0.5, 0.5])
.with_normalize(true)
.with_resize_mode(crate::ResizeMode::FitExact)
.with_resize_filter("catmullrom")
.with_model_file("0.5b-vision-encoder.onnx")
}
pub fn moondream2_0_5b_vision_projection() -> Self {
Self::moondream2_0_5b()
.with_batch_size(1)
.with_model_kind(crate::Kind::Vision)
.with_model_file("0.5b-vision-projection.onnx")
}
pub fn moondream2_0_5b_text_decoder() -> Self {
Self::moondream2_0_5b()
.with_batch_size(1)
.with_model_kind(crate::Kind::Language)
.with_model_file("0.5b-text-decoder.onnx")
}
pub fn moondream2_0_5b_text_encoder() -> Self {
Self::moondream2_0_5b()
.with_batch_size(1)
.with_model_kind(crate::Kind::Language)
.with_model_file("0.5b-text-encoder.onnx")
}
pub fn moondream2_0_5b_coord_encoder() -> Self {
Self::moondream2_0_5b()
.with_batch_size(1)
.with_model_file("0.5b-coord-encoder.onnx")
}
pub fn moondream2_0_5b_coord_decoder() -> Self {
Self::moondream2_0_5b()
.with_batch_size(1)
.with_model_file("0.5b-coord-decoder.onnx")
}
pub fn moondream2_0_5b_size_encoder() -> Self {
Self::moondream2_0_5b()
.with_batch_size(1)
.with_model_file("0.5b-size-encoder.onnx")
}
pub fn moondream2_0_5b_size_decoder() -> Self {
Self::moondream2_0_5b()
.with_batch_size(1)
.with_model_file("0.5b-size-decoder.onnx")
}
pub fn moondream2_2b_vision_encoder() -> Self {
Self::moondream2_0_5b_vision_encoder()
.with_model_scale(crate::Scale::Billion(2.))
.with_model_file("2b-vision-encoder.onnx")
}
pub fn moondream2_2b_vision_projection() -> Self {
Self::moondream2_0_5b_vision_projection()
.with_model_scale(crate::Scale::Billion(2.))
.with_model_file("2b-vision-projection.onnx")
}
pub fn moondream2_2b_text_decoder() -> Self {
Self::moondream2_0_5b_text_decoder()
.with_model_scale(crate::Scale::Billion(2.))
.with_model_file("2b-text-decoder.onnx")
}
pub fn moondream2_2b_text_encoder() -> Self {
Self::moondream2_0_5b_text_encoder()
.with_model_scale(crate::Scale::Billion(2.))
.with_model_file("2b-text-encoder.onnx")
}
pub fn moondream2_2b_coord_encoder() -> Self {
Self::moondream2_0_5b_coord_encoder()
.with_model_scale(crate::Scale::Billion(2.))
.with_model_file("2b-coord-encoder.onnx")
}
pub fn moondream2_2b_coord_decoder() -> Self {
Self::moondream2_0_5b_coord_decoder()
.with_model_scale(crate::Scale::Billion(2.))
.with_model_file("2b-coord-decoder.onnx")
}
pub fn moondream2_2b_size_encoder() -> Self {
Self::moondream2_0_5b_size_encoder()
.with_model_scale(crate::Scale::Billion(2.))
.with_model_file("2b-size-encoder.onnx")
}
pub fn moondream2_2b_size_decoder() -> Self {
Self::moondream2_0_5b_size_decoder()
.with_model_scale(crate::Scale::Billion(2.))
.with_model_file("2b-size-decoder.onnx")
}
}

View File

@ -0,0 +1,645 @@
use aksr::Builder;
use anyhow::{Context, Result};
use image::{DynamicImage, GenericImageView};
use ndarray::{s, Array, Array2, Array3, Axis, IxDyn};
use ndarray_npy::ReadNpyExt;
use crate::{
BaseModelTextual, Bbox, DType, Engine, Hub, Keypoint, LogitsSampler, Options, Processor, Scale,
Task, Ts, Xs, Ys, X, Y,
};
#[derive(Builder, Debug)]
pub struct Moondream2 {
vision_encoder: VisionEncoder,
vision_projection: VisionProjection,
pub text_decoder: BaseModelTextual,
text_encoder: BaseModelTextual,
coord_decoder: Option<BaseModelTextual>,
coord_encoder: Option<BaseModelTextual>,
size_decoder: Option<BaseModelTextual>,
size_encoder: Option<BaseModelTextual>,
initial_kv_cache: X, // TODO: use f16
scale: Scale,
dtype: DType,
max_length: usize,
eos_token_id: u32,
max_objects: usize,
}
impl Moondream2 {
// TODO
#[allow(clippy::too_many_arguments)]
pub fn new(
options_vision_encoder: Options,
options_vision_projection: Options,
options_text_encoder: Options,
options_text_decoder: Options,
options_coord_encoder: Option<Options>,
options_coord_decoder: Option<Options>,
options_size_encoder: Option<Options>,
options_size_decoder: Option<Options>,
) -> Result<Self> {
let max_length = 2048;
let max_objects = 50;
let eos_token_id = 50256;
let dtype = options_vision_encoder.model_dtype;
let scale = options_vision_encoder
.model_scale
.unwrap_or(Scale::Billion(0.5));
let initial_kv_cache: X = KVCache::new(&scale, &dtype)?.0.into();
let vision_encoder = VisionEncoder::new(options_vision_encoder)?;
let vision_projection = VisionProjection::new(options_vision_projection)?;
let text_decoder = BaseModelTextual::new(options_text_decoder)?;
let text_encoder = BaseModelTextual::new(options_text_encoder)?;
let coord_decoder = options_coord_decoder
.map(BaseModelTextual::new)
.transpose()?;
let coord_encoder = options_coord_encoder
.map(BaseModelTextual::new)
.transpose()?;
let size_decoder = options_size_decoder
.map(BaseModelTextual::new)
.transpose()?;
let size_encoder = options_size_encoder
.map(BaseModelTextual::new)
.transpose()?;
Ok(Self {
vision_encoder,
vision_projection,
text_decoder,
initial_kv_cache,
max_length,
max_objects,
text_encoder,
coord_decoder,
coord_encoder,
size_encoder,
size_decoder,
eos_token_id,
scale,
dtype,
})
}
pub fn encode_image(&mut self, x: &DynamicImage) -> Result<X> {
let patches_emb = self.vision_encoder.encode(x)?.clone().insert_axis(0)?;
let image_embedding = self.vision_projection.inference(patches_emb.into())?[0].to_owned();
Ok(image_embedding)
}
pub fn forward(&mut self, xs: &[DynamicImage], task: &Task) -> Result<Ys> {
let mut ys: Vec<Y> = Vec::new();
for x in xs.iter() {
let y = self.forward_once(x, task)?;
ys.push(y);
}
Ok(ys.into())
}
pub fn forward_once(&mut self, images: &DynamicImage, task: &Task) -> Result<Y> {
let image_embedding = self.encode_image(images)?;
let kv_cache = self.prepare_kv_cache(&image_embedding)?;
match task {
Task::Caption(n) => {
let input_ids = match n {
0 => vec![198., 198., 16438., 8305., 25.],
_ => vec![198., 198., 24334., 1159., 25.],
};
let text = self.generate_text(&input_ids, kv_cache)?;
let y = Y::default().with_texts(&[text.into()]);
Ok(y)
}
Task::Vqa(query) => {
let input_ids: Vec<_> = [198., 198., 24361., 25.]
.iter()
.chain(
&self
.text_encoder
.processor()
.encode_text_ids(query, false)?,
)
.chain(&[198., 198., 33706., 25.])
.cloned()
.collect();
let text = self.generate_text(&input_ids, kv_cache)?;
let y = Y::default().with_texts(&[text.into()]);
Ok(y)
}
Task::OpenSetDetection(object) => {
let input_ids: Vec<_> = [198., 198., 47504., 25.]
.iter()
.chain(
&self
.text_encoder
.processor()
.encode_text_ids(&format!(" {}", object), false)?,
)
.chain(&[628.])
.cloned()
.collect();
let (_, y_bboxes) =
self.generate_points_boxes(&input_ids, kv_cache, object, true)?;
Ok(Y::default().with_bboxes(&y_bboxes))
}
Task::OpenSetKeypointsDetection(object) => {
let input_ids: Vec<_> = [198., 198., 12727., 25.]
.iter()
.chain(
&self
.text_encoder
.processor()
.encode_text_ids(&format!(" {}", object), false)?,
)
.chain(&[628.])
.cloned()
.collect();
let (y_kpts, _) =
self.generate_points_boxes(&input_ids, kv_cache, object, false)?;
Ok(Y::default().with_keypoints(&y_kpts))
}
x => anyhow::bail!("Unsupported Moondream2 task: {}", x),
}
}
fn generate_text(&mut self, input_ids: &[f32], kv_cache: Array<f32, IxDyn>) -> Result<String> {
let input_ids = X::from(input_ids.to_vec()).insert_axis(0)?;
let mut input_embeds = self.text_encoder.inference(Xs::from(input_ids))?[0].to_owned();
let logits_sampler = LogitsSampler::new();
let mut token_ids: Vec<u32> = Vec::new();
let mut pos = self.vision_projection.seq_len() + self.initial_kv_cache.shape()[4];
let mut inc = input_embeds.shape()[1];
let mut kv_cache = kv_cache.clone();
// generate
for _ in 0..self.max_length {
// TODO
let input = Xs::from(vec![
input_embeds.clone(),
kv_cache
.slice(s![.., .., .., .., ..pos, ..])
.into_owned()
.into_dyn()
.into(),
]);
let decoder_outputs = self.text_decoder.inference(input)?;
// update
let logits = &decoder_outputs["logits"];
let new_kv_cache = &decoder_outputs["new_kv_cache"];
kv_cache
.slice_mut(s![.., .., .., .., pos..pos + inc, ..])
.assign(new_kv_cache);
pos += inc;
// decode
let token_id = logits_sampler.decode(
logits
.slice(s![-1, ..])
.as_slice()
.context("Failed to get slice when decode `logits`")?,
)?;
// break
if token_id == self.eos_token_id {
break;
}
// update
token_ids.push(token_id);
inc = 1;
// encode
let next_tokens = X::from(vec![token_id as f32]).insert_axis(1)?;
input_embeds = self.text_encoder.inference(Xs::from(next_tokens))?[0].to_owned();
}
let text = self
.text_encoder
.processor()
.decode_tokens(&token_ids, true)?;
Ok(text)
}
fn generate_points_boxes(
&mut self,
input_ids: &[f32],
kv_cache: Array<f32, IxDyn>,
object: &str,
generate_boxes: bool,
) -> Result<(Vec<Vec<Keypoint>>, Vec<Bbox>)> {
let mut y_bboxes: Vec<Bbox> = Vec::new();
let mut y_kpts: Vec<Vec<Keypoint>> = Vec::new();
let (image_height, image_width) = self.vision_encoder.processor.image0s_size[0];
let mut pos = self.vision_projection.seq_len() + self.initial_kv_cache.shape()[4];
let logits_sampler = LogitsSampler::new();
// initial input_embeds
let input_ids = X::from(input_ids.to_vec()).insert_axis(0)?;
let mut hidden = self.text_encoder.inference(Xs::from(input_ids))?[0].to_owned();
let mut kv_cache = kv_cache;
// generate
loop {
let logits = self.run_decoder(&mut hidden, &mut kv_cache, &mut pos)?;
// decode
let token_id = logits_sampler.decode(
logits
.slice(s![-1, ..])
.as_slice()
.context("Failed to get slice for `logits`")?,
)?;
// break
if token_id == self.eos_token_id {
break;
}
// cx
let input: X = hidden.slice(s![0, -1, ..]).into_owned().into_dyn().into();
let cx = self
.coord_decoder
.as_mut()
.unwrap()
.inference(Xs::from(input))?[0]
.clone(); // [1024]
let ratio = cx.shape()[0] as f32;
let cx = logits_sampler
.decode(cx.as_slice().context("Failed to get slice for `cx`")?)?
as f32
/ ratio;
hidden = self
.coord_encoder
.as_mut()
.unwrap()
.inference(Xs::from(X::from(vec![cx])))?[0]
.clone()
.insert_axis(0)?
.insert_axis(0)?;
// cy
let _logits = self.run_decoder(&mut hidden, &mut kv_cache, &mut pos)?;
let input: X = hidden.slice(s![0, -1, ..]).into_owned().into_dyn().into();
let cy = self
.coord_decoder
.as_mut()
.unwrap()
.inference(Xs::from(input))?[0]
.clone();
let ratio = cy.shape()[0] as f32;
let cy = logits_sampler
.decode(cy.as_slice().context("Failed to get slice for `cy`")?)?
as f32
/ ratio;
hidden = self
.coord_encoder
.as_mut()
.unwrap()
.inference(Xs::from(X::from(vec![cy])))?[0]
.clone()
.insert_axis(0)?
.insert_axis(0)?;
if !generate_boxes {
y_kpts.push(vec![Keypoint::from((
cx * image_width as f32,
cy * image_height as f32,
0,
))
.with_name(object)]);
// keep?
if y_kpts.len() > self.max_objects {
break;
}
} else {
// wh
let _logits = self.run_decoder(&mut hidden, &mut kv_cache, &mut pos)?;
let input: X = hidden.slice(s![0, -1, ..]).into_owned().into_dyn().into();
let size = self
.size_decoder
.as_mut()
.unwrap()
.inference(Xs::from(input))?[0]
.clone(); // [2, 1024]
let ratio = size.shape()[1] as f32;
let w = logits_sampler.decode(
size.slice(s![0, ..])
.as_slice()
.context("Failed to get slice when decode `w`")?,
)? as f32
/ ratio;
// h
let h = logits_sampler.decode(
size.slice(s![1, ..])
.as_slice()
.context("Failed to get slice when decode `h`")?,
)? as f32
/ ratio;
hidden = self
.size_encoder
.as_mut()
.unwrap()
.inference(Xs::from(X::from(vec![w, h])))?[0]
.clone()
.insert_axis(0)?
.insert_axis(0)?; // [1024]
let xmin = cx - w / 2.;
let ymin = cy - h / 2.;
y_bboxes.push(
Bbox::from((
xmin * image_width as f32,
ymin * image_height as f32,
w * image_width as f32,
h * image_height as f32,
))
.with_name(object)
.with_id(0)
.with_confidence(1.),
);
// Keep?
if y_bboxes.len() > self.max_objects {
break;
}
}
}
Ok((y_kpts, y_bboxes))
}
fn prepare_kv_cache(&mut self, image_embedding: &X) -> Result<Array<f32, IxDyn>> {
let kv_cache_new = self.text_decoder.inference(Xs::from(vec![
image_embedding.clone(),
self.initial_kv_cache.clone(),
]))?["new_kv_cache"]
.to_owned();
// TODO
let kv_cache_new = ndarray::concatenate(
Axis(4),
&[kv_cache_new.view(), self.initial_kv_cache.view()],
)?;
// fill with max sequence length
let mut shapes = self.initial_kv_cache.shape().to_vec();
shapes[4] = self.max_length;
let mut kv_cache = Array::zeros(shapes);
kv_cache
.slice_mut(s![.., .., .., .., ..kv_cache_new.dim()[4], ..])
.assign(&kv_cache_new);
Ok(kv_cache.into_dyn())
}
fn run_decoder(
&mut self,
input_embeds: &mut X,
kv_cache: &mut Array<f32, IxDyn>,
pos: &mut usize,
) -> Result<X> {
let decoder_outputs = self.text_decoder.inference(Xs::from(vec![
input_embeds.clone(),
kv_cache
.slice(s![.., .., .., .., ..*pos, ..])
.into_owned()
.into_dyn()
.into(),
]))?;
let hidden = &decoder_outputs["hidden"];
let new_kv_cache = &decoder_outputs["new_kv_cache"];
// update
let inc = hidden.shape()[1]; // -2
kv_cache
.slice_mut(s![.., .., .., .., *pos..*pos + inc, ..])
.assign(new_kv_cache);
*pos += inc;
*input_embeds = hidden.to_owned();
Ok(decoder_outputs["logits"].to_owned())
}
}
#[derive(Debug, Builder)]
pub struct VisionEncoder {
engine: Engine,
num_patch: usize,
patch_size: usize,
processor: Processor,
ts: Ts,
}
impl VisionEncoder {
pub fn new(options: Options) -> Result<Self> {
let engine = options.to_engine()?;
let (num_patch, patch_size, ts) = (
engine.batch().opt(),
engine.try_height().unwrap_or(&378.into()).opt(),
engine.ts.clone(),
);
let processor = options
.to_processor()?
.with_image_width(patch_size as _)
.with_image_height(patch_size as _);
Ok(Self {
engine,
patch_size,
num_patch,
processor,
ts,
})
}
fn create_patches(
image: &DynamicImage,
image_patch_size: usize,
) -> (Vec<DynamicImage>, (u32, u32)) {
let mut patches = vec![image.clone()];
let image = image.to_rgb8();
let res_templates = vec![(1, 2), (2, 1), (2, 2)];
let (im_width, im_height) = image.dimensions();
let max_dim = im_width.max(im_height);
let selected_template = if max_dim < (image_patch_size as f32 * 1.4) as u32 {
(1, 1)
} else {
let aspect_ratio = im_width as f32 / im_height as f32;
res_templates
.into_iter()
.min_by(|a, b| {
let diff_a = ((a.1 as f32 / a.0 as f32) - aspect_ratio).abs();
let diff_b = ((b.1 as f32 / b.0 as f32) - aspect_ratio).abs();
diff_a.partial_cmp(&diff_b).unwrap()
})
.unwrap()
};
let patch_width = im_width / selected_template.1;
let patch_height = im_height / selected_template.0;
for row in 0..selected_template.0 {
for col in 0..selected_template.1 {
let x_min = col * patch_width;
let y_min = row * patch_height;
let _x_max = x_min + patch_width;
let _y_max = y_min + patch_height;
let cropped = image
.view(x_min, y_min, patch_width, patch_height)
.to_image();
patches.push(DynamicImage::from(cropped));
}
}
(patches, selected_template)
}
pub fn inference(&mut self, xs: Xs) -> Result<Xs> {
self.engine.run(xs)
}
pub fn encode(&mut self, x: &DynamicImage) -> Result<X> {
let (patches, selected_template) = Self::create_patches(x, self.patch_size);
let patches = self.processor.process_images(&patches)?;
let template = (
(selected_template.0 as usize),
(selected_template.1 as usize),
);
let patch_emb = self.inference(patches.clone().into())?[0].clone();
let patch_emb = patch_emb.clone().0.into_dimensionality::<ndarray::Ix3>()?;
let patch_emb = Self::process_patch_emb(patch_emb, template)?;
let patch_emb = X::from(patch_emb.into_dyn()); // TODO .insert_axis(x),
Ok(patch_emb)
}
fn process_patch_emb(patch_emb: Array3<f32>, template: (usize, usize)) -> Result<Array2<f32>> {
let (_, seq_len, enc_dim) = patch_emb.dim(); // (N, 729, 720)
let global_patch = patch_emb.slice(s![0, .., ..]).into_owned();
if template == (1, 1) {
Ok(ndarray::concatenate(
Axis(1),
&[global_patch.view(), global_patch.view()],
)?)
} else {
let w = (seq_len as f32).sqrt() as usize;
let mut rows = Vec::new();
for r in 0..template.0 {
let mut row = Vec::new();
for c in 0..template.1 {
let idx = r * template.1 + c;
let patch = patch_emb.slice(s![idx, .., ..]).into_owned();
let patch = patch.into_shape_with_order((w, w, enc_dim))?;
row.push(patch);
}
let row_concat = ndarray::concatenate(
Axis(1),
&row.iter().map(|x| x.view()).collect::<Vec<_>>(),
)?;
rows.push(row_concat);
}
let patch_emb =
ndarray::concatenate(Axis(0), &rows.iter().map(|x| x.view()).collect::<Vec<_>>())?;
let patch_emb = Self::adaptive_avg_pool2d(patch_emb, (w, w))
.into_shape_with_order((w * w, enc_dim))?;
Ok(ndarray::concatenate(
Axis(1),
&[global_patch.view(), patch_emb.view()],
)?)
}
}
fn adaptive_avg_pool2d(x: Array3<f32>, output_size: (usize, usize)) -> Array3<f32> {
let (height, width, channels) = x.dim();
let (out_height, out_width) = output_size;
let stride_h = height / out_height;
let stride_w = width / out_width;
let kernel_h = height - (out_height - 1) * stride_h;
let kernel_w = width - (out_width - 1) * stride_w;
let mut output = Array3::zeros((out_height, out_width, channels));
for i in 0..out_height {
for j in 0..out_width {
let h_start = i * stride_h;
let h_end = h_start + kernel_h;
let w_start = j * stride_w;
let w_end = w_start + kernel_w;
for c in 0..channels {
let mut sum = 0.0;
let mut count = 0;
for h in h_start..h_end {
for w in w_start..w_end {
if h < height && w < width {
sum += x[(h, w, c)];
count += 1;
}
}
}
output[(i, j, c)] = sum / count as f32;
}
}
}
output
}
}
#[derive(Debug, Builder)]
pub struct VisionProjection {
engine: Engine,
seq_len: usize,
ts: Ts,
}
impl VisionProjection {
pub fn new(options: Options) -> Result<Self> {
let engine = options.to_engine()?;
let (seq_len, ts) = (engine.inputs_minoptmax[0][1].opt(), engine.ts.clone());
Ok(Self {
engine,
seq_len,
ts,
})
}
pub fn inference(&mut self, xs: Xs) -> Result<Xs> {
self.engine.run(xs)
}
}
#[derive(Builder, Debug)]
struct KVCache(pub Array<f32, IxDyn>);
impl KVCache {
pub fn new(scale: &Scale, dtype: &DType) -> Result<Self> {
let f = format!("moondream2/{}-initial-kv-cache-{}.npy", scale, dtype);
let f = Hub::default().try_fetch(&f)?;
let file = std::fs::File::open(f)?;
let x = Array::<f32, IxDyn>::read_npy(file)?.into_dyn();
Ok(Self(x))
}
}

View File

@ -0,0 +1,4 @@
mod config;
mod r#impl;
pub use r#impl::Moondream2;

View File

@ -59,8 +59,8 @@ impl YOLO {
.to_processor()?
.with_image_width(width as _)
.with_image_height(height as _);
let task: Option<Task> = match options.model_task {
Some(task) => Some(task),
let task: Option<Task> = match &options.model_task {
Some(task) => Some(task.clone()),
None => match engine.try_fetch("task") {
Some(x) => match x.as_str() {
"classify" => Some(Task::ImageClassification),
@ -104,7 +104,7 @@ impl YOLO {
// version + task
None => match (task, version) {
(Some(task), Some(version)) => {
let layout = match (task, version) {
let layout = match (task.clone(), version) {
(Task::ImageClassification, Version(5, 0)) => {
YOLOPredsFormat::n_clss().apply_softmax(true)
}