Mirror of https://github.com/mii443/usls.git

Add moondream2

* Add moondream2
* Update README.md
Cargo.toml
@@ -14,7 +14,7 @@ exclude = ["assets/*", "examples/*", "runs/*", "benches/*"]
 aksr = { version = "0.0.2" }
 image = { version = "0.25.2" }
 imageproc = { version = "0.24" }
-ndarray = { version = "0.16.1", features = ["rayon"] }
+ndarray = { version = "0.16.1", features = ["rayon", "serde"] }
 rayon = { version = "1.10.0" }
 anyhow = { version = "1.0.75" }
 regex = { version = "1.5.4" }
@@ -38,6 +38,7 @@ natord = "1.0.9"
 video-rs = { version = "0.10.0", features = ["ndarray"], optional = true }
 minifb = { version = "0.27.0", optional = true }
 sha2 = "0.10.8"
+ndarray-npy = "0.9.1"

 [dev-dependencies]
 argh = "0.1.13"
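The new `ndarray-npy` dependency is what lets the moondream2 implementation below load its precomputed initial KV cache from an `.npy` file. A minimal sketch of that read path (hypothetical path string; it mirrors `KVCache::new` further down):

```rust
use ndarray::{Array, IxDyn};
use ndarray_npy::ReadNpyExt; // new dependency added in this commit

fn load_initial_kv_cache(path: &str) -> anyhow::Result<Array<f32, IxDyn>> {
    let file = std::fs::File::open(path)?;
    // read_npy deserializes the dynamically-shaped f32 array stored on disk
    Ok(Array::<f32, IxDyn>::read_npy(file)?)
}
```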
README.md
@@ -37,7 +37,7 @@
 - **YOLO Models**: [YOLOv5](https://github.com/ultralytics/yolov5), [YOLOv6](https://github.com/meituan/YOLOv6), [YOLOv7](https://github.com/WongKinYiu/yolov7), [YOLOv8](https://github.com/ultralytics/ultralytics), [YOLOv9](https://github.com/WongKinYiu/yolov9), [YOLOv10](https://github.com/THU-MIG/yolov10), [YOLO11](https://github.com/ultralytics/ultralytics)
 - **SAM Models**: [SAM](https://github.com/facebookresearch/segment-anything), [SAM2](https://github.com/facebookresearch/segment-anything-2), [MobileSAM](https://github.com/ChaoningZhang/MobileSAM), [EdgeSAM](https://github.com/chongzhou96/EdgeSAM), [SAM-HQ](https://github.com/SysCV/sam-hq), [FastSAM](https://github.com/CASIA-IVA-Lab/FastSAM)
 - **Vision Models**: [RT-DETR](https://arxiv.org/abs/2304.08069), [RTMO](https://github.com/open-mmlab/mmpose/tree/main/projects/rtmo), [Depth-Anything](https://github.com/LiheYoung/Depth-Anything), [DINOv2](https://github.com/facebookresearch/dinov2), [MODNet](https://github.com/ZHKKKe/MODNet), [Sapiens](https://arxiv.org/abs/2408.12569), [DepthPro](https://github.com/apple/ml-depth-pro), [FastViT](https://github.com/apple/ml-fastvit), [BEiT](https://github.com/microsoft/unilm/tree/master/beit), [MobileOne](https://github.com/apple/ml-mobileone)
-- **Vision-Language Models**: [CLIP](https://github.com/openai/CLIP), [jina-clip-v1](https://huggingface.co/jinaai/jina-clip-v1), [BLIP](https://arxiv.org/abs/2201.12086), [GroundingDINO](https://github.com/IDEA-Research/GroundingDINO), [YOLO-World](https://github.com/AILab-CVC/YOLO-World), [Florence2](https://arxiv.org/abs/2311.06242)
+- **Vision-Language Models**: [CLIP](https://github.com/openai/CLIP), [jina-clip-v1](https://huggingface.co/jinaai/jina-clip-v1), [BLIP](https://arxiv.org/abs/2201.12086), [GroundingDINO](https://github.com/IDEA-Research/GroundingDINO), [YOLO-World](https://github.com/AILab-CVC/YOLO-World), [Florence2](https://arxiv.org/abs/2311.06242), [Moondream2](https://github.com/vikhyat/moondream/tree/main)
 - **OCR Models**: [FAST](https://github.com/czczup/FAST), [DB(PaddleOCR-Det)](https://arxiv.org/abs/1911.08947), [SVTR(PaddleOCR-Rec)](https://arxiv.org/abs/2205.00159), [SLANet](https://paddlepaddle.github.io/PaddleOCR/latest/algorithm/table_recognition/algorithm_table_slanet.html), [TrOCR](https://huggingface.co/microsoft/trocr-base-printed), [DocLayout-YOLO](https://github.com/opendatalab/DocLayout-YOLO)

 <details>
@@ -86,6 +86,7 @@
 | [MODNet](https://github.com/ZHKKKe/MODNet) | Image Matting | [demo](examples/modnet) | ✅ | ✅ | ✅ | ✅ | ✅ |
 | [Sapiens](https://github.com/facebookresearch/sapiens/tree/main) | Foundation for Human Vision Models | [demo](examples/sapiens) | ✅ | ✅ | ✅ | | |
 | [Florence2](https://arxiv.org/abs/2311.06242) | a Variety of Vision Tasks | [demo](examples/florence2) | ✅ | ✅ | ✅ | | |
+| [Moondream2](https://github.com/vikhyat/moondream/tree/main) | Open-Set Detection<br />Open-Set Keypoints Detection<br />Image Caption<br />Visual Question Answering | [demo](examples/moondream2) | ✅ | ✅ | ✅ | | |

 </details>
examples/florence2/main.rs
@@ -90,9 +90,11 @@ fn main() -> Result<()> {
         Task::ObjectDetection,
         Task::DenseRegionCaption,
         // w/o inputs
-        Task::OpenSetDetection("a vehicle"),
-        Task::CaptionToPhraseGrounding("A vehicle with two wheels parked in front of a building."),
-        Task::ReferringExpressionSegmentation("a vehicle"),
+        Task::OpenSetDetection("a vehicle".into()),
+        Task::CaptionToPhraseGrounding(
+            "A vehicle with two wheels parked in front of a building.".into(),
+        ),
+        Task::ReferringExpressionSegmentation("a vehicle".into()),
         Task::RegionToSegmentation(
             // 31, 156, 581, 373, // car
             449, 270, 556, 372, // wheel
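The `.into()` calls follow from the `Task` change later in this commit: the query-carrying variants now own a `String` instead of borrowing a `&'static str`, so literals are converted at the call site:

```rust
// Task's query variants now hold String (was &'static str), so literals need .into()
let task = Task::OpenSetDetection("a vehicle".into());
```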
examples/moondream2/README.md (new file, 10 lines)
@@ -0,0 +1,10 @@
## Quick Start

```shell
cargo run -r -F cuda --example moondream2 -- --device 'cuda:0' --dtype i8 --scale 2b --task vqa:"What's in this image?"
cargo run -r -F cuda --example moondream2 -- --device 'cuda:0' --dtype i8 --scale 2b --task cap:0
cargo run -r -F cuda --example moondream2 -- --device 'cuda:0' --dtype i8 --scale 2b --task cap:1
cargo run -r -F cuda --example moondream2 -- --device 'cuda:0' --dtype i8 --scale 2b --task open-od:person
cargo run -r -F cuda --example moondream2 -- --device 'cuda:0' --dtype i8 --scale 2b --task open-kpt:person
```
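The `main.rs` below defaults to `cpu:0`, `int4`, and the `0.5b` scale, so a CPU-only run without the `cuda` feature should reduce to something like (untested sketch):

```shell
cargo run -r --example moondream2 -- --task cap:0
```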
examples/moondream2/main.rs (new file, 157 lines)
@@ -0,0 +1,157 @@
use anyhow::Result;
use usls::{models::Moondream2, Annotator, DataLoader, Options, Scale, Task};

#[derive(argh::FromArgs)]
/// Example
struct Args {
    /// device
    #[argh(option, default = "String::from(\"cpu:0\")")]
    device: String,

    /// source image
    #[argh(
        option,
        default = "vec![
            String::from(\"./assets/bus.jpg\"),
            String::from(\"images/green-car.jpg\"),
        ]"
    )]
    source: Vec<String>,

    /// dtype
    #[argh(option, default = "String::from(\"int4\")")]
    dtype: String,

    /// scale
    #[argh(option, default = "String::from(\"0.5b\")")]
    scale: String,

    /// task
    #[argh(option, default = "String::from(\"Caption: 0\")")]
    task: String,
}

fn main() -> Result<()> {
    tracing_subscriber::fmt()
        .with_env_filter(tracing_subscriber::EnvFilter::from_default_env())
        .with_timer(tracing_subscriber::fmt::time::ChronoLocal::rfc_3339())
        .init();
    let args: Args = argh::from_env();

    // build model
    let (
        options_vision_encoder,
        options_vision_projection,
        options_text_decoder,
        options_text_encoder,
        options_coord_decoder,
        options_coord_encoder,
        options_size_decoder,
        options_size_encoder,
    ) = match args.scale.as_str().try_into()? {
        Scale::Billion(2.) => (
            Options::moondream2_2b_vision_encoder(),
            Options::moondream2_2b_vision_projection(),
            Options::moondream2_2b_text_decoder(),
            Options::moondream2_2b_text_encoder(),
            Options::moondream2_2b_coord_decoder(),
            Options::moondream2_2b_coord_encoder(),
            Options::moondream2_2b_size_decoder(),
            Options::moondream2_2b_size_encoder(),
        ),
        Scale::Billion(0.5) => (
            Options::moondream2_0_5b_vision_encoder(),
            Options::moondream2_0_5b_vision_projection(),
            Options::moondream2_0_5b_text_decoder(),
            Options::moondream2_0_5b_text_encoder(),
            Options::moondream2_0_5b_coord_decoder(),
            Options::moondream2_0_5b_coord_encoder(),
            Options::moondream2_0_5b_size_decoder(),
            Options::moondream2_0_5b_size_encoder(),
        ),
        _ => unimplemented!(),
    };

    let mut model = Moondream2::new(
        options_vision_encoder
            .with_model_dtype(args.dtype.as_str().try_into()?)
            .with_model_device(args.device.as_str().try_into()?)
            .commit()?,
        options_vision_projection
            .with_model_dtype(args.dtype.as_str().try_into()?)
            .with_model_device(args.device.as_str().try_into()?)
            .commit()?,
        options_text_encoder
            .with_model_dtype(args.dtype.as_str().try_into()?)
            .with_model_device(args.device.as_str().try_into()?)
            .commit()?,
        options_text_decoder
            .with_model_dtype(args.dtype.as_str().try_into()?)
            .with_model_device(args.device.as_str().try_into()?)
            .commit()?,
        Some(
            options_coord_encoder
                .with_model_dtype(args.dtype.as_str().try_into()?)
                .with_model_device(args.device.as_str().try_into()?)
                .commit()?,
        ),
        Some(
            options_coord_decoder
                .with_model_dtype(args.dtype.as_str().try_into()?)
                .with_model_device(args.device.as_str().try_into()?)
                .commit()?,
        ),
        Some(
            options_size_encoder
                .with_model_dtype(args.dtype.as_str().try_into()?)
                .with_model_device(args.device.as_str().try_into()?)
                .commit()?,
        ),
        Some(
            options_size_decoder
                .with_model_dtype(args.dtype.as_str().try_into()?)
                .with_model_device(args.device.as_str().try_into()?)
                .commit()?,
        ),
    )?;

    // load images
    let xs = DataLoader::try_read_batch(&args.source)?;

    // run with task
    let task: Task = args.task.as_str().try_into()?;
    let ys = model.forward(&xs, &task)?;

    // annotate
    match task {
        Task::Caption(_) => {
            println!("{}:", task);
            for (i, y) in ys.iter().enumerate() {
                if let Some(texts) = y.texts() {
                    println!("Image {}: {:?}\n", i, texts[0]);
                }
            }
        }
        Task::Vqa(query) => {
            println!("Question: {}", query);
            for (i, y) in ys.iter().enumerate() {
                if let Some(texts) = y.texts() {
                    println!("Image {}: {:?}\n", i, texts[0]);
                }
            }
        }
        Task::OpenSetDetection(_) | Task::OpenSetKeypointsDetection(_) => {
            println!("{:?}", ys);
            let annotator = Annotator::default()
                .with_bboxes_thickness(4)
                .without_bboxes_conf(true)
                .with_keypoints_radius(6)
                .with_keypoints_name(true)
                .with_saveout("moondream2");
            annotator.annotate(&xs, &ys);
        }
        _ => unimplemented!("Unsupported moondream2 task."),
    }

    Ok(())
}
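One nit worth knowing when extending the scale match above: `Scale::Billion(2.)` matches directly on a floating-point literal, something rustc has warned about and style guides discourage. A guard-based equivalent (hypothetical helper, same semantics) sidesteps that:

```rust
use usls::Scale;

// guard-based equivalent of the float-literal patterns in the example's match
fn scale_tag(scale: &Scale) -> anyhow::Result<&'static str> {
    Ok(match scale {
        Scale::Billion(x) if *x == 2.0 => "2b",
        Scale::Billion(x) if *x == 0.5 => "0.5b",
        _ => anyhow::bail!("unsupported scale"),
    })
}
```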
@@ -33,8 +33,8 @@ impl TryFrom<&str> for Device {
         // device and its id
         let d_id: Vec<&str> = s.trim().split(':').collect();
         let (d, id) = match d_id.len() {
-            1 => (d_id[0], 0),
-            2 => (d_id[0], d_id[1].parse::<usize>().unwrap_or(0)),
+            1 => (d_id[0].trim(), 0),
+            2 => (d_id[0].trim(), d_id[1].trim().parse::<usize>().unwrap_or(0)),
             _ => anyhow::bail!(
                 "Failed to parse device string: {s}. Expect: `device:device_id` or `device`. e.g. `cuda:0` or `cuda`"
             ),
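The added `trim()` calls make padded inputs such as `"cuda : 1"` parse cleanly. A standalone re-implementation of just this parsing step (hypothetical helper, not part of the crate) for illustration:

```rust
fn parse_device(s: &str) -> anyhow::Result<(String, usize)> {
    let parts: Vec<&str> = s.trim().split(':').collect();
    match parts.len() {
        1 => Ok((parts[0].trim().to_string(), 0)),
        // a malformed id such as "cuda:x" falls back to 0 via unwrap_or
        2 => Ok((parts[0].trim().to_string(), parts[1].trim().parse().unwrap_or(0))),
        _ => anyhow::bail!("expected `device` or `device:id`"),
    }
}

fn main() -> anyhow::Result<()> {
    assert_eq!(parse_device(" cuda : 1 ")?, ("cuda".to_string(), 1));
    assert_eq!(parse_device("cpu")?, ("cpu".to_string(), 0));
    Ok(())
}
```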
@@ -3,6 +3,7 @@ use ort::tensor::TensorElementType;
 #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
 pub enum DType {
     Auto,
+    Int4,
     Int8,
     Int16,
     Int32,
@@ -32,6 +33,7 @@ impl TryFrom<&str> for DType {
             "u16" | "uint16" => Ok(Self::Uint16),
             "u32" | "uint32" => Ok(Self::Uint32),
             "u64" | "uint64" => Ok(Self::Uint64),
+            "i4" | "int4" => Ok(Self::Int4),
             "i8" | "int8" => Ok(Self::Int8),
             "i16" | "int16" => Ok(Self::Int16),
             "i32" | "int32" => Ok(Self::Int32),
@@ -52,6 +54,7 @@ impl std::fmt::Display for DType {
     fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
         let x = match self {
             Self::Auto => "auto",
+            Self::Int4 => "int4",
             Self::Int8 => "int8",
             Self::Int16 => "int16",
             Self::Int32 => "int32",
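With the new variant, both spellings round-trip through parsing and `Display`; a small fragment (import path elided, assuming `DType` is in scope as in the crate's internals):

```rust
let dt: DType = "i4".try_into()?;   // same parse as "int4"
assert_eq!(dt, DType::Int4);
assert_eq!(dt.to_string(), "int4"); // Display round-trips
```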
@@ -206,6 +206,7 @@ impl Engine {
                     x, dtype,
                 )?));
             }

             xs_
         });

@@ -223,6 +224,7 @@ impl Engine {
                 ys.push_kv(name.as_str(), X::from(y))?;
             }
         });

         Ok(ys)
     } else {
         anyhow::bail!("Failed to run with ONNXRuntime. No model info found.");
@@ -68,6 +68,11 @@ pub struct Options {
     pub text_confs_2: Vec<f32>,
     pub text_confs_3: Vec<f32>,

+    // Files
+    pub file: Option<String>,
+    pub file_2: Option<String>,
+    pub file_3: Option<String>,
+
     // For classification
     pub apply_softmax: Option<bool>,
@@ -149,6 +154,9 @@ impl Default for Options {
             text_names: None,
             text_names_2: None,
             text_names_3: None,
+            file: None,
+            file_2: None,
+            file_3: None,
             class_confs: vec![0.3f32],
             class_confs_2: vec![0.3f32],
             class_confs_3: vec![0.3f32],
@@ -320,11 +328,6 @@ impl Options {
                     .try_fetch(&format!("{}/{}", self.model_name, self.model_file))?;
             }
         }

-        // let stem = crate::try_fetch_stem(&self.model_file)?;
-        // self.model_spec = format!("{}/{}", self.model_name, stem);
-        // self.model_file =
-        //     Hub::default().try_fetch(&format!("{}/{}", self.model_name, self.model_file))?;
         }

         Ok(self)
@@ -408,7 +411,7 @@ impl Options {
                 .unwrap_or(&format!("{}/tokenizer.json", self.model_name)),
             )?,
         )
-        .map_err(|_| anyhow::anyhow!("No `tokenizer.json` found"))?;
+        .map_err(|err| anyhow::anyhow!("Failed to build tokenizer: {err}"))?;

         // TODO: padding
         // if `max_length` specified: use `Fixed` strategy
@@ -13,6 +13,8 @@ pub enum Scale {
     P,
     A,
     F,
+    Million(f32),
+    Billion(f32),
 }

 impl std::fmt::Display for Scale {
@@ -31,6 +33,8 @@ impl std::fmt::Display for Scale {
             Self::P => "p",
             Self::A => "a",
             Self::F => "f",
+            Self::Million(x) => &format!("{x}m"),
+            Self::Billion(x) => &format!("{x}b"), // x.0 -> x
         };
         write!(f, "{}", x)
     }
@@ -77,6 +81,20 @@ impl TryFrom<&str> for Scale {
             "p" | "pico" => Ok(Self::P),
             "a" | "atto" => Ok(Self::A),
             "f" | "femto" => Ok(Self::F),
+            scale if scale.ends_with("b") => {
+                let num_str = &scale[..scale.len() - 1];
+                match num_str.parse::<f32>() {
+                    Ok(x) => Ok(Self::Billion(x)),
+                    Err(_) => anyhow::bail!("Invalid Billion format: {}", scale),
+                }
+            }
+            scale if scale.ends_with("m") => {
+                let num_str = &scale[..scale.len() - 1];
+                match num_str.parse::<f32>() {
+                    Ok(x) => Ok(Self::Million(x)),
+                    Err(_) => anyhow::bail!("Invalid Million format: {}", scale),
+                }
+            }
             x => anyhow::bail!("Unsupported model scale: {:?}", x),
         }
     }
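The two suffix arms accept any float prefix, which is exactly what lets the example pass `--scale 0.5b` or `2b`. A quick sketch of the expected behavior:

```rust
use usls::Scale;

fn main() -> anyhow::Result<()> {
    let s: Scale = "0.5b".try_into()?;
    assert_eq!(s.to_string(), "0.5b"); // Display re-attaches the suffix
    assert!(matches!(s, Scale::Billion(x) if x == 0.5));
    let m: Scale = "300m".try_into()?;
    assert!(matches!(m, Scale::Million(x) if x == 300.0));
    Ok(())
}
```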
@@ -1,4 +1,4 @@
-#[derive(Debug, Copy, Clone, Ord, Eq, PartialOrd, PartialEq)]
+#[derive(Debug, Clone, Ord, Eq, PartialOrd, PartialEq)]
 pub enum Task {
     /// Image classification task.
     /// Input: image
@@ -32,7 +32,7 @@ pub enum Task {
     /// Input: image
     /// Output: bounding boxes, class labels (including an "unknown" category for unfamiliar objects), and detection scores
     /// Open set detection task, with String query
-    OpenSetDetection(&'static str),
+    OpenSetDetection(String),
     /// Task for generating brief descriptions of dense regions in the image.
     /// Input: image
     /// Output: bounding boxes (bboxes), brief phrase labels, and optional scores for detected regions
@@ -44,6 +44,7 @@ pub enum Task {
     /// Output: coordinates of detected keypoints
     KeypointsDetection,
     Pose,
+    OpenSetKeypointsDetection(String),

     /// Semantic segmentation task, segmenting the image into different semantic regions.
     /// Input: image
@@ -97,12 +98,12 @@ pub enum Task {
     /// Input: image and text
     /// Output: image region and the corresponding phrase
     /// caption to phrase grounding
-    CaptionToPhraseGrounding(&'static str),
+    CaptionToPhraseGrounding(String),

     /// Referring expression segmentation task, segmenting objects in the image based on a text description.
     /// Input: image and referring expression
     /// Output: a segmentation mask for the object referred to by the text
-    ReferringExpressionSegmentation(&'static str),
+    ReferringExpressionSegmentation(String),

     /// Region-to-segmentation task, similar to combining object detection with segmentation (e.g., YOLO + SAM).
     /// Input: image and region proposals
@@ -125,7 +126,7 @@ pub enum Task {
     /// Visual question answering (VQA) task, answering questions related to an image.
     /// Input: image and question text
     /// Output: the answer to the question
-    Vqa(&'static str),
+    Vqa(String),

     /// Optical character recognition (OCR) task, recognizing text in an image.
     /// Input: image
@@ -156,6 +157,7 @@ impl std::fmt::Display for Task {
             Self::Ocr => "ocr",
             Self::OcrWithRegion => "ocr-with-region",
             Self::Vqa(_) => "vqa",
+            Self::OpenSetKeypointsDetection(_) => "open-set-keypoints-detection",
             _ => todo!(),
         };
         write!(f, "{}", x)
@@ -166,13 +168,33 @@ impl TryFrom<&str> for Task {
     type Error = anyhow::Error;

     fn try_from(s: &str) -> Result<Self, Self::Error> {
         // TODO
         match s.to_lowercase().as_str() {
             "cls" | "classify" | "classification" => Ok(Self::ImageClassification),
             "det" | "od" | "detect" => Ok(Self::ObjectDetection),
             "kpt" | "pose" => Ok(Self::KeypointsDetection),
             "seg" | "segment" => Ok(Self::InstanceSegmentation),
             "obb" => Ok(Self::OrientedObjectDetection),
-            _ => todo!(), // x => anyhow::bail!("Unsupported model task: {}", x),
+            "cap" | "cap0" | "caption" => Ok(Self::Caption(0)),
+            "cap1" | "caption1" => Ok(Self::Caption(1)),
+            "cap2" | "caption2" => Ok(Self::Caption(2)),
+            x if x.contains(":") => {
+                let t_tt: Vec<&str> = x.trim().split(':').collect();
+                let (t, tt) = match t_tt.len() {
+                    2 => (t_tt[0].trim(), t_tt[1].trim()),
+                    _ => anyhow::bail!(
+                        "Failed to parse task: {x}. Expect: `task:content`. e.g. `vqa:What's in this image?`"
+                    ),
+                };
+                match t {
+                    "cap" | "caption" => Ok(Self::Caption(tt.parse::<usize>().unwrap_or(0) as u8)),
+                    "vqa" => Ok(Self::Vqa(tt.into())),
+                    "open-det" | "open-od" => Ok(Self::OpenSetDetection(tt.into())),
+                    "open-kpt" | "open-pose" => Ok(Self::OpenSetKeypointsDetection(tt.into())),
+                    _ => todo!(),
+                }
+            }
+            _ => todo!(),
         }
     }
 }
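The grammar is `task` or `task:content`, split on `:` (exactly one expected) after lowercasing the whole string, so free-text content such as a VQA question reaches the model lowercased. For instance:

```rust
use usls::Task;

fn main() -> anyhow::Result<()> {
    let t: Task = "vqa:What's in this image?".try_into()?;
    assert!(matches!(t, Task::Vqa(q) if q == "what's in this image?")); // lowercased
    let d: Task = "open-od:person".try_into()?;
    assert!(matches!(d, Task::OpenSetDetection(o) if o == "person"));
    let c: Task = "cap:1".try_into()?;
    assert!(matches!(c, Task::Caption(1)));
    Ok(())
}
```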
@@ -88,7 +88,7 @@ impl Florence2 {
                     .quantize(&[*x0, *y0, *x1, *y1], (image_width, image_height));
                 Task::RegionToDescription(xyxy[0], xyxy[1], xyxy[2], xyxy[3])
             }
-            _ => *task,
+            _ => task.clone(),
         }
     }
@@ -16,6 +16,7 @@ mod grounding_dino;
 mod linknet;
 mod mobileone;
 mod modnet;
+mod moondream2;
 mod picodet;
 mod pipeline;
 mod rtdetr;
@@ -37,6 +38,7 @@ pub use dinov2::*;
 pub use florence2::*;
 pub use grounding_dino::*;
 pub use modnet::*;
+pub use moondream2::*;
 pub use picodet::*;
 pub use pipeline::*;
 pub use rtdetr::*;
src/models/moondream2/README.md (new file, 9 lines)
@@ -0,0 +1,9 @@
# moondream: A tiny vision language model that kicks ass and runs anywhere

## Official Repository

The official repository can be found on [GitHub](https://github.com/vikhyat/moondream/tree/main).

## Example

Refer to the [example](../../../examples/moondream2).
src/models/moondream2/config.rs (new file, 117 lines)
@@ -0,0 +1,117 @@
/// Model configuration for `moondream2`
impl crate::Options {
    pub fn moondream2() -> Self {
        Self::default()
            .with_model_name("moondream2")
            .with_model_num_dry_run(0)
    }

    pub fn moondream2_0_5b() -> Self {
        Self::moondream2().with_model_scale(crate::Scale::Billion(0.5))
    }

    pub fn moondream2_0_5b_vision_encoder() -> Self {
        Self::moondream2_0_5b()
            .with_model_ixx(0, 0, (1, 3, 4).into()) // patch count
            .with_model_kind(crate::Kind::Vision)
            .with_image_mean(&[0.5, 0.5, 0.5])
            .with_image_std(&[0.5, 0.5, 0.5])
            .with_normalize(true)
            .with_resize_mode(crate::ResizeMode::FitExact)
            .with_resize_filter("catmullrom")
            .with_model_file("0.5b-vision-encoder.onnx")
    }

    pub fn moondream2_0_5b_vision_projection() -> Self {
        Self::moondream2_0_5b()
            .with_batch_size(1)
            .with_model_kind(crate::Kind::Vision)
            .with_model_file("0.5b-vision-projection.onnx")
    }

    pub fn moondream2_0_5b_text_decoder() -> Self {
        Self::moondream2_0_5b()
            .with_batch_size(1)
            .with_model_kind(crate::Kind::Language)
            .with_model_file("0.5b-text-decoder.onnx")
    }

    pub fn moondream2_0_5b_text_encoder() -> Self {
        Self::moondream2_0_5b()
            .with_batch_size(1)
            .with_model_kind(crate::Kind::Language)
            .with_model_file("0.5b-text-encoder.onnx")
    }

    pub fn moondream2_0_5b_coord_encoder() -> Self {
        Self::moondream2_0_5b()
            .with_batch_size(1)
            .with_model_file("0.5b-coord-encoder.onnx")
    }

    pub fn moondream2_0_5b_coord_decoder() -> Self {
        Self::moondream2_0_5b()
            .with_batch_size(1)
            .with_model_file("0.5b-coord-decoder.onnx")
    }

    pub fn moondream2_0_5b_size_encoder() -> Self {
        Self::moondream2_0_5b()
            .with_batch_size(1)
            .with_model_file("0.5b-size-encoder.onnx")
    }

    pub fn moondream2_0_5b_size_decoder() -> Self {
        Self::moondream2_0_5b()
            .with_batch_size(1)
            .with_model_file("0.5b-size-decoder.onnx")
    }

    pub fn moondream2_2b_vision_encoder() -> Self {
        Self::moondream2_0_5b_vision_encoder()
            .with_model_scale(crate::Scale::Billion(2.))
            .with_model_file("2b-vision-encoder.onnx")
    }

    pub fn moondream2_2b_vision_projection() -> Self {
        Self::moondream2_0_5b_vision_projection()
            .with_model_scale(crate::Scale::Billion(2.))
            .with_model_file("2b-vision-projection.onnx")
    }

    pub fn moondream2_2b_text_decoder() -> Self {
        Self::moondream2_0_5b_text_decoder()
            .with_model_scale(crate::Scale::Billion(2.))
            .with_model_file("2b-text-decoder.onnx")
    }

    pub fn moondream2_2b_text_encoder() -> Self {
        Self::moondream2_0_5b_text_encoder()
            .with_model_scale(crate::Scale::Billion(2.))
            .with_model_file("2b-text-encoder.onnx")
    }

    pub fn moondream2_2b_coord_encoder() -> Self {
        Self::moondream2_0_5b_coord_encoder()
            .with_model_scale(crate::Scale::Billion(2.))
            .with_model_file("2b-coord-encoder.onnx")
    }

    pub fn moondream2_2b_coord_decoder() -> Self {
        Self::moondream2_0_5b_coord_decoder()
            .with_model_scale(crate::Scale::Billion(2.))
            .with_model_file("2b-coord-decoder.onnx")
    }

    pub fn moondream2_2b_size_encoder() -> Self {
        Self::moondream2_0_5b_size_encoder()
            .with_model_scale(crate::Scale::Billion(2.))
            .with_model_file("2b-size-encoder.onnx")
    }

    pub fn moondream2_2b_size_decoder() -> Self {
        Self::moondream2_0_5b_size_decoder()
            .with_model_scale(crate::Scale::Billion(2.))
            .with_model_file("2b-size-decoder.onnx")
    }
}
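Each getter returns a plain `Options` value, so the 2b configs compose by overriding only the scale and file name, and the example's dtype/device flags slot in through the same builder; mirroring `examples/moondream2/main.rs`:

```rust
let opts = usls::Options::moondream2_2b_vision_encoder()
    .with_model_dtype("i8".try_into()?)
    .with_model_device("cuda:0".try_into()?)
    .commit()?;
```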
src/models/moondream2/impl.rs (new file, 645 lines)
@@ -0,0 +1,645 @@
use aksr::Builder;
use anyhow::{Context, Result};
use image::{DynamicImage, GenericImageView};
use ndarray::{s, Array, Array2, Array3, Axis, IxDyn};
use ndarray_npy::ReadNpyExt;

use crate::{
    BaseModelTextual, Bbox, DType, Engine, Hub, Keypoint, LogitsSampler, Options, Processor, Scale,
    Task, Ts, Xs, Ys, X, Y,
};

#[derive(Builder, Debug)]
pub struct Moondream2 {
    vision_encoder: VisionEncoder,
    vision_projection: VisionProjection,
    pub text_decoder: BaseModelTextual,
    text_encoder: BaseModelTextual,
    coord_decoder: Option<BaseModelTextual>,
    coord_encoder: Option<BaseModelTextual>,
    size_decoder: Option<BaseModelTextual>,
    size_encoder: Option<BaseModelTextual>,
    initial_kv_cache: X, // TODO: use f16
    scale: Scale,
    dtype: DType,
    max_length: usize,
    eos_token_id: u32,
    max_objects: usize,
}

impl Moondream2 {
    // TODO
    #[allow(clippy::too_many_arguments)]
    pub fn new(
        options_vision_encoder: Options,
        options_vision_projection: Options,
        options_text_encoder: Options,
        options_text_decoder: Options,
        options_coord_encoder: Option<Options>,
        options_coord_decoder: Option<Options>,
        options_size_encoder: Option<Options>,
        options_size_decoder: Option<Options>,
    ) -> Result<Self> {
        let max_length = 2048;
        let max_objects = 50;
        let eos_token_id = 50256;
        let dtype = options_vision_encoder.model_dtype;
        let scale = options_vision_encoder
            .model_scale
            .unwrap_or(Scale::Billion(0.5));
        let initial_kv_cache: X = KVCache::new(&scale, &dtype)?.0.into();
        let vision_encoder = VisionEncoder::new(options_vision_encoder)?;
        let vision_projection = VisionProjection::new(options_vision_projection)?;
        let text_decoder = BaseModelTextual::new(options_text_decoder)?;
        let text_encoder = BaseModelTextual::new(options_text_encoder)?;
        let coord_decoder = options_coord_decoder
            .map(BaseModelTextual::new)
            .transpose()?;
        let coord_encoder = options_coord_encoder
            .map(BaseModelTextual::new)
            .transpose()?;
        let size_decoder = options_size_decoder
            .map(BaseModelTextual::new)
            .transpose()?;
        let size_encoder = options_size_encoder
            .map(BaseModelTextual::new)
            .transpose()?;

        Ok(Self {
            vision_encoder,
            vision_projection,
            text_decoder,
            initial_kv_cache,
            max_length,
            max_objects,
            text_encoder,
            coord_decoder,
            coord_encoder,
            size_encoder,
            size_decoder,
            eos_token_id,
            scale,
            dtype,
        })
    }
    pub fn encode_image(&mut self, x: &DynamicImage) -> Result<X> {
        let patches_emb = self.vision_encoder.encode(x)?.clone().insert_axis(0)?;
        let image_embedding = self.vision_projection.inference(patches_emb.into())?[0].to_owned();

        Ok(image_embedding)
    }

    pub fn forward(&mut self, xs: &[DynamicImage], task: &Task) -> Result<Ys> {
        let mut ys: Vec<Y> = Vec::new();
        for x in xs.iter() {
            let y = self.forward_once(x, task)?;
            ys.push(y);
        }

        Ok(ys.into())
    }

    pub fn forward_once(&mut self, images: &DynamicImage, task: &Task) -> Result<Y> {
        let image_embedding = self.encode_image(images)?;
        let kv_cache = self.prepare_kv_cache(&image_embedding)?;

        match task {
            Task::Caption(n) => {
                let input_ids = match n {
                    0 => vec![198., 198., 16438., 8305., 25.],
                    _ => vec![198., 198., 24334., 1159., 25.],
                };
                let text = self.generate_text(&input_ids, kv_cache)?;
                let y = Y::default().with_texts(&[text.into()]);

                Ok(y)
            }
            Task::Vqa(query) => {
                let input_ids: Vec<_> = [198., 198., 24361., 25.]
                    .iter()
                    .chain(
                        &self
                            .text_encoder
                            .processor()
                            .encode_text_ids(query, false)?,
                    )
                    .chain(&[198., 198., 33706., 25.])
                    .cloned()
                    .collect();

                let text = self.generate_text(&input_ids, kv_cache)?;
                let y = Y::default().with_texts(&[text.into()]);

                Ok(y)
            }
            Task::OpenSetDetection(object) => {
                let input_ids: Vec<_> = [198., 198., 47504., 25.]
                    .iter()
                    .chain(
                        &self
                            .text_encoder
                            .processor()
                            .encode_text_ids(&format!(" {}", object), false)?,
                    )
                    .chain(&[628.])
                    .cloned()
                    .collect();
                let (_, y_bboxes) =
                    self.generate_points_boxes(&input_ids, kv_cache, object, true)?;

                Ok(Y::default().with_bboxes(&y_bboxes))
            }
            Task::OpenSetKeypointsDetection(object) => {
                let input_ids: Vec<_> = [198., 198., 12727., 25.]
                    .iter()
                    .chain(
                        &self
                            .text_encoder
                            .processor()
                            .encode_text_ids(&format!(" {}", object), false)?,
                    )
                    .chain(&[628.])
                    .cloned()
                    .collect();
                let (y_kpts, _) =
                    self.generate_points_boxes(&input_ids, kv_cache, object, false)?;

                Ok(Y::default().with_keypoints(&y_kpts))
            }
            x => anyhow::bail!("Unsupported Moondream2 task: {}", x),
        }
    }
    fn generate_text(&mut self, input_ids: &[f32], kv_cache: Array<f32, IxDyn>) -> Result<String> {
        let input_ids = X::from(input_ids.to_vec()).insert_axis(0)?;
        let mut input_embeds = self.text_encoder.inference(Xs::from(input_ids))?[0].to_owned();
        let logits_sampler = LogitsSampler::new();
        let mut token_ids: Vec<u32> = Vec::new();
        let mut pos = self.vision_projection.seq_len() + self.initial_kv_cache.shape()[4];
        let mut inc = input_embeds.shape()[1];
        let mut kv_cache = kv_cache.clone();

        // generate
        for _ in 0..self.max_length {
            // TODO
            let input = Xs::from(vec![
                input_embeds.clone(),
                kv_cache
                    .slice(s![.., .., .., .., ..pos, ..])
                    .into_owned()
                    .into_dyn()
                    .into(),
            ]);
            let decoder_outputs = self.text_decoder.inference(input)?;

            // update
            let logits = &decoder_outputs["logits"];
            let new_kv_cache = &decoder_outputs["new_kv_cache"];
            kv_cache
                .slice_mut(s![.., .., .., .., pos..pos + inc, ..])
                .assign(new_kv_cache);
            pos += inc;

            // decode
            let token_id = logits_sampler.decode(
                logits
                    .slice(s![-1, ..])
                    .as_slice()
                    .context("Failed to get slice when decode `logits`")?,
            )?;

            // break
            if token_id == self.eos_token_id {
                break;
            }

            // update
            token_ids.push(token_id);
            inc = 1;

            // encode
            let next_tokens = X::from(vec![token_id as f32]).insert_axis(1)?;
            input_embeds = self.text_encoder.inference(Xs::from(next_tokens))?[0].to_owned();
        }

        let text = self
            .text_encoder
            .processor()
            .decode_tokens(&token_ids, true)?;

        Ok(text)
    }
    fn generate_points_boxes(
        &mut self,
        input_ids: &[f32],
        kv_cache: Array<f32, IxDyn>,
        object: &str,
        generate_boxes: bool,
    ) -> Result<(Vec<Vec<Keypoint>>, Vec<Bbox>)> {
        let mut y_bboxes: Vec<Bbox> = Vec::new();
        let mut y_kpts: Vec<Vec<Keypoint>> = Vec::new();
        let (image_height, image_width) = self.vision_encoder.processor.image0s_size[0];
        let mut pos = self.vision_projection.seq_len() + self.initial_kv_cache.shape()[4];
        let logits_sampler = LogitsSampler::new();

        // initial input_embeds
        let input_ids = X::from(input_ids.to_vec()).insert_axis(0)?;
        let mut hidden = self.text_encoder.inference(Xs::from(input_ids))?[0].to_owned();
        let mut kv_cache = kv_cache;

        // generate
        loop {
            let logits = self.run_decoder(&mut hidden, &mut kv_cache, &mut pos)?;

            // decode
            let token_id = logits_sampler.decode(
                logits
                    .slice(s![-1, ..])
                    .as_slice()
                    .context("Failed to get slice for `logits`")?,
            )?;

            // break
            if token_id == self.eos_token_id {
                break;
            }

            // cx
            let input: X = hidden.slice(s![0, -1, ..]).into_owned().into_dyn().into();
            let cx = self
                .coord_decoder
                .as_mut()
                .unwrap()
                .inference(Xs::from(input))?[0]
                .clone(); // [1024]
            let ratio = cx.shape()[0] as f32;
            let cx = logits_sampler
                .decode(cx.as_slice().context("Failed to get slice for `cx`")?)?
                as f32
                / ratio;
            hidden = self
                .coord_encoder
                .as_mut()
                .unwrap()
                .inference(Xs::from(X::from(vec![cx])))?[0]
                .clone()
                .insert_axis(0)?
                .insert_axis(0)?;

            // cy
            let _logits = self.run_decoder(&mut hidden, &mut kv_cache, &mut pos)?;
            let input: X = hidden.slice(s![0, -1, ..]).into_owned().into_dyn().into();
            let cy = self
                .coord_decoder
                .as_mut()
                .unwrap()
                .inference(Xs::from(input))?[0]
                .clone();
            let ratio = cy.shape()[0] as f32;

            let cy = logits_sampler
                .decode(cy.as_slice().context("Failed to get slice for `cy`")?)?
                as f32
                / ratio;

            hidden = self
                .coord_encoder
                .as_mut()
                .unwrap()
                .inference(Xs::from(X::from(vec![cy])))?[0]
                .clone()
                .insert_axis(0)?
                .insert_axis(0)?;

            if !generate_boxes {
                y_kpts.push(vec![Keypoint::from((
                    cx * image_width as f32,
                    cy * image_height as f32,
                    0,
                ))
                .with_name(object)]);

                // keep?
                if y_kpts.len() > self.max_objects {
                    break;
                }
            } else {
                // wh
                let _logits = self.run_decoder(&mut hidden, &mut kv_cache, &mut pos)?;
                let input: X = hidden.slice(s![0, -1, ..]).into_owned().into_dyn().into();
                let size = self
                    .size_decoder
                    .as_mut()
                    .unwrap()
                    .inference(Xs::from(input))?[0]
                    .clone(); // [2, 1024]

                let ratio = size.shape()[1] as f32;
                let w = logits_sampler.decode(
                    size.slice(s![0, ..])
                        .as_slice()
                        .context("Failed to get slice when decode `w`")?,
                )? as f32
                    / ratio;

                // h
                let h = logits_sampler.decode(
                    size.slice(s![1, ..])
                        .as_slice()
                        .context("Failed to get slice when decode `h`")?,
                )? as f32
                    / ratio;

                hidden = self
                    .size_encoder
                    .as_mut()
                    .unwrap()
                    .inference(Xs::from(X::from(vec![w, h])))?[0]
                    .clone()
                    .insert_axis(0)?
                    .insert_axis(0)?; // [1024]

                let xmin = cx - w / 2.;
                let ymin = cy - h / 2.;

                y_bboxes.push(
                    Bbox::from((
                        xmin * image_width as f32,
                        ymin * image_height as f32,
                        w * image_width as f32,
                        h * image_height as f32,
                    ))
                    .with_name(object)
                    .with_id(0)
                    .with_confidence(1.),
                );

                // Keep?
                if y_bboxes.len() > self.max_objects {
                    break;
                }
            }
        }

        Ok((y_kpts, y_bboxes))
    }
    fn prepare_kv_cache(&mut self, image_embedding: &X) -> Result<Array<f32, IxDyn>> {
        let kv_cache_new = self.text_decoder.inference(Xs::from(vec![
            image_embedding.clone(),
            self.initial_kv_cache.clone(),
        ]))?["new_kv_cache"]
            .to_owned();

        // TODO
        let kv_cache_new = ndarray::concatenate(
            Axis(4),
            &[kv_cache_new.view(), self.initial_kv_cache.view()],
        )?;

        // fill with max sequence length
        let mut shapes = self.initial_kv_cache.shape().to_vec();
        shapes[4] = self.max_length;
        let mut kv_cache = Array::zeros(shapes);
        kv_cache
            .slice_mut(s![.., .., .., .., ..kv_cache_new.dim()[4], ..])
            .assign(&kv_cache_new);

        Ok(kv_cache.into_dyn())
    }

    fn run_decoder(
        &mut self,
        input_embeds: &mut X,
        kv_cache: &mut Array<f32, IxDyn>,
        pos: &mut usize,
    ) -> Result<X> {
        let decoder_outputs = self.text_decoder.inference(Xs::from(vec![
            input_embeds.clone(),
            kv_cache
                .slice(s![.., .., .., .., ..*pos, ..])
                .into_owned()
                .into_dyn()
                .into(),
        ]))?;
        let hidden = &decoder_outputs["hidden"];
        let new_kv_cache = &decoder_outputs["new_kv_cache"];

        // update
        let inc = hidden.shape()[1]; // -2
        kv_cache
            .slice_mut(s![.., .., .., .., *pos..*pos + inc, ..])
            .assign(new_kv_cache);
        *pos += inc;
        *input_embeds = hidden.to_owned();

        Ok(decoder_outputs["logits"].to_owned())
    }
}
#[derive(Debug, Builder)]
pub struct VisionEncoder {
    engine: Engine,
    num_patch: usize,
    patch_size: usize,
    processor: Processor,
    ts: Ts,
}

impl VisionEncoder {
    pub fn new(options: Options) -> Result<Self> {
        let engine = options.to_engine()?;
        let (num_patch, patch_size, ts) = (
            engine.batch().opt(),
            engine.try_height().unwrap_or(&378.into()).opt(),
            engine.ts.clone(),
        );
        let processor = options
            .to_processor()?
            .with_image_width(patch_size as _)
            .with_image_height(patch_size as _);

        Ok(Self {
            engine,
            patch_size,
            num_patch,
            processor,
            ts,
        })
    }

    fn create_patches(
        image: &DynamicImage,
        image_patch_size: usize,
    ) -> (Vec<DynamicImage>, (u32, u32)) {
        let mut patches = vec![image.clone()];
        let image = image.to_rgb8();

        let res_templates = vec![(1, 2), (2, 1), (2, 2)];
        let (im_width, im_height) = image.dimensions();
        let max_dim = im_width.max(im_height);
        let selected_template = if max_dim < (image_patch_size as f32 * 1.4) as u32 {
            (1, 1)
        } else {
            let aspect_ratio = im_width as f32 / im_height as f32;
            res_templates
                .into_iter()
                .min_by(|a, b| {
                    let diff_a = ((a.1 as f32 / a.0 as f32) - aspect_ratio).abs();
                    let diff_b = ((b.1 as f32 / b.0 as f32) - aspect_ratio).abs();
                    diff_a.partial_cmp(&diff_b).unwrap()
                })
                .unwrap()
        };
        let patch_width = im_width / selected_template.1;
        let patch_height = im_height / selected_template.0;

        for row in 0..selected_template.0 {
            for col in 0..selected_template.1 {
                let x_min = col * patch_width;
                let y_min = row * patch_height;
                let _x_max = x_min + patch_width;
                let _y_max = y_min + patch_height;
                let cropped = image
                    .view(x_min, y_min, patch_width, patch_height)
                    .to_image();

                patches.push(DynamicImage::from(cropped));
            }
        }

        (patches, selected_template)
    }

    pub fn inference(&mut self, xs: Xs) -> Result<Xs> {
        self.engine.run(xs)
    }

    pub fn encode(&mut self, x: &DynamicImage) -> Result<X> {
        let (patches, selected_template) = Self::create_patches(x, self.patch_size);
        let patches = self.processor.process_images(&patches)?;
        let template = (
            (selected_template.0 as usize),
            (selected_template.1 as usize),
        );
        let patch_emb = self.inference(patches.clone().into())?[0].clone();
        let patch_emb = patch_emb.clone().0.into_dimensionality::<ndarray::Ix3>()?;
        let patch_emb = Self::process_patch_emb(patch_emb, template)?;
        let patch_emb = X::from(patch_emb.into_dyn()); // TODO .insert_axis(x),

        Ok(patch_emb)
    }

    fn process_patch_emb(patch_emb: Array3<f32>, template: (usize, usize)) -> Result<Array2<f32>> {
        let (_, seq_len, enc_dim) = patch_emb.dim(); // (N, 729, 720)
        let global_patch = patch_emb.slice(s![0, .., ..]).into_owned();
        if template == (1, 1) {
            Ok(ndarray::concatenate(
                Axis(1),
                &[global_patch.view(), global_patch.view()],
            )?)
        } else {
            let w = (seq_len as f32).sqrt() as usize;
            let mut rows = Vec::new();
            for r in 0..template.0 {
                let mut row = Vec::new();
                for c in 0..template.1 {
                    let idx = r * template.1 + c;
                    let patch = patch_emb.slice(s![idx, .., ..]).into_owned();
                    let patch = patch.into_shape_with_order((w, w, enc_dim))?;
                    row.push(patch);
                }
                let row_concat = ndarray::concatenate(
                    Axis(1),
                    &row.iter().map(|x| x.view()).collect::<Vec<_>>(),
                )?;
                rows.push(row_concat);
            }

            let patch_emb =
                ndarray::concatenate(Axis(0), &rows.iter().map(|x| x.view()).collect::<Vec<_>>())?;
            let patch_emb = Self::adaptive_avg_pool2d(patch_emb, (w, w))
                .into_shape_with_order((w * w, enc_dim))?;

            Ok(ndarray::concatenate(
                Axis(1),
                &[global_patch.view(), patch_emb.view()],
            )?)
        }
    }

    fn adaptive_avg_pool2d(x: Array3<f32>, output_size: (usize, usize)) -> Array3<f32> {
        let (height, width, channels) = x.dim();
        let (out_height, out_width) = output_size;
        let stride_h = height / out_height;
        let stride_w = width / out_width;
        let kernel_h = height - (out_height - 1) * stride_h;
        let kernel_w = width - (out_width - 1) * stride_w;
        let mut output = Array3::zeros((out_height, out_width, channels));
        for i in 0..out_height {
            for j in 0..out_width {
                let h_start = i * stride_h;
                let h_end = h_start + kernel_h;
                let w_start = j * stride_w;
                let w_end = w_start + kernel_w;

                for c in 0..channels {
                    let mut sum = 0.0;
                    let mut count = 0;

                    for h in h_start..h_end {
                        for w in w_start..w_end {
                            if h < height && w < width {
                                sum += x[(h, w, c)];
                                count += 1;
                            }
                        }
                    }
                    output[(i, j, c)] = sum / count as f32;
                }
            }
        }

        output
    }
}
#[derive(Debug, Builder)]
pub struct VisionProjection {
    engine: Engine,
    seq_len: usize,
    ts: Ts,
}

impl VisionProjection {
    pub fn new(options: Options) -> Result<Self> {
        let engine = options.to_engine()?;
        let (seq_len, ts) = (engine.inputs_minoptmax[0][1].opt(), engine.ts.clone());

        Ok(Self {
            engine,
            seq_len,
            ts,
        })
    }

    pub fn inference(&mut self, xs: Xs) -> Result<Xs> {
        self.engine.run(xs)
    }
}

#[derive(Builder, Debug)]
struct KVCache(pub Array<f32, IxDyn>);

impl KVCache {
    pub fn new(scale: &Scale, dtype: &DType) -> Result<Self> {
        let f = format!("moondream2/{}-initial-kv-cache-{}.npy", scale, dtype);
        let f = Hub::default().try_fetch(&f)?;
        let file = std::fs::File::open(f)?;
        let x = Array::<f32, IxDyn>::read_npy(file)?.into_dyn();

        Ok(Self(x))
    }
}
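Putting the pieces together, the flow is: encode image patches, project them, seed the KV cache, then autoregressively decode text or coordinates. A condensed driver using the same public calls as the example (0.5b getters, default dtype/device) might read:

```rust
use usls::{models::Moondream2, DataLoader, Options, Task};

fn run() -> anyhow::Result<()> {
    let mut model = Moondream2::new(
        Options::moondream2_0_5b_vision_encoder().commit()?,
        Options::moondream2_0_5b_vision_projection().commit()?,
        Options::moondream2_0_5b_text_encoder().commit()?,
        Options::moondream2_0_5b_text_decoder().commit()?,
        Some(Options::moondream2_0_5b_coord_encoder().commit()?),
        Some(Options::moondream2_0_5b_coord_decoder().commit()?),
        Some(Options::moondream2_0_5b_size_encoder().commit()?),
        Some(Options::moondream2_0_5b_size_decoder().commit()?),
    )?;
    let xs = DataLoader::try_read_batch(&["./assets/bus.jpg"])?;
    let task: Task = "open-od:person".try_into()?;
    let ys = model.forward(&xs, &task)?;
    println!("{:?}", ys);
    Ok(())
}
```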
src/models/moondream2/mod.rs (new file, 4 lines)
@@ -0,0 +1,4 @@
mod config;
mod r#impl;

pub use r#impl::Moondream2;
@@ -59,8 +59,8 @@ impl YOLO {
             .to_processor()?
             .with_image_width(width as _)
             .with_image_height(height as _);
-        let task: Option<Task> = match options.model_task {
-            Some(task) => Some(task),
+        let task: Option<Task> = match &options.model_task {
+            Some(task) => Some(task.clone()),
             None => match engine.try_fetch("task") {
                 Some(x) => match x.as_str() {
                     "classify" => Some(Task::ImageClassification),
@@ -104,7 +104,7 @@ impl YOLO {
             // version + task
             None => match (task, version) {
                 (Some(task), Some(version)) => {
-                    let layout = match (task, version) {
+                    let layout = match (task.clone(), version) {
                         (Task::ImageClassification, Version(5, 0)) => {
                             YOLOPredsFormat::n_clss().apply_softmax(true)
                         }
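Both YOLO hunks are fallout from `Task` losing `Copy` earlier in the commit: a `Task` now owns `String` data, so it must be borrowed and cloned instead of implicitly copied. The pattern in isolation:

```rust
use usls::Task;

// Task is Clone but no longer Copy, so clone through the reference
fn take_task(model_task: &Option<Task>) -> Option<Task> {
    model_task.clone()
}
```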