Options -> Config

Jamjamjon committed 2025-05-20 17:14:12 +08:00 (committed by GitHub)
128 changed files with 6624 additions and 2741 deletions
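The change is mechanical for single-engine models: the `Options` builder type becomes `Config`, and the `with_model_*` / `commit()` surface carries over. Multi-part models (encoder/decoder pairs, visual/textual towers) additionally collapse from one `Options` per part into one `Config` with broadcast `*_all` setters. A minimal sketch of the single-engine case, assuming only the names visible in the example diffs below:

```rust
use usls::{models::RMBG, Config};

fn build(dtype: &str, device: &str) -> anyhow::Result<RMBG> {
    // beta.1: Options::rmbg2_0().with_model_dtype(..).with_model_device(..).commit()?
    // beta.2: identical chain, renamed entry type:
    let config = Config::rmbg2_0()
        .with_model_dtype(dtype.try_into()?)
        .with_model_device(device.try_into()?)
        .commit()?;
    Ok(RMBG::new(config)?)
}
```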

View File

@@ -1,7 +1,7 @@
 [package]
 name = "usls"
 edition = "2021"
-version = "0.1.0-beta.1"
+version = "0.1.0-beta.2"
 rust-version = "1.82"
 description = "A Rust library integrated with ONNXRuntime, providing a collection of ML models."
 repository = "https://github.com/jamjamjon/usls"
@@ -44,6 +44,7 @@ ort = { version = "2.0.0-rc.9", default-features = false, optional = true , feat
     "half"
 ]}
 tokenizers = { version = "0.21.1" }
+paste = "1.0.15"

 [build-dependencies]
 prost-build = "0.13.5"
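The only new dependency is `paste`, a macro helper for concatenating identifiers, which suggests the repetitive `Config` setters introduced in this release (`with_model_dtype`, `with_device_all`, and the rest) are now macro-generated. A hypothetical sketch of the usual `paste` pattern for this; the actual macro in usls is not shown in this diff:

```rust
// Hypothetical: derive `with_<field>` builder methods from a field list.
macro_rules! impl_with {
    ($($field:ident: $ty:ty),* $(,)?) => {
        paste::paste! {
            $(
                pub fn [<with_ $field>](mut self, value: $ty) -> Self {
                    self.$field = value;
                    self
                }
            )*
        }
    };
}
// Invoked inside `impl Config { ... }`, e.g.:
// impl_with!(batch_size: usize, num_dry_run: usize);
```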

View File

@@ -116,7 +116,8 @@
 | [Moondream2](https://github.com/vikhyat/moondream/tree/main) | Open-Set Object Detection<br />Open-Set Keypoints Detection<br />Image Caption<br />Visual Question Answering | [demo](examples/moondream2) | ✅ | ✅ | ✅ | | |
 | [OWLv2](https://huggingface.co/google/owlv2-base-patch16-ensemble) | Open-Set Object Detection | [demo](examples/owlv2) | ✅ | ✅ | ✅ | | |
 | [SmolVLM(256M, 500M)](https://huggingface.co/HuggingFaceTB/SmolVLM-256M-Instruct) | Visual Question Answering | [demo](examples/smolvlm) | ✅ | ✅ | ✅ | | |
-| [RMBG(1.4, 2.0)](https://huggingface.co/briaai/RMBG-2.0) | Image Segmentation Answering | [demo](examples/rmbg) | ✅ | ✅ | ✅ | | |
+| [RMBG(1.4, 2.0)](https://huggingface.co/briaai/RMBG-2.0) | Image Segmentation<br />Background Removal | [demo](examples/rmbg) | ✅ | ✅ | ✅ | | |
+| [BEN2](https://huggingface.co/PramaLLC/BEN2) | Image Segmentation<br />Background Removal | [demo](examples/rmbg) | ✅ | ✅ | ✅ | | |
 </details>

View File

@@ -1,4 +1,4 @@
-use usls::{models::RMBG, Annotator, DataLoader, Options};
+use usls::{models::RMBG, Annotator, Config, DataLoader};

 #[derive(argh::FromArgs)]
 /// Example
@@ -20,11 +20,11 @@ fn main() -> anyhow::Result<()> {
     let args: Args = argh::from_env();

     // build model
-    let options = Options::ben2_base()
+    let config = Config::ben2_base()
         .with_model_dtype(args.dtype.as_str().try_into()?)
         .with_model_device(args.device.as_str().try_into()?)
         .commit()?;
-    let mut model = RMBG::new(options)?;
+    let mut model = RMBG::new(config)?;

     // load image
     let xs = DataLoader::try_read_n(&["./assets/cat.png"])?;

View File

@ -1,4 +1,4 @@
use usls::{models::Blip, DataLoader, Options}; use usls::{models::Blip, Config, DataLoader};
#[derive(argh::FromArgs)] #[derive(argh::FromArgs)]
/// BLIP Example /// BLIP Example
@ -20,13 +20,10 @@ fn main() -> anyhow::Result<()> {
let args: Args = argh::from_env(); let args: Args = argh::from_env();
// build model // build model
let options_visual = Options::blip_v1_base_caption_visual() let config = Config::blip_v1_base_caption()
.with_model_device(args.device.as_str().try_into()?) .with_device_all(args.device.as_str().try_into()?)
.commit()?; .commit()?;
let options_textual = Options::blip_v1_base_caption_textual() let mut model = Blip::new(config)?;
.with_model_device(args.device.as_str().try_into()?)
.commit()?;
let mut model = Blip::new(options_visual, options_textual)?;
// image caption // image caption
let xs = DataLoader::try_read_n(&args.source)?; let xs = DataLoader::try_read_n(&args.source)?;
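The BLIP diff is the first to show the other half of the migration: models built from several parts (here a visual and a textual engine) previously took one committed `Options` per part, and now take a single `Config` whose `*_all` setters broadcast one value to every part. The same consolidation repeats below for CLIP, Florence2, Moondream2, SAM, SAM2, SmolVLM, and TrOCR. A sketch of the apparent semantics, inferred from these diffs rather than from crate documentation:

```rust
use usls::{models::Blip, Config};

fn build_blip(device: &str) -> anyhow::Result<Blip> {
    let config = Config::blip_v1_base_caption()
        // one call configures both the visual and textual engines;
        // with_model_device would presumably target only the primary one
        .with_device_all(device.try_into()?)
        .commit()?;
    Ok(Blip::new(config)?) // beta.1 signature was Blip::new(options_visual, options_textual)
}
```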

View File

@@ -1,4 +1,4 @@
-use usls::{models::ImageClassifier, Annotator, DataLoader, Options};
+use usls::{models::ImageClassifier, Annotator, Config, DataLoader};

 #[derive(argh::FromArgs)]
 /// Example
@@ -12,7 +12,7 @@ struct Args {
     device: String,

     /// model name
-    #[argh(option, default = "String::from(\"beit\")")]
+    #[argh(option, default = "String::from(\"mobileone\")")]
     model: String,

     /// source image
@@ -36,20 +36,20 @@ fn main() -> anyhow::Result<()> {
     let args: Args = argh::from_env();

     // build model
-    let options = match args.model.to_lowercase().as_str() {
-        "beit" => Options::beit_base(),
-        "convnext" => Options::convnext_v2_atto(),
-        "deit" => Options::deit_tiny_distill(),
-        "fastvit" => Options::fastvit_t8_distill(),
-        "mobileone" => Options::mobileone_s0(),
+    let config = match args.model.to_lowercase().as_str() {
+        "beit" => Config::beit_base(),
+        "convnext" => Config::convnext_v2_atto(),
+        "deit" => Config::deit_tiny_distill(),
+        "fastvit" => Config::fastvit_t8_distill(),
+        "mobileone" => Config::mobileone_s0(),
         _ => anyhow::bail!("Unsupported model: {}", args.model),
     };

-    let options = options
+    let config = config
         .with_model_dtype(args.dtype.as_str().try_into()?)
         .with_model_device(args.device.as_str().try_into()?)
         .commit()?;
-    let mut model = ImageClassifier::try_from(options)?;
+    let mut model = ImageClassifier::try_from(config)?;

     // load images
     let xs = DataLoader::try_read_n(&args.source)?;

View File

@@ -1,5 +1,5 @@
 use anyhow::Result;
-use usls::{models::Clip, DataLoader, Ops, Options};
+use usls::{models::Clip, Config, DataLoader, Ops};

 #[derive(argh::FromArgs)]
 /// CLIP Example
@@ -14,18 +14,13 @@ fn main() -> Result<()> {
         .with_env_filter(tracing_subscriber::EnvFilter::from_default_env())
         .with_timer(tracing_subscriber::fmt::time::ChronoLocal::rfc_3339())
         .init();

     let args: Args = argh::from_env();

     // build model
-    let options_visual = Options::jina_clip_v1_visual()
-        // clip_vit_b32_visual()
-        .with_model_device(args.device.as_str().try_into()?)
-        .commit()?;
-    let options_textual = Options::jina_clip_v1_textual()
-        // clip_vit_b32_textual()
-        .with_model_device(args.device.as_str().try_into()?)
-        .commit()?;
-    let mut model = Clip::new(options_visual, options_textual)?;
+    let config = Config::jina_clip_v1()
+        .with_device_all(args.device.as_str().try_into()?)
+        .commit()?;
+    let mut model = Clip::new(config)?;

     // texts
     let texts = vec![

View File

@@ -1,5 +1,5 @@
 use anyhow::Result;
-use usls::{models::RTDETR, Annotator, DataLoader, Options};
+use usls::{models::RTDETR, Annotator, Config, DataLoader};

 fn main() -> Result<()> {
     tracing_subscriber::fmt()
@@ -7,9 +7,8 @@ fn main() -> Result<()> {
         .with_timer(tracing_subscriber::fmt::time::ChronoLocal::rfc_3339())
         .init();

-    // options
-    let options = Options::d_fine_n_coco().commit()?;
-    let mut model = RTDETR::new(options)?;
+    // config
+    let mut model = RTDETR::new(Config::d_fine_n_coco().commit()?)?;

     // load
     let xs = DataLoader::try_read_n(&["./assets/bus.jpg"])?;

View File

@@ -1,5 +1,5 @@
 use anyhow::Result;
-use usls::{models::DB, Annotator, DataLoader, Options, Style};
+use usls::{models::DB, Annotator, Config, DataLoader, Style};

 #[derive(argh::FromArgs)]
 /// Example
@@ -41,15 +41,13 @@ fn main() -> Result<()> {
     let args: Args = argh::from_env();

     // build model
-    let options = match &args.model {
-        Some(m) => Options::db().with_model_file(m),
-        None => Options::ppocr_det_v4_ch().with_model_dtype(args.dtype.as_str().try_into()?),
-    };
-    let mut model = DB::new(
-        options
-            .with_model_device(args.device.as_str().try_into()?)
-            .commit()?,
-    )?;
+    let config = match &args.model {
+        Some(m) => Config::db().with_model_file(m),
+        None => Config::ppocr_det_v4_ch().with_model_dtype(args.dtype.as_str().try_into()?),
+    }
+    .with_device_all(args.device.as_str().try_into()?)
+    .commit()?;
+    let mut model = DB::new(config)?;

     // load image
     let xs = DataLoader::try_read_n(&[

View File

@@ -1,5 +1,5 @@
 use anyhow::Result;
-use usls::{models::RTDETR, Annotator, DataLoader, Options};
+use usls::{models::RTDETR, Annotator, Config, DataLoader};

 fn main() -> Result<()> {
     tracing_subscriber::fmt()
@@ -7,9 +7,8 @@ fn main() -> Result<()> {
         .with_timer(tracing_subscriber::fmt::time::ChronoLocal::rfc_3339())
         .init();

-    // options
-    let options = Options::deim_dfine_s_coco().commit()?;
-    let mut model = RTDETR::new(options)?;
+    // config
+    let mut model = RTDETR::new(Config::deim_dfine_s_coco().commit()?)?;

     // load
     let xs = DataLoader::try_read_n(&["./assets/bus.jpg"])?;

View File

@@ -1,5 +1,5 @@
 use anyhow::Result;
-use usls::{models::DepthAnything, Annotator, DataLoader, Options, Style};
+use usls::{models::DepthAnything, Annotator, Config, DataLoader, Style};

 fn main() -> Result<()> {
     tracing_subscriber::fmt()
@@ -8,8 +8,7 @@ fn main() -> Result<()> {
         .init();

     // build model
-    let options = Options::depth_anything_v2_small().commit()?;
-    let mut model = DepthAnything::new(options)?;
+    let mut model = DepthAnything::new(Config::depth_anything_v2_small().commit()?)?;

     // load
     let xs = DataLoader::try_read_n(&["images/street.jpg"])?;

View File

@@ -1,6 +1,6 @@
 use anyhow::Result;
 use usls::DataLoader;
-use usls::{models::DepthPro, Annotator, Options, Style};
+use usls::{models::DepthPro, Annotator, Config, Style};

 #[derive(argh::FromArgs)]
 /// Example
@@ -23,11 +23,12 @@ fn main() -> Result<()> {
     let args: Args = argh::from_env();

     // model
-    let options = Options::depth_pro()
+    let config = Config::depth_pro()
         .with_model_dtype(args.dtype.as_str().try_into()?)
         .with_model_device(args.device.as_str().try_into()?)
         .commit()?;
-    let mut model = DepthPro::new(options)?;
+
+    let mut model = DepthPro::new(config)?;

     // load
     let xs = DataLoader::try_read_n(&["images/street.jpg"])?;

View File

@@ -1,5 +1,5 @@
 use anyhow::Result;
-use usls::{models::DINOv2, DataLoader, Options};
+use usls::{models::DINOv2, Config, DataLoader};

 fn main() -> Result<()> {
     tracing_subscriber::fmt()
@@ -11,8 +11,10 @@ fn main() -> Result<()> {
     let xs = DataLoader::try_read_n(&["./assets/bus.jpg", "./assets/bus.jpg"])?;

     // model
-    let options = Options::dinov2_small().with_batch_size(xs.len()).commit()?;
-    let mut model = DINOv2::new(options)?;
+    let config = Config::dinov2_small()
+        .with_batch_size_all(xs.len())
+        .commit()?;
+    let mut model = DINOv2::new(config)?;

     // encode images
     let y = model.encode_images(&xs)?;

View File

@@ -1,5 +1,5 @@
 use anyhow::Result;
-use usls::{models::YOLO, Annotator, DataLoader, Options};
+use usls::{models::YOLO, Annotator, Config, DataLoader};

 #[derive(argh::FromArgs)]
 /// Example
@@ -18,7 +18,7 @@ fn main() -> Result<()> {
     let args: Args = argh::from_env();

     // build model
-    let config = Options::doclayout_yolo_docstructbench()
+    let config = Config::doclayout_yolo_docstructbench()
         .with_model_device(args.device.as_str().try_into()?)
         .commit()?;
     let mut model = YOLO::new(config)?;

View File

@@ -1,5 +1,5 @@
 use anyhow::Result;
-use usls::{models::DB, Annotator, DataLoader, Options, Scale, Style};
+use usls::{models::DB, Annotator, Config, DataLoader, Scale, Style};

 #[derive(argh::FromArgs)]
 /// Example
@@ -26,16 +26,16 @@ fn main() -> Result<()> {
     let args: Args = argh::from_env();

     // build model
-    let options = match args.scale.as_str().try_into()? {
-        Scale::T => Options::fast_tiny(),
-        Scale::S => Options::fast_small(),
-        Scale::B => Options::fast_base(),
+    let config = match args.scale.as_str().try_into()? {
+        Scale::T => Config::fast_tiny(),
+        Scale::S => Config::fast_small(),
+        Scale::B => Config::fast_base(),
         _ => unimplemented!("Unsupported model scale: {:?}. Try b, s, t.", args.scale),
     };
     let mut model = DB::new(
-        options
-            .with_model_dtype(args.dtype.as_str().try_into()?)
-            .with_model_device(args.device.as_str().try_into()?)
+        config
+            .with_dtype_all(args.dtype.as_str().try_into()?)
+            .with_device_all(args.device.as_str().try_into()?)
             .commit()?,
     )?;

View File

@@ -1,5 +1,5 @@
 use anyhow::Result;
-use usls::{models::YOLO, Annotator, DataLoader, Options};
+use usls::{models::YOLO, Annotator, Config, DataLoader};

 #[derive(argh::FromArgs)]
 /// Example
@@ -22,7 +22,7 @@ fn main() -> Result<()> {
     let args: Args = argh::from_env();

     // build model
-    let config = Options::fastsam_s()
+    let config = Config::fastsam_s()
         .with_model_dtype(args.dtype.as_str().try_into()?)
         .with_model_device(args.device.as_str().try_into()?)
         .commit()?;
@@ -45,7 +45,7 @@ fn main() -> Result<()> {
     annotator.annotate(x, y)?.save(format!(
         "{}.jpg",
         usls::Dir::Current
-            .base_dir_with_subs(&["runs", "FastSAM"])?
+            .base_dir_with_subs(&["runs", model.spec()])?
             .join(usls::timestamp(None))
             .display(),
     ))?;

View File

@@ -1,7 +1,7 @@
 ## Quick Start

 ```shell
-cargo run -r -F cuda --example florence2 -- --device cuda --scale base --dtype fp16
+cargo run -r -F cuda --example florence2 -- --device cuda --dtype fp16
 ```

View File

@@ -1,20 +1,16 @@
 use anyhow::Result;
-use usls::{models::Florence2, Annotator, DataLoader, Options, Scale, Style, Task};
+use usls::{models::Florence2, Annotator, Config, DataLoader, Style, Task};

 #[derive(argh::FromArgs)]
 /// Example
 struct Args {
     /// dtype
-    #[argh(option, default = "String::from(\"auto\")")]
+    #[argh(option, default = "String::from(\"fp16\")")]
     dtype: String,

     /// device
     #[argh(option, default = "String::from(\"cpu:0\")")]
     device: String,
-
-    /// scale
-    #[argh(option, default = "String::from(\"base\")")]
-    scale: String,
 }

 fn main() -> Result<()> {
@@ -29,51 +25,12 @@ fn main() -> Result<()> {
     let xs = DataLoader::try_read_n(&["images/green-car.jpg", "assets/bus.jpg"])?;

     // build model
-    let (
-        options_vision_encoder,
-        options_text_embed,
-        options_encoder,
-        options_decoder,
-        options_decoder_merged,
-    ) = match args.scale.as_str().try_into()? {
-        Scale::B => (
-            Options::florence2_visual_encoder_base(),
-            Options::florence2_textual_embed_base(),
-            Options::florence2_texual_encoder_base(),
-            Options::florence2_texual_decoder_base(),
-            Options::florence2_texual_decoder_merged_base(),
-        ),
-        Scale::L => todo!(),
-        _ => anyhow::bail!("Unsupported Florence2 scale."),
-    };
-    let mut model = Florence2::new(
-        options_vision_encoder
-            .with_model_dtype(args.dtype.as_str().try_into()?)
-            .with_model_device(args.device.as_str().try_into()?)
-            .with_batch_size(xs.len())
-            .commit()?,
-        options_text_embed
-            .with_model_dtype(args.dtype.as_str().try_into()?)
-            .with_model_device(args.device.as_str().try_into()?)
-            .with_batch_size(xs.len())
-            .commit()?,
-        options_encoder
-            .with_model_dtype(args.dtype.as_str().try_into()?)
-            .with_model_device(args.device.as_str().try_into()?)
-            .with_batch_size(xs.len())
-            .commit()?,
-        options_decoder
-            .with_model_dtype(args.dtype.as_str().try_into()?)
-            .with_model_device(args.device.as_str().try_into()?)
-            .with_batch_size(xs.len())
-            .commit()?,
-        options_decoder_merged
-            .with_model_dtype(args.dtype.as_str().try_into()?)
-            .with_model_device(args.device.as_str().try_into()?)
-            .with_batch_size(xs.len())
-            .commit()?,
-    )?;
+    let config = Config::florence2_base()
+        .with_dtype_all(args.dtype.as_str().try_into()?)
+        .with_device_all(args.device.as_str().try_into()?)
+        .with_batch_size_all(xs.len())
+        .commit()?;
+    let mut model = Florence2::new(config)?;

     // tasks
     let tasks = [

View File

@@ -1,11 +1,11 @@
 use anyhow::Result;
-use usls::{models::GroundingDINO, Annotator, DataLoader, Options};
+use usls::{models::GroundingDINO, Annotator, Config, DataLoader};

 #[derive(argh::FromArgs)]
 /// Example
 struct Args {
     /// dtype
-    #[argh(option, default = "String::from(\"auto\")")]
+    #[argh(option, default = "String::from(\"fp16\")")]
     dtype: String,

     /// device
@@ -45,7 +45,7 @@ fn main() -> Result<()> {
     let args: Args = argh::from_env();

-    let options = Options::grounding_dino_tiny()
+    let config = Config::grounding_dino_tiny()
         .with_model_dtype(args.dtype.as_str().try_into()?)
         .with_model_device(args.device.as_str().try_into()?)
         .with_text_names(&args.labels.iter().map(|x| x.as_str()).collect::<Vec<_>>())
@@ -53,7 +53,7 @@ fn main() -> Result<()> {
         .with_text_confs(&[0.25])
         .commit()?;
-    let mut model = GroundingDINO::new(options)?;
+    let mut model = GroundingDINO::new(config)?;

     // load images
     let xs = DataLoader::try_read_n(&args.source)?;

View File

@@ -1,6 +1,6 @@
 use anyhow::Result;
 use usls::DataLoader;
-use usls::{models::DB, Annotator, Options, Scale, Style};
+use usls::{models::DB, Annotator, Config, Scale, Style};

 #[derive(argh::FromArgs)]
 /// Example
@@ -27,14 +27,14 @@ fn main() -> Result<()> {
     let args: Args = argh::from_env();

     // build model
-    let options = match args.scale.as_str().try_into()? {
-        Scale::T => Options::linknet_r18(),
-        Scale::S => Options::linknet_r34(),
-        Scale::B => Options::linknet_r50(),
+    let config = match args.scale.as_str().try_into()? {
+        Scale::T => Config::linknet_r18(),
+        Scale::S => Config::linknet_r34(),
+        Scale::B => Config::linknet_r50(),
         _ => unimplemented!("Unsupported model scale: {:?}. Try b, s, t.", args.scale),
     };
     let mut model = DB::new(
-        options
+        config
             .with_model_dtype(args.dtype.as_str().try_into()?)
             .with_model_device(args.device.as_str().try_into()?)
             .commit()?,

View File

@ -1,4 +1,4 @@
use usls::{models::MODNet, Annotator, DataLoader, Options}; use usls::{models::MODNet, Annotator, Config, DataLoader};
fn main() -> anyhow::Result<()> { fn main() -> anyhow::Result<()> {
tracing_subscriber::fmt() tracing_subscriber::fmt()
@ -7,8 +7,7 @@ fn main() -> anyhow::Result<()> {
.init(); .init();
// build model // build model
let options = Options::modnet_photographic().commit()?; let mut model = MODNet::new(Config::modnet_photographic().commit()?)?;
let mut model = MODNet::new(options)?;
// load image // load image
let xs = DataLoader::try_read_n(&["images/liuyifei.png"])?; let xs = DataLoader::try_read_n(&["images/liuyifei.png"])?;

View File

@@ -1,5 +1,5 @@
 use anyhow::Result;
-use usls::{models::Moondream2, Annotator, DataLoader, Options, Scale, Task};
+use usls::{models::Moondream2, Annotator, Config, DataLoader, Scale, Task};

 #[derive(argh::FromArgs)]
 /// Example
@@ -39,81 +39,16 @@ fn main() -> Result<()> {
     let args: Args = argh::from_env();

     // build model
-    let (
-        options_vision_encoder,
-        options_vision_projection,
-        options_text_decoder,
-        options_text_encoder,
-        options_coord_decoder,
-        options_coord_encoder,
-        options_size_decoder,
-        options_size_encoder,
-    ) = match args.scale.as_str().try_into()? {
-        Scale::Billion(2.) => (
-            Options::moondream2_2b_vision_encoder(),
-            Options::moondream2_2b_vision_projection(),
-            Options::moondream2_2b_text_decoder(),
-            Options::moondream2_2b_text_encoder(),
-            Options::moondream2_2b_coord_decoder(),
-            Options::moondream2_2b_coord_encoder(),
-            Options::moondream2_2b_size_decoder(),
-            Options::moondream2_2b_size_encoder(),
-        ),
-        Scale::Billion(0.5) => (
-            Options::moondream2_0_5b_vision_encoder(),
-            Options::moondream2_0_5b_vision_projection(),
-            Options::moondream2_0_5b_text_decoder(),
-            Options::moondream2_0_5b_text_encoder(),
-            Options::moondream2_0_5b_coord_decoder(),
-            Options::moondream2_0_5b_coord_encoder(),
-            Options::moondream2_0_5b_size_decoder(),
-            Options::moondream2_0_5b_size_encoder(),
-        ),
+    let config = match args.scale.as_str().try_into()? {
+        Scale::Billion(0.5) => Config::moondream2_0_5b(),
+        Scale::Billion(2.) => Config::moondream2_2b(),
         _ => unimplemented!(),
-    };
-
-    let mut model = Moondream2::new(
-        options_vision_encoder
-            .with_model_dtype(args.dtype.as_str().try_into()?)
-            .with_model_device(args.device.as_str().try_into()?)
-            .commit()?,
-        options_vision_projection
-            .with_model_dtype(args.dtype.as_str().try_into()?)
-            .with_model_device(args.device.as_str().try_into()?)
-            .commit()?,
-        options_text_encoder
-            .with_model_dtype(args.dtype.as_str().try_into()?)
-            .with_model_device(args.device.as_str().try_into()?)
-            .commit()?,
-        options_text_decoder
-            .with_model_dtype(args.dtype.as_str().try_into()?)
-            .with_model_device(args.device.as_str().try_into()?)
-            .commit()?,
-        Some(
-            options_coord_encoder
-                .with_model_dtype(args.dtype.as_str().try_into()?)
-                .with_model_device(args.device.as_str().try_into()?)
-                .commit()?,
-        ),
-        Some(
-            options_coord_decoder
-                .with_model_dtype(args.dtype.as_str().try_into()?)
-                .with_model_device(args.device.as_str().try_into()?)
-                .commit()?,
-        ),
-        Some(
-            options_size_encoder
-                .with_model_dtype(args.dtype.as_str().try_into()?)
-                .with_model_device(args.device.as_str().try_into()?)
-                .commit()?,
-        ),
-        Some(
-            options_size_decoder
-                .with_model_dtype(args.dtype.as_str().try_into()?)
-                .with_model_device(args.device.as_str().try_into()?)
-                .commit()?,
-        ),
-    )?;
+    }
+    .with_dtype_all(args.dtype.as_str().try_into()?)
+    .with_device_all(args.device.as_str().try_into()?)
+    .commit()?;
+    let mut model = Moondream2::new(config)?;

     // load images
     let xs = DataLoader::try_read_n(&args.source)?;
@@ -142,13 +77,6 @@ fn main() -> Result<()> {
         }
         Task::OpenSetDetection(_) | Task::OpenSetKeypointsDetection(_) => {
             println!("{:?}", ys);
-            // let annotator = Annotator::default()
-            //     .with_bboxes_thickness(4)
-            //     .without_bboxes_conf(true)
-            //     .with_keypoints_radius(6)
-            //     .with_keypoints_name(true)
-            //     .with_saveout("moondream2");
-            // annotator.annotate(&xs, &ys);

             // annotate
             let annotator = Annotator::default()

View File

@@ -1,6 +1,6 @@
 use anyhow::Result;
 use usls::DataLoader;
-use usls::{models::OWLv2, Annotator, Options};
+use usls::{models::OWLv2, Annotator, Config};

 #[derive(argh::FromArgs)]
 /// Example
@@ -46,14 +46,14 @@ fn main() -> Result<()> {
         .init();

     let args: Args = argh::from_env();

-    // options
-    let options = Options::owlv2_base_ensemble()
+    // config
+    let config = Config::owlv2_base_ensemble()
         // owlv2_base()
         .with_model_dtype(args.dtype.as_str().try_into()?)
         .with_model_device(args.device.as_str().try_into()?)
-        .with_class_names(&args.labels.iter().map(|x| x.as_str()).collect::<Vec<_>>())
+        .with_text_names(&args.labels.iter().map(|x| x.as_str()).collect::<Vec<_>>())
         .commit()?;
-    let mut model = OWLv2::new(options)?;
+    let mut model = OWLv2::new(config)?;

     // load
     let xs = DataLoader::try_read_n(&args.source)?;

View File

@@ -1,6 +1,6 @@
 use anyhow::Result;
 use usls::DataLoader;
-use usls::{models::PicoDet, Annotator, Options};
+use usls::{models::PicoDet, Annotator, Config};

 fn main() -> Result<()> {
     tracing_subscriber::fmt()
@@ -8,12 +8,11 @@ fn main() -> Result<()> {
         .with_timer(tracing_subscriber::fmt::time::ChronoLocal::rfc_3339())
         .init();

-    // options
-    let options = Options::picodet_layout_1x()
+    // config
+    let config = Config::picodet_layout_1x().commit()?;
     // picodet_l_layout_3cls()
     // picodet_l_layout_17cls()
-        .commit()?;
-    let mut model = PicoDet::new(options)?;
+    let mut model = PicoDet::new(config)?;

     // load
     let xs = DataLoader::try_read_n(&["images/academic.jpg"])?;

View File

@@ -1,5 +1,5 @@
 use anyhow::Result;
-use usls::{models::RFDETR, Annotator, DataLoader, Options};
+use usls::{models::RFDETR, Annotator, Config, DataLoader};

 fn main() -> Result<()> {
     tracing_subscriber::fmt()
@@ -7,9 +7,8 @@ fn main() -> Result<()> {
         .with_timer(tracing_subscriber::fmt::time::ChronoLocal::rfc_3339())
         .init();

-    // options
-    let options = Options::rfdetr_base().commit()?;
-    let mut model = RFDETR::new(options)?;
+    // config
+    let mut model = RFDETR::new(Config::rfdetr_base().commit()?)?;

     // load
     let xs = DataLoader::try_read_n(&["./assets/bus.jpg"])?;

View File

@@ -1,10 +1,10 @@
-use usls::{models::RMBG, Annotator, DataLoader, Options};
+use usls::{models::RMBG, Annotator, Config, DataLoader};

 #[derive(argh::FromArgs)]
 /// Example
 struct Args {
     /// dtype
-    #[argh(option, default = "String::from(\"auto\")")]
+    #[argh(option, default = "String::from(\"fp16\")")]
     dtype: String,

     /// device
@@ -23,18 +23,18 @@ fn main() -> anyhow::Result<()> {
         .init();

     let args: Args = argh::from_env();

-    let options = match args.ver {
-        1.4 => Options::rmbg1_4(),
-        2.0 => Options::rmbg2_0(),
+    let config = match args.ver {
+        1.4 => Config::rmbg1_4(),
+        2.0 => Config::rmbg2_0(),
         _ => unreachable!("Unsupported version"),
     };

     // build model
-    let options = options
+    let config = config
         .with_model_dtype(args.dtype.as_str().try_into()?)
         .with_model_device(args.device.as_str().try_into()?)
         .commit()?;
-    let mut model = RMBG::new(options)?;
+    let mut model = RMBG::new(config)?;

     // load image
     let xs = DataLoader::try_read_n(&["./assets/cat.png"])?;

View File

@@ -1,5 +1,5 @@
 use anyhow::Result;
-use usls::{models::RTDETR, Annotator, DataLoader, Options};
+use usls::{models::RTDETR, Annotator, Config, DataLoader};

 fn main() -> Result<()> {
     tracing_subscriber::fmt()
@@ -7,15 +7,14 @@ fn main() -> Result<()> {
         .with_timer(tracing_subscriber::fmt::time::ChronoLocal::rfc_3339())
         .init();

-    // options
-    let options = Options::rtdetr_v2_s_coco()
+    // config
+    let config = Config::rtdetr_v2_s_coco().commit()?;
     // rtdetr_v1_r18vd_coco()
     // rtdetr_v2_ms_coco()
     // rtdetr_v2_m_coco()
    // rtdetr_v2_l_coco()
     // rtdetr_v2_x_coco()
-        .commit()?;
-    let mut model = RTDETR::new(options)?;
+    let mut model = RTDETR::new(config)?;

     // load
     let xs = DataLoader::try_read_n(&["./assets/bus.jpg"])?;

View File

@@ -1,5 +1,5 @@
 use anyhow::Result;
-use usls::{models::RTMO, Annotator, DataLoader, Options, Style, SKELETON_COCO_19};
+use usls::{models::RTMO, Annotator, Config, DataLoader, Style, SKELETON_COCO_19};

 fn main() -> Result<()> {
     tracing_subscriber::fmt()
@@ -8,7 +8,7 @@ fn main() -> Result<()> {
         .init();

     // build model
-    let mut model = RTMO::new(Options::rtmo_s().commit()?)?;
+    let mut model = RTMO::new(Config::rtmo_s().commit()?)?;

     // load image
     let xs = DataLoader::try_read_n(&["./assets/bus.jpg"])?;

View File

@@ -1,7 +1,7 @@
 use anyhow::Result;
 use usls::{
     models::{SamKind, SamPrompt, SAM},
-    Annotator, DataLoader, Options, Scale,
+    Annotator, Config, DataLoader, Scale,
 };

 #[derive(argh::FromArgs)]
@@ -28,40 +28,22 @@ fn main() -> Result<()> {
     let args: Args = argh::from_env();

     // Build model
-    let (options_encoder, options_decoder) = match args.kind.as_str().try_into()? {
-        SamKind::Sam => (
-            Options::sam_v1_base_encoder(),
-            Options::sam_v1_base_decoder(),
-        ),
-        SamKind::Sam2 => match args.scale.as_str().try_into()? {
-            Scale::T => (Options::sam2_tiny_encoder(), Options::sam2_tiny_decoder()),
-            Scale::S => (Options::sam2_small_encoder(), Options::sam2_small_decoder()),
-            Scale::B => (
-                Options::sam2_base_plus_encoder(),
-                Options::sam2_base_plus_decoder(),
-            ),
+    let config = match args.kind.as_str().try_into()? {
+        SamKind::Sam => Config::sam_v1_base(),
+        SamKind::Sam2 => match args.scale.as_str().try_into()? {
+            Scale::T => Config::sam2_tiny(),
+            Scale::S => Config::sam2_small(),
+            Scale::B => Config::sam2_base_plus(),
             _ => unimplemented!("Unsupported model scale: {:?}. Try b, s, t.", args.scale),
         },
-        SamKind::MobileSam => (
-            Options::mobile_sam_tiny_encoder(),
-            Options::mobile_sam_tiny_decoder(),
-        ),
-        SamKind::SamHq => (
-            Options::sam_hq_tiny_encoder(),
-            Options::sam_hq_tiny_decoder(),
-        ),
-        SamKind::EdgeSam => (
-            Options::edge_sam_3x_encoder(),
-            Options::edge_sam_3x_decoder(),
-        ),
-    };
-    let options_encoder = options_encoder
-        .with_model_device(args.device.as_str().try_into()?)
-        .commit()?;
-    let options_decoder = options_decoder.commit()?;
-    let mut model = SAM::new(options_encoder, options_decoder)?;
+        SamKind::MobileSam => Config::mobile_sam_tiny(),
+        SamKind::SamHq => Config::sam_hq_tiny(),
+        SamKind::EdgeSam => Config::edge_sam_3x(),
+    }
+    .with_device_all(args.device.as_str().try_into()?)
+    .commit()?;
+    let mut model = SAM::new(config)?;

     // Load image
     let xs = DataLoader::try_read_n(&["images/truck.jpg"])?;

View File

@@ -1,6 +1,5 @@
 ## Quick Start

 ```Shell
-cargo run -r -F cuda --example sam2 -- --device cuda --scale t
 cargo run -r -F cuda --example sam -- --device cuda --scale t
 ```

View File

@@ -1,7 +1,7 @@
 use anyhow::Result;
 use usls::{
     models::{SamPrompt, SAM2},
-    Annotator, DataLoader, Options, Scale,
+    Annotator, Config, DataLoader, Scale,
 };

 #[derive(argh::FromArgs)]
@@ -25,33 +25,16 @@ fn main() -> Result<()> {
     let args: Args = argh::from_env();

     // Build model
-    let (options_encoder, options_decoder) = match args.scale.as_str().try_into()? {
-        Scale::T => (
-            Options::sam2_1_tiny_encoder(),
-            Options::sam2_1_tiny_decoder(),
-        ),
-        Scale::S => (
-            Options::sam2_1_small_encoder(),
-            Options::sam2_1_small_decoder(),
-        ),
-        Scale::B => (
-            Options::sam2_1_base_plus_encoder(),
-            Options::sam2_1_base_plus_decoder(),
-        ),
-        Scale::L => (
-            Options::sam2_1_large_encoder(),
-            Options::sam2_1_large_decoder(),
-        ),
+    let config = match args.scale.as_str().try_into()? {
+        Scale::T => Config::sam2_1_tiny(),
+        Scale::S => Config::sam2_1_small(),
+        Scale::B => Config::sam2_1_base_plus(),
+        Scale::L => Config::sam2_1_large(),
         _ => unimplemented!("Unsupported model scale: {:?}. Try b, s, t, l.", args.scale),
-    };
-
-    let options_encoder = options_encoder
-        .with_model_device(args.device.as_str().try_into()?)
-        .commit()?;
-    let options_decoder = options_decoder
-        .with_model_device(args.device.as_str().try_into()?)
-        .commit()?;
-    let mut model = SAM2::new(options_encoder, options_decoder)?;
+    }
+    .with_device_all(args.device.as_str().try_into()?)
+    .commit()?;
+    let mut model = SAM2::new(config)?;

     // Load image
     let xs = DataLoader::try_read_n(&["images/truck.jpg"])?;

View File

@@ -1,7 +1,7 @@
 ## Quick Start

 ```shell
 cargo run -r -F cuda --example sapiens -- --device cuda
 ```

 ## Results

View File

@@ -1,5 +1,5 @@
 use anyhow::Result;
-use usls::{models::Sapiens, Annotator, DataLoader, Options};
+use usls::{models::Sapiens, Annotator, Config, DataLoader};

 #[derive(argh::FromArgs)]
 /// Example
@@ -17,10 +17,10 @@ fn main() -> Result<()> {
     let args: Args = argh::from_env();

     // build
-    let options = Options::sapiens_seg_0_3b()
+    let config = Config::sapiens_seg_0_3b()
         .with_model_device(args.device.as_str().try_into()?)
         .commit()?;
-    let mut model = Sapiens::new(options)?;
+    let mut model = Sapiens::new(config)?;

     // load
     let xs = DataLoader::try_read_n(&["images/paul-george.jpg"])?;

View File

@@ -1,5 +1,5 @@
 use anyhow::Result;
-use usls::{models::SLANet, Annotator, Color, DataLoader, Options};
+use usls::{models::SLANet, Annotator, Color, Config, DataLoader};

 #[derive(argh::FromArgs)]
 /// Example
@@ -26,11 +26,11 @@ fn main() -> Result<()> {
     let args: Args = argh::from_env();

     // build model
-    let options = Options::slanet_lcnet_v2_mobile_ch()
+    let config = Config::slanet_lcnet_v2_mobile_ch()
         .with_model_device(args.device.as_str().try_into()?)
         .with_model_dtype(args.dtype.as_str().try_into()?)
         .commit()?;
-    let mut model = SLANet::new(options)?;
+    let mut model = SLANet::new(config)?;

     // load
     let xs = DataLoader::try_read_n(&[args.source])?;

View File

@@ -1,5 +1,5 @@
 use anyhow::Result;
-use usls::{models::SmolVLM, DataLoader, Options, Scale};
+use usls::{models::SmolVLM, Config, DataLoader, Scale};

 #[derive(argh::FromArgs)]
 /// Example
@@ -29,32 +29,14 @@ fn main() -> Result<()> {
     let args: Args = argh::from_env();

     // build model
-    let (options_vision_encoder, options_text_embed, options_decode) =
-        match args.scale.as_str().try_into()? {
-            Scale::Million(256.) => (
-                Options::smolvlm_vision_256m(),
-                Options::smolvlm_text_embed_256m(),
-                Options::smolvlm_decoder_256m(),
-            ),
-            Scale::Million(500.) => (
-                Options::smolvlm_vision_500m(),
-                Options::smolvlm_text_embed_500m(),
-                Options::smolvlm_decoder_500m(),
-            ),
-            _ => unimplemented!(),
-        };
-    let mut model = SmolVLM::new(
-        options_vision_encoder
-            .with_model_device(args.device.as_str().try_into()?)
-            .commit()?,
-        options_text_embed
-            .with_model_device(args.device.as_str().try_into()?)
-            .commit()?,
-        options_decode
-            .with_model_device(args.device.as_str().try_into()?)
-            .commit()?,
-    )?;
+    let config = match args.scale.as_str().try_into()? {
+        Scale::Million(256.) => Config::smolvlm_256m(),
+        Scale::Million(500.) => Config::smolvlm_500m(),
+        _ => unimplemented!(),
+    }
+    .with_device_all(args.device.as_str().try_into()?)
+    .commit()?;
+    let mut model = SmolVLM::new(config)?;

     // load images
     let xs = DataLoader::try_read_n(&args.source)?;

View File

@@ -1,5 +1,5 @@
 use anyhow::Result;
-use usls::{models::SVTR, DataLoader, Options};
+use usls::{models::SVTR, Config, DataLoader};

 #[derive(argh::FromArgs)]
 /// Example
@@ -22,13 +22,13 @@ fn main() -> Result<()> {
     let args: Args = argh::from_env();

     // build model
-    let options = Options::ppocr_rec_v4_ch()
+    let config = Config::ppocr_rec_v4_ch()
         // ppocr_rec_v4_en()
         // repsvtr_ch()
         .with_model_device(args.device.as_str().try_into()?)
         .with_model_dtype(args.dtype.as_str().try_into()?)
         .commit()?;
-    let mut model = SVTR::new(options)?;
+    let mut model = SVTR::new(config)?;

     // load images
     let dl = DataLoader::new("./examples/svtr/images")?

View File

@@ -1,6 +1,6 @@
 use usls::{
     models::{TrOCR, TrOCRKind},
-    DataLoader, Options, Scale,
+    Config, DataLoader, Scale,
 };

 #[derive(argh::FromArgs)]
@@ -38,52 +38,22 @@ fn main() -> anyhow::Result<()> {
     ])?;

     // build model
-    let (options_encoder, options_decoder, options_decoder_merged) =
-        match args.scale.as_str().try_into()? {
-            Scale::S => match args.kind.as_str().try_into()? {
-                TrOCRKind::Printed => (
-                    Options::trocr_encoder_small_printed(),
-                    Options::trocr_decoder_small_printed(),
-                    Options::trocr_decoder_merged_small_printed(),
-                ),
-                TrOCRKind::HandWritten => (
-                    Options::trocr_encoder_small_handwritten(),
-                    Options::trocr_decoder_small_handwritten(),
-                    Options::trocr_decoder_merged_small_handwritten(),
-                ),
-            },
-            Scale::B => match args.kind.as_str().try_into()? {
-                TrOCRKind::Printed => (
-                    Options::trocr_encoder_base_printed(),
-                    Options::trocr_decoder_base_printed(),
-                    Options::trocr_decoder_merged_base_printed(),
-                ),
-                TrOCRKind::HandWritten => (
-                    Options::trocr_encoder_base_handwritten(),
-                    Options::trocr_decoder_base_handwritten(),
-                    Options::trocr_decoder_merged_base_handwritten(),
-                ),
-            },
-            x => anyhow::bail!("Unsupported TrOCR scale: {:?}", x),
-        };
-    let mut model = TrOCR::new(
-        options_encoder
-            .with_model_device(args.device.as_str().try_into()?)
-            .with_model_dtype(args.dtype.as_str().try_into()?)
-            .with_batch_size(xs.len())
-            .commit()?,
-        options_decoder
-            .with_model_device(args.device.as_str().try_into()?)
-            .with_model_dtype(args.dtype.as_str().try_into()?)
-            .with_batch_size(xs.len())
-            .commit()?,
-        options_decoder_merged
-            .with_model_device(args.device.as_str().try_into()?)
-            .with_model_dtype(args.dtype.as_str().try_into()?)
-            .with_batch_size(xs.len())
-            .commit()?,
-    )?;
+    let config = match args.scale.as_str().try_into()? {
+        Scale::S => match args.kind.as_str().try_into()? {
+            TrOCRKind::Printed => Config::trocr_small_printed(),
+            TrOCRKind::HandWritten => Config::trocr_small_handwritten(),
+        },
+        Scale::B => match args.kind.as_str().try_into()? {
+            TrOCRKind::Printed => Config::trocr_base_printed(),
+            TrOCRKind::HandWritten => Config::trocr_base_handwritten(),
+        },
+        x => anyhow::bail!("Unsupported TrOCR scale: {:?}", x),
+    }
+    .with_device_all(args.device.as_str().try_into()?)
+    .with_dtype_all(args.dtype.as_str().try_into()?)
+    .commit()?;
+    let mut model = TrOCR::new(config)?;

     // inference
     let ys = model.forward(&xs)?;

View File

@@ -1,5 +1,5 @@
 use anyhow::Result;
-use usls::{models::YOLO, Annotator, DataLoader, Options};
+use usls::{models::YOLO, Annotator, Config, DataLoader};

 #[derive(argh::FromArgs)]
 /// Example
@@ -22,7 +22,7 @@ fn main() -> Result<()> {
     let args: Args = argh::from_env();

     // build model
-    let config = Options::yolo_v8_rtdetr_l()
+    let config = Config::ultralytics_rtdetr_l()
         .with_model_dtype(args.dtype.as_str().try_into()?)
         .with_model_device(args.device.as_str().try_into()?)
         .commit()?;
@@ -41,7 +41,7 @@ fn main() -> Result<()> {
     annotator.annotate(x, y)?.save(format!(
         "{}.jpg",
         usls::Dir::Current
-            .base_dir_with_subs(&["runs", "YOLOv8-RT-DETR"])?
+            .base_dir_with_subs(&["runs", "ultralytics-RTDETR"])?
            .join(usls::timestamp(None))
             .display(),
     ))?;

View File

@@ -1,7 +1,7 @@
 use anyhow::Result;
 use usls::{
     models::{SamPrompt, SAM2, YOLO},
-    Annotator, DataLoader, Options, Scale, Style,
+    Annotator, Config, DataLoader, Scale, Style,
 };

 #[derive(argh::FromArgs)]
@@ -21,16 +21,12 @@ fn main() -> Result<()> {
     let args: Args = argh::from_env();

     // build SAM
-    let (options_encoder, options_decoder) = (
-        Options::sam2_1_tiny_encoder().commit()?,
-        Options::sam2_1_tiny_decoder().commit()?,
-    );
-    let mut sam = SAM2::new(options_encoder, options_decoder)?;
+    let mut sam = SAM2::new(Config::sam2_1_tiny().commit()?)?;

     // build YOLOv8
-    let options_yolo = Options::yolo_detect()
-        .with_model_scale(Scale::N)
-        .with_model_version(8.into())
+    let options_yolo = Config::yolo_detect()
+        .with_scale(Scale::N)
+        .with_version(8.into())
         .with_model_device(args.device.as_str().try_into()?)
         .commit()?;
     let mut yolo = YOLO::new(options_yolo)?;

View File

@@ -27,34 +27,36 @@ cargo run -r --example yolo -- --task detect --ver v8 --num-classes 6 --model xx
 # Classify
 cargo run -r --example yolo -- --task classify --ver 5 --scale s --image-width 224 --image-height 224 --num-classes 1000 --use-imagenet-1k-classes # YOLOv5
-cargo run -r --example yolo -- --task classify --ver 8 --scale n --image-width 224 --image-height 224 # YOLOv8
+cargo run -r --example yolo -- --task classify --ver 8 --scale n --image-width 224 --image-height 224 --use-imagenet-1k-classes # YOLOv8
-cargo run -r --example yolo -- --task classify --ver 11 --scale n --image-width 224 --image-height 224 # YOLOv11
+cargo run -r --example yolo -- --task classify --ver 11 --scale n --image-width 224 --image-height 224 # YOLO11

 # Detect
-cargo run -r --example yolo -- --task detect --ver 5 --scale n --use-coco-80-classes # YOLOv5
+cargo run -r --example yolo -- --task detect --ver 5 --scale n --use-coco-80-classes --dtype fp16 # YOLOv5
-cargo run -r --example yolo -- --task detect --ver 6 --scale n --use-coco-80-classes # YOLOv6
+cargo run -r --example yolo -- --task detect --ver 6 --scale n --use-coco-80-classes --dtype fp16 # YOLOv6
-cargo run -r --example yolo -- --task detect --ver 7 --scale t --use-coco-80-classes # YOLOv7
+cargo run -r --example yolo -- --task detect --ver 7 --scale t --use-coco-80-classes --dtype fp16 # YOLOv7
-cargo run -r --example yolo -- --task detect --ver 8 --scale n --use-coco-80-classes # YOLOv8
+cargo run -r --example yolo -- --task detect --ver 8 --scale n --use-coco-80-classes --dtype fp16 # YOLOv8
-cargo run -r --example yolo -- --task detect --ver 9 --scale t --use-coco-80-classes # YOLOv9
+cargo run -r --example yolo -- --task detect --ver 9 --scale t --use-coco-80-classes --dtype fp16 # YOLOv9
-cargo run -r --example yolo -- --task detect --ver 10 --scale n --use-coco-80-classes # YOLOv10
+cargo run -r --example yolo -- --task detect --ver 10 --scale n --use-coco-80-classes --dtype fp16 # YOLOv10
-cargo run -r --example yolo -- --task detect --ver 11 --scale n --use-coco-80-classes # YOLOv11
+cargo run -r --example yolo -- --task detect --ver 11 --scale n --use-coco-80-classes --dtype fp16 # YOLO11
+cargo run -r --example yolo -- --task detect --ver 12 --scale n --use-coco-80-classes --dtype fp16 # YOLOv12
 cargo run -r --example yolo -- --task detect --ver 8 --model v8-s-world-v2-shoes.onnx # YOLOv8-world

 # Pose
 cargo run -r --example yolo -- --task pose --ver 8 --scale n # YOLOv8-Pose
 cargo run -r --example yolo -- --task pose --ver 11 --scale n # YOLOv11-Pose

 # Segment
-cargo run -r --example yolo -- --task segment --ver 5 --scale n # YOLOv5-Segment
+cargo run -r --example yolo -- --task segment --ver 5 --scale n --use-coco-80-classes --dtype fp16 # YOLOv5-Segment
-cargo run -r --example yolo -- --task segment --ver 8 --scale n # YOLOv8-Segment
+cargo run -r --example yolo -- --task segment --ver 8 --scale n --use-coco-80-classes --dtype fp16 # YOLOv8-Segment
-cargo run -r --example yolo -- --task segment --ver 11 --scale n # YOLOv8-Segment
+cargo run -r --example yolo -- --task segment --ver 9 --scale c --use-coco-80-classes --dtype fp16 # YOLOv9-Segment
+cargo run -r --example yolo -- --task segment --ver 11 --scale n --use-coco-80-classes --dtype fp16 # YOLO11-Segment

 # Obb
 cargo run -r --example yolo -- --ver 8 --task obb --scale n --image-width 1024 --image-height 1024 --source images/dota.png # YOLOv8-Obb
 cargo run -r --example yolo -- --ver 11 --task obb --scale n --image-width 1024 --image-height 1024 --source images/dota.png # YOLOv11-Obb
 ```

-**`cargo run -r --example yolo -- --help` for more options**
+**`cargo run -r --example yolo -- --help` for more config**

 ## Other YOLOv8 Solution Models

View File

@ -1,25 +1,25 @@
use anyhow::Result; use anyhow::Result;
use usls::{ use usls::{
models::YOLO, Annotator, DataLoader, Options, Style, NAMES_COCO_80, NAMES_COCO_KEYPOINTS_17, models::YOLO, Annotator, Config, DataLoader, Style, NAMES_COCO_80, NAMES_COCO_KEYPOINTS_17,
NAMES_IMAGENET_1K, SKELETON_COCO_19, SKELETON_COLOR_COCO_19, NAMES_IMAGENET_1K, SKELETON_COCO_19, SKELETON_COLOR_COCO_19,
}; };
#[derive(argh::FromArgs, Debug)] #[derive(argh::FromArgs, Debug)]
/// Example /// YOLO Example
struct Args { struct Args {
/// model file /// model file(.onnx)
#[argh(option)] #[argh(option)]
model: Option<String>, model: Option<String>,
/// source /// source: image, image folder, video stream
#[argh(option, default = "String::from(\"./assets/bus.jpg\")")] #[argh(option, default = "String::from(\"./assets/bus.jpg\")")]
source: String, source: String,
/// dtype /// model dtype
#[argh(option, default = "String::from(\"auto\")")] #[argh(option, default = "String::from(\"auto\")")]
dtype: String, dtype: String,
/// task /// task: det, seg, pose, classify, obb
#[argh(option, default = "String::from(\"det\")")] #[argh(option, default = "String::from(\"det\")")]
task: String, task: String,
@ -27,101 +27,101 @@ struct Args {
#[argh(option, default = "8.0")] #[argh(option, default = "8.0")]
ver: f32, ver: f32,
/// device /// device: cuda, cpu, mps
#[argh(option, default = "String::from(\"cpu:0\")")] #[argh(option, default = "String::from(\"cpu:0\")")]
device: String, device: String,
/// scale /// scale: n, s, m, l, x
#[argh(option, default = "String::from(\"n\")")] #[argh(option, default = "String::from(\"n\")")]
scale: String, scale: String,
/// trt_fp16 /// enable TensorRT FP16
#[argh(option, default = "true")] #[argh(option, default = "true")]
trt_fp16: bool, trt_fp16: bool,
/// batch_size /// batch size
#[argh(option, default = "1")] #[argh(option, default = "1")]
batch_size: usize, batch_size: usize,
/// min_batch_size /// bin batch size: For TensorRT
#[argh(option, default = "1")] #[argh(option, default = "1")]
min_batch_size: usize, min_batch_size: usize,
/// max_batch_size /// max Batch size: For TensorRT
#[argh(option, default = "4")] #[argh(option, default = "4")]
max_batch_size: usize, max_batch_size: usize,
/// min_image_width /// min image width: For TensorRT
#[argh(option, default = "224")] #[argh(option, default = "224")]
min_image_width: isize, min_image_width: isize,
/// image_width /// image width: For TensorRT
#[argh(option, default = "640")] #[argh(option, default = "640")]
image_width: isize, image_width: isize,
/// max_image_width /// max image width: For TensorRT
#[argh(option, default = "1280")] #[argh(option, default = "1280")]
max_image_width: isize, max_image_width: isize,
/// min_image_height /// min image height: For TensorRT
#[argh(option, default = "224")] #[argh(option, default = "224")]
min_image_height: isize, min_image_height: isize,
/// image_height /// image height: For TensorRT
#[argh(option, default = "640")] #[argh(option, default = "640")]
image_height: isize, image_height: isize,
/// max_image_height /// max image height: For TensorRT
#[argh(option, default = "1280")] #[argh(option, default = "1280")]
max_image_height: isize, max_image_height: isize,
/// num_classes /// num classes
#[argh(option)] #[argh(option)]
num_classes: Option<usize>, num_classes: Option<usize>,
/// num_keypoints /// num keypoints
#[argh(option)] #[argh(option)]
num_keypoints: Option<usize>, num_keypoints: Option<usize>,
/// use_coco_80_classes /// class names
#[argh(switch)]
use_coco_80_classes: bool,
/// use_coco_17_keypoints_classes
#[argh(switch)]
use_coco_17_keypoints_classes: bool,
/// use_imagenet_1k_classes
#[argh(switch)]
use_imagenet_1k_classes: bool,
/// confs
#[argh(option)]
confs: Vec<f32>,
/// keypoint_confs
#[argh(option)]
keypoint_confs: Vec<f32>,
/// exclude_classes
#[argh(option)]
exclude_classes: Vec<usize>,
/// retain_classes
#[argh(option)]
retain_classes: Vec<usize>,
/// class_names
#[argh(option)] #[argh(option)]
class_names: Vec<String>, class_names: Vec<String>,
/// keypoint_names /// keypoint names
#[argh(option)] #[argh(option)]
keypoint_names: Vec<String>, keypoint_names: Vec<String>,
/// topk /// top-k
#[argh(option, default = "5")] #[argh(option, default = "5")]
topk: usize, topk: usize,
/// use COCO 80 classes
#[argh(switch)]
use_coco_80_classes: bool,
/// use COCO 17 keypoints classes
#[argh(switch)]
use_coco_17_keypoints_classes: bool,
/// use ImageNet 1K classes
#[argh(switch)]
use_imagenet_1k_classes: bool,
/// confidences
#[argh(option)]
confs: Vec<f32>,
/// keypoint nonfidences
#[argh(option)]
keypoint_confs: Vec<f32>,
/// exclude nlasses
#[argh(option)]
exclude_classes: Vec<usize>,
/// retain classes
#[argh(option)]
retain_classes: Vec<usize>,
} }
fn main() -> Result<()> { fn main() -> Result<()> {
@ -129,17 +129,15 @@ fn main() -> Result<()> {
.with_env_filter(tracing_subscriber::EnvFilter::from_default_env()) .with_env_filter(tracing_subscriber::EnvFilter::from_default_env())
.with_timer(tracing_subscriber::fmt::time::ChronoLocal::rfc_3339()) .with_timer(tracing_subscriber::fmt::time::ChronoLocal::rfc_3339())
.init(); .init();
let args: Args = argh::from_env(); let args: Args = argh::from_env();
let mut config = Config::yolo()
let mut options = Options::yolo()
.with_model_file(&args.model.unwrap_or_default()) .with_model_file(&args.model.unwrap_or_default())
.with_model_task(args.task.as_str().try_into()?) .with_task(args.task.as_str().try_into()?)
.with_model_version(args.ver.try_into()?) .with_version(args.ver.try_into()?)
.with_model_scale(args.scale.as_str().try_into()?) .with_scale(args.scale.as_str().try_into()?)
.with_model_dtype(args.dtype.as_str().try_into()?) .with_model_dtype(args.dtype.as_str().try_into()?)
.with_model_device(args.device.as_str().try_into()?) .with_model_device(args.device.as_str().try_into()?)
.with_trt_fp16(args.trt_fp16) .with_model_trt_fp16(args.trt_fp16)
.with_model_ixx( .with_model_ixx(
0, 0,
0, 0,
@ -172,30 +170,25 @@ fn main() -> Result<()> {
}) })
.with_topk(args.topk) .with_topk(args.topk)
.retain_classes(&args.retain_classes) .retain_classes(&args.retain_classes)
.exclude_classes(&args.exclude_classes); .exclude_classes(&args.exclude_classes)
.with_model_num_dry_run(2);
if args.use_coco_80_classes { if args.use_coco_80_classes {
options = options.with_class_names(&NAMES_COCO_80); config = config.with_class_names(&NAMES_COCO_80);
} }
if args.use_coco_17_keypoints_classes { if args.use_coco_17_keypoints_classes {
options = options.with_keypoint_names(&NAMES_COCO_KEYPOINTS_17); config = config.with_keypoint_names(&NAMES_COCO_KEYPOINTS_17);
} }
if args.use_imagenet_1k_classes { if args.use_imagenet_1k_classes {
options = options.with_class_names(&NAMES_IMAGENET_1K); config = config.with_class_names(&NAMES_IMAGENET_1K);
} }
if let Some(nc) = args.num_classes { if let Some(nc) = args.num_classes {
options = options.with_nc(nc); config = config.with_nc(nc);
} }
if let Some(nk) = args.num_keypoints { if let Some(nk) = args.num_keypoints {
options = options.with_nk(nk); config = config.with_nk(nk);
} }
if !args.class_names.is_empty() { if !args.class_names.is_empty() {
options = options.with_class_names( config = config.with_class_names(
&args &args
.class_names .class_names
.iter() .iter()
@ -203,9 +196,8 @@ fn main() -> Result<()> {
.collect::<Vec<_>>(), .collect::<Vec<_>>(),
); );
} }
if !args.keypoint_names.is_empty() { if !args.keypoint_names.is_empty() {
options = options.with_keypoint_names( config = config.with_keypoint_names(
&args &args
.keypoint_names .keypoint_names
.iter() .iter()
@ -215,7 +207,7 @@ fn main() -> Result<()> {
} }
// build model // build model
let mut model = YOLO::try_from(options.commit()?)?; let mut model = YOLO::new(config.commit()?)?;
// build dataloader // build dataloader
let dl = DataLoader::new(&args.source)? let dl = DataLoader::new(&args.source)?
@ -255,6 +247,7 @@ fn main() -> Result<()> {
} }
} }
// summary
model.summary(); model.summary();
Ok(()) Ok(())
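Beyond the rename, the YOLO example shows how the builder methods were re-sorted: pipeline-wide properties (task, version, scale) drop the `model_` prefix, the per-engine TensorRT knob gains one (`with_trt_fp16` becomes `with_model_trt_fp16`), and construction moves from `YOLO::try_from(options.commit()?)` to `YOLO::new(config.commit()?)`. A condensed sketch with placeholder argument values:

```rust
use usls::{models::YOLO, Config};

fn build_yolo() -> anyhow::Result<YOLO> {
    let config = Config::yolo()
        .with_task("det".try_into()?)           // pipeline-level: no model_ prefix
        .with_version(8.0f32.try_into()?)       // pipeline-level
        .with_scale("n".try_into()?)            // pipeline-level
        .with_model_dtype("auto".try_into()?)   // engine-level: keeps the prefix
        .with_model_device("cpu:0".try_into()?) // engine-level
        .with_model_trt_fp16(true);             // engine-level, renamed
    Ok(YOLO::new(config.commit()?)?)
}
```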

View File

@@ -1,6 +1,6 @@
 ## Quick Start

 ```shell
-cargo run -r --example yoloe
+cargo run -r -F cuda --example yoloe -- --device cuda
 ```

View File

@ -1,5 +1,5 @@
use anyhow::Result; use anyhow::Result;
use usls::{models::YOLO, Annotator, DataLoader, Options, Style}; use usls::{models::YOLO, Annotator, Config, DataLoader, Style};
#[derive(argh::FromArgs)] #[derive(argh::FromArgs)]
/// Example /// Example
@ -21,8 +21,8 @@ fn main() -> Result<()> {
let args: Args = argh::from_env(); let args: Args = argh::from_env();
// options // config
let options = Options::yoloe_v8s_seg_pf() let config = Config::yoloe_v8s_seg_pf()
// yoloe_v8m_seg_pf() // yoloe_v8m_seg_pf()
// yoloe_v8l_seg_pf() // yoloe_v8l_seg_pf()
// yoloe_11s_seg_pf() // yoloe_11s_seg_pf()
@ -31,7 +31,7 @@ fn main() -> Result<()> {
.with_model_dtype(args.dtype.as_str().try_into()?) .with_model_dtype(args.dtype.as_str().try_into()?)
.with_model_device(args.device.as_str().try_into()?) .with_model_device(args.device.as_str().try_into()?)
.commit()?; .commit()?;
let mut model = YOLO::new(options)?; let mut model = YOLO::new(config)?;
// load // load
let xs = DataLoader::try_read_n(&["./assets/bus.jpg"])?; let xs = DataLoader::try_read_n(&["./assets/bus.jpg"])?;

View File

@ -1,5 +1,5 @@
use anyhow::Result; use anyhow::Result;
use usls::{models::YOLOPv2, Annotator, DataLoader, Options}; use usls::{models::YOLOPv2, Annotator, Config, DataLoader};
fn main() -> Result<()> { fn main() -> Result<()> {
tracing_subscriber::fmt() tracing_subscriber::fmt()
@ -8,8 +8,7 @@ fn main() -> Result<()> {
.init(); .init();
// build model // build model
let options = Options::yolop_v2_480x800().commit()?; let mut model = YOLOPv2::new(Config::yolop_v2_480x800().commit()?)?;
let mut model = YOLOPv2::new(options)?;
// load image // load image
let xs = DataLoader::try_read_n(&["images/car-view.jpg"])?; let xs = DataLoader::try_read_n(&["images/car-view.jpg"])?;

View File

@ -13,8 +13,8 @@ use prost::Message;
use std::collections::HashSet; use std::collections::HashSet;
use crate::{ use crate::{
build_progress_bar, elapsed, human_bytes_binary, onnx, DType, Device, Iiix, MinOptMax, Ops, Ts, build_progress_bar, elapsed, human_bytes_binary, onnx, DType, Device, Iiix, MinOptMax,
Xs, PROGRESS_BAR_STYLE_CYAN_2, PROGRESS_BAR_STYLE_FINISH, X, ORTConfig, Ops, Ts, Xs, PROGRESS_BAR_STYLE_CYAN_2, PROGRESS_BAR_STYLE_FINISH, X,
}; };
impl From<TensorElementType> for DType { impl From<TensorElementType> for DType {
@ -93,6 +93,20 @@ impl Default for Engine {
} }
impl Engine { impl Engine {
pub fn try_from_config(config: &ORTConfig) -> Result<Self> {
Self {
file: config.file.clone(),
spec: config.spec.clone(),
iiixs: config.iiixs.clone(),
device: config.device,
trt_fp16: config.trt_fp16,
num_dry_run: config.num_dry_run,
graph_opt_level: config.graph_opt_level,
..Default::default()
}
.build()
}
pub fn build(mut self) -> Result<Self> { pub fn build(mut self) -> Result<Self> {
let name = format!("[{}] ort_initialization", self.spec); let name = format!("[{}] ort_initialization", self.spec);
elapsed!(&name, self.ts, { elapsed!(&name, self.ts, {
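`try_from_config` copies the transport-level fields (`file`, `spec`, `iiixs`, device, TRT fp16, dry-run count, graph-opt level) out of an `ORTConfig` and goes straight to `build()`. A sketch of the consumer side, mirroring the model hunks later in this diff, where `config.model` is the `ORTConfig` for the main graph:

```rust
// Sketch; see the DB and DepthAnything hunks below for the real call sites.
let engine = Engine::try_from_config(&config.model)?;
let (batch, height, width) = (
    engine.batch().opt(),
    engine.try_height().unwrap_or(&640.into()).opt(), // 640 is a placeholder default
    engine.try_width().unwrap_or(&640.into()).opt(),
);
let processor = Processor::try_from_config(&config.processor)?
    .with_image_width(width as _)
    .with_image_height(height as _);
```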

View File

@ -17,7 +17,9 @@ impl std::fmt::Debug for Hbb {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("Hbb") f.debug_struct("Hbb")
.field("xyxy", &[self.x, self.y, self.xmax(), self.ymax()]) .field("xyxy", &[self.x, self.y, self.xmax(), self.ymax()])
.field("meta", &self.meta) .field("id", &self.meta.id())
.field("name", &self.meta.name())
.field("confidence", &self.meta.confidence())
.finish() .finish()
} }
} }

View File

@ -308,12 +308,12 @@ impl Image {
)); ));
} }
let (mut resizer, options) = build_resizer_filter(filter)?; let (mut resizer, config) = build_resizer_filter(filter)?;
let x: DynamicImage = self.to_dyn(); let x: DynamicImage = self.to_dyn();
if let ResizeMode::FitExact = mode { if let ResizeMode::FitExact = mode {
let mut dst = FImage::new(tw, th, PixelType::U8x3); let mut dst = FImage::new(tw, th, PixelType::U8x3);
resizer.resize(&x, &mut dst, &options)?; resizer.resize(&x, &mut dst, &config)?;
trans_info = trans_info trans_info = trans_info
.with_height_scale(th as f32 / h0 as f32) .with_height_scale(th as f32 / h0 as f32)
.with_width_scale(tw as f32 / w0 as f32); .with_width_scale(tw as f32 / w0 as f32);
@ -362,7 +362,7 @@ impl Image {
}; };
let mut dst_cropped = CroppedImageMut::new(&mut dst, l, t, w, h)?; let mut dst_cropped = CroppedImageMut::new(&mut dst, l, t, w, h)?;
resizer.resize(&x, &mut dst_cropped, &options)?; resizer.resize(&x, &mut dst_cropped, &config)?;
Ok((Self::from_u8s(&dst.into_vec(), tw, th)?, trans_info)) Ok((Self::from_u8s(&dst.into_vec(), tw, th)?, trans_info))
} }

View File

@ -22,7 +22,6 @@ impl std::fmt::Debug for Keypoint {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("Keypoint") f.debug_struct("Keypoint")
.field("xy", &[self.x, self.y]) .field("xy", &[self.x, self.y])
.field("uid", &self.meta.uid())
.field("id", &self.meta.id()) .field("id", &self.meta.id())
.field("name", &self.meta.name()) .field("name", &self.meta.name())
.field("confidence", &self.meta.confidence()) .field("confidence", &self.meta.confidence())

View File

@ -20,7 +20,6 @@ impl std::fmt::Debug for Mask {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("Mask") f.debug_struct("Mask")
.field("dimensions", &self.dimensions()) .field("dimensions", &self.dimensions())
.field("uid", &self.meta.uid())
.field("id", &self.meta.id()) .field("id", &self.meta.id())
.field("name", &self.meta.name()) .field("name", &self.meta.name())
.field("confidence", &self.meta.confidence()) .field("confidence", &self.meta.confidence())

View File

@ -13,7 +13,7 @@ pub struct Obb {
impl std::fmt::Debug for Obb { impl std::fmt::Debug for Obb {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("Obb") f.debug_struct("Obb")
.field("uid", &self.meta.uid()) .field("vertices", &self.vertices)
.field("id", &self.meta.id()) .field("id", &self.meta.id())
.field("name", &self.meta.name()) .field("name", &self.meta.name())
.field("confidence", &self.meta.confidence()) .field("confidence", &self.meta.confidence())

View File

@ -27,8 +27,7 @@ impl Default for Polygon {
impl std::fmt::Debug for Polygon { impl std::fmt::Debug for Polygon {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("Polygon") f.debug_struct("Polygon")
.field("count", &self.count()) .field("n_points", &self.count())
.field("uid", &self.meta.uid())
.field("id", &self.meta.id()) .field("id", &self.meta.id())
.field("name", &self.meta.name()) .field("name", &self.meta.name())
.field("confidence", &self.meta.confidence()) .field("confidence", &self.meta.confidence())

View File

@ -31,7 +31,7 @@ impl Prob {
.with_confidence(confidence); .with_confidence(confidence);
if let Some(names) = names { if let Some(names) = names {
if id < names.len() { if !names.is_empty() {
meta = meta.with_name(names[id]); meta = meta.with_name(names[id]);
} }
} }

View File

@ -367,14 +367,14 @@ impl DataLoader {
fn load_image_paths_from_folder(source: &str, exts: &[&str]) -> Result<Vec<PathBuf>> { fn load_image_paths_from_folder(source: &str, exts: &[&str]) -> Result<Vec<PathBuf>> {
let source_path = Path::new(source); let source_path = Path::new(source);
let mut paths: Vec<PathBuf> = Vec::new(); let mut paths: Vec<PathBuf> = Vec::new();
let options = MatchOptions { let config = MatchOptions {
case_sensitive: false, case_sensitive: false,
require_literal_separator: false, require_literal_separator: false,
require_literal_leading_dot: false, require_literal_leading_dot: false,
}; };
for ext in exts.iter() { for ext in exts.iter() {
let pattern = source_path.join(format!("*.{}", ext)); let pattern = source_path.join(format!("*.{}", ext));
let paths_: Vec<PathBuf> = glob_with(pattern.to_str().unwrap(), options)? let paths_: Vec<PathBuf> = glob_with(pattern.to_str().unwrap(), config)?
.filter_map(|entry| entry.ok()) .filter_map(|entry| entry.ok())
.collect(); .collect();
paths.extend(paths_); paths.extend(paths_);
@ -393,12 +393,12 @@ impl DataLoader {
} }
fn glob(pattern: &str, sort: bool, case_sensitive: bool) -> anyhow::Result<Vec<PathBuf>> { fn glob(pattern: &str, sort: bool, case_sensitive: bool) -> anyhow::Result<Vec<PathBuf>> {
let options = MatchOptions { let config = MatchOptions {
case_sensitive, case_sensitive,
require_literal_separator: false, require_literal_separator: false,
require_literal_leading_dot: false, require_literal_leading_dot: false,
}; };
let mut paths: Vec<PathBuf> = glob_with(pattern, options)? let mut paths: Vec<PathBuf> = glob_with(pattern, config)?
.filter_map(|entry| entry.ok()) .filter_map(|entry| entry.ok())
.collect(); .collect();
@ -479,7 +479,7 @@ impl DataLoader {
self self
} }
pub fn with_batch_size(mut self, x: usize) -> Self { pub fn with_batch_size_all(mut self, x: usize) -> Self {
self.batch_size = x; self.batch_size = x;
self self
} }
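The rename leaves room for per-stream batch sizing later; existing call sites only swap the method name. A sketch, assuming the rest of the builder chain is unchanged:

```rust
// was: .with_batch_size(4)
let dl = DataLoader::new("./assets")?.with_batch_size_all(4);
```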

View File

@ -1,10 +1,8 @@
use crate::NAMES_IMAGENET_1K;
/// Model configuration for `BEiT` /// Model configuration for `BEiT`
impl crate::Options { impl crate::Config {
pub fn beit() -> Self { pub fn beit() -> Self {
Self::default() Self::default()
.with_model_name("beit") .with_name("beit")
.with_model_ixx(0, 0, 1.into()) .with_model_ixx(0, 0, 1.into())
.with_model_ixx(0, 1, 3.into()) .with_model_ixx(0, 1, 3.into())
.with_model_ixx(0, 2, 224.into()) .with_model_ixx(0, 2, 224.into())
@ -13,7 +11,7 @@ impl crate::Options {
.with_image_std(&[0.5, 0.5, 0.5]) .with_image_std(&[0.5, 0.5, 0.5])
.with_normalize(true) .with_normalize(true)
.with_apply_softmax(true) .with_apply_softmax(true)
.with_class_names(&NAMES_IMAGENET_1K) .with_class_names(&crate::NAMES_IMAGENET_1K)
} }
pub fn beit_base() -> Self { pub fn beit_base() -> Self {

View File

@ -1,5 +1,5 @@
/// Model configuration for `BEN2` /// Model configuration for `BEN2`
impl crate::Options { impl crate::Config {
pub fn ben2_base() -> Self { pub fn ben2_base() -> Self {
Self::rmbg().with_model_file("ben2-base.onnx") Self::rmbg().with_model_file("ben2-base.onnx")
} }

View File

@ -1,34 +1,24 @@
/// Model configuration for `BLIP` /// Model configuration for `BLIP`
impl crate::Options { impl crate::Config {
pub fn blip() -> Self {
Self::default().with_model_name("blip").with_batch_size(1)
}
#[allow(clippy::excessive_precision)] #[allow(clippy::excessive_precision)]
pub fn blip_visual() -> Self { pub fn blip() -> Self {
Self::blip() Self::default()
.with_model_kind(crate::Kind::Vision) .with_name("blip")
.with_model_ixx(0, 2, 384.into()) .with_batch_size_all(1)
.with_model_ixx(0, 3, 384.into()) .with_visual_ixx(0, 1, 3.into())
.with_visual_ixx(0, 2, 384.into())
.with_visual_ixx(0, 3, 384.into())
.with_image_mean(&[0.48145466, 0.4578275, 0.40821073]) .with_image_mean(&[0.48145466, 0.4578275, 0.40821073])
.with_image_std(&[0.26862954, 0.26130258, 0.27577711]) .with_image_std(&[0.26862954, 0.26130258, 0.27577711])
.with_resize_filter("Bilinear")
.with_normalize(true)
} }
pub fn blip_textual() -> Self { pub fn blip_v1_base_caption() -> Self {
Self::blip().with_model_kind(crate::Kind::Language) Self::blip()
} .with_version(1.into())
.with_visual_file("v1-base-caption-visual.onnx")
pub fn blip_v1_base_caption_visual() -> Self { .with_textual_file("v1-base-caption-textual.onnx")
Self::blip_visual() .with_tokenizer_file("blip/tokenizer.json")
.with_model_version(1.into()) .with_tokenizer_config_file("blip/tokenizer_config.json")
.with_model_file("v1-base-caption-visual.onnx") .with_special_tokens_map_file("blip/special_tokens_map.json")
}
pub fn blip_v1_base_caption_textual() -> Self {
Self::blip_textual()
.with_model_version(1.into())
.with_model_file("v1-base-caption-textual.onnx")
} }
} }
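With the visual/textual split folded into one `Config`, the two-options constructor pattern disappears. A usage sketch of the new entry point (tokenizer assets resolve from the hub paths wired above):

```rust
use usls::{models::Blip, Config, DataLoader};

fn main() -> anyhow::Result<()> {
    let mut model = Blip::new(Config::blip_v1_base_caption().commit()?)?;
    let images = DataLoader::try_read_n(&["./assets/bus.jpg"])?;
    let _image_embeds = model.encode_images(&images)?;
    let _text_ids = model.encode_texts(None)?; // None => unconditional captioning prompt
    Ok(())
}
```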

View File

@ -2,26 +2,34 @@ use aksr::Builder;
use anyhow::Result; use anyhow::Result;
use ndarray::{s, Axis}; use ndarray::{s, Axis};
use crate::{ use crate::{elapsed, Config, Engine, Image, LogitsSampler, Processor, Ts, Xs, X, Y};
elapsed,
models::{BaseModelTextual, BaseModelVisual},
Image, LogitsSampler, Options, Ts, Xs, X, Y,
};
#[derive(Debug, Builder)] #[derive(Debug, Builder)]
pub struct Blip { pub struct Blip {
visual: BaseModelVisual, visual: Engine,
textual: BaseModelTextual, textual: Engine,
ts: Ts, batch: usize,
height: usize,
width: usize,
processor: Processor,
max_length: usize, max_length: usize,
eos_token_id: u32, eos_token_id: u32,
ts: Ts,
} }
impl Blip { impl Blip {
pub fn new(options_visual: Options, options_textual: Options) -> Result<Self> { pub fn new(config: Config) -> Result<Self> {
let visual = BaseModelVisual::new(options_visual)?; let visual = Engine::try_from_config(&config.visual)?;
let textual = BaseModelTextual::new(options_textual)?; let textual = Engine::try_from_config(&config.textual)?;
let ts = Ts::merge(&[visual.engine().ts(), textual.engine().ts()]); let (batch, height, width) = (
visual.batch().opt(),
visual.try_height().unwrap_or(&384.into()).opt(),
visual.try_width().unwrap_or(&384.into()).opt(),
);
let ts = Ts::merge(&[visual.ts(), textual.ts()]);
let processor = Processor::try_from_config(&config.processor)?
.with_image_width(width as _)
.with_image_height(height as _);
let max_length = 512; let max_length = 512;
let eos_token_id = 102; let eos_token_id = 102;
@ -31,17 +39,24 @@ impl Blip {
ts, ts,
max_length, max_length,
eos_token_id, eos_token_id,
batch,
height,
width,
processor,
}) })
} }
pub fn encode_images(&mut self, xs: &[Image]) -> Result<X> { pub fn encode_images(&mut self, xs: &[Image]) -> Result<X> {
self.visual.encode(xs) let ys = self.processor.process_images(xs)?;
self.batch = xs.len(); // update
let ys = self.visual.run(ys.into())?;
Ok(ys[0].to_owned())
} }
pub fn encode_texts(&mut self, text: Option<&str>) -> Result<Vec<Vec<f32>>> { pub fn encode_texts(&mut self, text: Option<&str>) -> Result<Vec<Vec<f32>>> {
let input_ids = self let input_ids = self
.textual .processor
.processor()
.encode_text_ids(text.unwrap_or_default(), false)?; .encode_text_ids(text.unwrap_or_default(), false)?;
Ok(vec![input_ids.clone(); self.batch()]) Ok(vec![input_ids.clone(); self.batch()])
} }
@ -70,11 +85,11 @@ impl Blip {
let input_ids_attn_mask = X::ones(input_ids_nd.dims()); let input_ids_attn_mask = X::ones(input_ids_nd.dims());
// decode // decode
let outputs = self.textual.inference(Xs::from(vec![ let outputs = self.textual.run(Xs::from(vec![
input_ids_nd, input_ids_nd,
input_ids_attn_mask, input_ids_attn_mask,
image_embeds.clone(), image_embeds.clone(),
X::ones(&[self.visual().batch(), image_embeds.dims()[1]]), // image_embeds_attn_mask X::ones(&[self.batch(), image_embeds.dims()[1]]),
]))?; ]))?;
// decode each token for each batch // decode each token for each batch
@ -102,7 +117,7 @@ impl Blip {
} }
// batch decode // batch decode
let texts = self.textual.processor().decode_tokens_batch( let texts = self.processor.decode_tokens_batch(
&token_ids &token_ids
.into_iter() .into_iter()
.map(|v| v.into_iter().map(|x| x as u32).collect::<Vec<_>>()) .map(|v| v.into_iter().map(|x| x as u32).collect::<Vec<_>>())
@ -114,7 +129,6 @@ impl Blip {
.into_iter() .into_iter()
.map(|x| Y::default().with_texts(&[&x])) .map(|x| Y::default().with_texts(&[&x]))
.collect::<Vec<_>>(); .collect::<Vec<_>>();
// .into();
Ok(ys) Ok(ys)
} }
@ -122,8 +136,4 @@ impl Blip {
pub fn summary(&mut self) { pub fn summary(&mut self) {
self.ts.summary(); self.ts.summary();
} }
pub fn batch(&self) -> usize {
self.visual.batch() as _
}
} }

View File

@ -1,71 +1,57 @@
use crate::Kind;
/// Model configuration for `CLIP` /// Model configuration for `CLIP`
impl crate::Options { impl crate::Config {
pub fn clip() -> Self { pub fn clip() -> Self {
Self::default() Self::default()
.with_model_name("clip") .with_name("clip")
.with_model_ixx(0, 0, 1.into()) .with_batch_size_all(1)
} .with_visual_ixx(0, 1, 3.into())
.with_visual_ixx(0, 2, 224.into())
pub fn clip_visual() -> Self { .with_visual_ixx(0, 3, 224.into())
Self::clip()
.with_model_kind(Kind::Vision)
.with_model_ixx(0, 2, 224.into())
.with_model_ixx(0, 3, 224.into())
.with_image_mean(&[0.48145466, 0.4578275, 0.40821073]) .with_image_mean(&[0.48145466, 0.4578275, 0.40821073])
.with_image_std(&[0.26862954, 0.2613026, 0.2757771]) .with_image_std(&[0.26862954, 0.2613026, 0.2757771])
}
pub fn clip_textual() -> Self {
Self::clip()
.with_model_kind(Kind::Language)
.with_model_max_length(77) .with_model_max_length(77)
.with_tokenizer_file("clip/tokenizer.json")
.with_tokenizer_config_file("clip/tokenizer_config.json")
.with_special_tokens_map_file("clip/special_tokens_map.json")
.with_config_file("clip/config.json")
} }
pub fn clip_vit_b16_visual() -> Self { pub fn clip_vit_b16() -> Self {
Self::clip_visual().with_model_file("vit-b16-visual.onnx") Self::clip()
.with_visual_file("vit-b16-visual.onnx")
.with_textual_file("vit-b16-textual.onnx")
} }
pub fn clip_vit_b16_textual() -> Self { pub fn clip_vit_b32() -> Self {
Self::clip_textual().with_model_file("vit-b16-textual.onnx") Self::clip()
.with_visual_file("vit-b32-visual.onnx")
.with_textual_file("vit-b32-textual.onnx")
} }
pub fn clip_vit_b32_visual() -> Self { pub fn clip_vit_l14() -> Self {
Self::clip_visual().with_model_file("vit-b32-visual.onnx") Self::clip()
.with_visual_file("vit-l14-visual.onnx")
.with_textual_file("vit-l14-textual.onnx")
} }
pub fn clip_vit_b32_textual() -> Self { pub fn jina_clip() -> Self {
Self::clip_textual().with_model_file("vit-b32-textual.onnx") Self::default()
} .with_name("jina-clip-v1")
.with_batch_size_all(1)
pub fn clip_vit_l14_visual() -> Self { .with_visual_ixx(0, 1, 3.into())
Self::clip_visual().with_model_file("vit-l14-visual.onnx") .with_visual_ixx(0, 2, 224.into())
} .with_visual_ixx(0, 3, 224.into())
.with_image_mean(&[0.48145466, 0.4578275, 0.40821073])
pub fn clip_vit_l14_textual() -> Self { .with_image_std(&[0.26862954, 0.2613026, 0.2757771])
Self::clip_textual().with_model_file("vit-l14-textual.onnx") .with_tokenizer_file("jina-clip-v1/tokenizer.json")
.with_tokenizer_config_file("jina-clip-v1/tokenizer_config.json")
.with_special_tokens_map_file("jina-clip-v1/special_tokens_map.json")
.with_config_file("jina-clip-v1/config.json")
} }
pub fn jina_clip_v1() -> Self { pub fn jina_clip_v1() -> Self {
Self::default() Self::jina_clip()
.with_model_name("jina-clip-v1") .with_visual_file("visual.onnx")
.with_model_ixx(0, 0, 1.into()) .with_textual_file("textual.onnx")
}
pub fn jina_clip_v1_visual() -> Self {
Self::jina_clip_v1()
.with_model_kind(Kind::Vision)
.with_model_ixx(0, 2, 224.into())
.with_model_ixx(0, 3, 224.into())
.with_image_mean(&[0.48145466, 0.4578275, 0.40821073])
.with_image_std(&[0.26862954, 0.2613026, 0.2757771])
.with_model_file("visual.onnx")
}
pub fn jina_clip_v1_textual() -> Self {
Self::jina_clip_v1()
.with_model_kind(Kind::Language)
.with_model_file("textual.onnx")
} }
} }

View File

@ -2,11 +2,12 @@ use aksr::Builder;
use anyhow::Result; use anyhow::Result;
use ndarray::Array2; use ndarray::Array2;
use crate::{elapsed, Engine, Image, Options, Processor, Ts, Xs, X}; use crate::{elapsed, Config, Engine, Image, Processor, Ts, X};
#[derive(Debug, Builder)] #[derive(Debug, Builder)]
pub struct ClipVisual { pub struct Clip {
engine: Engine, visual: Engine,
textual: Engine,
height: usize, height: usize,
width: usize, width: usize,
batch: usize, batch: usize,
@ -14,22 +15,23 @@ pub struct ClipVisual {
ts: Ts, ts: Ts,
} }
impl ClipVisual { impl Clip {
pub fn new(options: Options) -> Result<Self> { pub fn new(config: Config) -> Result<Self> {
let engine = options.to_engine()?; let visual = Engine::try_from_config(&config.visual)?;
let (batch, height, width, ts) = ( let textual = Engine::try_from_config(&config.textual)?;
engine.batch().opt(), let (batch, height, width) = (
engine.try_height().unwrap_or(&224.into()).opt(), visual.batch().opt(),
engine.try_width().unwrap_or(&224.into()).opt(), visual.try_height().unwrap_or(&224.into()).opt(),
engine.ts.clone(), visual.try_width().unwrap_or(&224.into()).opt(),
); );
let processor = options let ts = Ts::merge(&[visual.ts(), textual.ts()]);
.to_processor()? let processor = Processor::try_from_config(&config.processor)?
.with_image_width(width as _) .with_image_width(width as _)
.with_image_height(height as _); .with_image_height(height as _);
Ok(Self { Ok(Self {
engine, textual,
visual,
height, height,
width, width,
batch, batch,
@ -38,111 +40,39 @@ impl ClipVisual {
}) })
} }
pub fn preprocess(&mut self, xs: &[Image]) -> Result<Xs> {
let x = self.processor.process_images(xs)?;
Ok(x.into())
}
pub fn inference(&mut self, xs: Xs) -> Result<Xs> {
self.engine.run(xs)
}
pub fn encode_images(&mut self, xs: &[Image]) -> Result<X> { pub fn encode_images(&mut self, xs: &[Image]) -> Result<X> {
let xs = elapsed!("visual-preprocess", self.ts, { self.preprocess(xs)? }); let xs = elapsed!("visual-preprocess", self.ts, {
let xs = elapsed!("visual-inference", self.ts, { self.inference(xs)? }); self.processor.process_images(xs)?
});
let xs = elapsed!("visual-inference", self.ts, { self.visual.run(xs.into())? });
let x = elapsed!("visual-postprocess", self.ts, { xs[0].to_owned() }); let x = elapsed!("visual-postprocess", self.ts, { xs[0].to_owned() });
Ok(x) Ok(x)
} }
}
#[derive(Debug, Builder)]
pub struct ClipTextual {
engine: Engine,
batch: usize,
processor: Processor,
ts: Ts,
}
impl ClipTextual {
pub fn new(options: Options) -> Result<Self> {
let engine = options.to_engine()?;
let (batch, ts) = (engine.batch().opt(), engine.ts.clone());
let processor = options.to_processor()?;
Ok(Self {
engine,
batch,
processor,
ts,
})
}
pub fn preprocess(&self, xs: &[&str]) -> Result<Xs> {
let encodings: Vec<f32> = self
.processor
.encode_texts_ids(xs, false)? // skip_special_tokens
.into_iter()
.flatten()
.collect();
let x: X = Array2::from_shape_vec((xs.len(), encodings.len() / xs.len()), encodings)?
.into_dyn()
.into();
Ok(x.into())
}
pub fn inference(&mut self, xs: Xs) -> Result<Xs> {
self.engine.run(xs)
}
pub fn encode_texts(&mut self, xs: &[&str]) -> Result<X> { pub fn encode_texts(&mut self, xs: &[&str]) -> Result<X> {
let xs = elapsed!("textual-preprocess", self.ts, { self.preprocess(xs)? }); let xs = elapsed!("textual-preprocess", self.ts, {
let xs = elapsed!("textual-inference", self.ts, { self.inference(xs)? }); let encodings: Vec<f32> = self
.processor
.encode_texts_ids(xs, false)?
.into_iter()
.flatten()
.collect();
let x: X = Array2::from_shape_vec((xs.len(), encodings.len() / xs.len()), encodings)?
.into_dyn()
.into();
x
});
let xs = elapsed!("textual-inference", self.ts, {
self.textual.run(xs.into())?
});
let x = elapsed!("textual-postprocess", self.ts, { xs[0].to_owned() }); let x = elapsed!("textual-postprocess", self.ts, { xs[0].to_owned() });
Ok(x) Ok(x)
} }
}
#[derive(Debug, Builder)]
pub struct Clip {
textual: ClipTextual,
visual: ClipVisual,
ts: Ts,
}
impl Clip {
pub fn new(options_visual: Options, options_textual: Options) -> Result<Self> {
let visual = ClipVisual::new(options_visual)?;
let textual = ClipTextual::new(options_textual)?;
// let ts = Ts::merge(&[visual.engine().ts(), textual.engine().ts()]);
let ts = Ts::default();
Ok(Self {
textual,
visual,
ts,
})
}
pub fn encode_images(&mut self, xs: &[Image]) -> Result<X> {
let x = elapsed!("encode_images", self.ts, { self.visual.encode_images(xs)? });
Ok(x)
}
pub fn encode_texts(&mut self, xs: &[&str]) -> Result<X> {
let x = elapsed!("encode_texts", self.ts, { self.textual.encode_texts(xs)? });
Ok(x)
}
pub fn summary(&mut self) { pub fn summary(&mut self) {
// self.ts.clear();
// self.ts = Ts::merge(&[&self.ts, self.visual.ts(), self.textual.ts()]);
self.ts.summary(); self.ts.summary();
self.visual.ts().summary();
self.textual.ts().summary();
} }
} }
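The same collapse applies here: one `Config`, one constructor, and `Clip` owns both engines. A usage sketch:

```rust
use usls::{models::Clip, Config, DataLoader};

fn main() -> anyhow::Result<()> {
    let mut model = Clip::new(Config::clip_vit_b16().commit()?)?;
    let images = DataLoader::try_read_n(&["./assets/bus.jpg"])?;
    let _image_feats = model.encode_images(&images)?;
    let _text_feats = model.encode_texts(&["a photo of a bus"])?;
    // similarity between the two feature sets would be computed here
    model.summary();
    Ok(())
}
```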

View File

@ -1,10 +1,10 @@
use crate::NAMES_IMAGENET_1K; use crate::NAMES_IMAGENET_1K;
/// Model configuration for `ConvNeXt` /// Model configuration for `ConvNeXt`
impl crate::Options { impl crate::Config {
pub fn convnext() -> Self { pub fn convnext() -> Self {
Self::default() Self::default()
.with_model_name("convnext") .with_name("convnext")
.with_model_ixx(0, 0, 1.into()) .with_model_ixx(0, 0, 1.into())
.with_model_ixx(0, 1, 3.into()) .with_model_ixx(0, 1, 3.into())
.with_model_ixx(0, 2, 224.into()) .with_model_ixx(0, 2, 224.into())
@ -13,6 +13,7 @@ impl crate::Options {
.with_image_std(&[0.229, 0.224, 0.225]) .with_image_std(&[0.229, 0.224, 0.225])
.with_normalize(true) .with_normalize(true)
.with_apply_softmax(true) .with_apply_softmax(true)
.with_topk(5)
.with_class_names(&NAMES_IMAGENET_1K) .with_class_names(&NAMES_IMAGENET_1K)
} }

View File

@ -1,7 +1,7 @@
/// Model configuration for `d_fine` /// Model configuration for `d_fine`
impl crate::Options { impl crate::Config {
pub fn d_fine() -> Self { pub fn d_fine() -> Self {
Self::rtdetr().with_model_name("d-fine") Self::rtdetr().with_name("d-fine")
} }
pub fn d_fine_n_coco() -> Self { pub fn d_fine_n_coco() -> Self {

View File

@ -1,8 +1,8 @@
/// Model configuration for [DB](https://github.com/MhLiao/DB) and [PaddleOCR-Det](https://github.com/PaddlePaddle/PaddleOCR) /// Model configuration for [DB](https://github.com/MhLiao/DB) and [PaddleOCR-Det](https://github.com/PaddlePaddle/PaddleOCR)
impl crate::Options { impl crate::Config {
pub fn db() -> Self { pub fn db() -> Self {
Self::default() Self::default()
.with_model_name("db") .with_name("db")
.with_model_ixx(0, 0, (1, 1, 8).into()) .with_model_ixx(0, 0, (1, 1, 8).into())
.with_model_ixx(0, 1, 3.into()) .with_model_ixx(0, 1, 3.into())
.with_model_ixx(0, 2, (608, 960, 1600).into()) .with_model_ixx(0, 2, (608, 960, 1600).into())
@ -11,7 +11,7 @@ impl crate::Options {
.with_normalize(true) .with_normalize(true)
.with_image_mean(&[0.485, 0.456, 0.406]) .with_image_mean(&[0.485, 0.456, 0.406])
.with_image_std(&[0.229, 0.224, 0.225]) .with_image_std(&[0.229, 0.224, 0.225])
.with_binary_thresh(0.2) .with_db_binary_thresh(0.2)
.with_class_confs(&[0.35]) .with_class_confs(&[0.35])
.with_min_width(5.0) .with_min_width(5.0)
.with_min_height(12.0) .with_min_height(12.0)

View File

@ -4,7 +4,7 @@ use ndarray::Axis;
use rayon::prelude::*; use rayon::prelude::*;
use crate::{ use crate::{
elapsed, DynConf, Engine, Hbb, Image, Mask, Obb, Ops, Options, Polygon, Processor, Ts, Xs, Y, elapsed, Config, DynConf, Engine, Hbb, Image, Mask, Obb, Ops, Polygon, Processor, Ts, Xs, Y,
}; };
#[derive(Debug, Builder)] #[derive(Debug, Builder)]
@ -24,8 +24,8 @@ pub struct DB {
} }
impl DB { impl DB {
pub fn new(options: Options) -> Result<Self> { pub fn new(config: Config) -> Result<Self> {
let engine = options.to_engine()?; let engine = Engine::try_from_config(&config.model)?;
let (batch, height, width, ts, spec) = ( let (batch, height, width, ts, spec) = (
engine.batch().opt(), engine.batch().opt(),
engine.try_height().unwrap_or(&960.into()).opt(), engine.try_height().unwrap_or(&960.into()).opt(),
@ -33,15 +33,14 @@ impl DB {
engine.ts.clone(), engine.ts.clone(),
engine.spec().to_owned(), engine.spec().to_owned(),
); );
let processor = options let processor = Processor::try_from_config(&config.processor)?
.to_processor()?
.with_image_width(width as _) .with_image_width(width as _)
.with_image_height(height as _); .with_image_height(height as _);
let confs = DynConf::new(options.class_confs(), 1); let confs = DynConf::new(config.class_confs(), 1);
let binary_thresh = options.binary_thresh().unwrap_or(0.2); let binary_thresh = config.db_binary_thresh().unwrap_or(0.2);
let unclip_ratio = options.unclip_ratio().unwrap_or(1.5); let unclip_ratio = config.db_unclip_ratio().unwrap_or(1.5);
let min_width = options.min_width().unwrap_or(12.0); let min_width = config.min_width().unwrap_or(12.0);
let min_height = options.min_height().unwrap_or(5.0); let min_height = config.min_height().unwrap_or(5.0);
Ok(Self { Ok(Self {
engine, engine,

View File

@ -1,7 +1,7 @@
/// Model configuration for `DEIM` /// Model configuration for `DEIM`
impl crate::Options { impl crate::Config {
pub fn deim() -> Self { pub fn deim() -> Self {
Self::d_fine().with_model_name("deim") Self::d_fine().with_name("deim")
} }
pub fn deim_dfine_s_coco() -> Self { pub fn deim_dfine_s_coco() -> Self {

View File

@ -1,10 +1,10 @@
use crate::NAMES_IMAGENET_1K; use crate::NAMES_IMAGENET_1K;
/// Model configuration for `DeiT` /// Model configuration for `DeiT`
impl crate::Options { impl crate::Config {
pub fn deit() -> Self { pub fn deit() -> Self {
Self::default() Self::default()
.with_model_name("deit") .with_name("deit")
.with_model_ixx(0, 0, 1.into()) .with_model_ixx(0, 0, 1.into())
.with_model_ixx(0, 1, 3.into()) .with_model_ixx(0, 1, 3.into())
.with_model_ixx(0, 2, 224.into()) .with_model_ixx(0, 2, 224.into())

View File

@ -1,8 +1,8 @@
/// Model configuration for `DepthAnything` /// Model configuration for `DepthAnything`
impl crate::Options { impl crate::Config {
pub fn depth_anything() -> Self { pub fn depth_anything() -> Self {
Self::default() Self::default()
.with_model_name("depth-anything") .with_name("depth-anything")
.with_model_ixx(0, 0, 1.into()) .with_model_ixx(0, 0, 1.into())
.with_model_ixx(0, 1, 3.into()) .with_model_ixx(0, 1, 3.into())
.with_model_ixx(0, 2, (384, 518, 1024).into()) .with_model_ixx(0, 2, (384, 518, 1024).into())
@ -14,26 +14,26 @@ impl crate::Options {
} }
pub fn depth_anything_s() -> Self { pub fn depth_anything_s() -> Self {
Self::depth_anything().with_model_scale(crate::Scale::S) Self::depth_anything().with_scale(crate::Scale::S)
} }
pub fn depth_anything_v1() -> Self { pub fn depth_anything_v1() -> Self {
Self::depth_anything().with_model_version(1.into()) Self::depth_anything().with_version(1.into())
} }
pub fn depth_anything_v2() -> Self { pub fn depth_anything_v2() -> Self {
Self::depth_anything().with_model_version(2.into()) Self::depth_anything().with_version(2.into())
} }
pub fn depth_anything_v1_small() -> Self { pub fn depth_anything_v1_small() -> Self {
Self::depth_anything_v1() Self::depth_anything_v1()
.with_model_scale(crate::Scale::S) .with_scale(crate::Scale::S)
.with_model_file("v1-s.onnx") .with_model_file("v1-s.onnx")
} }
pub fn depth_anything_v2_small() -> Self { pub fn depth_anything_v2_small() -> Self {
Self::depth_anything_v2() Self::depth_anything_v2()
.with_model_scale(crate::Scale::S) .with_scale(crate::Scale::S)
.with_model_file("v2-s.onnx") .with_model_file("v2-s.onnx")
} }
} }
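Version and scale now compose through plain setters, so the leaf constructors stay one-liners. A call-site sketch:

```rust
// v2 + Scale::S resolve to "v2-s.onnx" under the new naming.
let config = Config::depth_anything_v2_small().commit()?;
let mut model = usls::models::DepthAnything::new(config)?;
```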

View File

@ -1,7 +1,7 @@
use aksr::Builder; use aksr::Builder;
use anyhow::Result; use anyhow::Result;
use crate::{elapsed, Engine, Image, Mask, Ops, Options, Processor, Ts, Xs, Y}; use crate::{elapsed, Config, Engine, Image, Mask, Ops, Processor, Ts, Xs, Y};
#[derive(Debug, Builder)] #[derive(Debug, Builder)]
pub struct DepthAnything { pub struct DepthAnything {
@ -15,8 +15,8 @@ pub struct DepthAnything {
} }
impl DepthAnything { impl DepthAnything {
pub fn new(options: Options) -> Result<Self> { pub fn new(config: Config) -> Result<Self> {
let engine = options.to_engine()?; let engine = Engine::try_from_config(&config.model)?;
let spec = engine.spec().to_string(); let spec = engine.spec().to_string();
let (batch, height, width, ts) = ( let (batch, height, width, ts) = (
@ -25,9 +25,7 @@ impl DepthAnything {
engine.try_width().unwrap_or(&518.into()).opt(), engine.try_width().unwrap_or(&518.into()).opt(),
engine.ts().clone(), engine.ts().clone(),
); );
let processor = Processor::try_from_config(&config.processor)?
let processor = options
.to_processor()?
.with_image_width(width as _) .with_image_width(width as _)
.with_image_height(height as _); .with_image_height(height as _);

View File

@ -1,8 +1,8 @@
/// Model configuration for `DepthPro` /// Model configuration for `DepthPro`
impl crate::Options { impl crate::Config {
pub fn depth_pro() -> Self { pub fn depth_pro() -> Self {
Self::default() Self::default()
.with_model_name("depth-pro") .with_name("depth-pro")
.with_model_ixx(0, 0, 1.into()) // batch. Note: currently only supports batch_size = 1 .with_model_ixx(0, 0, 1.into()) // batch. Note: currently only supports batch_size = 1
.with_model_ixx(0, 1, 3.into()) .with_model_ixx(0, 1, 3.into())
.with_model_ixx(0, 2, 1536.into()) .with_model_ixx(0, 2, 1536.into())
@ -11,17 +11,6 @@ impl crate::Options {
.with_image_std(&[0.5, 0.5, 0.5]) .with_image_std(&[0.5, 0.5, 0.5])
.with_resize_mode(crate::ResizeMode::FitExact) .with_resize_mode(crate::ResizeMode::FitExact)
.with_normalize(true) .with_normalize(true)
.with_model_file("model.onnx")
} }
// pub fn depth_pro_q4f16() -> Self {
// Self::depth_pro().with_model_file("q4f16.onnx")
// }
// pub fn depth_pro_fp16() -> Self {
// Self::depth_pro().with_model_file("fp16.onnx")
// }
// pub fn depth_pro_bnb4() -> Self {
// Self::depth_pro().with_model_file("bnb4.onnx")
// }
} }

View File

@ -2,7 +2,7 @@ use aksr::Builder;
use anyhow::Result; use anyhow::Result;
use ndarray::Axis; use ndarray::Axis;
use crate::{elapsed, Engine, Image, Mask, Ops, Options, Processor, Ts, Xs, Y}; use crate::{elapsed, Config, Engine, Image, Mask, Ops, Processor, Ts, Xs, Y};
#[derive(Builder, Debug)] #[derive(Builder, Debug)]
pub struct DepthPro { pub struct DepthPro {
@ -16,8 +16,8 @@ pub struct DepthPro {
} }
impl DepthPro { impl DepthPro {
pub fn new(options: Options) -> Result<Self> { pub fn new(config: Config) -> Result<Self> {
let engine = options.to_engine()?; let engine = Engine::try_from_config(&config.model)?;
let spec = engine.spec().to_string(); let spec = engine.spec().to_string();
let (batch, height, width, ts) = ( let (batch, height, width, ts) = (
engine.batch().opt(), engine.batch().opt(),
@ -25,8 +25,7 @@ impl DepthPro {
engine.try_width().unwrap_or(&512.into()).opt(), engine.try_width().unwrap_or(&512.into()).opt(),
engine.ts().clone(), engine.ts().clone(),
); );
let processor = options let processor = Processor::try_from_config(&config.processor)?
.to_processor()?
.with_image_width(width as _) .with_image_width(width as _)
.with_image_height(height as _); .with_image_height(height as _);

View File

@ -1,8 +1,8 @@
/// Model configuration for `DINOv2` /// Model configuration for `DINOv2`
impl crate::Options { impl crate::Config {
pub fn dinov2() -> Self { pub fn dinov2() -> Self {
Self::default() Self::default()
.with_model_name("dinov2") .with_name("dinov2")
.with_model_ixx(0, 0, (1, 1, 8).into()) .with_model_ixx(0, 0, (1, 1, 8).into())
.with_model_ixx(0, 1, 3.into()) .with_model_ixx(0, 1, 3.into())
.with_model_ixx(0, 2, 224.into()) .with_model_ixx(0, 2, 224.into())
@ -16,13 +16,13 @@ impl crate::Options {
pub fn dinov2_small() -> Self { pub fn dinov2_small() -> Self {
Self::dinov2() Self::dinov2()
.with_model_scale(crate::Scale::S) .with_scale(crate::Scale::S)
.with_model_file("s.onnx") .with_model_file("s.onnx")
} }
pub fn dinov2_base() -> Self { pub fn dinov2_base() -> Self {
Self::dinov2() Self::dinov2()
.with_model_scale(crate::Scale::B) .with_scale(crate::Scale::B)
.with_model_file("b.onnx") .with_model_file("b.onnx")
} }
} }
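Since the embedding dimension is derived from `Config::scale` in the model hunk below (S = 384, B = 768, L = 1024), picking a preset is enough:

```rust
// Scale::B is baked into the preset, so dim resolves to 768.
let mut model = usls::models::DINOv2::new(Config::dinov2_base().commit()?)?;
```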

View File

@ -1,7 +1,7 @@
use aksr::Builder; use aksr::Builder;
use anyhow::Result; use anyhow::Result;
use crate::{elapsed, Engine, Image, Options, Processor, Scale, Ts, Xs, X}; use crate::{elapsed, Config, Engine, Image, Processor, Scale, Ts, Xs, X};
#[derive(Builder, Debug)] #[derive(Builder, Debug)]
pub struct DINOv2 { pub struct DINOv2 {
@ -15,15 +15,15 @@ pub struct DINOv2 {
} }
impl DINOv2 { impl DINOv2 {
pub fn new(options: Options) -> Result<Self> { pub fn new(config: Config) -> Result<Self> {
let engine = options.to_engine()?; let engine = Engine::try_from_config(&config.model)?;
let (batch, height, width, ts) = ( let (batch, height, width, ts) = (
engine.batch().opt(), engine.batch().opt(),
engine.try_height().unwrap_or(&384.into()).opt(), engine.try_height().unwrap_or(&384.into()).opt(),
engine.try_width().unwrap_or(&384.into()).opt(), engine.try_width().unwrap_or(&384.into()).opt(),
engine.ts.clone(), engine.ts.clone(),
); );
let dim = match options.model_scale() { let dim = match &config.scale {
Some(Scale::S) => 384, Some(Scale::S) => 384,
Some(Scale::B) => 768, Some(Scale::B) => 768,
Some(Scale::L) => 1024, Some(Scale::L) => 1024,
@ -31,8 +31,7 @@ impl DINOv2 {
Some(x) => anyhow::bail!("Unsupported scale: {:?}", x), Some(x) => anyhow::bail!("Unsupported scale: {:?}", x),
None => anyhow::bail!("No model scale specified"), None => anyhow::bail!("No model scale specified"),
}; };
let processor = options let processor = Processor::try_from_config(&config.processor)?
.to_processor()?
.with_image_width(width as _) .with_image_width(width as _)
.with_image_height(height as _); .with_image_height(height as _);

View File

@ -1,8 +1,8 @@
/// Model configuration for [FAST: Faster Arbitrarily-Shaped Text Detector with Minimalist Kernel Representation](https://github.com/czczup/FAST) /// Model configuration for [FAST: Faster Arbitrarily-Shaped Text Detector with Minimalist Kernel Representation](https://github.com/czczup/FAST)
impl crate::Options { impl crate::Config {
pub fn fast() -> Self { pub fn fast() -> Self {
Self::db() Self::db()
.with_model_name("fast") .with_name("fast")
.with_image_mean(&[0.798, 0.785, 0.772]) .with_image_mean(&[0.798, 0.785, 0.772])
.with_image_std(&[0.264, 0.2749, 0.287]) .with_image_std(&[0.264, 0.2749, 0.287])
} }

View File

@ -1,10 +1,10 @@
use crate::NAMES_IMAGENET_1K; use crate::NAMES_IMAGENET_1K;
/// Model configuration for `FastViT` /// Model configuration for `FastViT`
impl crate::Options { impl crate::Config {
pub fn fastvit() -> Self { pub fn fastvit() -> Self {
Self::default() Self::default()
.with_model_name("fastvit") .with_name("fastvit")
.with_model_ixx(0, 0, 1.into()) .with_model_ixx(0, 0, 1.into())
.with_model_ixx(0, 1, 3.into()) .with_model_ixx(0, 1, 3.into())
.with_model_ixx(0, 2, 224.into()) .with_model_ixx(0, 2, 224.into())

View File

@ -1,59 +1,31 @@
/// Model configuration for `Florence2` /// Model configuration for `Florence2`
impl crate::Options { impl crate::Config {
pub fn florence2() -> Self { pub fn florence2() -> Self {
Self::default() Self::default()
.with_model_name("florence2") .with_name("florence2")
.with_batch_size(1) .with_batch_size_all(1)
} .with_visual_ixx(0, 1, 3.into())
.with_visual_ixx(0, 2, 768.into())
pub fn florence2_visual() -> Self { .with_visual_ixx(0, 3, 768.into())
Self::florence2()
.with_model_kind(crate::Kind::Vision)
.with_model_ixx(0, 2, 768.into())
.with_model_ixx(0, 3, 768.into())
.with_image_mean(&[0.485, 0.456, 0.406]) .with_image_mean(&[0.485, 0.456, 0.406])
.with_image_std(&[0.229, 0.224, 0.225]) .with_image_std(&[0.229, 0.224, 0.225])
.with_resize_filter("Bilinear")
.with_normalize(true)
} }
pub fn florence2_textual() -> Self { pub fn florence2_base() -> Self {
Self::florence2().with_model_kind(crate::Kind::Language) Self::florence2()
.with_scale(crate::Scale::B)
.with_visual_file("base-vision-encoder.onnx")
.with_textual_file("base-embed-tokens.onnx")
.with_textual_encoder_file("base-encoder.onnx")
.with_textual_decoder_file("base-decoder.onnx")
.with_textual_decoder_merged_file("base-decoder-merged.onnx")
.with_tokenizer_file("florence2/tokenizer.json")
.with_config_file("florence2/config.json")
.with_special_tokens_map_file("florence2/special_tokens_map.json")
.with_tokenizer_config_file("florence2/tokenizer_config.json")
} }
pub fn florence2_visual_base() -> Self { pub fn florence2_large() -> Self {
Self::florence2_visual().with_model_scale(crate::Scale::B) todo!()
}
pub fn florence2_textual_base() -> Self {
Self::florence2_textual().with_model_scale(crate::Scale::B)
}
pub fn florence2_visual_large() -> Self {
Self::florence2_visual().with_model_scale(crate::Scale::L)
}
pub fn florence2_textual_large() -> Self {
Self::florence2_textual().with_model_scale(crate::Scale::L)
}
pub fn florence2_visual_encoder_base() -> Self {
Self::florence2_visual_base().with_model_file("base-vision-encoder.onnx")
}
pub fn florence2_textual_embed_base() -> Self {
Self::florence2_textual_base().with_model_file("base-embed-tokens.onnx")
}
pub fn florence2_texual_encoder_base() -> Self {
Self::florence2_textual_base().with_model_file("base-encoder.onnx")
}
pub fn florence2_texual_decoder_base() -> Self {
Self::florence2_textual_base().with_model_file("base-decoder.onnx")
}
pub fn florence2_texual_decoder_merged_base() -> Self {
Self::florence2_textual_base().with_model_file("base-decoder-merged.onnx")
} }
} }
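All five graphs (vision encoder, token embedding, text encoder, decoder, merged decoder) plus the tokenizer assets now hang off a single preset, so construction is one call. A sketch (the large preset is still `todo!()` as above):

```rust
let mut model = usls::models::Florence2::new(Config::florence2_base().commit()?)?;
```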

View File

@ -4,51 +4,56 @@ use ndarray::{s, Axis};
use rayon::prelude::*; use rayon::prelude::*;
use crate::{ use crate::{
elapsed, elapsed, models::Quantizer, Config, Engine, Hbb, Image, LogitsSampler, Polygon, Processor,
models::{BaseModelTextual, BaseModelVisual, Quantizer}, Scale, Task, Ts, Xs, X, Y,
Hbb, Image, LogitsSampler, Options, Polygon, Scale, Task, Ts, Xs, X, Y,
}; };
#[derive(Debug, Builder)] #[derive(Debug, Builder)]
pub struct Florence2 { pub struct Florence2 {
pub vision_encoder: BaseModelVisual, pub vision_encoder: Engine,
pub text_embed: BaseModelTextual, pub text_embed: Engine,
pub encoder: BaseModelTextual, pub encoder: Engine,
pub decoder: BaseModelTextual, pub decoder: Engine,
pub decoder_merged: BaseModelTextual, pub decoder_merged: Engine,
ts: Ts, ts: Ts,
quantizer: Quantizer, quantizer: Quantizer,
max_length: usize, max_length: usize,
eos_token_id: u32, eos_token_id: u32,
decoder_start_token_id: u32, decoder_start_token_id: u32,
n_kvs: usize, n_kvs: usize,
height: usize,
width: usize,
batch: usize,
processor: Processor,
} }
impl Florence2 { impl Florence2 {
pub fn new( pub fn new(config: Config) -> Result<Self> {
options_vision_encoder: Options, let vision_encoder = Engine::try_from_config(&config.visual)?;
options_text_embed: Options, let text_embed = Engine::try_from_config(&config.textual)?;
options_encoder: Options, let encoder = Engine::try_from_config(&config.textual_encoder)?;
options_decoder: Options, let decoder = Engine::try_from_config(&config.textual_decoder)?;
options_decoder_merged: Options, let decoder_merged = Engine::try_from_config(&config.textual_decoder_merged)?;
) -> Result<Self> { let (batch, height, width) = (
let vision_encoder = BaseModelVisual::new(options_vision_encoder)?; vision_encoder.batch().opt(),
let text_embed = BaseModelTextual::new(options_text_embed)?; vision_encoder.try_height().unwrap_or(&1024.into()).opt(),
let encoder = BaseModelTextual::new(options_encoder)?; vision_encoder.try_width().unwrap_or(&1024.into()).opt(),
let decoder = BaseModelTextual::new(options_decoder)?; );
let decoder_merged = BaseModelTextual::new(options_decoder_merged)?; let processor = Processor::try_from_config(&config.processor)?
.with_image_width(width as _)
.with_image_height(height as _);
let quantizer = Quantizer::default(); let quantizer = Quantizer::default();
let ts = Ts::merge(&[ let ts = Ts::merge(&[
vision_encoder.engine().ts(), vision_encoder.ts(),
text_embed.engine().ts(), text_embed.ts(),
encoder.engine().ts(), encoder.ts(),
decoder.engine().ts(), decoder.ts(),
decoder_merged.engine().ts(), decoder_merged.ts(),
]); ]);
let max_length = 1024; let max_length = 1024;
let eos_token_id = 2; let eos_token_id = 2;
let decoder_start_token_id = 2; let decoder_start_token_id = 2;
let n_kvs = match decoder.scale() { let n_kvs = match config.scale {
Some(Scale::B) => 6, Some(Scale::B) => 6,
Some(Scale::L) => 12, Some(Scale::L) => 12,
_ => unimplemented!(), _ => unimplemented!(),
@ -66,6 +71,10 @@ impl Florence2 {
eos_token_id, eos_token_id,
decoder_start_token_id, decoder_start_token_id,
n_kvs, n_kvs,
batch,
height,
width,
processor,
}) })
} }
@ -97,12 +106,12 @@ impl Florence2 {
.map(|im| { .map(|im| {
let text = Self::process_task(task, im.height() as _, im.width() as _) let text = Self::process_task(task, im.height() as _, im.width() as _)
.prompt_for_florence2()?; .prompt_for_florence2()?;
let ids = self.text_embed.processor().encode_text_ids(&text, true)?; let ids = self.processor.encode_text_ids(&text, true)?;
X::from(ids).insert_axis(0) X::from(ids).insert_axis(0)
}) })
.collect::<Result<Vec<_>, _>>()?; .collect::<Result<Vec<_>, _>>()?;
let x = X::concat(&xs, 0)?; let x = X::concat(&xs, 0)?;
let xs = self.text_embed.inference(x.into())?; let xs = self.text_embed.run(x.into())?;
let x = xs[0].to_owned(); let x = xs[0].to_owned();
Ok(x) Ok(x)
@ -110,7 +119,10 @@ impl Florence2 {
pub fn forward(&mut self, xs_visual: &[Image], x_textual: &Task) -> Result<Vec<Y>> { pub fn forward(&mut self, xs_visual: &[Image], x_textual: &Task) -> Result<Vec<Y>> {
let visual_embeddings = elapsed!("visual-encode", self.ts, { let visual_embeddings = elapsed!("visual-encode", self.ts, {
self.vision_encoder.encode(xs_visual)? let xs = self.processor.process_images(xs_visual)?;
self.batch = xs_visual.len(); // update
let xs = self.vision_encoder.run(xs.into())?;
xs[0].to_owned()
}); });
let textual_embedding = elapsed!("textual-encode", self.ts, { let textual_embedding = elapsed!("textual-encode", self.ts, {
@ -141,7 +153,7 @@ impl Florence2 {
let attention_mask = X::ones(&[self.batch(), inputs_embeds.dims()[1]]); let attention_mask = X::ones(&[self.batch(), inputs_embeds.dims()[1]]);
// encoder // encoder
let last_hidden_state = self.encoder.inference(Xs::from(vec![ let last_hidden_state = self.encoder.run(Xs::from(vec![
attention_mask.clone(), attention_mask.clone(),
inputs_embeds.clone(), inputs_embeds.clone(),
]))?[0] ]))?[0]
@ -150,7 +162,7 @@ impl Florence2 {
// decoder // decoder
let inputs_embeds = inputs_embeds.slice(s![.., -1.., ..]); let inputs_embeds = inputs_embeds.slice(s![.., -1.., ..]);
let inputs_embeds = X::from(inputs_embeds.to_owned().into_dyn()); let inputs_embeds = X::from(inputs_embeds.to_owned().into_dyn());
let mut decoder_outputs = self.decoder.inference(Xs::from(vec![ let mut decoder_outputs = self.decoder.run(Xs::from(vec![
attention_mask.clone(), attention_mask.clone(),
last_hidden_state.clone(), last_hidden_state.clone(),
inputs_embeds, inputs_embeds,
@ -215,7 +227,7 @@ impl Florence2 {
// decode // decode
let next_tokens = X::from(last_tokens.clone()).insert_axis(1)?; let next_tokens = X::from(last_tokens.clone()).insert_axis(1)?;
let inputs_embeds = &self.text_embed.inference(Xs::from(next_tokens))?[0].clone(); let inputs_embeds = &self.text_embed.run(Xs::from(next_tokens))?[0].clone();
let use_cache = X::ones(&[1]); let use_cache = X::ones(&[1]);
let mut xs = vec![ let mut xs = vec![
attention_mask.clone(), attention_mask.clone(),
@ -229,13 +241,13 @@ impl Florence2 {
xs.push(encoder_kvs[i * 2 + 1].clone()); xs.push(encoder_kvs[i * 2 + 1].clone());
} }
xs.push(use_cache); xs.push(use_cache);
decoder_outputs = self.decoder_merged.inference(xs.into())?; decoder_outputs = self.decoder_merged.run(xs.into())?;
} }
// batch decode // batch decode
let texts = self let texts = self
.text_embed // .text_embed
.processor() .processor
.decode_tokens_batch(&token_ids, false)?; .decode_tokens_batch(&token_ids, false)?;
Ok(texts) Ok(texts)
@ -416,10 +428,6 @@ impl Florence2 {
Ok(ys) Ok(ys)
} }
pub fn batch(&self) -> usize {
self.vision_encoder.batch() as _
}
pub fn summary(&mut self) { pub fn summary(&mut self) {
self.ts.summary(); self.ts.summary();
} }

View File

@ -1,9 +1,8 @@
/// Model configuration for `GroundingDino` /// Model configuration for `GroundingDino`
impl crate::Options { impl crate::Config {
pub fn grounding_dino() -> Self { pub fn grounding_dino() -> Self {
Self::default() Self::default()
.with_model_name("grounding-dino") .with_name("grounding-dino")
.with_model_kind(crate::Kind::VisionLanguage)
.with_model_ixx(0, 0, 1.into()) // TODO: current onnx model does not support bs > 1 .with_model_ixx(0, 0, 1.into()) // TODO: current onnx model does not support bs > 1
.with_model_ixx(0, 2, 800.into()) // TODO: matters .with_model_ixx(0, 2, 800.into()) // TODO: matters
.with_model_ixx(0, 3, 1200.into()) // TODO: matters .with_model_ixx(0, 3, 1200.into()) // TODO: matters
@ -11,9 +10,10 @@ impl crate::Options {
.with_resize_filter("CatmullRom") .with_resize_filter("CatmullRom")
.with_image_mean(&[0.485, 0.456, 0.406]) .with_image_mean(&[0.485, 0.456, 0.406])
.with_image_std(&[0.229, 0.224, 0.225]) .with_image_std(&[0.229, 0.224, 0.225])
.with_normalize(true) .with_tokenizer_file("grounding-dino/tokenizer.json")
.with_class_confs(&[0.25]) .with_config_file("grounding-dino/config.json")
.with_text_confs(&[0.25]) .with_special_tokens_map_file("grounding-dino/special_tokens_map.json")
.with_tokenizer_config_file("grounding-dino/tokenizer_config.json")
} }
pub fn grounding_dino_tiny() -> Self { pub fn grounding_dino_tiny() -> Self {

View File

@ -4,7 +4,7 @@ use ndarray::{s, Array2, Axis};
use rayon::prelude::*; use rayon::prelude::*;
use std::fmt::Write; use std::fmt::Write;
use crate::{elapsed, DynConf, Engine, Hbb, Image, Options, Processor, Ts, Xs, X, Y}; use crate::{elapsed, Config, DynConf, Engine, Hbb, Image, Processor, Ts, Xs, X, Y};
#[derive(Builder, Debug)] #[derive(Builder, Debug)]
pub struct GroundingDINO { pub struct GroundingDINO {
@ -24,8 +24,8 @@ pub struct GroundingDINO {
} }
impl GroundingDINO { impl GroundingDINO {
pub fn new(options: Options) -> Result<Self> { pub fn new(config: Config) -> Result<Self> {
let engine = options.to_engine()?; let engine = Engine::try_from_config(&config.model)?;
let spec = engine.spec().to_string(); let spec = engine.spec().to_string();
let (batch, height, width, ts) = ( let (batch, height, width, ts) = (
engine.batch().opt(), engine.batch().opt(),
@ -33,31 +33,29 @@ impl GroundingDINO {
engine.try_width().unwrap_or(&1200.into()).opt(), engine.try_width().unwrap_or(&1200.into()).opt(),
engine.ts().clone(), engine.ts().clone(),
); );
let processor = options let class_names: Vec<_> = config
.to_processor()?
.with_image_width(width as _)
.with_image_height(height as _);
let class_names = options
.text_names .text_names
.as_ref() .iter()
.and_then(|v| { .map(|s| s.trim().to_ascii_lowercase())
let v: Vec<_> = v .filter(|s| !s.is_empty())
.iter() .collect();
.map(|s| s.trim().to_ascii_lowercase()) if class_names.is_empty() {
.filter(|s| !s.is_empty()) anyhow::bail!(
.collect(); "No valid class names were provided in the config. Ensure the 'text_names' field is non-empty and contains valid class names."
(!v.is_empty()).then_some(v) );
}) }
.ok_or_else(|| anyhow::anyhow!("No valid class names were provided in the options. Ensure the 'text_names' field is non-empty and contains valid class names."))?;
let text_prompt = class_names.iter().fold(String::new(), |mut acc, text| { let text_prompt = class_names.iter().fold(String::new(), |mut acc, text| {
write!(&mut acc, "{}.", text).unwrap(); write!(&mut acc, "{}.", text).unwrap();
acc acc
}); });
let confs_visual = DynConf::new(config.class_confs(), class_names.len());
let confs_textual = DynConf::new(config.text_confs(), class_names.len());
let processor = Processor::try_from_config(&config.processor)?
.with_image_width(width as _)
.with_image_height(height as _);
let token_ids = processor.encode_text_ids(&text_prompt, true)?; let token_ids = processor.encode_text_ids(&text_prompt, true)?;
let tokens = processor.encode_text_tokens(&text_prompt, true)?; let tokens = processor.encode_text_tokens(&text_prompt, true)?;
let class_ids_map = Self::process_class_ids(&tokens); let class_ids_map = Self::process_class_ids(&tokens);
let confs_visual = DynConf::new(options.class_confs(), class_names.len());
let confs_textual = DynConf::new(options.text_confs(), class_names.len());
Ok(Self { Ok(Self {
engine, engine,

View File

@ -1,8 +1,8 @@
/// Model configuration for [LinkNet: Exploiting Encoder Representations for Efficient Semantic Segmentation](https://arxiv.org/abs/1707.03718) /// Model configuration for [LinkNet: Exploiting Encoder Representations for Efficient Semantic Segmentation](https://arxiv.org/abs/1707.03718)
impl crate::Options { impl crate::Config {
pub fn linknet() -> Self { pub fn linknet() -> Self {
Self::fast() Self::fast()
.with_model_name("linknet") .with_name("linknet")
.with_image_mean(&[0.798, 0.785, 0.772]) .with_image_mean(&[0.798, 0.785, 0.772])
.with_image_std(&[0.264, 0.2749, 0.287]) .with_image_std(&[0.264, 0.2749, 0.287])
} }

View File

@ -1,10 +1,10 @@
use crate::NAMES_IMAGENET_1K; use crate::NAMES_IMAGENET_1K;
/// Model configuration for `MobileOne` /// Model configuration for `MobileOne`
impl crate::Options { impl crate::Config {
pub fn mobileone() -> Self { pub fn mobileone() -> Self {
Self::default() Self::default()
.with_model_name("mobileone") .with_name("mobileone")
.with_model_ixx(0, 0, 1.into()) .with_model_ixx(0, 0, 1.into())
.with_model_ixx(0, 1, 3.into()) .with_model_ixx(0, 1, 3.into())
.with_model_ixx(0, 2, 224.into()) .with_model_ixx(0, 2, 224.into())

View File

@ -1,8 +1,8 @@
/// Model configuration for `MODNet` /// Model configuration for `MODNet`
impl crate::Options { impl crate::Config {
pub fn modnet() -> Self { pub fn modnet() -> Self {
Self::default() Self::default()
.with_model_name("modnet") .with_name("modnet")
.with_model_ixx(0, 0, 1.into()) .with_model_ixx(0, 0, 1.into())
.with_model_ixx(0, 2, (416, 512, 800).into()) .with_model_ixx(0, 2, (416, 512, 800).into())
.with_model_ixx(0, 3, (416, 512, 800).into()) .with_model_ixx(0, 3, (416, 512, 800).into())

View File

@ -2,7 +2,7 @@ use aksr::Builder;
use anyhow::Result; use anyhow::Result;
use ndarray::Axis; use ndarray::Axis;
use crate::{elapsed, Engine, Image, Mask, Ops, Options, Processor, Ts, Xs, Y}; use crate::{elapsed, Config, Engine, Image, Mask, Ops, Processor, Ts, Xs, Y};
#[derive(Builder, Debug)] #[derive(Builder, Debug)]
pub struct MODNet { pub struct MODNet {
@ -16,8 +16,8 @@ pub struct MODNet {
} }
impl MODNet { impl MODNet {
pub fn new(options: Options) -> Result<Self> { pub fn new(config: Config) -> Result<Self> {
let engine = options.to_engine()?; let engine = Engine::try_from_config(&config.model)?;
let spec = engine.spec().to_string(); let spec = engine.spec().to_string();
let (batch, height, width, ts) = ( let (batch, height, width, ts) = (
engine.batch().opt(), engine.batch().opt(),
@ -25,8 +25,7 @@ impl MODNet {
engine.try_width().unwrap_or(&512.into()).opt(), engine.try_width().unwrap_or(&512.into()).opt(),
engine.ts().clone(), engine.ts().clone(),
); );
let processor = options let processor = Processor::try_from_config(&config.processor)?
.to_processor()?
.with_image_width(width as _) .with_image_width(width as _)
.with_image_height(height as _); .with_image_height(height as _);

View File

@ -1,117 +1,47 @@
/// Model configuration for `moondream2` /// Model configuration for `moondream2`
impl crate::Options { impl crate::Config {
pub fn moondream2() -> Self { pub fn moondream2() -> Self {
Self::default() Self::default()
.with_model_name("moondream2") .with_name("moondream2")
.with_model_num_dry_run(0) .with_visual_encoder_ixx(0, 0, (1, 3, 4).into()) // patch count
.with_image_mean(&[0.5, 0.5, 0.5])
.with_image_std(&[0.5, 0.5, 0.5])
.with_resize_mode(crate::ResizeMode::FitExact)
.with_resize_filter("catmullrom")
.with_visual_projection_ixx(0, 0, 1.into())
.with_textual_encoder_ixx(0, 0, 1.into())
.with_textual_decoder_ixx(0, 0, 1.into())
.with_size_encoder_ixx(0, 0, 1.into())
.with_size_decoder_ixx(0, 0, 1.into())
.with_coord_encoder_ixx(0, 0, 1.into())
.with_coord_decoder_ixx(0, 0, 1.into())
.with_tokenizer_file("moondream2/tokenizer.json")
.with_tokenizer_config_file("moondream2/tokenizer_config.json")
} }
pub fn moondream2_0_5b() -> Self { pub fn moondream2_0_5b() -> Self {
Self::moondream2().with_model_scale(crate::Scale::Billion(0.5)) Self::moondream2()
.with_scale(crate::Scale::Billion(0.5))
.with_visual_encoder_file("0.5b-vision-encoder.onnx")
.with_visual_projection_file("0.5b-vision-projection.onnx")
.with_textual_decoder_file("0.5b-text-decoder.onnx")
.with_textual_encoder_file("0.5b-text-encoder.onnx")
.with_coord_encoder_file("0.5b-coord-encoder.onnx")
.with_coord_decoder_file("0.5b-coord-decoder.onnx")
.with_size_encoder_file("0.5b-size-encoder.onnx")
.with_size_decoder_file("0.5b-size-decoder.onnx")
} }
pub fn moondream2_0_5b_vision_encoder() -> Self { pub fn moondream2_2b() -> Self {
Self::moondream2_0_5b() Self::moondream2()
.with_model_ixx(0, 0, (1, 3, 4).into()) // patch count .with_scale(crate::Scale::Billion(2.))
.with_model_kind(crate::Kind::Vision) .with_visual_encoder_file("2b-vision-encoder.onnx")
.with_image_mean(&[0.5, 0.5, 0.5]) .with_visual_projection_file("2b-vision-projection.onnx")
.with_image_std(&[0.5, 0.5, 0.5]) .with_textual_decoder_file("2b-text-decoder.onnx")
.with_normalize(true) .with_textual_encoder_file("2b-text-encoder.onnx")
.with_resize_mode(crate::ResizeMode::FitExact) .with_coord_encoder_file("2b-coord-encoder.onnx")
.with_resize_filter("catmullrom") .with_coord_decoder_file("2b-coord-decoder.onnx")
.with_model_file("0.5b-vision-encoder.onnx") .with_size_encoder_file("2b-size-encoder.onnx")
} .with_size_decoder_file("2b-size-decoder.onnx")
pub fn moondream2_0_5b_vision_projection() -> Self {
Self::moondream2_0_5b()
.with_batch_size(1)
.with_model_kind(crate::Kind::Vision)
.with_model_file("0.5b-vision-projection.onnx")
}
pub fn moondream2_0_5b_text_decoder() -> Self {
Self::moondream2_0_5b()
.with_batch_size(1)
.with_model_kind(crate::Kind::Language)
.with_model_file("0.5b-text-decoder.onnx")
}
pub fn moondream2_0_5b_text_encoder() -> Self {
Self::moondream2_0_5b()
.with_batch_size(1)
.with_model_kind(crate::Kind::Language)
.with_model_file("0.5b-text-encoder.onnx")
}
pub fn moondream2_0_5b_coord_encoder() -> Self {
Self::moondream2_0_5b()
.with_batch_size(1)
.with_model_file("0.5b-coord-encoder.onnx")
}
pub fn moondream2_0_5b_coord_decoder() -> Self {
Self::moondream2_0_5b()
.with_batch_size(1)
.with_model_file("0.5b-coord-decoder.onnx")
}
pub fn moondream2_0_5b_size_encoder() -> Self {
Self::moondream2_0_5b()
.with_batch_size(1)
.with_model_file("0.5b-size-encoder.onnx")
}
pub fn moondream2_0_5b_size_decoder() -> Self {
Self::moondream2_0_5b()
.with_batch_size(1)
.with_model_file("0.5b-size-decoder.onnx")
}
pub fn moondream2_2b_vision_encoder() -> Self {
Self::moondream2_0_5b_vision_encoder()
.with_model_scale(crate::Scale::Billion(2.))
.with_model_file("2b-vision-encoder.onnx")
}
pub fn moondream2_2b_vision_projection() -> Self {
Self::moondream2_0_5b_vision_projection()
.with_model_scale(crate::Scale::Billion(2.))
.with_model_file("2b-vision-projection.onnx")
}
pub fn moondream2_2b_text_decoder() -> Self {
Self::moondream2_0_5b_text_decoder()
.with_model_scale(crate::Scale::Billion(2.))
.with_model_file("2b-text-decoder.onnx")
}
pub fn moondream2_2b_text_encoder() -> Self {
Self::moondream2_0_5b_text_encoder()
.with_model_scale(crate::Scale::Billion(2.))
.with_model_file("2b-text-encoder.onnx")
}
pub fn moondream2_2b_coord_encoder() -> Self {
Self::moondream2_0_5b_coord_encoder()
.with_model_scale(crate::Scale::Billion(2.))
.with_model_file("2b-coord-encoder.onnx")
}
pub fn moondream2_2b_coord_decoder() -> Self {
Self::moondream2_0_5b_coord_decoder()
.with_model_scale(crate::Scale::Billion(2.))
.with_model_file("2b-coord-decoder.onnx")
}
pub fn moondream2_2b_size_encoder() -> Self {
Self::moondream2_0_5b_size_encoder()
.with_model_scale(crate::Scale::Billion(2.))
.with_model_file("2b-size-encoder.onnx")
}
pub fn moondream2_2b_size_decoder() -> Self {
Self::moondream2_0_5b_size_decoder()
.with_model_scale(crate::Scale::Billion(2.))
.with_model_file("2b-size-decoder.onnx")
} }
} }
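Where the old API required one `Options` value per component (eight in total, as the constructor change in the next file shows), a single preset now wires every component file at once. A minimal sketch using only names visible in this diff:

use usls::{models::Moondream2, Config};

fn main() -> anyhow::Result<()> {
    // One Config covers the vision encoder/projection, text encoder/decoder,
    // and the optional coord/size codecs.
    let config = Config::moondream2_0_5b().commit()?;
    let _model = Moondream2::new(config)?;
    Ok(())
}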

View File

@ -5,66 +5,57 @@ use ndarray::{s, Array, Array2, Array3, Axis, IxDyn};
use ndarray_npy::ReadNpyExt; use ndarray_npy::ReadNpyExt;
use crate::{ use crate::{
BaseModelTextual, DType, Engine, Hbb, Hub, Image, Keypoint, LogitsSampler, Options, Processor, Config, DType, Engine, Hbb, Hub, Image, Keypoint, LogitsSampler, Processor, Scale, Task, Xs, X,
Scale, Task, Ts, Xs, X, Y, Y,
}; };
#[derive(Builder, Debug)] #[derive(Builder, Debug)]
pub struct Moondream2 { pub struct Moondream2 {
vision_encoder: VisionEncoder, vision_encoder: Engine,
vision_projection: VisionProjection, vision_projection: Engine,
pub text_decoder: BaseModelTextual, text_decoder: Engine,
text_encoder: BaseModelTextual, text_encoder: Engine,
coord_decoder: Option<BaseModelTextual>, coord_decoder: Option<Engine>,
coord_encoder: Option<BaseModelTextual>, coord_encoder: Option<Engine>,
size_decoder: Option<BaseModelTextual>, size_decoder: Option<Engine>,
size_encoder: Option<BaseModelTextual>, size_encoder: Option<Engine>,
initial_kv_cache: X, // TODO: use f16 initial_kv_cache: X, // TODO: use f16
scale: Scale, scale: Scale,
dtype: DType, dtype: DType,
max_length: usize, max_length: usize,
eos_token_id: u32, eos_token_id: u32,
max_objects: usize, max_objects: usize,
num_patch: usize,
patch_size: usize,
processor: Processor,
seq_len: usize,
} }
impl Moondream2 { impl Moondream2 {
// TODO pub fn new(config: Config) -> Result<Self> {
#[allow(clippy::too_many_arguments)]
pub fn new(
options_vision_encoder: Options,
options_vision_projection: Options,
options_text_encoder: Options,
options_text_decoder: Options,
options_coord_encoder: Option<Options>,
options_coord_decoder: Option<Options>,
options_size_encoder: Option<Options>,
options_size_decoder: Option<Options>,
) -> Result<Self> {
let max_length = 2048; let max_length = 2048;
let max_objects = 50; let max_objects = 50;
let eos_token_id = 50256; let eos_token_id = 50256;
let dtype = options_vision_encoder.model_dtype; let dtype = config.visual_encoder.dtype;
let scale = options_vision_encoder let scale = config.scale.clone().unwrap_or(Scale::Billion(0.5));
.model_scale
.clone()
.unwrap_or(Scale::Billion(0.5));
let initial_kv_cache: X = KVCache::new(&scale, &dtype)?.0.into(); let initial_kv_cache: X = KVCache::new(&scale, &dtype)?.0.into();
let vision_encoder = VisionEncoder::new(options_vision_encoder)?; let vision_encoder = Engine::try_from_config(&config.visual_encoder)?;
let vision_projection = VisionProjection::new(options_vision_projection)?; let vision_projection = Engine::try_from_config(&config.visual_projection)?;
let text_decoder = BaseModelTextual::new(options_text_decoder)?; let text_decoder = Engine::try_from_config(&config.textual_decoder)?;
let text_encoder = BaseModelTextual::new(options_text_encoder)?; let text_encoder = Engine::try_from_config(&config.textual_encoder)?;
let coord_decoder = options_coord_decoder let coord_decoder = Engine::try_from_config(&config.coord_decoder).ok();
.map(BaseModelTextual::new) let coord_encoder = Engine::try_from_config(&config.coord_encoder).ok();
.transpose()?; let size_decoder = Engine::try_from_config(&config.size_decoder).ok();
let coord_encoder = options_coord_encoder let size_encoder = Engine::try_from_config(&config.size_encoder).ok();
.map(BaseModelTextual::new) let (num_patch, patch_size, _ts) = (
.transpose()?; vision_encoder.batch().opt(),
let size_decoder = options_size_decoder vision_encoder.try_height().unwrap_or(&378.into()).opt(),
.map(BaseModelTextual::new) vision_encoder.ts.clone(),
.transpose()?; );
let size_encoder = options_size_encoder let seq_len = vision_projection.inputs_minoptmax[0][1].opt();
.map(BaseModelTextual::new) let processor = Processor::try_from_config(&config.processor)?
.transpose()?; .with_image_width(patch_size as _)
.with_image_height(patch_size as _);
Ok(Self { Ok(Self {
vision_encoder, vision_encoder,
@ -81,12 +72,16 @@ impl Moondream2 {
eos_token_id, eos_token_id,
scale, scale,
dtype, dtype,
num_patch,
patch_size,
processor,
seq_len,
}) })
} }
pub fn encode_image(&mut self, x: &Image) -> Result<X> { pub fn encode_image(&mut self, x: &Image) -> Result<X> {
let patches_emb = self.vision_encoder.encode(x)?.clone().insert_axis(0)?; let patches_emb = self.encode(x)?.clone().insert_axis(0)?;
let image_embedding = self.vision_projection.inference(patches_emb.into())?[0].to_owned(); let image_embedding = self.vision_projection.run(patches_emb.into())?[0].to_owned();
Ok(image_embedding) Ok(image_embedding)
} }
@ -119,12 +114,7 @@ impl Moondream2 {
Task::Vqa(query) => { Task::Vqa(query) => {
let input_ids: Vec<_> = [198., 198., 24361., 25.] let input_ids: Vec<_> = [198., 198., 24361., 25.]
.iter() .iter()
.chain( .chain(&self.processor.encode_text_ids(query, false)?)
&self
.text_encoder
.processor()
.encode_text_ids(query, false)?,
)
.chain(&[198., 198., 33706., 25.]) .chain(&[198., 198., 33706., 25.])
.cloned() .cloned()
.collect(); .collect();
@ -139,8 +129,7 @@ impl Moondream2 {
.iter() .iter()
.chain( .chain(
&self &self
.text_encoder .processor
.processor()
.encode_text_ids(&format!(" {}", object), false)?, .encode_text_ids(&format!(" {}", object), false)?,
) )
.chain(&[628.]) .chain(&[628.])
@ -156,8 +145,7 @@ impl Moondream2 {
.iter() .iter()
.chain( .chain(
&self &self
.text_encoder .processor
.processor()
.encode_text_ids(&format!(" {}", object), false)?, .encode_text_ids(&format!(" {}", object), false)?,
) )
.chain(&[628.]) .chain(&[628.])
@ -174,10 +162,10 @@ impl Moondream2 {
fn generate_text(&mut self, input_ids: &[f32], kv_cache: Array<f32, IxDyn>) -> Result<String> { fn generate_text(&mut self, input_ids: &[f32], kv_cache: Array<f32, IxDyn>) -> Result<String> {
let input_ids = X::from(input_ids.to_vec()).insert_axis(0)?; let input_ids = X::from(input_ids.to_vec()).insert_axis(0)?;
let mut input_embeds = self.text_encoder.inference(Xs::from(input_ids))?[0].to_owned(); let mut input_embeds = self.text_encoder.run(Xs::from(input_ids))?[0].to_owned();
let logits_sampler = LogitsSampler::new(); let logits_sampler = LogitsSampler::new();
let mut token_ids: Vec<u32> = Vec::new(); let mut token_ids: Vec<u32> = Vec::new();
let mut pos = self.vision_projection.seq_len() + self.initial_kv_cache.shape()[4]; let mut pos = self.seq_len + self.initial_kv_cache.shape()[4];
let mut inc = input_embeds.shape()[1]; let mut inc = input_embeds.shape()[1];
let mut kv_cache = kv_cache.clone(); let mut kv_cache = kv_cache.clone();
@ -192,7 +180,7 @@ impl Moondream2 {
.into_dyn() .into_dyn()
.into(), .into(),
]); ]);
let decoder_outputs = self.text_decoder.inference(input)?; let decoder_outputs = self.text_decoder.run(input)?;
// update // update
let logits = &decoder_outputs["logits"]; let logits = &decoder_outputs["logits"];
@ -221,13 +209,10 @@ impl Moondream2 {
// encode // encode
let next_tokens = X::from(vec![token_id as f32]).insert_axis(1)?; let next_tokens = X::from(vec![token_id as f32]).insert_axis(1)?;
input_embeds = self.text_encoder.inference(Xs::from(next_tokens))?[0].to_owned(); input_embeds = self.text_encoder.run(Xs::from(next_tokens))?[0].to_owned();
} }
let text = self let text = self.processor.decode_tokens(&token_ids, true)?;
.text_encoder
.processor()
.decode_tokens(&token_ids, true)?;
Ok(text) Ok(text)
} }
@ -242,16 +227,16 @@ impl Moondream2 {
let mut y_bboxes: Vec<Hbb> = Vec::new(); let mut y_bboxes: Vec<Hbb> = Vec::new();
let mut y_kpts: Vec<Vec<Keypoint>> = Vec::new(); let mut y_kpts: Vec<Vec<Keypoint>> = Vec::new();
let (image_height, image_width) = ( let (image_height, image_width) = (
self.vision_encoder.processor.images_transform_info[0].height_src, self.processor.images_transform_info[0].height_src,
self.vision_encoder.processor.images_transform_info[0].width_src, self.processor.images_transform_info[0].width_src,
); );
let mut pos = self.vision_projection.seq_len() + self.initial_kv_cache.shape()[4]; let mut pos = self.seq_len + self.initial_kv_cache.shape()[4];
let logits_sampler = LogitsSampler::new(); let logits_sampler = LogitsSampler::new();
// initial input_embeds // initial input_embeds
let input_ids = X::from(input_ids.to_vec()).insert_axis(0)?; let input_ids = X::from(input_ids.to_vec()).insert_axis(0)?;
let mut hidden = self.text_encoder.inference(Xs::from(input_ids))?[0].to_owned(); let mut hidden = self.text_encoder.run(Xs::from(input_ids))?[0].to_owned();
let mut kv_cache = kv_cache; let mut kv_cache = kv_cache;
// generate // generate
@ -273,12 +258,7 @@ impl Moondream2 {
// cx // cx
let input: X = hidden.slice(s![0, -1, ..]).into_owned().into_dyn().into(); let input: X = hidden.slice(s![0, -1, ..]).into_owned().into_dyn().into();
let cx = self let cx = self.coord_decoder.as_mut().unwrap().run(Xs::from(input))?[0].clone(); // [1024]
.coord_decoder
.as_mut()
.unwrap()
.inference(Xs::from(input))?[0]
.clone(); // [1024]
let ratio = cx.shape()[0] as f32; let ratio = cx.shape()[0] as f32;
let cx = logits_sampler let cx = logits_sampler
.decode(cx.as_slice().context("Failed to get slice for `cx`")?)? .decode(cx.as_slice().context("Failed to get slice for `cx`")?)?
@ -288,7 +268,7 @@ impl Moondream2 {
.coord_encoder .coord_encoder
.as_mut() .as_mut()
.unwrap() .unwrap()
.inference(Xs::from(X::from(vec![cx])))?[0] .run(Xs::from(X::from(vec![cx])))?[0]
.clone() .clone()
.insert_axis(0)? .insert_axis(0)?
.insert_axis(0)?; .insert_axis(0)?;
@ -296,12 +276,7 @@ impl Moondream2 {
// cy // cy
let _logits = self.run_decoder(&mut hidden, &mut kv_cache, &mut pos)?; let _logits = self.run_decoder(&mut hidden, &mut kv_cache, &mut pos)?;
let input: X = hidden.slice(s![0, -1, ..]).into_owned().into_dyn().into(); let input: X = hidden.slice(s![0, -1, ..]).into_owned().into_dyn().into();
let cy = self let cy = self.coord_decoder.as_mut().unwrap().run(Xs::from(input))?[0].clone();
.coord_decoder
.as_mut()
.unwrap()
.inference(Xs::from(input))?[0]
.clone();
let ratio = cy.shape()[0] as f32; let ratio = cy.shape()[0] as f32;
let cy = logits_sampler let cy = logits_sampler
@ -313,7 +288,7 @@ impl Moondream2 {
.coord_encoder .coord_encoder
.as_mut() .as_mut()
.unwrap() .unwrap()
.inference(Xs::from(X::from(vec![cy])))?[0] .run(Xs::from(X::from(vec![cy])))?[0]
.clone() .clone()
.insert_axis(0)? .insert_axis(0)?
.insert_axis(0)?; .insert_axis(0)?;
@ -324,6 +299,7 @@ impl Moondream2 {
cy * image_height as f32, cy * image_height as f32,
)) ))
.with_id(0) .with_id(0)
.with_confidence(1.)
.with_name(object)]); .with_name(object)]);
// keep? // keep?
@ -334,12 +310,7 @@ impl Moondream2 {
// wh // wh
let _logits = self.run_decoder(&mut hidden, &mut kv_cache, &mut pos)?; let _logits = self.run_decoder(&mut hidden, &mut kv_cache, &mut pos)?;
let input: X = hidden.slice(s![0, -1, ..]).into_owned().into_dyn().into(); let input: X = hidden.slice(s![0, -1, ..]).into_owned().into_dyn().into();
let size = self let size = self.size_decoder.as_mut().unwrap().run(Xs::from(input))?[0].clone(); // [2, 1024]
.size_decoder
.as_mut()
.unwrap()
.inference(Xs::from(input))?[0]
.clone(); // [2, 1024]
let ratio = size.shape()[1] as f32; let ratio = size.shape()[1] as f32;
let w = logits_sampler.decode( let w = logits_sampler.decode(
@ -361,7 +332,7 @@ impl Moondream2 {
.size_encoder .size_encoder
.as_mut() .as_mut()
.unwrap() .unwrap()
.inference(Xs::from(X::from(vec![w, h])))?[0] .run(Xs::from(X::from(vec![w, h])))?[0]
.clone() .clone()
.insert_axis(0)? .insert_axis(0)?
.insert_axis(0)?; // [1024] .insert_axis(0)?; // [1024]
@ -392,7 +363,7 @@ impl Moondream2 {
} }
fn prepare_kv_cache(&mut self, image_embedding: &X) -> Result<Array<f32, IxDyn>> { fn prepare_kv_cache(&mut self, image_embedding: &X) -> Result<Array<f32, IxDyn>> {
let kv_cache_new = self.text_decoder.inference(Xs::from(vec![ let kv_cache_new = self.text_decoder.run(Xs::from(vec![
image_embedding.clone(), image_embedding.clone(),
self.initial_kv_cache.clone(), self.initial_kv_cache.clone(),
]))?["new_kv_cache"] ]))?["new_kv_cache"]
@ -421,7 +392,7 @@ impl Moondream2 {
kv_cache: &mut Array<f32, IxDyn>, kv_cache: &mut Array<f32, IxDyn>,
pos: &mut usize, pos: &mut usize,
) -> Result<X> { ) -> Result<X> {
let decoder_outputs = self.text_decoder.inference(Xs::from(vec![ let decoder_outputs = self.text_decoder.run(Xs::from(vec![
input_embeds.clone(), input_embeds.clone(),
kv_cache kv_cache
.slice(s![.., .., .., .., ..*pos, ..]) .slice(s![.., .., .., .., ..*pos, ..])
@ -442,38 +413,6 @@ impl Moondream2 {
Ok(decoder_outputs["logits"].to_owned()) Ok(decoder_outputs["logits"].to_owned())
} }
}
#[derive(Debug, Builder)]
pub struct VisionEncoder {
engine: Engine,
num_patch: usize,
patch_size: usize,
processor: Processor,
ts: Ts,
}
impl VisionEncoder {
pub fn new(options: Options) -> Result<Self> {
let engine = options.to_engine()?;
let (num_patch, patch_size, ts) = (
engine.batch().opt(),
engine.try_height().unwrap_or(&378.into()).opt(),
engine.ts.clone(),
);
let processor = options
.to_processor()?
.with_image_width(patch_size as _)
.with_image_height(patch_size as _);
Ok(Self {
engine,
patch_size,
num_patch,
processor,
ts,
})
}
fn create_patches(image: &Image, image_patch_size: usize) -> (Vec<Image>, (u32, u32)) { fn create_patches(image: &Image, image_patch_size: usize) -> (Vec<Image>, (u32, u32)) {
let mut patches = vec![image.clone()]; let mut patches = vec![image.clone()];
@ -515,10 +454,6 @@ impl VisionEncoder {
(patches, selected_template) (patches, selected_template)
} }
pub fn inference(&mut self, xs: Xs) -> Result<Xs> {
self.engine.run(xs)
}
pub fn encode(&mut self, x: &Image) -> Result<X> { pub fn encode(&mut self, x: &Image) -> Result<X> {
let (patches, selected_template) = Self::create_patches(x, self.patch_size); let (patches, selected_template) = Self::create_patches(x, self.patch_size);
let patches = self.processor.process_images(&patches)?; let patches = self.processor.process_images(&patches)?;
@ -526,7 +461,7 @@ impl VisionEncoder {
(selected_template.0 as usize), (selected_template.0 as usize),
(selected_template.1 as usize), (selected_template.1 as usize),
); );
let patch_emb = self.inference(patches.clone().into())?[0].clone(); let patch_emb = self.vision_encoder.run(patches.clone().into())?[0].clone();
let patch_emb = patch_emb.clone().0.into_dimensionality::<ndarray::Ix3>()?; let patch_emb = patch_emb.clone().0.into_dimensionality::<ndarray::Ix3>()?;
let patch_emb = Self::process_patch_emb(patch_emb, template)?; let patch_emb = Self::process_patch_emb(patch_emb, template)?;
let patch_emb = X::from(patch_emb.into_dyn()); // TODO .insert_axis(x), let patch_emb = X::from(patch_emb.into_dyn()); // TODO .insert_axis(x),
@ -608,30 +543,6 @@ impl VisionEncoder {
} }
} }
#[derive(Debug, Builder)]
pub struct VisionProjection {
engine: Engine,
seq_len: usize,
ts: Ts,
}
impl VisionProjection {
pub fn new(options: Options) -> Result<Self> {
let engine = options.to_engine()?;
let (seq_len, ts) = (engine.inputs_minoptmax[0][1].opt(), engine.ts.clone());
Ok(Self {
engine,
seq_len,
ts,
})
}
pub fn inference(&mut self, xs: Xs) -> Result<Xs> {
self.engine.run(xs)
}
}
#[derive(Builder, Debug)] #[derive(Builder, Debug)]
struct KVCache(pub Array<f32, IxDyn>); struct KVCache(pub Array<f32, IxDyn>);
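One orientation note on the generation loop: only the first `pos` positions of the six-dimensional KV cache are fed to the decoder at each step, and `pos` then advances by the number of embeddings consumed. A standalone sketch of that slice-and-advance bookkeeping, with illustrative shapes and the cache splice-back elided:

use ndarray::{s, Array, IxDyn};

fn cache_step(kv_cache: &Array<f32, IxDyn>, pos: &mut usize, n_new: usize) {
    // Feed only the valid prefix of the cache...
    let _valid = kv_cache.slice(s![.., .., .., .., ..*pos, ..]);
    // ...run the text decoder on it plus `n_new` fresh embeddings,
    // write its `new_kv_cache` output back, then advance:
    *pos += n_new;
}

fn main() {
    let cache = Array::<f32, _>::zeros(IxDyn(&[1, 2, 2, 4, 16, 8])); // toy shape
    let mut pos = 4;
    cache_step(&cache, &mut pos, 1);
    assert_eq!(pos, 5);
}

Worth noting as a design choice: the optional coord/size engines are created with `Engine::try_from_config(...).ok()`, so a genuinely misconfigured decoder path degrades silently to `None` instead of surfacing an error at construction.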

View File

@ -1,11 +1,10 @@
/// Model configuration for `OWLv2` /// Model configuration for `OWLv2`
impl crate::Options { impl crate::Config {
pub fn owlv2() -> Self { pub fn owlv2() -> Self {
Self::default() Self::default()
.with_model_name("owlv2") .with_name("owlv2")
.with_model_kind(crate::Kind::VisionLanguage)
// 1st & 3rd: text // 1st & 3rd: text
.with_model_ixx(0, 0, (1, 1, 1).into()) // TODO .with_model_ixx(0, 0, (1, 1, 1).into())
.with_model_ixx(0, 1, 1.into()) .with_model_ixx(0, 1, 1.into())
.with_model_ixx(2, 0, (1, 1, 1).into()) .with_model_ixx(2, 0, (1, 1, 1).into())
.with_model_ixx(2, 1, 1.into()) .with_model_ixx(2, 1, 1.into())
@ -21,6 +20,7 @@ impl crate::Options {
.with_normalize(true) .with_normalize(true)
.with_class_confs(&[0.1]) .with_class_confs(&[0.1])
.with_model_num_dry_run(0) .with_model_num_dry_run(0)
.with_tokenizer_file("owlv2/tokenizer.json")
} }
pub fn owlv2_base() -> Self { pub fn owlv2_base() -> Self {

View File

@ -3,7 +3,7 @@ use anyhow::Result;
use ndarray::{s, Axis}; use ndarray::{s, Axis};
use rayon::prelude::*; use rayon::prelude::*;
use crate::{elapsed, DynConf, Engine, Hbb, Image, Options, Processor, Ts, Xs, X, Y}; use crate::{elapsed, Config, DynConf, Engine, Hbb, Image, Processor, Ts, Xs, X, Y};
#[derive(Debug, Builder)] #[derive(Debug, Builder)]
pub struct OWLv2 { pub struct OWLv2 {
@ -22,8 +22,8 @@ pub struct OWLv2 {
} }
impl OWLv2 { impl OWLv2 {
pub fn new(options: Options) -> Result<Self> { pub fn new(config: Config) -> Result<Self> {
let engine = options.to_engine()?; let engine = Engine::try_from_config(&config.model)?;
let (batch, height, width, ts) = ( let (batch, height, width, ts) = (
engine.batch().opt(), engine.batch().opt(),
engine.try_height().unwrap_or(&960.into()).opt(), engine.try_height().unwrap_or(&960.into()).opt(),
@ -31,20 +31,19 @@ impl OWLv2 {
engine.ts.clone(), engine.ts.clone(),
); );
let spec = engine.spec().to_owned(); let spec = engine.spec().to_owned();
let processor = options let names: Vec<String> = config.text_names().to_vec();
.to_processor()? if names.is_empty() {
.with_image_width(width as _) anyhow::bail!(
.with_image_height(height as _); "No valid class names were provided in the config. Ensure the 'text_names' field is non-empty and contains valid class names."
let names: Vec<String> = options );
.class_names() }
.expect("No class names specified.")
.iter()
.map(|x| x.to_string())
.collect();
let names_with_prompt: Vec<String> = let names_with_prompt: Vec<String> =
names.iter().map(|x| format!("a photo of {}", x)).collect(); names.iter().map(|x| format!("a photo of {}", x)).collect();
let n = names.len(); let n = names.len();
let confs = DynConf::new(options.class_confs(), n); let confs = DynConf::new(config.class_confs(), n);
let processor = Processor::try_from_config(&config.processor)?
.with_image_width(width as _)
.with_image_height(height as _);
let input_ids: Vec<f32> = processor let input_ids: Vec<f32> = processor
.encode_texts_ids( .encode_texts_ids(
&names_with_prompt &names_with_prompt
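Since the constructor now bails instead of panicking when no class names are configured, callers must populate the text names up front. A sketch — `with_text_names` is an assumed setter mirroring the `text_names()` accessor used above:

use usls::{models::OWLv2, Config};

fn main() -> anyhow::Result<()> {
    let config = Config::owlv2_base()
        .with_text_names(&["cat", "remote control"]) // assumed setter name
        .commit()?;
    let _model = OWLv2::new(config)?;
    Ok(())
}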

View File

@ -4,11 +4,11 @@ use crate::{
}; };
/// Model configuration for `PicoDet` /// Model configuration for `PicoDet`
impl crate::Options { impl crate::Config {
pub fn picodet() -> Self { pub fn picodet() -> Self {
Self::default() Self::default()
.with_model_name("picodet") .with_name("picodet")
.with_batch_size(1) // TODO: ONNX model's batch size seems to always be 1 .with_batch_size_all(1) // TODO: ONNX model's batch size seems to always be 1
.with_model_ixx(0, 2, 640.into()) .with_model_ixx(0, 2, 640.into())
.with_model_ixx(0, 3, 640.into()) .with_model_ixx(0, 3, 640.into())
.with_model_ixx(1, 0, (1, 1, 8).into()) .with_model_ixx(1, 0, (1, 1, 8).into())

View File

@ -3,7 +3,7 @@ use anyhow::Result;
use ndarray::Axis; use ndarray::Axis;
use rayon::prelude::*; use rayon::prelude::*;
use crate::{elapsed, DynConf, Engine, Hbb, Image, Options, Processor, Ts, Xs, X, Y}; use crate::{elapsed, Config, DynConf, Engine, Hbb, Image, Processor, Ts, Xs, X, Y};
#[derive(Debug, Builder)] #[derive(Debug, Builder)]
pub struct PicoDet { pub struct PicoDet {
@ -19,8 +19,8 @@ pub struct PicoDet {
} }
impl PicoDet { impl PicoDet {
pub fn new(options: Options) -> Result<Self> { pub fn new(config: Config) -> Result<Self> {
let engine = options.to_engine()?; let engine = Engine::try_from_config(&config.model)?;
let (batch, height, width, ts) = ( let (batch, height, width, ts) = (
engine.batch().opt(), engine.batch().opt(),
engine.try_height().unwrap_or(&640.into()).opt(), engine.try_height().unwrap_or(&640.into()).opt(),
@ -28,15 +28,11 @@ impl PicoDet {
engine.ts.clone(), engine.ts.clone(),
); );
let spec = engine.spec().to_owned(); let spec = engine.spec().to_owned();
let processor = options let names: Vec<String> = config.class_names().to_vec();
.to_processor()? let confs = DynConf::new(config.class_confs(), names.len());
let processor = Processor::try_from_config(&config.processor)?
.with_image_width(width as _) .with_image_width(width as _)
.with_image_height(height as _); .with_image_height(height as _);
let names = options
.class_names()
.expect("No class names are specified.")
.to_vec();
let confs = DynConf::new(options.class_confs(), names.len());
Ok(Self { Ok(Self {
engine, engine,
@ -95,14 +91,15 @@ impl PicoDet {
return None; return None;
} }
let (x1, y1, x2, y2) = (pred[2], pred[3], pred[4], pred[5]); let (x1, y1, x2, y2) = (pred[2], pred[3], pred[4], pred[5]);
let mut hbb = Hbb::default()
.with_xyxy(x1.max(0.0f32), y1.max(0.0f32), x2, y2)
.with_confidence(confidence)
.with_id(class_id);
if !self.names.is_empty() {
hbb = hbb.with_name(&self.names[class_id]);
}
Some( Some(hbb)
Hbb::default()
.with_xyxy(x1.max(0.0f32), y1.max(0.0f32), x2, y2)
.with_confidence(confidence)
.with_id(class_id)
.with_name(&self.names[class_id]),
)
}) })
.collect(); .collect();

View File

@ -2,8 +2,7 @@ use aksr::Builder;
use anyhow::Result; use anyhow::Result;
use crate::{ use crate::{
elapsed, DType, Device, Engine, Image, Kind, Options, Processor, Scale, Task, Ts, Version, Xs, elapsed, Config, DType, Device, Engine, Image, Processor, Scale, Task, Ts, Version, Xs, X,
X,
}; };
#[derive(Debug, Builder)] #[derive(Debug, Builder)]
@ -20,7 +19,6 @@ pub struct BaseModelVisual {
dtype: DType, dtype: DType,
task: Option<Task>, task: Option<Task>,
scale: Option<Scale>, scale: Option<Scale>,
kind: Option<Kind>,
version: Option<Version>, version: Option<Version>,
} }
@ -29,8 +27,8 @@ impl BaseModelVisual {
self.ts.summary(); self.ts.summary();
} }
pub fn new(options: Options) -> Result<Self> { pub fn new(config: Config) -> Result<Self> {
let engine = options.to_engine()?; let engine = Engine::try_from_config(&config.model)?;
let err_msg = "You need to specify the image height and image width for the visual model.";
let (batch, height, width, ts, spec) = ( let (batch, height, width, ts, spec) = (
engine.batch().opt(), engine.batch().opt(),
@ -39,18 +37,15 @@ impl BaseModelVisual {
engine.ts.clone(), engine.ts.clone(),
engine.spec().to_owned(), engine.spec().to_owned(),
); );
let processor = options let processor = Processor::try_from_config(&config.processor)?
.to_processor()?
.with_image_width(width as _) .with_image_width(width as _)
.with_image_height(height as _); .with_image_height(height as _);
let device = config.model.device;
let device = options.model_device; let task = config.task;
let task = options.model_task; let scale = config.scale;
let scale = options.model_scale; let dtype = config.model.dtype;
let dtype = options.model_dtype; let name = config.name;
let kind = options.model_kind; let version = config.version;
let name = options.model_name;
let version = options.model_version;
Ok(Self { Ok(Self {
engine, engine,
@ -63,7 +58,6 @@ impl BaseModelVisual {
dtype, dtype,
task, task,
scale, scale,
kind,
device, device,
version, version,
name, name,
@ -101,7 +95,6 @@ pub struct BaseModelTextual {
dtype: DType, dtype: DType,
task: Option<Task>, task: Option<Task>,
scale: Option<Scale>, scale: Option<Scale>,
kind: Option<Kind>,
version: Option<Version>, version: Option<Version>,
} }
@ -110,21 +103,20 @@ impl BaseModelTextual {
self.ts.summary(); self.ts.summary();
} }
pub fn new(options: Options) -> Result<Self> { pub fn new(config: Config) -> Result<Self> {
let engine = options.to_engine()?; let engine = Engine::try_from_config(&config.model)?;
let (batch, ts, spec) = ( let (batch, ts, spec) = (
engine.batch().opt(), engine.batch().opt(),
engine.ts.clone(), engine.ts.clone(),
engine.spec().to_owned(), engine.spec().to_owned(),
); );
let processor = options.to_processor()?; let processor = Processor::try_from_config(&config.processor)?;
let device = options.model_device; let device = config.model.device;
let task = options.model_task; let dtype = config.model.dtype;
let scale = options.model_scale; let task = config.task;
let dtype = options.model_dtype; let scale = config.scale;
let kind = options.model_kind; let name = config.name;
let name = options.model_name; let version = config.version;
let version = options.model_version;
Ok(Self { Ok(Self {
engine, engine,
@ -135,7 +127,6 @@ impl BaseModelTextual {
dtype, dtype,
task, task,
scale, scale,
kind,
device, device,
version, version,
name, name,

View File

@ -3,7 +3,7 @@ use anyhow::Result;
use ndarray::Axis; use ndarray::Axis;
use rayon::prelude::*; use rayon::prelude::*;
use crate::{elapsed, DynConf, Engine, Image, Options, Prob, Processor, Ts, Xs, Y}; use crate::{elapsed, Config, Engine, Image, Prob, Processor, Ts, Xs, Y};
#[derive(Debug, Builder)] #[derive(Debug, Builder)]
pub struct ImageClassifier { pub struct ImageClassifier {
@ -12,19 +12,24 @@ pub struct ImageClassifier {
width: usize, width: usize,
batch: usize, batch: usize,
apply_softmax: bool, apply_softmax: bool,
ts: Ts,
processor: Processor, processor: Processor,
confs: DynConf,
nc: usize,
names: Vec<String>, names: Vec<String>,
spec: String, spec: String,
topk: usize,
ts: Ts,
} }
impl TryFrom<Options> for ImageClassifier { impl TryFrom<Config> for ImageClassifier {
type Error = anyhow::Error; type Error = anyhow::Error;
fn try_from(options: Options) -> Result<Self, Self::Error> { fn try_from(config: Config) -> Result<Self, Self::Error> {
let engine = options.to_engine()?; Self::new(config)
}
}
impl ImageClassifier {
pub fn new(config: Config) -> Result<Self> {
let engine = Engine::try_from_config(&config.model)?;
let spec = engine.spec().to_string(); let spec = engine.spec().to_string();
let (batch, height, width, ts) = ( let (batch, height, width, ts) = (
engine.batch().opt(), engine.batch().opt(),
@ -32,50 +37,27 @@ impl TryFrom<Options> for ImageClassifier {
engine.try_width().unwrap_or(&224.into()).opt(), engine.try_width().unwrap_or(&224.into()).opt(),
engine.ts().clone(), engine.ts().clone(),
); );
let processor = options let names = config.class_names.to_vec();
.to_processor()? let apply_softmax = config.apply_softmax.unwrap_or_default();
let topk = config.topk.unwrap_or(5);
let processor = Processor::try_from_config(&config.processor)?
.with_image_width(width as _) .with_image_width(width as _)
.with_image_height(height as _); .with_image_height(height as _);
let (nc, names) = match (options.nc(), options.class_names()) {
(Some(nc), Some(names)) => {
if nc != names.len() {
anyhow::bail!(
"The length of the input class names: {} is inconsistent with the number of classes: {}.",
names.len(),
nc
);
}
(nc, names.to_vec())
}
(Some(nc), None) => (
nc,
(0..nc).map(|x| format!("# {}", x)).collect::<Vec<String>>(),
),
(None, Some(names)) => (names.len(), names.to_vec()),
(None, None) => {
anyhow::bail!("Neither class names nor class numbers were specified.");
}
};
let confs = DynConf::new(options.class_confs(), nc);
let apply_softmax = options.apply_softmax.unwrap_or_default();
Ok(Self { Ok(Self {
engine, engine,
height, height,
width, width,
batch, batch,
nc,
ts, ts,
spec, spec,
processor, processor,
confs,
names, names,
apply_softmax, apply_softmax,
topk,
}) })
} }
}
impl ImageClassifier {
pub fn summary(&mut self) { pub fn summary(&mut self) {
self.ts.summary(); self.ts.summary();
} }
@ -113,7 +95,7 @@ impl ImageClassifier {
let probs = Prob::new_probs( let probs = Prob::new_probs(
&logits.into_raw_vec_and_offset().0, &logits.into_raw_vec_and_offset().0,
Some(&self.names.iter().map(|x| x.as_str()).collect::<Vec<_>>()), Some(&self.names.iter().map(|x| x.as_str()).collect::<Vec<_>>()),
3, self.topk,
); );
Some(Y::default().with_probs(&probs)) Some(Y::default().with_probs(&probs))
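`TryFrom<Config>` is kept as a thin wrapper over the new `new`, and the previously hard-coded top-3 becomes a configurable `topk` defaulting to 5. A sketch — the preset name and the `with_topk` / `with_apply_softmax` setters are assumptions mirroring the fields read above:

use usls::{models::ImageClassifier, Config};

fn main() -> anyhow::Result<()> {
    let config = Config::beit_base()  // hypothetical preset
        .with_topk(10)                // assumed setter for `config.topk`
        .with_apply_softmax(true)     // assumed setter for `config.apply_softmax`
        .commit()?;
    let _model = ImageClassifier::try_from(config)?;
    Ok(())
}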

View File

@ -1,18 +1,17 @@
use crate::NAMES_COCO_91; use crate::NAMES_COCO_91;
/// Model configuration for `RF-DETR` /// Model configuration for `RF-DETR`
impl crate::Options { impl crate::Config {
pub fn rfdetr() -> Self { pub fn rfdetr() -> Self {
Self::default() Self::default()
.with_model_name("rfdetr") .with_name("rfdetr")
.with_batch_size(1) .with_model_ixx(0, 0, 1.into())
.with_model_ixx(0, 1, 3.into())
.with_model_ixx(0, 2, 560.into()) .with_model_ixx(0, 2, 560.into())
.with_model_ixx(0, 3, 560.into()) .with_model_ixx(0, 3, 560.into())
.with_resize_mode(crate::ResizeMode::FitAdaptive) .with_resize_mode(crate::ResizeMode::FitAdaptive)
.with_normalize(true)
.with_image_mean(&[0.485, 0.456, 0.406]) .with_image_mean(&[0.485, 0.456, 0.406])
.with_image_std(&[0.229, 0.224, 0.225]) .with_image_std(&[0.229, 0.224, 0.225])
.with_class_confs(&[0.25])
.with_class_names(&NAMES_COCO_91) .with_class_names(&NAMES_COCO_91)
} }

View File

@ -3,7 +3,7 @@ use anyhow::Result;
use ndarray::{s, Axis}; use ndarray::{s, Axis};
use rayon::prelude::*; use rayon::prelude::*;
use crate::{elapsed, DynConf, Engine, Hbb, Image, Options, Processor, Ts, Xs, Y}; use crate::{elapsed, Config, DynConf, Engine, Hbb, Image, Processor, Ts, Xs, Y};
#[derive(Debug, Builder)] #[derive(Debug, Builder)]
pub struct RFDETR { pub struct RFDETR {
@ -19,8 +19,8 @@ pub struct RFDETR {
} }
impl RFDETR { impl RFDETR {
pub fn new(options: Options) -> Result<Self> { pub fn new(config: Config) -> Result<Self> {
let engine = options.to_engine()?; let engine = Engine::try_from_config(&config.model)?;
let (batch, height, width, ts) = ( let (batch, height, width, ts) = (
engine.batch().opt(), engine.batch().opt(),
engine.try_height().unwrap_or(&560.into()).opt(), engine.try_height().unwrap_or(&560.into()).opt(),
@ -28,16 +28,11 @@ impl RFDETR {
engine.ts.clone(), engine.ts.clone(),
); );
let spec = engine.spec().to_owned(); let spec = engine.spec().to_owned();
let processor = options let names: Vec<String> = config.class_names().to_vec();
.to_processor()? let confs = DynConf::new(config.class_confs(), names.len());
let processor = Processor::try_from_config(&config.processor)?
.with_image_width(width as _) .with_image_width(width as _)
.with_image_height(height as _); .with_image_height(height as _);
let names = options
.class_names()
.expect("No class names specified.")
.to_vec();
let confs = DynConf::new(options.class_confs(), names.len());
Ok(Self { Ok(Self {
engine, engine,
height, height,
@ -107,14 +102,15 @@ impl RFDETR {
let y = cy - h / 2.; let y = cy - h / 2.;
let x = x.max(0.0).min(image_width as _); let x = x.max(0.0).min(image_width as _);
let y = y.max(0.0).min(image_height as _); let y = y.max(0.0).min(image_height as _);
let mut hbb = Hbb::default()
.with_xywh(x, y, w, h)
.with_confidence(conf)
.with_id(class_id as _);
if !self.names.is_empty() {
hbb = hbb.with_name(&self.names[class_id]);
}
Some( Some(hbb)
Hbb::default()
.with_xywh(x, y, w, h)
.with_confidence(conf)
.with_id(class_id as _)
.with_name(&self.names[class_id]),
)
}) })
.collect(); .collect();

View File

@ -1,9 +1,10 @@
/// Model configuration for `RMBG` /// Model configuration for `RMBG`
impl crate::Options { impl crate::Config {
pub fn rmbg() -> Self { pub fn rmbg() -> Self {
Self::default() Self::default()
.with_model_name("rmbg") .with_name("rmbg")
.with_model_ixx(0, 0, 1.into()) .with_model_ixx(0, 0, 1.into())
.with_model_ixx(0, 1, 3.into())
.with_model_ixx(0, 2, 1024.into()) .with_model_ixx(0, 2, 1024.into())
.with_model_ixx(0, 3, 1024.into()) .with_model_ixx(0, 3, 1024.into())
} }
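The channel dimension is now pinned explicitly. Reading the `ixx` helpers as (input index, dimension index, dimension spec) — a fixed size or a `(min, opt, max)` triple for dynamic axes, as the presets across this commit suggest — this preset pins input 0 to a 1x3x1024x1024 NCHW tensor:

use usls::Config;

fn main() {
    // (input, dim, spec): for input 0, dims 0..=3 are N, C, H, W.
    let _config = Config::default()
        .with_name("rmbg")
        .with_model_ixx(0, 0, 1.into())     // batch: fixed at 1
        .with_model_ixx(0, 1, 3.into())     // channels: the newly added line
        .with_model_ixx(0, 2, 1024.into())  // height
        .with_model_ixx(0, 3, 1024.into()); // width
}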

View File

@ -1,7 +1,7 @@
use aksr::Builder; use aksr::Builder;
use anyhow::Result; use anyhow::Result;
use crate::{elapsed, Engine, Image, Mask, Ops, Options, Processor, Ts, Xs, Y}; use crate::{elapsed, Config, Engine, Image, Mask, Ops, Processor, Ts, Xs, Y};
#[derive(Builder, Debug)] #[derive(Builder, Debug)]
pub struct RMBG { pub struct RMBG {
@ -15,8 +15,8 @@ pub struct RMBG {
} }
impl RMBG { impl RMBG {
pub fn new(options: Options) -> Result<Self> { pub fn new(config: Config) -> Result<Self> {
let engine = options.to_engine()?; let engine = Engine::try_from_config(&config.model)?;
let spec = engine.spec().to_string(); let spec = engine.spec().to_string();
let (batch, height, width, ts) = ( let (batch, height, width, ts) = (
engine.batch().opt(), engine.batch().opt(),
@ -24,8 +24,7 @@ impl RMBG {
engine.try_width().unwrap_or(&1024.into()).opt(), engine.try_width().unwrap_or(&1024.into()).opt(),
engine.ts().clone(), engine.ts().clone(),
); );
let processor = options let processor = Processor::try_from_config(&config.processor)?
.to_processor()?
.with_image_width(width as _) .with_image_width(width as _)
.with_image_height(height as _); .with_image_height(height as _);
@ -63,7 +62,6 @@ impl RMBG {
fn postprocess(&mut self, xs: Xs) -> Result<Vec<Y>> { fn postprocess(&mut self, xs: Xs) -> Result<Vec<Y>> {
let mut ys: Vec<Y> = Vec::new(); let mut ys: Vec<Y> = Vec::new();
for (idx, luma) in xs[0].axis_iter(ndarray::Axis(0)).enumerate() { for (idx, luma) in xs[0].axis_iter(ndarray::Axis(0)).enumerate() {
// image size
let (h1, w1) = ( let (h1, w1) = (
self.processor.images_transform_info[idx].height_src, self.processor.images_transform_info[idx].height_src,
self.processor.images_transform_info[idx].width_src, self.processor.images_transform_info[idx].width_src,

View File

@ -1,15 +1,15 @@
use crate::NAMES_COCO_80; use crate::NAMES_COCO_80;
/// Model configuration for `RT-DETR` /// Model configuration for `RT-DETR`
impl crate::Options { impl crate::Config {
pub fn rtdetr() -> Self { pub fn rtdetr() -> Self {
Self::default() Self::default()
.with_model_name("rtdetr") .with_name("rtdetr")
.with_batch_size(1) .with_model_ixx(0, 0, 1.into())
.with_model_ixx(0, 1, 3.into())
.with_model_ixx(0, 2, 640.into()) .with_model_ixx(0, 2, 640.into())
.with_model_ixx(0, 3, 640.into()) .with_model_ixx(0, 3, 640.into())
.with_resize_mode(crate::ResizeMode::FitAdaptive) .with_resize_mode(crate::ResizeMode::FitAdaptive)
.with_normalize(true)
.with_class_confs(&[0.5]) .with_class_confs(&[0.5])
.with_class_names(&NAMES_COCO_80) .with_class_names(&NAMES_COCO_80)
} }

View File

@ -3,7 +3,7 @@ use anyhow::Result;
use ndarray::{s, Axis}; use ndarray::{s, Axis};
use rayon::prelude::*; use rayon::prelude::*;
use crate::{elapsed, DynConf, Engine, Hbb, Image, Options, Processor, Ts, Xs, X, Y}; use crate::{elapsed, Config, DynConf, Engine, Hbb, Image, Processor, Ts, Xs, X, Y};
#[derive(Debug, Builder)] #[derive(Debug, Builder)]
pub struct RTDETR { pub struct RTDETR {
@ -19,8 +19,8 @@ pub struct RTDETR {
} }
impl RTDETR { impl RTDETR {
pub fn new(options: Options) -> Result<Self> { pub fn new(config: Config) -> Result<Self> {
let engine = options.to_engine()?; let engine = Engine::try_from_config(&config.model)?;
let (batch, height, width, ts) = ( let (batch, height, width, ts) = (
engine.batch().opt(), engine.batch().opt(),
engine.try_height().unwrap_or(&640.into()).opt(), engine.try_height().unwrap_or(&640.into()).opt(),
@ -28,15 +28,11 @@ impl RTDETR {
engine.ts.clone(), engine.ts.clone(),
); );
let spec = engine.spec().to_owned(); let spec = engine.spec().to_owned();
let processor = options let names: Vec<String> = config.class_names().to_vec();
.to_processor()? let confs = DynConf::new(config.class_confs(), names.len());
let processor = Processor::try_from_config(&config.processor)?
.with_image_width(width as _) .with_image_width(width as _)
.with_image_height(height as _); .with_image_height(height as _);
let names = options
.class_names()
.expect("No class names specified.")
.to_vec();
let confs = DynConf::new(options.class_confs(), names.len());
Ok(Self { Ok(Self {
engine, engine,
@ -87,14 +83,12 @@ impl RTDETR {
.enumerate() .enumerate()
.filter_map(|(idx, ((labels, boxes), scores))| { .filter_map(|(idx, ((labels, boxes), scores))| {
let ratio = self.processor.images_transform_info[idx].height_scale; let ratio = self.processor.images_transform_info[idx].height_scale;
let mut y_bboxes = Vec::new(); let mut y_bboxes = Vec::new();
for (i, &score) in scores.iter().enumerate() { for (i, &score) in scores.iter().enumerate() {
let class_id = labels[i] as usize; let class_id = labels[i] as usize;
if score < self.confs[class_id] { if score < self.confs[class_id] {
continue; continue;
} }
let xyxy = boxes.slice(s![i, ..]); let xyxy = boxes.slice(s![i, ..]);
let (x1, y1, x2, y2) = ( let (x1, y1, x2, y2) = (
xyxy[0] / ratio, xyxy[0] / ratio,
@ -102,14 +96,14 @@ impl RTDETR {
xyxy[2] / ratio, xyxy[2] / ratio,
xyxy[3] / ratio, xyxy[3] / ratio,
); );
let mut hbb = Hbb::default()
y_bboxes.push( .with_xyxy(x1.max(0.0f32), y1.max(0.0f32), x2, y2)
Hbb::default() .with_confidence(score)
.with_xyxy(x1.max(0.0f32), y1.max(0.0f32), x2, y2) .with_id(class_id);
.with_confidence(score) if !self.names.is_empty() {
.with_id(class_id) hbb = hbb.with_name(&self.names[class_id]);
.with_name(&self.names[class_id]), }
); y_bboxes.push(hbb);
} }
let mut y = Y::default(); let mut y = Y::default();

View File

@ -1,9 +1,10 @@
/// Model configuration for `RTMO` /// Model configuration for `RTMO`
impl crate::Options { impl crate::Config {
pub fn rtmo() -> Self { pub fn rtmo() -> Self {
Self::default() Self::default()
.with_model_name("rtmo") .with_name("rtmo")
.with_model_ixx(0, 0, 1.into()) .with_model_ixx(0, 0, 1.into())
.with_model_ixx(0, 1, 3.into())
.with_model_ixx(0, 2, 640.into()) .with_model_ixx(0, 2, 640.into())
.with_model_ixx(0, 3, 640.into()) .with_model_ixx(0, 3, 640.into())
.with_resize_mode(crate::ResizeMode::FitAdaptive) .with_resize_mode(crate::ResizeMode::FitAdaptive)

View File

@ -2,7 +2,7 @@ use aksr::Builder;
use anyhow::Result; use anyhow::Result;
use ndarray::Axis; use ndarray::Axis;
use crate::{elapsed, DynConf, Engine, Hbb, Image, Keypoint, Options, Processor, Ts, Xs, Y}; use crate::{elapsed, Config, DynConf, Engine, Hbb, Image, Keypoint, Processor, Ts, Xs, Y};
#[derive(Builder, Debug)] #[derive(Builder, Debug)]
pub struct RTMO { pub struct RTMO {
@ -18,8 +18,8 @@ pub struct RTMO {
} }
impl RTMO { impl RTMO {
pub fn new(options: Options) -> Result<Self> { pub fn new(config: Config) -> Result<Self> {
let engine = options.to_engine()?; let engine = Engine::try_from_config(&config.model)?;
let spec = engine.spec().to_string(); let spec = engine.spec().to_string();
let (batch, height, width, ts) = ( let (batch, height, width, ts) = (
engine.batch().opt(), engine.batch().opt(),
@ -27,15 +27,13 @@ impl RTMO {
engine.try_width().unwrap_or(&512.into()).opt(), engine.try_width().unwrap_or(&512.into()).opt(),
engine.ts().clone(), engine.ts().clone(),
); );
let processor = options let nk = config.nk().unwrap_or(17);
.to_processor()? let confs = DynConf::new(config.class_confs(), 1);
let kconfs = DynConf::new(config.keypoint_confs(), nk);
let processor = Processor::try_from_config(&config.processor)?
.with_image_width(width as _) .with_image_width(width as _)
.with_image_height(height as _); .with_image_height(height as _);
let nk = options.nk().unwrap_or(17);
let confs = DynConf::new(options.class_confs(), 1);
let kconfs = DynConf::new(options.keypoint_confs(), nk);
Ok(Self { Ok(Self {
engine, engine,
height, height,
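RTMO's keypoint count and per-keypoint thresholds now come straight from the config, with `nk` defaulting to 17 (the COCO skeleton). A sketch — `with_nk` and `with_keypoint_confs` are assumed setters mirroring the accessors read above, and a concrete variant preset would also supply the model file:

use usls::{models::RTMO, Config};

fn main() -> anyhow::Result<()> {
    let config = Config::rtmo()
        .with_nk(17)                  // assumed setter for `config.nk()`
        .with_keypoint_confs(&[0.35]) // assumed setter; one value applied across keypoints
        .commit()?;
    let _model = RTMO::new(config)?;
    Ok(())
}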

View File

@ -1,100 +1,73 @@
use crate::{models::SamKind, Options}; use crate::{models::SamKind, Config};
/// Model configuration for `Segment Anything Model` /// Model configuration for `Segment Anything Model`
impl Options { impl Config {
pub fn sam() -> Self { pub fn sam() -> Self {
Self::default() Self::default()
.with_model_name("sam") .with_name("sam")
.with_model_ixx(0, 0, 1.into()) .with_encoder_ixx(0, 0, 1.into())
} .with_encoder_ixx(0, 1, 3.into())
.with_encoder_ixx(0, 2, 1024.into())
pub fn sam_encoder() -> Self { .with_encoder_ixx(0, 3, 1024.into())
Self::sam()
.with_model_ixx(0, 2, 1024.into())
.with_model_ixx(0, 3, 1024.into())
.with_resize_mode(crate::ResizeMode::FitAdaptive) .with_resize_mode(crate::ResizeMode::FitAdaptive)
.with_resize_filter("Bilinear") .with_resize_filter("Bilinear")
.with_image_mean(&[123.5, 116.5, 103.5]) .with_image_mean(&[123.5, 116.5, 103.5])
.with_image_std(&[58.5, 57.0, 57.5]) .with_image_std(&[58.5, 57.0, 57.5])
.with_normalize(false) .with_normalize(false)
.with_sam_kind(SamKind::Sam) .with_sam_kind(SamKind::Sam)
.with_low_res_mask(false) .with_sam_low_res_mask(false)
.with_find_contours(true) .with_find_contours(true)
} }
pub fn sam_decoder() -> Self { pub fn sam_v1_base() -> Self {
Self::sam() Self::sam()
.with_encoder_file("sam-vit-b-encoder.onnx")
.with_decoder_file("sam-vit-b-decoder.onnx")
} }
pub fn sam_v1_base_encoder() -> Self { // pub fn sam_v1_base_singlemask_decoder() -> Self {
Self::sam_encoder().with_model_file("sam-vit-b-encoder.onnx") // Self::sam().with_decoder_file("sam-vit-b-decoder-singlemask.onnx")
// }
pub fn sam2_tiny() -> Self {
Self::sam()
.with_encoder_file("sam2-hiera-tiny-encoder.onnx")
.with_sam_kind(SamKind::Sam2)
.with_decoder_file("sam2-hiera-tiny-decoder.onnx")
} }
pub fn sam_v1_base_decoder() -> Self { pub fn sam2_small() -> Self {
Self::sam_decoder().with_model_file("sam-vit-b-decoder.onnx") Self::sam()
} .with_encoder_file("sam2-hiera-small-encoder.onnx")
.with_decoder_file("sam2-hiera-small-decoder.onnx")
pub fn sam_v1_base_singlemask_decoder() -> Self {
Self::sam_decoder().with_model_file("sam-vit-b-decoder-singlemask.onnx")
}
pub fn sam2_tiny_encoder() -> Self {
Self::sam_encoder()
.with_model_file("sam2-hiera-tiny-encoder.onnx")
.with_sam_kind(SamKind::Sam2) .with_sam_kind(SamKind::Sam2)
} }
pub fn sam2_tiny_decoder() -> Self { pub fn sam2_base_plus() -> Self {
Self::sam_decoder().with_model_file("sam2-hiera-tiny-decoder.onnx") Self::sam()
} .with_encoder_file("sam2-hiera-base-plus-encoder.onnx")
.with_decoder_file("sam2-hiera-base-plus-decoder.onnx")
pub fn sam2_small_encoder() -> Self {
Self::sam_encoder()
.with_model_file("sam2-hiera-small-encoder.onnx")
.with_sam_kind(SamKind::Sam2) .with_sam_kind(SamKind::Sam2)
} }
pub fn sam2_small_decoder() -> Self { pub fn mobile_sam_tiny() -> Self {
Self::sam_decoder().with_model_file("sam2-hiera-small-decoder.onnx") Self::sam()
} .with_encoder_file("mobile-sam-vit-t-encoder.onnx")
pub fn sam2_base_plus_encoder() -> Self {
Self::sam_encoder()
.with_model_file("sam2-hiera-base-plus-encoder.onnx")
.with_sam_kind(SamKind::Sam2)
}
pub fn sam2_base_plus_decoder() -> Self {
Self::sam_decoder().with_model_file("sam2-hiera-base-plus-decoder.onnx")
}
pub fn mobile_sam_tiny_encoder() -> Self {
Self::sam_encoder()
.with_model_file("mobile-sam-vit-t-encoder.onnx")
.with_sam_kind(SamKind::MobileSam) .with_sam_kind(SamKind::MobileSam)
.with_decoder_file("mobile-sam-vit-t-decoder.onnx")
} }
pub fn mobile_sam_tiny_decoder() -> Self { pub fn sam_hq_tiny() -> Self {
Self::sam_decoder().with_model_file("mobile-sam-vit-t-decoder.onnx") Self::sam()
} .with_encoder_file("sam-hq-vit-t-encoder.onnx")
pub fn sam_hq_tiny_encoder() -> Self {
Self::sam_encoder()
.with_model_file("sam-hq-vit-t-encoder.onnx")
.with_sam_kind(SamKind::SamHq) .with_sam_kind(SamKind::SamHq)
.with_decoder_file("sam-hq-vit-t-decoder.onnx")
} }
pub fn sam_hq_tiny_decoder() -> Self { pub fn edge_sam_3x() -> Self {
Self::sam_decoder().with_model_file("sam-hq-vit-t-decoder.onnx") Self::sam()
} .with_encoder_file("edge-sam-3x-encoder.onnx")
.with_decoder_file("edge-sam-3x-decoder.onnx")
pub fn edge_sam_3x_encoder() -> Self {
Self::sam_encoder()
.with_model_file("edge-sam-3x-encoder.onnx")
.with_sam_kind(SamKind::EdgeSam) .with_sam_kind(SamKind::EdgeSam)
} }
pub fn edge_sam_3x_decoder() -> Self {
Self::sam_decoder().with_model_file("edge-sam-3x-decoder.onnx")
}
} }
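The encoder/decoder split now survives only at the file level. A custom checkpoint pair follows the same shape as the presets above; the file names below are hypothetical:

use usls::{models::SamKind, Config};

fn main() -> anyhow::Result<()> {
    let _config = Config::sam()
        .with_sam_kind(SamKind::Sam)
        .with_encoder_file("my-sam-encoder.onnx") // hypothetical file
        .with_decoder_file("my-sam-decoder.onnx") // hypothetical file
        .commit()?;
    Ok(())
}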

View File

@ -4,8 +4,7 @@ use ndarray::{s, Axis};
use rand::prelude::*; use rand::prelude::*;
use crate::{ use crate::{
elapsed, DynConf, Engine, Image, Mask, Ops, Options, Polygon, Processor, SamPrompt, Ts, Xs, X, elapsed, Config, DynConf, Engine, Image, Mask, Ops, Polygon, Processor, SamPrompt, Ts, Xs, X, Y,
Y,
}; };
#[derive(Debug, Clone)] #[derive(Debug, Clone)]
@ -49,9 +48,10 @@ pub struct SAM {
} }
impl SAM { impl SAM {
pub fn new(options_encoder: Options, options_decoder: Options) -> Result<Self> { pub fn new(config: Config) -> Result<Self> {
let encoder = options_encoder.to_engine()?; let encoder = Engine::try_from_config(&config.encoder)?;
let decoder = options_decoder.to_engine()?; let decoder = Engine::try_from_config(&config.decoder)?;
let (batch, height, width) = ( let (batch, height, width) = (
encoder.batch().opt(), encoder.batch().opt(),
encoder.try_height().unwrap_or(&1024.into()).opt(), encoder.try_height().unwrap_or(&1024.into()).opt(),
@ -60,24 +60,23 @@ impl SAM {
let ts = Ts::merge(&[encoder.ts(), decoder.ts()]); let ts = Ts::merge(&[encoder.ts(), decoder.ts()]);
let spec = encoder.spec().to_owned(); let spec = encoder.spec().to_owned();
let processor = options_encoder let conf = DynConf::new(config.class_confs(), 1);
.to_processor()? let find_contours = config.find_contours;
.with_image_width(width as _) let kind = match config.sam_kind {
.with_image_height(height as _);
let conf = DynConf::new(options_encoder.class_confs(), 1);
let find_contours = options_encoder.find_contours;
let kind = match options_encoder.sam_kind {
Some(x) => x, Some(x) => x,
None => anyhow::bail!("Error: no clear `SamKind` specified."), None => anyhow::bail!("Error: no clear `SamKind` specified."),
}; };
let use_low_res_mask = match kind { let use_low_res_mask = match kind {
SamKind::Sam | SamKind::MobileSam | SamKind::SamHq => { SamKind::Sam | SamKind::MobileSam | SamKind::SamHq => {
options_encoder.low_res_mask.unwrap_or(false) config.sam_low_res_mask.unwrap_or(false)
} }
SamKind::EdgeSam | SamKind::Sam2 => true, SamKind::EdgeSam | SamKind::Sam2 => true,
}; };
let processor = Processor::try_from_config(&config.processor)?
.with_image_width(width as _)
.with_image_height(height as _);
Ok(Self { Ok(Self {
encoder, encoder,
decoder, decoder,
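End to end, SAM construction collapses from two `Options` values to a single `Config`; a minimal sketch using a preset from the config file above:

use usls::{models::SAM, Config};

fn main() -> anyhow::Result<()> {
    let config = Config::mobile_sam_tiny().commit()?;
    let _model = SAM::new(config)?;
    Ok(())
}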

View File

@ -1,50 +1,28 @@
use crate::Options; use crate::Config;
/// Model configuration for `SAM2.1` /// Model configuration for `SAM2.1`
impl Options { impl Config {
pub fn sam2_encoder() -> Self { pub fn sam2_1_tiny() -> Self {
Self::sam() Self::sam()
.with_model_ixx(0, 2, 1024.into()) .with_encoder_file("sam2.1-hiera-tiny-encoder.onnx")
.with_model_ixx(0, 3, 1024.into()) .with_decoder_file("sam2.1-hiera-tiny-decoder.onnx")
.with_resize_mode(crate::ResizeMode::FitAdaptive)
.with_resize_filter("Bilinear")
.with_image_mean(&[0.485, 0.456, 0.406])
.with_image_std(&[0.229, 0.224, 0.225])
} }
pub fn sam2_decoder() -> Self { pub fn sam2_1_small() -> Self {
Self::sam() Self::sam()
.with_encoder_file("sam2.1-hiera-small-encoder.onnx")
.with_decoder_file("sam2.1-hiera-small-decoder.onnx")
} }
pub fn sam2_1_tiny_encoder() -> Self { pub fn sam2_1_base_plus() -> Self {
Self::sam2_encoder().with_model_file("sam2.1-hiera-tiny-encoder.onnx") Self::sam()
.with_encoder_file("sam2.1-hiera-base-plus-encoder.onnx")
.with_decoder_file("sam2.1-hiera-base-plus-decoder.onnx")
} }
pub fn sam2_1_tiny_decoder() -> Self { pub fn sam2_1_large() -> Self {
Self::sam2_decoder().with_model_file("sam2.1-hiera-tiny-decoder.onnx") Self::sam()
} .with_encoder_file("sam2.1-hiera-large-encoder.onnx")
.with_decoder_file("sam2.1-hiera-large-decoder.onnx")
pub fn sam2_1_small_encoder() -> Self {
Self::sam2_encoder().with_model_file("sam2.1-hiera-small-encoder.onnx")
}
pub fn sam2_1_small_decoder() -> Self {
Self::sam2_decoder().with_model_file("sam2.1-hiera-small-decoder.onnx")
}
pub fn sam2_1_base_plus_encoder() -> Self {
Self::sam2_encoder().with_model_file("sam2.1-hiera-base-plus-encoder.onnx")
}
pub fn sam2_1_base_plus_decoder() -> Self {
Self::sam2_decoder().with_model_file("sam2.1-hiera-base-plus-decoder.onnx")
}
pub fn sam2_1_large_encoder() -> Self {
Self::sam2_encoder().with_model_file("sam2.1-hiera-large-encoder.onnx")
}
pub fn sam2_1_large_decoder() -> Self {
Self::sam2_decoder().with_model_file("sam2.1-hiera-large-decoder.onnx")
} }
} }

Some files were not shown because too many files have changed in this diff.