🐍 v0.1.0 (#53)

Authored by Jamjamjon on 2025-01-12 16:59:57 +08:00, committed by GitHub
parent 4e932c4910
commit 0f2d84b8c5
256 changed files with 12485 additions and 9088 deletions


@@ -0,0 +1,30 @@
## Quick Start
```shell
cargo run -r -F cuda --example florence2 -- --device cuda --scale base --dtype fp16
```
Example output, showing captions at three increasing levels of detail:
```shell
Task: Caption(0)
Ys([Y { Texts: [Text("A green car parked in front of a yellow building.")] }, Y { Texts: [Text("A group of people walking down a street next to a bus.")] }])
Task: Caption(1)
Ys([Y { Texts: [Text("The image shows a green car parked in front of a yellow building with two brown doors. The car is on the road, and the building has a wall and a tree in the background.")] }, Y { Texts: [Text("The image shows a group of people walking down a street next to a bus, with a building in the background. The bus is likely part of the World Electric Emission Bus, which is a new bus that will be launched in Madrid. The people are walking on the road, and there are trees and a sign board to the left of the bus.")] }])
Task: Caption(2)
Ys([Y { Texts: [Text("The image shows a vintage Volkswagen Beetle car parked on a cobblestone street in front of a yellow building with two wooden doors. The car is a light blue color with silver rims and appears to be in good condition. The building has a sloping roof and is painted in a bright yellow color. The sky is blue and there are trees in the background. The overall mood of the image is peaceful and serene.")] }, Y { Texts: [Text("The image shows a blue and white bus with the logo of the Brazilian football club, Cero Emisiones, on the side. The bus is parked on a street with a building in the background. There are several people walking on the sidewalk in front of the bus, some of them are carrying bags and one person is holding a camera. The sky is blue and there are trees and a traffic light visible in the top right corner of the image. The image appears to be taken during the day.")] }])
```
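
Under the hood, the example builds Florence2 from five ONNX sub-models and runs one forward pass per task. Below is a minimal sketch distilled from the example code in this commit (see the full diff further down); it reuses the crate's own constructors, so dtype and device fall back to their defaults:

```rust
use usls::{models::Florence2, DataLoader, Options, Task};

fn main() -> anyhow::Result<()> {
    let xs = [DataLoader::try_read("images/green-car.jpg")?];

    // Five ONNX sub-models make up the Florence2 pipeline.
    let mut model = Florence2::new(
        Options::florence2_visual_encoder_base()
            .with_batch_size(xs.len())
            .commit()?,
        Options::florence2_textual_embed_base()
            .with_batch_size(xs.len())
            .commit()?,
        Options::florence2_texual_encoder_base()
            .with_batch_size(xs.len())
            .commit()?,
        Options::florence2_texual_decoder_base()
            .with_batch_size(xs.len())
            .commit()?,
        Options::florence2_texual_decoder_merged_base()
            .with_batch_size(xs.len())
            .commit()?,
    )?;

    // One forward pass per task; Caption(2) requests the most detailed caption.
    let ys = model.forward(&xs, &Task::Caption(2))?;
    println!("{ys:?}");
    Ok(())
}
```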
## Results
| Task | Demo |
| ---- | ---- |
| Caption-To-Phrase-Grounding | <img src='https://github.com/jamjamjon/assets/releases/download/florence2/Caption-To-Phrase-Grounding-car.png' alt=''> |
| Ocr-With-Region | <img src='https://github.com/jamjamjon/assets/releases/download/florence2/Ocr-With-Region.png' alt=''> |
| Dense-Region-Caption | <img src='https://github.com/jamjamjon/assets/releases/download/florence2/Dense-Region-Caption-car.png' alt=''> |
| Object-Detection | <img src='https://github.com/jamjamjon/assets/releases/download/florence2/Object-Detection-car.png' alt=''> |
| Region-Proposal | <img src='https://github.com/jamjamjon/assets/releases/download/florence2/Region-Proposal.png' alt=''> |
| Referring-Expression-Segmentation | <img src='https://github.com/jamjamjon/assets/releases/download/florence2/Referring-Expression-Segmentation.png' alt=''> |


@@ -1,157 +1,176 @@
use usls::{models::Florence2, Annotator, DataLoader, Options, Task};
fn main() -> Result<(), Box<dyn std::error::Error>> {
let batch_size = 3;
// vision encoder
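    // with_ixx(input, axis, value) constrains a dynamic ONNX input axis
    // (assumed semantics: (min, opt, max) triples, single values fixed):
    // here height 512..=800 (opt 768), width fixed at 768, batch 1..=8.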
let options_vision_encoder = Options::default()
.with_model("florence2/base-vision-encoder-f16.onnx")?
.with_ixx(0, 2, (512, 768, 800).into())
.with_ixx(0, 3, 768.into())
.with_ixx(0, 0, (1, batch_size as _, 8).into());
// text embed
let options_text_embed = Options::default()
.with_model("florence2/base-embed-tokens-f16.onnx")?
.with_tokenizer("florence2/tokenizer.json")?
.with_batch(batch_size);
// transformer encoder
let options_encoder = Options::default()
.with_model("florence2/base-encoder-f16.onnx")?
.with_batch(batch_size);
// transformer decoder
let options_decoder = Options::default()
.with_model("florence2/base-decoder-f16.onnx")?
.with_batch(batch_size);
// transformer decoder merged
let options_decoder_merged = Options::default()
.with_model("florence2/base-decoder-merged-f16.onnx")?
.with_batch(batch_size);
// build model
let mut model = Florence2::new(
options_vision_encoder,
options_text_embed,
options_encoder,
options_decoder,
options_decoder_merged,
)?;
// load images
let xs = [
// DataLoader::try_read("florence2/car.jpg")?, // for testing region-related tasks
DataLoader::try_read("florence2/car.jpg")?,
// DataLoader::try_read("images/db.png")?,
DataLoader::try_read("assets/bus.jpg")?,
];
// region-related tasks
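    // Florence2 encodes regions as location tokens, so pixel coordinates are
    // first quantized into the model's location bins before being passed to
    // region tasks.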
let quantizer = usls::Quantizer::default();
// let coords = [449., 270., 556., 372.]; // wheel
let coords = [31., 156., 581., 373.]; // car
let (width_car, height_car) = (xs[0].width(), xs[0].height());
let quantized_coords = quantizer.quantize(&coords, (width_car as _, height_car as _));
// run with tasks
let ys = model.run_with_tasks(
&xs,
&[
// tasks without extra inputs
Task::Caption(0),
Task::Caption(1),
Task::Caption(2),
Task::Ocr,
Task::OcrWithRegion,
Task::RegionProposal,
Task::ObjectDetection,
Task::DenseRegionCaption,
// tasks with text or region inputs
Task::OpenSetDetection("a vehicle".into()),
Task::CaptionToPhraseGrounding(
"A vehicle with two wheels parked in front of a building.".into(),
),
Task::ReferringExpressionSegmentation("a vehicle".into()),
Task::RegionToSegmentation(
quantized_coords[0],
quantized_coords[1],
quantized_coords[2],
quantized_coords[3],
),
Task::RegionToCategory(
quantized_coords[0],
quantized_coords[1],
quantized_coords[2],
quantized_coords[3],
),
Task::RegionToDescription(
quantized_coords[0],
quantized_coords[1],
quantized_coords[2],
quantized_coords[3],
),
],
)?;
// annotator
let annotator = Annotator::new()
.without_bboxes_conf(true)
.with_bboxes_thickness(3)
.with_saveout_subs(&["Florence2"]);
for (task, ys_) in ys.iter() {
match task {
Task::Caption(_)
| Task::Ocr
| Task::RegionToCategory(..)
| Task::RegionToDescription(..) => {
println!("Task: {:?}\n{:?}\n", task, ys_)
}
Task::DenseRegionCaption => {
let annotator = annotator.clone().with_saveout("Dense-Region-Caption");
annotator.annotate(&xs, ys_);
}
Task::RegionProposal => {
let annotator = annotator
.clone()
.without_bboxes_name(false)
.with_saveout("Region-Proposal");
annotator.annotate(&xs, ys_);
}
Task::ObjectDetection => {
let annotator = annotator.clone().with_saveout("Object-Detection");
annotator.annotate(&xs, ys_);
}
Task::OpenSetDetection(_) => {
let annotator = annotator.clone().with_saveout("Open-Set-Detection");
annotator.annotate(&xs, ys_);
}
Task::CaptionToPhraseGrounding(_) => {
let annotator = annotator
.clone()
.with_saveout("Caption-To-Phrase-Grounding");
annotator.annotate(&xs, ys_);
}
Task::ReferringExpressionSegmentation(_) => {
let annotator = annotator
.clone()
.with_saveout("Referring-Expression-Segmentation");
annotator.annotate(&xs, ys_);
}
Task::RegionToSegmentation(..) => {
let annotator = annotator.clone().with_saveout("Region-To-Segmentation");
annotator.annotate(&xs, ys_);
}
Task::OcrWithRegion => {
let annotator = annotator.clone().with_saveout("Ocr-With-Region");
annotator.annotate(&xs, ys_);
}
_ => (),
}
}
Ok(())
}
use anyhow::Result;
use usls::{models::Florence2, Annotator, DataLoader, Options, Scale, Task};
#[derive(argh::FromArgs)]
/// Example
struct Args {
/// dtype
#[argh(option, default = "String::from(\"auto\")")]
dtype: String,
/// device
#[argh(option, default = "String::from(\"cpu:0\")")]
device: String,
/// scale
#[argh(option, default = "String::from(\"base\")")]
scale: String,
}
fn main() -> Result<()> {
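    // Log verbosity follows the standard RUST_LOG environment variable,
    // which EnvFilter::from_default_env() reads.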
tracing_subscriber::fmt()
.with_env_filter(tracing_subscriber::EnvFilter::from_default_env())
.with_timer(tracing_subscriber::fmt::time::ChronoLocal::rfc_3339())
.init();
let args: Args = argh::from_env();
// load images
let xs = [
DataLoader::try_read("images/green-car.jpg")?,
DataLoader::try_read("assets/bus.jpg")?,
];
// build model
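    // Florence2 is split into five ONNX sub-models that run as one pipeline:
    // vision encoder, token embedding, transformer encoder, decoder, and a
    // merged decoder (typically combining first-step and with-past decoding).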
let (
options_vision_encoder,
options_text_embed,
options_encoder,
options_decoder,
options_decoder_merged,
) = match args.scale.as_str().try_into()? {
Scale::B => (
Options::florence2_visual_encoder_base(),
Options::florence2_textual_embed_base(),
Options::florence2_texual_encoder_base(),
Options::florence2_texual_decoder_base(),
Options::florence2_texual_decoder_merged_base(),
),
Scale::L => todo!(),
_ => anyhow::bail!("Unsupported Florence2 scale."),
};
let mut model = Florence2::new(
options_vision_encoder
.with_model_dtype(args.dtype.as_str().try_into()?)
.with_model_device(args.device.as_str().try_into()?)
.with_batch_size(xs.len())
.commit()?,
options_text_embed
.with_model_dtype(args.dtype.as_str().try_into()?)
.with_model_device(args.device.as_str().try_into()?)
.with_batch_size(xs.len())
.commit()?,
options_encoder
.with_model_dtype(args.dtype.as_str().try_into()?)
.with_model_device(args.device.as_str().try_into()?)
.with_batch_size(xs.len())
.commit()?,
options_decoder
.with_model_dtype(args.dtype.as_str().try_into()?)
.with_model_device(args.device.as_str().try_into()?)
.with_batch_size(xs.len())
.commit()?,
options_decoder_merged
.with_model_dtype(args.dtype.as_str().try_into()?)
.with_model_device(args.device.as_str().try_into()?)
.with_batch_size(xs.len())
.commit()?,
)?;
// tasks
let tasks = [
// tasks without extra inputs
Task::Caption(0),
Task::Caption(1),
Task::Caption(2),
Task::Ocr,
// Task::OcrWithRegion,
Task::RegionProposal,
Task::ObjectDetection,
Task::DenseRegionCaption,
// tasks with text or region inputs
Task::OpenSetDetection("a vehicle"),
Task::CaptionToPhraseGrounding("A vehicle with two wheels parked in front of a building."),
Task::ReferringExpressionSegmentation("a vehicle"),
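        // Region tasks take (x1, y1, x2, y2) pixel coordinates. The manual
        // Quantizer step from the previous version is gone, so quantization
        // appears to happen inside the model now.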
Task::RegionToSegmentation(
// 31, 156, 581, 373, // car
449, 270, 556, 372, // wheel
),
Task::RegionToCategory(
// 31, 156, 581, 373,
449, 270, 556, 372,
),
Task::RegionToDescription(
// 31, 156, 581, 373,
449, 270, 556, 372,
),
];
// annotator
let annotator = Annotator::new()
.without_bboxes_conf(true)
.with_bboxes_thickness(3)
.with_saveout_subs(&["Florence2"]);
// inference
for task in tasks.iter() {
let ys = model.forward(&xs, task)?;
// annotate
match task {
Task::Caption(_)
| Task::Ocr
| Task::RegionToCategory(..)
| Task::RegionToDescription(..) => {
println!("Task: {:?}\n{:?}\n", task, &ys)
}
Task::DenseRegionCaption => {
let annotator = annotator.clone().with_saveout("Dense-Region-Caption");
annotator.annotate(&xs, &ys);
}
Task::RegionProposal => {
let annotator = annotator
.clone()
.without_bboxes_name(false)
.with_saveout("Region-Proposal");
annotator.annotate(&xs, &ys);
}
Task::ObjectDetection => {
let annotator = annotator.clone().with_saveout("Object-Detection");
annotator.annotate(&xs, &ys);
}
Task::OpenSetDetection(_) => {
let annotator = annotator.clone().with_saveout("Open-Set-Detection");
annotator.annotate(&xs, &ys);
}
Task::CaptionToPhraseGrounding(_) => {
let annotator = annotator
.clone()
.with_saveout("Caption-To-Phrase-Grounding");
annotator.annotate(&xs, &ys);
}
Task::ReferringExpressionSegmentation(_) => {
let annotator = annotator
.clone()
.with_saveout("Referring-Expression-Segmentation");
annotator.annotate(&xs, &ys);
}
Task::RegionToSegmentation(..) => {
let annotator = annotator.clone().with_saveout("Region-To-Segmentation");
annotator.annotate(&xs, &ys);
}
Task::OcrWithRegion => {
let annotator = annotator.clone().with_saveout("Ocr-With-Region");
annotator.annotate(&xs, &ys);
}
_ => (),
}
}
model.summary();
Ok(())
}