mirror of
https://github.com/mii443/usls.git
synced 2025-12-03 02:58:22 +00:00
🐍 v0.1.0 (#53)
This commit is contained in:
30
examples/florence2/README.md
Normal file
30
examples/florence2/README.md
Normal file
@@ -0,0 +1,30 @@
|
||||
## Quick Start
|
||||
|
||||
```shell
|
||||
cargo run -r -F cuda --example florence2 -- --device cuda --scale base --dtype fp16
|
||||
```
|
||||
|
||||
|
||||
```shell
|
||||
Task: Caption(0)
|
||||
Ys([Y { Texts: [Text("A green car parked in front of a yellow building.")] }, Y { Texts: [Text("A group of people walking down a street next to a bus.")] }])
|
||||
|
||||
Task: Caption(1)
|
||||
Ys([Y { Texts: [Text("The image shows a green car parked in front of a yellow building with two brown doors. The car is on the road, and the building has a wall and a tree in the background.")] }, Y { Texts: [Text("The image shows a group of people walking down a street next to a bus, with a building in the background. The bus is likely part of the World Electric Emission Bus, which is a new bus that will be launched in Madrid. The people are walking on the road, and there are trees and a sign board to the left of the bus.")] }])
|
||||
|
||||
Task: Caption(2)
|
||||
Ys([Y { Texts: [Text("The image shows a vintage Volkswagen Beetle car parked on a cobblestone street in front of a yellow building with two wooden doors. The car is a light blue color with silver rims and appears to be in good condition. The building has a sloping roof and is painted in a bright yellow color. The sky is blue and there are trees in the background. The overall mood of the image is peaceful and serene.")] }, Y { Texts: [Text("The image shows a blue and white bus with the logo of the Brazilian football club, Cero Emisiones, on the side. The bus is parked on a street with a building in the background. There are several people walking on the sidewalk in front of the bus, some of them are carrying bags and one person is holding a camera. The sky is blue and there are trees and a traffic light visible in the top right corner of the image. The image appears to be taken during the day.")] }])
|
||||
```
|
||||
|
||||
## Results
|
||||
|
||||
| Task | Demo |
|
||||
| -----| ------|
|
||||
| Caption-To-Phrase-Grounding | <img src='https://github.com/jamjamjon/assets/releases/download/florence2/Caption-To-Phrase-Grounding-car.png' alt=''> |
|
||||
| Ocr-With-Region | <img src='https://github.com/jamjamjon/assets/releases/download/florence2/Ocr-With-Region.png' alt=''>|
|
||||
| Dense-Region-Caption | <img src='https://github.com/jamjamjon/assets/releases/download/florence2/Dense-Region-Caption-car.png' alt=''>|
|
||||
| Object-Detection | <img src='https://github.com/jamjamjon/assets/releases/download/florence2/Object-Detection-car.png' alt=''>|
|
||||
| Region-Proposal | <img src='https://github.com/jamjamjon/assets/releases/download/florence2/Region-Proposal.png' alt=''>|
|
||||
| Referring-Expression-Segmentation | <img src='https://github.com/jamjamjon/assets/releases/download/florence2/Referring-Expression-Segmentation.png' alt=''>|
|
||||
|
||||
|
||||
@@ -1,157 +1,176 @@
|
||||
use usls::{models::Florence2, Annotator, DataLoader, Options, Task};
|
||||
|
||||
fn main() -> Result<(), Box<dyn std::error::Error>> {
|
||||
let batch_size = 3;
|
||||
|
||||
// vision encoder
|
||||
let options_vision_encoder = Options::default()
|
||||
.with_model("florence2/base-vision-encoder-f16.onnx")?
|
||||
.with_ixx(0, 2, (512, 768, 800).into())
|
||||
.with_ixx(0, 3, 768.into())
|
||||
.with_ixx(0, 0, (1, batch_size as _, 8).into());
|
||||
|
||||
// text embed
|
||||
let options_text_embed = Options::default()
|
||||
.with_model("florence2/base-embed-tokens-f16.onnx")?
|
||||
.with_tokenizer("florence2/tokenizer.json")?
|
||||
.with_batch(batch_size);
|
||||
|
||||
// transformer encoder
|
||||
let options_encoder = Options::default()
|
||||
.with_model("florence2/base-encoder-f16.onnx")?
|
||||
.with_batch(batch_size);
|
||||
|
||||
// transformer decoder
|
||||
let options_decoder = Options::default()
|
||||
.with_model("florence2/base-decoder-f16.onnx")?
|
||||
.with_batch(batch_size);
|
||||
|
||||
// transformer decoder merged
|
||||
let options_decoder_merged = Options::default()
|
||||
.with_model("florence2/base-decoder-merged-f16.onnx")?
|
||||
.with_batch(batch_size);
|
||||
|
||||
// build model
|
||||
let mut model = Florence2::new(
|
||||
options_vision_encoder,
|
||||
options_text_embed,
|
||||
options_encoder,
|
||||
options_decoder,
|
||||
options_decoder_merged,
|
||||
)?;
|
||||
|
||||
// load images
|
||||
let xs = [
|
||||
// DataLoader::try_read("florence2/car.jpg")?, // for testing region-related tasks
|
||||
DataLoader::try_read("florence2/car.jpg")?,
|
||||
// DataLoader::try_read("images/db.png")?,
|
||||
DataLoader::try_read("assets/bus.jpg")?,
|
||||
];
|
||||
|
||||
// region-related tasks
|
||||
let quantizer = usls::Quantizer::default();
|
||||
// let coords = [449., 270., 556., 372.]; // wheel
|
||||
let coords = [31., 156., 581., 373.]; // car
|
||||
let (width_car, height_car) = (xs[0].width(), xs[0].height());
|
||||
let quantized_coords = quantizer.quantize(&coords, (width_car as _, height_car as _));
|
||||
|
||||
// run with tasks
|
||||
let ys = model.run_with_tasks(
|
||||
&xs,
|
||||
&[
|
||||
// w/ inputs
|
||||
Task::Caption(0),
|
||||
Task::Caption(1),
|
||||
Task::Caption(2),
|
||||
Task::Ocr,
|
||||
Task::OcrWithRegion,
|
||||
Task::RegionProposal,
|
||||
Task::ObjectDetection,
|
||||
Task::DenseRegionCaption,
|
||||
// w/o inputs
|
||||
Task::OpenSetDetection("a vehicle".into()),
|
||||
Task::CaptionToPhraseGrounding(
|
||||
"A vehicle with two wheels parked in front of a building.".into(),
|
||||
),
|
||||
Task::ReferringExpressionSegmentation("a vehicle".into()),
|
||||
Task::RegionToSegmentation(
|
||||
quantized_coords[0],
|
||||
quantized_coords[1],
|
||||
quantized_coords[2],
|
||||
quantized_coords[3],
|
||||
),
|
||||
Task::RegionToCategory(
|
||||
quantized_coords[0],
|
||||
quantized_coords[1],
|
||||
quantized_coords[2],
|
||||
quantized_coords[3],
|
||||
),
|
||||
Task::RegionToDescription(
|
||||
quantized_coords[0],
|
||||
quantized_coords[1],
|
||||
quantized_coords[2],
|
||||
quantized_coords[3],
|
||||
),
|
||||
],
|
||||
)?;
|
||||
|
||||
// annotator
|
||||
let annotator = Annotator::new()
|
||||
.without_bboxes_conf(true)
|
||||
.with_bboxes_thickness(3)
|
||||
.with_saveout_subs(&["Florence2"]);
|
||||
for (task, ys_) in ys.iter() {
|
||||
match task {
|
||||
Task::Caption(_)
|
||||
| Task::Ocr
|
||||
| Task::RegionToCategory(..)
|
||||
| Task::RegionToDescription(..) => {
|
||||
println!("Task: {:?}\n{:?}\n", task, ys_)
|
||||
}
|
||||
Task::DenseRegionCaption => {
|
||||
let annotator = annotator.clone().with_saveout("Dense-Region-Caption");
|
||||
annotator.annotate(&xs, ys_);
|
||||
}
|
||||
Task::RegionProposal => {
|
||||
let annotator = annotator
|
||||
.clone()
|
||||
.without_bboxes_name(false)
|
||||
.with_saveout("Region-Proposal");
|
||||
|
||||
annotator.annotate(&xs, ys_);
|
||||
}
|
||||
Task::ObjectDetection => {
|
||||
let annotator = annotator.clone().with_saveout("Object-Detection");
|
||||
annotator.annotate(&xs, ys_);
|
||||
}
|
||||
Task::OpenSetDetection(_) => {
|
||||
let annotator = annotator.clone().with_saveout("Open-Set-Detection");
|
||||
annotator.annotate(&xs, ys_);
|
||||
}
|
||||
Task::CaptionToPhraseGrounding(_) => {
|
||||
let annotator = annotator
|
||||
.clone()
|
||||
.with_saveout("Caption-To-Phrase-Grounding");
|
||||
annotator.annotate(&xs, ys_);
|
||||
}
|
||||
Task::ReferringExpressionSegmentation(_) => {
|
||||
let annotator = annotator
|
||||
.clone()
|
||||
.with_saveout("Referring-Expression-Segmentation");
|
||||
annotator.annotate(&xs, ys_);
|
||||
}
|
||||
Task::RegionToSegmentation(..) => {
|
||||
let annotator = annotator.clone().with_saveout("Region-To-Segmentation");
|
||||
annotator.annotate(&xs, ys_);
|
||||
}
|
||||
Task::OcrWithRegion => {
|
||||
let annotator = annotator.clone().with_saveout("Ocr-With-Region");
|
||||
annotator.annotate(&xs, ys_);
|
||||
}
|
||||
|
||||
_ => (),
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
use anyhow::Result;
|
||||
use usls::{models::Florence2, Annotator, DataLoader, Options, Scale, Task};
|
||||
|
||||
#[derive(argh::FromArgs)]
|
||||
/// Example
|
||||
struct Args {
|
||||
/// dtype
|
||||
#[argh(option, default = "String::from(\"auto\")")]
|
||||
dtype: String,
|
||||
|
||||
/// device
|
||||
#[argh(option, default = "String::from(\"cpu:0\")")]
|
||||
device: String,
|
||||
|
||||
/// scale
|
||||
#[argh(option, default = "String::from(\"base\")")]
|
||||
scale: String,
|
||||
}
|
||||
|
||||
fn main() -> Result<()> {
|
||||
tracing_subscriber::fmt()
|
||||
.with_env_filter(tracing_subscriber::EnvFilter::from_default_env())
|
||||
.with_timer(tracing_subscriber::fmt::time::ChronoLocal::rfc_3339())
|
||||
.init();
|
||||
|
||||
let args: Args = argh::from_env();
|
||||
|
||||
// load images
|
||||
let xs = [
|
||||
DataLoader::try_read("images/green-car.jpg")?,
|
||||
DataLoader::try_read("assets/bus.jpg")?,
|
||||
];
|
||||
|
||||
// build model
|
||||
let (
|
||||
options_vision_encoder,
|
||||
options_text_embed,
|
||||
options_encoder,
|
||||
options_decoder,
|
||||
options_decoder_merged,
|
||||
) = match args.scale.as_str().try_into()? {
|
||||
Scale::B => (
|
||||
Options::florence2_visual_encoder_base(),
|
||||
Options::florence2_textual_embed_base(),
|
||||
Options::florence2_texual_encoder_base(),
|
||||
Options::florence2_texual_decoder_base(),
|
||||
Options::florence2_texual_decoder_merged_base(),
|
||||
),
|
||||
Scale::L => todo!(),
|
||||
_ => anyhow::bail!("Unsupported Florence2 scale."),
|
||||
};
|
||||
|
||||
let mut model = Florence2::new(
|
||||
options_vision_encoder
|
||||
.with_model_dtype(args.dtype.as_str().try_into()?)
|
||||
.with_model_device(args.device.as_str().try_into()?)
|
||||
.with_batch_size(xs.len())
|
||||
.commit()?,
|
||||
options_text_embed
|
||||
.with_model_dtype(args.dtype.as_str().try_into()?)
|
||||
.with_model_device(args.device.as_str().try_into()?)
|
||||
.with_batch_size(xs.len())
|
||||
.commit()?,
|
||||
options_encoder
|
||||
.with_model_dtype(args.dtype.as_str().try_into()?)
|
||||
.with_model_device(args.device.as_str().try_into()?)
|
||||
.with_batch_size(xs.len())
|
||||
.commit()?,
|
||||
options_decoder
|
||||
.with_model_dtype(args.dtype.as_str().try_into()?)
|
||||
.with_model_device(args.device.as_str().try_into()?)
|
||||
.with_batch_size(xs.len())
|
||||
.commit()?,
|
||||
options_decoder_merged
|
||||
.with_model_dtype(args.dtype.as_str().try_into()?)
|
||||
.with_model_device(args.device.as_str().try_into()?)
|
||||
.with_batch_size(xs.len())
|
||||
.commit()?,
|
||||
)?;
|
||||
|
||||
// tasks
|
||||
let tasks = [
|
||||
// w inputs
|
||||
Task::Caption(0),
|
||||
Task::Caption(1),
|
||||
Task::Caption(2),
|
||||
Task::Ocr,
|
||||
// Task::OcrWithRegion,
|
||||
Task::RegionProposal,
|
||||
Task::ObjectDetection,
|
||||
Task::DenseRegionCaption,
|
||||
// w/o inputs
|
||||
Task::OpenSetDetection("a vehicle"),
|
||||
Task::CaptionToPhraseGrounding("A vehicle with two wheels parked in front of a building."),
|
||||
Task::ReferringExpressionSegmentation("a vehicle"),
|
||||
Task::RegionToSegmentation(
|
||||
// 31, 156, 581, 373, // car
|
||||
449, 270, 556, 372, // wheel
|
||||
),
|
||||
Task::RegionToCategory(
|
||||
// 31, 156, 581, 373,
|
||||
449, 270, 556, 372,
|
||||
),
|
||||
Task::RegionToDescription(
|
||||
// 31, 156, 581, 373,
|
||||
449, 270, 556, 372,
|
||||
),
|
||||
];
|
||||
|
||||
// annotator
|
||||
let annotator = Annotator::new()
|
||||
.without_bboxes_conf(true)
|
||||
.with_bboxes_thickness(3)
|
||||
.with_saveout_subs(&["Florence2"]);
|
||||
|
||||
// inference
|
||||
for task in tasks.iter() {
|
||||
let ys = model.forward(&xs, task)?;
|
||||
|
||||
// annotate
|
||||
match task {
|
||||
Task::Caption(_)
|
||||
| Task::Ocr
|
||||
| Task::RegionToCategory(..)
|
||||
| Task::RegionToDescription(..) => {
|
||||
println!("Task: {:?}\n{:?}\n", task, &ys)
|
||||
}
|
||||
Task::DenseRegionCaption => {
|
||||
let annotator = annotator.clone().with_saveout("Dense-Region-Caption");
|
||||
annotator.annotate(&xs, &ys);
|
||||
}
|
||||
Task::RegionProposal => {
|
||||
let annotator = annotator
|
||||
.clone()
|
||||
.without_bboxes_name(false)
|
||||
.with_saveout("Region-Proposal");
|
||||
|
||||
annotator.annotate(&xs, &ys);
|
||||
}
|
||||
Task::ObjectDetection => {
|
||||
let annotator = annotator.clone().with_saveout("Object-Detection");
|
||||
annotator.annotate(&xs, &ys);
|
||||
}
|
||||
Task::OpenSetDetection(_) => {
|
||||
let annotator = annotator.clone().with_saveout("Open-Set-Detection");
|
||||
annotator.annotate(&xs, &ys);
|
||||
}
|
||||
Task::CaptionToPhraseGrounding(_) => {
|
||||
let annotator = annotator
|
||||
.clone()
|
||||
.with_saveout("Caption-To-Phrase-Grounding");
|
||||
annotator.annotate(&xs, &ys);
|
||||
}
|
||||
Task::ReferringExpressionSegmentation(_) => {
|
||||
let annotator = annotator
|
||||
.clone()
|
||||
.with_saveout("Referring-Expression-Segmentation");
|
||||
annotator.annotate(&xs, &ys);
|
||||
}
|
||||
Task::RegionToSegmentation(..) => {
|
||||
let annotator = annotator.clone().with_saveout("Region-To-Segmentation");
|
||||
annotator.annotate(&xs, &ys);
|
||||
}
|
||||
Task::OcrWithRegion => {
|
||||
let annotator = annotator.clone().with_saveout("Ocr-With-Region");
|
||||
annotator.annotate(&xs, &ys);
|
||||
}
|
||||
|
||||
_ => (),
|
||||
}
|
||||
}
|
||||
|
||||
model.summary();
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user