diff --git a/.github/workflows/rust-ci.yml b/.github/workflows/rust-ci.yml
new file mode 100644
index 0000000..366b451
--- /dev/null
+++ b/.github/workflows/rust-ci.yml
@@ -0,0 +1,78 @@
+name: Rust
+
+on:
+  push:
+    branches: [ "main" ]  # run CI for every push to `main`
+  pull_request:
+    branches: [ "main" ]  # run CI for pull requests that target `main`
+
+jobs:
+  # Type-checks the workspace and its examples on every OS in the matrix.
+  check:
+    name: Check
+    runs-on: ${{ matrix.os }}
+    strategy:
+      matrix:
+        os: [ubuntu-latest, windows-latest, macOS-latest]
+        rust: [stable]
+    steps:
+      - uses: actions/checkout@v4
+      # actions-rs/* is archived; dtolnay/rust-toolchain is the maintained
+      # replacement. The @master ref is used because the toolchain comes
+      # from the `toolchain` input rather than from the action ref.
+      - uses: dtolnay/rust-toolchain@master
+        with:
+          toolchain: ${{ matrix.rust }}
+      - name: cargo check
+        run: cargo check --workspace --examples
+
+  # Runs `cargo test` on every OS in the matrix (maintained actions only).
+  test:
+    name: Test Suite
+    runs-on: ${{ matrix.os }}
+    strategy:
+      matrix:
+        os: [ubuntu-latest, windows-latest, macOS-latest]
+        rust: [stable]
+    steps:
+      - uses: actions/checkout@v4
+      - uses: dtolnay/rust-toolchain@master
+        with:
+          toolchain: ${{ matrix.rust }}
+      # NOTE(review): per the cargo docs, `--examples` restricts the run to
+      # example targets, so library unit, integration and doc tests are
+      # skipped. Drop the flag if those are meant to run - confirm intent.
+      - name: cargo test
+        run: cargo test --workspace --examples
+
+  # Fails the build when any crate in the workspace is not rustfmt-clean.
+  fmt:
+    name: Rustfmt
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      # actions-rs/* is archived (Node 12 runtime, deprecated `set-output`).
+      # dtolnay/rust-toolchain is the maintained replacement and installs
+      # the component directly, so no extra `rustup component add` step.
+      - uses: dtolnay/rust-toolchain@stable
+        with:
+          components: rustfmt
+      # `--check` reports formatting diffs without rewriting any file.
+      - name: cargo fmt
+        run: cargo fmt --all -- --check
+
+  # Lints every target with clippy; any warning fails the build.
+  clippy:
+    name: Clippy
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      # actions-rs/* is archived; dtolnay/rust-toolchain is maintained and
+      # installs the component directly, so no extra `rustup` step is needed.
+      - uses: dtolnay/rust-toolchain@stable
+        with:
+          components: clippy
+      # `--all-targets` already covers `--tests` and `--examples`, so those
+      # redundant flags are dropped.
+      - name: cargo clippy
+        run: cargo clippy --workspace --all-targets --all-features -- -D warnings
diff --git a/.gitignore b/.gitignore
index 6985cf1..b99985e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -12,3 +12,9 @@ Cargo.lock
 
 # MSVC Windows builds of rustc generate these, which store debugging information
 *.pdb
+
+
+.debug
+.vscode
+runs/
+.DS_Store
diff --git a/Cargo.toml b/Cargo.toml
new file mode 100644
index 0000000..6b0cbc9
--- /dev/null
+++ b/Cargo.toml
@@ -0,0 +1,39 @@
+[package]
+name = "usls"
+version = "0.0.1"
+edition = "2021"
+authors = ["Jamjamjon <jamjamjon.usls@gmail.com>"]
+description = "A Rust library integrated with ONNXRuntime, providing a collection of ML models."
+license = "MIT"
+readme = "README.md"
+repository = "https://github.com/jamjamjon/usls"
+
+[dependencies]
+clap = { version = "4.2.4", features = ["derive"] }
+image = { version = "0.24.7", default-features = false, features = [
+    "jpeg",
+    "png",
+    "tiff",
+    "webp",
+    "webp-encoder",
+    "bmp",
+] }
+imageproc = { version = "0.23.0", default-features = false }
+ndarray = "0.15.6"
+# ort-sys = { version = "2.0.0-alpha.4" }
+# ort = { version = "2.0.0-alpha.4", default-features = false, features = ["load-dynamic", "copy-dylibs", "half", "ndarray", "cuda", "tensorrt", "coreml", "openvino"] }
+ort = { version = "2.0.0-alpha.4", default-features = false, features = ["load-dynamic", "copy-dylibs", "profiling", "half", "ndarray", "cuda", "tensorrt", "coreml", "ureq", "openvino"] }
+rusttype = { version = "0.9", default-features = false }
+anyhow = "1.0.75"
+regex = "1.5.4"
+rand = "0.8.5"
+chrono = "0.4.30"
+half = "2.3.1"
+dirs = "5.0.1"
+ureq = { version = "2.9.1", features = ["socks-proxy"] }
+walkdir = "2.5.0"
+tokenizers = "0.15.2"
+itertools = "0.12.1"
+usearch = "2.9.1"
+rayon = "1.10.0"
+indicatif = "0.17.8"
diff --git a/README.md b/README.md
index 52b605b..9861d56 100644
--- a/README.md
+++ b/README.md
@@ -1 +1,105 @@
-# usls
\ No newline at end of file
+# usls
+
+A Rust library integrated with **ONNXRuntime**, providing a collection of **Computer Vision** and **Vision-Language** models including [YOLOv8](https://github.com/ultralytics/ultralytics) `(Classification, Segmentation, Detection and Pose Detection)`, [YOLOv9](https://github.com/WongKinYiu/yolov9), [RTDETR](https://arxiv.org/abs/2304.08069), [CLIP](https://github.com/openai/CLIP), [DINOv2](https://github.com/facebookresearch/dinov2), [FastSAM](https://github.com/CASIA-IVA-Lab/FastSAM), [YOLO-World](https://github.com/AILab-CVC/YOLO-World), [BLIP](https://arxiv.org/abs/2201.12086), and others. Many execution providers are supported, such as `CUDA`, `TensorRT` and `CoreML`.
+
+
+## Supported Models
+
+|         Model         |         Example         |     CUDA(f32)     |     CUDA(f16)     |       TensorRT(f32)       |       TensorRT(f16)       | 
+| :-------------------: | :----------------------: | :----------------: | :----------------: | :------------------------: | :-----------------------: | 
+|   YOLOv8-detection   |   [demo](examples/yolov8)   |         ✅         |         ✅         |             ✅             |            ✅            |                    
+|      YOLOv8-pose      |   [demo](examples/yolov8)   |         ✅         |         ✅         |             ✅             |            ✅            |   
+| YOLOv8-classification |   [demo](examples/yolov8)   |         ✅         |         ✅         |             ✅             |            ✅            |               
+|  YOLOv8-segmentation  |   [demo](examples/yolov8)   |         ✅         |         ✅         |             ✅             |            ✅            |               
+|      YOLOv8-OBB      |    ***TODO***    | ***TODO*** | ***TODO*** |     ***TODO***     |    ***TODO***    |
+|        YOLOv9        |   [demo](examples/yolov9)   |         ✅         |         ✅         |             ✅             |            ✅            |                           
+|        RT-DETR        |   [demo](examples/rtdetr)   |         ✅         |         ✅         |             ✅             |            ✅            |          
+|        FastSAM        |  [demo](examples/fastsam)  |         ✅         |         ✅         |             ✅             |            ✅            |     
+|      YOLO-World      | [demo](examples/yolo-world) |         ✅         |         ✅         |             ✅             |            ✅            |      
+|        DINOv2        |   [demo](examples/dinov2)   |         ✅         |         ✅         |             ✅             |            ✅            |      
+|         CLIP         |    [demo](examples/clip)    |         ✅         |         ✅         | ✅ visual<br />❌ textual | ✅ visual<br />❌ textual |                   
+|         BLIP         |    [demo](examples/blip)    |         ✅         |         ✅         | ✅ visual<br />❌ textual | ✅ visual<br />❌ textual |     
+|     OCR(DB, SVTR)     |    ***TODO***    | ***TODO*** | ***TODO*** |     ***TODO***     |    ***TODO***    |
+
+## Solution Models
+Additionally, this repo also provides some solution models such as pedestrian `fall detection`, `head detection`, `trash detection`, and more.
+
+|             Model             |             Example             |                                    Result                                    |
+| :---------------------------: | :------------------------------: | :--------------------------------------------------------------------------: |
+|    face-landmark detection    |    [demo](examples/yolov8-face)    |   <img src="./examples/yolov8-face/demo.jpg" width="400" height="300">  |
+|        head detection        |    [demo](examples/yolov8-head)    |   <img src="./examples/yolov8-head/demo.jpg" width="400" height="300">   |
+|      fall detection      |  [demo](examples/yolov8-falldown)  | <img src="./examples/yolov8-falldown/demo.jpg" width="400" height="300"> |
+| trash detection | [demo](examples/yolov8-trash) |  <img src="./examples/yolov8-trash/demo.jpg" width="400" height="260">  |
+
+## Demo
+
+```
+cargo run -r --example yolov8   # fastsam, yolov9, blip, clip, dinov2, yolo-world...
+```
+
+## Integrate into your own project
+
+#### 1. Install [ort](https://github.com/pykeio/ort)
+
+check **[ort guide](https://ort.pyke.io/setup/linking)**
+
+<details close>
+<summary>For Linux or MacOS users</summary>	
+
+- First, download the latest release from [ONNXRuntime Releases](https://github.com/microsoft/onnxruntime/releases)
+- Then point `ORT_DYLIB_PATH` at the downloaded library:
+   ```shell
+   export ORT_DYLIB_PATH=/Users/qweasd/Desktop/onnxruntime-osx-arm64-1.17.1/lib/libonnxruntime.1.17.1.dylib
+   ```
+</details>
+
+#### 2. Add `usls` as a dependency to your project's `Cargo.toml`:
+
+```
+[dependencies]
+usls = "0.0.1"
+```
+
+#### 3. Set model `Options` and build `model`, then you're ready to go.
+
+```Rust
+use usls::{models::YOLO, DataLoader, Options};
+
+fn main() -> Result<(), Box<dyn std::error::Error>> {
+    // 1.build model
+    let options = Options::default()
+        .with_model("../models/yolov8m-seg-dyn-f16.onnx")
+        .with_trt(0) // using cuda(0) by default
+        // when the model has dynamic shapes
+        .with_i00((1, 2, 4).into()) // dynamic batch
+        .with_i02((416, 640, 800).into())   // dynamic height
+        .with_i03((416, 640, 800).into())   // dynamic width
+        .with_confs(&[0.4, 0.15]) // person: 0.4, others: 0.15
+        .with_saveout("YOLOv8");    // save results
+    let mut model = YOLO::new(&options)?;
+
+    // 2.build dataloader
+    let dl = DataLoader::default()
+        .with_batch(model.batch.opt as usize)
+        .load("./assets/")?;
+
+    // 3.run
+    for (xs, _paths) in dl {
+        let _y = model.run(&xs)?;
+    }
+    Ok(())
+}
+```
+
+## Script: convert an ONNX model from `float32` to `float16`
+
+```python
+import onnx
+from pathlib import Path
+from onnxconverter_common import float16
+
+model_f32 = "onnx_model.onnx"
+model_f16 = float16.convert_float_to_float16(onnx.load(model_f32))
+saveout = Path(model_f32).with_name(Path(model_f32).stem + "-f16.onnx")
+onnx.save(model_f16, saveout)
+```
diff --git a/assets/bus.jpg b/assets/bus.jpg
new file mode 100644
index 0000000..40eaaf5
Binary files /dev/null and b/assets/bus.jpg differ
diff --git a/assets/falldown.jpg b/assets/falldown.jpg
new file mode 100644
index 0000000..1492401
Binary files /dev/null and b/assets/falldown.jpg differ
diff --git a/assets/kids.jpg b/assets/kids.jpg
new file mode 100644
index 0000000..7eda4f3
Binary files /dev/null and b/assets/kids.jpg differ
diff --git a/assets/trash.jpg b/assets/trash.jpg
new file mode 100644
index 0000000..2ead8d5
Binary files /dev/null and b/assets/trash.jpg differ
diff --git a/examples/blip/README.md b/examples/blip/README.md
new file mode 100644
index 0000000..823fdb5
--- /dev/null
+++ b/examples/blip/README.md
@@ -0,0 +1,53 @@
+This demo shows how to use [BLIP](https://arxiv.org/abs/2201.12086) to do conditional or unconditional image captioning.
+
+
+## Quick Start
+
+```shell
+cargo run -r --example blip
+```
+
+## Or you can do it manually
+
+
+### 1. Download the BLIP ONNX models
+
+[blip-visual-base](https://github.com/jamjamjon/assets/releases/download/v0.0.1/blip-visual-base.onnx)  
+[blip-textual-base](https://github.com/jamjamjon/assets/releases/download/v0.0.1/blip-textual-base.onnx)
+
+
+### 2. Specify the ONNX model path in `main.rs`
+
+```Rust
+    // visual
+    let options_visual = Options::default()
+        .with_model("VISUAL_MODEL")   // <= modify this
+        .with_profile(false);
+
+    // textual
+    let options_textual = Options::default()
+        .with_model("TEXTUAL_MODEL")  // <= modify this
+        .with_profile(false);
+
+```
+
+### 3. Then, run
+
+```bash
+cargo run -r --example blip
+```
+
+
+## Results
+
+```shell
+[Unconditional image captioning]: a group of people walking around a bus
+[Conditional image captioning]: three man walking in front of a bus
+```
+
+## TODO
+
+* [ ] text decode with Top-p sample
+* [ ] VQA
+* [ ] Retrieval
+* [ ] TensorRT support for textual model
diff --git a/examples/blip/main.rs b/examples/blip/main.rs
new file mode 100644
index 0000000..a21c89a
--- /dev/null
+++ b/examples/blip/main.rs
@@ -0,0 +1,29 @@
+use usls::{models::Blip, Options};
+
+fn main() -> Result<(), Box<dyn std::error::Error>> {
+    // Image fed to both captioning passes.
+    let image = "./assets/bus.jpg";
+
+    // Options for the visual model.
+    let visual = Options::default()
+        .with_model("../models/blip-visual-base.onnx")
+        .with_i00((1, 1, 4).into())
+        .with_profile(false);
+
+    // Options for the textual model; each dynamic input axis is declared.
+    let textual = Options::default()
+        .with_model("../models/blip-textual-base.onnx")
+        .with_i00((1, 1, 4).into()) // input_id: batch
+        .with_i01((1, 1, 4).into()) // input_id: seq_len
+        .with_i10((1, 1, 4).into()) // attention_mask: batch
+        .with_i11((1, 1, 4).into()) // attention_mask: seq_len
+        .with_i20((1, 1, 4).into()) // encoder_hidden_states: batch
+        .with_i30((1, 1, 4).into()) // encoder_attention_mask: batch
+        .with_profile(false);
+
+    // Build the model, then caption without and with a text prompt.
+    let mut blip = Blip::new(visual, textual)?;
+    blip.caption(image, None)?;
+    blip.caption(image, Some("three man"))?;
+    Ok(())
+}
diff --git a/examples/clip/README.md b/examples/clip/README.md
new file mode 100644
index 0000000..230e6e7
--- /dev/null
+++ b/examples/clip/README.md
@@ -0,0 +1,58 @@
+This demo showcases how to use [CLIP](https://github.com/openai/CLIP) to compute similarity between texts and images, which can be employed for image-to-text or text-to-image retrieval tasks.
+
+## Quick Start
+
+```shell
+cargo run -r --example clip
+```
+
+## Or you can do it manually
+
+
+### 1. Download the CLIP ONNX models
+
+[clip-b32-visual](https://github.com/jamjamjon/assets/releases/download/v0.0.1/clip-b32-visual.onnx)  
+[clip-b32-textual](https://github.com/jamjamjon/assets/releases/download/v0.0.1/clip-b32-textual.onnx)
+
+
+### 2. Specify the ONNX model path in `main.rs`
+
+```Rust
+    // visual
+    let options_visual = Options::default()
+        .with_model("VISUAL_MODEL")  // <= modify this
+        .with_i00((1, 1, 4).into())
+        .with_profile(false);
+
+    // textual
+    let options_textual = Options::default()
+        .with_model("TEXTUAL_MODEL")  // <= modify this
+        .with_i00((1, 1, 4).into())
+        .with_profile(false);
+```
+
+### 3. Then, run
+
+```bash
+cargo run -r --example clip
+```
+
+
+
+## Results
+
+```shell
+(82.24775%) ./examples/clip/images/carrot.jpg => 几个胡萝卜 
+[0.06708972, 0.0067733657, 0.0019306632, 0.8224775, 0.003044935, 0.083962336, 0.014721389]
+
+(85.56889%) ./examples/clip/images/doll.jpg => There is a doll with red hair and a clock on a table 
+[0.0786363, 0.0004783095, 0.00060898095, 0.06286741, 0.0006842306, 0.8556889, 0.0010357979]
+
+(90.03625%) ./examples/clip/images/peoples.jpg => Some people holding wine glasses in a restaurant 
+[0.07473288, 0.0027821448, 0.0075673857, 0.010874652, 0.003041679, 0.0006387719, 0.9003625]
+```
+
+
+## TODO
+
+* [ ] TensorRT support for textual model
diff --git a/examples/clip/images/carrot.jpg b/examples/clip/images/carrot.jpg
new file mode 100644
index 0000000..dd51810
Binary files /dev/null and b/examples/clip/images/carrot.jpg differ
diff --git a/examples/clip/images/doll.jpg b/examples/clip/images/doll.jpg
new file mode 100644
index 0000000..0a3935a
Binary files /dev/null and b/examples/clip/images/doll.jpg differ
diff --git a/examples/clip/images/peoples.jpg b/examples/clip/images/peoples.jpg
new file mode 100644
index 0000000..3953afa
Binary files /dev/null and b/examples/clip/images/peoples.jpg differ
diff --git a/examples/clip/main.rs b/examples/clip/main.rs
new file mode 100644
index 0000000..2b0aaf8
--- /dev/null
+++ b/examples/clip/main.rs
@@ -0,0 +1,63 @@
+use usls::{models::Clip, ops, DataLoader, Options};
+
+fn main() -> Result<(), Box<dyn std::error::Error>> {
+    // visual encoder options
+    let options_visual = Options::default()
+        .with_model("../models/clip-b32-visual-dyn.onnx")
+        .with_i00((1, 1, 4).into())
+        .with_profile(false);
+
+    // textual encoder options
+    let options_textual = Options::default()
+        .with_model("../models/clip-b32-textual-dyn.onnx")
+        .with_i00((1, 1, 4).into())
+        .with_profile(false);
+
+    // build model
+    let model = Clip::new(options_visual, options_textual)?;
+
+    // candidate captions; every image is scored against all of them
+    let texts = vec![
+        "A photo of a dinosaur ".to_string(),
+        "A photo of a cat".to_string(),
+        "A photo of a dog".to_string(),
+        "几个胡萝卜".to_string(),
+        "There are some playing cards on a striped table cloth".to_string(),
+        "There is a doll with red hair and a clock on a table".to_string(),
+        "Some people holding wine glasses in a restaurant".to_string(),
+    ];
+    let feats_text = model.encode_texts(&texts)?; // [n, ndim]
+
+    // load images in batches sized by the visual model
+    let dl = DataLoader::default()
+        .with_batch(model.batch_visual())
+        .load("./examples/clip/images")?;
+
+    for (images, paths) in dl {
+        // `?` rather than `unwrap()`: report encoder errors instead of panicking
+        let feats_image = model.encode_images(&images)?;
+
+        // use image to query texts
+        let matrix = ops::dot2(&feats_image, &feats_text)?; // [m, n]
+
+        // report the highest-scoring text for each image in the batch
+        for (i, path) in paths.iter().enumerate() {
+            let probs = &matrix[i];
+            let (id, &score) = probs
+                .iter()
+                .enumerate()
+                .reduce(|max, x| if x.1 > max.1 { x } else { max })
+                .ok_or("no similarity scores: the text list is empty")?;
+
+            println!(
+                "({:?}%) {} => {} ",
+                score * 100.0,
+                path.display(),
+                &texts[id]
+            );
+            println!("{:?}\n", probs);
+        }
+    }
+
+    Ok(())
+}
diff --git a/examples/dinov2/README.md b/examples/dinov2/README.md
new file mode 100644
index 0000000..8a30ecb
--- /dev/null
+++ b/examples/dinov2/README.md
@@ -0,0 +1,50 @@
+This demo showcases how to use `DINOv2` to compute image similarity, applicable for image-to-image retrieval tasks.
+
+## Quick Start
+
+```shell
+cargo run -r --example dinov2
+```
+
+## Or you can do it manually
+
+### 1. Download the DINOv2 ONNX model
+
+- [dinov2-s14](https://github.com/jamjamjon/assets/releases/download/v0.0.1/dinov2-s14.onnx)
+- [dinov2-s14-dyn](https://github.com/jamjamjon/assets/releases/download/v0.0.1/dinov2-s14-dyn.onnx)
+- [dinov2-s14-dyn-f16](https://github.com/jamjamjon/assets/releases/download/v0.0.1/dinov2-s14-dyn-f16.onnx)
+
+- [dinov2-b14](https://github.com/jamjamjon/assets/releases/download/v0.0.1/dinov2-b14.onnx)
+- [dinov2-b14-dyn](https://github.com/jamjamjon/assets/releases/download/v0.0.1/dinov2-b14-dyn.onnx)
+- [dinov2-b14-dyn-f16](https://github.com/jamjamjon/assets/releases/download/v0.0.1/dinov2-b14-dyn-f16.onnx)
+
+### 2. Specify the ONNX model path in `main.rs`
+
+```Rust
+let options = Options::default()
+    .with_model("ONNX_PATH")    // <= modify this
+    .with_profile(false);
+
+// build index
+let options = IndexOptions {
+    dimensions: 384, // 768 for vitb; 384 for vits
+    metric: MetricKind::L2sq,
+    quantization: ScalarKind::F16,
+    ..Default::default()
+};
+```
+
+### 3. Then, run
+
+```bash
+cargo run -r --example dinov2
+```
+
+## Results
+
+```shell
+Top-1 distance: 0.0 => "./examples/dinov2/images/bus.jpg"
+Top-2 distance: 1.8332717 => "./examples/dinov2/images/dog.png"
+Top-3 distance: 1.9672602 => "./examples/dinov2/images/cat.png"
+Top-4 distance: 1.978817 => "./examples/dinov2/images/carrot.jpg"
+```
diff --git a/examples/dinov2/images/GlqO.jpg b/examples/dinov2/images/GlqO.jpg
new file mode 100644
index 0000000..394f48f
Binary files /dev/null and b/examples/dinov2/images/GlqO.jpg differ
diff --git a/examples/dinov2/images/JasD.jpg b/examples/dinov2/images/JasD.jpg
new file mode 100644
index 0000000..4e6f31a
Binary files /dev/null and b/examples/dinov2/images/JasD.jpg differ
diff --git a/examples/dinov2/images/bus.jpg b/examples/dinov2/images/bus.jpg
new file mode 100644
index 0000000..40eaaf5
Binary files /dev/null and b/examples/dinov2/images/bus.jpg differ
diff --git a/examples/dinov2/main.rs b/examples/dinov2/main.rs
new file mode 100644
index 0000000..a969bc9
--- /dev/null
+++ b/examples/dinov2/main.rs
@@ -0,0 +1,57 @@
+use usearch::ffi::{IndexOptions, MetricKind, ScalarKind};
+use usls::{models::Dinov2, DataLoader, Options};
+
+fn main() -> Result<(), Box<dyn std::error::Error>> {
+    // build model (batch, height and width are each pinned to a single value)
+    let options = Options::default()
+        .with_model("../models/dinov2-s14-dyn-f16.onnx")
+        .with_i00((1, 1, 1).into())
+        .with_i02((224, 224, 224).into())
+        .with_i03((224, 224, 224).into());
+    let mut model = Dinov2::new(&options)?;
+
+    // build dataloader over the gallery images
+    let dl = DataLoader::default()
+        .with_batch(model.batch.opt as usize)
+        .load("./examples/dinov2/images")?;
+
+    // load the query image and compute its feature
+    let query = image::io::Reader::open("./assets/bus.jpg")?.decode()?;
+    let query = model.run(&[query])?;
+
+    // build index
+    let options = IndexOptions {
+        dimensions: 384, // 768 for vitb; 384 for vits
+        metric: MetricKind::L2sq,
+        quantization: ScalarKind::F16,
+        ..Default::default()
+    };
+    let index = usearch::new_index(&options)?;
+    index.reserve(dl.clone().count())?;
+
+    // add one entry per loader batch; keys only match `paths` when batch is 1
+    for (idx, (image, _path)) in dl.clone().enumerate() {
+        let y = model.run(&image)?;
+        index.add(idx as u64, &y.into_raw_vec())?;
+    }
+
+    // search, then print each match with its distance and source path
+    let topk = 10;
+    let matches = index.search(&query.into_raw_vec(), topk)?;
+    let paths = dl.paths;
+    for (idx, (k, score)) in matches
+        .keys
+        .into_iter()
+        .zip(matches.distances) // `zip` takes `IntoIterator`; no `.into_iter()`
+        .enumerate()
+    {
+        println!(
+            "Top-{} distance: {:?} => {:?}",
+            idx + 1,
+            score,
+            paths[k as usize]
+        );
+    }
+
+    Ok(())
+}
diff --git a/examples/fastsam/README.md b/examples/fastsam/README.md
new file mode 100644
index 0000000..d2ecc03
--- /dev/null
+++ b/examples/fastsam/README.md
@@ -0,0 +1,41 @@
+## Quick Start
+
+```shell
+cargo run -r --example fastsam
+```
+
+## Or you can do it manually
+
+
+### 1. Download or export the ONNX model
+
+- **Export**  
+
+    ```bash
+    pip install -U ultralytics
+    yolo export model=FastSAM-s.pt format=onnx simplify dynamic
+    ```
+
+- **Download**  
+
+    [FastSAM-s-dyn-f16](https://github.com/jamjamjon/assets/releases/download/v0.0.1/FastSAM-s-dyn-f16.onnx)
+
+### 2. Specify the ONNX model path in `main.rs`
+
+```Rust
+let options = Options::default()
+    .with_model("../models/FastSAM-s-dyn-f16.onnx")    // <= modify this
+    .with_saveout("FastSAM")
+    .with_profile(false);
+let mut model = YOLO::new(&options)?;
+```
+
+### 3. Then, run
+
+```bash
+cargo run -r --example fastsam
+```
+
+## Results
+
+![](./demo.jpg)
diff --git a/examples/fastsam/demo.jpg b/examples/fastsam/demo.jpg
new file mode 100644
index 0000000..d60a5d6
Binary files /dev/null and b/examples/fastsam/demo.jpg differ
diff --git a/examples/fastsam/main.rs b/examples/fastsam/main.rs
new file mode 100644
index 0000000..50a2f33
--- /dev/null
+++ b/examples/fastsam/main.rs
@@ -0,0 +1,22 @@
+use usls::{models::YOLO, DataLoader, Options};
+
+fn main() -> Result<(), Box<dyn std::error::Error>> {
+    // Configure FastSAM with dynamic batch/height/width and build the model.
+    let opts = Options::default()
+        .with_model("../models/FastSAM-s-dyn-f16.onnx")
+        .with_i00((1, 1, 4).into())
+        .with_i02((416, 640, 800).into())
+        .with_i03((416, 640, 800).into())
+        .with_confs(&[0.4, 0.15]) // NOTE(review): same values as the YOLO demos - confirm
+        .with_saveout("FastSAM")
+        .with_profile(false);
+    let mut fastsam = YOLO::new(&opts)?;
+
+    // Load the demo image and take the first batch from the loader.
+    let mut loader = DataLoader::default().load("./assets/bus.jpg")?;
+    let (images, _paths) = loader.next().unwrap();
+
+    // Run inference on that batch.
+    fastsam.run(&images)?;
+    Ok(())
+}
diff --git a/examples/rtdetr/README.md b/examples/rtdetr/README.md
new file mode 100644
index 0000000..9d5921a
--- /dev/null
+++ b/examples/rtdetr/README.md
@@ -0,0 +1,37 @@
+## Quick Start
+
+```shell
+cargo run -r --example rtdetr
+```
+
+## Or you can do it manually
+
+### 1. Download or export the ONNX model
+
+- Export
+
+  ```bash
+  pip install -U ultralytics
+  yolo export model=rtdetr-l.pt format=onnx simplify dynamic opset=16
+  ```
+- Download
+
+  [rtdetr-l-f16 model](https://github.com/jamjamjon/assets/releases/download/v0.0.1/rtdetr-l-f16.onnx)
+
+### 2. Specify the ONNX model path in `main.rs`
+
+```Rust
+let options = Options::default()
+    .with_model("ONNX_MODEL")    // <= modify this
+    .with_saveout("RT-DETR");
+```
+
+### 3. Then, run
+
+```bash
+cargo run -r --example rtdetr
+```
+
+## Results
+
+![](./demo.jpg)
diff --git a/examples/rtdetr/demo.jpg b/examples/rtdetr/demo.jpg
new file mode 100644
index 0000000..e0df576
Binary files /dev/null and b/examples/rtdetr/demo.jpg differ
diff --git a/examples/rtdetr/main.rs b/examples/rtdetr/main.rs
new file mode 100644
index 0000000..6f50ced
--- /dev/null
+++ b/examples/rtdetr/main.rs
@@ -0,0 +1,19 @@
+use usls::{models::RTDETR, DataLoader, Options, COCO_NAMES_80};
+
+fn main() -> Result<(), Box<dyn std::error::Error>> {
+    // Configure RT-DETR with the `COCO_NAMES_80` labels and build the model.
+    let opts = Options::default()
+        .with_model("../models/rtdetr-l-f16.onnx")
+        .with_confs(&[0.4, 0.15]) // person: 0.4, others: 0.15
+        .with_names(&COCO_NAMES_80)
+        .with_saveout("RT-DETR");
+    let mut detector = RTDETR::new(&opts)?;
+
+    // Load the demo image and take the first batch from the loader.
+    let mut loader = DataLoader::default().load("./assets/bus.jpg")?;
+    let (images, _paths) = loader.next().unwrap();
+
+    // Run inference on that batch.
+    detector.run(&images)?;
+    Ok(())
+}
diff --git a/examples/yolo-world/README.md b/examples/yolo-world/README.md
new file mode 100644
index 0000000..f3081a2
--- /dev/null
+++ b/examples/yolo-world/README.md
@@ -0,0 +1,58 @@
+## Quick Start
+
+```shell
+cargo run -r --example yolo-world
+```
+
+## Or you can do it manually
+
+### 1. Download or export the ONNX model
+
+- Download
+
+  [yolov8s-world-v2-shoes](https://github.com/jamjamjon/assets/releases/download/v0.0.1/yolov8s-world-v2-shoes.onnx)
+- Or generate your own `yolo-world` model and then Export
+
+  - Installation
+
+  ```shell
+  pip install -U ultralytics
+  ```
+
+  - Generate
+
+  ```python
+  from ultralytics import YOLO
+
+  # Initialize a YOLO-World model
+  model = YOLO('yolov8m-worldv2.pt')
+
+  # Define custom classes
+  model.set_classes(["shoes"])
+
+  # Save the model with the defined offline vocabulary
+  model.save("custom_yolov8m-world-v2.pt")
+  ```
+
+  - Export
+
+  ```shell
+  yolo export model=custom_yolov8m-world-v2.pt format=onnx simplify dynamic
+  ```
+
+### 2. Specify the ONNX model path in `main.rs`
+
+```Rust
+let options = Options::default()
+    .with_model("ONNX_PATH");   // <= modify this
+```
+
+### 3. Then, run
+
+```
+cargo run -r --example yolo-world
+```
+
+## Results
+
+![](./demo.jpg)
diff --git a/examples/yolo-world/demo.jpg b/examples/yolo-world/demo.jpg
new file mode 100644
index 0000000..5242d7d
Binary files /dev/null and b/examples/yolo-world/demo.jpg differ
diff --git a/examples/yolo-world/main.rs b/examples/yolo-world/main.rs
new file mode 100644
index 0000000..ed1bfb5
--- /dev/null
+++ b/examples/yolo-world/main.rs
@@ -0,0 +1,22 @@
+use usls::{models::YOLO, DataLoader, Options};
+
+fn main() -> Result<(), Box<dyn std::error::Error>> {
+    // Configure YOLO-World with dynamic batch/height/width and build the model.
+    let opts = Options::default()
+        .with_model("../models/yolov8s-world-v2-shoes.onnx")
+        .with_i00((1, 1, 4).into())
+        .with_i02((416, 640, 800).into())
+        .with_i03((416, 640, 800).into())
+        .with_confs(&[0.3]) // shoes: 0.3
+        .with_saveout("YOLO-World")
+        .with_profile(false);
+    let mut detector = YOLO::new(&opts)?;
+
+    // Load the demo image and take the first batch from the loader.
+    let mut loader = DataLoader::default().load("./assets/bus.jpg")?;
+    let (images, _paths) = loader.next().unwrap();
+
+    // Run inference on that batch.
+    detector.run(&images)?;
+    Ok(())
+}
diff --git a/examples/yolov8-face/README.md b/examples/yolov8-face/README.md
new file mode 100644
index 0000000..8b741df
--- /dev/null
+++ b/examples/yolov8-face/README.md
@@ -0,0 +1,30 @@
+## Quick Start
+
+```shell
+cargo run -r --example yolov8-face
+```
+
+## Or you can do it manually
+
+### 1. Download the ONNX model
+
+[yolov8-face-dyn-f16](https://github.com/jamjamjon/assets/releases/download/v0.0.1/yolov8-face-dyn-f16.onnx)
+
+### 2. Specify the ONNX model path in `main.rs`
+
+```Rust
+let options = Options::default()
+    .with_model("ONNX_PATH")    // <= modify this
+    .with_profile(false);
+let mut model = YOLO::new(&options)?;
+```
+
+### 3. Then, run
+
+```bash
+cargo run -r --example yolov8-face
+```
+
+## Results
+
+![](./demo.jpg)
diff --git a/examples/yolov8-face/demo.jpg b/examples/yolov8-face/demo.jpg
new file mode 100644
index 0000000..6180994
Binary files /dev/null and b/examples/yolov8-face/demo.jpg differ
diff --git a/examples/yolov8-face/main.rs b/examples/yolov8-face/main.rs
new file mode 100644
index 0000000..e7469c1
--- /dev/null
+++ b/examples/yolov8-face/main.rs
@@ -0,0 +1,22 @@
+use usls::{models::YOLO, DataLoader, Options};
+
+fn main() -> Result<(), Box<dyn std::error::Error>> {
+    // Configure the face model with dynamic batch/height/width and build it.
+    let opts = Options::default()
+        .with_model("../models/yolov8n-face-dyn-f16.onnx")
+        .with_i00((1, 1, 4).into())
+        .with_i02((416, 640, 800).into())
+        .with_i03((416, 640, 800).into())
+        .with_confs(&[0.15])
+        .with_saveout("YOLOv8-Face")
+        .with_profile(false);
+    let mut detector = YOLO::new(&opts)?;
+
+    // Load the demo image and take the first batch from the loader.
+    let mut loader = DataLoader::default().load("./assets/kids.jpg")?;
+    let (images, _paths) = loader.next().unwrap();
+
+    // Run inference on that batch.
+    detector.run(&images)?;
+    Ok(())
+}
diff --git a/examples/yolov8-falldown/README.md b/examples/yolov8-falldown/README.md
new file mode 100644
index 0000000..1cc6699
--- /dev/null
+++ b/examples/yolov8-falldown/README.md
@@ -0,0 +1,30 @@
+## Quick Start
+
+```shell
+cargo run -r --example yolov8-falldown
+```
+
+## Or you can do it manually
+
+### 1. Download the ONNX model
+
+[yolov8-falldown-f16](https://github.com/jamjamjon/assets/releases/download/v0.0.1/yolov8-falldown-f16.onnx)
+
+### 2. Specify the ONNX model path in `main.rs`
+
+```Rust
+let options = Options::default()
+    .with_model("ONNX_PATH")    // <= modify this
+    .with_profile(false);
+let mut model = YOLO::new(&options)?;
+```
+
+### 3. Then, run
+
+```bash
+cargo run -r --example yolov8-falldown
+```
+
+## Results
+
+![](./demo.jpg)
diff --git a/examples/yolov8-falldown/demo.jpg b/examples/yolov8-falldown/demo.jpg
new file mode 100644
index 0000000..2b0a9f5
Binary files /dev/null and b/examples/yolov8-falldown/demo.jpg differ
diff --git a/examples/yolov8-falldown/main.rs b/examples/yolov8-falldown/main.rs
new file mode 100644
index 0000000..3e8e84f
--- /dev/null
+++ b/examples/yolov8-falldown/main.rs
@@ -0,0 +1,19 @@
+use usls::{models::YOLO, DataLoader, Options};
+
+fn main() -> Result<(), Box<dyn std::error::Error>> {
+    // Configure the fall-detection model and build it.
+    let opts = Options::default()
+        .with_model("../models/yolov8-falldown-f16.onnx")
+        .with_confs(&[0.3])
+        .with_saveout("YOLOv8-Falldown")
+        .with_profile(false);
+    let mut detector = YOLO::new(&opts)?;
+
+    // Load the demo image and take the first batch from the loader.
+    let mut loader = DataLoader::default().load("./assets/falldown.jpg")?;
+    let (images, _paths) = loader.next().unwrap();
+
+    // Run inference on that batch.
+    detector.run(&images)?;
+    Ok(())
+}
diff --git a/examples/yolov8-head/README.md b/examples/yolov8-head/README.md
new file mode 100644
index 0000000..2ef3bd7
--- /dev/null
+++ b/examples/yolov8-head/README.md
@@ -0,0 +1,30 @@
+## Quick Start
+
+```shell
+cargo run -r --example yolov8-head
+```
+
+## Or you can do it manually
+
+### 1. Download ONNX Model
+
+[yolov8-head-f16](https://github.com/jamjamjon/assets/releases/download/v0.0.1/yolov8-head-f16.onnx)
+
+### 2. Specify the ONNX model path in `main.rs`
+
+```Rust
+let options = Options::default()
+    .with_model("ONNX_PATH")    // <= modify this
+    .with_profile(false);
+let mut model = YOLO::new(&options)?;
+```
+
+### 3. Then, run
+
+```bash
+cargo run -r --example yolov8-head
+```
+
+## Results
+
+![](./demo.jpg)
diff --git a/examples/yolov8-head/demo.jpg b/examples/yolov8-head/demo.jpg
new file mode 100644
index 0000000..ca5ac88
Binary files /dev/null and b/examples/yolov8-head/demo.jpg differ
diff --git a/examples/yolov8-head/main.rs b/examples/yolov8-head/main.rs
new file mode 100644
index 0000000..d5a750d
--- /dev/null
+++ b/examples/yolov8-head/main.rs
@@ -0,0 +1,19 @@
+use usls::{models::YOLO, DataLoader, Options};
+
+/// Runs the YOLOv8 head-detection example on the bundled sample image.
+fn main() -> Result<(), Box<dyn std::error::Error>> {
+    // Describe the model, then construct it.
+    let config = Options::default()
+        .with_model("../models/yolov8-head-f16.onnx")
+        .with_confs(&[0.3])
+        .with_saveout("YOLOv8-Head")
+        .with_profile(false);
+    let mut detector = YOLO::new(&config)?;
+
+    // Load the sample image and pull the first batch out of the loader.
+    let mut loader = DataLoader::default().load("./assets/kids.jpg")?;
+    let (images, _paths) = loader.next().unwrap();
+
+    // Run the model on that batch.
+    detector.run(&images)?;
+
+    Ok(())
+}
diff --git a/examples/yolov8-trash/README.md b/examples/yolov8-trash/README.md
new file mode 100644
index 0000000..27c8c1c
--- /dev/null
+++ b/examples/yolov8-trash/README.md
@@ -0,0 +1,32 @@
+A model for detecting plastic bags.
+
+## Quick Start
+
+```shell
+cargo run -r --example yolov8-trash
+```
+
+## Or you can do it manually
+
+### 1. Download ONNX Model
+
+[yolov8-plastic-bag-f16](https://github.com/jamjamjon/assets/releases/download/v0.0.1/yolov8-plastic-bag-f16.onnx)
+
+### 2. Specify the ONNX model path in `main.rs`
+
+```Rust
+let options = Options::default()
+    .with_model("ONNX_PATH")    // <= modify this
+    .with_profile(false);
+let mut model = YOLO::new(&options)?;
+```
+
+### 3. Then, run
+
+```bash
+cargo run -r --example yolov8-trash
+```
+
+## Results
+
+![](./demo.jpg)
diff --git a/examples/yolov8-trash/demo.jpg b/examples/yolov8-trash/demo.jpg
new file mode 100644
index 0000000..747b5f2
Binary files /dev/null and b/examples/yolov8-trash/demo.jpg differ
diff --git a/examples/yolov8-trash/main.rs b/examples/yolov8-trash/main.rs
new file mode 100644
index 0000000..a172778
--- /dev/null
+++ b/examples/yolov8-trash/main.rs
@@ -0,0 +1,20 @@
+use usls::{models::YOLO, DataLoader, Options};
+
+/// Runs the YOLOv8 trash (plastic bag) example on the bundled sample image.
+fn main() -> Result<(), Box<dyn std::error::Error>> {
+    // Describe the model (single class named "trash"), then construct it.
+    let config = Options::default()
+        .with_model("../models/yolov8-plastic-bag-f16.onnx")
+        .with_confs(&[0.3])
+        .with_saveout("YOLOv8-Trash")
+        .with_names(&["trash"])
+        .with_profile(false);
+    let mut detector = YOLO::new(&config)?;
+
+    // Load the sample image and pull the first batch out of the loader.
+    let mut loader = DataLoader::default().load("./assets/trash.jpg")?;
+    let (images, _paths) = loader.next().unwrap();
+
+    // Run the model on that batch.
+    detector.run(&images)?;
+
+    Ok(())
+}
diff --git a/examples/yolov8/README.md b/examples/yolov8/README.md
new file mode 100644
index 0000000..8b65881
--- /dev/null
+++ b/examples/yolov8/README.md
@@ -0,0 +1,58 @@
+## Features
+
+- Support `Classification`, `Segmentation`, `Detection`, `Pose(Keypoints)-Detection` tasks.
+- Support `FP16` & `FP32` ONNX models.
+- Support `CoreML`, `CUDA` and `TensorRT` execution provider to accelerate computation.
+- Support dynamic input shapes(`batch`, `width`, `height`).
+- Support dynamic confidence(`DynConf`) for each class in Detection task.
+
+## Quick Start
+
+```shell
+cargo run -r --example yolov8
+```
+
+## Or you can do it manually
+
+### 1. Export `YOLOv8` ONNX Models
+
+```bash
+pip install -U ultralytics
+
+# export onnx model with dynamic shapes
+yolo export model=yolov8m.pt format=onnx simplify dynamic
+yolo export model=yolov8m-cls.pt format=onnx simplify dynamic
+yolo export model=yolov8m-pose.pt format=onnx simplify dynamic
+yolo export model=yolov8m-seg.pt format=onnx simplify dynamic
+
+# export onnx model with fixed shapes
+yolo export model=yolov8m.pt format=onnx simplify
+yolo export model=yolov8m-cls.pt format=onnx simplify
+yolo export model=yolov8m-pose.pt format=onnx simplify
+yolo export model=yolov8m-seg.pt format=onnx simplify
+```
+
+### 2. Specify the ONNX model path in `main.rs`
+
+```Rust
+let options = Options::default()
+    .with_model("ONNX_PATH")   // <= modify this
+    .with_confs(&[0.4, 0.15]) // person: 0.4, others: 0.15
+    .with_saveout("YOLOv8");
+let mut model = YOLO::new(&options)?;
+```
+
+### 3. Then, run
+
+```
+cargo run -r --example yolov8
+```
+
+## Result
+
+|         Task         | Annotated image       |
+| :-------------------: | --------------------- |
+| Instance Segmentation | ![img](./demo-seg.jpg)  |
+|    Classification    | ![img](./demo-cls.jpg)  |
+|       Detection       | ![img](./demo-det.jpg)  |
+|         Pose         | ![img](./demo-pose.jpg) |
diff --git a/examples/yolov8/demo-cls.jpg b/examples/yolov8/demo-cls.jpg
new file mode 100644
index 0000000..55df7eb
Binary files /dev/null and b/examples/yolov8/demo-cls.jpg differ
diff --git a/examples/yolov8/demo-det.jpg b/examples/yolov8/demo-det.jpg
new file mode 100644
index 0000000..35af574
Binary files /dev/null and b/examples/yolov8/demo-det.jpg differ
diff --git a/examples/yolov8/demo-pose.jpg b/examples/yolov8/demo-pose.jpg
new file mode 100644
index 0000000..efdc1fb
Binary files /dev/null and b/examples/yolov8/demo-pose.jpg differ
diff --git a/examples/yolov8/demo-seg.jpg b/examples/yolov8/demo-seg.jpg
new file mode 100644
index 0000000..cd71b0f
Binary files /dev/null and b/examples/yolov8/demo-seg.jpg differ
diff --git a/examples/yolov8/main.rs b/examples/yolov8/main.rs
new file mode 100644
index 0000000..d5df089
--- /dev/null
+++ b/examples/yolov8/main.rs
@@ -0,0 +1,29 @@
+use usls::{models::YOLO, DataLoader, Options, COCO_SKELETON_17};
+
+/// Runs the YOLOv8 example: builds the model with dynamic input ranges,
+/// loads the sample image and runs inference on every batch.
+fn main() -> Result<(), Box<dyn std::error::Error>> {
+    // 1. Describe the model and its runtime settings, then construct it.
+    let config = Options::default()
+        .with_model("../models/yolov8m-dyn-f16.onnx")
+        .with_trt(0) // cuda by default
+        .with_fp16(true)
+        // NOTE(review): the triples below look like (min, opt, max) ranges for
+        // input 0, dims 0/2/3 (batch/height/width) -- confirm against `Options`.
+        .with_i00((1, 1, 4).into())
+        .with_i02((416, 640, 800).into())
+        .with_i03((416, 640, 800).into())
+        .with_confs(&[0.4, 0.15]) // person: 0.4, others: 0.15
+        .with_profile(true)
+        .with_dry_run(5)
+        .with_skeletons(&COCO_SKELETON_17)
+        .with_saveout("YOLOv8");
+    let mut yolo = YOLO::new(&config)?;
+
+    // 2. Build a loader that yields one image per batch.
+    let loader = DataLoader::default()
+        .with_batch(1)
+        .load("./assets/bus.jpg")?;
+
+    // 3. Run inference batch by batch; the outputs are not used further here.
+    for (images, _paths) in loader {
+        yolo.run(&images)?;
+    }
+    Ok(())
+}
diff --git a/examples/yolov9/README.md b/examples/yolov9/README.md
new file mode 100644
index 0000000..5ce2bfb
--- /dev/null
+++ b/examples/yolov9/README.md
@@ -0,0 +1,45 @@
+## Quick Start
+
+```shell
+cargo run -r --example yolov9
+```
+
+## Or you can do it manually
+
+### 1. Download or Export ONNX Model
+
+- **Download**
+
+  [yolov9-c-dyn-fp16](https://github.com/jamjamjon/assets/releases/download/v0.0.1/yolov9-c-dyn-f16.onnx)
+- **Export**
+
+  ```shell
+  # clone repo and install dependencies
+  git clone https://github.com/WongKinYiu/yolov9.git
+  cd yolov9
+  pip install -r requirements.txt
+
+  # download `pt` weights
+  wget https://github.com/WongKinYiu/yolov9/releases/download/v0.1/yolov9-c.pt
+
+  # export ONNX model
+  python export.py --weights yolov9-c.pt --include onnx --simplify --dynamic
+  ```
+
+### 2. Specify the ONNX model path in `main.rs`
+
+```Rust
+let options = Options::default()
+    .with_model("ONNX_PATH")   // <= modify this
+    .with_saveout("YOLOv9");
+```
+
+### 3. Run
+
+```
+cargo run -r --example yolov9
+```
+
+## Results
+
+![](./demo.jpg)
diff --git a/examples/yolov9/demo.jpg b/examples/yolov9/demo.jpg
new file mode 100644
index 0000000..6c9f6b2
Binary files /dev/null and b/examples/yolov9/demo.jpg differ
diff --git a/examples/yolov9/main.rs b/examples/yolov9/main.rs
new file mode 100644
index 0000000..0c6ed85
--- /dev/null
+++ b/examples/yolov9/main.rs
@@ -0,0 +1,22 @@
+use usls::{models::YOLO, DataLoader, Options};
+
+/// Runs the YOLOv9 example on the bundled sample image.
+fn main() -> Result<(), Box<dyn std::error::Error>> {
+    // Describe the model and its dynamic input ranges, then construct it.
+    let config = Options::default()
+        .with_model("../models/yolov9-c-dyn-f16.onnx")
+        .with_i00((1, 1, 4).into())
+        .with_i02((416, 640, 800).into())
+        .with_i03((416, 640, 800).into())
+        .with_confs(&[0.4, 0.15]) // person: 0.4, others: 0.15
+        .with_saveout("YOLOv9")
+        .with_profile(false);
+    let mut detector = YOLO::new(&config)?;
+
+    // Load the sample image and pull the first batch out of the loader.
+    let mut loader = DataLoader::default().load("./assets/bus.jpg")?;
+    let (images, _paths) = loader.next().unwrap();
+
+    // Run the model on that batch.
+    detector.run(&images)?;
+
+    Ok(())
+}
diff --git a/rust-toolchain.toml b/rust-toolchain.toml
new file mode 100644
index 0000000..6d833ff
--- /dev/null
+++ b/rust-toolchain.toml
@@ -0,0 +1,2 @@
+[toolchain]
+channel = "1.75"
diff --git a/src/annotator.rs b/src/annotator.rs
new file mode 100644
index 0000000..4d3337c
--- /dev/null
+++ b/src/annotator.rs
@@ -0,0 +1,228 @@
+use anyhow::Result;
+use image::{ImageBuffer, RgbImage};
+
+use crate::{auto_load, string_now, Results, CHECK_MARK, CROSS_MARK};
+
+/// Draws inference results (masks, class probabilities, boxes, keypoints)
+/// onto an RGB image and saves annotated images to disk.
+#[derive(Debug)]
+pub struct Annotator {
+    // Font used for every text legend drawn by `plot`.
+    font: rusttype::Font<'static>,
+    // Optional pairs of keypoint indices; `plot` joins each pair with a line.
+    skeletons: Option<Vec<(usize, usize)>>,
+    // When true, bbox legends show only the name/id, without the confidence.
+    hide_conf: bool,
+}
+
+impl Default for Annotator {
+    /// Builds an annotator with the default font (`load_font(None)`), no
+    /// skeleton and confidences shown.
+    ///
+    /// Panics if the default font cannot be loaded.
+    fn default() -> Self {
+        let font = Self::load_font(None).unwrap();
+        Self {
+            hide_conf: false,
+            skeletons: None,
+            font,
+        }
+    }
+}
+
+impl Annotator {
+    /// Sets the keypoint index pairs that `plot` connects with line segments.
+    pub fn with_skeletons(self, skeletons: &[(usize, usize)]) -> Self {
+        Self {
+            skeletons: Some(Vec::from(skeletons)),
+            ..self
+        }
+    }
+
+    /// Replaces the legend font with the one read from `path`.
+    ///
+    /// Panics if the font cannot be loaded.
+    pub fn with_font(self, path: &str) -> Self {
+        let font = Self::load_font(Some(path)).unwrap();
+        Self { font, ..self }
+    }
+
+    /// Saves `image` as `runs/<saveout>/<timestamp>.jpg`, creating the
+    /// directory when needed, and prints whether the save succeeded.
+    ///
+    /// Panics if the directory cannot be created or the path is not valid
+    /// UTF-8; a failed image write is only reported, not propagated.
+    pub fn save(&self, image: &RgbImage, saveout: &str) {
+        let dir = std::path::Path::new("runs").join(saveout);
+        if !dir.exists() {
+            std::fs::create_dir_all(&dir).unwrap();
+        }
+        // The file stem comes from `string_now("-")`.
+        let stem = dir.join(string_now("-"));
+        let target = format!("{}.jpg", stem.to_str().unwrap());
+        match image.save(&target) {
+            Ok(_) => println!("{} Annotated image saved at: {}", CHECK_MARK, target),
+            Err(err) => println!("{} Saving failed: {:?}", CROSS_MARK, err),
+        }
+    }
+
+    /// Loads a TrueType font.
+    ///
+    /// With `None`, the font file is resolved through `auto_load("Arial.ttf")`;
+    /// with `Some(path)`, it is read from that path.
+    ///
+    /// Returns an error if the file cannot be resolved or read, or if its
+    /// contents are not a font `rusttype` can parse (previously this last
+    /// case panicked).
+    fn load_font(path: Option<&str>) -> Result<rusttype::Font<'static>> {
+        let path_font = match path {
+            None => auto_load("Arial.ttf")?,
+            Some(p) => p.into(),
+        };
+        let buffer = std::fs::read(&path_font)?;
+        // `try_from_vec` yields `None` on unparsable data; report that as an
+        // error instead of unwrapping.
+        rusttype::Font::try_from_vec(buffer)
+            .ok_or_else(|| anyhow::anyhow!("Failed to parse font file: {:?}", path_font))
+    }
+
+    /// Returns the palette colour for index `n`, wrapping around when `n`
+    /// exceeds the palette size.
+    pub fn get_color(&self, n: usize) -> (u8, u8, u8) {
+        // Build the palette once per call instead of twice.
+        let palette = Self::color_palette();
+        palette[n % palette.len()]
+    }
+
+    /// Draws everything contained in `y` onto `img`, in this order: masks and
+    /// their contours, top-k class probabilities, bounding boxes with legends,
+    /// then keypoints and (if configured) skeleton lines.
+    ///
+    /// Panics if a mask buffer cannot be turned into an image of `img`'s
+    /// size (see the `expect` below).
+    pub fn plot(&self, img: &mut RgbImage, y: &Results) {
+        // masks and polygons
+        if let Some(masks) = y.masks() {
+            for mask in masks.iter() {
+                // Reinterpret the mask as a single-channel image with the
+                // same dimensions as `img`.
+                let mask_nd: ImageBuffer<image::Luma<_>, Vec<u8>> =
+                    ImageBuffer::from_vec(img.width(), img.height(), mask.to_vec())
+                        .expect("can not crate image from ndarray");
+                // masks
+                for _x in 0..img.width() {
+                    for _y in 0..img.height() {
+                        let mask_p = imageproc::drawing::Canvas::get_pixel(&mask_nd, _x, _y);
+                        if mask_p.0[0] > 0 {
+                            // Tint masked pixels: halve red and blue, push
+                            // green towards 255.
+                            let mut img_p = imageproc::drawing::Canvas::get_pixel(img, _x, _y);
+                            img_p.0[0] /= 2;
+                            img_p.0[1] = 255 - (255 - img_p.0[1]) / 3;
+                            img_p.0[2] /= 2;
+                            imageproc::drawing::Canvas::draw_pixel(img, _x, _y, img_p)
+                        }
+                    }
+                }
+                // contours
+                let contours: Vec<imageproc::contours::Contour<i32>> =
+                    imageproc::contours::find_contours(&mask_nd);
+                for contour in contours.iter() {
+                    // Each contour point is drawn as a white dot of radius 1.
+                    for point in contour.points.iter() {
+                        imageproc::drawing::draw_filled_circle_mut(
+                            img,
+                            (point.x, point.y),
+                            1,
+                            image::Rgb([255, 255, 255]),
+                        );
+                    }
+                }
+            }
+        }
+
+        // probs
+        if let Some(probs) = y.probs() {
+            let topk = 5usize;
+            // Legends start at 1/20 of the image size and stack downwards.
+            let (x, mut y) = (img.width() as i32 / 20, img.height() as i32 / 20);
+            for k in probs.topk(topk).iter() {
+                // `k` is (class id, score, optional name); fall back to the id
+                // when no name is available.
+                let legend = format!("{}: {:.2}", k.2.as_ref().unwrap_or(&k.0.to_string()), k.1);
+                let scale = img.width().max(img.height()) as f32 / 30.0;
+                let scale = rusttype::Scale::uniform(scale);
+                let (text_w, text_h) = imageproc::drawing::text_size(scale, &self.font, &legend);
+                y += text_h;
+                imageproc::drawing::draw_filled_rect_mut(
+                    img,
+                    imageproc::rect::Rect::at(x, y).of_size(text_w as u32, text_h as u32),
+                    image::Rgb(self.get_color(k.0).into()),
+                );
+                imageproc::drawing::draw_text_mut(
+                    img,
+                    image::Rgb((0, 0, 0).into()),
+                    x,
+                    y,
+                    scale,
+                    &self.font,
+                    &legend,
+                );
+            }
+        }
+
+        // bboxes
+        if let Some(bboxes) = y.bboxes() {
+            for bbox in bboxes.iter() {
+                // Box outline, coloured by class id.
+                imageproc::drawing::draw_hollow_rect_mut(
+                    img,
+                    imageproc::rect::Rect::at(bbox.xmin() as i32, bbox.ymin() as i32)
+                        .of_size(bbox.width() as u32, bbox.height() as u32),
+                    image::Rgb(self.get_color(bbox.id()).into()),
+                );
+                // Legend text: name (or id) and, unless hidden, the confidence.
+                let legend = if self.hide_conf {
+                    bbox.name().unwrap_or(&bbox.id().to_string()).to_string()
+                } else {
+                    format!(
+                        "{}: {:.4}",
+                        bbox.name().unwrap_or(&bbox.id().to_string()),
+                        bbox.confidence()
+                    )
+                };
+                let scale = img.width().max(img.height()) as f32 / 45.0;
+                let scale = rusttype::Scale::uniform(scale);
+                let (text_w, text_h) = imageproc::drawing::text_size(scale, &self.font, &legend);
+                // Put the legend just above the box when there is room;
+                // otherwise use `text_h - ymin` so the y offset stays
+                // non-negative.
+                let text_y = if bbox.ymin() as i32 > text_h {
+                    bbox.ymin() as i32 - text_h
+                } else {
+                    text_h - bbox.ymin() as i32
+                };
+                imageproc::drawing::draw_filled_rect_mut(
+                    img,
+                    imageproc::rect::Rect::at(bbox.xmin() as i32, text_y)
+                        .of_size(text_w as u32, text_h as u32),
+                    image::Rgb(self.get_color(bbox.id()).into()),
+                );
+                imageproc::drawing::draw_text_mut(
+                    img,
+                    image::Rgb((0, 0, 0).into()),
+                    bbox.xmin() as i32,
+                    text_y,
+                    scale,
+                    &self.font,
+                    &legend,
+                );
+            }
+        }
+
+        // keypoints
+        if let Some(keypoints) = y.keypoints() {
+            let radius = 3;
+            for kpts in keypoints.iter() {
+                for (i, kpt) in kpts.iter().enumerate() {
+                    // Keypoints with a confidence of exactly 0.0 are not drawn.
+                    if kpt.confidence() == 0.0 {
+                        continue;
+                    }
+                    // draw point
+                    imageproc::drawing::draw_filled_circle_mut(
+                        img,
+                        (kpt.x() as i32, kpt.y() as i32),
+                        radius,
+                        image::Rgb(self.get_color(i + 10).into()),
+                    );
+                }
+
+                // draw skeleton
+                if let Some(skeletons) = &self.skeletons {
+                    for &(i, ii) in skeletons.iter() {
+                        // NOTE(review): `kpts[i]` / `kpts[ii]` presumably panic
+                        // when a skeleton index exceeds the number of keypoints
+                        // -- confirm the skeleton matches the model's layout.
+                        let kpt1 = &kpts[i];
+                        let kpt2 = &kpts[ii];
+                        if kpt1.confidence() == 0.0 || kpt2.confidence() == 0.0 {
+                            continue;
+                        }
+                        imageproc::drawing::draw_line_segment_mut(
+                            img,
+                            (kpt1.x(), kpt1.y()),
+                            (kpt2.x(), kpt2.y()),
+                            image::Rgb([255, 51, 255]),
+                        );
+                    }
+                }
+            }
+        }
+    }
+
+    /// Returns the fixed list of 20 RGB colours used for drawing.
+    fn color_palette() -> Vec<(u8, u8, u8)> {
+        const PALETTE: [(u8, u8, u8); 20] = [
+            (0, 255, 0),
+            (255, 128, 0),
+            (0, 0, 255),
+            (255, 153, 51),
+            (255, 0, 0),
+            (255, 51, 255),
+            (102, 178, 255),
+            (51, 153, 255),
+            (255, 51, 51),
+            (153, 255, 153),
+            (102, 255, 102),
+            (153, 204, 255),
+            (255, 153, 153),
+            (255, 178, 102),
+            (230, 230, 0),
+            (255, 153, 255),
+            (255, 102, 255),
+            (255, 102, 102),
+            (51, 255, 51),
+            (255, 255, 255),
+        ];
+        PALETTE.to_vec()
+    }
+}
diff --git a/src/bbox.rs b/src/bbox.rs
new file mode 100644
index 0000000..98db2b6
--- /dev/null
+++ b/src/bbox.rs
@@ -0,0 +1,77 @@
+use crate::Rect;
+
+/// A detected bounding box: geometry plus class id, confidence and an
+/// optional class name.
+#[derive(Clone, PartialEq, Default)]
+pub struct Bbox {
+    // Box geometry; all coordinate accessors delegate to it.
+    rect: Rect,
+    // Class id of the detection.
+    id: usize,
+    // Confidence score of the detection.
+    confidence: f32,
+    // Class name, when one is known.
+    name: Option<String>,
+}
+
+impl std::fmt::Debug for Bbox {
+    /// Formats the box as a flat struct with its corner coordinates instead
+    /// of the nested `rect` field.
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        let rect = &self.rect;
+        let mut out = f.debug_struct("Bbox");
+        out.field("xmin", &rect.xmin());
+        out.field("ymin", &rect.ymin());
+        out.field("xmax", &rect.xmax());
+        out.field("ymax", &rect.ymax());
+        out.field("id", &self.id);
+        out.field("name", &self.name);
+        out.field("confidence", &self.confidence);
+        out.finish()
+    }
+}
+
+impl Bbox {
+    /// Creates a box from its geometry, class id, confidence and optional name.
+    pub fn new(rect: Rect, id: usize, confidence: f32, name: Option<String>) -> Self {
+        Self {
+            rect,
+            id,
+            confidence,
+            name,
+        }
+    }
+
+    /// Width of the underlying rectangle.
+    pub fn width(&self) -> f32 {
+        self.rect.width()
+    }
+
+    /// Height of the underlying rectangle.
+    pub fn height(&self) -> f32 {
+        self.rect.height()
+    }
+
+    /// Minimum x coordinate of the underlying rectangle.
+    pub fn xmin(&self) -> f32 {
+        self.rect.xmin()
+    }
+
+    /// Minimum y coordinate of the underlying rectangle.
+    pub fn ymin(&self) -> f32 {
+        self.rect.ymin()
+    }
+
+    /// Maximum x coordinate of the underlying rectangle.
+    pub fn xmax(&self) -> f32 {
+        self.rect.xmax()
+    }
+
+    /// Maximum y coordinate of the underlying rectangle.
+    pub fn ymax(&self) -> f32 {
+        self.rect.ymax()
+    }
+
+    /// Class id of the detection.
+    pub fn id(&self) -> usize {
+        self.id
+    }
+
+    /// Class name of the detection, if one was provided.
+    pub fn name(&self) -> Option<&String> {
+        self.name.as_ref()
+    }
+
+    /// Confidence score of the detection.
+    pub fn confidence(&self) -> f32 {
+        self.confidence
+    }
+
+    /// Area of the underlying rectangle.
+    pub fn area(&self) -> f32 {
+        self.rect.area()
+    }
+
+    /// Intersection-over-union with `other`.
+    ///
+    /// Returns `0.0` when the union is not positive (e.g. two degenerate,
+    /// zero-area boxes) instead of dividing by zero and yielding NaN.
+    pub fn iou(&self, other: &Bbox) -> f32 {
+        let union = self.rect.union(&other.rect);
+        if union > 0.0 {
+            self.rect.intersect(&other.rect) / union
+        } else {
+            0.0
+        }
+    }
+}
diff --git a/src/dataloader.rs b/src/dataloader.rs
new file mode 100644
index 0000000..e0a769a
--- /dev/null
+++ b/src/dataloader.rs
@@ -0,0 +1,120 @@
+use crate::{CHECK_MARK, CROSS_MARK, SAFE_CROSS_MARK};
+use anyhow::Result;
+use image::DynamicImage;
+use std::collections::VecDeque;
+use std::path::{Path, PathBuf};
+use walkdir::{DirEntry, WalkDir};
+
+/// Iterates over images found at a source path, yielding them in batches
+/// together with their file paths.
+#[derive(Debug, Clone)]
+pub struct DataLoader {
+    // source could be single image, folder with images (TODO: video, stream)
+    pub source: PathBuf,
+    // Number of successfully decoded images that ends a batch.
+    pub batch: usize,
+    // Whether `load` keeps files nested deeper than the source directory itself.
+    pub recursive: bool,
+    // Files still waiting to be read; consumed from the front by `next`.
+    pub paths: VecDeque<PathBuf>,
+}
+
+impl Iterator for DataLoader {
+    type Item = (Vec<DynamicImage>, Vec<PathBuf>);
+
+    /// Returns the next batch of decoded images with their paths, or `None`
+    /// once no paths are left.
+    ///
+    /// Files that cannot be opened or decoded are reported on stdout and
+    /// skipped, so a batch may hold fewer than `batch` images (possibly none)
+    /// when the remaining files run out.
+    fn next(&mut self) -> Option<Self::Item> {
+        if self.paths.is_empty() {
+            return None;
+        }
+        let mut yis: Vec<DynamicImage> = Vec::new();
+        let mut yps: Vec<PathBuf> = Vec::new();
+        while let Some(path) = self.paths.pop_front() {
+            // Report the path that actually failed. Indexing `self.paths[0]`
+            // after the pop named the *next* file and panicked when the queue
+            // had just become empty.
+            match image::io::Reader::open(&path) {
+                Err(err) => {
+                    println!(
+                        "{SAFE_CROSS_MARK} Faild to load image: {:?} -> {:?}",
+                        path, err
+                    );
+                }
+                Ok(p) => match p.decode() {
+                    Err(err) => {
+                        println!(
+                            "{SAFE_CROSS_MARK} Fail to load image: {:?} -> {:?}",
+                            path, err
+                        );
+                    }
+                    Ok(x) => {
+                        yis.push(x);
+                        yps.push(path);
+                    }
+                },
+            }
+            if yis.len() == self.batch {
+                break;
+            }
+        }
+        Some((yis, yps))
+    }
+}
+
+impl Default for DataLoader {
+    /// An empty loader: batch size 1, non-recursive, no source and no paths.
+    fn default() -> Self {
+        Self {
+            source: PathBuf::new(),
+            paths: VecDeque::new(),
+            recursive: false,
+            batch: 1,
+        }
+    }
+}
+
+impl DataLoader {
+    /// Collects the files at `source` and returns a new loader over them,
+    /// keeping this loader's `batch` and `recursive` settings.
+    ///
+    /// * A file path yields exactly that file.
+    /// * A directory yields its non-hidden files; files in sub-directories are
+    ///   included only when `recursive` is set.
+    ///
+    /// Returns an error (instead of panicking, as before) when the path does
+    /// not exist, is neither a file nor a directory, or a directory entry
+    /// cannot be read.
+    pub fn load<P: AsRef<Path>>(&self, source: P) -> Result<Self> {
+        let source = source.as_ref();
+        let mut paths = VecDeque::new();
+
+        if source.is_file() {
+            paths.push_back(source.to_path_buf());
+        } else if source.is_dir() {
+            // Without `recursive`, only direct children (depth 1) are wanted,
+            // so do not descend further at all.
+            let walker = if self.recursive {
+                WalkDir::new(source)
+            } else {
+                WalkDir::new(source).max_depth(1)
+            };
+            for entry in walker
+                .into_iter()
+                .filter_entry(|e| !Self::_is_hidden(e))
+            {
+                let entry = entry?;
+                if entry.file_type().is_dir() {
+                    continue;
+                }
+                paths.push_back(entry.path().to_path_buf());
+            }
+        } else if !source.exists() {
+            anyhow::bail!("{} File not found: {:?}", CROSS_MARK, source);
+        } else {
+            // TODO: streams (rtsp/rtmp/http) and other source kinds.
+            anyhow::bail!("{} Unsupported source: {:?}", CROSS_MARK, source);
+        }
+        println!("{CHECK_MARK} {} files found\n", &paths.len());
+        Ok(Self {
+            paths,
+            source: source.into(),
+            batch: self.batch,
+            recursive: self.recursive,
+        })
+    }
+
+    /// Sets the number of images per batch.
+    pub fn with_batch(self, x: usize) -> Self {
+        Self { batch: x, ..self }
+    }
+
+    /// Sets whether `load` also collects files from sub-directories.
+    pub fn with_recursive(self, x: bool) -> Self {
+        Self {
+            recursive: x,
+            ..self
+        }
+    }
+
+    /// Returns true for entries below the walk root whose file name starts
+    /// with a dot.
+    ///
+    /// The root itself (depth 0) is never treated as hidden: a source such as
+    /// `.` or an explicitly requested dot-directory would otherwise be pruned
+    /// by `filter_entry`, leaving no files at all.
+    fn _is_hidden(entry: &DirEntry) -> bool {
+        entry.depth() > 0
+            && entry
+                .file_name()
+                .to_str()
+                .map(|s| s.starts_with('.'))
+                .unwrap_or(false)
+    }
+}
diff --git a/src/device.rs b/src/device.rs
new file mode 100644
index 0000000..3181bd4
--- /dev/null
+++ b/src/device.rs
@@ -0,0 +1,13 @@
+/// Execution backend selector.
+// NOTE(review): the `usize` payload is presumably the device index -- confirm
+// against the code that consumes `Device`.
+#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)]
+pub enum Device {
+    Cpu(usize),
+    Cuda(usize),
+    Trt(usize),
+    CoreML(usize),
+    Cann(usize),
+    // Not enabled yet:
+    // Acl(usize),
+    // Rocm(usize),
+    // Rknpu(usize),
+    // Openvino(usize),
+    // Onednn(usize),
+}
diff --git a/src/dynconf.rs b/src/dynconf.rs
new file mode 100644
index 0000000..8dccdf0
--- /dev/null
+++ b/src/dynconf.rs
@@ -0,0 +1,54 @@
+use std::ops::Index;
+
+/// A list of confidence thresholds, indexable by position (`conf[i]`).
+#[derive(Clone, PartialEq, PartialOrd)]
+pub struct DynConf {
+    // One threshold per index; `new` pads or truncates it to the requested length.
+    confs: Vec<f32>,
+}
+
+impl Default for DynConf {
+    /// A single threshold of `0.4`.
+    fn default() -> Self {
+        let confs = vec![0.4f32];
+        Self { confs }
+    }
+}
+
+impl std::fmt::Debug for DynConf {
+    /// Formats as an unnamed struct with a single `DynConf` field holding the
+    /// thresholds.
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        let mut out = f.debug_struct("");
+        out.field("DynConf", &self.confs);
+        out.finish()
+    }
+}
+
+impl std::fmt::Display for DynConf {
+    /// Formats the thresholds as a debug-style list, e.g. `[0.4, 0.15]`.
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        let mut list = f.debug_list();
+        list.entries(&self.confs);
+        list.finish()
+    }
+}
+
+impl Index<usize> for DynConf {
+    type Output = f32;
+
+    /// Returns the threshold at position `i`; panics if `i` is out of range.
+    fn index(&self, i: usize) -> &Self::Output {
+        let thresholds: &[f32] = &self.confs;
+        &thresholds[i]
+    }
+}
+
+impl DynConf {
+    /// Builds exactly `n` thresholds from `confs`.
+    ///
+    /// Extra values beyond `n` are dropped; when fewer than `n` are given, the
+    /// last one is repeated to fill the remainder.
+    ///
+    /// Panics if `confs` is empty while `n` is non-zero.
+    pub fn new(confs: &[f32], n: usize) -> Self {
+        if n != 0 && confs.is_empty() {
+            panic!("Error: No value found in confs")
+        }
+        let mut thresholds = confs.to_vec();
+        // `resize` both truncates (len >= n) and pads with the last value
+        // (len < n). With no last value the list is empty and n is 0 here.
+        if let Some(&fill) = confs.last() {
+            thresholds.resize(n, fill);
+        }
+
+        Self { confs: thresholds }
+    }
+}
diff --git a/src/embedding.rs b/src/embedding.rs
new file mode 100644
index 0000000..714272f
--- /dev/null
+++ b/src/embedding.rs
@@ -0,0 +1,51 @@
+use ndarray::{Array, Axis, IxDyn};
+
+/// A dynamic-dimension `f32` array with optional per-index names.
+#[derive(Clone, PartialEq, Default)]
+pub struct Embedding {
+    // Raw values; `topk` ranks them by their flat iteration index.
+    data: Array<f32, IxDyn>,
+    // Optional names, looked up by the same index as `data` in `topk`.
+    names: Option<Vec<String>>,
+}
+
+impl std::fmt::Debug for Embedding {
+    /// Shows only the five highest-scoring entries rather than the full data.
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        let top5 = self.topk(5);
+        f.debug_struct("").field("Top5", &top5).finish()
+    }
+}
+
+impl Embedding {
+    /// Wraps `data` together with optional per-index `names`.
+    pub fn new(data: Array<f32, IxDyn>, names: Option<Vec<String>>) -> Self {
+        Self { data, names }
+    }
+
+    /// Borrows the underlying array.
+    pub fn data(&self) -> &Array<f32, IxDyn> {
+        &self.data
+    }
+
+    /// Returns up to `k` entries with the highest values, best first, as
+    /// `(index, value, name)`.
+    ///
+    /// Values are ordered with `f32::total_cmp`, so NaN no longer causes a
+    /// panic (a positive NaN sorts ahead of every number). The name is `None`
+    /// when no names were given or the index has no matching name, instead of
+    /// panicking on a too-short name list.
+    pub fn topk(&self, k: usize) -> Vec<(usize, f32, Option<String>)> {
+        let mut probs: Vec<(usize, f32)> = self.data.iter().copied().enumerate().collect();
+        probs.sort_by(|a, b| b.1.total_cmp(&a.1));
+        probs
+            .into_iter()
+            .take(k)
+            .map(|(id, confidence)| {
+                let name = self
+                    .names
+                    .as_ref()
+                    .and_then(|names| names.get(id).cloned());
+                (id, confidence, name)
+            })
+            .collect()
+    }
+
+    /// Returns the data divided by the square root of the sum of squares
+    /// taken along axis 0.
+    pub fn norm(&self) -> Array<f32, IxDyn> {
+        let l2 = self.data.mapv(|x| x * x).sum_axis(Axis(0)).mapv(f32::sqrt);
+        self.data.clone() / l2
+    }
+
+    /// Returns the single highest-valued entry.
+    ///
+    /// Panics if the embedding holds no values.
+    pub fn top1(&self) -> (usize, f32, Option<String>) {
+        self.topk(1)
+            .into_iter()
+            .next()
+            .expect("Embedding::top1 called on empty data")
+    }
+}
diff --git a/src/engine.rs b/src/engine.rs
new file mode 100644
index 0000000..59f5d6f
--- /dev/null
+++ b/src/engine.rs
@@ -0,0 +1,370 @@
+use anyhow::Result;
+use half::f16;
+use ndarray::{Array, IxDyn};
+use ort::{
+    ExecutionProvider, ExecutionProviderDispatch, Session, SessionBuilder, TensorElementType,
+    TensorRTExecutionProvider, ValueType,
+};
+
+use crate::{config_dir, Device, MinOptMax, Options, CHECK_MARK, CROSS_MARK, SAFE_CROSS_MARK};
+
+/// An ONNX Runtime session plus the input/output metadata read from the model.
+#[derive(Debug)]
+pub struct OrtEngine {
+    // Session built from `Options::onnx_path` after execution-provider setup.
+    session: Session,
+    // Copy of the device requested in the options.
+    device: Device,
+    // Per input, per dimension: the min/opt/max size resolved in `new`.
+    inputs_minoptmax: Vec<Vec<MinOptMax>>,
+    // Input names, shapes and element types as reported by the session.
+    // A shape entry of -1 is treated as a dynamic dimension by `_set_ixx`.
+    inames: Vec<String>,
+    ishapes: Vec<Vec<isize>>,
+    idtypes: Vec<TensorElementType>,
+    // Output names, shapes and element types as reported by the session.
+    onames: Vec<String>,
+    oshapes: Vec<Vec<isize>>,
+    odtypes: Vec<TensorElementType>,
+    // When true, `run` prints per-stage timings.
+    profile: bool,
+    // Number of warm-up inferences performed by `dry_run` (0 skips it).
+    num_dry_run: usize,
+}
+
+impl OrtEngine {
+    /// Warms the session up by running it `num_dry_run` times on all-ones
+    /// inputs shaped from each dimension's `opt` size.
+    ///
+    /// Prints a notice and returns immediately when the count is zero.
+    pub fn dry_run(&self) -> Result<()> {
+        let rounds = self.num_dry_run;
+        if rounds == 0 {
+            println!("{SAFE_CROSS_MARK} No dry run count specified, skipping the dry run.");
+            return Ok(());
+        }
+
+        // One dummy tensor per model input.
+        let dummies: Vec<Array<f32, IxDyn>> = self
+            .inputs_minoptmax
+            .iter()
+            .map(|dims| {
+                let shape: Vec<usize> = dims.iter().map(|d| d.opt as usize).collect();
+                Array::ones(shape).into_dyn()
+            })
+            .collect();
+
+        for _ in 0..rounds {
+            self.run(&dummies)?;
+        }
+        println!("{CHECK_MARK} Dry run x{}", self.num_dry_run);
+        Ok(())
+    }
+
+    /// Builds an engine for the model at `config.onnx_path`.
+    ///
+    /// The model is loaded twice: first with a plain session to read input and
+    /// output names, shapes and element types, then again after the execution
+    /// provider selected by `config.device` has been registered.
+    ///
+    /// Every input dimension is resolved to a `MinOptMax`. Static dimensions
+    /// use their own size for all three values. Dynamic dimensions (`-1`) take
+    /// the matching `config.iXY` option, which exists only for the first four
+    /// inputs and the first six dimensions of each.
+    ///
+    /// # Errors
+    /// Returns an error when the session cannot be built, or when a dynamic
+    /// dimension lies outside the configurable range.
+    ///
+    /// # Panics
+    /// Panics (via `_set_ixx`) when a configurable dynamic dimension has no
+    /// option set, and on device variants not handled below.
+    pub fn new(config: &Options) -> Result<Self> {
+        ort::init().commit()?;
+        let session = Session::builder()?.with_model_from_file(&config.onnx_path)?;
+
+        // inputs
+        let mut ishapes = Vec::new();
+        let mut idtypes = Vec::new();
+        let mut inames = Vec::new();
+        for x in session.inputs.iter() {
+            inames.push(x.name.to_owned());
+            if let ValueType::Tensor { ty, dimensions } = &x.input_type {
+                ishapes.push(dimensions.iter().map(|x| *x as isize).collect::<Vec<_>>());
+                idtypes.push(*ty);
+            } else {
+                // Non-tensor inputs are recorded as one dynamic f32 dimension.
+                ishapes.push(vec![-1_isize]);
+                idtypes.push(ort::TensorElementType::Float32);
+            }
+        }
+        // outputs
+        let mut oshapes = Vec::new();
+        let mut odtypes = Vec::new();
+        let mut onames = Vec::new();
+        for x in session.outputs.iter() {
+            onames.push(x.name.to_owned());
+            if let ValueType::Tensor { ty, dimensions } = &x.output_type {
+                oshapes.push(dimensions.iter().map(|x| *x as isize).collect::<Vec<_>>());
+                odtypes.push(*ty);
+            } else {
+                oshapes.push(vec![-1_isize]);
+                odtypes.push(ort::TensorElementType::Float32);
+            }
+        }
+        let mut inputs_minoptmax: Vec<Vec<MinOptMax>> = Vec::new();
+        for (i, dims) in ishapes.iter().enumerate() {
+            let mut v_: Vec<MinOptMax> = Vec::new();
+            for (ii, &x) in dims.iter().enumerate() {
+                let x_default: MinOptMax = (ishapes[i][ii], ishapes[i][ii], ishapes[i][ii]).into();
+                let x: MinOptMax = match (i, ii) {
+                    (0, 0) => Self::_set_ixx(x, &config.i00, i, ii).unwrap_or(x_default),
+                    (0, 1) => Self::_set_ixx(x, &config.i01, i, ii).unwrap_or(x_default),
+                    (0, 2) => Self::_set_ixx(x, &config.i02, i, ii).unwrap_or(x_default),
+                    (0, 3) => Self::_set_ixx(x, &config.i03, i, ii).unwrap_or(x_default),
+                    (0, 4) => Self::_set_ixx(x, &config.i04, i, ii).unwrap_or(x_default),
+                    (0, 5) => Self::_set_ixx(x, &config.i05, i, ii).unwrap_or(x_default),
+                    (1, 0) => Self::_set_ixx(x, &config.i10, i, ii).unwrap_or(x_default),
+                    (1, 1) => Self::_set_ixx(x, &config.i11, i, ii).unwrap_or(x_default),
+                    (1, 2) => Self::_set_ixx(x, &config.i12, i, ii).unwrap_or(x_default),
+                    (1, 3) => Self::_set_ixx(x, &config.i13, i, ii).unwrap_or(x_default),
+                    (1, 4) => Self::_set_ixx(x, &config.i14, i, ii).unwrap_or(x_default),
+                    (1, 5) => Self::_set_ixx(x, &config.i15, i, ii).unwrap_or(x_default),
+                    (2, 0) => Self::_set_ixx(x, &config.i20, i, ii).unwrap_or(x_default),
+                    (2, 1) => Self::_set_ixx(x, &config.i21, i, ii).unwrap_or(x_default),
+                    (2, 2) => Self::_set_ixx(x, &config.i22, i, ii).unwrap_or(x_default),
+                    (2, 3) => Self::_set_ixx(x, &config.i23, i, ii).unwrap_or(x_default),
+                    (2, 4) => Self::_set_ixx(x, &config.i24, i, ii).unwrap_or(x_default),
+                    (2, 5) => Self::_set_ixx(x, &config.i25, i, ii).unwrap_or(x_default),
+                    (3, 0) => Self::_set_ixx(x, &config.i30, i, ii).unwrap_or(x_default),
+                    (3, 1) => Self::_set_ixx(x, &config.i31, i, ii).unwrap_or(x_default),
+                    (3, 2) => Self::_set_ixx(x, &config.i32_, i, ii).unwrap_or(x_default),
+                    (3, 3) => Self::_set_ixx(x, &config.i33, i, ii).unwrap_or(x_default),
+                    (3, 4) => Self::_set_ixx(x, &config.i34, i, ii).unwrap_or(x_default),
+                    (3, 5) => Self::_set_ixx(x, &config.i35, i, ii).unwrap_or(x_default),
+                    // No option exists for this slot: a dynamic size cannot be
+                    // resolved, but a static one needs no configuration.
+                    _ if x == -1 => anyhow::bail!(
+                        "{CROSS_MARK} Dynamic dimension cannot be configured: the {}-th input, the {}-th dimension (options cover only the first 4 inputs and first 6 dimensions).",
+                        i + 1,
+                        ii + 1
+                    ),
+                    _ => x_default,
+                };
+                v_.push(x);
+            }
+            inputs_minoptmax.push(v_);
+        }
+
+        // build again, this time with the requested execution provider registered
+        let builder = Session::builder()?;
+        let device = config.device.to_owned();
+        let _ep = match device {
+            Device::Trt(device_id) => Self::build_trt(
+                &inames,
+                &inputs_minoptmax,
+                &builder,
+                device_id,
+                config.trt_int8_enable,
+                config.trt_fp16_enable,
+                config.trt_engine_cache_enable,
+            )?,
+            Device::Cuda(device_id) => Self::build_cuda(&builder, device_id)?,
+            Device::CoreML(_) => {
+                let coreml = ort::CoreMLExecutionProvider::default()
+                    .with_subgraphs()
+                    // .with_ane_only()
+                    .build();
+                if coreml.is_available()? && coreml.register(&builder).is_ok() {
+                    println!("{CHECK_MARK} Using CoreML");
+                    coreml
+                } else {
+                    println!("{CROSS_MARK} CoreML initialization failed");
+                    println!("{CHECK_MARK} Using CPU");
+                    ort::CPUExecutionProvider::default().build()
+                }
+            }
+            Device::Cpu(_) => {
+                println!("{CHECK_MARK} Using CPU");
+                ort::CPUExecutionProvider::default().build()
+            }
+            _ => todo!(),
+        };
+        let session = builder
+            .with_optimization_level(ort::GraphOptimizationLevel::Level3)?
+            .with_model_from_file(&config.onnx_path)?;
+
+        Ok(Self {
+            session,
+            device,
+            inputs_minoptmax,
+            inames,
+            ishapes,
+            idtypes,
+            onames,
+            oshapes,
+            odtypes,
+            profile: config.profile,
+            num_dry_run: config.num_dry_run,
+        })
+    }
+
+    /// Configures and registers the TensorRT execution provider on `builder`,
+    /// falling back to `build_cuda` when TensorRT is unavailable or fails to
+    /// register.
+    ///
+    /// The min/opt/max profile strings have the form
+    /// `name:AxBxC,other:DxE`, one entry per model input.
+    fn build_trt(
+        inames: &[String],
+        inputs_minoptmax: &[Vec<MinOptMax>],
+        builder: &SessionBuilder,
+        device_id: usize,
+        int8_enable: bool,
+        fp16_enable: bool,
+        engine_cache_enable: bool,
+    ) -> Result<ExecutionProviderDispatch> {
+        // Renders one profile string, taking the size chosen by `pick` for
+        // every dimension of every input.
+        let render = |pick: fn(&MinOptMax) -> isize| -> String {
+            let mut spec = String::new();
+            for (i, name) in inames.iter().enumerate() {
+                if i != 0 {
+                    spec.push(',');
+                }
+                let mut entry = format!("{}:", name);
+                for d in inputs_minoptmax[i].iter() {
+                    entry.push_str(&format!("{}x", pick(d)));
+                }
+                // drop the trailing separator
+                entry.pop();
+                spec.push_str(&entry);
+            }
+            spec
+        };
+        let spec_min = render(|d| d.min);
+        let spec_opt = render(|d| d.opt);
+        let spec_max = render(|d| d.max);
+
+        let cache_path = format!(
+            "{}/{}",
+            config_dir().to_str().unwrap(),
+            "trt-cache"
+        );
+        let trt = TensorRTExecutionProvider::default()
+            .with_device_id(device_id as i32)
+            .with_int8(int8_enable)
+            .with_fp16(fp16_enable)
+            .with_engine_cache(engine_cache_enable)
+            .with_engine_cache_path(cache_path)
+            .with_timing_cache(false)
+            .with_profile_min_shapes(spec_min)
+            .with_profile_opt_shapes(spec_opt)
+            .with_profile_max_shapes(spec_max)
+            .build();
+
+        let usable = trt.is_available()? && trt.register(builder).is_ok();
+        if !usable {
+            println!("{CROSS_MARK} TensorRT initialization failed. Try CUDA...");
+            return Self::build_cuda(builder, device_id);
+        }
+        println!("{CHECK_MARK} Using TensorRT");
+        Ok(trt)
+    }
+
+    /// Registers the CUDA execution provider for `device_id` on `builder`.
+    /// When CUDA is unavailable or registration fails, a CPU provider is
+    /// returned instead.
+    fn build_cuda(builder: &SessionBuilder, device_id: usize) -> Result<ExecutionProviderDispatch> {
+        let cuda = ort::CUDAExecutionProvider::default()
+            .with_device_id(device_id as i32)
+            .build();
+        let registered = cuda.is_available()? && cuda.register(builder).is_ok();
+        if registered {
+            println!("{CHECK_MARK} Using CUDA");
+            return Ok(cuda);
+        }
+        println!("{CROSS_MARK} CUDA initialization failed");
+        println!("{CHECK_MARK} Using CPU");
+        Ok(ort::CPUExecutionProvider::default().build())
+    }
+
+    /// Runs one inference.
+    ///
+    /// Each array in `xs` is paired with the model input at the same position
+    /// and cast to that input's element type (f32, f16, i32 or i64). Outputs of
+    /// type f32 or f16 are returned as f32 arrays in model output order.
+    ///
+    /// # Errors
+    /// Returns an error when the session fails, or when an input or output has
+    /// an element type other than those listed above.
+    pub fn run(&self, xs: &[Array<f32, IxDyn>]) -> Result<Vec<Array<f32, IxDyn>>> {
+        // input
+        let mut xs_ = Vec::new();
+        let t_pre = std::time::Instant::now();
+        for (idtype, x) in self.idtypes.iter().zip(xs.iter()) {
+            let x_ = match idtype {
+                TensorElementType::Float32 => ort::Value::from_array(x.view())?,
+                TensorElementType::Float16 => ort::Value::from_array(x.mapv(f16::from_f32).view())?,
+                TensorElementType::Int32 => ort::Value::from_array(x.mapv(|x_| x_ as i32).view())?,
+                TensorElementType::Int64 => ort::Value::from_array(x.mapv(|x_| x_ as i64).view())?,
+                // Report the unsupported type to the caller instead of panicking.
+                other => anyhow::bail!("Unsupported input tensor element type: {:?}", other),
+            };
+            xs_.push(x_);
+        }
+        let t_pre = t_pre.elapsed();
+
+        // inference
+        let t_run = std::time::Instant::now();
+        let ys = self.session.run(xs_.as_ref())?;
+        let t_run = t_run.elapsed();
+
+        // output
+        let mut ys_ = Vec::new();
+        let t_post = std::time::Instant::now();
+        for ((_, y), dtype) in ys.iter().zip(self.odtypes.iter()) {
+            let y_ = match dtype {
+                TensorElementType::Float32 => y.extract_tensor::<f32>()?.view().to_owned(),
+                TensorElementType::Float16 => y.extract_tensor::<f16>()?.view().mapv(f16::to_f32),
+                other => anyhow::bail!("Unsupported output tensor element type: {:?}", other),
+            };
+            ys_.push(y_);
+        }
+        let t_post = t_post.elapsed();
+        if self.profile {
+            println!(
+                "[Profile] batch: {:?} => {:.4?} (i: {t_pre:.4?}, run: {t_run:.4?}, o: {t_post:.4?})",
+                self.batch().opt,
+                t_pre + t_run + t_post
+            );
+        }
+        Ok(ys_)
+    }
+
+    /// Resolves the min/opt/max for one input dimension.
+    ///
+    /// A static size `x` yields `(x, x, x)`. A dynamic size (`-1`) yields a
+    /// copy of the user-supplied `ixx`.
+    ///
+    /// # Panics
+    /// Panics when the dimension is dynamic and `ixx` is `None`; `i` and `ii`
+    /// are the zero-based input and dimension indices used in the message.
+    pub fn _set_ixx(x: isize, ixx: &Option<MinOptMax>, i: usize, ii: usize) -> Option<MinOptMax> {
+        if x != -1 {
+            // customized, but not dynamic
+            return Some((x, x, x).into());
+        }
+        match ixx {
+            Some(custom) => Some(custom.clone()), // customized
+            None => panic!(
+                "{CROSS_MARK} Using dynamic shapes in inputs without specifying it: the {}-th input, the {}-th dimension.",
+                i + 1,
+                ii + 1
+            ),
+        }
+    }
+
+    /// Output shapes, one `Vec` per model output.
+    pub fn oshapes(&self) -> &Vec<Vec<isize>> {
+        &self.oshapes
+    }
+
+    /// Output names in model order.
+    pub fn onames(&self) -> &Vec<String> {
+        &self.onames
+    }
+
+    /// Output element types in model order.
+    pub fn odtypes(&self) -> &Vec<ort::TensorElementType> {
+        &self.odtypes
+    }
+
+    /// Input shapes, one `Vec` per model input.
+    pub fn ishapes(&self) -> &Vec<Vec<isize>> {
+        &self.ishapes
+    }
+
+    /// Input names in model order.
+    pub fn inames(&self) -> &Vec<String> {
+        &self.inames
+    }
+
+    /// Input element types in model order.
+    pub fn idtypes(&self) -> &Vec<ort::TensorElementType> {
+        &self.idtypes
+    }
+
+    /// The device stored at construction time.
+    pub fn device(&self) -> &Device {
+        &self.device
+    }
+
+    /// Resolved min/opt/max sizes, indexed as `[input][dimension]`.
+    pub fn inputs_minoptmax(&self) -> &Vec<Vec<MinOptMax>> {
+        &self.inputs_minoptmax
+    }
+
+    /// Min/opt/max of dimension 0 of the first input.
+    /// Panics if the model has no inputs or the first input has no dimensions.
+    pub fn batch(&self) -> &MinOptMax {
+        &self.inputs_minoptmax[0][0]
+    }
+
+    /// Min/opt/max of dimension 2 of the first input.
+    /// Panics if the first input has fewer than three dimensions.
+    pub fn height(&self) -> &MinOptMax {
+        &self.inputs_minoptmax[0][2]
+    }
+
+    /// Min/opt/max of dimension 3 of the first input.
+    /// Panics if the first input has fewer than four dimensions.
+    pub fn width(&self) -> &MinOptMax {
+        // Dimension 3, not 2: dimension 2 is the one `height()` returns, so
+        // reading it here would report the height as the width.
+        &self.inputs_minoptmax[0][3]
+    }
+
+    /// True when dimension 0 of the first input is declared as `-1` in the model.
+    pub fn is_batch_dyn(&self) -> bool {
+        self.ishapes[0][0] == -1
+    }
+
+    /// Looks up `key` in the model's custom metadata.
+    /// Returns `None` when the metadata cannot be read, the lookup fails, or
+    /// the key is absent.
+    pub fn try_fetch(&self, key: &str) -> Option<String> {
+        let metadata = self.session.metadata().ok()?;
+        metadata.custom(key).ok().flatten()
+    }
+
+    /// Borrows the underlying ONNX Runtime session.
+    pub fn session(&self) -> &Session {
+        &self.session
+    }
+
+    /// The model's `version` custom-metadata entry, if present.
+    pub fn version(&self) -> Option<String> {
+        self.try_fetch("version")
+    }
+}
diff --git a/src/keypoint.rs b/src/keypoint.rs
new file mode 100644
index 0000000..06e386d
--- /dev/null
+++ b/src/keypoint.rs
@@ -0,0 +1,35 @@
+use crate::Point;
+
+/// A point paired with a confidence value.
+#[derive(PartialEq, Clone, Default)]
+pub struct Keypoint {
+    // Location of the keypoint; public, unlike `confidence`.
+    pub point: Point,
+    // Confidence value; read through `confidence()`.
+    confidence: f32,
+}
+
+impl std::fmt::Debug for Keypoint {
+    /// Prints `x`, `y` and `confidence` as flat fields of a `Keypoint` struct,
+    /// without the nested point wrapper.
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        let mut out = f.debug_struct("Keypoint");
+        out.field("x", &self.point.x);
+        out.field("y", &self.point.y);
+        out.field("confidence", &self.confidence);
+        out.finish()
+    }
+}
+
+impl Keypoint {
+    /// Creates a keypoint from a location and its confidence.
+    pub fn new(point: Point, confidence: f32) -> Self {
+        Self { point, confidence }
+    }
+
+    /// The point's `x` coordinate.
+    pub fn x(&self) -> f32 {
+        self.point.x
+    }
+
+    /// The point's `y` coordinate.
+    pub fn y(&self) -> f32 {
+        self.point.y
+    }
+
+    /// The stored confidence value.
+    pub fn confidence(&self) -> f32 {
+        self.confidence
+    }
+}
diff --git a/src/lib.rs b/src/lib.rs
new file mode 100644
index 0000000..741f29b
--- /dev/null
+++ b/src/lib.rs
@@ -0,0 +1,43 @@
+mod annotator;
+mod bbox;
+mod dataloader;
+mod device;
+mod dynconf;
+mod embedding;
+mod engine;
+mod keypoint;
+mod min_opt_max;
+pub mod models;
+pub mod ops;
+mod options;
+mod point;
+mod rect;
+mod results;
+mod rotated_rect;
+mod tokenizer_stream;
+mod utils;
+
+pub use annotator::Annotator;
+pub use bbox::Bbox;
+pub use dataloader::DataLoader;
+pub use device::Device;
+pub use dynconf::DynConf;
+pub use embedding::Embedding;
+pub use engine::OrtEngine;
+pub use keypoint::Keypoint;
+pub use min_opt_max::MinOptMax;
+pub use options::Options;
+pub use point::Point;
+pub use rect::Rect;
+pub use results::Results;
+pub use rotated_rect::RotatedRect;
+pub use tokenizer_stream::TokenizerStream;
+pub use utils::{
+    auto_load, config_dir, download, non_max_suppression, string_now, COCO_NAMES_80,
+    COCO_SKELETON_17,
+};
+
+// Release URL for downloadable assets.
+// NOTE(review): its consumers are outside this view — presumably the download helpers; confirm.
+const GITHUB_ASSETS: &str = "https://github.com/jamjamjon/assets/releases/download/v0.0.1";
+// Status glyphs prefixed to console messages (success, failure, skipped).
+const CHECK_MARK: &str = "✅";
+const CROSS_MARK: &str = "❌";
+const SAFE_CROSS_MARK: &str = "❎";
diff --git a/src/min_opt_max.rs b/src/min_opt_max.rs
new file mode 100644
index 0000000..b79d0c3
--- /dev/null
+++ b/src/min_opt_max.rs
@@ -0,0 +1,42 @@
+/// Minimum, optimal and maximum size of a single tensor dimension.
+#[derive(Debug, Clone)]
+pub struct MinOptMax {
+    pub min: isize,
+    pub opt: isize,
+    pub max: isize,
+}
+
+impl Default for MinOptMax {
+    /// All three sizes default to `-1`.
+    fn default() -> Self {
+        Self::new(-1)
+    }
+}
+
+impl From<(isize, isize, isize)> for MinOptMax {
+    /// Builds from `(min, opt, max)`, adjusting the bounds so that
+    /// `min <= opt <= max` always holds: `min` is lowered to `opt` and `max`
+    /// is raised to `opt` when they would otherwise cross it.
+    fn from(bounds: (isize, isize, isize)) -> Self {
+        let (lower, opt, upper) = bounds;
+        Self {
+            min: std::cmp::min(lower, opt),
+            opt,
+            max: std::cmp::max(upper, opt),
+        }
+    }
+}
+
+impl From<[isize; 3]> for MinOptMax {
+    /// Same as the tuple conversion, reading `[min, opt, max]`.
+    fn from(bounds: [isize; 3]) -> Self {
+        let [lower, opt, upper] = bounds;
+        Self::from((lower, opt, upper))
+    }
+}
+
+impl MinOptMax {
+    /// A fixed dimension: min, opt and max are all `opt`.
+    pub fn new(opt: isize) -> Self {
+        Self {
+            min: opt,
+            opt,
+            max: opt,
+        }
+    }
+}
diff --git a/src/models/blip.rs b/src/models/blip.rs
new file mode 100644
index 0000000..f814d47
--- /dev/null
+++ b/src/models/blip.rs
@@ -0,0 +1,135 @@
+use anyhow::Result;
+use image::DynamicImage;
+use ndarray::{s, Array, Axis, IxDyn};
+use std::io::Write;
+use tokenizers::Tokenizer;
+
+use crate::{auto_load, ops, MinOptMax, Options, OrtEngine, TokenizerStream};
+
+/// Image-captioning model made of a visual engine and a textual engine.
+#[derive(Debug)]
+pub struct Blip {
+    // Text engine; `caption` runs it once per generated token.
+    pub textual: OrtEngine,
+    // Image engine used by `encode_images`.
+    pub visual: OrtEngine,
+    // Taken from the visual engine's `height()` / `width()` at construction.
+    pub height: MinOptMax,
+    pub width: MinOptMax,
+    // Taken from each engine's `batch()` at construction.
+    pub batch_visual: MinOptMax,
+    pub batch_textual: MinOptMax,
+    // Tokenizer wrapper used to encode prompts and stream decoded tokens.
+    tokenizer: TokenizerStream,
+}
+
+impl Blip {
+    /// Builds both engines, loads the tokenizer and performs the dry runs.
+    ///
+    /// The tokenizer file is `options_textual.tokenizer` when set, otherwise
+    /// `tokenizer-blip.json` resolved through `auto_load`.
+    ///
+    /// # Errors
+    /// Returns an error when an engine cannot be built, the tokenizer file
+    /// cannot be located or parsed, or a dry run fails.
+    pub fn new(options_visual: Options, options_textual: Options) -> Result<Self> {
+        let visual = OrtEngine::new(&options_visual)?;
+        let textual = OrtEngine::new(&options_textual)?;
+        let (batch_visual, batch_textual, height, width) = (
+            visual.batch().to_owned(),
+            textual.batch().to_owned(),
+            visual.height().to_owned(),
+            visual.width().to_owned(),
+        );
+        let tokenizer = match &options_textual.tokenizer {
+            None => auto_load("tokenizer-blip.json")?,
+            Some(tokenizer) => tokenizer.into(),
+        };
+        // Report a bad tokenizer file as an error instead of panicking.
+        let tokenizer = Tokenizer::from_file(tokenizer)
+            .map_err(|err| anyhow::anyhow!("Failed to load BLIP tokenizer: {err}"))?;
+        let tokenizer = TokenizerStream::new(tokenizer);
+        visual.dry_run()?;
+        textual.dry_run()?;
+        Ok(Self {
+            textual,
+            visual,
+            batch_visual,
+            batch_textual,
+            height,
+            width,
+            tokenizer,
+        })
+    }
+
+    /// Resizes `xs` to the model's optimal height/width, runs the visual
+    /// engine and returns a copy of its first output.
+    pub fn encode_images(&self, xs: &[DynamicImage]) -> Result<Array<f32, IxDyn>> {
+        let target_h = self.height.opt as u32;
+        let target_w = self.width.opt as u32;
+        let resized = ops::resize(xs, target_h, target_w, true)?;
+        let outputs: Vec<Array<f32, IxDyn>> = self.visual.run(&[resized])?;
+        Ok(outputs[0].to_owned())
+    }
+
+    /// Generates a caption for the image at `path` and streams it to stdout.
+    ///
+    /// With `prompt` set, generation continues from the tokenized prompt;
+    /// otherwise it starts from the single token id `0`. Tokens are chosen
+    /// greedily until the SEP token (id 102) is produced.
+    ///
+    /// # Errors
+    /// Returns an error when the image cannot be read or decoded, the prompt
+    /// cannot be tokenized, an engine run fails, or writing to stdout fails.
+    pub fn caption(&mut self, path: &str, prompt: Option<&str>) -> Result<()> {
+        // this demo use batch_size=1
+        let x = image::io::Reader::open(path)?.decode()?;
+        let image_embeds = self.encode_images(&[x])?;
+        let image_embeds_attn_mask: Array<f32, IxDyn> =
+            Array::ones((1, image_embeds.shape()[1])).into_dyn();
+
+        // conditional
+        let mut input_ids = match prompt {
+            None => {
+                print!("[Unconditional image captioning]: ");
+                vec![0.0f32]
+            }
+
+            Some(prompt) => {
+                // Report tokenizer failures as an error instead of panicking.
+                let encodings = self
+                    .tokenizer
+                    .tokenizer()
+                    .encode(prompt, false)
+                    .map_err(|err| anyhow::anyhow!("Failed to tokenize prompt: {err}"))?;
+                let ids: Vec<f32> = encodings.get_ids().iter().map(|x| *x as f32).collect();
+                print!("[Conditional image captioning]: {} ", prompt);
+                ids
+            }
+        };
+        loop {
+            let input_ids_nd: Array<f32, IxDyn> = Array::from_vec(input_ids.to_owned()).into_dyn();
+            let input_ids_nd = input_ids_nd.insert_axis(Axis(0));
+            let input_ids_attn_mask: Array<f32, IxDyn> =
+                Array::ones(input_ids_nd.shape()).into_dyn();
+            let y = self.textual.run(&[
+                input_ids_nd,
+                input_ids_attn_mask,
+                image_embeds.to_owned(),
+                image_embeds_attn_mask.to_owned(),
+            ])?; // N, length, vocab_size
+
+            // Scores of the last position of the first batch item: [1, vocab_size].
+            let logits = y[0].slice(s!(0, -1.., ..));
+
+            // argmax taken directly on the scores: softmax is monotonic, so it
+            // picks the same token, and skipping `exp` avoids overflow to
+            // inf/NaN on large scores.
+            let (token_id, _) = logits
+                .iter()
+                .enumerate()
+                .reduce(|max, x| if x.1 > max.1 { x } else { max })
+                .ok_or_else(|| anyhow::anyhow!("Text decoder returned empty logits"))?;
+            input_ids.push(token_id as f32);
+
+            // SEP
+            if token_id == 102 {
+                break;
+            }
+
+            // streaming generation
+            if let Some(t) = self.tokenizer.next_token(token_id as u32)? {
+                print!("{t}");
+                std::io::stdout().flush()?;
+            }
+            // sleep for test
+            std::thread::sleep(std::time::Duration::from_millis(10));
+        }
+        println!();
+        self.tokenizer.clear();
+        Ok(())
+    }
+
+    /// Optimal batch size of the visual engine.
+    pub fn batch_visual(&self) -> usize {
+        self.batch_visual.opt as usize
+    }
+
+    /// Optimal batch size of the textual engine.
+    pub fn batch_textual(&self) -> usize {
+        self.batch_textual.opt as usize
+    }
+}
diff --git a/src/models/clip.rs b/src/models/clip.rs
new file mode 100644
index 0000000..8972b30
--- /dev/null
+++ b/src/models/clip.rs
@@ -0,0 +1,105 @@
+use crate::{auto_load, ops, MinOptMax, Options, OrtEngine};
+use anyhow::Result;
+use image::DynamicImage;
+use itertools::Itertools;
+use ndarray::{Array, Array2, Axis, IxDyn};
+use tokenizers::{PaddingDirection, PaddingParams, PaddingStrategy, Tokenizer};
+
+/// Image/text embedding model made of a visual engine and a textual engine.
+#[derive(Debug)]
+pub struct Clip {
+    // Text engine used by `encode_texts`.
+    pub textual: OrtEngine,
+    // Image engine used by `encode_images`.
+    pub visual: OrtEngine,
+    // Dimensions 2 and 3 of the visual engine's first input.
+    pub height: MinOptMax,
+    pub width: MinOptMax,
+    // Dimension 0 of each engine's first input.
+    pub batch_visual: MinOptMax,
+    pub batch_textual: MinOptMax,
+    // Tokenizer configured to pad every encoding to `context_length`.
+    tokenizer: Tokenizer,
+    // Fixed token count per text; set to 77 in `new`.
+    context_length: usize,
+}
+
+impl Clip {
+    /// Builds both engines, loads the tokenizer with fixed right-padding to
+    /// 77 tokens, and performs the dry runs.
+    ///
+    /// The tokenizer file is `options_textual.tokenizer` when set, otherwise
+    /// `tokenizer-clip.json` resolved through `auto_load`.
+    ///
+    /// # Errors
+    /// Returns an error when an engine cannot be built, the tokenizer file
+    /// cannot be located or parsed, or a dry run fails.
+    pub fn new(options_visual: Options, options_textual: Options) -> Result<Self> {
+        let context_length = 77;
+        let visual = OrtEngine::new(&options_visual)?;
+        let textual = OrtEngine::new(&options_textual)?;
+        let (batch_visual, batch_textual, height, width) = (
+            visual.inputs_minoptmax()[0][0].to_owned(),
+            textual.inputs_minoptmax()[0][0].to_owned(),
+            visual.inputs_minoptmax()[0][2].to_owned(),
+            visual.inputs_minoptmax()[0][3].to_owned(),
+        );
+        // Propagate lookup and parse failures instead of panicking.
+        let tokenizer = match &options_textual.tokenizer {
+            None => auto_load("tokenizer-clip.json")?,
+            Some(tokenizer) => tokenizer.into(),
+        };
+        let mut tokenizer = Tokenizer::from_file(tokenizer)
+            .map_err(|err| anyhow::anyhow!("Failed to load CLIP tokenizer: {err}"))?;
+        tokenizer.with_padding(Some(PaddingParams {
+            strategy: PaddingStrategy::Fixed(context_length),
+            direction: PaddingDirection::Right,
+            pad_to_multiple_of: None,
+            pad_id: 0,
+            pad_type_id: 0,
+            pad_token: "[PAD]".to_string(),
+        }));
+
+        visual.dry_run()?;
+        textual.dry_run()?;
+
+        Ok(Self {
+            textual,
+            visual,
+            batch_visual,
+            batch_textual,
+            height,
+            width,
+            tokenizer,
+            context_length,
+        })
+    }
+
+    /// Resizes `xs` to the model's optimal height/width, runs the visual
+    /// engine and returns a copy of its first output.
+    pub fn encode_images(&self, xs: &[DynamicImage]) -> Result<Array<f32, IxDyn>> {
+        let target_h = self.height.opt as u32;
+        let target_w = self.width.opt as u32;
+        let resized = ops::resize(xs, target_h, target_w, true)?;
+        let outputs: Vec<Array<f32, IxDyn>> = self.visual.run(&[resized])?;
+        Ok(outputs[0].to_owned())
+    }
+
+    /// Tokenizes `texts`, runs the textual engine on the resulting
+    /// `[texts.len(), context_length]` id matrix and returns a copy of its
+    /// first output.
+    ///
+    /// # Errors
+    /// Returns an error when tokenization fails, when the token ids do not
+    /// fill a `[texts.len(), context_length]` matrix exactly, or when the
+    /// engine run fails.
+    pub fn encode_texts(&self, texts: &[String]) -> Result<Array<f32, IxDyn>> {
+        // Report tokenizer failures as an error instead of panicking.
+        let encodings = self
+            .tokenizer
+            .encode_batch(texts.to_owned(), false)
+            .map_err(|err| anyhow::anyhow!("Failed to tokenize texts: {err}"))?;
+        let xs: Vec<f32> = encodings
+            .iter()
+            .map(|i| i.get_ids().iter().map(|b| *b as f32).collect())
+            .concat();
+        let xs = Array2::from_shape_vec((texts.len(), self.context_length), xs)?.into_dyn();
+        let ys = self.textual.run(&[xs])?;
+        let ys = ys[0].to_owned();
+        // let ys = ops::norm(&ys);
+        Ok(ys)
+    }
+
+    /// Computes a row-wise softmax over the dot products between image and
+    /// text features.
+    ///
+    /// Both inputs must be two-dimensional. The result has one row per image
+    /// feature and one column per text feature.
+    pub fn get_similarity(
+        &self,
+        images_feats: &Array<f32, IxDyn>,
+        texts_feats: &Array<f32, IxDyn>,
+    ) -> Result<Vec<Vec<f32>>> {
+        let images = images_feats.view().into_dimensionality::<ndarray::Ix2>()?;
+        let texts = texts_feats.view().into_dimensionality::<ndarray::Ix2>()?;
+
+        // [M, N] dot products
+        let scores = images.dot(&texts.t());
+        let exps = scores.mapv(f32::exp);
+        // per-row sums reshaped to [M, 1] so the division broadcasts over columns
+        let row_sums = exps.sum_axis(Axis(1)).insert_axis(Axis(1));
+        let probs = exps / row_sums;
+
+        let mut similarity: Vec<Vec<f32>> = Vec::new();
+        for row in probs.axis_iter(Axis(0)) {
+            similarity.push(row.to_vec());
+        }
+        Ok(similarity)
+    }
+
+    /// Optimal batch size of the visual engine.
+    pub fn batch_visual(&self) -> usize {
+        self.batch_visual.opt as usize
+    }
+
+    /// Optimal batch size of the textual engine.
+    pub fn batch_textual(&self) -> usize {
+        self.batch_textual.opt as usize
+    }
+}
diff --git a/src/models/dinov2.rs b/src/models/dinov2.rs
new file mode 100644
index 0000000..ae8721d
--- /dev/null
+++ b/src/models/dinov2.rs
@@ -0,0 +1,39 @@
+use crate::{ops, MinOptMax, Options, OrtEngine};
+use anyhow::Result;
+use image::DynamicImage;
+use ndarray::{Array, IxDyn};
+
+/// Image feature extractor backed by a single engine.
+#[derive(Debug)]
+pub struct Dinov2 {
+    engine: OrtEngine,
+    // Dimensions 2, 3 and 0 of the engine's first input, respectively.
+    pub height: MinOptMax,
+    pub width: MinOptMax,
+    pub batch: MinOptMax,
+}
+
+impl Dinov2 {
+    /// Builds the engine, records batch/height/width from dimensions 0, 2 and
+    /// 3 of its first input, and performs the dry run.
+    pub fn new(options: &Options) -> Result<Self> {
+        let engine = OrtEngine::new(options)?;
+        let first_input = &engine.inputs_minoptmax()[0];
+        let batch = first_input[0].to_owned();
+        let height = first_input[2].to_owned();
+        let width = first_input[3].to_owned();
+        engine.dry_run()?;
+
+        Ok(Self {
+            engine,
+            height,
+            width,
+            batch,
+        })
+    }
+
+    /// Resizes `xs` to the model's optimal height/width, runs the engine and
+    /// returns its first output after `ops::norm`.
+    pub fn run(&mut self, xs: &[DynamicImage]) -> Result<Array<f32, IxDyn>> {
+        let target_h = self.height.opt as u32;
+        let target_w = self.width.opt as u32;
+        let resized = ops::resize(xs, target_h, target_w, true)?;
+        let outputs: Vec<Array<f32, IxDyn>> = self.engine.run(&[resized])?;
+        let features = outputs[0].to_owned();
+        Ok(ops::norm(&features))
+    }
+}
diff --git a/src/models/mod.rs b/src/models/mod.rs
new file mode 100644
index 0000000..9dc0d3f
--- /dev/null
+++ b/src/models/mod.rs
@@ -0,0 +1,11 @@
+mod blip;
+mod clip;
+mod dinov2;
+mod rtdetr;
+mod yolo;
+
+pub use blip::Blip;
+pub use clip::Clip;
+pub use dinov2::Dinov2;
+pub use rtdetr::RTDETR;
+pub use yolo::YOLO;
diff --git a/src/models/rtdetr.rs b/src/models/rtdetr.rs
new file mode 100644
index 0000000..a908299
--- /dev/null
+++ b/src/models/rtdetr.rs
@@ -0,0 +1,154 @@
+use anyhow::Result;
+use image::DynamicImage;
+use ndarray::{s, Array, Axis, IxDyn};
+use regex::Regex;
+
+use crate::{ops, Annotator, Bbox, DynConf, MinOptMax, Options, OrtEngine, Rect, Results};
+
+/// Object detector backed by a single engine.
+#[derive(Debug)]
+pub struct RTDETR {
+    engine: OrtEngine,
+    // Dimensions 2, 3 and 0 of the engine's first input, respectively.
+    height: MinOptMax,
+    width: MinOptMax,
+    batch: MinOptMax,
+    // Draws and saves results when `saveout` is set.
+    annotator: Annotator,
+    // Per-class confidence thresholds, indexed by class id in `postprocess`.
+    confs: DynConf,
+    // When set, `run` passes it to the annotator's `save` for each annotated image.
+    saveout: Option<String>,
+    // Number of class scores read after the four box values.
+    nc: usize,
+    // Optional class names, indexed by class id.
+    names: Option<Vec<String>>,
+}
+
+impl RTDETR {
+    /// Builds the engine and resolves class names and class count.
+    ///
+    /// Class names come from `options.names` when set, otherwise from the
+    /// quoted strings found in the model's `names` metadata entry. The class
+    /// count is `options.nc` when set, otherwise the number of names.
+    ///
+    /// # Errors
+    /// Returns an error when the engine cannot be built, when neither a class
+    /// count nor class names are available, or when the dry run fails.
+    pub fn new(options: &Options) -> Result<Self> {
+        let engine = OrtEngine::new(options)?;
+        let (batch, height, width) = (
+            engine.inputs_minoptmax()[0][0].to_owned(),
+            engine.inputs_minoptmax()[0][2].to_owned(),
+            engine.inputs_minoptmax()[0][3].to_owned(),
+        );
+        let names: Option<_> = match &options.names {
+            None => engine.try_fetch("names").map(|names| {
+                let re = Regex::new(r#"(['"])([-()\w '"]+)(['"])"#).unwrap();
+                let mut names_ = vec![];
+                for (_, [_, name, _]) in re.captures_iter(&names).map(|x| x.extract()) {
+                    names_.push(name.to_string());
+                }
+                names_
+            }),
+            Some(names) => Some(names.to_owned()),
+        };
+        // An explicit `nc` wins. The names are consulted only when it is
+        // missing, so supplying `nc` without names no longer fails.
+        let nc = match (options.nc, names.as_ref()) {
+            (Some(nc), _) => nc,
+            (None, Some(names)) => names.len(),
+            (None, None) => {
+                anyhow::bail!("Failed to get num_classes, make it explicit with `--nc`")
+            }
+        };
+        let annotator = Annotator::default();
+        let confs = DynConf::new(&options.confs, nc);
+        engine.dry_run()?;
+
+        Ok(Self {
+            engine,
+            confs,
+            nc,
+            height,
+            width,
+            batch,
+            saveout: options.saveout.to_owned(),
+            annotator,
+            names,
+        })
+    }
+
+    /// Letterboxes `xs` to the model's optimal size, runs the engine and
+    /// post-processes the output into one `Results` per image.
+    ///
+    /// When `saveout` is set, each image is also annotated with its results
+    /// and saved through the annotator.
+    pub fn run(&mut self, xs: &[DynamicImage]) -> Result<Vec<Results>> {
+        let letterboxed = ops::letterbox(xs, self.height() as u32, self.width() as u32)?;
+        let raw = self.engine.run(&[letterboxed])?;
+        let ys = self.postprocess(raw, xs)?;
+        if let Some(saveout) = &self.saveout {
+            for (img0, y) in xs.iter().zip(ys.iter()) {
+                let mut img = img0.to_rgb8();
+                self.annotator.plot(&mut img, y);
+                self.annotator.save(&img, saveout);
+            }
+        }
+        Ok(ys)
+    }
+
+    /// Converts the first engine output into bounding boxes, one `Results`
+    /// per entry along its first axis.
+    ///
+    /// Each prediction is read as four box values (centre x, centre y, width,
+    /// height) followed by `nc` class scores. The best-scoring class is kept
+    /// when its score reaches that class's threshold, and the box is scaled
+    /// to the matching image in `xs0` using the letterbox ratio.
+    pub fn postprocess(
+        &self,
+        xs: Vec<Array<f32, IxDyn>>,
+        xs0: &[DynamicImage],
+    ) -> Result<Vec<Results>> {
+        const CXYWH_OFFSET: usize = 4; // cxcywh
+        let preds = &xs[0];
+        let input_w = self.width() as f32;
+        let input_h = self.height() as f32;
+
+        let mut ys = Vec::new();
+        for (idx, anchor) in preds.axis_iter(Axis(0)).enumerate() {
+            // [bs, num_query, 4 + nc]
+            let width_original = xs0[idx].width() as f32;
+            let height_original = xs0[idx].height() as f32;
+            let ratio = (input_w / width_original).min(input_h / height_original);
+
+            // collect the boxes that pass the confidence filter
+            let y_bboxes = anchor
+                .axis_iter(Axis(0))
+                .filter_map(|pred| {
+                    let bbox = pred.slice(s![0..CXYWH_OFFSET]);
+                    let clss = pred.slice(s![CXYWH_OFFSET..CXYWH_OFFSET + self.nc]);
+
+                    // best class and its score
+                    let (id, &confidence) = clss
+                        .into_iter()
+                        .enumerate()
+                        .reduce(|max, x| if x.1 > max.1 { x } else { max })
+                        .unwrap();
+                    if confidence < self.confs[id] {
+                        return None;
+                    }
+
+                    // centre/size -> top-left/size, input scale -> original scale
+                    let x = (bbox[0] - bbox[2] / 2.) * input_w / ratio;
+                    let y = (bbox[1] - bbox[3] / 2.) * input_h / ratio;
+                    let w = bbox[2] * input_w / ratio;
+                    let h = bbox[3] * input_h / ratio;
+                    let rect = Rect::from_xywh(
+                        x.max(0.0f32).min(width_original),
+                        y.max(0.0f32).min(height_original),
+                        w,
+                        h,
+                    );
+                    let name = self.names.as_ref().map(|names| names[id].clone());
+                    Some(Bbox::new(rect, id, confidence, name))
+                })
+                .collect::<Vec<_>>();
+
+            ys.push(Results {
+                probs: None,
+                bboxes: Some(y_bboxes),
+                keypoints: None,
+                masks: None,
+            });
+        }
+        Ok(ys)
+    }
+
+    /// Optimal batch size.
+    pub fn batch(&self) -> isize {
+        self.batch.opt
+    }
+
+    /// Optimal input width.
+    pub fn width(&self) -> isize {
+        self.width.opt
+    }
+
+    /// Optimal input height.
+    pub fn height(&self) -> isize {
+        self.height.opt
+    }
+}
diff --git a/src/models/yolo.rs b/src/models/yolo.rs
new file mode 100644
index 0000000..e783bda
--- /dev/null
+++ b/src/models/yolo.rs
@@ -0,0 +1,387 @@
+use anyhow::Result;
+use clap::ValueEnum;
+use image::{DynamicImage, ImageBuffer};
+use ndarray::{s, Array, Axis, IxDyn};
+use regex::Regex;
+
+use crate::{
+    non_max_suppression, ops, Annotator, Bbox, DynConf, Embedding, Keypoint, MinOptMax, Options,
+    OrtEngine, Point, Rect, Results,
+};
+
+const CXYWH_OFFSET: usize = 4; // leading values of a prediction that hold the box: cx, cy, w, h
+const KPT_STEP: usize = 3; // values stored per keypoint: x, y and its confidence
+
+#[derive(Debug, Clone, ValueEnum)]
+enum YOLOTask {
+    Classify, // first output is wrapped into one `Embedding` per image
+    Detect, // bounding boxes only
+    Pose, // bounding boxes plus `nk` keypoints per box
+    Segment, // bounding boxes plus mask coefficients combined with the proto output
+    Obb, // TODO: never selected by `YOLO::new` yet
+}
+
+#[derive(Debug)]
+pub struct YOLO {
+    engine: OrtEngine, // inference engine built from the `Options`
+    nc: usize, // number of classes: `Options::nc`, else the length of the class-name list
+    nk: usize, // number of keypoints read from the `kpt_shape` metadata, 0 when absent
+    nm: usize, // number of mask coefficients per prediction (`Segment` only), otherwise 0
+    height: MinOptMax, // input height taken from the engine
+    width: MinOptMax, // input width taken from the engine
+    batch: MinOptMax, // batch size taken from the engine
+    task: YOLOTask, // task read from the `task` metadata, `Detect` when absent
+    confs: DynConf, // confidence thresholds, indexed by class id
+    kconfs: DynConf, // confidence thresholds, indexed by keypoint index
+    iou: f32, // threshold handed to `non_max_suppression`
+    saveout: Option<String>, // when set, annotated images are saved; otherwise results are printed
+    annotator: Annotator, // draws and saves the results
+    names: Option<Vec<String>>, // class names, indexed by class id
+    apply_nms: bool, // whether `postprocess` runs `non_max_suppression`
+    anchors_first: bool, // iterate predictions along axis 0 of each image's output (else axis 1)
+}
+
+impl YOLO {
+    pub fn new(options: &Options) -> Result<Self> { // engine first, then task/classes/keypoints
+        let engine = OrtEngine::new(options)?;
+        let (batch, height, width) = (
+            engine.batch().to_owned(),
+            engine.height().to_owned(),
+            engine.width().to_owned(),
+        );
+        let task = match engine
+            .try_fetch("task")
+            .unwrap_or("detect".to_string()) // no `task` entry: fall back to detection
+            .as_str()
+        {
+            "classify" => YOLOTask::Classify,
+            "detect" => YOLOTask::Detect,
+            "pose" => YOLOTask::Pose,
+            "segment" => YOLOTask::Segment,
+            x => todo!("{:?} is not supported for now!", x), // any other task panics here
+        };
+
+        // class names: `options.names` first, otherwise the `names` entry of the model metadata
+        let mut names = options.names.to_owned().or(Self::fetch_names(&engine));
+        let nc = match options.nc {
+            Some(nc) => {
+                match &names {
+                    None => names = Some((0..nc).map(|x| x.to_string()).collect::<Vec<String>>()),
+                    Some(names) => {
+                        assert_eq!(
+                            nc,
+                            names.len(),
+                            "the length of `nc` and `class names` is not equal."
+                        );
+                    }
+                }
+                nc
+            }
+            None => match &names {
+                Some(names) => names.len(),
+                None => panic!(
+                    "Can not parse model without `nc` and `class names`. Try to make it explicit."
+                ),
+            },
+        };
+
+        // number of keypoints: first number of the `kpt_shape` metadata entry, 0 when it is absent
+        let nk = engine
+            .try_fetch("kpt_shape")
+            .map(|kpt_string| {
+                let re = Regex::new(r"([0-9]+), ([0-9]+)").unwrap();
+                let caps = re.captures(&kpt_string).unwrap(); // panics if the entry does not match
+                caps.get(1).unwrap().as_str().parse::<usize>().unwrap()
+            })
+            .unwrap_or(0_usize);
+        let nm = if let YOLOTask::Segment = task {
+            engine.oshapes()[1][1] as usize // dimension 1 of the second output
+        } else {
+            0_usize
+        };
+        let confs = DynConf::new(&options.confs, nc);
+        let kconfs = DynConf::new(&options.kconfs, nk);
+        let mut annotator = Annotator::default();
+        if let Some(skeletons) = &options.skeletons {
+            annotator = annotator.with_skeletons(skeletons);
+        }
+        let saveout = options.saveout.to_owned();
+        engine.dry_run()?;
+
+        Ok(Self {
+            engine,
+            confs,
+            kconfs,
+            iou: options.iou,
+            apply_nms: options.apply_nms,
+            nc,
+            nk,
+            nm,
+            height,
+            width,
+            batch,
+            task,
+            saveout,
+            annotator,
+            names,
+            anchors_first: options.anchors_first,
+        })
+    }
+
+    // pub fn run_with_dl(&mut self, dl: &Dataloader) -> Result<Vec<Results>> {
+    //     for (images, paths) in dataloader {
+    //         self.run(&images)
+    //     }
+    //     Ok(())
+    // }
+
+    /// Letterboxes `xs`, runs the engine, decodes the outputs, then saves or prints the results.
+    pub fn run(&mut self, xs: &[DynamicImage]) -> Result<Vec<Results>> {
+        let inputs = ops::letterbox(xs, self.height() as u32, self.width() as u32)?;
+        let outputs = self.engine.run(&[inputs])?;
+        let ys = self.postprocess(outputs, xs)?;
+        if let Some(saveout) = &self.saveout {
+            for (img0, y) in xs.iter().zip(ys.iter()) {
+                let mut img = img0.to_rgb8();
+                self.annotator.plot(&mut img, y);
+                self.annotator.save(&img, saveout);
+            }
+        } else {
+            println!("{ys:?}");
+        }
+        Ok(ys)
+    }
+
+    pub fn postprocess(
+        &self,
+        xs: Vec<Array<f32, IxDyn>>, // raw engine outputs
+        xs0: &[DynamicImage], // original images, used to map results back to their size
+    ) -> Result<Vec<Results>> {
+        if let YOLOTask::Classify = self.task {
+            let mut ys = Vec::new();
+            for batch in xs[0].axis_iter(Axis(0)) {
+                ys.push(Results::new(
+                    Some(Embedding::new(batch.into_owned(), self.names.to_owned())),
+                    None,
+                    None,
+                    None,
+                ));
+            }
+            Ok(ys)
+        } else {
+            let (preds, protos) = if xs.len() == 2 {
+                if xs[0].ndim() == 3 { // the 3-D output holds the predictions, the other the protos
+                    (&xs[0], Some(&xs[1]))
+                } else {
+                    (&xs[1], Some(&xs[0]))
+                }
+            } else {
+                (&xs[0], None)
+            };
+
+            let mut ys = Vec::new();
+            for (idx, anchor) in preds.axis_iter(Axis(0)).enumerate() {
+                // preds: [b, 4 + nc + nm, na], or [b, na, 4 + nc + nm] with `anchors_first`
+                // original image size and the ratio between model input and original image
+                let width_original = xs0[idx].width() as f32;
+                let height_original = xs0[idx].height() as f32;
+                let ratio = (self.width() as f32 / width_original)
+                    .min(self.height() as f32 / height_original);
+
+                #[allow(clippy::type_complexity)]
+                let mut data: Vec<(Bbox, Option<Vec<Keypoint>>, Option<Vec<f32>>)> = Vec::new();
+                for pred in anchor.axis_iter(if self.anchors_first { Axis(0) } else { Axis(1) }) {
+                    // split one prediction: box first, class scores next, task extras at the end
+                    let bbox = pred.slice(s![0..CXYWH_OFFSET]);
+                    let clss = pred.slice(s![CXYWH_OFFSET..CXYWH_OFFSET + self.nc]);
+                    let kpts = {
+                        if let YOLOTask::Pose = self.task {
+                            Some(pred.slice(s![pred.len() - KPT_STEP * self.nk..]))
+                        } else {
+                            None
+                        }
+                    };
+                    let coefs = {
+                        if let YOLOTask::Segment = self.task {
+                            Some(pred.slice(s![pred.len() - self.nm..]).to_vec())
+                        } else {
+                            None
+                        }
+                    };
+
+                    // best class: index and score of the maximum (first one wins on ties)
+                    let (id, &confidence) = clss
+                        .into_iter()
+                        .enumerate()
+                        .reduce(|max, x| if x.1 > max.1 { x } else { max })
+                        .unwrap();
+
+                    // skip predictions below the threshold of their class
+                    if confidence < self.confs[id] {
+                        continue;
+                    }
+
+                    // scale the box back to the original image: (cx, cy, w, h) -> top-left x, y
+                    let cx = bbox[0] / ratio;
+                    let cy = bbox[1] / ratio;
+                    let w = bbox[2] / ratio;
+                    let h = bbox[3] / ratio;
+                    let x = cx - w / 2.;
+                    let y = cy - h / 2.;
+                    let y_bbox = Bbox::new(
+                        Rect::from_xywh(
+                            x.max(0.0f32).min(width_original), // only the corner is clamped
+                            y.max(0.0f32).min(height_original),
+                            w,
+                            h,
+                        ),
+                        id,
+                        confidence,
+                        self.names.as_ref().map(|names| names[id].to_owned()),
+                    );
+
+                    // keypoints: `nk` triples (x, y, confidence), rescaled like the box
+                    let y_kpts = {
+                        if let Some(kpts) = kpts {
+                            let mut kpts_ = Vec::new();
+                            for i in 0..self.nk {
+                                let kx = kpts[KPT_STEP * i] / ratio;
+                                let ky = kpts[KPT_STEP * i + 1] / ratio;
+                                let kconf = kpts[KPT_STEP * i + 2];
+                                if kconf < self.kconfs[i] {
+                                    kpts_.push(Keypoint::default()); // placeholder keeps the index
+                                } else {
+                                    kpts_.push(Keypoint::new(
+                                        Point::new(
+                                            kx.max(0.0f32).min(width_original),
+                                            ky.max(0.0f32).min(height_original),
+                                        ),
+                                        kconf,
+                                    ));
+                                }
+                            }
+                            Some(kpts_)
+                        } else {
+                            None
+                        }
+                    };
+
+                    // keep box, keypoints and mask coefficients of one prediction together
+                    data.push((y_bbox, y_kpts, coefs));
+                }
+
+                // optional non-maximum suppression over the collected predictions
+                if self.apply_nms {
+                    non_max_suppression(&mut data, self.iou);
+                }
+
+                // split the surviving predictions into boxes, keypoints and masks
+                let mut y_bboxes: Vec<Bbox> = Vec::new();
+                let mut y_kpts: Vec<Vec<Keypoint>> = Vec::new();
+                let mut y_masks: Vec<Vec<u8>> = Vec::new();
+                for elem in data.into_iter() {
+                    if let Some(kpts) = elem.1 {
+                        y_kpts.push(kpts)
+                    }
+
+                    // decode masks
+                    if let Some(coefs) = elem.2 {
+                        let proto = protos.unwrap().slice(s![idx, .., .., ..]); // panics without protos
+                        let (nm, nh, nw) = proto.dim();
+
+                        // coefs * proto -> mask
+                        let coefs = Array::from_shape_vec((1, nm), coefs)?; // (n, nm)
+                        let proto = proto.to_owned().into_shape((nm, nh * nw))?; // (nm, nh*nw)
+                        let mask = coefs.dot(&proto).into_shape((nh, nw, 1))?; // (nh, nw, n)
+
+                        // build image from ndarray
+                        let mask_im: ImageBuffer<image::Luma<_>, Vec<f32>> =
+                            match ImageBuffer::from_raw(nw as u32, nh as u32, mask.into_raw_vec()) {
+                                Some(image) => image,
+                                None => panic!("can not create image from ndarray"),
+                            };
+                        let mut mask_im = image::DynamicImage::from(mask_im); // -> dyn
+
+                        // keep the region the scaled image occupies, then resize it to the original size
+                        let (_, w_mask, h_mask) =
+                            ops::scale_wh(width_original, height_original, nw as f32, nh as f32);
+                        let mask_cropped = mask_im.crop(0, 0, w_mask as u32, h_mask as u32);
+                        let mask_original = mask_cropped.resize_exact(
+                            width_original as u32,
+                            height_original as u32,
+                            image::imageops::FilterType::Triangle,
+                        );
+
+                        // zero every pixel that lies outside the bounding box
+                        let mut mask_original_cropped = mask_original.into_luma8();
+                        for y in 0..height_original as usize {
+                            for x in 0..width_original as usize {
+                                if x < elem.0.xmin() as usize
+                                    || x > elem.0.xmax() as usize
+                                    || y < elem.0.ymin() as usize
+                                    || y > elem.0.ymax() as usize
+                                {
+                                    mask_original_cropped.put_pixel(
+                                        x as u32,
+                                        y as u32,
+                                        image::Luma([0u8]),
+                                    );
+                                }
+                            }
+                        }
+                        y_masks.push(mask_original_cropped.into_raw());
+                    }
+                    y_bboxes.push(elem.0);
+                }
+
+                // one `Results` per image; empty collections are reported as `None`
+                let y = Results {
+                    probs: None,
+                    bboxes: if !y_bboxes.is_empty() {
+                        Some(y_bboxes)
+                    } else {
+                        None
+                    },
+                    keypoints: if !y_kpts.is_empty() {
+                        Some(y_kpts)
+                    } else {
+                        None
+                    },
+                    masks: if !y_masks.is_empty() {
+                        Some(y_masks)
+                    } else {
+                        None
+                    },
+                };
+                ys.push(y);
+            }
+
+            Ok(ys)
+        }
+    }
+
+    // Class names parsed from the onnx `names` metadata entry, e.g.
+    // `{0: 'person', 1: 'bicycle', 2: 'sports ball', ..., 27: "yellow_lady's_slipper"}`
+    fn fetch_names(engine: &OrtEngine) -> Option<Vec<String>> {
+        let raw = engine.try_fetch("names")?;
+        let pattern = Regex::new(r#"(['"])([-()\w '"]+)(['"])"#).unwrap();
+        // keep only the text between the quotes (the 2nd capture group) of every match
+        let quoted = pattern.captures_iter(&raw).map(|caps| {
+            let (_, [_, name, _]) = caps.extract();
+            name.to_string()
+        });
+        Some(quoted.collect())
+    }
+
+    pub fn batch(&self) -> isize {
+        self.batch.opt // the `opt` value of the batch `MinOptMax`
+    }
+
+    pub fn width(&self) -> isize {
+        self.width.opt // the `opt` value of the width `MinOptMax`
+    }
+
+    pub fn height(&self) -> isize {
+        self.height.opt // the `opt` value of the height `MinOptMax`
+    }
+}
diff --git a/src/ops.rs b/src/ops.rs
new file mode 100644
index 0000000..945e7f0
--- /dev/null
+++ b/src/ops.rs
@@ -0,0 +1,95 @@
+use anyhow::Result;
+use image::{DynamicImage, GenericImageView};
+use ndarray::{Array, Axis, Ix2, IxDyn};
+
+pub fn scale_wh(w0: f32, h0: f32, w1: f32, h1: f32) -> (f32, f32, f32) {
+    let ratio = f32::min(w1 / w0, h1 / h0); // largest uniform scale fitting (w0, h0) in (w1, h1)
+    (ratio, (ratio * w0).round(), (ratio * h0).round()) // (ratio, scaled width, scaled height)
+}
+
+/// Resizes every image to fit inside `width` x `height` (aspect ratio kept) and copies it into
+/// the top-left corner of a `[n, 3, height, width]` array with values scaled to `[0, 1]`.
+/// Cells not covered by an image keep the initial `1.0`. With `norm_imagenet`, the result is
+/// normalised per channel as `(ys - mean) / std`.
+pub fn resize(
+    xs: &[DynamicImage],
+    height: u32,
+    width: u32,
+    norm_imagenet: bool,
+) -> Result<Array<f32, IxDyn>> {
+    let scale = 255.0;
+    let shape = vec![xs.len(), 3, height as usize, width as usize];
+    // start from an all-ones array so that uncovered cells stay at 1.0
+    let mut ys = Array::ones(shape).into_dyn();
+    for (n, image) in xs.iter().enumerate() {
+        let (w0, h0) = image.dimensions();
+        // target size, rounded to whole pixels by `scale_wh`
+        let (_, w_new, h_new) = scale_wh(w0 as f32, h0 as f32, width as f32, height as f32);
+        let filter = image::imageops::FilterType::Triangle;
+        let resized = image.resize_exact(w_new as u32, h_new as u32, filter);
+        for (col, row, pixel) in resized.pixels() {
+            // copy R, G, B into channels 0..3; the alpha component is ignored
+            for (channel, &value) in pixel.0[..3].iter().enumerate() {
+                ys[[n, channel, row as usize, col as usize]] = (value as f32) / scale;
+            }
+        }
+    }
+
+    if norm_imagenet {
+        // per-channel statistics, broadcast over batch, height and width
+        let mean = vec![0.48145466, 0.4578275, 0.40821073];
+        let std = vec![0.26862954, 0.261_302_6, 0.275_777_1];
+        let mean = Array::from_shape_vec((1, 3, 1, 1), mean).unwrap();
+        let std = Array::from_shape_vec((1, 3, 1, 1), std).unwrap();
+        ys = (ys - mean) / std;
+    }
+    Ok(ys)
+}
+
+/// Letterboxes every image into a `[n, 3, height, width]` array with values scaled to `[0, 1]`:
+/// the image is resized with its aspect ratio kept and copied into the top-left corner, and
+/// the remaining cells keep the gray background value `144 / 255`.
+pub fn letterbox(xs: &[DynamicImage], height: u32, width: u32) -> Result<Array<f32, IxDyn>> {
+    let scale = 255.0;
+    // background value, already scaled like the pixel values
+    let padding = 144.0 / scale;
+    let shape = (xs.len(), 3, height as usize, width as usize);
+    let mut ys = Array::from_elem(shape, padding).into_dyn();
+    for (n, image) in xs.iter().enumerate() {
+        let (w0, h0) = image.dimensions();
+        // target size, rounded to whole pixels by `scale_wh`
+        let (_, w_new, h_new) = scale_wh(w0 as f32, h0 as f32, width as f32, height as f32);
+        // the resized image is written starting at row 0, column 0
+        let filter = image::imageops::FilterType::Triangle;
+        let resized = image.resize_exact(w_new as u32, h_new as u32, filter);
+        for (col, row, pixel) in resized.pixels() {
+            // copy R, G, B into channels 0..3; the alpha component is ignored
+            for (channel, &value) in pixel.0[..3].iter().enumerate() {
+                ys[[n, channel, row as usize, col as usize]] = (value as f32) / scale;
+            }
+        }
+    }
+
+    Ok(ys)
+}
+
+/// Divides `xs` by the square root of the sum of its squared values along axis 1,
+/// i.e. by the Euclidean length taken over that axis.
+pub fn norm(xs: &Array<f32, IxDyn>) -> Array<f32, IxDyn> {
+    // axis 1 is summed away, then re-inserted so the division lines up with `xs`
+    let squared_sum = xs.mapv(|v| v * v).sum_axis(Axis(1));
+    let length = squared_sum.mapv(f32::sqrt).insert_axis(Axis(1));
+    xs / length
+}
+
+/// Row-wise softmax of `query.dot(gallery.t())`: `(m, ndim)` x `(n, ndim)` -> `m` rows of `n`.
+/// Returns an error when either input is not 2-dimensional.
+pub fn dot2(query: &Array<f32, IxDyn>, gallery: &Array<f32, IxDyn>) -> Result<Vec<Vec<f32>>> {
+    let query = query.view().into_dimensionality::<Ix2>()?;
+    let gallery = gallery.view().into_dimensionality::<Ix2>()?;
+    let logits = query.dot(&gallery.t());
+    // subtract each row's maximum before `exp`: same softmax, but large logits cannot overflow
+    let row_max = logits.fold_axis(Axis(1), f32::NEG_INFINITY, |&acc, &v| acc.max(v));
+    let exps = (logits - row_max.insert_axis(Axis(1))).mapv(f32::exp);
+    let sums = exps.sum_axis(Axis(1)).insert_axis(Axis(1));
+    Ok((exps / sums).axis_iter(Axis(0)).map(|row| row.to_vec()).collect())
+}
diff --git a/src/options.rs b/src/options.rs
new file mode 100644
index 0000000..f2ff5dc
--- /dev/null
+++ b/src/options.rs
@@ -0,0 +1,321 @@
+use crate::{auto_load, Device, MinOptMax};
+
+#[derive(Debug, Clone)]
+pub struct Options {
+    pub onnx_path: String,
+    pub device: Device,
+    pub profile: bool,
+    pub num_dry_run: usize,
+    pub i00: Option<MinOptMax>, // 1st input, axis 0, batch usually
+    pub i01: Option<MinOptMax>, // 1st input, axis 1
+    pub i02: Option<MinOptMax>,
+    pub i03: Option<MinOptMax>,
+    pub i04: Option<MinOptMax>,
+    pub i05: Option<MinOptMax>,
+    pub i10: Option<MinOptMax>, // 2nd input, axis 0
+    pub i11: Option<MinOptMax>,
+    pub i12: Option<MinOptMax>,
+    pub i13: Option<MinOptMax>,
+    pub i14: Option<MinOptMax>,
+    pub i15: Option<MinOptMax>,
+    pub i20: Option<MinOptMax>, // 3rd input, axis 0
+    pub i21: Option<MinOptMax>,
+    pub i22: Option<MinOptMax>,
+    pub i23: Option<MinOptMax>,
+    pub i24: Option<MinOptMax>,
+    pub i25: Option<MinOptMax>,
+    pub i30: Option<MinOptMax>, // 4th input, axis 0
+    pub i31: Option<MinOptMax>,
+    pub i32_: Option<MinOptMax>, // trailing underscore: `i32` is a primitive type name
+    pub i33: Option<MinOptMax>,
+    pub i34: Option<MinOptMax>,
+    pub i35: Option<MinOptMax>,
+
+    // trt ep
+    pub trt_engine_cache_enable: bool,
+    pub trt_int8_enable: bool,
+    pub trt_fp16_enable: bool,
+
+    // options for Vision and Language models
+    pub nc: Option<usize>,
+    pub nk: Option<usize>,
+    pub nm: Option<usize>,
+    pub confs: Vec<f32>,
+    pub kconfs: Vec<f32>,
+    pub iou: f32,
+    pub apply_nms: bool,
+    pub saveout: Option<String>,
+    pub tokenizer: Option<String>,
+    pub vocab: Option<String>,
+    pub names: Option<Vec<String>>, // class names
+    pub anchors_first: bool,        // output format: [bs, anchors/na, pos+nc+nm]
+    pub skeletons: Option<Vec<(usize, usize)>>,
+}
+
+impl Default for Options {
+    fn default() -> Self {
+        Self {
+            onnx_path: String::new(), // empty until `with_model` is called
+            device: Device::Cuda(0), // CUDA device 0 unless another `with_*` device is chosen
+            profile: false,
+            num_dry_run: 3,
+            i00: None, // every `iXY` range starts unset
+            i01: None,
+            i02: None,
+            i03: None,
+            i04: None,
+            i05: None,
+            i10: None,
+            i11: None,
+            i12: None,
+            i13: None,
+            i14: None,
+            i15: None,
+            i20: None,
+            i21: None,
+            i22: None,
+            i23: None,
+            i24: None,
+            i25: None,
+            i30: None,
+            i31: None,
+            i32_: None,
+            i33: None,
+            i34: None,
+            i35: None,
+            trt_engine_cache_enable: true,
+            trt_int8_enable: false,
+            trt_fp16_enable: false,
+            nc: None,
+            nk: None,
+            nm: None,
+            confs: vec![0.4f32], // a single class-confidence threshold
+            kconfs: vec![0.5f32], // a single keypoint-confidence threshold
+            iou: 0.45f32,
+            apply_nms: true,
+            saveout: None,
+            tokenizer: None,
+            vocab: None,
+            names: None,
+            anchors_first: false,
+            skeletons: None,
+        }
+    }
+}
+
+impl Options { // builder-style setters: each takes `self` by value and returns it
+    pub fn with_model(mut self, onnx_path: &str) -> Self {
+        self.onnx_path = auto_load(onnx_path).unwrap(); // panics when `auto_load` fails
+        self
+    }
+    // Sets `num_dry_run`.
+    pub fn with_dry_run(mut self, n: usize) -> Self {
+        self.num_dry_run = n;
+        self
+    }
+    // Selects `Device::Cuda(id)`.
+    pub fn with_cuda(mut self, id: usize) -> Self {
+        self.device = Device::Cuda(id);
+        self
+    }
+    // Selects `Device::Trt(id)`.
+    pub fn with_trt(mut self, id: usize) -> Self {
+        self.device = Device::Trt(id);
+        self
+    }
+    // Selects `Device::Cpu(0)`.
+    pub fn with_cpu(mut self) -> Self {
+        self.device = Device::Cpu(0);
+        self
+    }
+    // Selects `Device::CoreML(id)`.
+    pub fn with_coreml(mut self, id: usize) -> Self {
+        self.device = Device::CoreML(id);
+        self
+    }
+    // Sets `trt_fp16_enable`.
+    pub fn with_fp16(mut self, x: bool) -> Self {
+        self.trt_fp16_enable = x;
+        self
+    }
+    // Sets `profile`.
+    pub fn with_profile(mut self, profile: bool) -> Self {
+        self.profile = profile;
+        self
+    }
+    // Sets `saveout` to `Some(saveout)`.
+    pub fn with_saveout(mut self, saveout: &str) -> Self {
+        self.saveout = Some(saveout.to_string());
+        self
+    }
+    // Sets the class names (copied into owned `String`s).
+    pub fn with_names(mut self, names: &[&str]) -> Self {
+        self.names = Some(names.iter().map(|x| x.to_string()).collect::<Vec<String>>());
+        self
+    }
+    // Sets `skeletons` to a copy of the given index pairs.
+    pub fn with_skeletons(mut self, skeletons: &[(usize, usize)]) -> Self {
+        self.skeletons = Some(skeletons.to_vec());
+        self
+    }
+    // Sets `anchors_first` to `true` (there is no way to unset it through this method).
+    pub fn with_anchors_first(mut self) -> Self {
+        self.anchors_first = true;
+        self
+    }
+    // Sets `apply_nms`.
+    pub fn with_nms(mut self, apply_nms: bool) -> Self {
+        self.apply_nms = apply_nms;
+        self
+    }
+    // Sets `nc` to `Some(nc)`.
+    pub fn with_nc(mut self, nc: usize) -> Self {
+        self.nc = Some(nc);
+        self
+    }
+    // Sets `nk` to `Some(nk)`.
+    pub fn with_nk(mut self, nk: usize) -> Self {
+        self.nk = Some(nk);
+        self
+    }
+    // Sets `iou`.
+    pub fn with_iou(mut self, x: f32) -> Self {
+        self.iou = x;
+        self
+    }
+    // Sets `confs` to a copy of the given thresholds.
+    pub fn with_confs(mut self, confs: &[f32]) -> Self {
+        self.confs = confs.to_vec();
+        self
+    }
+    // Sets `kconfs` to a copy of the given thresholds.
+    pub fn with_kconfs(mut self, kconfs: &[f32]) -> Self {
+        self.kconfs = kconfs.to_vec();
+        self
+    }
+    // Sets `tokenizer` to `Some(tokenizer)`.
+    pub fn with_tokenizer(mut self, tokenizer: String) -> Self {
+        self.tokenizer = Some(tokenizer);
+        self
+    }
+    // Sets `i00`.
+    pub fn with_i00(mut self, x: MinOptMax) -> Self {
+        self.i00 = Some(x);
+        self
+    }
+    // Sets `i01`.
+    pub fn with_i01(mut self, x: MinOptMax) -> Self {
+        self.i01 = Some(x);
+        self
+    }
+    // Sets `i02`.
+    pub fn with_i02(mut self, x: MinOptMax) -> Self {
+        self.i02 = Some(x);
+        self
+    }
+    // Sets `i03`.
+    pub fn with_i03(mut self, x: MinOptMax) -> Self {
+        self.i03 = Some(x);
+        self
+    }
+    // Sets `i04`.
+    pub fn with_i04(mut self, x: MinOptMax) -> Self {
+        self.i04 = Some(x);
+        self
+    }
+    // Sets `i05`.
+    pub fn with_i05(mut self, x: MinOptMax) -> Self {
+        self.i05 = Some(x);
+        self
+    }
+    // Sets `i10`.
+    pub fn with_i10(mut self, x: MinOptMax) -> Self {
+        self.i10 = Some(x);
+        self
+    }
+    // Sets `i11`.
+    pub fn with_i11(mut self, x: MinOptMax) -> Self {
+        self.i11 = Some(x);
+        self
+    }
+    // Sets `i12`.
+    pub fn with_i12(mut self, x: MinOptMax) -> Self {
+        self.i12 = Some(x);
+        self
+    }
+    // Sets `i13`.
+    pub fn with_i13(mut self, x: MinOptMax) -> Self {
+        self.i13 = Some(x);
+        self
+    }
+    // Sets `i14`.
+    pub fn with_i14(mut self, x: MinOptMax) -> Self {
+        self.i14 = Some(x);
+        self
+    }
+    // Sets `i15`.
+    pub fn with_i15(mut self, x: MinOptMax) -> Self {
+        self.i15 = Some(x);
+        self
+    }
+    // Sets `i20`.
+    pub fn with_i20(mut self, x: MinOptMax) -> Self {
+        self.i20 = Some(x);
+        self
+    }
+    // Sets `i21`.
+    pub fn with_i21(mut self, x: MinOptMax) -> Self {
+        self.i21 = Some(x);
+        self
+    }
+    // Sets `i22`.
+    pub fn with_i22(mut self, x: MinOptMax) -> Self {
+        self.i22 = Some(x);
+        self
+    }
+    // Sets `i23`.
+    pub fn with_i23(mut self, x: MinOptMax) -> Self {
+        self.i23 = Some(x);
+        self
+    }
+    // Sets `i24`.
+    pub fn with_i24(mut self, x: MinOptMax) -> Self {
+        self.i24 = Some(x);
+        self
+    }
+    // Sets `i25`.
+    pub fn with_i25(mut self, x: MinOptMax) -> Self {
+        self.i25 = Some(x);
+        self
+    }
+    // Sets `i30`.
+    pub fn with_i30(mut self, x: MinOptMax) -> Self {
+        self.i30 = Some(x);
+        self
+    }
+    // Sets `i31`.
+    pub fn with_i31(mut self, x: MinOptMax) -> Self {
+        self.i31 = Some(x);
+        self
+    }
+    // Sets `i32_`.
+    pub fn with_i32_(mut self, x: MinOptMax) -> Self {
+        self.i32_ = Some(x);
+        self
+    }
+    // Sets `i33`.
+    pub fn with_i33(mut self, x: MinOptMax) -> Self {
+        self.i33 = Some(x);
+        self
+    }
+    // Sets `i34`.
+    pub fn with_i34(mut self, x: MinOptMax) -> Self {
+        self.i34 = Some(x);
+        self
+    }
+    // Sets `i35`.
+    pub fn with_i35(mut self, x: MinOptMax) -> Self {
+        self.i35 = Some(x);
+        self
+    }
+}
diff --git a/src/point.rs b/src/point.rs
new file mode 100644
index 0000000..d53aae0
--- /dev/null
+++ b/src/point.rs
@@ -0,0 +1,182 @@
+use std::ops::{Add, Div, Mul, Sub};
+
+#[derive(Default, Debug, PartialOrd, PartialEq, Clone, Copy)]
+pub struct Point {
+    pub x: f32, // first coordinate: element 0 in the tuple/array conversions
+    pub y: f32, // second coordinate: element 1 in the tuple/array conversions
+}
+
+/// `Point + Point`: adds the two points coordinate by coordinate.
+impl Add for Point {
+    type Output = Self;
+
+    fn add(self, other: Self) -> Self::Output {
+        let x = self.x + other.x;
+        let y = self.y + other.y;
+        Self { x, y }
+    }
+}
+
+/// `Point + f32`: adds the same scalar to both coordinates.
+impl Add<f32> for Point {
+    type Output = Self;
+
+    fn add(self, other: f32) -> Self::Output {
+        let x = self.x + other;
+        let y = self.y + other;
+        Self { x, y }
+    }
+}
+
+/// `Point - Point`: subtracts `other` from `self` coordinate by coordinate.
+impl Sub for Point {
+    type Output = Self;
+
+    fn sub(self, other: Self) -> Self::Output {
+        let x = self.x - other.x;
+        let y = self.y - other.y;
+        Self { x, y }
+    }
+}
+
+/// `Point - f32`: subtracts the same scalar from both coordinates,
+/// e.g. `Point::new(3.0, 5.0) - 1.0` is `Point::new(2.0, 4.0)`.
+impl Sub<f32> for Point {
+    type Output = Self;
+
+    fn sub(self, other: f32) -> Self::Output {
+        // component-wise subtraction; multiplying here would make `p - s` act like `p * s`
+        Self::new(self.x - other, self.y - other)
+    }
+}
+
+/// `Point * f32`: multiplies both coordinates by the same scalar.
+impl Mul<f32> for Point {
+    type Output = Self;
+
+    fn mul(self, other: f32) -> Self::Output {
+        let x = self.x * other;
+        let y = self.y * other;
+        Self { x, y }
+    }
+}
+
+/// `Point * Point`: multiplies the two points coordinate by coordinate.
+impl Mul for Point {
+    type Output = Self;
+
+    fn mul(self, other: Self) -> Self::Output {
+        let x = self.x * other.x;
+        let y = self.y * other.y;
+        Self { x, y }
+    }
+}
+
+/// `Point / Point`: divides `self` by `other` coordinate by coordinate.
+impl Div for Point {
+    type Output = Self;
+
+    fn div(self, other: Self) -> Self::Output {
+        let x = self.x / other.x;
+        let y = self.y / other.y;
+        Self { x, y }
+    }
+}
+
+/// `Point / f32`: divides both coordinates by the same scalar.
+impl Div<f32> for Point {
+    type Output = Self;
+
+    fn div(self, other: f32) -> Self::Output {
+        let x = self.x / other;
+        let y = self.y / other;
+        Self { x, y }
+    }
+}
+
+impl From<(f32, f32)> for Point {
+    fn from(pair: (f32, f32)) -> Self {
+        Self::new(pair.0, pair.1) // tuple order is (x, y)
+    }
+}
+
+impl From<Point> for (f32, f32) {
+    fn from(point: Point) -> Self {
+        (point.x, point.y) // tuple order is (x, y)
+    }
+}
+
+impl From<[f32; 2]> for Point {
+    fn from(xy: [f32; 2]) -> Self {
+        Self::new(xy[0], xy[1]) // array order is [x, y]
+    }
+}
+
+impl From<Point> for [f32; 2] {
+    fn from(point: Point) -> Self {
+        [point.x, point.y] // array order is [x, y]
+    }
+}
+
+impl Point {
+    pub fn new(x: f32, y: f32) -> Self {
+        Self { x, y } // both coordinates are stored unchanged
+    }
+    /// Returns the coordinates as the array `[x, y]`.
+    pub fn coord(&self) -> [f32; 2] {
+        (*self).into()
+    }
+    /// `true` when both coordinates compare equal to zero.
+    pub fn is_origin(&self) -> bool {
+        *self == Self::default()
+    }
+    /// Euclidean distance between `self` and `other`.
+    pub fn distance_from(&self, other: &Point) -> f32 {
+        (*self - *other).distance_from_origin()
+    }
+    /// Euclidean distance between `self` and `(0, 0)`.
+    pub fn distance_from_origin(&self) -> f32 {
+        f32::sqrt(self.x.powf(2.0) + self.y.powf(2.0))
+    }
+    /// Sum of the two coordinates.
+    pub fn sum(&self) -> f32 {
+        self.x + self.y
+    }
+}
+
+#[cfg(test)]
+mod tests_points {
+    use super::Point;
+    // Every way of building a `Point` (tuple, array, `new`, struct literal) must agree.
+    #[test]
+    fn new() {
+        let origin1 = Point::from((0.0f32, 0.0f32));
+        let origin2 = Point::from([0.0f32, 0.0f32]);
+        let origin3 = (0.0f32, 0.0f32).into(); // type inferred from the comparisons below
+        let origin4 = [0.0f32, 0.0f32].into();
+        let origin5 = Point::new(1.0f32, 2.0f32); // despite the name, not the origin
+        let origin6 = Point {
+            x: 1.0f32,
+            y: 2.0f32,
+        };
+        assert_eq!(origin1, origin2);
+        assert_eq!(origin2, origin3);
+        assert_eq!(origin3, origin4);
+        assert_eq!(origin5, origin6);
+        assert!(origin1.is_origin());
+        assert!(origin2.is_origin());
+        assert!(origin3.is_origin());
+        assert!(origin4.is_origin());
+        assert!(!origin5.is_origin());
+        assert!(!origin6.is_origin());
+    }
+    // Converting a `Point` back into a tuple or an array keeps the (x, y) order.
+    #[test]
+    fn into_tuple_array() {
+        let point = Point::from((1.0, 2.0));
+        let tuple: (f32, f32) = point.into(); // `Point` is `Copy`, so it can be converted twice
+        let array: [f32; 2] = point.into();
+        assert_eq!(tuple, (1.0, 2.0));
+        assert_eq!(array, [1.0, 2.0]);
+    }
+}
diff --git a/src/rect.rs b/src/rect.rs
new file mode 100644
index 0000000..8ce25f7
--- /dev/null
+++ b/src/rect.rs
@@ -0,0 +1,193 @@
+use crate::Point;
+
+#[derive(Default, PartialOrd, PartialEq, Clone, Copy)]
+pub struct Rect {
+    top_left: Point, // corner reported by `xmin()` / `ymin()`
+    bottom_right: Point, // corner reported by `xmax()` / `ymax()`
+}
+
+impl std::fmt::Debug for Rect {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        let mut out = f.debug_struct("Rectangle"); // printed name differs from the type name
+        out.field("xmin", &self.xmin());
+        out.field("ymin", &self.ymin());
+        out.field("xmax", &self.xmax());
+        out.field("ymax", &self.ymax());
+        out.finish()
+    }
+}
+
+impl<P: Into<Point>> From<(P, P)> for Rect {
+    /// Builds a rectangle from a `(top_left, bottom_right)` pair of point-like values.
+    fn from(corners: (P, P)) -> Self {
+        // First element becomes the top-left corner, second the bottom-right.
+        let (top_left, bottom_right) = (corners.0.into(), corners.1.into());
+        Self::new(top_left, bottom_right)
+    }
+}
+
+impl<P: Into<Point>> From<[P; 2]> for Rect {
+    /// Builds a rectangle from a `[top_left, bottom_right]` array of point-like values.
+    fn from(corners: [P; 2]) -> Self {
+        // Destructure by value so each element can be converted without cloning.
+        let [top_left, bottom_right] = corners;
+        Self::new(top_left.into(), bottom_right.into())
+    }
+}
+
+impl Rect {
+    pub fn new(top_left: Point, bottom_right: Point) -> Self {
+        Self {
+            top_left,
+            bottom_right,
+        }
+    }
+    /// Builds a rectangle from its top-left corner `(x, y)`, width `w` and height `h`.
+    pub fn from_xywh(x: f32, y: f32, w: f32, h: f32) -> Self {
+        Self {
+            top_left: Point::new(x, y),
+            bottom_right: Point::new(x + w, y + h),
+        }
+    }
+    /// Builds a rectangle from its top-left `(x1, y1)` and bottom-right `(x2, y2)` corners.
+    pub fn from_xyxy(x1: f32, y1: f32, x2: f32, y2: f32) -> Self {
+        Self {
+            top_left: Point::new(x1, y1),
+            bottom_right: Point::new(x2, y2),
+        }
+    }
+    /// Builds a rectangle from its center `(cx, cy)`, width `w` and height `h`.
+    pub fn from_cxywh(cx: f32, cy: f32, w: f32, h: f32) -> Self {
+        Self {
+            top_left: Point::new(cx - w / 2.0, cy - h / 2.0),
+            bottom_right: Point::new(cx + w / 2.0, cy + h / 2.0),
+        }
+    }
+    /// Horizontal extent: `xmax - xmin`.
+    pub fn width(&self) -> f32 {
+        (self.bottom_right - self.top_left).x
+    }
+    /// Vertical extent: `ymax - ymin`.
+    pub fn height(&self) -> f32 {
+        (self.bottom_right - self.top_left).y
+    }
+    /// X coordinate of the top-left corner.
+    pub fn xmin(&self) -> f32 {
+        self.top_left.x
+    }
+    /// Y coordinate of the top-left corner.
+    pub fn ymin(&self) -> f32 {
+        self.top_left.y
+    }
+    /// X coordinate of the bottom-right corner.
+    pub fn xmax(&self) -> f32 {
+        self.bottom_right.x
+    }
+    /// Y coordinate of the bottom-right corner.
+    pub fn ymax(&self) -> f32 {
+        self.bottom_right.y
+    }
+    /// X coordinate of the center (midpoint of `xmin` and `xmax`), the inverse of `from_cxywh`.
+    pub fn cx(&self) -> f32 {
+        (self.bottom_right.x + self.top_left.x) / 2.0
+    }
+    /// Y coordinate of the center (midpoint of `ymin` and `ymax`), the inverse of `from_cxywh`.
+    pub fn cy(&self) -> f32 {
+        (self.bottom_right.y + self.top_left.y) / 2.0
+    }
+    /// Top-left corner.
+    pub fn tl(&self) -> Point {
+        self.top_left
+    }
+    /// Bottom-right corner.
+    pub fn br(&self) -> Point {
+        self.bottom_right
+    }
+    /// Top-right corner: `(xmax, ymin)`.
+    pub fn tr(&self) -> Point {
+        Point::new(self.bottom_right.x, self.top_left.y)
+    }
+    /// Bottom-left corner: `(xmin, ymax)`.
+    pub fn bl(&self) -> Point {
+        Point::new(self.top_left.x, self.bottom_right.y)
+    }
+    /// Center point: the two stored corners added together and divided by two.
+    pub fn center(&self) -> Point {
+        (self.bottom_right + self.top_left) / 2.0
+    }
+    /// Area: `height * width`.
+    pub fn area(&self) -> f32 {
+        self.height() * self.width()
+    }
+    /// True when the area is exactly zero.
+    pub fn is_empty(&self) -> bool {
+        self.area() == 0.0
+    }
+    /// True when width and height are exactly equal (name kept as-is for existing callers).
+    pub fn is_squre(&self) -> bool {
+        self.width() == self.height()
+    }
+    /// Area of the overlap between `self` and `other`; zero when they do not overlap.
+    pub fn intersect(&self, other: &Rect) -> f32 {
+        let l = self.xmin().max(other.xmin());
+        let r = (self.xmin() + self.width()).min(other.xmin() + other.width());
+        let t = self.ymin().max(other.ymin());
+        let b = (self.ymin() + self.height()).min(other.ymin() + other.height());
+        (r - l).max(0.) * (b - t).max(0.)
+    }
+    /// Area covered by either rectangle: the two areas minus the overlap.
+    pub fn union(&self, other: &Rect) -> f32 {
+        self.area() + other.area() - self.intersect(other)
+    }
+    /// Intersection over union; the result is NaN when the union area is zero (0 / 0).
+    pub fn iou(&self, other: &Rect) -> f32 {
+        self.intersect(other) / self.union(other)
+    }
+    /// True when `other` lies entirely inside `self`; shared edges count as inside.
+    pub fn contains(&self, other: &Rect) -> bool {
+        self.xmin() <= other.xmin()
+            && self.xmax() >= other.xmax()
+            && self.ymin() <= other.ymin()
+            && self.ymax() >= other.ymax()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::Rect;
+    use crate::Point;
+
+    #[test]
+    fn new() { // every constructor form below describes the same (0, 0)-(5, 5) rectangle
+        let rect1 = Rect {
+            top_left: Point {
+                x: 0.0f32,
+                y: 0.0f32,
+            },
+            bottom_right: Point {
+                x: 5.0f32,
+                y: 5.0f32,
+            },
+        };
+        let rect2 = Rect {
+            top_left: (0.0f32, 0.0f32).into(),
+            bottom_right: [5.0f32, 5.0f32].into(),
+        };
+        let rect3 = Rect::new([0.0, 0.0].into(), [5.0, 5.0].into());
+        let rect4: Rect = ((0.0, 0.0), (5.0, 5.0)).into(); // tuple of tuples
+        let rect5: Rect = [(0.0, 0.0), (5.0, 5.0)].into(); // array of tuples
+        let rect6: Rect = ([0.0, 0.0], [5.0, 5.0]).into(); // tuple of arrays
+        let rect7: Rect = Rect::from(([0.0, 0.0], [5.0, 5.0]));
+        let rect8: Rect = Rect::from([[0.0, 0.0], [5.0, 5.0]]);
+        let rect9: Rect = Rect::from([(0.0, 0.0), (5.0, 5.0)]);
+        let rect10: Rect = Rect::from_xyxy(0.0, 0.0, 5.0, 5.0);
+        let rect11: Rect = Rect::from_xywh(0.0, 0.0, 5.0, 5.0);
+
+        assert_eq!(rect1, rect2); // NOTE: only these pairs are compared; e.g. rect2 vs rect3 is never asserted
+        assert_eq!(rect3, rect4);
+        assert_eq!(rect5, rect6);
+        assert_eq!(rect7, rect8);
+        assert_eq!(rect9, rect8);
+        assert_eq!(rect10, rect11);
+    }
+}
diff --git a/src/results.rs b/src/results.rs
new file mode 100644
index 0000000..9e5c9f0
--- /dev/null
+++ b/src/results.rs
@@ -0,0 +1,59 @@
+use crate::{Bbox, Embedding, Keypoint};
+
+#[derive(Clone, PartialEq, Default)]
+pub struct Results {
+    pub probs: Option<Embedding>, // printed as "Probabilities" by the Debug impl
+    pub bboxes: Option<Vec<Bbox>>, // the only field that also has a `_mut` accessor
+    pub keypoints: Option<Vec<Vec<Keypoint>>>, // presumably one inner Vec per detection — confirm
+    pub masks: Option<Vec<Vec<u8>>>, // Debug prints only how many masks there are, not their bytes
+}
+
+impl std::fmt::Debug for Results {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        // Masks are summarized by their count rather than dumped byte by byte.
+        let n_masks = self.masks().map(|masks| masks.len());
+        f.debug_struct("Results")
+            .field("Probabilities", &self.probs)
+            .field("BoundingBoxes", &self.bboxes)
+            .field("Keypoints", &self.keypoints)
+            .field("Masks", &format_args!("{:?}", n_masks))
+            .finish()
+    }
+}
+
+impl Results {
+    pub fn new(
+        probs: Option<Embedding>,
+        bboxes: Option<Vec<Bbox>>,
+        keypoints: Option<Vec<Vec<Keypoint>>>,
+        masks: Option<Vec<Vec<u8>>>,
+    ) -> Self {
+        Self {
+            probs,
+            bboxes,
+            keypoints,
+            masks,
+        }
+    }
+    /// Borrows the `probs` field; `None` when it is unset.
+    pub fn probs(&self) -> Option<&Embedding> {
+        self.probs.as_ref()
+    }
+    /// Borrows the `keypoints` field; `None` when it is unset.
+    pub fn keypoints(&self) -> Option<&Vec<Vec<Keypoint>>> {
+        self.keypoints.as_ref()
+    }
+    /// Borrows the `masks` field; `None` when it is unset.
+    pub fn masks(&self) -> Option<&Vec<Vec<u8>>> {
+        self.masks.as_ref()
+    }
+    /// Borrows the `bboxes` field; `None` when it is unset.
+    pub fn bboxes(&self) -> Option<&Vec<Bbox>> {
+        self.bboxes.as_ref()
+    }
+    /// Mutably borrows the `bboxes` field so callers can edit the boxes in place.
+    pub fn bboxes_mut(&mut self) -> Option<&mut Vec<Bbox>> {
+        self.bboxes.as_mut()
+    }
+}
diff --git a/src/rotated_rect.rs b/src/rotated_rect.rs
new file mode 100644
index 0000000..ab878d5
--- /dev/null
+++ b/src/rotated_rect.rs
@@ -0,0 +1,155 @@
+use crate::Point;
+
+#[derive(Default, PartialOrd, PartialEq, Clone, Copy)]
+pub struct RotatedRect {
+    center: Point, // point the corners are mirrored through in `vertices()`
+    width: f32, // side length scaled by the first matrix column in `vertices()`
+    height: f32, // side length scaled by the second matrix column in `vertices()`
+    rotation: f32, // radians (fed to `sin`/`cos`); original note "(0, 90)" is presumably a range in degrees — confirm
+}
+
+impl std::fmt::Debug for RotatedRect {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        let mut out = f.debug_struct("RotatedRectangle");
+        out.field("height", &self.height);
+        out.field("width", &self.width);
+        out.field("center", &self.center);
+        out.field("rotation", &self.rotation);
+        out.field("vertices", &self.vertices()); // computed on the fly, not stored
+        out.finish()
+    }
+}
+
+impl RotatedRect {
+    /// Stores the center, side lengths and rotation exactly as given.
+    pub fn new(center: Point, width: f32, height: f32, rotation: f32) -> Self {
+        Self {
+            center,
+            width,
+            height,
+            rotation,
+        }
+    }
+
+    /// Returns the four corners as `[v1, v2, v3, v4]`; `v3`/`v4` lie opposite `v1`/`v2`.
+    pub fn vertices(&self) -> [Point; 4] {
+        let sin = self.rotation.sin();
+        let cos = self.rotation.cos();
+        // Rotation matrix [cos -sin; sin cos] applied to the half width and half height.
+        let (ux, uy) = (cos * 0.5 * self.width, sin * 0.5 * self.width);
+        let (vx, vy) = (-sin * 0.5 * self.height, cos * 0.5 * self.height);
+        let v1 = self.center + Point::new(ux + vx, uy + vy);
+        let v2 = self.center + Point::new(ux - vx, uy - vy);
+        // The other two corners mirror v1 and v2 through the center.
+        let v3 = self.center * 2.0 - v1;
+        let v4 = self.center * 2.0 - v2;
+        [v1, v2, v3, v4]
+    }
+
+    /// Side length stored as `height`.
+    pub fn height(&self) -> f32 {
+        self.height
+    }
+
+    /// Side length stored as `width`.
+    pub fn width(&self) -> f32 {
+        self.width
+    }
+
+    /// Center point of the rectangle.
+    pub fn center(&self) -> Point {
+        self.center
+    }
+
+    /// Area: `height * width`, independent of the rotation.
+    pub fn area(&self) -> f32 {
+        self.height() * self.width()
+    }
+
+    // pub fn contain_point(&self, point: Point) -> bool {
+    //     // ray casting
+    //     todo!()
+    // }
+}
+
+/// A 2x4 rectangle rotated by a quarter turn about the origin.
+#[test]
+fn test1() {
+    let pi = std::f32::consts::PI;
+    let rt = RotatedRect::new(
+        Point::new(0.0f32, 0.0f32),
+        2.0f32,
+        4.0f32,
+        pi / 180.0 * 90.0,
+    );
+
+    // Mathematically exact corners, in the order `vertices()` returns them.
+    let expected = [
+        Point::new(-2.0, 1.0),
+        Point::new(2.0, 1.0),
+        Point::new(2.0, -1.0),
+        Point::new(-2.0, -1.0),
+    ];
+
+    // `sin`/`cos` are only accurate to within rounding, and that rounding can
+    // differ between platform math libraries, so compare with a tolerance
+    // instead of asserting bit-exact equality.
+    let tolerance = 1e-5_f32;
+    for (got, want) in rt.vertices().iter().zip(expected.iter()) {
+        assert!(
+            (got.x - want.x).abs() < tolerance && (got.y - want.y).abs() < tolerance,
+            "vertex {:?} differs from expected {:?}",
+            got,
+            want
+        );
+    }
+}
+
+/// A square of side sqrt(2) rotated by 45 degrees has its corners on the axes.
+#[test]
+fn test2() {
+    let pi = std::f32::consts::PI;
+    let rt = RotatedRect::new(
+        Point::new(0.0f32, 0.0f32),
+        2.0f32.sqrt(),
+        2.0f32.sqrt(),
+        pi / 180.0 * 45.0,
+    );
+
+    // Mathematically exact corners, in the order `vertices()` returns them.
+    let expected = [
+        Point::new(0.0, 1.0),
+        Point::new(1.0, 0.0),
+        Point::new(0.0, -1.0),
+        Point::new(-1.0, 0.0),
+    ];
+
+    // `sin`/`cos`/`sqrt` results are rounded, and trig rounding can differ
+    // between platform math libraries, so compare with a tolerance instead of
+    // asserting bit-exact equality.
+    let tolerance = 1e-5_f32;
+    for (got, want) in rt.vertices().iter().zip(expected.iter()) {
+        assert!(
+            (got.x - want.x).abs() < tolerance && (got.y - want.y).abs() < tolerance,
+            "vertex {:?} differs from expected {:?}",
+            got,
+            want
+        );
+    }
+}
+
+// #[test]
+// fn contain_point() {
+//     let pi = std::f32::consts::PI;
+//     let rt = RotatedRect::new(
+//         Point::new(0.0f32, 0.0f32),
+//         1.0f32.sqrt(),
+//         1.0f32.sqrt(),
+//         pi / 180.0 * 45.0,
+//     );
+
+//     assert!(rt.contain_point(Point::new(0.0, 0.0)));
+//     assert!(rt.contain_point(Point::new(0.5, 0.0)));
+//     assert!(rt.contain_point(Point::new(0.0, 0.5)));
+
+// }
diff --git a/src/tokenizer_stream.rs b/src/tokenizer_stream.rs
new file mode 100644
index 0000000..5fb8025
--- /dev/null
+++ b/src/tokenizer_stream.rs
@@ -0,0 +1,88 @@
+// https://github.com/huggingface/candle/blob/2a8679509eb55232b37378442c4366343f6dcb11/candle-examples/src/token_output_stream.rs#L5
+use anyhow::Result;
+
+/// This is a wrapper around a tokenizer to ensure that tokens can be returned to the user in a
+/// streaming way rather than having to wait for the full decoding.
+#[derive(Debug)]
+pub struct TokenizerStream {
+    tokenizer: tokenizers::Tokenizer, // used for decoding and for vocabulary lookups
+    tokens: Vec<u32>, // every token pushed by `next_token` since creation or the last `clear`
+    prev_index: usize, // start of the token window that `next_token` / `decode_rest` decode
+    current_index: usize, // end of the window whose decoded text counts as already returned
+}
+
+impl TokenizerStream {
+    pub fn new(tokenizer: tokenizers::Tokenizer) -> Self {
+        Self {
+            tokenizer,
+            tokens: Vec::new(),
+            prev_index: 0,
+            current_index: 0,
+        }
+    }
+
+    pub fn into_inner(self) -> tokenizers::Tokenizer {
+        self.tokenizer
+    }
+
+    /// Decodes `tokens`, turning any tokenizer failure into an `anyhow` error.
+    fn decode(&self, tokens: &[u32]) -> Result<String> {
+        self.tokenizer
+            .decode(tokens, true)
+            .map_err(|err| anyhow::anyhow!("cannot decode: {err}"))
+    }
+
+    /// Decodes `tokens[prev_index..current_index]`; empty while nothing is buffered.
+    fn previous_text(&self) -> Result<String> {
+        if self.tokens.is_empty() {
+            return Ok(String::new());
+        }
+        self.decode(&self.tokens[self.prev_index..self.current_index])
+    }
+
+    /// Buffers `token`; returns the new text once the decoded tail ends in an alphanumeric char.
+    // https://github.com/huggingface/text-generation-inference/blob/5ba53d44a18983a4de32d122f4cb46f4a17d9ef6/server/text_generation_server/models/model.py#L68
+    pub fn next_token(&mut self, token: u32) -> Result<Option<String>> {
+        let prev_text = self.previous_text()?;
+        self.tokens.push(token);
+        let text = self.decode(&self.tokens[self.prev_index..])?;
+        if text.len() <= prev_text.len() || !text.chars().last().unwrap().is_alphanumeric() {
+            return Ok(None);
+        }
+        let (_, fresh) = text.split_at(prev_text.len());
+        self.prev_index = self.current_index;
+        self.current_index = self.tokens.len();
+        Ok(Some(fresh.to_string()))
+    }
+
+    /// Returns any text buffered beyond the previous text; does not move the indices.
+    pub fn decode_rest(&self) -> Result<Option<String>> {
+        let prev_text = self.previous_text()?;
+        let text = self.decode(&self.tokens[self.prev_index..])?;
+        if text.len() <= prev_text.len() {
+            return Ok(None);
+        }
+        let (_, rest) = text.split_at(prev_text.len());
+        Ok(Some(rest.to_string()))
+    }
+
+    pub fn decode_all(&self) -> Result<String> {
+        self.decode(&self.tokens)
+    }
+
+    /// Looks up the id of `token_s` in the tokenizer vocabulary (`get_vocab(true)`).
+    pub fn get_token(&self, token_s: &str) -> Option<u32> {
+        self.tokenizer.get_vocab(true).get(token_s).copied()
+    }
+
+    pub fn tokenizer(&self) -> &tokenizers::Tokenizer {
+        &self.tokenizer
+    }
+
+    /// Empties the token buffer and rewinds both indices to zero.
+    pub fn clear(&mut self) {
+        self.tokens.clear();
+        self.prev_index = 0;
+        self.current_index = 0;
+    }
+}
diff --git a/src/utils.rs b/src/utils.rs
new file mode 100644
index 0000000..aba3625
--- /dev/null
+++ b/src/utils.rs
@@ -0,0 +1,220 @@
+use crate::{Bbox, Keypoint, GITHUB_ASSETS};
+use anyhow::Result;
+use indicatif::{ProgressBar, ProgressStyle};
+use std::io::{Read, Write};
+use std::path::{Path, PathBuf};
+
+/// Resolves `src` to a local path: an existing file is used as-is, otherwise the
+/// file name is looked up in the config directory and fetched from `GITHUB_ASSETS`
+/// if missing. A bad file name, an `Err` from `download` or a non-UTF-8 path is returned as `Err`.
+pub fn auto_load<P: AsRef<Path>>(src: P) -> Result<String> {
+    let src = src.as_ref();
+    let p = if src.is_file() {
+        src.into()
+    } else {
+        let sth = src.file_name().and_then(|name| name.to_str());
+        let sth = sth.ok_or_else(|| anyhow::anyhow!("Invalid file name: {:?}", src))?;
+        let mut p = config_dir();
+        p.push(sth);
+        // download from github assets if not exists in config directory
+        if !p.is_file() {
+            download(&format!("{}/{}", GITHUB_ASSETS, sth), &p, Some(sth))
+                .map_err(|err| anyhow::anyhow!("Fail to load {:?}: {err}", src))?;
+        }
+        p
+    };
+    let path = p.to_str().map(str::to_string);
+    path.ok_or_else(|| anyhow::anyhow!("Path is not valid UTF-8: {:?}", p))
+}
+
+/// Streams `src` into the file `dst` behind a progress bar; failures are returned as `Err`.
+pub fn download<P: AsRef<Path> + std::fmt::Debug>(
+    src: &str,
+    dst: P,
+    prompt: Option<&str>,
+) -> Result<()> {
+    let resp = ureq::AgentBuilder::new()
+        .try_proxy_from_env(true)
+        .build()
+        .get(src)
+        .timeout(std::time::Duration::from_secs(2000))
+        .call()
+        .map_err(|err| anyhow::anyhow!("Failed to GET: {}", err))?;
+    let ntotal = resp
+        .header("Content-Length")
+        .and_then(|s| s.parse::<u64>().ok())
+        .ok_or_else(|| anyhow::anyhow!("Missing or invalid Content-Length for {}", src))?;
+    let pb = ProgressBar::new(ntotal);
+    pb.set_style(
+            ProgressStyle::with_template(
+                "{prefix:.bold} {msg:.dim} [{bar:.blue.bright/white.dim}] {binary_bytes}/{binary_total_bytes} ({binary_bytes_per_sec}, {percent_precise}%, {elapsed})"
+            )
+            .unwrap()
+            .progress_chars("#>-"));
+    pb.set_prefix(String::from("\n🐢 Downloading"));
+    pb.set_message(prompt.unwrap_or_default().to_string());
+    let mut reader = resp.into_reader();
+    let mut buffer = [0; 8192];
+    let mut downloaded_bytes = 0u64;
+    let mut f = std::fs::File::create(&dst)
+        .map_err(|err| anyhow::anyhow!("Failed to create file {:?}: {}", dst, err))?;
+    let mut bytes_read = reader.read(&mut buffer)?;
+    while bytes_read != 0 {
+        pb.inc(bytes_read as u64);
+        f.write_all(&buffer[..bytes_read])?;
+        downloaded_bytes += bytes_read as u64;
+        bytes_read = reader.read(&mut buffer)?;
+    }
+    pb.finish();
+    anyhow::ensure!(downloaded_bytes == ntotal, "Incomplete download of {src}");
+    Ok(())
+}
+
+/// Formats the current local time as year, month, day, hour, minute, second and
+/// `%f` (chrono's fractional-second field), each separated by `delimiter`.
+pub fn string_now(delimiter: &str) -> String {
+    let t_now = chrono::Local::now();
+    let fields = ["%Y", "%m", "%d", "%H", "%M", "%S", "%f"];
+    let pattern = fields.join(delimiter);
+    t_now.format(&pattern).to_string()
+}
+
+/// Returns the `usls` sub-directory of the platform config directory, creating it
+/// when it does not exist yet. Panics if the platform reports no config directory
+/// or if the directory cannot be created.
+pub fn config_dir() -> PathBuf {
+    let base = dirs::config_dir();
+    let mut d = base.expect("Unsupported operating system. Now support Linux, MacOS, Windows.");
+    d.push("usls");
+    if !d.exists() {
+        std::fs::create_dir_all(&d).expect("Failed to create config directory.");
+    }
+    d
+}
+
+/// Greedy non-maximum suppression, in place: entries are visited from highest to
+/// lowest confidence and one is dropped when its IoU with an already kept entry
+/// exceeds `iou_threshold`. On return `xs` holds only the kept entries.
+#[allow(clippy::type_complexity)]
+pub fn non_max_suppression(
+    xs: &mut Vec<(Bbox, Option<Vec<Keypoint>>, Option<Vec<f32>>)>,
+    iou_threshold: f32,
+) {
+    // Descending confidence; the `unwrap` panics if two confidences cannot be ordered.
+    xs.sort_by(|b1, b2| b2.0.confidence().partial_cmp(&b1.0.confidence()).unwrap());
+
+    // Invariant: `xs[..kept]` holds the survivors found so far.
+    let mut kept = 0;
+    for candidate in 0..xs.len() {
+        let suppressed = xs[..kept]
+            .iter()
+            .any(|survivor| survivor.0.iou(&xs[candidate].0) > iou_threshold);
+        if !suppressed {
+            xs.swap(kept, candidate);
+            kept += 1;
+        }
+    }
+    xs.truncate(kept);
+}
+
+pub const COCO_SKELETON_17: [(usize, usize); 16] = [
+    (0, 1), // each pair holds two indices in 0..=16; presumably an edge between two COCO keypoints — confirm
+    (0, 2),
+    (1, 3),
+    (2, 4),
+    (5, 6),
+    (5, 11),
+    (6, 12),
+    (11, 12),
+    (5, 7),
+    (6, 8),
+    (7, 9),
+    (8, 10),
+    (11, 13),
+    (12, 14),
+    (13, 15),
+    (14, 16), // 16 pairs in total, matching the declared array length
+];
+
+pub const COCO_NAMES_80: [&str; 80] = [
+    "person", // index 0; presumably the array index is the class id a model reports — confirm
+    "bicycle",
+    "car",
+    "motorcycle",
+    "airplane",
+    "bus",
+    "train",
+    "truck",
+    "boat",
+    "traffic light",
+    "fire hydrant",
+    "stop sign",
+    "parking meter",
+    "bench",
+    "bird",
+    "cat",
+    "dog",
+    "horse",
+    "sheep",
+    "cow",
+    "elephant",
+    "bear",
+    "zebra",
+    "giraffe",
+    "backpack",
+    "umbrella",
+    "handbag",
+    "tie",
+    "suitcase",
+    "frisbee",
+    "skis",
+    "snowboard",
+    "sports ball",
+    "kite",
+    "baseball bat",
+    "baseball glove",
+    "skateboard",
+    "surfboard",
+    "tennis racket",
+    "bottle",
+    "wine glass",
+    "cup",
+    "fork",
+    "knife",
+    "spoon",
+    "bowl",
+    "banana",
+    "apple",
+    "sandwich",
+    "orange",
+    "broccoli",
+    "carrot",
+    "hot dog",
+    "pizza",
+    "donut",
+    "cake",
+    "chair",
+    "couch",
+    "potted plant",
+    "bed",
+    "dining table",
+    "toilet",
+    "tv",
+    "laptop",
+    "mouse",
+    "remote",
+    "keyboard",
+    "cell phone",
+    "microwave",
+    "oven",
+    "toaster",
+    "sink",
+    "refrigerator",
+    "book",
+    "clock",
+    "vase",
+    "scissors",
+    "teddy bear",
+    "hair drier",
+    "toothbrush", // index 79, the last of the 80 entries
+];