Add PPOCRv5 DET and REC models (#98)

* Add PPOCRv5 DET and REC models

* Add Text Struct
This commit is contained in:
Jamjamjon
2025-05-21 13:29:17 +08:00
committed by GitHub
parent a2dde94a3a
commit cf21eabcaa
16 changed files with 137 additions and 38 deletions

View File

@ -28,9 +28,13 @@ struct Args {
#[argh(option, default = "false")] #[argh(option, default = "false")]
show_hbbs_conf: bool, show_hbbs_conf: bool,
/// show mbrs confidence /// show obbs confidence
#[argh(option, default = "false")] #[argh(option, default = "false")]
show_obbs_conf: bool, show_obbs_conf: bool,
/// show polygons confidence
#[argh(option, default = "false")]
show_polygons_conf: bool,
} }
fn main() -> Result<()> { fn main() -> Result<()> {
@ -43,7 +47,7 @@ fn main() -> Result<()> {
// build model // build model
let config = match &args.model { let config = match &args.model {
Some(m) => Config::db().with_model_file(m), Some(m) => Config::db().with_model_file(m),
None => Config::ppocr_det_v4_ch().with_model_dtype(args.dtype.as_str().try_into()?), None => Config::ppocr_det_v5_mobile().with_model_dtype(args.dtype.as_str().try_into()?),
} }
.with_device_all(args.device.as_str().try_into()?) .with_device_all(args.device.as_str().try_into()?)
.commit()?; .commit()?;
@ -66,16 +70,16 @@ fn main() -> Result<()> {
.with_polygon_style( .with_polygon_style(
Style::polygon() Style::polygon()
.with_visible(true) .with_visible(true)
.with_text_visible(false) .with_text_visible(true)
.show_confidence(true) .show_confidence(args.show_polygons_conf)
.show_id(true) .show_id(false)
.show_name(true) .show_name(false)
.with_color(usls::StyleColors::default().with_outline([255, 105, 180, 255].into())), .with_color(usls::StyleColors::default().with_outline([255, 105, 180, 255].into())),
) )
.with_hbb_style( .with_hbb_style(
Style::hbb() Style::hbb()
.with_visible(args.show_hbbs) .with_visible(args.show_hbbs)
.with_text_visible(false) .with_text_visible(true)
.with_thickness(1) .with_thickness(1)
.show_confidence(args.show_hbbs_conf) .show_confidence(args.show_hbbs_conf)
.show_id(false) .show_id(false)
@ -84,7 +88,7 @@ fn main() -> Result<()> {
.with_obb_style( .with_obb_style(
Style::obb() Style::obb()
.with_visible(args.show_obbs) .with_visible(args.show_obbs)
.with_text_visible(false) .with_text_visible(true)
.show_confidence(args.show_obbs_conf) .show_confidence(args.show_obbs_conf)
.show_id(false) .show_id(false)
.show_name(false), .show_name(false),

View File

@ -47,7 +47,7 @@ fn main() -> Result<()> {
for y in ys.iter() { for y in ys.iter() {
if let Some(texts) = y.texts() { if let Some(texts) = y.texts() {
for text in texts { for text in texts {
println!("[User]: {}\n\n[Assistant]:{}", args.prompt, text); println!("[User]: {}\n\n[Assistant]:{:?}", args.prompt, text);
} }
} }
} }

View File

@ -7,15 +7,14 @@ cargo run -r -F cuda --example svtr -- --device cuda
## Results ## Results
```shell ```shell
["./examples/svtr/images/license-ch-2.png"]: Ys([Y { Texts: [Text("粤A·68688")] }]) ys: [Y { Texts: [Text { text: "粤A68688", confidence: 0.9940011 }] }]
["./examples/svtr/images/license-ch.png"]: Ys([Y { Texts: [Text("冀B6G000")] }]) ys: [Y { Texts: [Text { text: "冀B6G000", confidence: 0.86073524 }] }]
["./examples/svtr/images/sign-ch-2.png"]: Ys([Y { Texts: [Text("我在南锣鼓捣猫呢")] }]) ys: [Y { Texts: [Text { text: "我在南锣鼓捣猫呢", confidence: 0.99346924 }] }]
["./examples/svtr/images/sign-ch.png"]: Ys([Y { Texts: [Text("小菊儿胡同71号")] }]) ys: [Y { Texts: [Text { text: "小菊儿胡同71号", confidence: 0.99450684 }] }]
["./examples/svtr/images/text-110022345.png"]: Ys([Y { Texts: [Text("110022345")] }]) ys: [Y { Texts: [Text { text: "110022345", confidence: 0.99994576 }] }]
["./examples/svtr/images/text-ch.png"]: Ys([Y { Texts: [Text("你有这么高速运转的机械进入中国,记住我给出的原理")] }]) ys: [Y { Texts: [Text { text: "你有这么高速运转的机械进入中国,记住我给出的原理", confidence: 0.9996338 }] }]
["./examples/svtr/images/text-en-2.png"]: Ys([Y { Texts: [Text("from the background, but also separate text instances which")] }]) ys: [Y { Texts: [Text { text: "from the background, but also separate text instances which", confidence: 0.9954648 }] }]
["./examples/svtr/images/text-en-dark.png"]: Ys([Y { Texts: [Text("Please lower your volume")] }]) ys: [Y { Texts: [Text { text: "Please lower your yolume", confidence: 0.93910724 }] }]
["./examples/svtr/images/text-en.png"]: Ys([Y { Texts: [Text("are closely jointed. Some examples are illustrated in Fig.7.")] }]) ys: [Y { Texts: [Text { text: "are closely jointed. Some examples are illustrated in Fig.7.", confidence: 0.9959717 }] }]
["./examples/svtr/images/text-hello-rust-handwritten.png"]: Ys([Y { Texts: [Text("HeloRuSt")] }]) ys: [Y { Texts: [Text { text: "HelloRust", confidence: 0.97661674 }] }]
```
```

View File

@ -11,6 +11,10 @@ struct Args {
/// dtype /// dtype
#[argh(option, default = "String::from(\"auto\")")] #[argh(option, default = "String::from(\"auto\")")]
dtype: String, dtype: String,
/// max text length
#[argh(option, default = "960")]
max_text_length: usize,
} }
fn main() -> Result<()> { fn main() -> Result<()> {
@ -22,9 +26,12 @@ fn main() -> Result<()> {
let args: Args = argh::from_env(); let args: Args = argh::from_env();
// build model // build model
let config = Config::ppocr_rec_v4_ch() let config = Config::ppocr_rec_v5_mobile()
// ppocr_rec_v5_server()
// ppocr_rec_v4_ch()
// ppocr_rec_v4_en() // ppocr_rec_v4_en()
// repsvtr_ch() // repsvtr_ch()
.with_model_ixx(0, 3, args.max_text_length.into())
.with_model_device(args.device.as_str().try_into()?) .with_model_device(args.device.as_str().try_into()?)
.with_model_dtype(args.dtype.as_str().try_into()?) .with_model_dtype(args.dtype.as_str().try_into()?)
.commit()?; .commit()?;

View File

@ -9,6 +9,7 @@ mod obb;
mod polygon; mod polygon;
mod prob; mod prob;
mod skeleton; mod skeleton;
mod text;
mod x; mod x;
mod xs; mod xs;
mod y; mod y;
@ -29,6 +30,7 @@ pub use obb::*;
pub use polygon::*; pub use polygon::*;
pub use prob::*; pub use prob::*;
pub use skeleton::*; pub use skeleton::*;
pub use text::*;
pub use x::X; pub use x::X;
pub use xs::Xs; pub use xs::Xs;
pub use y::*; pub use y::*;

56
src/inference/text.rs Normal file
View File

@ -0,0 +1,56 @@
use aksr::Builder;
use crate::{impl_meta_methods, InstanceMeta, Style};
/// A text result: the string content together with instance metadata and an
/// optional style.
///
/// Accessors and `with_*` setters for the fields come from the `Builder`
/// derive; `Debug` is implemented by hand for this type.
#[derive(Builder, Clone, Default)]
pub struct Text {
// The textual content itself.
text: String,
// Instance metadata; the `Debug` impl reads `id`, `name` and `confidence` from it.
meta: InstanceMeta,
// Optional per-instance style.
// NOTE(review): presumably a rendering override used when drawing — confirm against the annotator.
style: Option<Style>,
}
impl std::fmt::Debug for Text {
    /// Formats the value as a `Text { .. }` debug struct.
    ///
    /// `text` is always printed. `id`, `name` and `confidence` are printed
    /// only when the metadata reports them as `Some`, so unset metadata does
    /// not clutter the output.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let mut out = f.debug_struct("Text");
        out.field("text", &self.text);

        // Optional metadata: emit a field only for values that are present.
        if let Some(id) = self.meta.id() {
            out.field("id", &id);
        }
        if let Some(name) = self.meta.name() {
            out.field("name", &name);
        }
        if let Some(confidence) = self.meta.confidence() {
            out.field("confidence", &confidence);
        }

        out.finish()
    }
}
impl From<String> for Text {
fn from(text: String) -> Self {
Self {
text,
..Default::default()
}
}
}
impl From<&str> for Text {
    /// Copies a borrowed string slice into a new `Text`; every other field
    /// keeps its `Default` value.
    fn from(text: &str) -> Self {
        // Allocate the owned copy once, then reuse the `From<String>` conversion.
        Self::from(text.to_string())
    }
}
/// Inherent methods for [`Text`], generated by the crate's `impl_meta_methods!` macro.
impl Text {
// NOTE(review): presumably expands to the accessors/`with_*` setters that forward to
// `self.meta` (id, name, confidence) — confirm against the macro definition.
impl_meta_methods!();
}

View File

@ -1,6 +1,6 @@
use aksr::Builder; use aksr::Builder;
use crate::{Hbb, Keypoint, Mask, Obb, Polygon, Prob}; use crate::{Hbb, Keypoint, Mask, Obb, Polygon, Prob, Text};
/// Container for inference results for each image. /// Container for inference results for each image.
/// ///
@ -10,7 +10,7 @@ use crate::{Hbb, Keypoint, Mask, Obb, Polygon, Prob};
/// ///
#[derive(Builder, Clone, Default)] #[derive(Builder, Clone, Default)]
pub struct Y { pub struct Y {
texts: Option<Vec<String>>, texts: Option<Vec<Text>>,
probs: Option<Vec<Prob>>, probs: Option<Vec<Prob>>,
keypoints: Option<Vec<Keypoint>>, keypoints: Option<Vec<Keypoint>>,
keypointss: Option<Vec<Vec<Keypoint>>>, keypointss: Option<Vec<Vec<Keypoint>>>,

View File

@ -127,7 +127,7 @@ impl Blip {
let ys = texts let ys = texts
.into_iter() .into_iter()
.map(|x| Y::default().with_texts(&[&x])) .map(|x| Y::default().with_texts(&[x.into()]))
.collect::<Vec<_>>(); .collect::<Vec<_>>();
Ok(ys) Ok(ys)

View File

@ -29,6 +29,20 @@ impl crate::Config {
Self::db().with_model_file("ppocr-v4-server-ch.onnx") Self::db().with_model_file("ppocr-v4-server-ch.onnx")
} }
fn ppocr_det_v5() -> Self {
Self::db()
.with_model_ixx(0, 2, (608, 640, 1600).into())
.with_model_ixx(0, 3, (608, 640, 1600).into())
}
pub fn ppocr_det_v5_mobile() -> Self {
Self::ppocr_det_v5().with_model_file("ppocr-v5-mobile.onnx")
}
pub fn ppocr_det_v5_server() -> Self {
Self::ppocr_det_v5().with_model_file("ppocr-v5-server.onnx")
}
pub fn db2() -> Self { pub fn db2() -> Self {
Self::db() Self::db()
.with_image_mean(&[0.798, 0.785, 0.772]) .with_image_mean(&[0.798, 0.785, 0.772])

View File

@ -277,13 +277,13 @@ impl Florence2 {
// postprocess // postprocess
let mut y = Y::default(); let mut y = Y::default();
if let Task::Caption(_) | Task::Ocr = x_textual { if let Task::Caption(_) | Task::Ocr = x_textual {
y = y.with_texts(&[&text]); y = y.with_texts(&[text.into()]);
} else { } else {
let elems = Self::loc_parse(&text)?; let elems = Self::loc_parse(&text)?;
match x_textual { match x_textual {
Task::RegionToCategory(..) | Task::RegionToDescription(..) => { Task::RegionToCategory(..) | Task::RegionToDescription(..) => {
let text = elems[0][0].clone(); let text = elems[0][0].clone();
y = y.with_texts(&[&text]); y = y.with_texts(&[text.into()]);
} }
Task::ObjectDetection Task::ObjectDetection
| Task::OpenSetDetection(_) | Task::OpenSetDetection(_)

View File

@ -107,7 +107,7 @@ impl Moondream2 {
_ => vec![198., 198., 24334., 1159., 25.], _ => vec![198., 198., 24334., 1159., 25.],
}; };
let text = self.generate_text(&input_ids, kv_cache)?; let text = self.generate_text(&input_ids, kv_cache)?;
let y = Y::default().with_texts(&[&text]); let y = Y::default().with_texts(&[text.into()]);
Ok(y) Ok(y)
} }
@ -120,7 +120,7 @@ impl Moondream2 {
.collect(); .collect();
let text = self.generate_text(&input_ids, kv_cache)?; let text = self.generate_text(&input_ids, kv_cache)?;
let y = Y::default().with_texts(&[&text]); let y = Y::default().with_texts(&[text.into()]);
Ok(y) Ok(y)
} }

View File

@ -2,7 +2,7 @@ use aksr::Builder;
use anyhow::Result; use anyhow::Result;
use ndarray::{s, Axis}; use ndarray::{s, Axis};
use crate::{elapsed, models::BaseModelVisual, Config, Image, Keypoint, Ts, Xs, Y}; use crate::{elapsed, models::BaseModelVisual, Config, Image, Keypoint, Text, Ts, Xs, Y};
#[derive(Builder, Debug)] #[derive(Builder, Debug)]
pub struct SLANet { pub struct SLANet {
@ -107,7 +107,11 @@ impl SLANet {
y_texts.extend_from_slice(&["</table>", "</body>", "</html>"]); y_texts.extend_from_slice(&["</table>", "</body>", "</html>"]);
} }
ys.push(Y::default().with_keypointss(&y_kpts).with_texts(&y_texts)); ys.push(
Y::default()
.with_keypointss(&y_kpts)
.with_texts(&y_texts.into_iter().map(Text::from).collect::<Vec<_>>()),
);
} }
Ok(ys) Ok(ys)

View File

@ -92,7 +92,7 @@ impl SmolVLM {
let mut ys: Vec<Y> = Vec::new(); let mut ys: Vec<Y> = Vec::new();
for image in images.iter() { for image in images.iter() {
let y = self.generate_one(image, text)?; let y = self.generate_one(image, text)?;
ys.push(Y::default().with_texts(&[&y])); ys.push(Y::default().with_texts(&[y.into()]));
} }
Ok(ys) Ok(ys)

View File

@ -6,7 +6,7 @@ impl crate::Config {
.with_model_ixx(0, 0, (1, 1, 8).into()) .with_model_ixx(0, 0, (1, 1, 8).into())
.with_model_ixx(0, 1, 3.into()) .with_model_ixx(0, 1, 3.into())
.with_model_ixx(0, 2, 48.into()) .with_model_ixx(0, 2, 48.into())
.with_model_ixx(0, 3, (320, 960, 1600).into()) .with_model_ixx(0, 3, (320, 960, 3200).into())
.with_resize_mode(crate::ResizeMode::FitHeight) .with_resize_mode(crate::ResizeMode::FitHeight)
.with_padding_value(0) .with_padding_value(0)
.with_normalize(true) .with_normalize(true)
@ -56,4 +56,16 @@ impl crate::Config {
pub fn svtr_v2_student_ch() -> Self { pub fn svtr_v2_student_ch() -> Self {
Self::svtr_ch().with_model_file("v2-distill-student-ch.onnx") Self::svtr_ch().with_model_file("v2-distill-student-ch.onnx")
} }
fn ppocr_rec_v5() -> Self {
Self::svtr().with_vocab_txt("svtr/vocab_v5_ppocr_rec.txt")
}
pub fn ppocr_rec_v5_mobile() -> Self {
Self::ppocr_rec_v5().with_model_file("ppocr-v5-mobile.onnx")
}
pub fn ppocr_rec_v5_server() -> Self {
Self::ppocr_rec_v5().with_model_file("ppocr-v5-server.onnx")
}
} }

View File

@ -3,7 +3,7 @@ use anyhow::Result;
use ndarray::Axis; use ndarray::Axis;
use rayon::prelude::*; use rayon::prelude::*;
use crate::{elapsed, Config, DynConf, Engine, Image, Processor, Ts, Xs, Y}; use crate::{elapsed, Config, DynConf, Engine, Image, Processor, Text, Ts, Xs, Y};
#[derive(Builder, Debug)] #[derive(Builder, Debug)]
pub struct SVTR { pub struct SVTR {
@ -80,13 +80,14 @@ impl SVTR {
preds.dedup_by(|a, b| a.0 == b.0); preds.dedup_by(|a, b| a.0 == b.0);
let text: String = preds let (text, confs): (String, Vec<f32>) = preds
.into_iter() .into_iter()
.filter(|(id, &conf)| *id != 0 && conf >= self.confs[0]) .filter(|(id, &conf)| *id != 0 && conf >= self.confs[0])
.map(|(id, _)| self.processor.vocab()[id].clone()) .map(|(id, &conf)| (self.processor.vocab()[id].clone(), conf))
.collect(); .collect();
Y::default().with_texts(&[&text]) Y::default().with_texts(&[Text::from(text)
.with_confidence(confs.iter().sum::<f32>() / confs.len() as f32)])
}) })
.collect(); .collect();

View File

@ -188,7 +188,7 @@ impl TrOCR {
// to texts // to texts
let texts = texts let texts = texts
.into_par_iter() .into_par_iter()
.map(|x| Y::default().with_texts(&[&x])) .map(|x| Y::default().with_texts(&[x.into()]))
.collect::<Vec<_>>(); .collect::<Vec<_>>();
Ok(texts) Ok(texts)