Add PPOCRv5 DET and REC models (#98)

* Add PPOCRv5 DET and REC models

* Add Text Struct
This commit is contained in:
Jamjamjon
2025-05-21 13:29:17 +08:00
committed by GitHub
parent a2dde94a3a
commit cf21eabcaa
16 changed files with 137 additions and 38 deletions

View File

@ -28,9 +28,13 @@ struct Args {
#[argh(option, default = "false")]
show_hbbs_conf: bool,
/// show mbrs confidence
/// show obbs confidence
#[argh(option, default = "false")]
show_obbs_conf: bool,
/// show polygons confidence
#[argh(option, default = "false")]
show_polygons_conf: bool,
}
fn main() -> Result<()> {
@ -43,7 +47,7 @@ fn main() -> Result<()> {
// build model
let config = match &args.model {
Some(m) => Config::db().with_model_file(m),
None => Config::ppocr_det_v4_ch().with_model_dtype(args.dtype.as_str().try_into()?),
None => Config::ppocr_det_v5_mobile().with_model_dtype(args.dtype.as_str().try_into()?),
}
.with_device_all(args.device.as_str().try_into()?)
.commit()?;
@ -66,16 +70,16 @@ fn main() -> Result<()> {
.with_polygon_style(
Style::polygon()
.with_visible(true)
.with_text_visible(false)
.show_confidence(true)
.show_id(true)
.show_name(true)
.with_text_visible(true)
.show_confidence(args.show_polygons_conf)
.show_id(false)
.show_name(false)
.with_color(usls::StyleColors::default().with_outline([255, 105, 180, 255].into())),
)
.with_hbb_style(
Style::hbb()
.with_visible(args.show_hbbs)
.with_text_visible(false)
.with_text_visible(true)
.with_thickness(1)
.show_confidence(args.show_hbbs_conf)
.show_id(false)
@ -84,7 +88,7 @@ fn main() -> Result<()> {
.with_obb_style(
Style::obb()
.with_visible(args.show_obbs)
.with_text_visible(false)
.with_text_visible(true)
.show_confidence(args.show_obbs_conf)
.show_id(false)
.show_name(false),

View File

@ -47,7 +47,7 @@ fn main() -> Result<()> {
for y in ys.iter() {
if let Some(texts) = y.texts() {
for text in texts {
println!("[User]: {}\n\n[Assistant]:{}", args.prompt, text);
println!("[User]: {}\n\n[Assistant]:{:?}", args.prompt, text);
}
}
}

View File

@ -7,15 +7,14 @@ cargo run -r -F cuda --example svtr -- --device cuda
## Results
```shell
["./examples/svtr/images/license-ch-2.png"]: Ys([Y { Texts: [Text("粤A·68688")] }])
["./examples/svtr/images/license-ch.png"]: Ys([Y { Texts: [Text("冀B6G000")] }])
["./examples/svtr/images/sign-ch-2.png"]: Ys([Y { Texts: [Text("我在南锣鼓捣猫呢")] }])
["./examples/svtr/images/sign-ch.png"]: Ys([Y { Texts: [Text("小菊儿胡同71号")] }])
["./examples/svtr/images/text-110022345.png"]: Ys([Y { Texts: [Text("110022345")] }])
["./examples/svtr/images/text-ch.png"]: Ys([Y { Texts: [Text("你有这么高速运转的机械进入中国,记住我给出的原理")] }])
["./examples/svtr/images/text-en-2.png"]: Ys([Y { Texts: [Text("from the background, but also separate text instances which")] }])
["./examples/svtr/images/text-en-dark.png"]: Ys([Y { Texts: [Text("Please lower your volume")] }])
["./examples/svtr/images/text-en.png"]: Ys([Y { Texts: [Text("are closely jointed. Some examples are illustrated in Fig.7.")] }])
["./examples/svtr/images/text-hello-rust-handwritten.png"]: Ys([Y { Texts: [Text("HeloRuSt")] }])
```
ys: [Y { Texts: [Text { text: "粤A68688", confidence: 0.9940011 }] }]
ys: [Y { Texts: [Text { text: "冀B6G000", confidence: 0.86073524 }] }]
ys: [Y { Texts: [Text { text: "我在南锣鼓捣猫呢", confidence: 0.99346924 }] }]
ys: [Y { Texts: [Text { text: "小菊儿胡同71号", confidence: 0.99450684 }] }]
ys: [Y { Texts: [Text { text: "110022345", confidence: 0.99994576 }] }]
ys: [Y { Texts: [Text { text: "你有这么高速运转的机械进入中国,记住我给出的原理", confidence: 0.9996338 }] }]
ys: [Y { Texts: [Text { text: "from the background, but also separate text instances which", confidence: 0.9954648 }] }]
ys: [Y { Texts: [Text { text: "Please lower your yolume", confidence: 0.93910724 }] }]
ys: [Y { Texts: [Text { text: "are closely jointed. Some examples are illustrated in Fig.7.", confidence: 0.9959717 }] }]
ys: [Y { Texts: [Text { text: "HelloRust", confidence: 0.97661674 }] }]
```

View File

@ -11,6 +11,10 @@ struct Args {
/// dtype
#[argh(option, default = "String::from(\"auto\")")]
dtype: String,
/// max text length
#[argh(option, default = "960")]
max_text_length: usize,
}
fn main() -> Result<()> {
@ -22,9 +26,12 @@ fn main() -> Result<()> {
let args: Args = argh::from_env();
// build model
let config = Config::ppocr_rec_v4_ch()
let config = Config::ppocr_rec_v5_mobile()
// ppocr_rec_v5_server()
// ppocr_rec_v4_ch()
// ppocr_rec_v4_en()
// repsvtr_ch()
.with_model_ixx(0, 3, args.max_text_length.into())
.with_model_device(args.device.as_str().try_into()?)
.with_model_dtype(args.dtype.as_str().try_into()?)
.commit()?;

View File

@ -9,6 +9,7 @@ mod obb;
mod polygon;
mod prob;
mod skeleton;
mod text;
mod x;
mod xs;
mod y;
@ -29,6 +30,7 @@ pub use obb::*;
pub use polygon::*;
pub use prob::*;
pub use skeleton::*;
pub use text::*;
pub use x::X;
pub use xs::Xs;
pub use y::*;

56
src/inference/text.rs Normal file
View File

@ -0,0 +1,56 @@
use aksr::Builder;
use crate::{impl_meta_methods, InstanceMeta, Style};
/// A text result that pairs a string with instance metadata and an optional style.
///
/// The `Debug` implementation in this file prints the text together with
/// whichever of the metadata's id, name and confidence values are present.
#[derive(Builder, Clone, Default)]
pub struct Text {
// The text content itself.
text: String,
// Instance metadata; its `id()`, `name()` and `confidence()` accessors are read by the `Debug` impl.
meta: InstanceMeta,
// Optional style; `None` when the value is built through `Default`.
style: Option<Style>,
}
impl std::fmt::Debug for Text {
    /// Renders the value as a `Text { .. }` debug struct.
    ///
    /// The `text` field is always written; `id`, `name` and `confidence`
    /// are written only when the metadata actually carries them, so unset
    /// values never show up as `None` in the output.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let mut out = f.debug_struct("Text");
        out.field("text", &self.text);

        if let Some(id) = self.meta.id() {
            out.field("id", &id);
        }
        if let Some(name) = self.meta.name() {
            out.field("name", &name);
        }
        if let Some(confidence) = self.meta.confidence() {
            out.field("confidence", &confidence);
        }

        out.finish()
    }
}
impl From<String> for Text {
    /// Wraps an owned string, leaving the metadata and style at their defaults.
    fn from(text: String) -> Self {
        Self {
            text,
            meta: Default::default(),
            style: None,
        }
    }
}
impl From<&str> for Text {
    /// Copies the borrowed string into a new `Text` with default metadata and style.
    fn from(text: &str) -> Self {
        // Reuse the owned-string conversion so both paths build the value the same way.
        Self::from(String::from(text))
    }
}
impl Text {
// Expands the crate's `impl_meta_methods!` macro inside this inherent impl.
// NOTE(review): presumably this generates the metadata accessors and `with_*`
// setters that forward to `self.meta` (e.g. `with_confidence`) — confirm
// against the macro definition.
impl_meta_methods!();
}

View File

@ -1,6 +1,6 @@
use aksr::Builder;
use crate::{Hbb, Keypoint, Mask, Obb, Polygon, Prob};
use crate::{Hbb, Keypoint, Mask, Obb, Polygon, Prob, Text};
/// Container for inference results for each image.
///
@ -10,7 +10,7 @@ use crate::{Hbb, Keypoint, Mask, Obb, Polygon, Prob};
///
#[derive(Builder, Clone, Default)]
pub struct Y {
texts: Option<Vec<String>>,
texts: Option<Vec<Text>>,
probs: Option<Vec<Prob>>,
keypoints: Option<Vec<Keypoint>>,
keypointss: Option<Vec<Vec<Keypoint>>>,

View File

@ -127,7 +127,7 @@ impl Blip {
let ys = texts
.into_iter()
.map(|x| Y::default().with_texts(&[&x]))
.map(|x| Y::default().with_texts(&[x.into()]))
.collect::<Vec<_>>();
Ok(ys)

View File

@ -29,6 +29,20 @@ impl crate::Config {
Self::db().with_model_file("ppocr-v4-server-ch.onnx")
}
/// Shared base for the PP-OCRv5 detection presets.
///
/// Starts from [`Self::db`] and calls `with_model_ixx` for input 0 on
/// axes 2 and 3, each with `(608, 640, 1600)`.
// NOTE(review): the triple presumably encodes (min, opt, max) for a dynamic
// dimension — confirm against `with_model_ixx`.
fn ppocr_det_v5() -> Self {
    let cfg = Self::db();
    let cfg = cfg.with_model_ixx(0, 2, (608, 640, 1600).into());
    cfg.with_model_ixx(0, 3, (608, 640, 1600).into())
}
/// PP-OCRv5 detection preset that uses the `ppocr-v5-mobile.onnx` model file.
pub fn ppocr_det_v5_mobile() -> Self {
    let base = Self::ppocr_det_v5();
    base.with_model_file("ppocr-v5-mobile.onnx")
}
/// PP-OCRv5 detection preset that uses the `ppocr-v5-server.onnx` model file.
pub fn ppocr_det_v5_server() -> Self {
    let base = Self::ppocr_det_v5();
    base.with_model_file("ppocr-v5-server.onnx")
}
pub fn db2() -> Self {
Self::db()
.with_image_mean(&[0.798, 0.785, 0.772])

View File

@ -277,13 +277,13 @@ impl Florence2 {
// postprocess
let mut y = Y::default();
if let Task::Caption(_) | Task::Ocr = x_textual {
y = y.with_texts(&[&text]);
y = y.with_texts(&[text.into()]);
} else {
let elems = Self::loc_parse(&text)?;
match x_textual {
Task::RegionToCategory(..) | Task::RegionToDescription(..) => {
let text = elems[0][0].clone();
y = y.with_texts(&[&text]);
y = y.with_texts(&[text.into()]);
}
Task::ObjectDetection
| Task::OpenSetDetection(_)

View File

@ -107,7 +107,7 @@ impl Moondream2 {
_ => vec![198., 198., 24334., 1159., 25.],
};
let text = self.generate_text(&input_ids, kv_cache)?;
let y = Y::default().with_texts(&[&text]);
let y = Y::default().with_texts(&[text.into()]);
Ok(y)
}
@ -120,7 +120,7 @@ impl Moondream2 {
.collect();
let text = self.generate_text(&input_ids, kv_cache)?;
let y = Y::default().with_texts(&[&text]);
let y = Y::default().with_texts(&[text.into()]);
Ok(y)
}

View File

@ -2,7 +2,7 @@ use aksr::Builder;
use anyhow::Result;
use ndarray::{s, Axis};
use crate::{elapsed, models::BaseModelVisual, Config, Image, Keypoint, Ts, Xs, Y};
use crate::{elapsed, models::BaseModelVisual, Config, Image, Keypoint, Text, Ts, Xs, Y};
#[derive(Builder, Debug)]
pub struct SLANet {
@ -107,7 +107,11 @@ impl SLANet {
y_texts.extend_from_slice(&["</table>", "</body>", "</html>"]);
}
ys.push(Y::default().with_keypointss(&y_kpts).with_texts(&y_texts));
ys.push(
Y::default()
.with_keypointss(&y_kpts)
.with_texts(&y_texts.into_iter().map(Text::from).collect::<Vec<_>>()),
);
}
Ok(ys)

View File

@ -92,7 +92,7 @@ impl SmolVLM {
let mut ys: Vec<Y> = Vec::new();
for image in images.iter() {
let y = self.generate_one(image, text)?;
ys.push(Y::default().with_texts(&[&y]));
ys.push(Y::default().with_texts(&[y.into()]));
}
Ok(ys)

View File

@ -6,7 +6,7 @@ impl crate::Config {
.with_model_ixx(0, 0, (1, 1, 8).into())
.with_model_ixx(0, 1, 3.into())
.with_model_ixx(0, 2, 48.into())
.with_model_ixx(0, 3, (320, 960, 1600).into())
.with_model_ixx(0, 3, (320, 960, 3200).into())
.with_resize_mode(crate::ResizeMode::FitHeight)
.with_padding_value(0)
.with_normalize(true)
@ -56,4 +56,16 @@ impl crate::Config {
pub fn svtr_v2_student_ch() -> Self {
Self::svtr_ch().with_model_file("v2-distill-student-ch.onnx")
}
/// Shared base for the PP-OCRv5 recognition presets: [`Self::svtr`] with the
/// `svtr/vocab_v5_ppocr_rec.txt` vocabulary file.
fn ppocr_rec_v5() -> Self {
    let base = Self::svtr();
    base.with_vocab_txt("svtr/vocab_v5_ppocr_rec.txt")
}
/// PP-OCRv5 recognition preset that uses the `ppocr-v5-mobile.onnx` model file.
pub fn ppocr_rec_v5_mobile() -> Self {
    let base = Self::ppocr_rec_v5();
    base.with_model_file("ppocr-v5-mobile.onnx")
}
/// PP-OCRv5 recognition preset that uses the `ppocr-v5-server.onnx` model file.
pub fn ppocr_rec_v5_server() -> Self {
    let base = Self::ppocr_rec_v5();
    base.with_model_file("ppocr-v5-server.onnx")
}
}

View File

@ -3,7 +3,7 @@ use anyhow::Result;
use ndarray::Axis;
use rayon::prelude::*;
use crate::{elapsed, Config, DynConf, Engine, Image, Processor, Ts, Xs, Y};
use crate::{elapsed, Config, DynConf, Engine, Image, Processor, Text, Ts, Xs, Y};
#[derive(Builder, Debug)]
pub struct SVTR {
@ -80,13 +80,14 @@ impl SVTR {
preds.dedup_by(|a, b| a.0 == b.0);
let text: String = preds
let (text, confs): (String, Vec<f32>) = preds
.into_iter()
.filter(|(id, &conf)| *id != 0 && conf >= self.confs[0])
.map(|(id, _)| self.processor.vocab()[id].clone())
.map(|(id, &conf)| (self.processor.vocab()[id].clone(), conf))
.collect();
Y::default().with_texts(&[&text])
Y::default().with_texts(&[Text::from(text)
.with_confidence(confs.iter().sum::<f32>() / confs.len() as f32)])
})
.collect();

View File

@ -188,7 +188,7 @@ impl TrOCR {
// to texts
let texts = texts
.into_par_iter()
.map(|x| Y::default().with_texts(&[&x]))
.map(|x| Y::default().with_texts(&[x.into()]))
.collect::<Vec<_>>();
Ok(texts)