mirror of
https://github.com/mii443/tokenizers.git
synced 2025-08-22 16:25:30 +00:00
Fixing the progressbar. (#1353)
* Fixing the progressbar. * Upgrade deps. * Update cargo audit * Ssh this action. * Fixing esaxx by using slower rust version. * Trying the new esaxx version. * Publish. * Get cache again.
This commit is contained in:
6
.github/workflows/python.yml
vendored
6
.github/workflows/python.yml
vendored
@ -54,7 +54,7 @@ jobs:
|
||||
os: [ubuntu-latest, macos-latest]
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@v1
|
||||
uses: actions/checkout@v2
|
||||
|
||||
|
||||
- name: Install Rust
|
||||
@ -99,9 +99,7 @@ jobs:
|
||||
uses: actions-rs/cargo@v1
|
||||
with:
|
||||
command: audit
|
||||
# ignoring specific CVE which probably isn't affecting this crate
|
||||
# https://github.com/chronotope/chrono/issues/602
|
||||
args: -D warnings -f ./bindings/python/Cargo.lock --ignore RUSTSEC-2020-0071 --ignore RUSTSEC-2021-0145
|
||||
args: -D warnings -f ./bindings/python/Cargo.lock
|
||||
|
||||
- name: Install
|
||||
working-directory: ./bindings/python
|
||||
|
4
.github/workflows/rust.yml
vendored
4
.github/workflows/rust.yml
vendored
@ -85,9 +85,7 @@ jobs:
|
||||
uses: actions-rs/cargo@v1
|
||||
with:
|
||||
command: audit
|
||||
# ignoring specific CVE which probably isn't affecting this crate
|
||||
# https://github.com/chronotope/chrono/issues/602
|
||||
args: -D warnings -f ./tokenizers/Cargo.lock --ignore RUSTSEC-2020-0071 --ignore RUSTSEC-2021-0145
|
||||
args: -D warnings -f ./tokenizers/Cargo.lock
|
||||
|
||||
# Verify that Readme.md is up to date.
|
||||
- name: Make sure, Readme generated from lib.rs matches actual Readme
|
||||
|
@ -9,25 +9,23 @@ name = "tokenizers"
|
||||
crate-type = ["cdylib"]
|
||||
|
||||
[dependencies]
|
||||
rayon = "1.3"
|
||||
rayon = "1.8"
|
||||
serde = { version = "1.0", features = [ "rc", "derive" ]}
|
||||
serde_json = "1.0"
|
||||
libc = "0.2"
|
||||
env_logger = "0.7.1"
|
||||
env_logger = "0.10.0"
|
||||
pyo3 = { version = "0.19" }
|
||||
numpy = "0.19.0"
|
||||
ndarray = "0.13"
|
||||
onig = { version = "6.0", default-features = false }
|
||||
itertools = "0.9"
|
||||
ndarray = "0.15"
|
||||
onig = { version = "6.4", default-features = false }
|
||||
itertools = "0.11"
|
||||
|
||||
[dependencies.tokenizers]
|
||||
version = "0.14.1-dev.0"
|
||||
path = "../../tokenizers"
|
||||
default-features = false
|
||||
features = ["onig"]
|
||||
|
||||
[dev-dependencies]
|
||||
tempfile = "3.1"
|
||||
tempfile = "3.8"
|
||||
pyo3 = { version = "0.19", features = ["auto-initialize"] }
|
||||
|
||||
[features]
|
||||
|
@ -45,31 +45,31 @@ harness = false
|
||||
[dependencies]
|
||||
lazy_static = "1.4"
|
||||
rand = "0.8"
|
||||
onig = { version = "6.0", default-features = false, optional = true }
|
||||
regex = "1.8"
|
||||
onig = { version = "6.4", default-features = false, optional = true }
|
||||
regex = "1.9"
|
||||
regex-syntax = "0.7"
|
||||
rayon = "1.7"
|
||||
rayon-cond = "0.1"
|
||||
rayon = "1.8"
|
||||
rayon-cond = "0.3"
|
||||
serde = { version = "1.0", features = [ "derive" ] }
|
||||
serde_json = "1.0"
|
||||
clap = { version = "4.0", features=["derive"], optional = true }
|
||||
clap = { version = "4.4", features=["derive"], optional = true }
|
||||
unicode-normalization-alignments = "0.1"
|
||||
unicode_categories = "0.1"
|
||||
unicode-segmentation = "1.10"
|
||||
indicatif = {version = "0.15", optional = true}
|
||||
itertools = "0.9"
|
||||
indicatif = {version = "0.17", optional = true}
|
||||
itertools = "0.11"
|
||||
log = "0.4"
|
||||
derive_builder = "0.12"
|
||||
spm_precompiled = "0.1"
|
||||
hf-hub = { version = "0.2.0", optional = true }
|
||||
aho-corasick = "0.7"
|
||||
paste = "1.0.6"
|
||||
macro_rules_attribute = "0.1.2"
|
||||
thiserror = "1.0.30"
|
||||
fancy-regex = { version = "0.10", optional = true}
|
||||
getrandom = { version = "0.2.6" }
|
||||
esaxx-rs = { version = "0.1", default-features = false, features=[]}
|
||||
monostate = "0.1.5"
|
||||
hf-hub = { version = "0.3.2", optional = true }
|
||||
aho-corasick = "1.1"
|
||||
paste = "1.0.14"
|
||||
macro_rules_attribute = "0.2.0"
|
||||
thiserror = "1.0.49"
|
||||
fancy-regex = { version = "0.11", optional = true}
|
||||
getrandom = { version = "0.2.10" }
|
||||
esaxx-rs = { version = "0.1.10", default-features = false, features=[]}
|
||||
monostate = "0.1.9"
|
||||
|
||||
[features]
|
||||
default = ["progressbar", "cli", "onig", "esaxx_fast"]
|
||||
@ -80,8 +80,8 @@ cli = ["clap"]
|
||||
unstable_wasm = ["fancy-regex", "getrandom/js"]
|
||||
|
||||
[dev-dependencies]
|
||||
criterion = "0.4"
|
||||
tempfile = "3.1"
|
||||
criterion = "0.5"
|
||||
tempfile = "3.8"
|
||||
assert_approx_eq = "1.1"
|
||||
|
||||
[profile.release]
|
||||
|
@ -223,7 +223,8 @@ impl BpeTrainer {
|
||||
let p = ProgressBar::new(0);
|
||||
p.set_style(
|
||||
ProgressStyle::default_bar()
|
||||
.template("[{elapsed_precise}] {msg:<40!} {wide_bar} {pos:<9!}/{len:>9!}"),
|
||||
.template("[{elapsed_precise}] {msg:<30!} {wide_bar} {pos:<9!}/{len:>9!}")
|
||||
.expect("Invalid progress template"),
|
||||
);
|
||||
Some(p)
|
||||
} else {
|
||||
@ -241,11 +242,10 @@ impl BpeTrainer {
|
||||
}
|
||||
|
||||
/// Update the progress bar with the new provided length and message
|
||||
fn update_progress(&self, p: &Option<ProgressBar>, len: usize, message: &str) {
|
||||
fn update_progress(&self, p: &Option<ProgressBar>, len: usize, message: &'static str) {
|
||||
if let Some(p) = p {
|
||||
p.set_message(message);
|
||||
p.set_length(len as u64);
|
||||
p.set_draw_delta(len as u64 / 100);
|
||||
p.reset();
|
||||
}
|
||||
}
|
||||
|
@ -88,7 +88,8 @@ impl UnigramTrainer {
|
||||
let p = ProgressBar::new(0);
|
||||
p.set_style(
|
||||
ProgressStyle::default_bar()
|
||||
.template("[{elapsed_precise}] {msg:<40!} {wide_bar} {pos:<9!}/{len:>9!}"),
|
||||
.template("[{elapsed_precise}] {msg:<30!} {wide_bar} {pos:<9!}/{len:>9!}")
|
||||
.expect("Invalid progress template"),
|
||||
);
|
||||
Some(p)
|
||||
} else {
|
||||
@ -431,11 +432,10 @@ impl UnigramTrainer {
|
||||
}
|
||||
|
||||
/// Update the progress bar with the new provided length and message
|
||||
fn update_progress(&self, p: &Option<ProgressBar>, len: usize, message: &str) {
|
||||
fn update_progress(&self, p: &Option<ProgressBar>, len: usize, message: &'static str) {
|
||||
if let Some(p) = p {
|
||||
p.set_message(message);
|
||||
p.set_length(len as u64);
|
||||
p.set_draw_delta(len as u64 / 100);
|
||||
p.reset();
|
||||
}
|
||||
}
|
||||
|
@ -166,10 +166,12 @@ impl AddedVocabulary {
|
||||
pub fn new() -> Self {
|
||||
let trie = AhoCorasickBuilder::new()
|
||||
.match_kind(MatchKind::LeftmostLongest)
|
||||
.build::<_, &&[u8]>([]);
|
||||
.build::<_, &&[u8]>([])
|
||||
.expect("The trie should build correctly");
|
||||
let normalized_trie = AhoCorasickBuilder::new()
|
||||
.match_kind(MatchKind::LeftmostLongest)
|
||||
.build::<_, &&[u8]>([]);
|
||||
.build::<_, &&[u8]>([])
|
||||
.expect("The normalized trie should build correctly");
|
||||
Self {
|
||||
added_tokens_map: HashMap::new(),
|
||||
added_tokens_map_r: HashMap::new(),
|
||||
@ -314,7 +316,8 @@ impl AddedVocabulary {
|
||||
let (tokens, ids): (Vec<&AddedToken>, Vec<u32>) = non_normalized.into_iter().unzip();
|
||||
let trie = AhoCorasickBuilder::new()
|
||||
.match_kind(MatchKind::LeftmostLongest)
|
||||
.build(tokens.iter().map(|token| &token.content));
|
||||
.build(tokens.iter().map(|token| &token.content))
|
||||
.expect("Failed to build tried when refreshing tokens");
|
||||
self.split_trie = (trie, ids);
|
||||
|
||||
let (ntokens, nids): (Vec<&AddedToken>, Vec<u32>) = normalized.into_iter().unzip();
|
||||
@ -330,7 +333,8 @@ impl AddedVocabulary {
|
||||
.collect();
|
||||
let normalized_trie = AhoCorasickBuilder::new()
|
||||
.match_kind(MatchKind::LeftmostLongest)
|
||||
.build(patterns.iter().map(|content| content.get()));
|
||||
.build(patterns.iter().map(|content| content.get()))
|
||||
.expect("Failed to build tried when refreshing tokens (normalized)");
|
||||
self.split_normalized_trie = (normalized_trie, nids);
|
||||
}
|
||||
|
||||
|
@ -1078,11 +1078,11 @@ where
|
||||
let progress = ProgressBar::new(len);
|
||||
progress.set_style(
|
||||
ProgressStyle::default_bar()
|
||||
.template("[{elapsed_precise}] {msg:<40!} {wide_bar} {percent:>18!}%"),
|
||||
.template("[{elapsed_precise}] {msg:<30!} {wide_bar} {percent:>18!}%")
|
||||
.expect("Invalid progress template"),
|
||||
);
|
||||
progress
|
||||
.set_message(&format!("Pre-processing files ({:.2} Mo)", len / 1_000_000));
|
||||
progress.set_draw_delta(len / 100); // Redraw only every 2%
|
||||
.set_message(format!("Pre-processing files ({:.2} Mo)", len / 1_000_000));
|
||||
Some(progress)
|
||||
} else {
|
||||
None
|
||||
@ -1131,15 +1131,10 @@ where
|
||||
let progress = ProgressBar::new(len);
|
||||
progress.set_style(
|
||||
ProgressStyle::default_bar()
|
||||
.template("[{elapsed_precise}] {msg:<40!} {wide_bar} {pos:<9!}/{len:>9!}"),
|
||||
.template("[{elapsed_precise}] {msg:<30!} {wide_bar} {pos:<9!}/{len:>9!}")
|
||||
.expect("Invalid progress template"),
|
||||
);
|
||||
progress.set_message("Pre-processing sequences");
|
||||
if len > 0 {
|
||||
progress.set_draw_delta(len / 100); // Redraw only every 2%
|
||||
} else {
|
||||
// Trying to have a good default to avoid progress tracking being the bottleneck
|
||||
progress.set_draw_delta(1000);
|
||||
}
|
||||
Some(progress)
|
||||
} else {
|
||||
None
|
||||
|
@ -3,6 +3,7 @@ pub(crate) use indicatif::{ProgressBar, ProgressStyle};
|
||||
|
||||
#[cfg(not(feature = "progressbar"))]
|
||||
mod progressbar {
|
||||
use std::borrow::Cow;
|
||||
pub struct ProgressBar;
|
||||
impl ProgressBar {
|
||||
pub fn new(_length: u64) -> Self {
|
||||
@ -10,8 +11,7 @@ mod progressbar {
|
||||
}
|
||||
|
||||
pub fn set_length(&self, _length: u64) {}
|
||||
pub fn set_draw_delta(&self, _draw_delta: u64) {}
|
||||
pub fn set_message(&self, _message: &str) {}
|
||||
pub fn set_message(&self, _message: impl Into<Cow<'static, str>>) {}
|
||||
pub fn finish(&self) {}
|
||||
pub fn reset(&self) {}
|
||||
pub fn inc(&self, _inc: u64) {}
|
||||
@ -23,8 +23,8 @@ mod progressbar {
|
||||
pub fn default_bar() -> Self {
|
||||
Self {}
|
||||
}
|
||||
pub fn template(self, _template: &str) -> Self {
|
||||
self
|
||||
pub fn template(self, _template: &str) -> Result<Self, String> {
|
||||
Ok(self)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
Reference in New Issue
Block a user