Mirror of https://github.com/mii443/tokenizers.git, synced 2025-08-23 00:35:35 +00:00

Attempting to add UnigramTrainer to Python bindings.
@@ -172,3 +172,75 @@ impl PyWordPieceTrainer {
         ))
     }
 }
+
+#[pyclass(extends=PyTrainer, name=UnigramTrainer)]
+pub struct PyUnigramTrainer {}
+#[pymethods]
+impl PyUnigramTrainer {
+    /// Create a new UnigramTrainer with the given configuration
+    #[new]
+    #[args(kwargs = "**")]
+    pub fn new(kwargs: Option<&PyDict>) -> PyResult<(Self, PyTrainer)> {
+        let mut builder = tk::models::unigram::UnigramTrainer::builder();
+        if let Some(kwargs) = kwargs {
+            for (key, val) in kwargs {
+                let key: &str = key.extract()?;
+                match key {
+                    "vocab_size" => builder.vocab_size(val.extract()?),
+                    "show_progress" => builder.show_progress(val.extract()?),
+                    "n_sub_iterations" => builder.n_sub_iterations(val.extract()?),
+                    "shrinking_factor" => builder.shrinking_factor(val.extract()?),
+                    "space_char" => {
+                        let string: String = val.extract()?;
+                        if string.chars().collect::<Vec<_>>().len() != 1 {
+                            return Err(exceptions::Exception::py_err(
+                                "space_char must be 1 unicode char long",
+                            ));
+                        }
+                        builder.space_char(string.chars().next().ok_or_else(|| {
+                            exceptions::Exception::py_err("space_char must not be 0 width")
+                        })?)
+                    }
+                    "unk_token" => builder.unk_token(val.extract()?),
+                    "split_by_number" => builder.split_by_number(val.extract()?),
+                    "treat_whitespace_as_suffix" => {
+                        builder.treat_whitespace_as_suffix(val.extract()?)
+                    }
+                    "split_by_unicode_script" => builder.split_by_unicode_script(val.extract()?),
+                    "split_by_digits" => builder.split_by_digits(val.extract()?),
+                    "split_by_whitespace" => builder.split_by_whitespace(val.extract()?),
+                    "max_piece_length" => builder.max_piece_length(val.extract()?),
+                    "seed_size" => builder.seed_size(val.extract()?),
+                    "special_tokens" => builder.special_tokens(
+                        val.cast_as::<PyList>()?
+                            .into_iter()
+                            .map(|token| {
+                                if let Ok(content) = token.extract::<String>() {
+                                    Ok(PyAddedToken::from(content, Some(true)).get_token())
+                                } else if let Ok(mut token) =
+                                    token.extract::<PyRefMut<PyAddedToken>>()
+                                {
+                                    token.is_special_token = true;
+                                    Ok(token.get_token())
+                                } else {
+                                    Err(exceptions::Exception::py_err(
+                                        "special_tokens must be a List[Union[str, AddedToken]]",
+                                    ))
+                                }
+                            })
+                            .collect::<PyResult<Vec<_>>>()?,
+                    ),
+                    _ => {
+                        println!("Ignored unknown kwargs option {}", key);
+                        &mut builder
+                    }
+                };
+            }
+        }
+
+        let trainer: tokenizers::models::unigram::UnigramTrainer = builder
+            .build()
+            .map_err(|_| exceptions::Exception::py_err("Cannot build UnigramTrainer"))?;
+        Ok((PyUnigramTrainer {}, PyTrainer::new(trainer.into())))
+    }
+}
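Returning `(Self, PyTrainer)` from `#[new]` is the PyO3 pattern for constructing a `#[pyclass(extends=...)]` subclass together with its base object. A stripped-down sketch of that shape, using hypothetical stand-in types (`PyBase`/`PyChild` are not part of this commit; the base needs the `subclass` attribute to be extendable):

use pyo3::prelude::*;

#[pyclass(subclass)]
struct PyBase {}

#[pyclass(extends=PyBase)]
struct PyChild {}

#[pymethods]
impl PyChild {
    #[new]
    fn new() -> (Self, PyBase) {
        // PyO3 turns this tuple into a PyChild whose base slot holds the PyBase,
        // the same way (PyUnigramTrainer {}, PyTrainer::new(..)) is returned above.
        (PyChild {}, PyBase {})
    }
}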
@@ -11,7 +11,7 @@ use std::path::{Path, PathBuf};
 use serde::{Deserialize, Serialize, Serializer};
 
 use crate::models::bpe::{BpeTrainer, BPE};
-use crate::models::unigram::Unigram;
+use crate::models::unigram::{Unigram, UnigramTrainer};
 use crate::models::wordlevel::WordLevel;
 use crate::models::wordpiece::{WordPiece, WordPieceTrainer};
 use crate::{AddedToken, Model, Result, Token, Trainer};
@@ -117,6 +117,7 @@ impl Model for ModelWrapper {
 pub enum TrainerWrapper {
     BpeTrainer(BpeTrainer),
     WordPieceTrainer(WordPieceTrainer),
+    UnigramTrainer(UnigramTrainer),
 }
 
 impl Trainer for TrainerWrapper {
@@ -126,6 +127,7 @@ impl Trainer for TrainerWrapper {
         match self {
             TrainerWrapper::BpeTrainer(bpe) => bpe.should_show_progress(),
             TrainerWrapper::WordPieceTrainer(wpt) => wpt.should_show_progress(),
+            TrainerWrapper::UnigramTrainer(wpt) => wpt.should_show_progress(),
         }
     }
 
@@ -133,6 +135,7 @@ impl Trainer for TrainerWrapper {
         match self {
             TrainerWrapper::BpeTrainer(bpe) => bpe.train(words).map(|(m, t)| (m.into(), t)),
             TrainerWrapper::WordPieceTrainer(wpt) => wpt.train(words).map(|(m, t)| (m.into(), t)),
+            TrainerWrapper::UnigramTrainer(wpt) => wpt.train(words).map(|(m, t)| (m.into(), t)),
         }
     }
 
@@ -140,9 +143,11 @@ impl Trainer for TrainerWrapper {
         match self {
             TrainerWrapper::BpeTrainer(bpe) => bpe.process_tokens(words, tokens),
             TrainerWrapper::WordPieceTrainer(wpt) => wpt.process_tokens(words, tokens),
+            TrainerWrapper::UnigramTrainer(wpt) => wpt.process_tokens(words, tokens),
         }
     }
 }
 
 impl_enum_from!(BpeTrainer, TrainerWrapper, BpeTrainer);
 impl_enum_from!(WordPieceTrainer, TrainerWrapper, WordPieceTrainer);
+impl_enum_from!(UnigramTrainer, TrainerWrapper, UnigramTrainer);
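Judging by its name and by how `trainer.into()` is used in the Python binding above, `impl_enum_from!(UnigramTrainer, TrainerWrapper, UnigramTrainer)` presumably expands to a plain `From` impl along these lines (the macro body itself is not part of this diff):

impl From<UnigramTrainer> for TrainerWrapper {
    fn from(trainer: UnigramTrainer) -> Self {
        // Wrap the concrete trainer in the matching enum variant,
        // which is what allows `builder.build()?.into()` at the call site.
        TrainerWrapper::UnigramTrainer(trainer)
    }
}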
@@ -86,6 +86,10 @@ pub struct UnigramTrainer {
 }
 
 impl UnigramTrainer {
+    pub fn builder() -> UnigramTrainerBuilder {
+        UnigramTrainerBuilder::default()
+    }
+
     /// Setup a progress bar if asked to show progress
     fn setup_progress(&self) -> Option<ProgressBar> {
         if self.show_progress {
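A minimal usage sketch of the new entry point, mirroring how the Python binding drives it; `UnigramTrainerBuilder` looks like a derive_builder-style builder given the `::default()` call, and the setter names/types below are assumptions taken from the binding code, not from this hunk:

// Sketch only: assumes vocab_size/show_progress setters exist with these rough types.
use crate::models::unigram::UnigramTrainer;

fn build_default_trainer() -> UnigramTrainer {
    let mut builder = UnigramTrainer::builder();
    builder.vocab_size(8_000);    // assumed integer parameter
    builder.show_progress(false); // assumed boolean parameter
    builder.build().expect("Cannot build UnigramTrainer")
}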
@@ -132,9 +136,10 @@ impl UnigramTrainer {
         // This function checks that unicode "scripts" are consistent, so we cannot have romaji and
         // hiragana for instance. Seems pretty specific. Also Hiragana and katakana are mixed
         let raw_script = get_script(c);
+
         let script = if *c as u32 == 0x30FC {
             Script::Han
-        } else if *c as u32 == 32 || !self.split_by_number && c.is_numeric() {
+        } else if *c == self.space_char || !self.split_by_number && c.is_numeric() {
             Script::Any
         } else {
             match raw_script {
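The behavioural change in this hunk is that the catch-all `Script::Any` bucket is now keyed on the configured `space_char` instead of the hard-coded ASCII space (`*c as u32 == 32`), so a trainer configured with e.g. '▁' groups that marker the same way a space used to be grouped. Reduced to a hypothetical standalone predicate (not in the commit):

// Hypothetical restatement of only the changed condition above.
fn is_any_script(c: char, space_char: char, split_by_number: bool) -> bool {
    // Previously: c as u32 == 32 (ASCII space). Now: the configured space_char.
    c == space_char || (!split_by_number && c.is_numeric())
}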
@@ -99,7 +99,7 @@ fn test_sample() {
         *p /= z;
     }
 
-    let n_trials = 100_000;
+    let n_trials = 1_000;
     let mut freq: HashMap<String, u32> = HashMap::new();
     for _ in 0..n_trials {
         let string = lattice.sample_token(theta).join(" ");
@@ -245,6 +245,8 @@ fn test_spm_compat_train() {
     // println!("Stop train {:?}", model.get_vocab());
     // println!("Vocab {}", model.get_vocab().len());
 
+    model.save(Path::new("data"), Some("trained.json")).unwrap();
+
     let file = read_to_string(test_file).unwrap();
     let encoded = std::str::from_utf8(&output.stdout).unwrap();
 
@@ -253,9 +255,7 @@ fn test_spm_compat_train() {
     let mut n_tokenizer_tokens = 0;
     let mut n_spm_tokens = 0;
     for (tokenizer_line, spm_line) in file.lines().zip(encoded.lines()) {
-        println!("Tokenizer line {:?}", tokenizer_line);
-        println!("Spm line {:?}", spm_line);
-        let tokenizer_tokens = model.encode(tokenizer_line);
+        let tokenizer_tokens = model.encode(&tokenizer_line.replace(" ", "▁"));
         let mut spm_tokens: Vec<String> = spm_line
             .split(' ')
             .map(|s| s.to_string().replace('▁', " "))
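With this change the comparison normalizes whitespace the SentencePiece way on both sides: spaces become the '▁' (U+2581) marker before the trained model encodes a line, and '▁' in spm's output pieces is mapped back to spaces. A small sketch of that convention using hypothetical helpers (not part of the commit):

// Hypothetical helpers illustrating the ▁ (U+2581) whitespace convention used above.
fn to_model_input(line: &str) -> String {
    line.replace(' ', "▁")
}

fn from_spm_piece(piece: &str) -> String {
    piece.replace('▁', " ")
}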
@@ -274,6 +274,12 @@ fn test_spm_compat_train() {
         total += 1;
 
         // assert_eq!(tokenizer_tokens, spm_tokens, "Failed on line {}", i + 1,);
+        // println!("{} vs {}", tokenizer_tokens.len(), spm_tokens.len());
+        // assert!(tokenizer_tokens.len() <= spm_tokens.len());
+        // if spm_tokens.len() < tokenizer_tokens.len() {
+        //     println!("Tokenizer line {:?}", tokenizer_tokens.join(" "));
+        //     println!("Spm line {:?}", spm_line);
+        // }
     }
     let acc = (correct as f64) / (total as f64) * 100.0;
     println!("Total tokenizer tokens {}", n_tokenizer_tokens);