Attempting to add UnigramTrainer to python bindings.
@@ -172,3 +172,75 @@ impl PyWordPieceTrainer {
         ))
     }
 }
+
+#[pyclass(extends=PyTrainer, name=UnigramTrainer)]
+pub struct PyUnigramTrainer {}
+#[pymethods]
+impl PyUnigramTrainer {
+    /// Create a new UnigramTrainer with the given configuration
+    #[new]
+    #[args(kwargs = "**")]
+    pub fn new(kwargs: Option<&PyDict>) -> PyResult<(Self, PyTrainer)> {
+        let mut builder = tk::models::unigram::UnigramTrainer::builder();
+        if let Some(kwargs) = kwargs {
+            for (key, val) in kwargs {
+                let key: &str = key.extract()?;
+                match key {
+                    "vocab_size" => builder.vocab_size(val.extract()?),
+                    "show_progress" => builder.show_progress(val.extract()?),
+                    "n_sub_iterations" => builder.n_sub_iterations(val.extract()?),
+                    "shrinking_factor" => builder.shrinking_factor(val.extract()?),
+                    "space_char" => {
+                        let string: String = val.extract()?;
+                        if string.chars().collect::<Vec<_>>().len() != 1 {
+                            return Err(exceptions::Exception::py_err(
+                                "space_char must be 1 unicode char long",
+                            ));
+                        }
+                        builder.space_char(string.chars().next().ok_or_else(|| {
+                            exceptions::Exception::py_err("space_char must not be 0 width")
+                        })?)
+                    }
+                    "unk_token" => builder.unk_token(val.extract()?),
+                    "split_by_number" => builder.split_by_number(val.extract()?),
+                    "treat_whitespace_as_suffix" => {
+                        builder.treat_whitespace_as_suffix(val.extract()?)
+                    }
+                    "split_by_unicode_script" => builder.split_by_unicode_script(val.extract()?),
+                    "split_by_digits" => builder.split_by_digits(val.extract()?),
+                    "split_by_whitespace" => builder.split_by_whitespace(val.extract()?),
+                    "max_piece_length" => builder.max_piece_length(val.extract()?),
+                    "seed_size" => builder.seed_size(val.extract()?),
+                    "special_tokens" => builder.special_tokens(
+                        val.cast_as::<PyList>()?
+                            .into_iter()
+                            .map(|token| {
+                                if let Ok(content) = token.extract::<String>() {
+                                    Ok(PyAddedToken::from(content, Some(true)).get_token())
+                                } else if let Ok(mut token) =
+                                    token.extract::<PyRefMut<PyAddedToken>>()
+                                {
+                                    token.is_special_token = true;
+                                    Ok(token.get_token())
+                                } else {
+                                    Err(exceptions::Exception::py_err(
+                                        "special_tokens must be a List[Union[str, AddedToken]]",
+                                    ))
+                                }
+                            })
+                            .collect::<PyResult<Vec<_>>>()?,
+                    ),
+                    _ => {
+                        println!("Ignored unknown kwargs option {}", key);
+                        &mut builder
+                    }
+                };
+            }
+        }
+
+        let trainer: tokenizers::models::unigram::UnigramTrainer = builder
+            .build()
+            .map_err(|_| exceptions::Exception::py_err("Cannot build UnigramTrainer"))?;
+        Ok((PyUnigramTrainer {}, PyTrainer::new(trainer.into())))
+    }
+}
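For context, the kwargs loop above is only a dynamic front end for the Rust builder. A minimal sketch of the equivalent direct call, assuming the derive_builder-style setters used in the match arms (mutating the builder in place, with build() returning a Result), and the hypothetical option values 8000 / false:

// Sketch only; mirrors what the "vocab_size" and "show_progress" kwargs map to.
let mut builder = tk::models::unigram::UnigramTrainer::builder();
builder.vocab_size(8000);
builder.show_progress(false);
let trainer = builder
    .build()
    .map_err(|_| exceptions::Exception::py_err("Cannot build UnigramTrainer"))?;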
@@ -11,7 +11,7 @@ use std::path::{Path, PathBuf};
 use serde::{Deserialize, Serialize, Serializer};

 use crate::models::bpe::{BpeTrainer, BPE};
-use crate::models::unigram::Unigram;
+use crate::models::unigram::{Unigram, UnigramTrainer};
 use crate::models::wordlevel::WordLevel;
 use crate::models::wordpiece::{WordPiece, WordPieceTrainer};
 use crate::{AddedToken, Model, Result, Token, Trainer};
@@ -117,6 +117,7 @@ impl Model for ModelWrapper {
 pub enum TrainerWrapper {
     BpeTrainer(BpeTrainer),
     WordPieceTrainer(WordPieceTrainer),
+    UnigramTrainer(UnigramTrainer),
 }

 impl Trainer for TrainerWrapper {
@@ -126,6 +127,7 @@ impl Trainer for TrainerWrapper {
         match self {
             TrainerWrapper::BpeTrainer(bpe) => bpe.should_show_progress(),
             TrainerWrapper::WordPieceTrainer(wpt) => wpt.should_show_progress(),
+            TrainerWrapper::UnigramTrainer(wpt) => wpt.should_show_progress(),
         }
     }

@@ -133,6 +135,7 @@ impl Trainer for TrainerWrapper {
         match self {
             TrainerWrapper::BpeTrainer(bpe) => bpe.train(words).map(|(m, t)| (m.into(), t)),
             TrainerWrapper::WordPieceTrainer(wpt) => wpt.train(words).map(|(m, t)| (m.into(), t)),
+            TrainerWrapper::UnigramTrainer(wpt) => wpt.train(words).map(|(m, t)| (m.into(), t)),
         }
     }

@@ -140,9 +143,11 @@ impl Trainer for TrainerWrapper {
         match self {
             TrainerWrapper::BpeTrainer(bpe) => bpe.process_tokens(words, tokens),
             TrainerWrapper::WordPieceTrainer(wpt) => wpt.process_tokens(words, tokens),
+            TrainerWrapper::UnigramTrainer(wpt) => wpt.process_tokens(words, tokens),
         }
     }
 }

 impl_enum_from!(BpeTrainer, TrainerWrapper, BpeTrainer);
 impl_enum_from!(WordPieceTrainer, TrainerWrapper, WordPieceTrainer);
+impl_enum_from!(UnigramTrainer, TrainerWrapper, UnigramTrainer);
@@ -86,6 +86,10 @@ pub struct UnigramTrainer {
 }

 impl UnigramTrainer {
+    pub fn builder() -> UnigramTrainerBuilder {
+        UnigramTrainerBuilder::default()
+    }
+
     /// Setup a progress bar if asked to show progress
     fn setup_progress(&self) -> Option<ProgressBar> {
         if self.show_progress {
@@ -132,9 +136,10 @@ impl UnigramTrainer {
         // This function checks that unicode "scripts" are consistent, so we cannot have romaji and
         // hiragana for instance. Seems pretty specific. Also Hiragana and katakana are mixed
         let raw_script = get_script(c);
+
         let script = if *c as u32 == 0x30FC {
             Script::Han
-        } else if *c as u32 == 32 || !self.split_by_number && c.is_numeric() {
+        } else if *c == self.space_char || !self.split_by_number && c.is_numeric() {
             Script::Any
         } else {
             match raw_script {
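The one-line change in the script check swaps the hard-coded ASCII space (codepoint 32) for the configurable space_char, so whatever character the trainer was configured with is treated as script-neutral. A small, self-contained illustration with a hypothetical space_char of '▁' (U+2581):

// Hypothetical illustration: '▁' fails the old check but passes the new one.
fn main() {
    let space_char = '▁';
    let c = '▁';
    assert!(c as u32 != 32); // old condition: only ASCII space was script-neutral
    assert_eq!(c, space_char); // new condition: the configured space_char is script-neutral
    println!("ok");
}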
@@ -99,7 +99,7 @@ fn test_sample() {
         *p /= z;
     }

-    let n_trials = 100_000;
+    let n_trials = 1_000;
     let mut freq: HashMap<String, u32> = HashMap::new();
     for _ in 0..n_trials {
         let string = lattice.sample_token(theta).join(" ");
@@ -245,6 +245,8 @@ fn test_spm_compat_train() {
     // println!("Stop train {:?}", model.get_vocab());
     // println!("Vocab {}", model.get_vocab().len());

+    model.save(Path::new("data"), Some("trained.json")).unwrap();
+
     let file = read_to_string(test_file).unwrap();
     let encoded = std::str::from_utf8(&output.stdout).unwrap();

@@ -253,9 +255,7 @@ fn test_spm_compat_train() {
     let mut n_tokenizer_tokens = 0;
     let mut n_spm_tokens = 0;
     for (tokenizer_line, spm_line) in file.lines().zip(encoded.lines()) {
-        println!("Tokenizer line {:?}", tokenizer_line);
-        println!("Spm line {:?}", spm_line);
-        let tokenizer_tokens = model.encode(tokenizer_line);
+        let tokenizer_tokens = model.encode(&tokenizer_line.replace(" ", "▁"));
         let mut spm_tokens: Vec<String> = spm_line
             .split(' ')
             .map(|s| s.to_string().replace('▁', " "))
@@ -274,6 +274,12 @@ fn test_spm_compat_train() {
         total += 1;

+        // assert_eq!(tokenizer_tokens, spm_tokens, "Failed on line {}", i + 1,);
+        // println!("{} vs {}", tokenizer_tokens.len(), spm_tokens.len());
+        // assert!(tokenizer_tokens.len() <= spm_tokens.len());
+        // if spm_tokens.len() < tokenizer_tokens.len() {
+        //     println!("Tokenizer line {:?}", tokenizer_tokens.join(" "));
+        //     println!("Spm line {:?}", spm_line);
+        // }
     }
     let acc = (correct as f64) / (total as f64) * 100.0;
     println!("Total tokenizer tokens {}", n_tokenizer_tokens);
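The encode change above relies on the SentencePiece-style convention of marking word boundaries with "▁" (U+2581): spaces are swapped for "▁" before encoding, and the spm output is mapped back the other way. A tiny self-contained sketch of that round-trip:

// Sketch of the metaspace round-trip the test performs on each line.
fn main() {
    let line = "hello world";
    let spm_style = line.replace(' ', "▁");
    assert_eq!(spm_style, "hello▁world");
    assert_eq!(spm_style.replace('▁', " "), line);
    println!("round-trip ok");
}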