Mirror of https://github.com/mii443/tokenizers.git (synced 2025-08-22 16:25:30 +00:00)
implement a simple max_sentencepiece_length into BPE (#1228)
* Implement a simple max_sentencepiece_length in BPE. Add a way for the BPE trainer to behave like the Unigram trainer, where tokens longer than a certain length (default 16 in SentencePiece) are skipped. The Unigram trainer already implements this, but in a different way. If this code were to be actually integrated, some work remains to be done:
  - Document the behavior and how it should be set.
  - Set the default to 0/None so it does not act unless explicitly set.
  - Provide ways in the Python bindings for the user to set the max token length.
  I tried to implement max_sentencepiece_length through pre-tokenizer split rules, and honestly it is very difficult; regexes can be really slow when operating on the whole training corpus.
* Use Option<u16> for safer code.
* Other version.
* Update trainer.rs: clarify with type usize; propagate the max_length option.
* Change max_length to a more descriptive name. In the documentation (https://huggingface.co/docs/tokenizers/api/trainers), UnigramTrainer uses max_piece_length for a similar function. Since the underlying concept in BPE is merges, max_merge_length could be a more descriptive variable name.
* Change the variable name in trainer.rs from max_merge_length to max_token_length.
* Update trainer.rs: add several max_token_length declarations that were missing (impl BpeTrainerBuilder, struct BpeTrainer); add an explanation for variable shadowing.
* Update trainer.rs: move the default definition of max_token_length to the proper location; adjust downstream variable initializations accordingly.
* Add max_token_length test.
* Add BPE direct-assert test.
* Update trainer.rs: clarified test documentation.
* Create the bindings.
* Fix the default.
* Re-add the missing package-lock which was accidentally removed.
* ..
* Fix trainer test.
* Fix.

---------

Co-authored-by: Nicolas Patry <patry.nicolas@protonmail.com>
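For orientation, here is a minimal sketch of how the new option is meant to be used, adapted from the trainer test added in this diff. The word counts are invented, and the u32 count type and the do_train call follow the in-crate tests, so details may differ from the public API:

    // A minimal sketch adapted from the test added in this PR: train a BPE model
    // while capping learned tokens at 16 characters. Word counts are invented.
    use std::collections::HashMap;

    use tokenizers::models::bpe::{BpeTrainer, BPE};
    use tokenizers::Model; // for get_vocab()

    fn main() {
        let word_counts: HashMap<String, u32> = [
            ("singlelongtokenwithoutcasechange", 2u32),
            ("short", 5),
        ]
        .iter()
        .map(|(k, v)| (k.to_string(), *v))
        .collect();

        // max_token_length takes an Option<usize>; None (the default) keeps the old behavior.
        let trainer = BpeTrainer::builder()
            .max_token_length(Some(16))
            .show_progress(false)
            .min_frequency(0)
            .build();

        let mut model = BPE::default();
        trainer.do_train(&word_counts, &mut model).unwrap();

        // No learned token should exceed 16 characters.
        for token in model.get_vocab().keys() {
            assert!(token.chars().count() <= 16);
        }
    }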
bindings/node/package-lock.json (generated file, +6593 lines): diff suppressed because it is too large.
@@ -38,6 +38,12 @@ class BpeTrainer(Trainer):
         end_of_word_suffix (:obj:`str`, `optional`):
             A suffix to be used for every subword that is a end-of-word.
 
+        max_token_length (:obj:`int`, `optional`):
+            Prevents creating tokens longer than the specified size.
+            This can help with reducing polluting your vocabulary with
+            highly repetitive tokens like `======` for wikipedia
+
     """
 
 class UnigramTrainer(Trainer):
@@ -162,6 +162,12 @@ macro_rules! setter {
 ///
 ///     end_of_word_suffix (:obj:`str`, `optional`):
 ///         A suffix to be used for every subword that is a end-of-word.
+///
+///     max_token_length (:obj:`int`, `optional`):
+///         Prevents creating tokens longer than the specified size.
+///         This can help with reducing polluting your vocabulary with
+///         highly repetitive tokens like `======` for wikipedia
+///
 #[pyclass(extends=PyTrainer, module = "tokenizers.trainers", name = "BpeTrainer")]
 pub struct PyBpeTrainer {}
 #[pymethods]
@@ -243,6 +249,16 @@ impl PyBpeTrainer {
         setter!(self_, BpeTrainer, limit_alphabet, limit);
     }
 
+    #[getter]
+    fn get_max_token_length(self_: PyRef<Self>) -> Option<usize> {
+        getter!(self_, BpeTrainer, max_token_length)
+    }
+
+    #[setter]
+    fn set_max_token_length(self_: PyRef<Self>, limit: Option<usize>) {
+        setter!(self_, BpeTrainer, max_token_length, limit);
+    }
+
     #[getter]
     fn get_initial_alphabet(self_: PyRef<Self>) -> Vec<String> {
         getter!(
@@ -315,6 +331,7 @@ impl PyBpeTrainer {
                         );
                     }
                     "limit_alphabet" => builder = builder.limit_alphabet(val.extract()?),
+                    "max_token_length" => builder = builder.max_token_length(val.extract()?),
                     "initial_alphabet" => {
                         let alphabet: Vec<String> = val.extract()?;
                         builder = builder.initial_alphabet(
@@ -63,7 +63,7 @@ class TestBpeTrainer:
     def test_can_pickle(self):
         assert (
             trainers.BpeTrainer(min_frequency=12).__getstate__()
-            == b"""{"BpeTrainer":{"min_frequency":12,"vocab_size":30000,"show_progress":true,"special_tokens":[],"limit_alphabet":null,"initial_alphabet":[],"continuing_subword_prefix":null,"end_of_word_suffix":null,"words":{}}}"""
+            == b"""{"BpeTrainer":{"min_frequency":12,"vocab_size":30000,"show_progress":true,"special_tokens":[],"limit_alphabet":null,"initial_alphabet":[],"continuing_subword_prefix":null,"end_of_word_suffix":null,"max_token_length":null,"words":{}}}"""
         )
         assert isinstance(pickle.loads(pickle.dumps(trainers.BpeTrainer(min_frequency=12))), trainers.BpeTrainer)
 
@@ -44,6 +44,7 @@ struct Config {
     initial_alphabet: HashSet<char>,
     continuing_subword_prefix: Option<String>,
     end_of_word_suffix: Option<String>,
+    max_token_length: Option<usize>,
 }
 
 /// A `BpeTrainerBuilder` can be used to create a `BpeTrainer` with a custom
@@ -64,6 +65,7 @@ impl Default for BpeTrainerBuilder {
                 initial_alphabet: HashSet::new(),
                 continuing_subword_prefix: None,
                 end_of_word_suffix: None,
+                max_token_length: None,
             },
         }
     }
@@ -130,6 +132,12 @@ impl BpeTrainerBuilder {
         self.config.end_of_word_suffix = Some(suffix);
         self
     }
+    /// Set max_token_length
+    #[must_use]
+    pub fn max_token_length(mut self, max_token_length: Option<usize>) -> Self {
+        self.config.max_token_length = max_token_length;
+        self
+    }
 
     /// Constructs the final BpeTrainer
     pub fn build(self) -> BpeTrainer {
@@ -142,6 +150,7 @@ impl BpeTrainerBuilder {
             initial_alphabet: self.config.initial_alphabet,
             continuing_subword_prefix: self.config.continuing_subword_prefix,
             end_of_word_suffix: self.config.end_of_word_suffix,
+            max_token_length: self.config.max_token_length,
             words: HashMap::new(),
         }
     }
@@ -183,6 +192,8 @@ pub struct BpeTrainer {
     pub continuing_subword_prefix: Option<String>,
     /// An optional suffix to caracterize and end-of-word subword
    pub end_of_word_suffix: Option<String>,
+    /// An optional parameter to limit the max length of any single token
+    pub max_token_length: Option<usize>,
 
     words: HashMap<String, u32>,
 }
@@ -425,6 +436,7 @@ impl BpeTrainer {
     ) -> Result<Vec<AddedToken>> {
         let mut word_to_id: HashMap<String, u32> = HashMap::with_capacity(self.vocab_size);
         let mut id_to_word: Vec<String> = Vec::with_capacity(self.vocab_size);
+        let max_token_length: usize = self.max_token_length.unwrap_or(usize::MAX);
 
         let progress = self.setup_progress();
 
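One note on the default, which the added line above makes concrete: because the limit is stored as an Option and unwrapped to usize::MAX, an unset limit can never reject a merge. A tiny illustrative check (plain Rust, not part of the crate):

    fn main() {
        // Mirrors `self.max_token_length.unwrap_or(usize::MAX)` from the hunk above:
        // when no limit is configured, the effective limit is usize::MAX, so the
        // `combined_len < limit` check used during merging never fails.
        let configured: Option<usize> = None;
        let effective = configured.unwrap_or(usize::MAX);
        let combined_len = 1_000_000_usize; // an absurdly long candidate token
        assert!(combined_len < effective);
        println!("an unset limit never rejects a merge");
    }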
@@ -502,6 +514,9 @@ impl BpeTrainer {
                     }
                 }
                 let new_token = format!("{}{}", part_a, part_b);
+                // implement sentencepiece-like merge.
+                // if this code were to be merged, integrate a way in the python bindings to communicate this variable
+                // default should be 0/None to maintain previous behavior. 16 is the spm default.
 
                 // Insert new token if it does not already exist
                 let new_token_id = word_to_id
@@ -524,7 +539,7 @@ impl BpeTrainer {
                         // can be there only once (HashSet). So this is safe.
                         unsafe {
                             let word: &mut Word = &mut (*w);
-                            word.merge(top.pair.0, top.pair.1, new_token_id)
+                            word.merge(top.pair.0, top.pair.1, new_token_id, max_token_length)
                                 .into_iter()
                                 .map(|c| (c, *i))
                                 .collect::<Vec<_>>()
@@ -720,4 +735,115 @@ mod tests {
         .collect();
         assert_eq!(model.merges, expected_merges);
     }
+
+    #[test]
+    fn bpe_test_max_token_length_16() {
+        /* bpe_test_max_token_length series of tests test the max_token_length flag of bpetrainer
+        // this is the more robust version that only tests max length of learned tokens
+        // (pre) tokenizer settings or vocab can be easily modified when necessary
+        */
+
+        let max_token_length = 16;
+        let long_word_counts: HashMap<String, u32> = [
+            ("singlelongtokenwithoutcasechange", 2),
+            ("singleLongTokenWithCamelCaseChange", 2),
+            ("Longsingletokenwithpunctu@t!onwithin", 2),
+            ("Anotherlongsingletokenwithnumberw1th1n", 2),
+            ("짧은한글문자열짧은한", 2),             // korean 10 char
+            ("긴한글문자열긴한글문자열긴한글문", 2), // korean 16 char
+            ("短字符串短字符串短字", 2),             // simplified chinese 10 char
+            ("长字符串长字符串长字符串长字符串", 2), // simp. chinese 16 char
+            ("短い文字列短い文字列", 2),             // japanese 10 char
+            ("長い文字列長い文字列長い文字列長", 2), // japanese 16 char
+            ("so", 2),
+            ("GPT-2", 2),
+        ]
+        .iter()
+        .map(|(key, value)| (key.to_string(), *value))
+        .collect();
+        let trainer = BpeTrainer::builder()
+            .max_token_length(Some(max_token_length))
+            .show_progress(false)
+            .min_frequency(0)
+            .build();
+        let mut model = BPE::default();
+        trainer.do_train(&long_word_counts, &mut model).unwrap();
+        let vocab = model.get_vocab();
+        for token in vocab.keys() {
+            assert!(
+                token.chars().count() <= max_token_length,
+                "token too long : {} , chars().count() = {}",
+                token,
+                token.chars().count()
+            )
+        }
+    }
+
+    #[test]
+    fn bpe_test_max_token_length_direct_assert() {
+        /* more direct version of bpe_test_max_token_length test
+        // directly compares tokens with known expected values.
+        // maybe unstable depending on specific settings or changes.
+        */
+        let long_word_counts: HashMap<String, u32> = [
+            ("sin", 2),
+            ("Sin", 2),
+            ("Lon", 2),
+            ("Ano", 2),
+            ("짧은한", 2),
+            ("긴한글", 2),
+            ("短字符", 2),
+            ("长字符", 2),
+            ("短い文", 2),
+            ("長い文", 2),
+            ("so", 2),
+            ("GP", 2),
+        ]
+        .iter()
+        .map(|(key, value)| (key.to_string(), *value))
+        .collect();
+        let trainer = BpeTrainer::builder()
+            .max_token_length(Some(2))
+            .show_progress(false)
+            .min_frequency(0)
+            .build();
+        let mut model = BPE::default();
+        trainer.do_train(&long_word_counts, &mut model).unwrap();
+        let trained_vocab: HashMap<String, u32> = model.get_vocab();
+        let expected_vocab: HashMap<String, u32> = [
+            ("短", 12),
+            ("n", 6),
+            ("i", 5),
+            ("s", 8),
+            ("字符", 23),
+            ("長", 14),
+            ("긴", 17),
+            ("い文", 22),
+            ("L", 2),
+            ("in", 21),
+            ("o", 7),
+            ("은한", 29),
+            ("S", 4),
+            ("P", 3),
+            ("so", 27),
+            ("符", 13),
+            ("文", 11),
+            ("字", 10),
+            ("짧", 19),
+            ("GP", 25),
+            ("글", 16),
+            ("G", 1),
+            ("An", 24),
+            ("长", 15),
+            ("A", 0),
+            ("Lo", 26),
+            ("긴한", 28),
+            ("い", 9),
+            ("한", 20),
+            ("은", 18),
+        ]
+        .iter()
+        .cloned()
+        .map(|(k, v)| (k.to_string(), v))
+        .collect();
+        assert_eq!(trained_vocab, expected_vocab)
+    }
 }
@@ -103,7 +103,13 @@ impl Word {
         });
     }
 
-    pub(super) fn merge(&mut self, c1: u32, c2: u32, replacement: u32) -> Vec<(Pair, i32)> {
+    pub(super) fn merge(
+        &mut self,
+        c1: u32,
+        c2: u32,
+        replacement: u32,
+        max_length: usize,
+    ) -> Vec<(Pair, i32)> {
         let mut changes: Vec<(Pair, i32)> = vec![];
         let mut i = 0;
         loop {
@@ -117,12 +123,6 @@ impl Word {
             let first = self.symbols[i];
             let second = self.symbols[i + 1];
 
-            // If there are other characters before the pair
-            if i > 0 {
-                changes.push(((self.symbols[i - 1].c, first.c), -1));
-                changes.push(((self.symbols[i - 1].c, replacement), 1));
-            }
-
             // Remove in place
             let new_s = Symbol {
                 c: replacement,
@@ -130,6 +130,15 @@ impl Word {
                 next: second.next,
                 len: first.len + second.len,
             };
+
+            // If there are other characters before the pair
+            if i > 0 {
+                changes.push(((self.symbols[i - 1].c, first.c), -1));
+                if self.symbols[i - 1].len + new_s.len < max_length {
+                    changes.push(((self.symbols[i - 1].c, replacement), 1));
+                }
+            }
+
             self.symbols.insert(i, new_s); // Insert replacement before first char of pair
             self.symbols.remove(i + 1); // Remove first char of pair
             self.symbols.remove(i + 1); // And then the second
@@ -137,7 +146,9 @@ impl Word {
             // If there are other characters after the pair
             if i < self.symbols.len() - 1 {
                 changes.push(((second.c, self.symbols[i + 1].c), -1));
-                changes.push(((replacement, self.symbols[i + 1].c), 1));
+                if self.symbols[i + 1].len + new_s.len < max_length {
+                    changes.push(((replacement, self.symbols[i + 1].c), 1));
+                }
             }
         }
 
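Taken together, these word.rs hunks gate the pair counts rather than the merge itself: when a neighboring pair would produce a token whose length reaches max_length, its count is simply never proposed, so the trainer can never select it. A standalone sketch of that gating rule (the struct and function names here are illustrative, not the crate's types):

    // Illustrative only: `SymbolLen` and `should_count_pair` are not crate types.
    #[derive(Clone, Copy)]
    struct SymbolLen {
        len: usize, // length, in characters, of the underlying subword
    }

    // Mirrors the condition added above: a pair with a neighbor is only counted
    // while the combined length stays strictly below `max_length`
    // (max_length is usize::MAX when no limit is configured).
    fn should_count_pair(neighbor: SymbolLen, new_s: SymbolLen, max_length: usize) -> bool {
        neighbor.len + new_s.len < max_length
    }

    fn main() {
        let new_s = SymbolLen { len: 10 }; // a freshly merged 10-char symbol
        let left = SymbolLen { len: 8 }; // its left neighbor, 8 chars

        // With max_length = 16 the 18-char candidate is never counted...
        assert!(!should_count_pair(left, new_s, 16));
        // ...while usize::MAX preserves the old, unlimited behavior.
        assert!(should_count_pair(left, new_s, usize::MAX));
    }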
@@ -276,7 +287,7 @@ mod tests {
 
         // We're going to perform a merge on the pair ('l', 'l') ~= (2, 2). Let's
         // say that 'll' has the ID of 4 in the updated word-to-id vocab.
-        let changes = word.merge(2, 2, 4);
+        let changes = word.merge(2, 2, 4, usize::MAX);
 
         // So the word should now look like this:
         assert_eq!(
@@ -306,4 +317,39 @@ mod tests {
             ]
         );
     }
+
+    #[test]
+    fn test_merge_max_length() {
+        // Let's say we have the word 'hello' and a word-to-id vocab that looks
+        // like this: {'h': 0, 'e': 1, 'l': 2, 'o': 3}.
+        let mut word = Word::new();
+        word.add(0, 1); // 'h'
+        word.add(1, 1); // 'e'
+        word.add(2, 1); // 'l'
+        word.add(2, 1); // 'l'
+        word.add(3, 1); // 'o'
+
+        // We're going to perform a merge on the pair ('l', 'l') ~= (2, 2). Let's
+        // say that 'll' has the ID of 4 in the updated word-to-id vocab.
+        let changes = word.merge(2, 2, 4, 2);
+        assert_eq!(
+            word.get_chars(),
+            &[
+                0u32, // 'h'
+                1u32, // 'e'
+                4u32, // 'll'
+                3u32, // 'o'
+            ]
+        );
+
+        assert_eq!(
+            changes,
+            &[
+                ((1u32, 2u32), -1i32), // count for ('e', 'l') should be decreased by 1.
+                // ((1u32, 4u32), 1i32), Missing since this would be larger than 2
+                ((2u32, 3u32), -1i32), // count for ('l', 'o') should be decreased by 1.
+                // ((4u32, 3u32), 1i32), Missing since this would be larger than 2
+            ]
+        );
+    }
 }