Wordpiece handles prefix customization

2025-08-22 16:25:30 +00:00 · 2020-01-03 15:30:55 -05:00
parent dc8266236d
commit 02a89bb07f
2 changed files with 11 additions and 1 deletions
--- a/tokenizers/src/models/bpe/model.rs
+++ b/tokenizers/src/models/bpe/model.rs
@@ -245,6 +245,10 @@ impl BPE {
        &self.unk_token
    }

+    pub fn get_continuing_subword_prefix(&self) -> &Option<String> {
+        &self.continuing_subword_prefix
+    }
+
    fn merge_word(&self, w: &str) -> Word {
        let mut word = Word::new();
        for (is_first, is_last, c) in w.chars().with_first_and_last() {
--- a/tokenizers/src/models/wordpiece/mod.rs
+++ b/tokenizers/src/models/wordpiece/mod.rs
@@ -28,6 +28,7 @@ impl fmt::Display for Error {

 pub struct WordPiece {
    unk_token: String,
+    continuing_subword_prefix: String,
    max_input_chars_per_word: usize,
    vocab: HashMap<String, u32>,
    vocab_r: HashMap<u32, String>,
@@ -38,6 +39,7 @@ impl Default for WordPiece {
        WordPiece {
            vocab: HashMap::new(),
            vocab_r: HashMap::new(),
+            continuing_subword_prefix: String::from("##"),
            unk_token: String::from("[UNK]"),
            max_input_chars_per_word: 100,
        }
@@ -64,6 +66,7 @@ impl WordPiece {
            vocab_r: vocab.into_iter().map(|(token, id)| (id, token)).collect(),
            unk_token,
            max_input_chars_per_word: max_input_chars_per_word.unwrap_or(100),
+            ..Default::default()
        })
    }

@@ -86,6 +89,9 @@ impl WordPiece {
                wp.unk_token = unk_token.to_owned();
            }
        }
+        if let Some(prefix) = bpe.get_continuing_subword_prefix() {
+            wp.continuing_subword_prefix = prefix.to_owned();
+        }

        wp
    }
@@ -125,7 +131,7 @@ impl Model for WordPiece {
                while start < end {
                    let mut substr = chars[start..end].iter().collect::<String>();
                    if start > 0 {
-                        substr = format!("##{}", substr);
+                        substr = format!("{}{}", self.continuing_subword_prefix, substr);
                    }
                    if self.vocab.contains_key(&substr) {
                        cur_str = Some(Token {