Mirror of https://github.com/mii443/tokenizers.git (synced 2025-08-22 16:25:30 +00:00)
Add BasicPreTokenizer for BERT
tokenizers/src/pre_tokenizers/basic.rs (new file, 171 lines)
@@ -0,0 +1,171 @@
use crate::tokenizer::PreTokenizer;
use std::collections::HashSet;
use unicode_categories::UnicodeCategories;
use unicode_normalization::UnicodeNormalization;

/// Extremely simple tokenization on whitespaces
fn whitespace_tokenize(s: &str) -> Vec<&str> {
    s.trim()
        .split(char::is_whitespace)
        // Splitting on runs of whitespace yields empty strings (not " "),
        // so filter those out.
        .filter(|s| !s.is_empty())
        .collect()
}
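// For example, "  Hello   world " is trimmed and split into ["Hello", "world"],
// with the empty pieces produced by consecutive whitespace filtered out.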

/// Checks whether a character is whitespace
fn is_whitespace(c: char) -> bool {
    // These are technically control characters but we count them as whitespace
    if c == '\t' || c == '\n' || c == '\r' {
        true
    } else {
        c.is_whitespace()
    }
}

/// Checks whether a character is a control character
fn is_control(c: char) -> bool {
    // These are technically control characters but we count them as whitespace
    if c == '\t' || c == '\n' || c == '\r' {
        false
    } else {
        // The definition of "control" used here is quite broad: `is_other`
        // also covers the Cc, Cf, Cn and Co categories
        // cf. https://unicode.org/reports/tr44/ (Table 12)
        c.is_other()
    }
}

/// Checks whether a character is chinese
/// This defines a "chinese character" as anything in the CJK Unicode block:
///   https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
///
/// Note that the CJK Unicode block is NOT all Japanese and Korean characters,
/// despite its name. The modern Korean Hangul alphabet is a different block,
/// as are Japanese Hiragana and Katakana. Those alphabets are used to write
/// space-separated words, so they are not treated specially and are handled
/// like all of the other languages.
fn is_chinese_char(c: char) -> bool {
    match c as u32 {
        0x4E00..=0x9FFF => true,
        0x3400..=0x4DBF => true,
        0x20000..=0x2A6DF => true,
        0x2A700..=0x2B73F => true,
        0x2B740..=0x2B81F => true,
        // CJK Unified Ideographs Extension E is U+2B820..=U+2CEAF
        0x2B820..=0x2CEAF => true,
        0xF900..=0xFAFF => true,
        0x2F800..=0x2FA1F => true,
        _ => false,
    }
}
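// For example, '中' (U+4E2D) falls in 0x4E00..=0x9FFF and is treated as a
// chinese character, while 'か' (U+304B, Hiragana) and '한' (U+D55C, Hangul)
// fall outside all of the ranges above.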

pub struct BasicPreTokenizer {
    /// Whether to lower case the input.
    do_lower_case: bool,
    /// A list of tokens not to split.
    never_split: HashSet<String>,
    /// Whether to tokenize Chinese characters
    tokenize_chinese_chars: bool,
}

impl BasicPreTokenizer {
    pub fn new(
        do_lower_case: bool,
        never_split: HashSet<String>,
        tokenize_chinese_chars: bool,
    ) -> Self {
        BasicPreTokenizer {
            do_lower_case,
            never_split,
            tokenize_chinese_chars,
        }
    }

    /// Strips accents from a piece of text
    fn run_strip_accents(&self, text: &str) -> String {
        text.nfd()
            .filter(|c| !c.is_mark_nonspacing())
            .collect::<String>()
    }
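    // For example, "é" decomposes under NFD into "e" followed by U+0301
    // (a nonspacing mark), which the filter above removes, leaving "e".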

    /// Splits punctuation on a piece of text.
    fn run_split_on_punc(&self, text: &str) -> Vec<String> {
        if self.never_split.contains(text) {
            return vec![text.to_owned()];
        }

        let mut output: Vec<Vec<char>> = vec![];
        let mut start_new_word = true;
        text.chars().for_each(|c| {
            if c.is_ascii_punctuation() {
                output.push(vec![c]);
                start_new_word = true;
            } else {
                if start_new_word {
                    output.push(vec![]);
                }
                start_new_word = false;
                output.last_mut().unwrap().push(c);
            }
        });

        output
            .into_iter()
            .map(|cs| cs.into_iter().collect::<String>())
            .collect()
    }
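    // For example, "hello,world" (when not in `never_split`) is split into
    // ["hello", ",", "world"]: each ASCII punctuation character becomes its
    // own token and starts a new word.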

    fn tokenize_chinese_chars(&self, text: &str) -> String {
        text.chars()
            .map(|c| {
                if is_chinese_char(c) {
                    vec![' ', c, ' ']
                } else {
                    vec![c]
                }
            })
            .flatten()
            .collect::<String>()
    }
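    // For example, "abc中文" becomes "abc 中  文 ": every CJK character is
    // padded with spaces so that the whitespace tokenizer isolates it.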

    fn clean_text(&self, text: &str) -> String {
        text.chars()
            .filter_map(|c| {
                if c as usize == 0 || c as usize == 0xfffd || is_control(c) {
                    None
                } else if is_whitespace(c) {
                    Some(' ')
                } else {
                    Some(c)
                }
            })
            .collect::<String>()
    }
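    // For example, "a\tb\u{0}c" becomes "a bc": the tab is normalized to a
    // single space and the NUL character is removed.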
}

impl PreTokenizer for BasicPreTokenizer {
    fn pre_tokenize(&self, s: &str) -> Vec<String> {
        let mut text = self.clean_text(s);

        // This was added on November 1st, 2018 for the multilingual and Chinese
        // models. This is also applied to the English models now, but it doesn't
        // matter since the English models were not trained on any Chinese data
        // and generally don't have any Chinese data in them (there are Chinese
        // characters in the vocabulary because Wikipedia does have some Chinese
        // words in the English Wikipedia).
        if self.tokenize_chinese_chars {
            text = self.tokenize_chinese_chars(&text);
        }
        let orig_tokens = whitespace_tokenize(&text);
        let mut split_tokens = vec![];
        for token in orig_tokens {
            let mut tk = token.to_owned();
            if self.do_lower_case && !self.never_split.contains(token) {
                tk = self.run_strip_accents(&token.to_lowercase());
            }
            split_tokens.extend(self.run_split_on_punc(&tk));
        }

        split_tokens
    }
}
@@ -1,2 +1,3 @@
pub mod basic;
pub mod byte_level;
pub mod whitespace;
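For reference, a minimal usage sketch of the new pre-tokenizer driven through the PreTokenizer trait. The `tokenizers::pre_tokenizers::basic` and `tokenizers::tokenizer` paths are assumptions based on the file layout above, and the expected tokens follow from tracing `pre_tokenize` in the diff.

use std::collections::HashSet;

use tokenizers::pre_tokenizers::basic::BasicPreTokenizer;
use tokenizers::tokenizer::PreTokenizer;

fn main() {
    // Lowercase the input, protect no tokens, and split out CJK characters.
    let pre_tokenizer = BasicPreTokenizer::new(true, HashSet::new(), true);

    // pre_tokenize runs clean_text -> tokenize_chinese_chars ->
    // whitespace_tokenize -> lowercase + strip accents -> split on punctuation.
    let tokens = pre_tokenizer.pre_tokenize("Héllo, 世界!");

    // Expected: ["hello", ",", "世", "界", "!"]
    println!("{:?}", tokens);
}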