mirror of https://github.com/mii443/tokenizers.git
BertPreTokenizer pre tokenize only (with offsets)

@@ -1,182 +1,76 @@
-use crate::tokenizer::{PreTokenizer, Result};
-use std::collections::HashSet;
-use unicode_categories::UnicodeCategories;
-use unicode_normalization::UnicodeNormalization;
-
-/// Extremely simple tokenization on whitespaces
-fn whitespace_tokenize(s: &str) -> Vec<&str> {
-    s.trim()
-        .split(char::is_whitespace)
-        .filter(|s| *s != " ")
-        .collect()
-}
-
-/// Checks whether a character is whitespace
-fn is_whitespace(c: char) -> bool {
-    // These are technically control characters but we count them as whitespace
-    if c == '\t' || c == '\n' || c == '\r' {
-        true
-    } else {
-        c.is_whitespace()
-    }
-}
-
-/// Checks whether a character is a control character
-fn is_control(c: char) -> bool {
-    // These are technically control characters but we count them as whitespace
-    if c == '\t' || c == '\n' || c == '\r' {
-        false
-    } else {
-        // The definition of `is_control` here is quite large and contains also
-        // Cc, Cf, Cn or Co
-        // cf. https://unicode.org/reports/tr44/ (Table 12)
-        c.is_other()
-    }
-}
-
-/// Checks whether a character is chinese
-/// This defines a "chinese character" as anything in the CJK Unicode block:
-/// https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
-///
-/// Note that the CJK Unicode block is NOT all Japanese and Korean characters,
-/// despite its name. The modern Korean Hangul alphabet is a different block,
-/// as is Japanese Hiragana and Katakana. Those alphabets are used to write
-/// space-separated words, so they are not treated specially and handled
-/// like for all of the other languages.
-fn is_chinese_char(c: char) -> bool {
-    match c as usize {
-        0x4E00..=0x9FFF => true,
-        0x3400..=0x4DBF => true,
-        0x20000..=0x2A6DF => true,
-        0x2A700..=0x2B73F => true,
-        0x2B740..=0x2B81F => true,
-        0x2B920..=0x2CEAF => true,
-        0xF900..=0xFAFF => true,
-        0x2F800..=0x2FA1F => true,
-        _ => false,
-    }
-}
-
-pub struct BertPreTokenizer {
-    /// Whether to do the basic tokenization
-    do_basic_tokenize: bool,
-    /// Whether to lower case the input.
-    do_lower_case: bool,
-    /// A list of token not to split.
-    never_split: HashSet<String>,
-    /// Whether to tokenize Chinese characters
-    tokenize_chinese_chars: bool,
-}
-
-impl BertPreTokenizer {
-    pub fn new(
-        do_basic_tokenize: bool,
-        do_lower_case: bool,
-        never_split: HashSet<String>,
-        tokenize_chinese_chars: bool,
-    ) -> Self {
-        BertPreTokenizer {
-            do_basic_tokenize,
-            do_lower_case,
-            never_split,
-            tokenize_chinese_chars,
-        }
-    }
-
-    /// Strips accents from a piece of text
-    fn run_strip_accents(&self, text: &str) -> String {
-        text.nfd()
-            .filter(|c| !c.is_mark_nonspacing())
-            .collect::<String>()
-    }
-
-    /// Splits punctuation on a piece of text.
-    fn run_split_on_punc(&self, text: &str) -> Vec<String> {
-        if self.never_split.contains(text) {
-            return vec![text.to_owned()];
-        }
-
-        let mut output: Vec<Vec<char>> = vec![];
-        let mut start_new_word = true;
-        text.chars().for_each(|c| {
-            if c.is_ascii_punctuation() {
-                output.push(vec![c]);
-                start_new_word = true;
-            } else {
-                if start_new_word {
-                    output.push(vec![]);
-                }
-                start_new_word = false;
-                output.last_mut().unwrap().push(c);
-            }
-        });
-
-        output
-            .into_iter()
-            .map(|cs| cs.into_iter().collect::<String>())
-            .collect()
-    }
-
-    fn tokenize_chinese_chars(&self, text: &str) -> String {
-        text.chars()
-            .map(|c| {
-                if is_chinese_char(c) {
-                    vec![' ', c, ' ']
-                } else {
-                    vec![c]
-                }
-            })
-            .flatten()
-            .collect::<String>()
-    }
-
-    fn clean_text(&self, text: &str) -> String {
-        text.chars()
-            .map(|c| {
-                if c as usize == 0 || c as usize == 0xfffd || is_control(c) {
-                    None
-                } else if is_whitespace(c) {
-                    Some(' ')
-                } else {
-                    Some(c)
-                }
-            })
-            .filter(|c| c.is_some())
-            .map(|c| c.unwrap())
-            .collect::<String>()
-    }
-}
-
-impl PreTokenizer for BertPreTokenizer {
-    fn pre_tokenize(&self, s: &str) -> Result<Vec<String>> {
-        if !self.do_basic_tokenize {
-            Ok(whitespace_tokenize(&s)
-                .into_iter()
-                .map(|s| s.to_owned())
-                .collect())
-        } else {
-            let mut text = self.clean_text(s);
-
-            // This was added on November 1st, 2018 for the multilingual and Chinese
-            // models. This is also applied to the English models now, but it doesn't
-            // matter since the English models were not trained on any Chinese data
-            // and generally don't have any Chinese data in them (there are Chinese
-            // characters in the vocabulary because Wikipedia does have some Chinese
-            // words in the English Wikipedia.).
-            if self.tokenize_chinese_chars {
-                text = self.tokenize_chinese_chars(&text);
-            }
-            let orig_tokens = whitespace_tokenize(&text);
-            let mut split_tokens = vec![];
-            for token in orig_tokens {
-                let mut tk = token.to_owned();
-                if self.do_lower_case && !self.never_split.contains(token) {
-                    tk = self.run_strip_accents(&token.to_lowercase())
-                }
-                split_tokens.extend(self.run_split_on_punc(&tk));
-            }
-
-            Ok(split_tokens)
-        }
-    }
-}
+use crate::tokenizer::{Offsets, PreTokenizer, Result};
+
+/// Split the given string as the `should_split` predicate dictates. Keep track of the offsets
+fn split_on<F: Fn(&char) -> bool>(
+    s: &str,
+    should_split: F,
+    include_split_token: bool,
+) -> Vec<(String, Offsets)> {
+    let mut words: Vec<(String, Offsets)> = vec![];
+    let mut offset = 0;
+    let mut word = Vec::with_capacity(50);
+    s.chars().for_each(|c| {
+        if should_split(&c) {
+            if !word.is_empty() {
+                let offsets = (offset - word.len(), offset);
+                words.push((word.drain(0..).collect::<String>(), offsets));
+            }
+            if include_split_token {
+                words.push((c.to_string(), (offset, offset + 1)));
+            }
+        } else if !should_split(&c) {
+            word.push(c);
+        }
+        offset += 1;
+    });
+    // Don't forget the potential last word
+    if !word.is_empty() {
+        let offsets = (offset - word.len(), offset);
+        words.push((word.drain(0..).collect::<String>(), offsets));
+    }
+
+    words
+}
+
+pub struct BertPreTokenizer;
+
+impl PreTokenizer for BertPreTokenizer {
+    fn pre_tokenize(&self, s: &str) -> Result<Vec<(String, Offsets)>> {
+        let mut split_tokens = vec![];
+        for (token, offsets) in split_on(&s, |c| char::is_whitespace(*c), false) {
+            split_tokens.extend(
+                split_on(&token, char::is_ascii_punctuation, true)
+                    .into_iter()
+                    .map(|(tok, off)| (tok, (off.0 + offsets.0, off.1 + offsets.0))),
+            );
+        }
+        Ok(split_tokens)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn basic() {
+        let pretok = BertPreTokenizer;
+        let res = pretok
+            .pre_tokenize("Hey friend!     How are you?!?")
+            .unwrap();
+        assert_eq!(
+            &res,
+            &[
+                ("Hey".into(), (0, 3)),
+                ("friend".into(), (4, 10)),
+                ("!".into(), (10, 11)),
+                ("How".into(), (16, 19)),
+                ("are".into(), (20, 23)),
+                ("you".into(), (24, 27)),
+                ("?".into(), (27, 28)),
+                ("!".into(), (28, 29)),
+                ("?".into(), (29, 30)),
+            ]
+        );
+    }
+}
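
For illustration, here is a self-contained sketch of the offset-tracking split that the new `split_on` helper and `pre_tokenize` perform. The `Offsets` alias as a `(usize, usize)` pair of character positions and the `main` driver are assumptions added for this example; inside the crate, `Offsets` comes from `crate::tokenizer` and the two-pass composition lives in `pre_tokenize`, as shown in the diff above.

// Sketch only: `Offsets` is assumed to be a (start, end) pair of character
// positions, matching the offsets asserted in the `basic` test above.
type Offsets = (usize, usize);

// Same splitting strategy as the committed `split_on`: walk the chars once,
// flush the current word whenever the predicate matches, and optionally keep
// the matching char as its own single-character token.
fn split_on<F: Fn(&char) -> bool>(
    s: &str,
    should_split: F,
    include_split_token: bool,
) -> Vec<(String, Offsets)> {
    let mut words: Vec<(String, Offsets)> = vec![];
    let mut offset = 0;
    let mut word: Vec<char> = Vec::with_capacity(50);
    for c in s.chars() {
        if should_split(&c) {
            if !word.is_empty() {
                // The word ends at the current offset and started `word.len()` chars earlier
                let offsets = (offset - word.len(), offset);
                words.push((word.drain(0..).collect::<String>(), offsets));
            }
            if include_split_token {
                words.push((c.to_string(), (offset, offset + 1)));
            }
        } else {
            // Plain `else` here; the committed code spells this
            // `else if !should_split(&c)`, which is equivalent.
            word.push(c);
        }
        offset += 1;
    }
    // Don't forget the potential last word
    if !word.is_empty() {
        let offsets = (offset - word.len(), offset);
        words.push((word.drain(0..).collect::<String>(), offsets));
    }
    words
}

fn main() {
    // Two passes, as in `pre_tokenize`: split on whitespace (separators dropped),
    // then split each word on ASCII punctuation (separators kept), shifting the
    // inner offsets by the word's start position.
    let s = "Hey friend!     How are you?!?";
    let mut tokens: Vec<(String, Offsets)> = vec![];
    for (word, offsets) in split_on(s, |c| c.is_whitespace(), false) {
        tokens.extend(
            split_on(&word, char::is_ascii_punctuation, true)
                .into_iter()
                .map(|(tok, off)| (tok, (off.0 + offsets.0, off.1 + offsets.0))),
        );
    }
    // Prints ("Hey", (0, 3)), ("friend", (4, 10)), ("!", (10, 11)), ...
    for (tok, off) in tokens {
        println!("{:?} {:?}", tok, off);
    }
}

Note that `offset` is incremented once per char, so these offsets count characters rather than bytes; for the ASCII test string they coincide, but for non-ASCII input they would not index directly into the original `&str`.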