From 0b450d62ff7f8e4f266107e23d2fe5e005dfa2cc Mon Sep 17 00:00:00 2001
From: Anthony MOI
Date: Sun, 17 Nov 2019 00:40:22 -0500
Subject: [PATCH] Add ByteLevel pre tokenizer

---
 tokenizers/src/pre_tokenizers/byte_level.rs | 63 +++++++++++++++++++++
 tokenizers/src/pre_tokenizers/mod.rs        |  1 +
 2 files changed, 64 insertions(+)
 create mode 100644 tokenizers/src/pre_tokenizers/byte_level.rs

diff --git a/tokenizers/src/pre_tokenizers/byte_level.rs b/tokenizers/src/pre_tokenizers/byte_level.rs
new file mode 100644
index 00000000..577beaea
--- /dev/null
+++ b/tokenizers/src/pre_tokenizers/byte_level.rs
@@ -0,0 +1,63 @@
+use crate::tokenizer::PreTokenizer;
+use onig::Regex;
+use std::collections::HashMap;
+
+fn bytes_char() -> HashMap<u8, u32> {
+    let mut bs: Vec<u8> = vec![];
+    bs.extend(b'!'..=b'~');
+    bs.extend(b'\xA1'..=b'\xAC');
+    bs.extend(b'\xAE'..=b'\xFF');
+
+    let mut cs: Vec<u32> = bs.iter().map(|i| *i as u32).collect();
+    let mut n = 0;
+
+    for b in 0..=255u8 {
+        if !bs.contains(&b) {
+            bs.push(b);
+            cs.push(u32::pow(2, 8) + n);
+            n += 1;
+        }
+    }
+
+    bs.into_iter().zip(cs).collect()
+}
+
+pub struct ByteLevel;
+impl PreTokenizer for ByteLevel {
+    fn pre_tokenize(&self, s: &str) -> Vec<String> {
+        lazy_static! {
+            static ref RE: Regex = Regex::new(
+                r"'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"
+            )
+            .unwrap();
+            static ref BYTES_CHAR: HashMap<u8, u32> = bytes_char();
+        }
+
+        RE.find_iter(s)
+            .map(|(start, end)| s[start..end].to_owned())
+            .map(|s| {
+                s.into_bytes()
+                    .iter()
+                    .map(|b| std::char::from_u32(BYTES_CHAR[b]).unwrap())
+                    .collect()
+            })
+            .collect()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::ByteLevel;
+    use crate::tokenizer::PreTokenizer;
+
+    #[test]
+    fn basic() {
+        let pre_tok = ByteLevel;
+        assert_eq!(
+            pre_tok.pre_tokenize("Hello my friend, how is your day going?"),
+            vec![
+                "Hello", "Ġmy", "Ġfriend", ",", "Ġhow", "Ġis", "Ġyour", "Ġday", "Ġgoing", "?"
+            ]
+        );
+    }
+}
diff --git a/tokenizers/src/pre_tokenizers/mod.rs b/tokenizers/src/pre_tokenizers/mod.rs
index fd371e5b..614485a0 100644
--- a/tokenizers/src/pre_tokenizers/mod.rs
+++ b/tokenizers/src/pre_tokenizers/mod.rs
@@ -1 +1,2 @@
+pub mod byte_level;
 pub mod whitespace;
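
Note: as a reference for reviewers, here is a minimal standalone sketch of the byte-to-unicode mapping that bytes_char() builds. It uses only the Rust standard library (no onig or lazy_static) and is an illustration, not part of the patch; it shows why a leading space is rendered as "Ġ" in the test above.

use std::collections::HashMap;

// Same construction as bytes_char() in the patch: printable bytes map to
// themselves, the remaining bytes are shifted into the U+0100.. range.
fn bytes_char() -> HashMap<u8, u32> {
    let mut bs: Vec<u8> = vec![];
    bs.extend(b'!'..=b'~');
    bs.extend(b'\xA1'..=b'\xAC');
    bs.extend(b'\xAE'..=b'\xFF');

    let mut cs: Vec<u32> = bs.iter().map(|i| *i as u32).collect();
    let mut n = 0;
    for b in 0..=255u8 {
        if !bs.contains(&b) {
            bs.push(b);
            cs.push(u32::pow(2, 8) + n);
            n += 1;
        }
    }
    bs.into_iter().zip(cs).collect()
}

fn main() {
    let table = bytes_char();
    // Printable ASCII maps to itself.
    assert_eq!(std::char::from_u32(table[&b'H']).unwrap(), 'H');
    // A space (0x20) is the 33rd excluded byte, so it maps to
    // 256 + 32 = U+0120 ('Ġ'), hence tokens like "Ġmy" in the test.
    assert_eq!(std::char::from_u32(table[&b' ']).unwrap(), '\u{120}');
    println!("space -> {}", std::char::from_u32(table[&b' ']).unwrap());
}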