diff --git a/tokenizers/src/pre_tokenizers/byte_level.rs b/tokenizers/src/pre_tokenizers/byte_level.rs
index 03fb4b40..6343bbd0 100644
--- a/tokenizers/src/pre_tokenizers/byte_level.rs
+++ b/tokenizers/src/pre_tokenizers/byte_level.rs
@@ -9,6 +9,8 @@ use crate::tokenizer::{
 };
 use crate::utils::macro_rules_attribute;
 
+/// Converts bytes to unicode characters.
+/// See https://github.com/openai/gpt-2/blob/master/src/encoder.py#L9
 fn bytes_char() -> HashMap<u8, char> {
     let mut bs: Vec<u8> = vec![];
     bs.extend(b'!'..=b'~');
@@ -33,6 +35,8 @@ fn bytes_char() -> HashMap<u8, char> {
 }
 
 lazy_static! {
+    /// Regex that matches exactly one token.
+    /// See https://github.com/openai/gpt-2/blob/master/src/encoder.py#L98
     static ref RE: SysRegex = SysRegex::new(
         r"'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"
     )
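
For context while reviewing: below is a self-contained sketch of the function the first new doc comment describes, following the same byte-to-unicode logic as the code in this file (the `main` assertions are illustrative additions, not part of the patch):

```rust
use std::collections::HashMap;

/// GPT-2's byte-to-unicode trick: every printable, non-space byte maps
/// to itself, and the remaining bytes are shifted up into U+0100.. so
/// that all 256 byte values get a distinct, visible character.
fn bytes_char() -> HashMap<u8, char> {
    let mut bs: Vec<u8> = vec![];
    bs.extend(b'!'..=b'~');
    bs.extend(b'\xA1'..=b'\xAC');
    bs.extend(b'\xAE'..=b'\xFF');

    // Assign the byte values not covered above to code points 256, 257, ...
    let mut cs: Vec<u32> = bs.iter().map(|b| *b as u32).collect();
    let mut n = 0;
    for b in 0..=255u8 {
        if !bs.contains(&b) {
            bs.push(b);
            cs.push(256 + n);
            n += 1;
        }
    }

    bs.into_iter()
        .zip(cs)
        .map(|(byte, cp)| (byte, char::from_u32(cp).unwrap()))
        .collect()
}

fn main() {
    let map = bytes_char();
    assert_eq!(map.len(), 256);        // every byte value is covered
    assert_eq!(map[&b'a'], 'a');       // printable bytes map to themselves
    assert_eq!(map[&b' '], '\u{120}'); // space becomes 'Ġ', hence Ġ-prefixed tokens
}
```

As for the second comment: the token-splitting pattern relies on the negative lookahead `(?!\S)`, which the pure-Rust `regex` crate does not support, which is why it is compiled through the crate's `SysRegex` wrapper around a lookahead-capable engine rather than through `regex::Regex` directly.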