Add quick doc to byte_level.rs (#1420)

* Add quick doc to byte_level.rs

* Address PR comments
This commit is contained in:
Steven Weiss
2024-01-03 01:25:07 -08:00
committed by GitHub
parent 11462596d1
commit f1c23b8680

View File

@ -9,6 +9,8 @@ use crate::tokenizer::{
}; };
use crate::utils::macro_rules_attribute; use crate::utils::macro_rules_attribute;
/// Returns the map from each byte to the Unicode character that stands in for it.
/// See <https://github.com/openai/gpt-2/blob/master/src/encoder.py#L9>
fn bytes_char() -> HashMap<u8, char> { fn bytes_char() -> HashMap<u8, char> {
let mut bs: Vec<u8> = vec![]; let mut bs: Vec<u8> = vec![];
bs.extend(b'!'..=b'~'); bs.extend(b'!'..=b'~');
@ -33,6 +35,8 @@ fn bytes_char() -> HashMap<u8, char> {
} }
lazy_static! { lazy_static! {
/// Regex that matches exactly one token.
/// See <https://github.com/openai/gpt-2/blob/master/src/encoder.py#L98>
static ref RE: SysRegex = SysRegex::new( static ref RE: SysRegex = SysRegex::new(
r"'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+" r"'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"
) )