mirror of
https://github.com/mii443/tokenizers.git
synced 2025-08-22 16:25:30 +00:00
Add quick doc to byte_level.rs (#1420)
* Add quick doc to byte_level.rs * Address PR comments
This commit is contained in:
@ -9,6 +9,8 @@ use crate::tokenizer::{
|
||||
};
|
||||
use crate::utils::macro_rules_attribute;
|
||||
|
||||
/// Builds the mapping from bytes to unicode characters.
|
||||
/// See <https://github.com/openai/gpt-2/blob/master/src/encoder.py#L9>
|
||||
fn bytes_char() -> HashMap<u8, char> {
|
||||
let mut bs: Vec<u8> = vec![];
|
||||
bs.extend(b'!'..=b'~');
|
||||
@ -33,6 +35,8 @@ fn bytes_char() -> HashMap<u8, char> {
|
||||
}
|
||||
|
||||
lazy_static! {
|
||||
/// Regex that matches exactly one token.
|
||||
/// See <https://github.com/openai/gpt-2/blob/master/src/encoder.py#L98>
|
||||
static ref RE: SysRegex = SysRegex::new(
|
||||
r"'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"
|
||||
)
|
||||
|
Reference in New Issue
Block a user