mirror of
https://github.com/mii443/tokenizers.git
synced 2025-08-22 16:25:30 +00:00
Add quick doc to byte_level.rs (#1420)
* Add quick doc to byte_level.rs
* Address PR comments
This commit is contained in:
@ -9,6 +9,8 @@ use crate::tokenizer::{
|
|||||||
};
|
};
|
||||||
use crate::utils::macro_rules_attribute;
|
use crate::utils::macro_rules_attribute;
|
||||||
|
|
||||||
|
/// Converts bytes to unicode characters.
|
||||||
|
/// See https://github.com/openai/gpt-2/blob/master/src/encoder.py#L9
|
||||||
fn bytes_char() -> HashMap<u8, char> {
|
fn bytes_char() -> HashMap<u8, char> {
|
||||||
let mut bs: Vec<u8> = vec![];
|
let mut bs: Vec<u8> = vec![];
|
||||||
bs.extend(b'!'..=b'~');
|
bs.extend(b'!'..=b'~');
|
||||||
@ -33,6 +35,8 @@ fn bytes_char() -> HashMap<u8, char> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
lazy_static! {
|
lazy_static! {
|
||||||
|
/// Regex that matches exactly one token.
|
||||||
|
/// See https://github.com/openai/gpt-2/blob/master/src/encoder.py#L98
|
||||||
static ref RE: SysRegex = SysRegex::new(
|
static ref RE: SysRegex = SysRegex::new(
|
||||||
r"'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"
|
r"'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"
|
||||||
)
|
)
|
||||||
|
Reference in New Issue
Block a user