replace lazy_static with stabilized std::sync::LazyLock in 1.80 (#1739)
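For readers unfamiliar with the pattern, a minimal standalone sketch of the migration this commit applies throughout the crate (the SQUARES map is illustrative, not taken from the diff):

use std::collections::HashMap;
use std::sync::LazyLock;

// Before, with the external lazy_static crate:
//
//     lazy_static! {
//         static ref SQUARES: HashMap<u32, u32> =
//             (0..10).map(|n| (n, n * n)).collect();
//     }
//
// After, with std only (`std::sync::LazyLock` is stable since Rust 1.80):
static SQUARES: LazyLock<HashMap<u32, u32>> =
    LazyLock::new(|| (0..10).map(|n| (n, n * n)).collect());

fn main() {
    // The initializer runs on first access, exactly once; later reads reuse it.
    assert_eq!(SQUARES.get(&3), Some(&9));
}

Both forms give a lazily initialized, thread-safe global, and both dereference to the inner type, so call sites stay unchanged.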
--- a/tokenizers/Cargo.toml
+++ b/tokenizers/Cargo.toml
@@ -42,7 +42,6 @@ required-features = ["http"]
 harness = false
 
 [dependencies]
-lazy_static = "1.4"
 rand = "0.8"
 onig = { version = "6.4", default-features = false, optional = true }
 regex = "1.10"
--- a/tokenizers/src/lib.rs
+++ b/tokenizers/src/lib.rs
@@ -130,8 +130,6 @@
 
 #[macro_use]
 extern crate log;
-#[macro_use]
-extern crate lazy_static;
 
 #[macro_use]
 extern crate derive_builder;
--- a/tokenizers/src/normalizers/byte_level.rs
+++ b/tokenizers/src/normalizers/byte_level.rs
@@ -2,16 +2,13 @@ use crate::processors::byte_level::bytes_char;
 use crate::tokenizer::{NormalizedString, Normalizer, Result};
 use crate::utils::macro_rules_attribute;
 use std::collections::{HashMap, HashSet};
+use std::sync::LazyLock;
 
 #[derive(Clone, Debug)]
 #[macro_rules_attribute(impl_serde_type!)]
 pub struct ByteLevel;
 
-lazy_static! {
-    static ref BYTES_CHAR: HashMap<u8, char> = bytes_char();
-    static ref CHAR_BYTES: HashMap<char, u8> =
-        bytes_char().into_iter().map(|(c, b)| (b, c)).collect();
-}
+static BYTES_CHAR: LazyLock<HashMap<u8, char>> = LazyLock::new(bytes_char);
 
 impl Default for ByteLevel {
     fn default() -> Self {
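Two details in the hunk above: the replacement keeps only BYTES_CHAR (no LazyLock version of CHAR_BYTES appears, so this normalizer apparently never used the reverse map), and it passes the `bytes_char` function itself as the initializer, since any `fn() -> T` satisfies the `FnOnce() -> T` bound of `LazyLock::new`. A small sketch of that form; the three-entry table is a stand-in for the crate's real builder:

use std::collections::HashMap;
use std::sync::LazyLock;

// Stand-in for the crate's real byte-to-char table builder.
fn bytes_char() -> HashMap<u8, char> {
    (b'a'..=b'c').map(|b| (b, b as char)).collect()
}

// A plain function is itself a valid initializer; no `|| ...` wrapper needed.
static BYTES_CHAR: LazyLock<HashMap<u8, char>> = LazyLock::new(bytes_char);

fn main() {
    assert_eq!(BYTES_CHAR.get(&b'a'), Some(&'a'));
}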
--- a/tokenizers/src/pre_tokenizers/byte_level.rs
+++ b/tokenizers/src/pre_tokenizers/byte_level.rs
@@ -1,4 +1,5 @@
 use std::collections::{HashMap, HashSet};
+use std::sync::LazyLock;
 
 use crate::utils::SysRegex;
 use serde::{Deserialize, Serialize};
@@ -37,17 +38,15 @@ pub(crate) fn bytes_char() -> HashMap<u8, char> {
     .collect()
 }
 
-lazy_static! {
-    /// Regex that matches exactly one token.
-    /// See https://github.com/openai/gpt-2/blob/master/src/encoder.py#L98
-    static ref RE: SysRegex = SysRegex::new(
-        r"'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"
-    )
-    .unwrap();
-    static ref BYTES_CHAR: HashMap<u8, char> = bytes_char();
-    static ref CHAR_BYTES: HashMap<char, u8> =
-        bytes_char().into_iter().map(|(c, b)| (b, c)).collect();
-}
+/// Regex that matches exactly one token.
+/// See https://github.com/openai/gpt-2/blob/master/src/encoder.py#L98
+static RE: LazyLock<SysRegex> = LazyLock::new(|| {
+    SysRegex::new(r"'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+")
+        .unwrap()
+});
+static BYTES_CHAR: LazyLock<HashMap<u8, char>> = LazyLock::new(bytes_char);
+static CHAR_BYTES: LazyLock<HashMap<char, u8>> =
+    LazyLock::new(|| bytes_char().into_iter().map(|(c, b)| (b, c)).collect());
 
 #[derive(Copy, Clone, Debug, PartialEq, Eq)]
 /// Provides all the necessary steps to handle the BPE tokenization at the byte-level. Takes care
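The block form `LazyLock::new(|| { ... })` used for RE above accommodates fallible setup: `SysRegex::new` returns a `Result`, so the closure unwraps it and any panic happens at first use rather than at startup, exactly as with `lazy_static`. Doc comments also attach directly to the static now instead of sitting inside a macro invocation. A sketch of the same shape, assuming the regex crate and an illustrative pattern:

use regex::Regex;
use std::sync::LazyLock;

/// Matches runs of Unicode letters.
static WORDS: LazyLock<Regex> = LazyLock::new(|| {
    // A malformed pattern would panic here, on first deref, not at startup.
    Regex::new(r"\p{L}+").unwrap()
});

fn main() {
    assert!(WORDS.is_match("déjà"));
}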
--- a/tokenizers/src/pre_tokenizers/whitespace.rs
+++ b/tokenizers/src/pre_tokenizers/whitespace.rs
@@ -1,3 +1,5 @@
+use std::sync::LazyLock;
+
 use regex::Regex;
 
 use crate::tokenizer::{
@@ -17,9 +19,7 @@ impl Default for Whitespace {
 
 impl PreTokenizer for Whitespace {
     fn pre_tokenize(&self, pretokenized: &mut PreTokenizedString) -> Result<()> {
-        lazy_static! {
-            static ref RE: Regex = Regex::new(r"\w+|[^\w\s]+").unwrap();
-        }
+        static RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\w+|[^\w\s]+").unwrap());
         let re_ref: &Regex = &RE;
 
         pretokenized.split(|_, normalized| {
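Unlike the other sites, this static lives inside `pre_tokenize`: a `static` in a function body is still a single program-wide value, only its name is scoped to the function. The existing `let re_ref: &Regex = &RE;` line keeps working because `&RE` deref-coerces `LazyLock<Regex>` to `&Regex`, initializing it on first call. A sketch of the same shape, reusing the diff's pattern but with an illustrative function name (assumes the regex crate):

use regex::Regex;
use std::sync::LazyLock;

fn word_chunks(s: &str) -> Vec<&str> {
    // Initialized once for the whole program, on the first call.
    static RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\w+|[^\w\s]+").unwrap());
    RE.find_iter(s).map(|m| m.as_str()).collect()
}

fn main() {
    assert_eq!(word_chunks("hey!"), vec!["hey", "!"]);
}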
--- a/tokenizers/src/tokenizer/added_vocabulary.rs
+++ b/tokenizers/src/tokenizer/added_vocabulary.rs
@@ -5,6 +5,7 @@ use aho_corasick::{AhoCorasick, AhoCorasickBuilder, MatchKind};
 use regex::Regex;
 use serde::{ser::SerializeSeq, Deserialize, Serialize, Serializer};
 use std::collections::{HashMap, HashSet};
+use std::sync::LazyLock;
 
 /// Represent a token added by the user on top of the existing Model vocabulary.
 /// AddedToken can be configured to specify the behavior they should have in various situations
@@ -94,12 +95,10 @@ impl std::hash::Hash for AddedToken {
 
 type MatchingSet = (AhoCorasick, Vec<u32>);
 
-lazy_static! {
-    static ref STARTS_WITH_WORD: Regex = Regex::new(r"^\w").unwrap();
-    static ref ENDS_WITH_WORD: Regex = Regex::new(r"\w$").unwrap();
-    static ref RIGHTMOST_SPACE_AT_START: Regex = Regex::new(r"^\s*").unwrap();
-    static ref LEFTMOST_SPACE_AT_END: Regex = Regex::new(r"\s*$").unwrap();
-}
+static STARTS_WITH_WORD: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^\w").unwrap());
+static ENDS_WITH_WORD: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\w$").unwrap());
+static RIGHTMOST_SPACE_AT_START: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^\s*").unwrap());
+static LEFTMOST_SPACE_AT_END: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\s*$").unwrap());
 
 fn ends_with_word(sentence: &str) -> bool {
     ENDS_WITH_WORD.is_match(sentence)
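The swap is behavior-preserving because `LazyLock` offers the same guarantee `lazy_static` did: one thread runs the initializer, concurrent readers block until it finishes, and all of them see the same value. A small demonstration of that guarantee, not taken from the PR:

use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::LazyLock;
use std::thread;

static INIT_COUNT: AtomicUsize = AtomicUsize::new(0);
static VALUE: LazyLock<u64> = LazyLock::new(|| {
    INIT_COUNT.fetch_add(1, Ordering::SeqCst);
    42
});

fn main() {
    // Eight threads race to read VALUE; the initializer still runs exactly once.
    thread::scope(|s| {
        for _ in 0..8 {
            s.spawn(|| assert_eq!(*VALUE, 42));
        }
    });
    assert_eq!(INIT_COUNT.load(Ordering::SeqCst), 1);
}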