replace lazy_static with stabilized std::sync::LazyLock in 1.80 (#1739)

sftse
2025-03-18 17:33:44 +01:00
committed by GitHub
parent 4383a25787
commit 759d7aa77a
6 changed files with 20 additions and 28 deletions
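
The diffs below all apply the same mechanical change: each lazy_static! block becomes one or more plain statics of type std::sync::LazyLock<T>, the std equivalent stabilized in Rust 1.80. A LazyLock runs its initializer once, on first dereference, and every later access reuses the cached value. A minimal sketch of the pattern, with a hypothetical expensive_init helper that is not code from this repository:

    use std::collections::HashMap;
    use std::sync::LazyLock;

    // Stand-in for a costly table builder such as bytes_char().
    fn expensive_init() -> HashMap<u8, char> {
        (0u8..=9).map(|b| (b, char::from(b'0' + b))).collect()
    }

    // Before, with the lazy_static crate:
    // lazy_static! {
    //     static ref TABLE: HashMap<u8, char> = expensive_init();
    // }

    // After, std-only (Rust 1.80+): initialized once, on first access.
    static TABLE: LazyLock<HashMap<u8, char>> = LazyLock::new(expensive_init);

    fn main() {
        assert_eq!(TABLE.get(&3), Some(&'3'));
    }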

View File

@@ -42,7 +42,6 @@ required-features = ["http"]
 harness = false

 [dependencies]
-lazy_static = "1.4"
 rand = "0.8"
 onig = { version = "6.4", default-features = false, optional = true }
 regex = "1.10"

View File

@@ -130,8 +130,6 @@
 #[macro_use]
 extern crate log;
-#[macro_use]
-extern crate lazy_static;
 #[macro_use]
 extern crate derive_builder;

View File

@@ -2,16 +2,13 @@ use crate::processors::byte_level::bytes_char;
 use crate::tokenizer::{NormalizedString, Normalizer, Result};
 use crate::utils::macro_rules_attribute;
 use std::collections::{HashMap, HashSet};
+use std::sync::LazyLock;

 #[derive(Clone, Debug)]
 #[macro_rules_attribute(impl_serde_type!)]
 pub struct ByteLevel;

-lazy_static! {
-    static ref BYTES_CHAR: HashMap<u8, char> = bytes_char();
-    static ref CHAR_BYTES: HashMap<char, u8> =
-        bytes_char().into_iter().map(|(c, b)| (b, c)).collect();
-}
+static BYTES_CHAR: LazyLock<HashMap<u8, char>> = LazyLock::new(bytes_char);

 impl Default for ByteLevel {
     fn default() -> Self {

View File

@@ -1,4 +1,5 @@
 use std::collections::{HashMap, HashSet};
+use std::sync::LazyLock;

 use crate::utils::SysRegex;
 use serde::{Deserialize, Serialize};
@@ -37,17 +38,15 @@ pub(crate) fn bytes_char() -> HashMap<u8, char> {
         .collect()
 }

-lazy_static! {
-    /// Regex that matches exactly one token.
-    /// See https://github.com/openai/gpt-2/blob/master/src/encoder.py#L98
-    static ref RE: SysRegex = SysRegex::new(
-        r"'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"
-    )
-    .unwrap();
-    static ref BYTES_CHAR: HashMap<u8, char> = bytes_char();
-    static ref CHAR_BYTES: HashMap<char, u8> =
-        bytes_char().into_iter().map(|(c, b)| (b, c)).collect();
-}
+/// Regex that matches exactly one token.
+/// See https://github.com/openai/gpt-2/blob/master/src/encoder.py#L98
+static RE: LazyLock<SysRegex> = LazyLock::new(|| {
+    SysRegex::new(r"'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+")
+        .unwrap()
+});
+static BYTES_CHAR: LazyLock<HashMap<u8, char>> = LazyLock::new(bytes_char);
+static CHAR_BYTES: LazyLock<HashMap<char, u8>> =
+    LazyLock::new(|| bytes_char().into_iter().map(|(c, b)| (b, c)).collect());

 #[derive(Copy, Clone, Debug, PartialEq, Eq)]
 /// Provides all the necessary steps to handle the BPE tokenization at the byte-level. Takes care

View File

@@ -1,3 +1,5 @@
+use std::sync::LazyLock;
+
 use regex::Regex;

 use crate::tokenizer::{
@@ -17,9 +19,7 @@ impl Default for Whitespace {
 impl PreTokenizer for Whitespace {
     fn pre_tokenize(&self, pretokenized: &mut PreTokenizedString) -> Result<()> {
-        lazy_static! {
-            static ref RE: Regex = Regex::new(r"\w+|[^\w\s]+").unwrap();
-        }
+        static RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\w+|[^\w\s]+").unwrap());
         let re_ref: &Regex = &RE;

         pretokenized.split(|_, normalized| {
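
One detail the hunk above shows: a LazyLock static can be declared inside a function body, keeping the regex next to its only call site while still being compiled exactly once across all calls. A small sketch of that shape, assuming the regex crate and a hypothetical count_words function:

    use regex::Regex;
    use std::sync::LazyLock;

    fn count_words(s: &str) -> usize {
        // Compiled on the first call; every later call reuses the same Regex.
        static RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\w+").unwrap());
        RE.find_iter(s).count()
    }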

View File

@@ -5,6 +5,7 @@ use aho_corasick::{AhoCorasick, AhoCorasickBuilder, MatchKind};
 use regex::Regex;
 use serde::{ser::SerializeSeq, Deserialize, Serialize, Serializer};
 use std::collections::{HashMap, HashSet};
+use std::sync::LazyLock;
/// Represent a token added by the user on top of the existing Model vocabulary. /// Represent a token added by the user on top of the existing Model vocabulary.
/// AddedToken can be configured to specify the behavior they should have in various situations /// AddedToken can be configured to specify the behavior they should have in various situations
@@ -94,12 +95,10 @@ impl std::hash::Hash for AddedToken {
 type MatchingSet = (AhoCorasick, Vec<u32>);

-lazy_static! {
-    static ref STARTS_WITH_WORD: Regex = Regex::new(r"^\w").unwrap();
-    static ref ENDS_WITH_WORD: Regex = Regex::new(r"\w$").unwrap();
-    static ref RIGHTMOST_SPACE_AT_START: Regex = Regex::new(r"^\s*").unwrap();
-    static ref LEFTMOST_SPACE_AT_END: Regex = Regex::new(r"\s*$").unwrap();
-}
+static STARTS_WITH_WORD: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^\w").unwrap());
+static ENDS_WITH_WORD: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\w$").unwrap());
+static RIGHTMOST_SPACE_AT_START: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^\s*").unwrap());
+static LEFTMOST_SPACE_AT_END: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\s*$").unwrap());

 fn ends_with_word(sentence: &str) -> bool {
     ENDS_WITH_WORD.is_match(sentence)