replace lazy_static with stabilized std::sync::LazyLock in 1.80 (#1739)

This commit is contained in:
sftse
2025-03-18 17:33:44 +01:00
committed by GitHub
parent 4383a25787
commit 759d7aa77a
6 changed files with 20 additions and 28 deletions

View File

@@ -42,7 +42,6 @@ required-features = ["http"]
harness = false
[dependencies]
lazy_static = "1.4"
rand = "0.8"
onig = { version = "6.4", default-features = false, optional = true }
regex = "1.10"

View File

@@ -130,8 +130,6 @@
#[macro_use]
extern crate log;
#[macro_use]
extern crate lazy_static;
#[macro_use]
extern crate derive_builder;

View File

@@ -2,16 +2,13 @@ use crate::processors::byte_level::bytes_char;
use crate::tokenizer::{NormalizedString, Normalizer, Result};
use crate::utils::macro_rules_attribute;
use std::collections::{HashMap, HashSet};
use std::sync::LazyLock;
#[derive(Clone, Debug)]
#[macro_rules_attribute(impl_serde_type!)]
pub struct ByteLevel;
lazy_static! {
static ref BYTES_CHAR: HashMap<u8, char> = bytes_char();
static ref CHAR_BYTES: HashMap<char, u8> =
bytes_char().into_iter().map(|(c, b)| (b, c)).collect();
}
static BYTES_CHAR: LazyLock<HashMap<u8, char>> = LazyLock::new(bytes_char);
impl Default for ByteLevel {
fn default() -> Self {

View File

@@ -1,4 +1,5 @@
use std::collections::{HashMap, HashSet};
use std::sync::LazyLock;
use crate::utils::SysRegex;
use serde::{Deserialize, Serialize};
@@ -37,17 +38,15 @@ pub(crate) fn bytes_char() -> HashMap<u8, char> {
.collect()
}
lazy_static! {
/// Regex that matches exactly one token.
/// See https://github.com/openai/gpt-2/blob/master/src/encoder.py#L98
static ref RE: SysRegex = SysRegex::new(
r"'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"
)
.unwrap();
static ref BYTES_CHAR: HashMap<u8, char> = bytes_char();
static ref CHAR_BYTES: HashMap<char, u8> =
bytes_char().into_iter().map(|(c, b)| (b, c)).collect();
}
/// Regex that matches exactly one token.
/// See https://github.com/openai/gpt-2/blob/master/src/encoder.py#L98
static RE: LazyLock<SysRegex> = LazyLock::new(|| {
SysRegex::new(r"'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+")
.unwrap()
});
static BYTES_CHAR: LazyLock<HashMap<u8, char>> = LazyLock::new(bytes_char);
static CHAR_BYTES: LazyLock<HashMap<char, u8>> =
LazyLock::new(|| bytes_char().into_iter().map(|(c, b)| (b, c)).collect());
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
/// Provides all the necessary steps to handle the BPE tokenization at the byte-level. Takes care

View File

@@ -1,3 +1,5 @@
use std::sync::LazyLock;
use regex::Regex;
use crate::tokenizer::{
@@ -17,9 +19,7 @@ impl Default for Whitespace {
impl PreTokenizer for Whitespace {
fn pre_tokenize(&self, pretokenized: &mut PreTokenizedString) -> Result<()> {
lazy_static! {
static ref RE: Regex = Regex::new(r"\w+|[^\w\s]+").unwrap();
}
static RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\w+|[^\w\s]+").unwrap());
let re_ref: &Regex = &RE;
pretokenized.split(|_, normalized| {

View File

@@ -5,6 +5,7 @@ use aho_corasick::{AhoCorasick, AhoCorasickBuilder, MatchKind};
use regex::Regex;
use serde::{ser::SerializeSeq, Deserialize, Serialize, Serializer};
use std::collections::{HashMap, HashSet};
use std::sync::LazyLock;
/// Represent a token added by the user on top of the existing Model vocabulary.
/// AddedToken can be configured to specify the behavior they should have in various situations
@@ -94,12 +95,10 @@ impl std::hash::Hash for AddedToken {
type MatchingSet = (AhoCorasick, Vec<u32>);
lazy_static! {
static ref STARTS_WITH_WORD: Regex = Regex::new(r"^\w").unwrap();
static ref ENDS_WITH_WORD: Regex = Regex::new(r"\w$").unwrap();
static ref RIGHTMOST_SPACE_AT_START: Regex = Regex::new(r"^\s*").unwrap();
static ref LEFTMOST_SPACE_AT_END: Regex = Regex::new(r"\s*$").unwrap();
}
static STARTS_WITH_WORD: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^\w").unwrap());
static ENDS_WITH_WORD: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\w$").unwrap());
static RIGHTMOST_SPACE_AT_START: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^\s*").unwrap());
static LEFTMOST_SPACE_AT_END: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\s*$").unwrap());
fn ends_with_word(sentence: &str) -> bool {
ENDS_WITH_WORD.is_match(sentence)