replace lazy_static with stabilized std::sync::LazyLock in 1.80 (#1739)

sftse
2025-03-18 17:33:44 +01:00
committed by GitHub
parent 4383a25787
commit 759d7aa77a
6 changed files with 20 additions and 28 deletions
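
The diffs below all apply the same mechanical change: each lazy_static! block becomes one or more plain statics of type std::sync::LazyLock<T>, the std equivalent stabilized in Rust 1.80. A LazyLock runs its initializer once, on first dereference, and every later access reuses the cached value. A minimal sketch of the pattern, with a hypothetical expensive_init helper that is not code from this repository:

    use std::collections::HashMap;
    use std::sync::LazyLock;

    // Stand-in for a costly table builder such as bytes_char().
    fn expensive_init() -> HashMap<u8, char> {
        (0u8..=9).map(|b| (b, char::from(b'0' + b))).collect()
    }

    // Before, with the lazy_static crate:
    // lazy_static! {
    //     static ref TABLE: HashMap<u8, char> = expensive_init();
    // }

    // After, std-only (Rust 1.80+): initialized once, on first access.
    static TABLE: LazyLock<HashMap<u8, char>> = LazyLock::new(expensive_init);

    fn main() {
        assert_eq!(TABLE.get(&3), Some(&'3'));
    }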

View File

@@ -42,7 +42,6 @@ required-features = ["http"]
 harness = false

 [dependencies]
-lazy_static = "1.4"
 rand = "0.8"
 onig = { version = "6.4", default-features = false, optional = true }
 regex = "1.10"

View File

@@ -130,8 +130,6 @@
 #[macro_use]
 extern crate log;
-#[macro_use]
-extern crate lazy_static;
 #[macro_use]
 extern crate derive_builder;

View File

@@ -2,16 +2,13 @@ use crate::processors::byte_level::bytes_char;
 use crate::tokenizer::{NormalizedString, Normalizer, Result};
 use crate::utils::macro_rules_attribute;
 use std::collections::{HashMap, HashSet};
+use std::sync::LazyLock;

 #[derive(Clone, Debug)]
 #[macro_rules_attribute(impl_serde_type!)]
 pub struct ByteLevel;

-lazy_static! {
-    static ref BYTES_CHAR: HashMap<u8, char> = bytes_char();
-    static ref CHAR_BYTES: HashMap<char, u8> =
-        bytes_char().into_iter().map(|(c, b)| (b, c)).collect();
-}
+static BYTES_CHAR: LazyLock<HashMap<u8, char>> = LazyLock::new(bytes_char);

 impl Default for ByteLevel {
     fn default() -> Self {

View File

@@ -1,4 +1,5 @@
 use std::collections::{HashMap, HashSet};
+use std::sync::LazyLock;

 use crate::utils::SysRegex;
 use serde::{Deserialize, Serialize};
@@ -37,17 +38,15 @@ pub(crate) fn bytes_char() -> HashMap<u8, char> {
         .collect()
 }

-lazy_static! {
-    /// Regex that matches exactly one token.
-    /// See https://github.com/openai/gpt-2/blob/master/src/encoder.py#L98
-    static ref RE: SysRegex = SysRegex::new(
-        r"'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"
-    )
-    .unwrap();
-    static ref BYTES_CHAR: HashMap<u8, char> = bytes_char();
-    static ref CHAR_BYTES: HashMap<char, u8> =
-        bytes_char().into_iter().map(|(c, b)| (b, c)).collect();
-}
+/// Regex that matches exactly one token.
+/// See https://github.com/openai/gpt-2/blob/master/src/encoder.py#L98
+static RE: LazyLock<SysRegex> = LazyLock::new(|| {
+    SysRegex::new(r"'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+")
+        .unwrap()
+});
+static BYTES_CHAR: LazyLock<HashMap<u8, char>> = LazyLock::new(bytes_char);
+static CHAR_BYTES: LazyLock<HashMap<char, u8>> =
+    LazyLock::new(|| bytes_char().into_iter().map(|(c, b)| (b, c)).collect());

 #[derive(Copy, Clone, Debug, PartialEq, Eq)]
 /// Provides all the necessary steps to handle the BPE tokenization at the byte-level. Takes care

View File

@@ -1,3 +1,5 @@
+use std::sync::LazyLock;
+
 use regex::Regex;

 use crate::tokenizer::{
@@ -17,9 +19,7 @@ impl Default for Whitespace {
 impl PreTokenizer for Whitespace {
     fn pre_tokenize(&self, pretokenized: &mut PreTokenizedString) -> Result<()> {
-        lazy_static! {
-            static ref RE: Regex = Regex::new(r"\w+|[^\w\s]+").unwrap();
-        }
+        static RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\w+|[^\w\s]+").unwrap());
         let re_ref: &Regex = &RE;

         pretokenized.split(|_, normalized| {
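
One detail the hunk above shows: a LazyLock static can be declared inside a function body, keeping the regex next to its only call site while still being compiled exactly once across all calls. A small sketch of that shape, assuming the regex crate and a hypothetical count_words function:

    use regex::Regex;
    use std::sync::LazyLock;

    fn count_words(s: &str) -> usize {
        // Compiled on the first call; every later call reuses the same Regex.
        static RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\w+").unwrap());
        RE.find_iter(s).count()
    }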

View File

@@ -5,6 +5,7 @@ use aho_corasick::{AhoCorasick, AhoCorasickBuilder, MatchKind};
 use regex::Regex;
 use serde::{ser::SerializeSeq, Deserialize, Serialize, Serializer};
 use std::collections::{HashMap, HashSet};
+use std::sync::LazyLock;
/// Represent a token added by the user on top of the existing Model vocabulary. /// Represent a token added by the user on top of the existing Model vocabulary.
/// AddedToken can be configured to specify the behavior they should have in various situations /// AddedToken can be configured to specify the behavior they should have in various situations
@@ -94,12 +95,10 @@ impl std::hash::Hash for AddedToken {
 type MatchingSet = (AhoCorasick, Vec<u32>);

-lazy_static! {
-    static ref STARTS_WITH_WORD: Regex = Regex::new(r"^\w").unwrap();
-    static ref ENDS_WITH_WORD: Regex = Regex::new(r"\w$").unwrap();
-    static ref RIGHTMOST_SPACE_AT_START: Regex = Regex::new(r"^\s*").unwrap();
-    static ref LEFTMOST_SPACE_AT_END: Regex = Regex::new(r"\s*$").unwrap();
-}
+static STARTS_WITH_WORD: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^\w").unwrap());
+static ENDS_WITH_WORD: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\w$").unwrap());
+static RIGHTMOST_SPACE_AT_START: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^\s*").unwrap());
+static LEFTMOST_SPACE_AT_END: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\s*$").unwrap());

 fn ends_with_word(sentence: &str) -> bool {
     ENDS_WITH_WORD.is_match(sentence)