replace lazy_static with stabilized std::sync::LazyLock in 1.80 (#1739)
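For readers unfamiliar with the pattern, a minimal standalone sketch of the migration this commit applies throughout the crate (the SQUARES map is illustrative, not taken from the diff):

use std::collections::HashMap;
use std::sync::LazyLock;

// Before, with the external lazy_static crate:
//
//     lazy_static! {
//         static ref SQUARES: HashMap<u32, u32> =
//             (0..10).map(|n| (n, n * n)).collect();
//     }
//
// After, with std only (`std::sync::LazyLock` is stable since Rust 1.80):
static SQUARES: LazyLock<HashMap<u32, u32>> =
    LazyLock::new(|| (0..10).map(|n| (n, n * n)).collect());

fn main() {
    // The initializer runs on first access, exactly once; later reads reuse it.
    assert_eq!(SQUARES.get(&3), Some(&9));
}

Both forms give a lazily initialized, thread-safe global, and both dereference to the inner type, so call sites stay unchanged.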
--- a/tokenizers/Cargo.toml
+++ b/tokenizers/Cargo.toml
@@ -42,7 +42,6 @@ required-features = ["http"]
 harness = false
 
 [dependencies]
-lazy_static = "1.4"
 rand = "0.8"
 onig = { version = "6.4", default-features = false, optional = true }
 regex = "1.10"
--- a/tokenizers/src/lib.rs
+++ b/tokenizers/src/lib.rs
@@ -130,8 +130,6 @@
 
 #[macro_use]
 extern crate log;
-#[macro_use]
-extern crate lazy_static;
 
 #[macro_use]
 extern crate derive_builder;
--- a/tokenizers/src/normalizers/byte_level.rs
+++ b/tokenizers/src/normalizers/byte_level.rs
@@ -2,16 +2,13 @@ use crate::processors::byte_level::bytes_char;
 use crate::tokenizer::{NormalizedString, Normalizer, Result};
 use crate::utils::macro_rules_attribute;
 use std::collections::{HashMap, HashSet};
+use std::sync::LazyLock;
 
 #[derive(Clone, Debug)]
 #[macro_rules_attribute(impl_serde_type!)]
 pub struct ByteLevel;
 
-lazy_static! {
-    static ref BYTES_CHAR: HashMap<u8, char> = bytes_char();
-    static ref CHAR_BYTES: HashMap<char, u8> =
-        bytes_char().into_iter().map(|(c, b)| (b, c)).collect();
-}
+static BYTES_CHAR: LazyLock<HashMap<u8, char>> = LazyLock::new(bytes_char);
 
 impl Default for ByteLevel {
     fn default() -> Self {
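Two details in the hunk above: the replacement keeps only BYTES_CHAR (no LazyLock version of CHAR_BYTES appears, so this normalizer apparently never used the reverse map), and it passes the `bytes_char` function itself as the initializer, since any `fn() -> T` satisfies the `FnOnce() -> T` bound of `LazyLock::new`. A small sketch of that form; the three-entry table is a stand-in for the crate's real builder:

use std::collections::HashMap;
use std::sync::LazyLock;

// Stand-in for the crate's real byte-to-char table builder.
fn bytes_char() -> HashMap<u8, char> {
    (b'a'..=b'c').map(|b| (b, b as char)).collect()
}

// A plain function is itself a valid initializer; no `|| ...` wrapper needed.
static BYTES_CHAR: LazyLock<HashMap<u8, char>> = LazyLock::new(bytes_char);

fn main() {
    assert_eq!(BYTES_CHAR.get(&b'a'), Some(&'a'));
}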
--- a/tokenizers/src/pre_tokenizers/byte_level.rs
+++ b/tokenizers/src/pre_tokenizers/byte_level.rs
@@ -1,4 +1,5 @@
 use std::collections::{HashMap, HashSet};
+use std::sync::LazyLock;
 
 use crate::utils::SysRegex;
 use serde::{Deserialize, Serialize};
@@ -37,17 +38,15 @@ pub(crate) fn bytes_char() -> HashMap<u8, char> {
     .collect()
 }
 
-lazy_static! {
-    /// Regex that matches exactly one token.
-    /// See https://github.com/openai/gpt-2/blob/master/src/encoder.py#L98
-    static ref RE: SysRegex = SysRegex::new(
-        r"'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"
-    )
-    .unwrap();
-    static ref BYTES_CHAR: HashMap<u8, char> = bytes_char();
-    static ref CHAR_BYTES: HashMap<char, u8> =
-        bytes_char().into_iter().map(|(c, b)| (b, c)).collect();
-}
+/// Regex that matches exactly one token.
+/// See https://github.com/openai/gpt-2/blob/master/src/encoder.py#L98
+static RE: LazyLock<SysRegex> = LazyLock::new(|| {
+    SysRegex::new(r"'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+")
+        .unwrap()
+});
+static BYTES_CHAR: LazyLock<HashMap<u8, char>> = LazyLock::new(bytes_char);
+static CHAR_BYTES: LazyLock<HashMap<char, u8>> =
+    LazyLock::new(|| bytes_char().into_iter().map(|(c, b)| (b, c)).collect());
 
 #[derive(Copy, Clone, Debug, PartialEq, Eq)]
 /// Provides all the necessary steps to handle the BPE tokenization at the byte-level. Takes care
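The block form `LazyLock::new(|| { ... })` used for RE above accommodates fallible setup: `SysRegex::new` returns a `Result`, so the closure unwraps it and any panic happens at first use rather than at startup, exactly as with `lazy_static`. Doc comments also attach directly to the static now instead of sitting inside a macro invocation. A sketch of the same shape, assuming the regex crate and an illustrative pattern:

use regex::Regex;
use std::sync::LazyLock;

/// Matches runs of Unicode letters.
static WORDS: LazyLock<Regex> = LazyLock::new(|| {
    // A malformed pattern would panic here, on first deref, not at startup.
    Regex::new(r"\p{L}+").unwrap()
});

fn main() {
    assert!(WORDS.is_match("déjà"));
}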
--- a/tokenizers/src/pre_tokenizers/whitespace.rs
+++ b/tokenizers/src/pre_tokenizers/whitespace.rs
@@ -1,3 +1,5 @@
+use std::sync::LazyLock;
+
 use regex::Regex;
 
 use crate::tokenizer::{
@@ -17,9 +19,7 @@ impl Default for Whitespace {
 
 impl PreTokenizer for Whitespace {
     fn pre_tokenize(&self, pretokenized: &mut PreTokenizedString) -> Result<()> {
-        lazy_static! {
-            static ref RE: Regex = Regex::new(r"\w+|[^\w\s]+").unwrap();
-        }
+        static RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\w+|[^\w\s]+").unwrap());
         let re_ref: &Regex = &RE;
 
         pretokenized.split(|_, normalized| {
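Unlike the other sites, this static lives inside `pre_tokenize`: a `static` in a function body is still a single program-wide value, only its name is scoped to the function. The existing `let re_ref: &Regex = &RE;` line keeps working because `&RE` deref-coerces `LazyLock<Regex>` to `&Regex`, initializing it on first call. A sketch of the same shape, reusing the diff's pattern but with an illustrative function name (assumes the regex crate):

use regex::Regex;
use std::sync::LazyLock;

fn word_chunks(s: &str) -> Vec<&str> {
    // Initialized once for the whole program, on the first call.
    static RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\w+|[^\w\s]+").unwrap());
    RE.find_iter(s).map(|m| m.as_str()).collect()
}

fn main() {
    assert_eq!(word_chunks("hey!"), vec!["hey", "!"]);
}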
--- a/tokenizers/src/tokenizer/added_vocabulary.rs
+++ b/tokenizers/src/tokenizer/added_vocabulary.rs
@@ -5,6 +5,7 @@ use aho_corasick::{AhoCorasick, AhoCorasickBuilder, MatchKind};
 use regex::Regex;
 use serde::{ser::SerializeSeq, Deserialize, Serialize, Serializer};
 use std::collections::{HashMap, HashSet};
+use std::sync::LazyLock;
 
 /// Represent a token added by the user on top of the existing Model vocabulary.
 /// AddedToken can be configured to specify the behavior they should have in various situations
@@ -94,12 +95,10 @@ impl std::hash::Hash for AddedToken {
 
 type MatchingSet = (AhoCorasick, Vec<u32>);
 
-lazy_static! {
-    static ref STARTS_WITH_WORD: Regex = Regex::new(r"^\w").unwrap();
-    static ref ENDS_WITH_WORD: Regex = Regex::new(r"\w$").unwrap();
-    static ref RIGHTMOST_SPACE_AT_START: Regex = Regex::new(r"^\s*").unwrap();
-    static ref LEFTMOST_SPACE_AT_END: Regex = Regex::new(r"\s*$").unwrap();
-}
+static STARTS_WITH_WORD: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^\w").unwrap());
+static ENDS_WITH_WORD: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\w$").unwrap());
+static RIGHTMOST_SPACE_AT_START: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^\s*").unwrap());
+static LEFTMOST_SPACE_AT_END: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\s*$").unwrap());
 
 fn ends_with_word(sentence: &str) -> bool {
     ENDS_WITH_WORD.is_match(sentence)
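The swap is behavior-preserving because `LazyLock` offers the same guarantee `lazy_static` did: one thread runs the initializer, concurrent readers block until it finishes, and all of them see the same value. A small demonstration of that guarantee, not taken from the PR:

use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::LazyLock;
use std::thread;

static INIT_COUNT: AtomicUsize = AtomicUsize::new(0);
static VALUE: LazyLock<u64> = LazyLock::new(|| {
    INIT_COUNT.fetch_add(1, Ordering::SeqCst);
    42
});

fn main() {
    // Eight threads race to read VALUE; the initializer still runs exactly once.
    thread::scope(|s| {
        for _ in 0..8 {
            s.spawn(|| assert_eq!(*VALUE, 42));
        }
    });
    assert_eq!(INIT_COUNT.load(Ordering::SeqCst), 1);
}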