diff --git a/tokenizers/src/tokenizer/mod.rs b/tokenizers/src/tokenizer/mod.rs index d35a688e..80b662ff 100644 --- a/tokenizers/src/tokenizer/mod.rs +++ b/tokenizers/src/tokenizer/mod.rs @@ -577,7 +577,12 @@ impl Tokenizer { }) .collect::>(); - self.split_re = Some(regex::Regex::new(&format!(r"({})", added_tokens.join("|"))).unwrap()); + if added_tokens.is_empty() { + self.split_re = None; + } else { + self.split_re = + Some(regex::Regex::new(&format!(r"({})", added_tokens.join("|"))).unwrap()); + } // Return the number of added tokens tokens.len() - ignored