Mirror of https://github.com/mii443/tokenizers.git, synced 2025-08-22 16:25:30 +00:00
Fix a bug when adding special tokens
If we add special tokens that are already part of the model's vocabulary, no tokens are actually added to the tokenizer, which then builds an empty regex. This completely breaks the tokenization.
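To see why, here is a minimal standalone sketch (my own illustration, not the crate's code) of what happens when the list of newly added tokens ends up empty and the pattern is still built, assuming the same regex crate used in the diff:

use regex::Regex;

fn main() {
    // No special tokens were actually added (they were already in the vocab),
    // so the joined pattern is "()": a group that matches the empty string
    // at every position in the input.
    let added_tokens: Vec<String> = vec![];
    let re = Regex::new(&format!(r"({})", added_tokens.join("|"))).unwrap();
    // Splitting on an always-matching empty pattern shreds the text into
    // empty and single-character fragments instead of leaving it untouched.
    let pieces: Vec<&str> = re.split("hello world").collect();
    println!("{:?}", pieces); // ["", "h", "e", "l", ...], not ["hello world"]
}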
@@ -577,7 +577,12 @@ impl Tokenizer {
             })
             .collect::<Vec<_>>();
 
-        self.split_re = Some(regex::Regex::new(&format!(r"({})", added_tokens.join("|"))).unwrap());
+        if added_tokens.is_empty() {
+            self.split_re = None;
+        } else {
+            self.split_re =
+                Some(regex::Regex::new(&format!(r"({})", added_tokens.join("|"))).unwrap());
+        }
 
         // Return the number of added tokens
         tokens.len() - ignored
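For reference, a self-contained sketch of the guarded construction the diff introduces. The Tokenizer struct and rebuild_split_re method below are simplified stand-ins for illustration, not the crate's actual types:

use regex::Regex;

struct Tokenizer {
    split_re: Option<Regex>,
}

impl Tokenizer {
    fn rebuild_split_re(&mut self, added_tokens: &[String]) {
        if added_tokens.is_empty() {
            // Nothing was added: keep no split regex at all instead of
            // compiling the degenerate pattern "()".
            self.split_re = None;
        } else {
            self.split_re =
                Some(Regex::new(&format!(r"({})", added_tokens.join("|"))).unwrap());
        }
    }
}

fn main() {
    let mut tok = Tokenizer { split_re: None };
    tok.rebuild_split_re(&[]); // tokens were already in the vocab -> none added
    assert!(tok.split_re.is_none());
    tok.rebuild_split_re(&["<bos>".to_string(), "<eos>".to_string()]);
    assert!(tok.split_re.is_some());
}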