From e5fc051ad2776199896e17ca9eaf7bb885f7e182 Mon Sep 17 00:00:00 2001
From: Arthur Zucker
Date: Tue, 5 Sep 2023 13:34:43 +0000
Subject: [PATCH] update

---
 tokenizers/src/tokenizer/added_vocabulary.rs | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/tokenizers/src/tokenizer/added_vocabulary.rs b/tokenizers/src/tokenizer/added_vocabulary.rs
index 58b21f5e..1d7d2961 100644
--- a/tokenizers/src/tokenizer/added_vocabulary.rs
+++ b/tokenizers/src/tokenizer/added_vocabulary.rs
@@ -257,11 +257,16 @@ impl AddedVocabulary {
         let new_id = if let Some(new_id) = self.token_to_id(&token.content, model) {
             new_id
         } else {
-            self.added_tokens_map
-                .values()
-                .cloned()
-                .max()
-                .map_or(model.get_vocab_size() as u32, |max| max + 1)
+            self.added_tokens_map.values().cloned().max().map_or(
+                model.get_vocab_size() as u32,
+                |max| {
+                    if max >= (model.get_vocab_size() as u32) || model.get_vocab_size() == 0 {
+                        max + 1
+                    } else {
+                        model.get_vocab_size() as u32
+                    }
+                },
+            )
         };
         // Make sure we modify the previous entry
         self.added_tokens_map
@@ -681,7 +686,6 @@ mod tests {
 
         token.special = true;
         assert_eq!(token.special, true); // Token was already there
-
     }
 
     #[test]
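
The substantive change is the id-selection rule for a token that is not yet in the vocabulary. Previously the fallback was always `max + 1` over the existing added-token ids; the new closure only keeps counting up from that maximum when it already reaches past the model vocabulary (or the model vocabulary is empty), and otherwise jumps to `model.get_vocab_size()` so a fresh added token cannot collide with a model id. Below is a minimal standalone sketch of that rule, not part of the patch; the function `next_added_token_id` and its parameters are hypothetical stand-ins for `self.added_tokens_map.values()` and `model.get_vocab_size()`:

// Sketch of the new id-selection rule with hypothetical stand-ins:
// `added_ids` plays the role of self.added_tokens_map.values() and
// `vocab_size` the role of model.get_vocab_size().
fn next_added_token_id(added_ids: &[u32], vocab_size: usize) -> u32 {
    added_ids.iter().copied().max().map_or(vocab_size as u32, |max| {
        if max >= vocab_size as u32 || vocab_size == 0 {
            // Added ids already extend past the model vocabulary (or the
            // model is empty): keep growing from the current maximum.
            max + 1
        } else {
            // Every added id sits inside the model vocabulary: start right
            // after it so the new id cannot collide with a model token.
            vocab_size as u32
        }
    })
}

fn main() {
    assert_eq!(next_added_token_id(&[], 100), 100); // no added tokens yet
    assert_eq!(next_added_token_id(&[100, 101], 100), 102); // grow past the max
    assert_eq!(next_added_token_id(&[5], 100), 100); // old rule would return 6
    assert_eq!(next_added_token_id(&[3], 0), 4); // empty model vocabulary
}

The third assertion shows what the change appears to guard against: under the old `max + 1` rule, an added token registered with an id inside the model's range would make the next allocation land on an id already owned by a model token.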