From 2b72017e17c36a7f90a3301997614f2e2818a511 Mon Sep 17 00:00:00 2001
From: Arthur Zucker
Date: Fri, 1 Sep 2023 19:03:33 +0000
Subject: [PATCH] correctly compute the new id: we take the max of the
 AddedToken + get_vocab_size

---
 tokenizers/src/tokenizer/added_vocabulary.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tokenizers/src/tokenizer/added_vocabulary.rs b/tokenizers/src/tokenizer/added_vocabulary.rs
index a02527f3..1b7dd314 100644
--- a/tokenizers/src/tokenizer/added_vocabulary.rs
+++ b/tokenizers/src/tokenizer/added_vocabulary.rs
@@ -264,7 +264,7 @@ impl AddedVocabulary {
             let id = if let Some(id) = self.token_to_id(&token.content, model) {
                 id
             } else {
-                let new_id = (model.get_vocab_size() + self.added_tokens_map.len()) as u32;
+                let new_id = (model.get_vocab_size() + self.added_tokens_map_r.keys().max().copied().unwrap_or(0) as usize) as u32;
                 new_id
             };
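
Below is a minimal, self-contained sketch of the id computation the patched
line performs: the model vocabulary size plus the largest key already present
in the reverse added-token map, defaulting to 0 when the map is empty. The
`next_added_id` helper and the `HashMap<u32, String>` toy map are hypothetical
stand-ins for the crate's `AddedVocabulary` state and its `added_tokens_map_r`
field; only the arithmetic mirrors the patch.

    use std::collections::HashMap;

    // Toy stand-in for AddedVocabulary::added_tokens_map_r (id -> token).
    fn next_added_id(vocab_size: usize, added_tokens_map_r: &HashMap<u32, String>) -> u32 {
        // Largest key already handed out to an added token (0 if the map
        // is empty), offset by the model vocab size, as in the patched line.
        let max_added = added_tokens_map_r.keys().max().copied().unwrap_or(0);
        (vocab_size + max_added as usize) as u32
    }

    fn main() {
        let mut added: HashMap<u32, String> = HashMap::new();
        added.insert(0, "<tok_a>".to_string());
        added.insert(3, "<tok_b>".to_string());
        // With a model vocab of 100 and a largest added key of 3,
        // the computed id is 100 + 3 = 103.
        assert_eq!(next_added_id(100, &added), 103);
        // An empty map falls back to the vocab size itself.
        assert_eq!(next_added_id(100, &HashMap::new()), 100);
    }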