From 531b06f6dbf387f2ff8b7380093f160c32c451a8 Mon Sep 17 00:00:00 2001
From: Arthur Zucker
Date: Tue, 5 Sep 2023 15:19:50 +0000
Subject: [PATCH] update `get_vocab_size` to compute the actual length of
 `get_vocab`

---
 tokenizers/src/tokenizer/mod.rs | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/tokenizers/src/tokenizer/mod.rs b/tokenizers/src/tokenizer/mod.rs
index c76addf9..97390d91 100644
--- a/tokenizers/src/tokenizer/mod.rs
+++ b/tokenizers/src/tokenizer/mod.rs
@@ -668,12 +668,11 @@ where
     pub fn get_vocab_size(&self, with_added_tokens: bool) -> usize {
         // TODO ArthurZ THIS IS WRONG! We need to measure the length of the `set` because
         // now some tokens can be both in the added_tokens_encoder and in the vocab
-        self.model.get_vocab_size()
-            + if with_added_tokens {
-                self.added_vocabulary.len()
-            } else {
-                0
-            }
+        if with_added_tokens {
+            self.get_vocab(with_added_tokens).len()
+        } else {
+            self.model.get_vocab_size()
+        }
     }
 
     /// Converts a token in the corresponding id.
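
Note on the fix: the old code summed the model vocab size and the added-vocabulary size, which over-counts whenever a token lives in both maps; counting the merged vocab returned by `get_vocab` collapses such duplicates. The following is a minimal, self-contained sketch (not the tokenizers API; the maps and token names are hypothetical) showing the difference between the two counting strategies:

use std::collections::HashMap;

fn main() {
    // Hypothetical model vocab: token -> id.
    let model_vocab: HashMap<&str, u32> =
        HashMap::from([("hello", 0), ("world", 1), ("<eos>", 2)]);
    // Hypothetical added tokens; "<eos>" also exists in the model vocab.
    let added_vocab: HashMap<&str, u32> =
        HashMap::from([("<eos>", 2), ("<pad>", 3)]);

    // Old approach: a plain sum double-counts the shared "<eos>" token.
    let summed = model_vocab.len() + added_vocab.len();

    // New approach: merge first, then count, so duplicate keys collapse.
    let mut merged = model_vocab.clone();
    merged.extend(added_vocab.iter());
    let merged_len = merged.len();

    assert_eq!(summed, 5);     // over-counted
    assert_eq!(merged_len, 4); // actual number of distinct tokens
    println!("summed = {summed}, merged = {merged_len}");
}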