From 531b06f6dbf387f2ff8b7380093f160c32c451a8 Mon Sep 17 00:00:00 2001
From: Arthur Zucker
Date: Tue, 5 Sep 2023 15:19:50 +0000
Subject: [PATCH] update `get_vocab_size` to compute the actual length of
 `get_vocab`

---
 tokenizers/src/tokenizer/mod.rs | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/tokenizers/src/tokenizer/mod.rs b/tokenizers/src/tokenizer/mod.rs
index c76addf9..97390d91 100644
--- a/tokenizers/src/tokenizer/mod.rs
+++ b/tokenizers/src/tokenizer/mod.rs
@@ -668,12 +668,11 @@ where
     pub fn get_vocab_size(&self, with_added_tokens: bool) -> usize {
         // TODO ArthurZ THIS IS WRONG! We need to measure the length of the `set` because
         // now some tokens can be both in the added_tokens_encoder and in the vocab
-        self.model.get_vocab_size()
-            + if with_added_tokens {
-                self.added_vocabulary.len()
-            } else {
-                0
-            }
+        if with_added_tokens {
+            self.get_vocab(with_added_tokens).len()
+        } else {
+            self.model.get_vocab_size()
+        }
     }
 
     /// Converts a token in the corresponding id.
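
Note on the fix: the old code summed the model vocab size and the added-vocabulary size, which over-counts whenever a token lives in both maps; counting the merged vocab returned by `get_vocab` collapses such duplicates. The following is a minimal, self-contained sketch (not the tokenizers API; the maps and token names are hypothetical) showing the difference between the two counting strategies:

use std::collections::HashMap;

fn main() {
    // Hypothetical model vocab: token -> id.
    let model_vocab: HashMap<&str, u32> =
        HashMap::from([("hello", 0), ("world", 1), ("<eos>", 2)]);
    // Hypothetical added tokens; "<eos>" also exists in the model vocab.
    let added_vocab: HashMap<&str, u32> =
        HashMap::from([("<eos>", 2), ("<pad>", 3)]);

    // Old approach: a plain sum double-counts the shared "<eos>" token.
    let summed = model_vocab.len() + added_vocab.len();

    // New approach: merge first, then count, so duplicate keys collapse.
    let mut merged = model_vocab.clone();
    merged.extend(added_vocab.iter());
    let merged_len = merged.len();

    assert_eq!(summed, 5);     // over-counted
    assert_eq!(merged_len, 4); // actual number of distinct tokens
    println!("summed = {summed}, merged = {merged_len}");
}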