Update `get_vocab_size` to compute the actual length of the vocabulary returned by `get_vocab`

2025-08-22 16:25:30 +00:00 · 2023-09-05 15:19:50 +00:00
parent f1da83f358
commit 531b06f6db
1 changed files with 5 additions and 6 deletions
--- a/tokenizers/src/tokenizer/mod.rs
+++ b/tokenizers/src/tokenizer/mod.rs
@@ -668,11 +668,10 @@ where
    pub fn get_vocab_size(&self, with_added_tokens: bool) -> usize {
        // TODO ArthurZ THIS IS WRONG! We need to measure the length of the `set` because
        // now some tokens can be both in the added_tokens_encoder and in the vocab
-        self.model.get_vocab_size()
-            + if with_added_tokens {
-                self.added_vocabulary.len()
+        if with_added_tokens {
+            self.get_vocab(with_added_tokens).len()
        } else {
-                0
+            self.model.get_vocab_size()
        }
    }