Mirror of https://github.com/mii443/tokenizers.git (synced 2025-08-23 00:35:35 +00:00)
Update get_vocab_size to compute the actual length of get_vocab
@@ -668,12 +668,11 @@ where
     pub fn get_vocab_size(&self, with_added_tokens: bool) -> usize {
         // TODO ArthurZ THIS IS WRONG! We need to measure the length of the `set` because
         // now some tokens can be both in the added_tokens_encoder and in the vocab
-        self.model.get_vocab_size()
-            + if with_added_tokens {
-                self.added_vocabulary.len()
-            } else {
-                0
-            }
+        if with_added_tokens {
+            self.get_vocab(with_added_tokens).len()
+        } else {
+            self.model.get_vocab_size()
+        }
     }
 
     /// Converts a token in the corresponding id.
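For illustration only, a minimal standalone Rust sketch of why the old arithmetic can overcount: it does not use the tokenizers API, and the names base_vocab, added_tokens, and merged are hypothetical. When an added token (here "<unk>") already exists in the base vocabulary, summing the two sizes counts it twice, while measuring the length of the merged vocabulary map (which is effectively what get_vocab(with_added_tokens).len() does) counts each distinct token once.

use std::collections::HashMap;

fn main() {
    // Hypothetical base model vocabulary (token -> id).
    let base_vocab: HashMap<&str, u32> =
        HashMap::from([("hello", 0), ("world", 1), ("<unk>", 2)]);

    // Hypothetical added tokens; note that "<unk>" is also in the base vocab.
    let added_tokens = ["<unk>", "<custom>"];

    // Old-style count: base size plus number of added tokens.
    // The duplicate "<unk>" is counted twice, so this reports 5.
    let old_count = base_vocab.len() + added_tokens.len();

    // New-style count: merge everything into one map, then measure its length.
    // The duplicate collapses, so this reports 4.
    let mut merged = base_vocab.clone();
    for (i, tok) in added_tokens.iter().copied().enumerate() {
        merged.entry(tok).or_insert((base_vocab.len() + i) as u32);
    }
    let new_count = merged.len();

    println!("old count = {old_count}, new count = {new_count}");
    assert_eq!(old_count, 5);
    assert_eq!(new_count, 4);
}

In this sketch the old sum reports 5 even though only 4 distinct tokens exist, which is the double-counting the commit message and the TODO comment refer to.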