Handle vocab size with added tokens

This commit is contained in:
Anthony MOI
2019-12-19 20:19:56 -05:00
parent b7040e0412
commit f2b9c30ad9
2 changed files with 8 additions and 4 deletions

View File

@ -36,9 +36,8 @@ impl Tokenizer {
}
}
#[getter]
fn get_vocab_size(&self) -> usize {
self.tokenizer.get_vocab_size()
fn get_vocab_size(&self, with_added_tokens: bool) -> usize {
self.tokenizer.get_vocab_size(with_added_tokens)
}
fn with_model(&mut self, model: &mut Model) -> PyResult<()> {

View File

@ -210,8 +210,13 @@ impl Tokenizer {
}
/// Get the size of the vocabulary
pub fn get_vocab_size(&self) -> usize {
pub fn get_vocab_size(&self, with_added_tokens: bool) -> usize {
self.model.get_vocab_size()
+ if with_added_tokens {
self.added_tokens.len()
} else {
0
}
}
/// Converts a token into the corresponding id.