Mirror of https://github.com/mii443/tokenizers.git
Handle vocab size with added tokens
@@ -36,9 +36,8 @@ impl Tokenizer {
         }
     }
 
-    #[getter]
-    fn get_vocab_size(&self) -> usize {
-        self.tokenizer.get_vocab_size()
+    fn get_vocab_size(&self, with_added_tokens: bool) -> usize {
+        self.tokenizer.get_vocab_size(with_added_tokens)
     }
 
     fn with_model(&mut self, model: &mut Model) -> PyResult<()> {
@@ -210,8 +210,13 @@ impl Tokenizer {
     }
 
     /// Get the size of the vocabulary
-    pub fn get_vocab_size(&self) -> usize {
-        self.model.get_vocab_size()
+    pub fn get_vocab_size(&self, with_added_tokens: bool) -> usize {
+        self.model.get_vocab_size()
+            + if with_added_tokens {
+                self.added_tokens.len()
+            } else {
+                0
+            }
     }
 
     /// Converts a token in the corresponding id.
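The effect of this library-side change, restated as a standalone sketch (Model, added_tokens, and the example values below are simplified stand-ins, not the crate's real types): the reported size is the model vocabulary plus, only when requested, the count of added tokens.

// Self-contained sketch of the new counting rule; types are stand-ins.
struct Model {
    vocab: Vec<String>,
}

impl Model {
    fn get_vocab_size(&self) -> usize {
        self.vocab.len()
    }
}

struct Tokenizer {
    model: Model,
    added_tokens: Vec<String>,
}

impl Tokenizer {
    // Mirrors the patched method: model vocab, plus added tokens on demand.
    pub fn get_vocab_size(&self, with_added_tokens: bool) -> usize {
        self.model.get_vocab_size()
            + if with_added_tokens {
                self.added_tokens.len()
            } else {
                0
            }
    }
}

fn main() {
    let tokenizer = Tokenizer {
        model: Model { vocab: vec!["hello".into(), "world".into()] },
        added_tokens: vec!["[PAD]".into(), "[CLS]".into()],
    };
    assert_eq!(tokenizer.get_vocab_size(false), 2); // model vocab only
    assert_eq!(tokenizer.get_vocab_size(true), 4);  // model vocab + added tokens
}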