diff --git a/bindings/python/src/tokenizer.rs b/bindings/python/src/tokenizer.rs
index 8a25d64c..7c3f00c9 100644
--- a/bindings/python/src/tokenizer.rs
+++ b/bindings/python/src/tokenizer.rs
@@ -36,9 +36,8 @@ impl Tokenizer {
         }
     }
 
-    #[getter]
-    fn get_vocab_size(&self) -> usize {
-        self.tokenizer.get_vocab_size()
+    fn get_vocab_size(&self, with_added_tokens: bool) -> usize {
+        self.tokenizer.get_vocab_size(with_added_tokens)
     }
 
     fn with_model(&mut self, model: &mut Model) -> PyResult<()> {
diff --git a/tokenizers/src/tokenizer/mod.rs b/tokenizers/src/tokenizer/mod.rs
index ee8db5b2..d35a688e 100644
--- a/tokenizers/src/tokenizer/mod.rs
+++ b/tokenizers/src/tokenizer/mod.rs
@@ -210,8 +210,13 @@ impl Tokenizer {
     }
 
     /// Get the size of the vocabulary
-    pub fn get_vocab_size(&self) -> usize {
+    pub fn get_vocab_size(&self, with_added_tokens: bool) -> usize {
         self.model.get_vocab_size()
+            + if with_added_tokens {
+                self.added_tokens.len()
+            } else {
+                0
+            }
     }
 
     /// Converts a token in the corresponding id.