Expose vocabulary size

2025-08-22 16:25:30 +00:00 · 2019-12-10 16:20:31 -05:00
parent 6c294c60b0
commit b4b31d73cd
4 changed files with 19 additions and 0 deletions
--- a/bindings/python/src/tokenizer.rs
+++ b/bindings/python/src/tokenizer.rs
@@ -30,6 +30,11 @@ impl Tokenizer {
        }
    }

+    #[getter] // NOTE(review): presumably surfaces in Python as a read-only attribute — confirm the exposed name against the PyO3 version in use
+    fn get_vocab_size(&self) -> usize {
+        self.tokenizer.get_vocab_size() // pure delegation to the inner `tokenizer` field
+    }
+
    fn with_model(&mut self, model: &mut Model) -> PyResult<()> {
        if let Some(model) = model.model.to_pointer() {
            self.tokenizer.with_model(model);
--- a/tokenizers/src/models/bpe/model.rs
+++ b/tokenizers/src/models/bpe/model.rs
@@ -96,6 +96,10 @@ impl BPE {
 }

 impl Model for BPE {
+    fn get_vocab_size(&self) -> usize { // vocabulary size for the BPE model
+        self.vocab.len() // number of entries currently held in `self.vocab`
+    }
+
    fn tokenize(&self, sentence: Vec<String>) -> Vec<Token> {
        if sentence.len() == 0 {
            return vec![];
--- a/tokenizers/src/models/wordpiece/mod.rs
+++ b/tokenizers/src/models/wordpiece/mod.rs
@@ -48,6 +48,10 @@ impl WordPiece {
 }

 impl Model for WordPiece {
+    fn get_vocab_size(&self) -> usize { // vocabulary size for the WordPiece model
+        self.vocab.len() // number of entries currently held in `self.vocab`
+    }
+
    fn tokenize(&self, sentence: Vec<String>) -> Vec<Token> {
        let mut output_tokens = vec![];

--- a/tokenizers/src/tokenizer.rs
+++ b/tokenizers/src/tokenizer.rs
@@ -37,6 +37,7 @@ pub trait Model {
    fn decode(&self, ids: Vec<u32>) -> Vec<String>;
    fn token_to_id(&self, token: &str) -> Option<u32>;
    fn id_to_token(&self, id: u32) -> Option<String>;
+    fn get_vocab_size(&self) -> usize; // size of the model's vocabulary; required method (no default body), so every `Model` implementor must supply it
 }

 /// A PostProcessor has the responsibility to post process an encoded output of the Tokenizer.
@@ -166,6 +167,11 @@ impl Tokenizer {
        self
    }

+    /// Get the size of the vocabulary, as reported by this tokenizer's model.
+    pub fn get_vocab_size(&self) -> usize {
+        self.model.get_vocab_size() // the tokenizer keeps no count of its own; it forwards to `self.model`
+    }
+
    /// Converts a token in the corresponding id.
    pub fn token_to_id(&self, token: &str) -> Option<u32> {
        self.model.token_to_id(token)