mirror of
https://github.com/mii443/tokenizers.git
synced 2025-09-02 07:19:24 +00:00
Python - expose get_vocab
on Tokenizer
This commit is contained in:
@ -4,6 +4,7 @@ use pyo3::exceptions;
|
||||
use pyo3::prelude::*;
|
||||
use pyo3::types::*;
|
||||
use pyo3::PyObjectProtocol;
|
||||
use std::collections::HashMap;
|
||||
|
||||
use super::decoders::Decoder;
|
||||
use super::encoding::Encoding;
|
||||
@ -87,20 +88,13 @@ impl Tokenizer {
|
||||
.map_or(0, |p| p.as_ref().added_tokens(is_pair)))
|
||||
}
|
||||
|
||||
#[args(kwargs = "**")]
|
||||
fn get_vocab_size(&self, kwargs: Option<&PyDict>) -> PyResult<usize> {
|
||||
let mut with_added_tokens = true;
|
||||
|
||||
if let Some(kwargs) = kwargs {
|
||||
for (key, value) in kwargs {
|
||||
let key: &str = key.extract()?;
|
||||
match key {
|
||||
"with_added_tokens" => with_added_tokens = value.extract()?,
|
||||
_ => println!("Ignored unknown kwarg option {}", key),
|
||||
}
|
||||
}
|
||||
#[args(with_added_tokens = true)]
|
||||
fn get_vocab(&self, with_added_tokens: bool) -> PyResult<HashMap<String, u32>> {
|
||||
Ok(self.tokenizer.get_vocab(with_added_tokens))
|
||||
}
|
||||
|
||||
#[args(with_added_tokens = true)]
|
||||
fn get_vocab_size(&self, with_added_tokens: bool) -> PyResult<usize> {
|
||||
Ok(self.tokenizer.get_vocab_size(with_added_tokens))
|
||||
}
|
||||
|
||||
|
@ -258,12 +258,26 @@ class Tokenizer:
|
||||
:return:
|
||||
"""
|
||||
pass
|
||||
def get_vocab_size(self, with_added_tokens: Optional[bool]) -> int:
|
||||
def get_vocab(self, with_added_tokens: bool = True) -> Dict[str, int]:
|
||||
""" Returns the vocabulary
|
||||
|
||||
Args:
|
||||
with_added_tokens: boolean:
|
||||
Whether to include the added tokens in the vocabulary
|
||||
|
||||
Returns:
|
||||
The vocabulary
|
||||
"""
|
||||
pass
|
||||
def get_vocab_size(self, with_added_tokens: bool = True) -> int:
|
||||
""" Returns the size of the vocabulary
|
||||
|
||||
Args:
|
||||
with_added_tokens: (`optional`) boolean:
|
||||
with_added_tokens: boolean:
|
||||
Whether to include the added tokens in the vocabulary's size
|
||||
|
||||
Returns:
|
||||
The size of the vocabulary
|
||||
"""
|
||||
pass
|
||||
def enable_truncation(self, max_length: int, stride: Optional[int], strategy: Optional[str]):
|
||||
|
Reference in New Issue
Block a user