Python - Add missing get_vocab to BaseTokenizer

This commit is contained in:
Anthony MOI
2020-04-01 11:32:54 -04:00
parent b770f36428
commit 2a84ef12cf

View File

@@ -25,7 +25,19 @@ class BaseTokenizer:
"""
return self._tokenizer.num_special_tokens_to_add(is_pair)
def get_vocab_size(self, with_added_tokens: bool = True):
def get_vocab(self, with_added_tokens: bool = True) -> Dict[str, int]:
    """ Return the full vocabulary as a mapping from token string to token id.

    Args:
        with_added_tokens: boolean:
            If True, the mapping also includes the tokens that were added
            on top of the base vocabulary.

    Returns:
        A dict mapping each token to its integer id.
    """
    # Delegate straight to the underlying Rust tokenizer.
    vocab = self._tokenizer.get_vocab(with_added_tokens=with_added_tokens)
    return vocab
def get_vocab_size(self, with_added_tokens: bool = True) -> int:
""" Return the size of vocabulary, with or without added tokens.
Args: