Merge pull request #99 from kdexd/get-vocab-size

Expose get_vocab_size in tokenizer python API.
2025-12-09 14:18:30 +00:00 · 2020-02-03 11:52:29 -05:00
parent 0094393610 b027c63c37
commit a48b337d7b
1 changed file with 12 additions and 0 deletions
--- a/bindings/python/tokenizers/implementations/base_tokenizer.py
+++ b/bindings/python/tokenizers/implementations/base_tokenizer.py
@@ -13,6 +13,18 @@ class BaseTokenizer:
            self._tokenizer.get_vocab_size(),
            ', '.join(k + '=' + str(v) for k, v in self._parameters.items()))

+    def get_vocab_size(self, with_added_tokens: bool = True):
+        """Return the size of the vocabulary, with or without added tokens.
+
+        Args:
+            with_added_tokens: (`optional`) bool, defaults to True:
+                Whether to include the added tokens in the count; forwarded as-is
+
+        Returns:
+            Size of the vocabulary, as reported by the wrapped `self._tokenizer`
+        """
+        return self._tokenizer.get_vocab_size(with_added_tokens=with_added_tokens)
+
    def enable_padding(self,
                       direction: Optional[str] = "right",
                       pad_id: Optional[int] = 0,