mirror of
https://github.com/mii443/tokenizers.git
synced 2025-12-12 05:18:39 +00:00
Merge pull request #99 from kdexd/get-vocab-size
Expose get_vocab_size in tokenizer python API.
This commit is contained in:
@@ -13,6 +13,18 @@ class BaseTokenizer:
|
|||||||
self._tokenizer.get_vocab_size(),
|
self._tokenizer.get_vocab_size(),
|
||||||
', '.join(k + '=' + str(v) for k, v in self._parameters.items()))
|
', '.join(k + '=' + str(v) for k, v in self._parameters.items()))
|
||||||
|
|
||||||
|
def get_vocab_size(self, with_added_tokens: bool = True):
|
||||||
|
""" Return the size of vocabulary, with or without added tokens.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
with_added_tokens: (`optional`) bool:
|
||||||
|
Whether to count in added special tokens or not
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Size of vocabulary
|
||||||
|
"""
|
||||||
|
return self._tokenizer.get_vocab_size(with_added_tokens=with_added_tokens)
|
||||||
|
|
||||||
def enable_padding(self,
|
def enable_padding(self,
|
||||||
direction: Optional[str] = "right",
|
direction: Optional[str] = "right",
|
||||||
pad_id: Optional[int] = 0,
|
pad_id: Optional[int] = 0,
|
||||||
|
|||||||
Reference in New Issue
Block a user