mirror of
https://github.com/mii443/tokenizers.git
synced 2025-12-08 05:38:23 +00:00
Add a way to specify the unknown token in the SentencePieceUnigramTokenizer Python implementation (#762)
* add a way to specify the unknown token in `SentencePieceUnigramTokenizer`
* add a test that verifies that an exception is raised when the unknown token is missing
* style
* add test tokens
This commit is contained in:
@@ -49,13 +49,29 @@ class SentencePieceUnigramTokenizer(BaseTokenizer):
|
||||
vocab_size: int = 8000,
|
||||
show_progress: bool = True,
|
||||
special_tokens: List[Union[str, AddedToken]] = [],
|
||||
unk_token: Optional[str] = None,
|
||||
):
|
||||
""" Train the model using the given files """
|
||||
"""
|
||||
Train the model using the given files
|
||||
|
||||
Args:
|
||||
files (:obj:`List[str]`):
|
||||
A list of path to the files that we should use for training
|
||||
vocab_size (:obj:`int`):
|
||||
The size of the final vocabulary, including all tokens and alphabet.
|
||||
show_progress (:obj:`bool`):
|
||||
Whether to show progress bars while training.
|
||||
special_tokens (:obj:`List[Union[str, AddedToken]]`):
|
||||
A list of special tokens the model should know of.
|
||||
unk_token (:obj:`str`, `optional`):
|
||||
The unknown token to be used by the model.
|
||||
"""
|
||||
|
||||
trainer = trainers.UnigramTrainer(
|
||||
vocab_size=vocab_size,
|
||||
special_tokens=special_tokens,
|
||||
show_progress=show_progress,
|
||||
unk_token=unk_token,
|
||||
)
|
||||
|
||||
if isinstance(files, str):
|
||||
@@ -68,13 +84,29 @@ class SentencePieceUnigramTokenizer(BaseTokenizer):
|
||||
vocab_size: int = 8000,
|
||||
show_progress: bool = True,
|
||||
special_tokens: List[Union[str, AddedToken]] = [],
|
||||
unk_token: Optional[str] = None,
|
||||
):
|
||||
""" Train the model using the given iterator """
|
||||
"""
|
||||
Train the model using the given iterator
|
||||
|
||||
Args:
|
||||
iterator (:obj:`Union[Iterator[str], Iterator[Iterator[str]]]`):
|
||||
Any iterator over strings or list of strings
|
||||
vocab_size (:obj:`int`):
|
||||
The size of the final vocabulary, including all tokens and alphabet.
|
||||
show_progress (:obj:`bool`):
|
||||
Whether to show progress bars while training.
|
||||
special_tokens (:obj:`List[Union[str, AddedToken]]`):
|
||||
A list of special tokens the model should know of.
|
||||
unk_token (:obj:`str`, `optional`):
|
||||
The unknown token to be used by the model.
|
||||
"""
|
||||
|
||||
trainer = trainers.UnigramTrainer(
|
||||
vocab_size=vocab_size,
|
||||
special_tokens=special_tokens,
|
||||
show_progress=show_progress,
|
||||
unk_token=unk_token,
|
||||
)
|
||||
|
||||
self._tokenizer.train_from_iterator(iterator, trainer=trainer)
|
||||
|
||||
Reference in New Issue
Block a user