Python - Add length to train_from_iterator in implementations (#937)

dctelus
2022-03-04 08:11:58 -05:00
committed by GitHub
parent 845da6d8e8
commit 4a8f5db067
5 changed files with 30 additions and 5 deletions
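The new length parameter mirrors the one already accepted by the underlying Tokenizer.train_from_iterator that each implementation wraps: when training from a plain generator, which has no len(), it tells the trainer how many sequences to expect so progress reporting can show a meaningful total. A minimal usage sketch (the toy corpus and helper generator are illustrative only, not part of this change):

from tokenizers import ByteLevelBPETokenizer

# Toy in-memory corpus; in practice this would be streamed from disk.
lines = ["hello world", "how are you", "training tokenizers"]

def line_iterator():
    # A generator has no len(), so the trainer cannot infer the total count on its own.
    for line in lines:
        yield line

tokenizer = ByteLevelBPETokenizer()
tokenizer.train_from_iterator(
    line_iterator(),
    vocab_size=1000,
    show_progress=True,
    length=len(lines),  # total number of sequences, forwarded to the trainer
)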

View File

@@ -133,6 +133,7 @@ class BertWordPieceTokenizer(BaseTokenizer):
         ],
         show_progress: bool = True,
         wordpieces_prefix: str = "##",
+        length: Optional[int] = None,
     ):
         """ Train the model using the given iterator """
@@ -145,4 +146,8 @@ class BertWordPieceTokenizer(BaseTokenizer):
             show_progress=show_progress,
             continuing_subword_prefix=wordpieces_prefix,
         )
-        self._tokenizer.train_from_iterator(iterator, trainer=trainer)
+        self._tokenizer.train_from_iterator(
+            iterator,
+            trainer=trainer,
+            length=length,
+        )

View File

@@ -110,6 +110,7 @@ class ByteLevelBPETokenizer(BaseTokenizer):
         min_frequency: int = 2,
         show_progress: bool = True,
         special_tokens: List[Union[str, AddedToken]] = [],
+        length: Optional[int] = None,
     ):
         """ Train the model using the given iterator """
@@ -120,4 +121,8 @@ class ByteLevelBPETokenizer(BaseTokenizer):
             special_tokens=special_tokens,
             initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
         )
-        self._tokenizer.train_from_iterator(iterator, trainer=trainer)
+        self._tokenizer.train_from_iterator(
+            iterator,
+            trainer=trainer,
+            length=length,
+        )

View File

@@ -135,6 +135,7 @@ class CharBPETokenizer(BaseTokenizer):
         initial_alphabet: List[str] = [],
         suffix: Optional[str] = "</w>",
         show_progress: bool = True,
+        length: Optional[int] = None,
     ):
         """ Train the model using the given iterator """
@@ -147,4 +148,8 @@ class CharBPETokenizer(BaseTokenizer):
             end_of_word_suffix=suffix,
             show_progress=show_progress,
         )
-        self._tokenizer.train_from_iterator(iterator, trainer=trainer)
+        self._tokenizer.train_from_iterator(
+            iterator,
+            trainer=trainer,
+            length=length,
+        )

View File

@@ -88,6 +88,7 @@ class SentencePieceBPETokenizer(BaseTokenizer):
         limit_alphabet: int = 1000,
         initial_alphabet: List[str] = [],
         show_progress: bool = True,
+        length: Optional[int] = None,
     ):
         """ Train the model using the given iterator """
@@ -99,4 +100,8 @@ class SentencePieceBPETokenizer(BaseTokenizer):
             initial_alphabet=initial_alphabet,
             show_progress=show_progress,
         )
-        self._tokenizer.train_from_iterator(iterator, trainer=trainer)
+        self._tokenizer.train_from_iterator(
+            iterator,
+            trainer=trainer,
+            length=length,
+        )

View File

@@ -85,6 +85,7 @@ class SentencePieceUnigramTokenizer(BaseTokenizer):
         show_progress: bool = True,
         special_tokens: List[Union[str, AddedToken]] = [],
         unk_token: Optional[str] = None,
+        length: Optional[int] = None,
     ):
         """
         Train the model using the given iterator
@@ -109,7 +110,11 @@ class SentencePieceUnigramTokenizer(BaseTokenizer):
             unk_token=unk_token,
         )

-        self._tokenizer.train_from_iterator(iterator, trainer=trainer)
+        self._tokenizer.train_from_iterator(
+            iterator,
+            trainer=trainer,
+            length=length,
+        )

     @staticmethod
     def from_spm(filename: str):
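As the diffs above show, every implementation simply forwards length to the underlying Tokenizer.train_from_iterator call, where it is used for progress tracking. A hedged sketch of supplying it for a corpus streamed from disk, assuming a hypothetical corpus.txt with one sequence per line and a two-pass count:

from tokenizers import SentencePieceBPETokenizer

path = "corpus.txt"  # hypothetical corpus file, one sequence per line

with open(path, "r", encoding="utf-8") as f:
    n_lines = sum(1 for _ in f)  # first pass: count the sequences

def stream_lines():
    # Second pass: stream the sequences lazily instead of loading them all.
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            yield line.rstrip("\n")

tokenizer = SentencePieceBPETokenizer()
tokenizer.train_from_iterator(stream_lines(), vocab_size=8000, length=n_lines)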