Python - Add length to train_from_iterator in implementations (#937)

dctelus
2022-03-04 08:11:58 -05:00
committed by GitHub
parent 845da6d8e8
commit 4a8f5db067
5 changed files with 30 additions and 5 deletions

@@ -133,6 +133,7 @@ class BertWordPieceTokenizer(BaseTokenizer):
         ],
         show_progress: bool = True,
         wordpieces_prefix: str = "##",
+        length: Optional[int] = None,
     ):
         """ Train the model using the given iterator """

@@ -145,4 +146,8 @@ class BertWordPieceTokenizer(BaseTokenizer):
             show_progress=show_progress,
             continuing_subword_prefix=wordpieces_prefix,
         )
-        self._tokenizer.train_from_iterator(iterator, trainer=trainer)
+        self._tokenizer.train_from_iterator(
+            iterator,
+            trainer=trainer,
+            length=length,
+        )

@@ -110,6 +110,7 @@ class ByteLevelBPETokenizer(BaseTokenizer):
         min_frequency: int = 2,
         show_progress: bool = True,
         special_tokens: List[Union[str, AddedToken]] = [],
+        length: Optional[int] = None,
     ):
         """ Train the model using the given iterator """

@@ -120,4 +121,8 @@ class ByteLevelBPETokenizer(BaseTokenizer):
             special_tokens=special_tokens,
             initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
         )
-        self._tokenizer.train_from_iterator(iterator, trainer=trainer)
+        self._tokenizer.train_from_iterator(
+            iterator,
+            trainer=trainer,
+            length=length,
+        )

@@ -135,6 +135,7 @@ class CharBPETokenizer(BaseTokenizer):
         initial_alphabet: List[str] = [],
         suffix: Optional[str] = "</w>",
         show_progress: bool = True,
+        length: Optional[int] = None,
     ):
         """ Train the model using the given iterator """

@@ -147,4 +148,8 @@ class CharBPETokenizer(BaseTokenizer):
             end_of_word_suffix=suffix,
             show_progress=show_progress,
         )
-        self._tokenizer.train_from_iterator(iterator, trainer=trainer)
+        self._tokenizer.train_from_iterator(
+            iterator,
+            trainer=trainer,
+            length=length,
+        )

@@ -88,6 +88,7 @@ class SentencePieceBPETokenizer(BaseTokenizer):
         limit_alphabet: int = 1000,
         initial_alphabet: List[str] = [],
         show_progress: bool = True,
+        length: Optional[int] = None,
     ):
         """ Train the model using the given iterator """

@@ -99,4 +100,8 @@ class SentencePieceBPETokenizer(BaseTokenizer):
             initial_alphabet=initial_alphabet,
             show_progress=show_progress,
         )
-        self._tokenizer.train_from_iterator(iterator, trainer=trainer)
+        self._tokenizer.train_from_iterator(
+            iterator,
+            trainer=trainer,
+            length=length,
+        )

@@ -85,6 +85,7 @@ class SentencePieceUnigramTokenizer(BaseTokenizer):
         show_progress: bool = True,
         special_tokens: List[Union[str, AddedToken]] = [],
         unk_token: Optional[str] = None,
+        length: Optional[int] = None,
     ):
         """
         Train the model using the given iterator

@@ -109,7 +110,11 @@ class SentencePieceUnigramTokenizer(BaseTokenizer):
             unk_token=unk_token,
         )

-        self._tokenizer.train_from_iterator(iterator, trainer=trainer)
+        self._tokenizer.train_from_iterator(
+            iterator,
+            trainer=trainer,
+            length=length,
+        )

     @staticmethod
     def from_spm(filename: str):
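
The new `length` keyword mirrors the one on the underlying `Tokenizer.train_from_iterator`, to which the diff forwards it: it declares the total number of sequences the iterator will yield, so the trainer can report meaningful progress when given a generator that exposes no len(). A minimal usage sketch (the corpus contents here are hypothetical):

    from tokenizers import BertWordPieceTokenizer

    corpus = ["hello world", "word pieces", "byte pair encoding"]  # hypothetical data

    tokenizer = BertWordPieceTokenizer()
    tokenizer.train_from_iterator(
        (line for line in corpus),  # a generator has no len()
        vocab_size=30000,
        length=len(corpus),  # total sequence count, used for progress reporting
    )

The other four implementations touched by this commit accept the same keyword.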