mirror of https://github.com/mii443/tokenizers.git
synced 2025-08-22 16:25:30 +00:00
Python - Add length to train_from_iterator in implementations (#937)
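The change is the same across all five implementations: train_from_iterator gains an optional length argument that is forwarded to the underlying Tokenizer.train_from_iterator call. Below is a minimal usage sketch of the new parameter, assuming length is the total number of sequences the iterator will yield and is used for progress reporting; the corpus, vocab_size value, and variable names are illustrative and not part of the commit.

from tokenizers import BertWordPieceTokenizer

# Illustrative in-memory corpus; any iterable of str works.
texts = ["hello world", "how are you", "training from an iterator"]

tokenizer = BertWordPieceTokenizer()

# A generator does not expose len(), so the total is passed explicitly.
# Assumption: length only informs progress tracking, so omitting it
# still trains correctly (it defaults to None).
tokenizer.train_from_iterator(
    (text for text in texts),
    vocab_size=1000,
    length=len(texts),
)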
@@ -133,6 +133,7 @@ class BertWordPieceTokenizer(BaseTokenizer):
         ],
         show_progress: bool = True,
         wordpieces_prefix: str = "##",
+        length: Optional[int] = None,
     ):
         """ Train the model using the given iterator """
 
@@ -145,4 +146,8 @@ class BertWordPieceTokenizer(BaseTokenizer):
             show_progress=show_progress,
             continuing_subword_prefix=wordpieces_prefix,
         )
-        self._tokenizer.train_from_iterator(iterator, trainer=trainer)
+        self._tokenizer.train_from_iterator(
+            iterator,
+            trainer=trainer,
+            length=length,
+        )
@@ -110,6 +110,7 @@ class ByteLevelBPETokenizer(BaseTokenizer):
         min_frequency: int = 2,
         show_progress: bool = True,
         special_tokens: List[Union[str, AddedToken]] = [],
+        length: Optional[int] = None,
     ):
         """ Train the model using the given iterator """
 
@@ -120,4 +121,8 @@ class ByteLevelBPETokenizer(BaseTokenizer):
             special_tokens=special_tokens,
             initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
         )
-        self._tokenizer.train_from_iterator(iterator, trainer=trainer)
+        self._tokenizer.train_from_iterator(
+            iterator,
+            trainer=trainer,
+            length=length,
+        )
@@ -135,6 +135,7 @@ class CharBPETokenizer(BaseTokenizer):
         initial_alphabet: List[str] = [],
         suffix: Optional[str] = "</w>",
         show_progress: bool = True,
+        length: Optional[int] = None,
     ):
         """ Train the model using the given iterator """
 
@@ -147,4 +148,8 @@ class CharBPETokenizer(BaseTokenizer):
             end_of_word_suffix=suffix,
             show_progress=show_progress,
         )
-        self._tokenizer.train_from_iterator(iterator, trainer=trainer)
+        self._tokenizer.train_from_iterator(
+            iterator,
+            trainer=trainer,
+            length=length,
+        )
@@ -88,6 +88,7 @@ class SentencePieceBPETokenizer(BaseTokenizer):
         limit_alphabet: int = 1000,
         initial_alphabet: List[str] = [],
         show_progress: bool = True,
+        length: Optional[int] = None,
     ):
         """ Train the model using the given iterator """
 
@@ -99,4 +100,8 @@ class SentencePieceBPETokenizer(BaseTokenizer):
             initial_alphabet=initial_alphabet,
             show_progress=show_progress,
         )
-        self._tokenizer.train_from_iterator(iterator, trainer=trainer)
+        self._tokenizer.train_from_iterator(
+            iterator,
+            trainer=trainer,
+            length=length,
+        )
@@ -85,6 +85,7 @@ class SentencePieceUnigramTokenizer(BaseTokenizer):
         show_progress: bool = True,
         special_tokens: List[Union[str, AddedToken]] = [],
         unk_token: Optional[str] = None,
+        length: Optional[int] = None,
     ):
         """
         Train the model using the given iterator
@@ -109,7 +110,11 @@ class SentencePieceUnigramTokenizer(BaseTokenizer):
             unk_token=unk_token,
         )
 
-        self._tokenizer.train_from_iterator(iterator, trainer=trainer)
+        self._tokenizer.train_from_iterator(
+            iterator,
+            trainer=trainer,
+            length=length,
+        )
 
     @staticmethod
     def from_spm(filename: str):
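For large corpora the iterator is typically a stream over a file; in that case the caller can count the sequences once and hand the count to length. A sketch under the same assumption (length only feeds progress reporting and is safe to omit); the corpus path, vocab_size value, and the helper name line_iterator are made up for the example.

from tokenizers import SentencePieceUnigramTokenizer

def line_iterator(path):
    # Stream one training sequence per line instead of loading the file into memory.
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            yield line.rstrip("\n")

corpus = "corpus.txt"  # hypothetical path

# Count the sequences once up front; this is the value passed as length.
with open(corpus, "r", encoding="utf-8") as f:
    n_lines = sum(1 for _ in f)

tokenizer = SentencePieceUnigramTokenizer()
tokenizer.train_from_iterator(
    line_iterator(corpus),
    vocab_size=8000,
    length=n_lines,  # optional: leaving it out keeps the previous behaviour
)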