Python - Add length to train_from_iterator in implementations (#937)
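This change threads a new `length` argument through the high-level implementation wrappers (BertWordPieceTokenizer, ByteLevelBPETokenizer, CharBPETokenizer, SentencePieceBPETokenizer, SentencePieceUnigramTokenizer) down to the underlying `Tokenizer.train_from_iterator`. Since a plain iterator has no `len()`, a caller can pass `length` so progress reporting has a meaningful total. A minimal usage sketch (the corpus, vocab size, and variable names are illustrative, not part of this commit):

from tokenizers import BertWordPieceTokenizer

# Illustrative in-memory corpus; any iterator of strings works.
lines = ["first sentence", "second sentence", "third sentence"]

tokenizer = BertWordPieceTokenizer()
tokenizer.train_from_iterator(
    (line for line in lines),  # a generator has no len()
    vocab_size=100,
    length=len(lines),         # new argument: gives the progress bar a total
)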
@@ -133,6 +133,7 @@ class BertWordPieceTokenizer(BaseTokenizer):
         ],
         show_progress: bool = True,
         wordpieces_prefix: str = "##",
+        length: Optional[int] = None,
     ):
         """ Train the model using the given iterator """

@@ -145,4 +146,8 @@ class BertWordPieceTokenizer(BaseTokenizer):
             show_progress=show_progress,
             continuing_subword_prefix=wordpieces_prefix,
         )
-        self._tokenizer.train_from_iterator(iterator, trainer=trainer)
+        self._tokenizer.train_from_iterator(
+            iterator,
+            trainer=trainer,
+            length=length,
+        )

@@ -110,6 +110,7 @@ class ByteLevelBPETokenizer(BaseTokenizer):
         min_frequency: int = 2,
         show_progress: bool = True,
         special_tokens: List[Union[str, AddedToken]] = [],
+        length: Optional[int] = None,
     ):
         """ Train the model using the given iterator """

@@ -120,4 +121,8 @@ class ByteLevelBPETokenizer(BaseTokenizer):
             special_tokens=special_tokens,
             initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
         )
-        self._tokenizer.train_from_iterator(iterator, trainer=trainer)
+        self._tokenizer.train_from_iterator(
+            iterator,
+            trainer=trainer,
+            length=length,
+        )

@@ -135,6 +135,7 @@ class CharBPETokenizer(BaseTokenizer):
         initial_alphabet: List[str] = [],
         suffix: Optional[str] = "</w>",
         show_progress: bool = True,
+        length: Optional[int] = None,
     ):
         """ Train the model using the given iterator """

@@ -147,4 +148,8 @@ class CharBPETokenizer(BaseTokenizer):
             end_of_word_suffix=suffix,
             show_progress=show_progress,
         )
-        self._tokenizer.train_from_iterator(iterator, trainer=trainer)
+        self._tokenizer.train_from_iterator(
+            iterator,
+            trainer=trainer,
+            length=length,
+        )

@@ -88,6 +88,7 @@ class SentencePieceBPETokenizer(BaseTokenizer):
         limit_alphabet: int = 1000,
         initial_alphabet: List[str] = [],
         show_progress: bool = True,
+        length: Optional[int] = None,
     ):
         """ Train the model using the given iterator """

@@ -99,4 +100,8 @@ class SentencePieceBPETokenizer(BaseTokenizer):
             initial_alphabet=initial_alphabet,
             show_progress=show_progress,
         )
-        self._tokenizer.train_from_iterator(iterator, trainer=trainer)
+        self._tokenizer.train_from_iterator(
+            iterator,
+            trainer=trainer,
+            length=length,
+        )

@@ -85,6 +85,7 @@ class SentencePieceUnigramTokenizer(BaseTokenizer):
         show_progress: bool = True,
         special_tokens: List[Union[str, AddedToken]] = [],
         unk_token: Optional[str] = None,
+        length: Optional[int] = None,
     ):
         """
         Train the model using the given iterator

@@ -109,7 +110,11 @@ class SentencePieceUnigramTokenizer(BaseTokenizer):
             unk_token=unk_token,
         )

-        self._tokenizer.train_from_iterator(iterator, trainer=trainer)
+        self._tokenizer.train_from_iterator(
+            iterator,
+            trainer=trainer,
+            length=length,
+        )

     @staticmethod
     def from_spm(filename: str):
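Each wrapper above forwards `length` unchanged to the underlying `Tokenizer.train_from_iterator`. When the corpus lives in a file rather than in memory, the caller can still supply a total by counting lines in a cheap pre-pass; a hedged sketch (the file name and counting strategy are illustrative, not from this commit):

from tokenizers import ByteLevelBPETokenizer

path = "corpus.txt"  # hypothetical file with one training sequence per line

with open(path, encoding="utf-8") as f:
    n_lines = sum(1 for _ in f)  # pre-pass so the progress bar knows the total

def line_iterator():
    with open(path, encoding="utf-8") as f:
        for line in f:
            yield line.rstrip("\n")

tokenizer = ByteLevelBPETokenizer()
tokenizer.train_from_iterator(line_iterator(), vocab_size=5000, length=n_lines)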