Mirror of https://github.com/mii443/tokenizers.git (synced 2025-12-08 05:38:23 +00:00)
Python - Add train_from_iterator to implementations
@@ -5,7 +5,7 @@ from tokenizers.pre_tokenizers import BertPreTokenizer
 from tokenizers.processors import BertProcessing
 from .base_tokenizer import BaseTokenizer
 
-from typing import Optional, List, Union, Dict
+from typing import Optional, List, Union, Dict, Iterator
 
 
 class BertWordPieceTokenizer(BaseTokenizer):
@@ -116,3 +116,33 @@ class BertWordPieceTokenizer(BaseTokenizer):
         if isinstance(files, str):
             files = [files]
         self._tokenizer.train(files, trainer=trainer)
+
+    def train_from_iterator(
+        self,
+        iterator: Union[Iterator[str], Iterator[Iterator[str]]],
+        vocab_size: int = 30000,
+        min_frequency: int = 2,
+        limit_alphabet: int = 1000,
+        initial_alphabet: List[str] = [],
+        special_tokens: List[Union[str, AddedToken]] = [
+            "[PAD]",
+            "[UNK]",
+            "[CLS]",
+            "[SEP]",
+            "[MASK]",
+        ],
+        show_progress: bool = True,
+        wordpieces_prefix: str = "##",
+    ):
+        """ Train the model using the given iterator """
+
+        trainer = trainers.WordPieceTrainer(
+            vocab_size=vocab_size,
+            min_frequency=min_frequency,
+            limit_alphabet=limit_alphabet,
+            initial_alphabet=initial_alphabet,
+            special_tokens=special_tokens,
+            show_progress=show_progress,
+            continuing_subword_prefix=wordpieces_prefix,
+        )
+        self._tokenizer.train_from_iterator(iterator, trainer=trainer)
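For context, a rough usage sketch of the method added above (the toy corpus and parameter values below are illustrative, not part of this commit):

    from tokenizers import BertWordPieceTokenizer

    # Any iterator of str (or of batches of str) can replace files on disk.
    corpus = iter([
        "the quick brown fox jumps over the lazy dog",
        "the lazy dog sleeps all day",
    ])

    tokenizer = BertWordPieceTokenizer()
    tokenizer.train_from_iterator(corpus, vocab_size=100, min_frequency=1)
    print(tokenizer.get_vocab_size())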
@@ -10,7 +10,7 @@ from tokenizers.models import BPE
 from tokenizers.normalizers import unicode_normalizer_from_str, Lowercase, Sequence
 from .base_tokenizer import BaseTokenizer
 
-from typing import Optional, List, Union, Dict, Tuple
+from typing import Optional, List, Union, Dict, Tuple, Iterator
 
 
 class ByteLevelBPETokenizer(BaseTokenizer):
@@ -102,3 +102,22 @@ class ByteLevelBPETokenizer(BaseTokenizer):
         if isinstance(files, str):
             files = [files]
         self._tokenizer.train(files, trainer=trainer)
+
+    def train_from_iterator(
+        self,
+        iterator: Union[Iterator[str], Iterator[Iterator[str]]],
+        vocab_size: int = 30000,
+        min_frequency: int = 2,
+        show_progress: bool = True,
+        special_tokens: List[Union[str, AddedToken]] = [],
+    ):
+        """ Train the model using the given iterator """
+
+        trainer = trainers.BpeTrainer(
+            vocab_size=vocab_size,
+            min_frequency=min_frequency,
+            show_progress=show_progress,
+            special_tokens=special_tokens,
+            initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
+        )
+        self._tokenizer.train_from_iterator(iterator, trainer=trainer)
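The iterator may also yield batches of strings (the Iterator[Iterator[str]] case), which keeps memory usage flat for large corpora. A minimal sketch, assuming a hypothetical corpus.txt on disk:

    from tokenizers import ByteLevelBPETokenizer

    def line_batches(path, batch_size=1000):
        # Lazily yield lists of lines so the whole file never has to sit in memory.
        with open(path, encoding="utf-8") as f:
            batch = []
            for line in f:
                batch.append(line.rstrip("\n"))
                if len(batch) == batch_size:
                    yield batch
                    batch = []
            if batch:
                yield batch

    tokenizer = ByteLevelBPETokenizer()
    tokenizer.train_from_iterator(line_batches("corpus.txt"), vocab_size=30000)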
@@ -8,7 +8,7 @@ from ..normalizers import (
 )
 from .base_tokenizer import BaseTokenizer
 
-from typing import Optional, List, Union, Dict, Tuple
+from typing import Optional, List, Union, Dict, Tuple, Iterator
 
 
 class CharBPETokenizer(BaseTokenizer):
@@ -124,3 +124,27 @@ class CharBPETokenizer(BaseTokenizer):
         if isinstance(files, str):
             files = [files]
         self._tokenizer.train(files, trainer=trainer)
+
+    def train_from_iterator(
+        self,
+        iterator: Union[Iterator[str], Iterator[Iterator[str]]],
+        vocab_size: int = 30000,
+        min_frequency: int = 2,
+        special_tokens: List[Union[str, AddedToken]] = ["<unk>"],
+        limit_alphabet: int = 1000,
+        initial_alphabet: List[str] = [],
+        suffix: Optional[str] = "</w>",
+        show_progress: bool = True,
+    ):
+        """ Train the model using the given iterator """
+
+        trainer = trainers.BpeTrainer(
+            vocab_size=vocab_size,
+            min_frequency=min_frequency,
+            special_tokens=special_tokens,
+            limit_alphabet=limit_alphabet,
+            initial_alphabet=initial_alphabet,
+            end_of_word_suffix=suffix,
+            show_progress=show_progress,
+        )
+        self._tokenizer.train_from_iterator(iterator, trainer=trainer)
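Note how the wrapper arguments map onto trainer arguments: suffix above is forwarded to BpeTrainer as end_of_word_suffix, just as wordpieces_prefix becomes continuing_subword_prefix in the WordPiece case. A rough sketch with illustrative values:

    from tokenizers import CharBPETokenizer

    corpus = iter(["hello world", "hello there world"])

    tokenizer = CharBPETokenizer()
    # suffix is forwarded to BpeTrainer as end_of_word_suffix
    tokenizer.train_from_iterator(corpus, vocab_size=60, min_frequency=1, suffix="</w>")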
@@ -3,7 +3,7 @@ from tokenizers.models import BPE
 from tokenizers.normalizers import NFKC
 from .base_tokenizer import BaseTokenizer
 
-from typing import Optional, List, Union, Dict, Tuple
+from typing import Optional, List, Union, Dict, Tuple, Iterator
 
 
 class SentencePieceBPETokenizer(BaseTokenizer):
@@ -75,3 +75,25 @@ class SentencePieceBPETokenizer(BaseTokenizer):
         if isinstance(files, str):
             files = [files]
         self._tokenizer.train(files, trainer=trainer)
+
+    def train_from_iterator(
+        self,
+        iterator: Union[Iterator[str], Iterator[Iterator[str]]],
+        vocab_size: int = 30000,
+        min_frequency: int = 2,
+        special_tokens: List[Union[str, AddedToken]] = ["<unk>"],
+        limit_alphabet: int = 1000,
+        initial_alphabet: List[str] = [],
+        show_progress: bool = True,
+    ):
+        """ Train the model using the given iterator """
+
+        trainer = trainers.BpeTrainer(
+            vocab_size=vocab_size,
+            min_frequency=min_frequency,
+            special_tokens=special_tokens,
+            limit_alphabet=limit_alphabet,
+            initial_alphabet=initial_alphabet,
+            show_progress=show_progress,
+        )
+        self._tokenizer.train_from_iterator(iterator, trainer=trainer)
@@ -11,7 +11,7 @@ from tokenizers.models import Unigram
 import json
 from .base_tokenizer import BaseTokenizer
 
-from typing import Optional, List, Union
+from typing import Optional, List, Union, Iterator
 
 
 class SentencePieceUnigramTokenizer(BaseTokenizer):
@@ -77,6 +77,23 @@ class SentencePieceUnigramTokenizer(BaseTokenizer):
             files = [files]
         self._tokenizer.train(files, trainer=trainer)
 
+    def train_from_iterator(
+        self,
+        iterator: Union[Iterator[str], Iterator[Iterator[str]]],
+        vocab_size: int = 8000,
+        show_progress: bool = True,
+        special_tokens: List[Union[str, AddedToken]] = [],
+    ):
+        """ Train the model using the given iterator """
+
+        trainer = trainers.UnigramTrainer(
+            vocab_size=vocab_size,
+            special_tokens=special_tokens,
+            show_progress=show_progress,
+        )
+
+        self._tokenizer.train_from_iterator(iterator, trainer=trainer)
+
     @staticmethod
     def from_spm(filename: str):
         try: