Python - Add train_from_iterator to implementations

commit d94fa220b6 (parent 817c5ad317)
Author: Anthony MOI
Committed by: Anthony MOI
Date: 2021-01-06 17:07:56 -05:00
9 changed files with 166 additions and 7 deletions

View File

@@ -5,7 +5,7 @@ from tokenizers.pre_tokenizers import BertPreTokenizer
from tokenizers.processors import BertProcessing
from .base_tokenizer import BaseTokenizer
-from typing import Optional, List, Union, Dict
+from typing import Optional, List, Union, Dict, Iterator
class BertWordPieceTokenizer(BaseTokenizer):
@@ -116,3 +116,33 @@ class BertWordPieceTokenizer(BaseTokenizer):
if isinstance(files, str):
files = [files]
self._tokenizer.train(files, trainer=trainer)
+    def train_from_iterator(
+        self,
+        iterator: Union[Iterator[str], Iterator[Iterator[str]]],
+        vocab_size: int = 30000,
+        min_frequency: int = 2,
+        limit_alphabet: int = 1000,
+        initial_alphabet: List[str] = [],
+        special_tokens: List[Union[str, AddedToken]] = [
+            "[PAD]",
+            "[UNK]",
+            "[CLS]",
+            "[SEP]",
+            "[MASK]",
+        ],
+        show_progress: bool = True,
+        wordpieces_prefix: str = "##",
+    ):
+        """ Train the model using the given iterator """
+        trainer = trainers.WordPieceTrainer(
+            vocab_size=vocab_size,
+            min_frequency=min_frequency,
+            limit_alphabet=limit_alphabet,
+            initial_alphabet=initial_alphabet,
+            special_tokens=special_tokens,
+            show_progress=show_progress,
+            continuing_subword_prefix=wordpieces_prefix,
+        )
+        self._tokenizer.train_from_iterator(iterator, trainer=trainer)
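Usage note (not part of the diff): a minimal sketch of calling the new method on a freshly created tokenizer; the tiny in-memory corpus is a made-up stand-in for any iterator of str:

    from tokenizers import BertWordPieceTokenizer

    tokenizer = BertWordPieceTokenizer()  # fresh, untrained WordPiece model
    corpus = ["first example sentence", "second example sentence"]  # placeholder data
    tokenizer.train_from_iterator(corpus, vocab_size=30000, min_frequency=1)
    print(tokenizer.encode("first example").tokens)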

View File

@@ -10,7 +10,7 @@ from tokenizers.models import BPE
from tokenizers.normalizers import unicode_normalizer_from_str, Lowercase, Sequence
from .base_tokenizer import BaseTokenizer
-from typing import Optional, List, Union, Dict, Tuple
+from typing import Optional, List, Union, Dict, Tuple, Iterator
class ByteLevelBPETokenizer(BaseTokenizer):
@@ -102,3 +102,22 @@ class ByteLevelBPETokenizer(BaseTokenizer):
if isinstance(files, str):
files = [files]
self._tokenizer.train(files, trainer=trainer)
+    def train_from_iterator(
+        self,
+        iterator: Union[Iterator[str], Iterator[Iterator[str]]],
+        vocab_size: int = 30000,
+        min_frequency: int = 2,
+        show_progress: bool = True,
+        special_tokens: List[Union[str, AddedToken]] = [],
+    ):
+        """ Train the model using the given iterator """
+        trainer = trainers.BpeTrainer(
+            vocab_size=vocab_size,
+            min_frequency=min_frequency,
+            show_progress=show_progress,
+            special_tokens=special_tokens,
+            initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
+        )
+        self._tokenizer.train_from_iterator(iterator, trainer=trainer)
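Usage note (not part of the diff): because the method accepts any iterator, a generator can stream text without materializing the whole corpus in memory; a minimal sketch with made-up data:

    from tokenizers import ByteLevelBPETokenizer

    def line_iterator():
        # hypothetical generator; yield lines from any source here
        yield "hello world"
        yield "hello tokenizers"

    tokenizer = ByteLevelBPETokenizer()
    tokenizer.train_from_iterator(line_iterator(), vocab_size=500, min_frequency=1)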

View File

@@ -8,7 +8,7 @@ from ..normalizers import (
)
from .base_tokenizer import BaseTokenizer
-from typing import Optional, List, Union, Dict, Tuple
+from typing import Optional, List, Union, Dict, Tuple, Iterator
class CharBPETokenizer(BaseTokenizer):
@@ -124,3 +124,27 @@ class CharBPETokenizer(BaseTokenizer):
if isinstance(files, str):
files = [files]
self._tokenizer.train(files, trainer=trainer)
+    def train_from_iterator(
+        self,
+        iterator: Union[Iterator[str], Iterator[Iterator[str]]],
+        vocab_size: int = 30000,
+        min_frequency: int = 2,
+        special_tokens: List[Union[str, AddedToken]] = ["<unk>"],
+        limit_alphabet: int = 1000,
+        initial_alphabet: List[str] = [],
+        suffix: Optional[str] = "</w>",
+        show_progress: bool = True,
+    ):
+        """ Train the model using the given iterator """
+        trainer = trainers.BpeTrainer(
+            vocab_size=vocab_size,
+            min_frequency=min_frequency,
+            special_tokens=special_tokens,
+            limit_alphabet=limit_alphabet,
+            initial_alphabet=initial_alphabet,
+            end_of_word_suffix=suffix,
+            show_progress=show_progress,
+        )
+        self._tokenizer.train_from_iterator(iterator, trainer=trainer)
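Usage note (not part of the diff): the Iterator[Iterator[str]] half of the signature means batches of strings are also accepted, e.g. lists of lines; a minimal sketch with placeholder batches:

    from tokenizers import CharBPETokenizer

    batches = [["one sample", "two samples"], ["three samples"]]  # iterator of lists of str
    tokenizer = CharBPETokenizer()
    tokenizer.train_from_iterator(batches, vocab_size=200, min_frequency=1)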

View File

@@ -3,7 +3,7 @@ from tokenizers.models import BPE
from tokenizers.normalizers import NFKC
from .base_tokenizer import BaseTokenizer
-from typing import Optional, List, Union, Dict, Tuple
+from typing import Optional, List, Union, Dict, Tuple, Iterator
class SentencePieceBPETokenizer(BaseTokenizer):
@@ -75,3 +75,25 @@ class SentencePieceBPETokenizer(BaseTokenizer):
if isinstance(files, str):
files = [files]
self._tokenizer.train(files, trainer=trainer)
+    def train_from_iterator(
+        self,
+        iterator: Union[Iterator[str], Iterator[Iterator[str]]],
+        vocab_size: int = 30000,
+        min_frequency: int = 2,
+        special_tokens: List[Union[str, AddedToken]] = ["<unk>"],
+        limit_alphabet: int = 1000,
+        initial_alphabet: List[str] = [],
+        show_progress: bool = True,
+    ):
+        """ Train the model using the given iterator """
+        trainer = trainers.BpeTrainer(
+            vocab_size=vocab_size,
+            min_frequency=min_frequency,
+            special_tokens=special_tokens,
+            limit_alphabet=limit_alphabet,
+            initial_alphabet=initial_alphabet,
+            show_progress=show_progress,
+        )
+        self._tokenizer.train_from_iterator(iterator, trainer=trainer)
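Usage note (not part of the diff): a text-mode file handle is itself an iterator of lines, so training can stream straight from disk; "corpus.txt" below is a placeholder path:

    from tokenizers import SentencePieceBPETokenizer

    tokenizer = SentencePieceBPETokenizer()
    with open("corpus.txt", "r", encoding="utf-8") as f:  # placeholder corpus file
        tokenizer.train_from_iterator(f, vocab_size=30000)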

View File

@@ -11,7 +11,7 @@ from tokenizers.models import Unigram
import json
from .base_tokenizer import BaseTokenizer
-from typing import Optional, List, Union
+from typing import Optional, List, Union, Iterator
class SentencePieceUnigramTokenizer(BaseTokenizer):
@@ -77,6 +77,23 @@ class SentencePieceUnigramTokenizer(BaseTokenizer):
files = [files]
self._tokenizer.train(files, trainer=trainer)
+    def train_from_iterator(
+        self,
+        iterator: Union[Iterator[str], Iterator[Iterator[str]]],
+        vocab_size: int = 8000,
+        show_progress: bool = True,
+        special_tokens: List[Union[str, AddedToken]] = [],
+    ):
+        """ Train the model using the given iterator """
+        trainer = trainers.UnigramTrainer(
+            vocab_size=vocab_size,
+            special_tokens=special_tokens,
+            show_progress=show_progress,
+        )
+        self._tokenizer.train_from_iterator(iterator, trainer=trainer)
@staticmethod
def from_spm(filename: str):
try:
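Usage note (not part of the diff): the Unigram variant follows the same pattern, with the smaller 8000 default vocab_size seen above; a minimal sketch with placeholder data and a deliberately small target vocabulary:

    from tokenizers import SentencePieceUnigramTokenizer

    samples = ["this is one sample", "this is another longer sample"]  # placeholder data
    tokenizer = SentencePieceUnigramTokenizer()
    tokenizer.train_from_iterator(samples, vocab_size=100, special_tokens=["<unk>"])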