Python - Handle training on custom classes

Anthony MOI
2020-01-08 10:33:59 -05:00
parent fc56f8d186
commit bc48a89770
4 changed files with 85 additions and 8 deletions


@@ -1,11 +1,11 @@
-from tokenizers import Tokenizer, decoders
+from tokenizers import Tokenizer, decoders, trainers
 from tokenizers.models import WordPiece
 from tokenizers.normalizers import BertNormalizer
 from tokenizers.pre_tokenizers import BertPreTokenizer
 from tokenizers.processors import BertProcessing
 from .base_tokenizer import BaseTokenizer

-from typing import Optional
+from typing import Optional, List


 class BertWordPieceTokenizer(BaseTokenizer):
     """ Bert WordPiece Tokenizer """
@@ -61,3 +61,24 @@ class BertWordPieceTokenizer(BaseTokenizer):
         }
         super().__init__(tokenizer, parameters)
+
+    def train(self, files: List[str],
+              vocab_size: int=30000,
+              min_frequency: int=2,
+              limit_alphabet: int=1000,
+              initial_alphabet: List[str]=[],
+              special_tokens: List[str]=["[UNK]", "[SEP]", "[CLS]"],
+              show_progress: bool=True,
+              wordpieces_prefix: str="##"):
+        """ Train the model using the given files """
+
+        trainer = trainers.WordPieceTrainer.new(
+            vocab_size=vocab_size,
+            min_frequency=min_frequency,
+            limit_alphabet=limit_alphabet,
+            initial_alphabet=initial_alphabet,
+            special_tokens=special_tokens,
+            show_progress=show_progress,
+            continuing_subword_prefix=wordpieces_prefix
+        )
+        self._tokenizer.train(trainer, files)
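
For reference, a minimal usage sketch of the method added above. The top-level import path, the no-argument constructor, and the corpus file name are illustrative assumptions, not part of this commit:

# Hypothetical usage of the new BertWordPieceTokenizer.train()
# "corpus.txt" is a placeholder training file.
from tokenizers import BertWordPieceTokenizer

tokenizer = BertWordPieceTokenizer()  # assumes an empty model can be built without a vocab file
tokenizer.train(["corpus.txt"], vocab_size=30000, min_frequency=2)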


@@ -1,9 +1,9 @@
-from tokenizers import Tokenizer, pre_tokenizers, decoders
+from tokenizers import Tokenizer, pre_tokenizers, decoders, trainers
 from tokenizers.models import BPE
 from tokenizers.normalizers import NFKC, Sequence, Lowercase
 from .base_tokenizer import BaseTokenizer

-from typing import Optional
+from typing import Optional, List


 class BPETokenizer(BaseTokenizer):
     """ Original BPE Tokenizer
@@ -41,3 +41,24 @@ class BPETokenizer(BaseTokenizer):
         }
         super().__init__(tokenizer, parameters)
+
+    def train(self, files: List[str],
+              vocab_size: int=30000,
+              min_frequency: int=2,
+              special_tokens: List[str]=["<unk>"],
+              limit_alphabet: int=1000,
+              initial_alphabet: List[str]=[],
+              suffix: Optional[str]="</w>",
+              show_progress: bool=True):
+        """ Train the model using the given files """
+
+        trainer = trainers.BpeTrainer.new(
+            vocab_size=vocab_size,
+            min_frequency=min_frequency,
+            special_tokens=special_tokens,
+            limit_alphabet=limit_alphabet,
+            initial_alphabet=initial_alphabet,
+            end_of_word_suffix=suffix,
+            show_progress=show_progress
+        )
+        self._tokenizer.train(trainer, files)
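
A similar hedged sketch for this class; the import path, file name, and parameter values are placeholders:

# Hypothetical usage of the new BPETokenizer.train()
from tokenizers import BPETokenizer

tokenizer = BPETokenizer()
# suffix is forwarded to the trainer as end_of_word_suffix (see the diff above)
tokenizer.train(["corpus.txt"], vocab_size=20000, special_tokens=["<unk>"], suffix="</w>")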


@@ -1,9 +1,9 @@
-from tokenizers import Tokenizer, pre_tokenizers, decoders
+from tokenizers import Tokenizer, pre_tokenizers, decoders, trainers
 from tokenizers.models import BPE
 from tokenizers.normalizers import NFKC
 from .base_tokenizer import BaseTokenizer

-from typing import Optional
+from typing import Optional, List


 class ByteLevelBPETokenizer(BaseTokenizer):
     """ ByteLevelBPETokenizer
@@ -30,3 +30,19 @@ class ByteLevelBPETokenizer(BaseTokenizer):
         }
         super().__init__(tokenizer, parameters)
+
+    def train(self, files: List[str],
+              vocab_size: int=30000,
+              min_frequency: int=2,
+              show_progress: bool=True,
+              special_tokens: List[str]=[]):
+        """ Train the model using the given files """
+
+        trainer = trainers.BpeTrainer.new(
+            vocab_size=vocab_size,
+            min_frequency=min_frequency,
+            show_progress=show_progress,
+            special_tokens=special_tokens,
+            initial_alphabet=pre_tokenizers.ByteLevel.alphabet()
+        )
+        self._tokenizer.train(trainer, files)
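
Note the design choice visible above: since the byte-level pre-tokenizer can represent any input as bytes, the trainer is always seeded with pre_tokenizers.ByteLevel.alphabet() as its initial alphabet, and no limit_alphabet or initial_alphabet knobs are exposed. A hedged usage sketch (import path and file name are assumptions):

# Hypothetical usage of the new ByteLevelBPETokenizer.train()
from tokenizers import ByteLevelBPETokenizer

tokenizer = ByteLevelBPETokenizer()
tokenizer.train(["corpus.txt"], vocab_size=50000, special_tokens=["<s>", "</s>"])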


@@ -1,9 +1,9 @@
-from tokenizers import Tokenizer, pre_tokenizers, decoders
+from tokenizers import Tokenizer, pre_tokenizers, decoders, trainers
 from tokenizers.models import BPE
 from tokenizers.normalizers import NFKC
 from .base_tokenizer import BaseTokenizer

-from typing import Optional
+from typing import Optional, List


 class SentencePieceBPETokenizer(BaseTokenizer):
     """ SentencePiece BPE Tokenizer
@@ -41,3 +41,22 @@ class SentencePieceBPETokenizer(BaseTokenizer):
         }
         super().__init__(tokenizer, parameters)
+
+    def train(self, files: List[str],
+              vocab_size: int=30000,
+              min_frequency: int=2,
+              special_tokens: List[str]=["<unk>"],
+              limit_alphabet: int=1000,
+              initial_alphabet: List[str]=[],
+              show_progress: bool=True):
+        """ Train the model using the given files """
+
+        trainer = trainers.BpeTrainer.new(
+            vocab_size=vocab_size,
+            min_frequency=min_frequency,
+            special_tokens=special_tokens,
+            limit_alphabet=limit_alphabet,
+            initial_alphabet=initial_alphabet,
+            show_progress=show_progress
+        )
+        self._tokenizer.train(trainer, files)
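
And a hedged sketch for the SentencePiece-style BPE variant, with placeholder names again:

# Hypothetical usage of the new SentencePieceBPETokenizer.train()
from tokenizers import SentencePieceBPETokenizer

tokenizer = SentencePieceBPETokenizer()
tokenizer.train(["corpus.txt"], vocab_size=30000, special_tokens=["<unk>"])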