mirror of https://github.com/mii443/tokenizers.git (synced 2025-08-22 16:25:30 +00:00)
Python - Handle training on custom classes
@@ -1,11 +1,11 @@
-from tokenizers import Tokenizer, decoders
+from tokenizers import Tokenizer, decoders, trainers
 from tokenizers.models import WordPiece
 from tokenizers.normalizers import BertNormalizer
 from tokenizers.pre_tokenizers import BertPreTokenizer
 from tokenizers.processors import BertProcessing
 from .base_tokenizer import BaseTokenizer
 
-from typing import Optional
+from typing import Optional, List
 
 class BertWordPieceTokenizer(BaseTokenizer):
     """ Bert WordPiece Tokenizer """
@@ -61,3 +61,24 @@ class BertWordPieceTokenizer(BaseTokenizer):
         }
 
         super().__init__(tokenizer, parameters)
+
+    def train(self, files: List[str],
+              vocab_size: int=30000,
+              min_frequency: int=2,
+              limit_alphabet: int=1000,
+              initial_alphabet: List[str]=[],
+              special_tokens: List[str]=["[UNK]", "[SEP]", "[CLS]"],
+              show_progress: bool=True,
+              wordpieces_prefix: str="##"):
+        """ Train the model using the given files """
+
+        trainer = trainers.WordPieceTrainer.new(
+            vocab_size=vocab_size,
+            min_frequency=min_frequency,
+            limit_alphabet=limit_alphabet,
+            initial_alphabet=initial_alphabet,
+            special_tokens=special_tokens,
+            show_progress=show_progress,
+            continuing_subword_prefix=wordpieces_prefix
+        )
+        self._tokenizer.train(trainer, files)
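For context, a minimal usage sketch of the BertWordPieceTokenizer.train method added above. This is not part of the commit: the no-argument constructor, the top-level import, and the corpus path are assumptions for illustration.

# Sketch only: train a WordPiece vocabulary from scratch with the method above.
# "corpus.txt" is a placeholder path; keyword arguments mirror the defaults in the diff.
from tokenizers import BertWordPieceTokenizer

tokenizer = BertWordPieceTokenizer()  # assumes construction without an existing vocab file
tokenizer.train(
    ["corpus.txt"],
    vocab_size=30000,
    min_frequency=2,
    special_tokens=["[UNK]", "[SEP]", "[CLS]"],
    wordpieces_prefix="##",
)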
@@ -1,9 +1,9 @@
-from tokenizers import Tokenizer, pre_tokenizers, decoders
+from tokenizers import Tokenizer, pre_tokenizers, decoders, trainers
 from tokenizers.models import BPE
 from tokenizers.normalizers import NFKC, Sequence, Lowercase
 from .base_tokenizer import BaseTokenizer
 
-from typing import Optional
+from typing import Optional, List
 
 class BPETokenizer(BaseTokenizer):
     """ Original BPE Tokenizer
@@ -41,3 +41,22 @@ class BPETokenizer(BaseTokenizer):
         }
 
         super().__init__(tokenizer, parameters)
+
+    def train(self, files: List[str],
+              vocab_size: int=30000,
+              min_frequency: int=2,
+              special_tokens: List[str]=["<unk>"],
+              limit_alphabet: int=1000,
+              initial_alphabet: List[str]=[],
+              suffix: Optional[str]="</w>",
+              show_progress: bool=True):
+        """ Train the model using the given files """
+
+        trainer = trainers.BpeTrainer.new(
+            vocab_size=vocab_size,
+            min_frequency=min_frequency,
+            special_tokens=special_tokens,
+            limit_alphabet=limit_alphabet,
+            initial_alphabet=initial_alphabet,
+            end_of_word_suffix=suffix,
+            show_progress=show_progress
+        )
+        self._tokenizer.train(trainer, files)
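Similarly, a hedged sketch of calling the new BPETokenizer.train method; the import path, constructor call, and file path are illustrative assumptions, not part of the commit.

# Sketch only: train an original-style BPE model, keeping the "</w>" end-of-word suffix.
from tokenizers import BPETokenizer

tokenizer = BPETokenizer()  # assumes construction without existing vocab/merges files
tokenizer.train(
    ["corpus.txt"],  # placeholder training file
    vocab_size=30000,
    min_frequency=2,
    special_tokens=["<unk>"],
    suffix="</w>",
)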
@@ -1,9 +1,9 @@
-from tokenizers import Tokenizer, pre_tokenizers, decoders
+from tokenizers import Tokenizer, pre_tokenizers, decoders, trainers
 from tokenizers.models import BPE
 from tokenizers.normalizers import NFKC
 from .base_tokenizer import BaseTokenizer
 
-from typing import Optional
+from typing import Optional, List
 
 class ByteLevelBPETokenizer(BaseTokenizer):
     """ ByteLevelBPETokenizer
@@ -30,3 +30,19 @@ class ByteLevelBPETokenizer(BaseTokenizer):
         }
 
         super().__init__(tokenizer, parameters)
+
+    def train(self, files: List[str],
+              vocab_size: int=30000,
+              min_frequency: int=2,
+              show_progress: bool=True,
+              special_tokens: List[str]=[]):
+        """ Train the model using the given files """
+
+        trainer = trainers.BpeTrainer.new(
+            vocab_size=vocab_size,
+            min_frequency=min_frequency,
+            show_progress=show_progress,
+            special_tokens=special_tokens,
+            initial_alphabet=pre_tokenizers.ByteLevel.alphabet()
+        )
+        self._tokenizer.train(trainer, files)
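Again for context, a hedged sketch of the new ByteLevelBPETokenizer.train method; the import path, constructor, and file path are assumptions for illustration.

# Sketch only: no initial alphabet needs to be passed here, since the method above
# already seeds the trainer with pre_tokenizers.ByteLevel.alphabet().
from tokenizers import ByteLevelBPETokenizer

tokenizer = ByteLevelBPETokenizer()  # assumes construction without existing vocab/merges files
tokenizer.train(
    ["corpus.txt"],  # placeholder training file
    vocab_size=30000,
    min_frequency=2,
)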
@@ -1,9 +1,9 @@
-from tokenizers import Tokenizer, pre_tokenizers, decoders
+from tokenizers import Tokenizer, pre_tokenizers, decoders, trainers
 from tokenizers.models import BPE
 from tokenizers.normalizers import NFKC
 from .base_tokenizer import BaseTokenizer
 
-from typing import Optional
+from typing import Optional, List
 
 class SentencePieceBPETokenizer(BaseTokenizer):
     """ SentencePiece BPE Tokenizer
@@ -41,3 +41,22 @@ class SentencePieceBPETokenizer(BaseTokenizer):
         }
 
         super().__init__(tokenizer, parameters)
+
+    def train(self, files: List[str],
+              vocab_size: int=30000,
+              min_frequency: int=2,
+              special_tokens: List[str]=["<unk>"],
+              limit_alphabet: int=1000,
+              initial_alphabet: List[str]=[],
+              show_progress: bool=True):
+        """ Train the model using the given files """
+
+        trainer = trainers.BpeTrainer.new(
+            vocab_size=vocab_size,
+            min_frequency=min_frequency,
+            special_tokens=special_tokens,
+            limit_alphabet=limit_alphabet,
+            initial_alphabet=initial_alphabet,
+            show_progress=show_progress
+        )
+        self._tokenizer.train(trainer, files)
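Finally, a hedged sketch of the new SentencePieceBPETokenizer.train method; as above, the import path, constructor, and file path are assumptions, not part of the commit.

# Sketch only: SentencePiece-style BPE trained from scratch via the method above.
from tokenizers import SentencePieceBPETokenizer

tokenizer = SentencePieceBPETokenizer()  # assumes construction without existing vocab/merges files
tokenizer.train(
    ["corpus.txt"],  # placeholder training file
    vocab_size=30000,
    min_frequency=2,
    special_tokens=["<unk>"],
)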