Python - Handle training on custom classes
@@ -1,11 +1,11 @@
-from tokenizers import Tokenizer, decoders
+from tokenizers import Tokenizer, decoders, trainers
 from tokenizers.models import WordPiece
 from tokenizers.normalizers import BertNormalizer
 from tokenizers.pre_tokenizers import BertPreTokenizer
 from tokenizers.processors import BertProcessing
 from .base_tokenizer import BaseTokenizer
 
-from typing import Optional
+from typing import Optional, List
 
 class BertWordPieceTokenizer(BaseTokenizer):
     """ Bert WordPiece Tokenizer """
@@ -61,3 +61,24 @@ class BertWordPieceTokenizer(BaseTokenizer):
         }
 
         super().__init__(tokenizer, parameters)
+
+    def train(self, files: List[str],
+              vocab_size: int=30000,
+              min_frequency: int=2,
+              limit_alphabet: int=1000,
+              initial_alphabet: List[str]=[],
+              special_tokens: List[str]=["[UNK]", "[SEP]", "[CLS]"],
+              show_progress: bool=True,
+              wordpieces_prefix: str="##"):
+        """ Train the model using the given files """
+
+        trainer = trainers.WordPieceTrainer.new(
+            vocab_size=vocab_size,
+            min_frequency=min_frequency,
+            limit_alphabet=limit_alphabet,
+            initial_alphabet=initial_alphabet,
+            special_tokens=special_tokens,
+            show_progress=show_progress,
+            continuing_subword_prefix=wordpieces_prefix
+        )
+        self._tokenizer.train(trainer, files)
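For context (not part of the diff): a minimal usage sketch of the WordPiece training API added above. The corpus path is a placeholder, and it assumes a BertWordPieceTokenizer can be constructed without a vocabulary file so that training builds the vocabulary from scratch.

from tokenizers import BertWordPieceTokenizer

# Build an empty BERT WordPiece tokenizer and learn its vocabulary from raw text.
# "data/corpus.txt" is a placeholder path for any plain-text training file.
tokenizer = BertWordPieceTokenizer()
tokenizer.train(
    files=["data/corpus.txt"],
    vocab_size=30000,
    min_frequency=2,
    special_tokens=["[UNK]", "[SEP]", "[CLS]"],
    wordpieces_prefix="##",
)

# Once trained, the same object can encode text directly.
print(tokenizer.encode("Hello, how are you?").tokens)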
@@ -1,9 +1,9 @@
-from tokenizers import Tokenizer, pre_tokenizers, decoders
+from tokenizers import Tokenizer, pre_tokenizers, decoders, trainers
 from tokenizers.models import BPE
 from tokenizers.normalizers import NFKC, Sequence, Lowercase
 from .base_tokenizer import BaseTokenizer
 
-from typing import Optional
+from typing import Optional, List
 
 class BPETokenizer(BaseTokenizer):
     """ Original BPE Tokenizer
@@ -41,3 +41,24 @@ class BPETokenizer(BaseTokenizer):
         }
 
         super().__init__(tokenizer, parameters)
+
+    def train(self, files: List[str],
+              vocab_size: int=30000,
+              min_frequency: int=2,
+              special_tokens: List[str]=["<unk>"],
+              limit_alphabet: int=1000,
+              initial_alphabet: List[str]=[],
+              suffix: Optional[str]="</w>",
+              show_progress: bool=True):
+        """ Train the model using the given files """
+
+        trainer = trainers.BpeTrainer.new(
+            vocab_size=vocab_size,
+            min_frequency=min_frequency,
+            special_tokens=special_tokens,
+            limit_alphabet=limit_alphabet,
+            initial_alphabet=initial_alphabet,
+            end_of_word_suffix=suffix,
+            show_progress=show_progress
+        )
+        self._tokenizer.train(trainer, files)
@@ -1,9 +1,9 @@
-from tokenizers import Tokenizer, pre_tokenizers, decoders
+from tokenizers import Tokenizer, pre_tokenizers, decoders, trainers
 from tokenizers.models import BPE
 from tokenizers.normalizers import NFKC
 from .base_tokenizer import BaseTokenizer
 
-from typing import Optional
+from typing import Optional, List
 
 class ByteLevelBPETokenizer(BaseTokenizer):
     """ ByteLevelBPETokenizer
@@ -30,3 +30,19 @@ class ByteLevelBPETokenizer(BaseTokenizer):
         }
 
         super().__init__(tokenizer, parameters)
+
+    def train(self, files: List[str],
+              vocab_size: int=30000,
+              min_frequency: int=2,
+              show_progress: bool=True,
+              special_tokens: List[str]=[]):
+        """ Train the model using the given files """
+
+        trainer = trainers.BpeTrainer.new(
+            vocab_size=vocab_size,
+            min_frequency=min_frequency,
+            show_progress=show_progress,
+            special_tokens=special_tokens,
+            initial_alphabet=pre_tokenizers.ByteLevel.alphabet()
+        )
+        self._tokenizer.train(trainer, files)
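One detail worth noting in the hunk above: unlike the other classes, ByteLevelBPETokenizer.train does not expose initial_alphabet or limit_alphabet; it always seeds the trainer with pre_tokenizers.ByteLevel.alphabet(), so every possible byte keeps a token in the vocabulary. A minimal usage sketch follows (not part of the diff; the file path and special tokens are illustrative only).

from tokenizers import ByteLevelBPETokenizer

# Train a byte-level BPE from scratch; the byte alphabet is handled internally,
# so only vocabulary size, frequency cut-off, and special tokens need choosing.
tokenizer = ByteLevelBPETokenizer()
tokenizer.train(
    files=["data/corpus.txt"],                # placeholder corpus path
    vocab_size=30000,
    min_frequency=2,
    special_tokens=["<s>", "<pad>", "</s>"],  # the commit defaults to []; shown here for illustration
)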
@@ -1,9 +1,9 @@
-from tokenizers import Tokenizer, pre_tokenizers, decoders
+from tokenizers import Tokenizer, pre_tokenizers, decoders, trainers
 from tokenizers.models import BPE
 from tokenizers.normalizers import NFKC
 from .base_tokenizer import BaseTokenizer
 
-from typing import Optional
+from typing import Optional, List
 
 class SentencePieceBPETokenizer(BaseTokenizer):
     """ SentencePiece BPE Tokenizer
@@ -41,3 +41,22 @@ class SentencePieceBPETokenizer(BaseTokenizer):
         }
 
         super().__init__(tokenizer, parameters)
+
+    def train(self, files: List[str],
+              vocab_size: int=30000,
+              min_frequency: int=2,
+              special_tokens: List[str]=["<unk>"],
+              limit_alphabet: int=1000,
+              initial_alphabet: List[str]=[],
+              show_progress: bool=True):
+        """ Train the model using the given files """
+
+        trainer = trainers.BpeTrainer.new(
+            vocab_size=vocab_size,
+            min_frequency=min_frequency,
+            special_tokens=special_tokens,
+            limit_alphabet=limit_alphabet,
+            initial_alphabet=initial_alphabet,
+            show_progress=show_progress
+        )
+        self._tokenizer.train(trainer, files)