from tokenizers import (
    Tokenizer,
    normalizers,
    pre_tokenizers,
    models,
    decoders,
    processors,
    trainers,
    AddedToken,
)


def test_train_tokenizer():
    # START train_tokenizer
    vocab_size = 100

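    # Byte-level BPE in the GPT-2 style: the ByteLevel pre-tokenizer maps
    # raw bytes to printable characters, and the matching ByteLevel
    # post-processor and decoder reverse that mapping on output.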
    tokenizer = Tokenizer(models.BPE())
    tokenizer.normalizer = normalizers.Sequence(
        [
            normalizers.Strip(),
            normalizers.NFC(),
        ]
    )
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
    tokenizer.post_processor = processors.ByteLevel()
    tokenizer.decoder = decoders.ByteLevel()

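    # min_frequency=0 disables the merge-frequency threshold, so the small
    # vocab_size above is what actually bounds the vocabulary; the special
    # tokens follow the RoBERTa convention. For guaranteed full byte
    # coverage one would typically also pass
    # initial_alphabet=pre_tokenizers.ByteLevel.alphabet().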
    trainer = trainers.BpeTrainer(
        vocab_size=vocab_size,
        min_frequency=0,
        special_tokens=[
            AddedToken("<s>"),
            AddedToken("<pad>"),
            AddedToken("</s>"),
            AddedToken("<unk>"),
            AddedToken("<mask>"),
        ],
        show_progress=False,
    )

    # tokenizers >= 0.10 expects train(files, trainer=...); earlier
    # releases took the arguments in the reverse order.
    tokenizer.train(["data/small.txt"], trainer=trainer)
    tokenizer.save("data/tokenizer.json")
    # END train_tokenizer
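

# A minimal round-trip sketch, assuming test_train_tokenizer() above has
# already written data/tokenizer.json and that the sample text only uses
# characters present in data/small.txt (the trainer builds its alphabet
# from that file). The function name and sample string are illustrative.
def test_reload_tokenizer():
    tokenizer = Tokenizer.from_file("data/tokenizer.json")
    encoding = tokenizer.encode("a small test")
    # encoding.tokens holds byte-level symbols; decode() maps the ids
    # back through the ByteLevel decoder to plain text.
    print(encoding.tokens)
    print(tokenizer.decode(encoding.ids))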