Doc - Improve snippets testing

This commit is contained in:
Anthony MOI
2020-10-02 15:52:33 -04:00
committed by Anthony MOI
parent f4e7754112
commit 000c19a7a5
12 changed files with 84 additions and 68 deletions

@ -0,0 +1,43 @@
from tokenizers import (
Tokenizer,
normalizers,
pre_tokenizers,
models,
decoders,
processors,
trainers,
AddedToken,
)

def test_train_tokenizer():
    # START train_tokenizer
    vocab_size = 100

    # Start from an empty BPE model
    tokenizer = Tokenizer(models.BPE())

    # Normalize inputs: strip surrounding whitespace, then apply NFC unicode normalization
    tokenizer.normalizer = normalizers.Sequence(
        [
            normalizers.Strip(),
            normalizers.NFC(),
        ]
    )

    # Byte-level pre-tokenization, post-processing and decoding, in the style of GPT-2
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
    tokenizer.post_processor = processors.ByteLevel()
    tokenizer.decoder = decoders.ByteLevel()

    # Configure the BPE trainer with the target vocabulary size and the special tokens
    trainer = trainers.BpeTrainer(
        vocab_size=vocab_size,
        min_frequency=0,
        special_tokens=[
            AddedToken("<s>"),
            AddedToken("<pad>"),
            AddedToken("</s>"),
            AddedToken("<unk>"),
            AddedToken("<mask>"),
        ],
        show_progress=False,
    )

    # Train on a small text file, then serialize the full tokenizer to JSON
    tokenizer.train(trainer, ["data/small.txt"])
    tokenizer.save("data/tokenizer.json")
    # END train_tokenizer
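
A minimal usage sketch for the tokenizer saved above: loading it back with Tokenizer.from_file and round-tripping a sample sentence. The data/tokenizer.json path is the one used in the snippet; the sample text is illustrative.

from tokenizers import Tokenizer

# Load the tokenizer serialized by the snippet above
tokenizer = Tokenizer.from_file("data/tokenizer.json")

# Encode an illustrative sentence; byte-level BPE avoids out-of-vocabulary failures
encoding = tokenizer.encode("Hello, world!")
print(encoding.tokens)  # learned sub-word tokens
print(encoding.ids)     # corresponding vocabulary ids

# Decoding the ids should round-trip back to the text (modulo normalization)
print(tokenizer.decode(encoding.ids))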