Mirror of https://github.com/mii443/tokenizers.git, synced 2025-08-23 16:49:27 +00:00
Doc - Improve snippets testing
bindings/python/tests/documentation/test_train.py (new file, +43 lines)
@@ -0,0 +1,43 @@
from tokenizers import (
    Tokenizer,
    normalizers,
    pre_tokenizers,
    models,
    decoders,
    processors,
    trainers,
    AddedToken,
)


def test_train_tokenizer():
    # START train_tokenizer
    vocab_size = 100

    tokenizer = Tokenizer(models.BPE())
    tokenizer.normalizer = normalizers.Sequence(
        [
            normalizers.Strip(),
            normalizers.NFC(),
        ]
    )
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
    tokenizer.post_processor = processors.ByteLevel()
    tokenizer.decoder = decoders.ByteLevel()

    trainer = trainers.BpeTrainer(
        vocab_size=vocab_size,
        min_frequency=0,
        special_tokens=[
            AddedToken("<s>"),
            AddedToken("<pad>"),
            AddedToken("</s>"),
            AddedToken("<unk>"),
            AddedToken("<mask>"),
        ],
        show_progress=False,
    )

    tokenizer.train(trainer, ["data/small.txt"])
    tokenizer.save("data/tokenizer.json")
    # END train_tokenizer
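
For context, once this test has run, the trained tokenizer can be reloaded from the JSON file it saves. The sketch below is not part of the commit; it assumes the data/tokenizer.json written above and uses the standard Tokenizer.from_file / encode / decode API of the tokenizers library (the sample sentence is arbitrary).

    # Not part of the commit: a minimal sketch showing how the artifact
    # written by test_train_tokenizer() could be loaded and exercised.
    from tokenizers import Tokenizer

    # Assumes data/tokenizer.json was produced by the test above.
    tokenizer = Tokenizer.from_file("data/tokenizer.json")

    # Round-trip a sample sentence through the trained byte-level BPE pipeline.
    encoding = tokenizer.encode("Hello, world!")
    print(encoding.tokens)                  # byte-level subword tokens
    print(encoding.ids)                     # their vocabulary ids
    print(tokenizer.decode(encoding.ids))   # decoded back to text

Since the trainer capped the vocabulary at 100 tokens and trained on a small file, most words will split into several subword pieces; the byte-level decoder reassembles them, including whitespace, when decoding.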