Doc - Improve snippets testing
bindings/python/tests/documentation/__init__.py  (new file, 0 lines)
bindings/python/tests/documentation/test_load.py  (new file, 19 lines)
@@ -0,0 +1,19 @@
from tokenizers import Tokenizer


def test_load_tokenizer():
    # START load_tokenizer
    tokenizer = Tokenizer.from_file("data/roberta.json")
    # END load_tokenizer

    example = "This is an example"
    ids = [713, 16, 41, 1246]
    tokens = ["This", "Ġis", "Ġan", "Ġexample"]

    encodings = tokenizer.encode(example)

    assert encodings.ids == ids
    assert encodings.tokens == tokens

    decoded = tokenizer.decode(ids)
    assert decoded == example
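
The test above assumes a data/roberta.json tokenizer file is available next to the tests. A minimal way to run just this documentation test, assuming pytest is installed and the command is launched from the bindings/python directory where the data/ fixtures live, is sketched below; the runner script itself is an illustration and not part of this commit:

# Sketch: run only the new documentation snippet test.
# Assumes pytest is installed and data/roberta.json exists (not part of this commit).
import sys
import pytest

if __name__ == "__main__":
    # -q keeps the output short; the path points at the new test module.
    sys.exit(pytest.main(["-q", "tests/documentation/test_load.py"]))
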
bindings/python/tests/documentation/test_train.py  (new file, 43 lines)
@@ -0,0 +1,43 @@
from tokenizers import (
    Tokenizer,
    normalizers,
    pre_tokenizers,
    models,
    decoders,
    processors,
    trainers,
    AddedToken,
)


def test_train_tokenizer():
    # START train_tokenizer
    vocab_size = 100

    tokenizer = Tokenizer(models.BPE())
    tokenizer.normalizer = normalizers.Sequence(
        [
            normalizers.Strip(),
            normalizers.NFC(),
        ]
    )
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
    tokenizer.post_processor = processors.ByteLevel()
    tokenizer.decoder = decoders.ByteLevel()

    trainer = trainers.BpeTrainer(
        vocab_size=vocab_size,
        min_frequency=0,
        special_tokens=[
            AddedToken("<s>"),
            AddedToken("<pad>"),
            AddedToken("</s>"),
            AddedToken("<unk>"),
            AddedToken("<mask>"),
        ],
        show_progress=False,
    )

    tokenizer.train(trainer, ["data/small.txt"])
    tokenizer.save("data/tokenizer.json")
    # END train_tokenizer
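
The # START train_tokenizer / # END train_tokenizer comments, like the load_tokenizer markers above, delimit the regions meant to be embedded in the documentation, so the published snippets stay covered by a test. A minimal sketch of how such a region could be pulled out of a test file is shown below; the extract_snippet helper and its behaviour are illustrative assumptions, not the tooling this commit ships:

import textwrap

def extract_snippet(path, name):
    # Return the dedented block between "# START <name>" and "# END <name>".
    # Illustrative helper only; the real documentation tooling may differ.
    lines, capture = [], False
    with open(path) as f:
        for line in f:
            stripped = line.strip()
            if stripped == f"# END {name}":
                break
            if capture:
                lines.append(line)
            if stripped == f"# START {name}":
                capture = True
    return textwrap.dedent("".join(lines))

print(extract_snippet("tests/documentation/test_train.py", "train_tokenizer"))
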