mirror of https://github.com/mii443/tokenizers.git
synced 2025-08-23 08:45:38 +00:00
20 lines · 469 B · Python
from tokenizers import Tokenizer
|
|
|
|
|
|
def test_load_tokenizer():
    """Round-trip check for a tokenizer deserialized from a JSON file.

    Loads the serialized RoBERTa tokenizer, encodes a fixed sentence, and
    verifies that both the token ids and the token strings match the known
    expected values; finally decodes the ids back to the original text.
    """
    # START load_tokenizer
    tokenizer = Tokenizer.from_file("data/roberta.json")
    # END load_tokenizer

    sentence = "This is an example"
    expected_ids = [713, 16, 41, 1246]
    # "Ġ" is the byte-level BPE marker for a leading space.
    expected_tokens = ["This", "Ġis", "Ġan", "Ġexample"]

    encoding = tokenizer.encode(sentence)

    assert encoding.ids == expected_ids
    assert encoding.tokens == expected_tokens

    # Decoding the expected ids must reconstruct the original sentence.
    assert tokenizer.decode(expected_ids) == sentence
|