# mirror of https://github.com/mii443/tokenizers.git
# synced 2025-08-22 16:25:30 +00:00
# 18 lines, 377 B, Python
"""Round-trip example for a serialized tokenizer.

Loads a RoBERTa tokenizer from a JSON file, encodes a sample sentence,
checks the produced ids and tokens against known-good values, then
decodes the ids and checks the original text is recovered.

NOTE(review): the ``# START``/``# END`` markers presumably delimit a
snippet extracted by the docs build — keep them intact.
"""
from tokenizers import Tokenizer

# START load
tokenizer = Tokenizer.from_file("data/roberta.json")
# END load

example = "This is an example"

# Expected ids and surface tokens for `example` under this vocabulary.
# "Ġ" is the byte-level-BPE marker for a leading space.
ids = [713, 16, 41, 1246]
tokens = ["This", "Ġis", "Ġan", "Ġexample"]

encodings = tokenizer.encode(example)

assert encodings.ids == ids
assert encodings.tokens == tokens

# Decoding the ids must reproduce the original sentence exactly.
decoded = tokenizer.decode(ids)

assert decoded == example