Encode special tokens (#1437)
* add doc in the code
* add option to skip special tokens
* nits
* add api dummy for now
* Fmt.
* Fix fmt.
* Fix the stub.
* add a test
* add a test in python
* style it
* nits
* add getter and setters
* stub
* update python test
* fmt
* last nit

---------

Co-authored-by: Nicolas Patry <patry.nicolas@protonmail.com>
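The test added below exercises the new option end to end. As a minimal standalone sketch of the behaviour (assuming the 🤗 tokenizers Python bindings and network access to fetch "t5-base"; the printed splits mirror the assertions in the test below):

from tokenizers import Tokenizer

tokenizer = Tokenizer.from_pretrained("t5-base")
tokenizer.add_special_tokens(["<end_of_text>"])

# Default behaviour: a registered special token occurring in the input
# is matched and emitted as a single token.
output = tokenizer.encode("Hey there<end_of_text>", add_special_tokens=False)
print(output.tokens)  # ["▁Hey", "▁there", "<end_of_text>"]

# The new flag disables that matching, so the special token's text is
# tokenized like any other content.
tokenizer.encode_special_tokens = True
output = tokenizer.encode("Hey there<end_of_text>", add_special_tokens=False)
print(output.tokens)  # ["▁Hey", "▁there", "<", "end", "_", "of", "_", "text", ">"]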
@@ -457,3 +457,34 @@ class TestTokenizer:
         output = tokenizer.encode("A sentence 🤗")
         assert output.ids == [1, 10, 2, 3, 4, 5, 10, 6, 7, 8, 9]
         assert output.tokens == ["A", " ", "sen", "te", "n", "ce", " ", "<0xF0>", "<0x9F>", "<0xA4>", "<0x97>"]
+
+    def test_encode_special_tokens(self):
+        tokenizer = Tokenizer.from_pretrained("t5-base")
+        tokenizer.add_tokens(["<eot>"])
+        tokenizer.add_special_tokens(["<end_of_text>"])
+        output = tokenizer.encode("Hey there<end_of_text> dear<eot>friend!", add_special_tokens=False)
+        assert output.tokens == ["▁Hey", "▁there", "<end_of_text>", "▁dear", "<eot>", "▁friend", "!"]
+
+        tokenizer.encode_special_tokens = True
+        assert tokenizer.encode_special_tokens == True
+
+        output = tokenizer.encode("Hey there<end_of_text> dear<eot>friend!", add_special_tokens=False)
+        assert output.tokens == [
+            "▁Hey",
+            "▁there",
+            "<",
+            "end",
+            "_",
+            "of",
+            "_",
+            "text",
+            ">",
+            "▁dear",
+            "<eot>",
+            "▁friend",
+            "!",
+        ]
+
+        tokenizer.add_tokens(["of_text>"])
+        output = tokenizer.encode("Hey there<end_of_text> dear<eot>friend!", add_special_tokens=False)
+        assert output.tokens == ["▁Hey", "▁there", "<", "end", "_", "of_text>", "▁dear", "<eot>", "▁friend", "!"]