Encode special tokens (#1437)

* Add documentation in the code

* Add option to skip special tokens

* Nits

* Add API dummy for now

* Fmt

* Fix fmt

* Fix the stub

* Add a test

* Add a test in Python

* Style it

* Nits

* Add getters and setters

* Stub

* Update Python test

* Fmt

* Last nit

---------

Co-authored-by: Nicolas Patry <patry.nicolas@protonmail.com>
Author: Arthur
Date: 2024-01-19 12:43:43 +01:00
Committed by: GitHub
Parent: 888dd4bc65
Commit: 6a77d4859b
5 changed files with 173 additions and 0 deletions
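For context, the feature under test is a new encode_special_tokens getter/setter on Tokenizer: when set to True, encode tokenizes special tokens like ordinary text instead of matching them atomically. A minimal usage sketch of the Python API, assuming network access so Tokenizer.from_pretrained can fetch "t5-base"; the expected tokens are taken from the assertions in the diff below, on a shortened input string:

from tokenizers import Tokenizer

tokenizer = Tokenizer.from_pretrained("t5-base")
tokenizer.add_special_tokens(["<end_of_text>"])

# Default behavior: the special token is matched atomically.
output = tokenizer.encode("Hey there<end_of_text>", add_special_tokens=False)
print(output.tokens)  # ['▁Hey', '▁there', '<end_of_text>']

# With the new option enabled, the special token is split like plain text.
tokenizer.encode_special_tokens = True
output = tokenizer.encode("Hey there<end_of_text>", add_special_tokens=False)
print(output.tokens)  # ['▁Hey', '▁there', '<', 'end', '_', 'of', '_', 'text', '>']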


@@ -457,3 +457,34 @@ class TestTokenizer:
output = tokenizer.encode("A sentence 🤗")
assert output.ids == [1, 10, 2, 3, 4, 5, 10, 6, 7, 8, 9]
assert output.tokens == ["A", " ", "sen", "te", "n", "ce", " ", "<0xF0>", "<0x9F>", "<0xA4>", "<0x97>"]
def test_encode_special_tokens(self):
tokenizer = Tokenizer.from_pretrained("t5-base")
tokenizer.add_tokens(["<eot>"])
tokenizer.add_special_tokens(["<end_of_text>"])
output = tokenizer.encode("Hey there<end_of_text> dear<eot>friend!", add_special_tokens=False)
assert output.tokens == ["▁Hey", "▁there", "<end_of_text>", "▁dear", "<eot>", "▁friend", "!"]
tokenizer.encode_special_tokens = True
assert tokenizer.encode_special_tokens == True
output = tokenizer.encode("Hey there<end_of_text> dear<eot>friend!", add_special_tokens=False)
assert output.tokens == [
"▁Hey",
"▁there",
"<",
"end",
"_",
"of",
"_",
"text",
">",
"▁dear",
"<eot>",
"▁friend",
"!",
]
tokenizer.add_tokens(["of_text>"])
output = tokenizer.encode("Hey there<end_of_text> dear<eot>friend!", add_special_tokens=False)
assert output.tokens == ["▁Hey", "▁there", "<", "end", "_", "of_text>", "▁dear", "<eot>", "▁friend", "!"]