Encode special tokens (#1437)

* Add documentation in the code

* Add option to skip special tokens

* Nits

* Add API dummy for now

* Fmt

* Fix fmt

* Fix the stub

* Add a test

* Add a test in Python

* Style it

* Nits

* Add getters and setters

* Stub

* Update Python test

* Fmt

* Last nit

---------

Co-authored-by: Nicolas Patry <patry.nicolas@protonmail.com>
Author: Arthur
Date: 2024-01-19 12:43:43 +01:00
Committed by: GitHub
Parent: 888dd4bc65
Commit: 6a77d4859b
5 changed files with 173 additions and 0 deletions
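For context, the feature under test is a new encode_special_tokens getter/setter on Tokenizer: when set to True, encode tokenizes special tokens like ordinary text instead of matching them atomically. A minimal usage sketch of the Python API, assuming network access so Tokenizer.from_pretrained can fetch "t5-base"; the expected tokens are taken from the assertions in the diff below, on a shortened input string:

from tokenizers import Tokenizer

tokenizer = Tokenizer.from_pretrained("t5-base")
tokenizer.add_special_tokens(["<end_of_text>"])

# Default behavior: the special token is matched atomically.
output = tokenizer.encode("Hey there<end_of_text>", add_special_tokens=False)
print(output.tokens)  # ['▁Hey', '▁there', '<end_of_text>']

# With the new option enabled, the special token is split like plain text.
tokenizer.encode_special_tokens = True
output = tokenizer.encode("Hey there<end_of_text>", add_special_tokens=False)
print(output.tokens)  # ['▁Hey', '▁there', '<', 'end', '_', 'of', '_', 'text', '>']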


@@ -457,3 +457,34 @@ class TestTokenizer:
output = tokenizer.encode("A sentence 🤗")
assert output.ids == [1, 10, 2, 3, 4, 5, 10, 6, 7, 8, 9]
assert output.tokens == ["A", " ", "sen", "te", "n", "ce", " ", "<0xF0>", "<0x9F>", "<0xA4>", "<0x97>"]
def test_encode_special_tokens(self):
tokenizer = Tokenizer.from_pretrained("t5-base")
tokenizer.add_tokens(["<eot>"])
tokenizer.add_special_tokens(["<end_of_text>"])
output = tokenizer.encode("Hey there<end_of_text> dear<eot>friend!", add_special_tokens=False)
assert output.tokens == ["▁Hey", "▁there", "<end_of_text>", "▁dear", "<eot>", "▁friend", "!"]
tokenizer.encode_special_tokens = True
assert tokenizer.encode_special_tokens == True
output = tokenizer.encode("Hey there<end_of_text> dear<eot>friend!", add_special_tokens=False)
assert output.tokens == [
"▁Hey",
"▁there",
"<",
"end",
"_",
"of",
"_",
"text",
">",
"▁dear",
"<eot>",
"▁friend",
"!",
]
tokenizer.add_tokens(["of_text>"])
output = tokenizer.encode("Hey there<end_of_text> dear<eot>friend!", add_special_tokens=False)
assert output.tokens == ["▁Hey", "▁there", "<", "end", "_", "of_text>", "▁dear", "<eot>", "▁friend", "!"]