Encode special tokens (#1437)

* add doc in the code * add option to skip special tokens * nits * add api dummy for now * Fmt. * Fix fmt. * Fix the stub. * add a test * add a test in python * style it * nits * add getter and setters * stub * update python test * fmt * last nit --------- Co-authored-by: Nicolas Patry <patry.nicolas@protonmail.com>
2025-08-22 16:25:30 +00:00 · 2024-01-19 12:43:43 +01:00
parent 888dd4bc65
commit 6a77d4859b
5 changed files with 173 additions and 0 deletions
--- a/bindings/python/py_src/tokenizers/__init__.pyi
+++ b/bindings/python/py_src/tokenizers/__init__.pyi
@@ -836,6 +836,18 @@ class Tokenizer:
        Returns:
            A :obj:`List` of :class:`~tokenizers.Encoding`: The encoded batch

+        """
+        pass
+    @property
+    def encode_special_tokens(self):
+        """
+        Whether special tokens found in the input text are encoded like
+        ordinary text instead of being matched as single tokens.
+
+        Args:
+            value (:obj:`bool`):
+                :obj:`True` to split special tokens like regular text
+
        """
        pass
    @staticmethod
--- a/bindings/python/src/tokenizer.rs
+++ b/bindings/python/src/tokenizer.rs
@@ -1109,6 +1109,25 @@ impl PyTokenizer {
        self.tokenizer.id_to_token(id)
    }

+    /// Sets whether special tokens found in the input text are encoded
+    /// like ordinary text instead of being matched as single tokens.
+    ///
+    /// Args:
+    ///     value (:obj:`bool`):
+    ///         :obj:`True` to split special tokens like regular text
+    ///
+    #[setter]
+    fn set_encode_special_tokens(&mut self, value: bool) {
+        self.tokenizer.set_encode_special_tokens(value);
+    }
+    /// Get the current value of the `encode_special_tokens` flag.
+    ///
+    /// Returns:
+    ///     :obj:`bool`: the value reported by the wrapped tokenizer
+    #[getter]
+    fn get_encode_special_tokens(&self) -> bool {
+        self.tokenizer.get_encode_special_tokens()
+    }
    /// Add the given tokens to the vocabulary
    ///
    /// The given tokens are added only if they don't already exist in the vocabulary.
--- a/bindings/python/tests/bindings/test_tokenizer.py
+++ b/bindings/python/tests/bindings/test_tokenizer.py
@@ -457,3 +457,34 @@ class TestTokenizer:
        output = tokenizer.encode("A sentence 🤗")
        assert output.ids == [1, 10, 2, 3, 4, 5, 10, 6, 7, 8, 9]
        assert output.tokens == ["A", " ", "sen", "te", "n", "ce", " ", "<0xF0>", "<0x9F>", "<0xA4>", "<0x97>"]
+
+    def test_encode_special_tokens(self):
+        tokenizer = Tokenizer.from_pretrained("t5-base")
+        tokenizer.add_tokens(["<eot>"])
+        tokenizer.add_special_tokens(["<end_of_text>"])
+        output = tokenizer.encode("Hey there<end_of_text> dear<eot>friend!", add_special_tokens=False)
+        assert output.tokens == ["▁Hey", "▁there", "<end_of_text>", "▁dear", "<eot>", "▁friend", "!"]
+
+        tokenizer.encode_special_tokens = True
+        assert tokenizer.encode_special_tokens == True
+
+        output = tokenizer.encode("Hey there<end_of_text> dear<eot>friend!", add_special_tokens=False)
+        assert output.tokens == [
+            "▁Hey",
+            "▁there",
+            "<",
+            "end",
+            "_",
+            "of",
+            "_",
+            "text",
+            ">",
+            "▁dear",
+            "<eot>",
+            "▁friend",
+            "!",
+        ]
+
+        tokenizer.add_tokens(["of_text>"])
+        output = tokenizer.encode("Hey there<end_of_text> dear<eot>friend!", add_special_tokens=False)
+        assert output.tokens == ["▁Hey", "▁there", "<", "end", "_", "of_text>", "▁dear", "<eot>", "▁friend", "!"]