Mirror of https://github.com/mii443/tokenizers.git (synced 2025-08-23 00:35:35 +00:00)
Adding `Sequence` for `PostProcessor` (#1052)

* Adding `Sequence` for `PostProcessor`.
* Fixing node? Writing in the dark here, don't have Python 2.7.
* `undefined` is not accepted.
* Other test.
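For context, a minimal usage sketch of the new post-processor (not part of the diff; the vocabulary, template, and token ids mirror the test at the end of this commit):

    from tokenizers import Tokenizer
    from tokenizers.models import BPE
    from tokenizers.processors import Sequence, ByteLevel, TemplateProcessing

    tokenizer = Tokenizer(BPE())
    tokenizer.add_special_tokens(["[SEP]", "[CLS]"])
    tokenizer.add_tokens(["my", "name", "is", "Ġjohn", "pair"])

    # Chain ByteLevel offset trimming with a BERT-style template.
    tokenizer.post_processor = Sequence(
        [
            ByteLevel(trim_offsets=True),
            TemplateProcessing(
                single=["[CLS]", "$0", "[SEP]"],
                pair=["[CLS]:0", "$A", "[SEP]:0", "$B:1", "[SEP]:1"],
                special_tokens=[("[CLS]", 1), ("[SEP]", 0)],
            ),
        ]
    )

    encoding = tokenizer.encode("my name is Ġjohn")
    # Special tokens come from the template; offsets are trimmed by ByteLevel.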
@@ -5,4 +5,5 @@ PostProcessor = processors.PostProcessor
BertProcessing = processors.BertProcessing
ByteLevel = processors.ByteLevel
RobertaProcessing = processors.RobertaProcessing
Sequence = processors.Sequence
TemplateProcessing = processors.TemplateProcessing
@@ -193,6 +193,48 @@ class RobertaProcessing(PostProcessor):
        """
        pass

class Sequence(PostProcessor):
    """
    Sequence Processor

    Args:
        processors (:obj:`List[PostProcessor]`)
            The processors that need to be chained
    """

    def __init__(self, processors):
        pass
    def num_special_tokens_to_add(self, is_pair):
        """
        Return the number of special tokens that would be added for single/pair sentences.

        Args:
            is_pair (:obj:`bool`):
                Whether the input would be a pair of sequences

        Returns:
            :obj:`int`: The number of tokens to add
        """
        pass
    def process(self, encoding, pair=None, add_special_tokens=True):
        """
        Post-process the given encodings, generating the final one

        Args:
            encoding (:class:`~tokenizers.Encoding`):
                The encoding for the first sequence

            pair (:class:`~tokenizers.Encoding`, `optional`):
                The encoding for the pair sequence

            add_special_tokens (:obj:`bool`):
                Whether to add the special tokens

        Return:
            :class:`~tokenizers.Encoding`: The final encoding
        """
        pass

class TemplateProcessing(PostProcessor):
    """
    Provides a way to specify templates in order to add the special tokens to each
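For illustration, the two methods stubbed above can also be driven directly, without attaching the processor to a tokenizer. A sketch only, reusing the `tokenizer` and `template` names from the test at the end of this diff; the expected counts are an assumption based on the template, not asserted in this commit:

    # Chain ByteLevel offset trimming with the BERT-style template from the test below.
    processor = Sequence([ByteLevel(trim_offsets=True), template])

    # How many ids would the chain insert around a single sequence?
    processor.num_special_tokens_to_add(False)  # expected 2: [CLS] and [SEP] from the template

    # Apply the chain manually to an encoding produced without post-processing.
    encoding = tokenizer.encode("my name is Ġjohn", add_special_tokens=False)
    final = processor.process(encoding, add_special_tokens=True)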
@@ -104,6 +104,7 @@ fn processors(_py: Python, m: &PyModule) -> PyResult<()> {
    m.add_class::<processors::PyRobertaProcessing>()?;
    m.add_class::<processors::PyByteLevel>()?;
    m.add_class::<processors::PyTemplateProcessing>()?;
    m.add_class::<processors::PySequence>()?;
    Ok(())
}
@@ -11,6 +11,7 @@ use serde::{Deserialize, Serialize};
use tk::processors::bert::BertProcessing;
use tk::processors::byte_level::ByteLevel;
use tk::processors::roberta::RobertaProcessing;
use tk::processors::sequence::Sequence;
use tk::processors::template::{SpecialToken, Template};
use tk::processors::PostProcessorWrapper;
use tk::{Encoding, PostProcessor};

@@ -50,6 +51,7 @@ impl PyPostProcessor {
            PostProcessorWrapper::Template(_) => {
                Py::new(py, (PyTemplateProcessing {}, base))?.into_py(py)
            }
            PostProcessorWrapper::Sequence(_) => Py::new(py, (PySequence {}, base))?.into_py(py),
        })
    }
}

@@ -414,6 +416,37 @@ impl PyTemplateProcessing {
    }
}

/// Sequence Processor
///
/// Args:
///     processors (:obj:`List[PostProcessor]`)
///         The processors that need to be chained
#[pyclass(extends=PyPostProcessor, module = "tokenizers.processors", name = "Sequence")]
#[pyo3(text_signature = "(self, processors)")]
pub struct PySequence {}
#[pymethods]
impl PySequence {
    #[new]
    #[args(processors)]
    fn new(processors_py: &PyList) -> (Self, PyPostProcessor) {
        let mut processors: Vec<PostProcessorWrapper> = Vec::with_capacity(processors_py.len());
        for n in processors_py.iter() {
            let processor: PyRef<PyPostProcessor> = n.extract().unwrap();
            let processor = processor.processor.as_ref();
            processors.push(processor.clone());
        }
        let sequence_processor = Sequence::new(processors);
        (
            PySequence {},
            PyPostProcessor::new(Arc::new(PostProcessorWrapper::Sequence(sequence_processor))),
        )
    }

    fn __getnewargs__<'p>(&self, py: Python<'p>) -> &'p PyTuple {
        PyTuple::new(py, &[PyList::empty(py)])
    }
}

#[cfg(test)]
mod test {
    use std::sync::Arc;
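On the `__getnewargs__` above: it hands pickle an empty list so that unpickling can call the constructor before the real state is restored (the assumption here is that the serialized state comes from the base `PyPostProcessor`, which this diff does not show). A minimal round-trip sketch, mirroring what the test below asserts:

    import pickle
    from tokenizers.processors import Sequence, PostProcessor

    serialized = pickle.dumps(Sequence([]))   # __getnewargs__ supplies the empty list on load
    restored = pickle.loads(serialized)
    assert isinstance(restored, Sequence) and isinstance(restored, PostProcessor)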
@@ -13,6 +13,7 @@ from tokenizers.processors import (
    RobertaProcessing,
    ByteLevel,
    TemplateProcessing,
    Sequence,
)

@@ -179,3 +180,49 @@ class TestTemplateProcessing:
        tokenizer.post_processor = self.get_roberta()
        template = tokenizer.encode("my name is john", "pair")
        assert original.ids == template.ids


class TestSequenceProcessing:
    def test_sequence_processing(self):
        assert Sequence([]) is not None
        assert Sequence([ByteLevel()]) is not None
        assert isinstance(Sequence([]), PostProcessor)
        assert isinstance(Sequence([]), Sequence)
        serialized = pickle.dumps(Sequence([]))
        assert isinstance(pickle.loads(serialized), Sequence)

    def test_post_process(self):
        byte_level = ByteLevel(trim_offsets=True)
        template = TemplateProcessing(
            single=["[CLS]", "$0", "[SEP]"],
            pair=["[CLS]:0", "$A", "[SEP]:0", "$B:1", "[SEP]:1"],
            special_tokens=[("[CLS]", 1), ("[SEP]", 0)],
        )

        tokenizer = Tokenizer(BPE())
        tokenizer.add_special_tokens(["[SEP]", "[CLS]"])
        tokenizer.add_tokens(["my", "name", "is", "Ġjohn", "pair"])
        tokenizer.post_processor = template

        # Before the sequence
        original = tokenizer.encode("my name is Ġjohn")
        assert original.ids == [1, 2, 3, 4, 5, 0]
        assert original.type_ids == [0, 0, 0, 0, 0, 0]
        assert original.offsets == [(0, 0), (0, 2), (3, 7), (8, 10), (11, 16), (0, 0)]
        pair = tokenizer.encode("my name is Ġjohn", "pair")
        # assert pair.ids == [1, 2, 3, 4, 5, 0, 6, 0]
        assert pair.type_ids == [0, 0, 0, 0, 0, 0, 1, 1]
        assert pair.offsets == [(0, 0), (0, 2), (3, 7), (8, 10), (11, 16), (0, 0), (0, 4), (0, 0)]

        processor = Sequence([byte_level, template])
        tokenizer.post_processor = processor

        original = tokenizer.encode("my name is Ġjohn")
        assert original.ids == [1, 2, 3, 4, 5, 0]
        assert original.type_ids == [0, 0, 0, 0, 0, 0]
        # Offsets ARE trimmed
        assert original.offsets == [(0, 0), (0, 2), (3, 7), (8, 10), (12, 16), (0, 0)]
        pair = tokenizer.encode("my name is Ġjohn", "pair")
        # assert pair.ids == [1, 2, 3, 4, 5, 0, 6, 0]
        assert pair.type_ids == [0, 0, 0, 0, 0, 0, 1, 1]
        assert pair.offsets == [(0, 0), (0, 2), (3, 7), (8, 10), (12, 16), (0, 0), (0, 4), (0, 0)]