Python - Bindings for TemplateProcessing
@@ -4,3 +4,4 @@ PostProcessor = processors.PostProcessor
 BertProcessing = processors.BertProcessing
 RobertaProcessing = processors.RobertaProcessing
 ByteLevel = processors.ByteLevel
+TemplateProcessing = processors.TemplateProcessing
@@ -1,4 +1,4 @@
-from typing import Tuple
+from typing import Tuple, Union, List

 class PostProcessor:
     """ Base class for all post-processors
@@ -89,7 +89,7 @@ class ByteLevel(PostProcessor):
     want the offsets to include these whitespaces, then this PostProcessor must be used.
     """

-    def __init(self, trim_offsets: bool = True) -> None:
+    def __init__(self, trim_offsets: bool = True) -> None:
         """ Instantiate a new ByteLevel

         Args:
@@ -97,3 +97,67 @@ class ByteLevel(PostProcessor):
                 Whether to trim the whitespaces from the produced offsets.
         """
         pass
+
+
+Template = Union[str, List[str]]
+Tokens = List[Union[Tuple[int, str], Tuple[str, int], dict]]
+
+
+class TemplateProcessing(PostProcessor):
+    """ TemplateProcessing
+
+    Provides a way to specify templates in order to add the special tokens to each
+    input sequence as relevant.
+
+    Let's take the `BERT` tokenizer as an example. It uses two special tokens to
+    delimit each sequence: `[CLS]` is always used at the beginning of the first
+    sequence, and `[SEP]` is added at the end of both the first and the pair
+    sequence. The final result looks like this:
+        - Single sequence: `[CLS] Hello there [SEP]`
+        - Pair sequences: `[CLS] My name is Anthony [SEP] What is my name? [SEP]`
+
+    You can achieve such behavior using a TemplateProcessing:
+    ```
+    TemplateProcessing(
+        seq_a="[CLS] $0 [SEP]",
+        seq_b="$1 [SEP]",
+        special_tokens=[("[CLS]", 1), ("[SEP]", 0)],
+    )
+    ```
+
+    In this example, $0 and $1 both represent the input sequences. The number in this
+    identifier is actually the default type_id that will be used for each sequence. So,
+    in this case, the first sequence will use 0, while the pair sequence will use 1.
+
+    Note that we are saying the "default" type_id because each SpecialToken can define
+    its own type_id, which would override the provided default.
+    """
+
+    def __init__(self, seq_a: Template, seq_b: Template, special_tokens: Tokens) -> None:
+        """ Instantiate a new TemplateProcessing
+
+        Args:
+            seq_a: Template
+                The template for the first sequence.
+
+            seq_b: Template:
+                The template for the pair sequence.
+
+            special_tokens: Tokens:
+                The list of special tokens used in each sequence.
+
+        Template: Union[str, List[str]]:
+            - If a `str` is provided, the whitespace is used as a delimiter between tokens
+            - If a `List[str]` is provided, a list of tokens
+
+        Tokens: List[Union[Tuple[int, str], Tuple[str, int], dict]]:
+            - A Tuple with both a token and its associated ID, in any order
+            - A dict with the following keys:
+                - "id": str => The special token id, as specified in the Template
+                - "ids": List[int] => The associated IDs
+                - "tokens": List[str] => The associated tokens
+                - "type_ids": Optional[List[Optional[int]]] => If specified, a list of
+                  optional type_ids. If a `type_id` is not specified, the one from the
+                  input sequence will be used.
+              The given dict expects the provided `ids`, `tokens` and `type_ids` lists
+              to have the same length.
+        """
+        pass
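For context on the stub above, here is a minimal usage sketch of the template API described in that docstring, in the spirit of the tests added later in this commit. The toy vocabulary and the special-token IDs are illustrative assumptions, not values taken from the diff.

```
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.processors import TemplateProcessing

# Build a toy tokenizer; the special tokens get IDs 0 and 1 in insertion order.
tokenizer = Tokenizer(BPE())
tokenizer.add_special_tokens(["[CLS]", "[SEP]"])
tokenizer.add_tokens(["my", "name", "is", "john", "pair"])

# Same template shape as in the docstring: $0/$1 carry the default type_ids
# 0 and 1, and each special token is declared together with its vocabulary ID.
tokenizer.post_processor = TemplateProcessing(
    seq_a="[CLS] $0 [SEP]",
    seq_b="$1 [SEP]",
    special_tokens=[("[CLS]", 0), ("[SEP]", 1)],
)

output = tokenizer.encode("my name", "pair")
# The template wraps the inputs in [CLS]/[SEP] and assigns type_id 0 to the
# first sequence and 1 to the pair, as described in the docstring above.
print(output.tokens, output.type_ids)
```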
@@ -1,3 +1,5 @@
+#![warn(clippy::all)]
+
 extern crate tokenizers as tk;

 mod decoders;
@@ -90,6 +92,7 @@ fn processors(_py: Python, m: &PyModule) -> PyResult<()> {
     m.add_class::<processors::PyBertProcessing>()?;
     m.add_class::<processors::PyRobertaProcessing>()?;
     m.add_class::<processors::PyByteLevel>()?;
+    m.add_class::<processors::PyTemplateProcessing>()?;
     Ok(())
 }
@@ -4,10 +4,12 @@ use pyo3::exceptions;
 use pyo3::prelude::*;
 use pyo3::types::*;

+use crate::error::ToPyResult;
 use serde::{Deserialize, Serialize};
 use tk::processors::bert::BertProcessing;
 use tk::processors::byte_level::ByteLevel;
 use tk::processors::roberta::RobertaProcessing;
+use tk::processors::template::{SpecialToken, Template};
 use tk::processors::PostProcessorWrapper;
 use tk::{Encoding, PostProcessor};
 use tokenizers as tk;
@@ -38,6 +40,9 @@ impl PyPostProcessor {
             PostProcessorWrapper::Roberta(_) => {
                 Py::new(py, (PyRobertaProcessing {}, base)).map(Into::into)
             }
+            PostProcessorWrapper::Template(_) => {
+                Py::new(py, (PyTemplateProcessing {}, base)).map(Into::into)
+            }
         }
     }
 }
@@ -158,6 +163,104 @@ impl PyByteLevel {
     }
 }

+#[derive(Clone, Debug)]
+pub struct PySpecialToken(SpecialToken);
+
+impl From<PySpecialToken> for SpecialToken {
+    fn from(v: PySpecialToken) -> Self {
+        v.0
+    }
+}
+
+impl FromPyObject<'_> for PySpecialToken {
+    fn extract(ob: &PyAny) -> PyResult<Self> {
+        if let Ok(v) = ob.extract::<(String, u32)>() {
+            Ok(Self(v.into()))
+        } else if let Ok(v) = ob.extract::<(u32, String)>() {
+            Ok(Self(v.into()))
+        } else if let Ok(d) = ob.downcast::<PyDict>() {
+            let id = d
+                .get_item("id")
+                .ok_or_else(|| exceptions::ValueError::py_err("`id` must be specified"))?
+                .extract::<String>()?;
+            let ids = d
+                .get_item("ids")
+                .ok_or_else(|| exceptions::ValueError::py_err("`ids` must be specified"))?
+                .extract::<Vec<u32>>()?;
+            let type_ids = d.get_item("type_ids").map_or_else(
+                || Ok(vec![None; ids.len()]),
+                |v| v.extract::<Vec<Option<u32>>>(),
+            )?;
+            let tokens = d
+                .get_item("tokens")
+                .ok_or_else(|| exceptions::ValueError::py_err("`tokens` must be specified"))?
+                .extract::<Vec<String>>()?;
+
+            Ok(Self(
+                ToPyResult(SpecialToken::new(id, ids, type_ids, tokens)).into_py()?,
+            ))
+        } else {
+            Err(exceptions::TypeError::py_err(
+                "Expected Union[Tuple[str, int], Tuple[int, str], dict]",
+            ))
+        }
+    }
+}
+
+#[derive(Clone, Debug)]
+pub struct PyTemplate(Template);
+
+impl From<PyTemplate> for Template {
+    fn from(v: PyTemplate) -> Self {
+        v.0
+    }
+}
+
+impl FromPyObject<'_> for PyTemplate {
+    fn extract(ob: &PyAny) -> PyResult<Self> {
+        if let Ok(s) = ob.extract::<&str>() {
+            Ok(Self(s.into()))
+        } else if let Ok(s) = ob.extract::<Vec<&str>>() {
+            Ok(Self(s.into()))
+        } else {
+            Err(exceptions::TypeError::py_err(
+                "Expected Union[str, List[str]]",
+            ))
+        }
+    }
+}
+
+#[pyclass(extends=PyPostProcessor, module = "tokenizers.processors", name=TemplateProcessing)]
+pub struct PyTemplateProcessing {}
+#[pymethods]
+impl PyTemplateProcessing {
+    #[new]
+    #[args(seq_a = "None", seq_b = "None", special_tokens = "None")]
+    fn new(
+        seq_a: Option<PyTemplate>,
+        seq_b: Option<PyTemplate>,
+        special_tokens: Option<Vec<PySpecialToken>>,
+    ) -> PyResult<(Self, PyPostProcessor)> {
+        let mut builder = tk::processors::template::TemplateProcessing::builder();
+
+        if let Some(seq) = seq_a {
+            builder.sequence_a(seq);
+        }
+        if let Some(seq) = seq_b {
+            builder.sequence_b(seq);
+        }
+        if let Some(sp) = special_tokens {
+            builder.special_tokens(sp);
+        }
+        let processor = builder.build().map_err(exceptions::ValueError::py_err)?;
+
+        Ok((
+            PyTemplateProcessing {},
+            PyPostProcessor::new(Arc::new(processor.into())),
+        ))
+    }
+}
+
 #[cfg(test)]
 mod test {
     use std::sync::Arc;
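To make the conversion code above concrete, here is a hedged Python-side sketch of the value shapes that `PyTemplate` and `PySpecialToken` accept, matching the `extract` branches in this hunk. The token strings and IDs are placeholders.

```
from tokenizers.processors import TemplateProcessing

# The three shapes PySpecialToken::extract accepts:
cls_as_token_id = ("[CLS]", 1)   # (String, u32) tuple
cls_as_id_token = (1, "[CLS]")   # (u32, String) tuple, same meaning, reversed order
sep_as_dict = {                  # PyDict branch
    "id": "[SEP]",               # identifier referenced in the template
    "ids": [0],                  # associated token IDs
    "tokens": ["[SEP]"],         # associated tokens, same length as "ids"
    # "type_ids" may be omitted; it then defaults to [None] * len(ids)
}

# PyTemplate accepts either a whitespace-delimited str or a List[str].
processor = TemplateProcessing(
    seq_a="[CLS] $0 [SEP]",
    seq_b=["$1", "[SEP]"],
    special_tokens=[cls_as_token_id, sep_as_dict],
)
```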
@@ -5,7 +5,13 @@ from ..utils import data_dir, roberta_files
 from tokenizers import Tokenizer
 from tokenizers.models import BPE
 from tokenizers.pre_tokenizers import ByteLevel as ByteLevelPreTokenizer
-from tokenizers.processors import PostProcessor, BertProcessing, RobertaProcessing, ByteLevel
+from tokenizers.processors import (
+    PostProcessor,
+    BertProcessing,
+    RobertaProcessing,
+    ByteLevel,
+    TemplateProcessing,
+)


 class TestBertProcessing:
@@ -73,3 +79,78 @@ class TestByteLevelProcessing:
         output = tokenizer.encode("My name is John")
         assert output.tokens == ["ĠMy", "Ġname", "Ġis", "ĠJohn"]
         assert output.offsets == [(0, 2), (3, 7), (8, 10), (11, 15)]
+
+
+class TestTemplateProcessing:
+    def get_bert(self):
+        return TemplateProcessing(
+            seq_a=["[CLS]", "$0", "[SEP]"],
+            seq_b=["$1", "[SEP]"],
+            special_tokens=[("[CLS]", 1), ("[SEP]", 0)],
+        )
+
+    def get_roberta(self):
+        return TemplateProcessing(
+            seq_a="<s> $0 </s>", seq_b="</s> $0 </s>", special_tokens=[("<s>", 0), ("</s>", 1)],
+        )
+
+    def get_t5_squad(self):
+        # >>> from transformers import AutoTokenizer
+        # >>> tok = AutoTokenizer.from_pretrained("t5-small")
+        # >>> tok.tokenize("question: ")
+        # ['▁question', ':']
+        # >>> tok.tokenize("context: ")
+        # ['▁context', ':']
+        # >>> tok.encode("context: ")
+        # [2625, 10]
+        # >>> tok.encode("question: ")
+        # [822, 10]
+
+        return TemplateProcessing(
+            seq_a=["Q", "$0"],
+            seq_b=["C", "$1"],
+            special_tokens=[
+                {
+                    "id": "Q",
+                    "ids": [2625, 10],
+                    "type_ids": [None, None],
+                    "tokens": ["_question", ":"],
+                },
+                {
+                    "id": "C",
+                    "ids": [822, 10],
+                    "type_ids": [None, None],
+                    "tokens": ["_context", ":"],
+                },
+            ],
+        )
+
+    def test_instantiate(self):
+        bert = self.get_bert()
+        assert bert is not None
+        assert isinstance(bert, PostProcessor)
+        assert isinstance(bert, TemplateProcessing)
+        assert isinstance(pickle.loads(pickle.dumps(bert)), TemplateProcessing)
+
+    def test_bert_parity(self):
+        tokenizer = Tokenizer(BPE())
+        tokenizer.add_special_tokens(["[SEP]", "[CLS]"])
+        tokenizer.add_tokens(["my", "name", "is", "john", "pair"])
+        tokenizer.post_processor = BertProcessing(("[SEP]", 0), ("[CLS]", 1))
+
+        original = tokenizer.encode("my name", "pair")
+
+        tokenizer.post_processor = self.get_bert()
+        template = tokenizer.encode("my name", "pair")
+        assert original.ids == template.ids
+
+    def test_roberta_parity(self):
+        tokenizer = Tokenizer(BPE())
+        tokenizer.add_special_tokens(["<s>", "</s>"])
+        tokenizer.add_tokens(["my", "name", "is", "john", "pair"])
+        tokenizer.post_processor = RobertaProcessing(("</s>", 1), ("<s>", 0))
+
+        original = tokenizer.encode("my name is john", "pair")
+        tokenizer.post_processor = self.get_roberta()
+        template = tokenizer.encode("my name is john", "pair")
+        assert original.ids == template.ids