mirror of
https://github.com/mii443/tokenizers.git
synced 2025-12-05 12:18:20 +00:00
Python - Bindings for TemplateProcessing
This commit is contained in:
@@ -4,3 +4,4 @@ PostProcessor = processors.PostProcessor
|
||||
BertProcessing = processors.BertProcessing
|
||||
RobertaProcessing = processors.RobertaProcessing
|
||||
ByteLevel = processors.ByteLevel
|
||||
TemplateProcessing = processors.TemplateProcessing
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
from typing import Tuple
|
||||
from typing import Tuple, Union, List
|
||||
|
||||
class PostProcessor:
|
||||
""" Base class for all post-processors
|
||||
@@ -89,7 +89,7 @@ class ByteLevel(PostProcessor):
|
||||
want the offsets to include these whitespaces, then this PostProcessor must be used.
|
||||
"""
|
||||
|
||||
def __init(self, trim_offsets: bool = True) -> None:
|
||||
def __init__(self, trim_offsets: bool = True) -> None:
    """ Instantiate a new ByteLevel

    Args:
        trim_offsets: bool:
            Whether to trim the whitespaces from the produced offsets.
    """
    # NOTE(review): stub signature only — the implementation appears to come
    # from the native `processors` module (see the module-level rebinding
    # `ByteLevel = processors.ByteLevel`); confirm against the binding.
    pass
|
||||
|
||||
# Type aliases used by TemplateProcessing.
# A template is either a single whitespace-delimited string, or an explicit
# list of token strings.
Template = Union[str, List[str]]
# Special tokens: (id, token) / (token, id) pairs in either order, or a dict
# with "id", "ids", "tokens" (and optionally "type_ids") keys — see the
# TemplateProcessing.__init__ docstring for the expected dict layout.
Tokens = List[Union[Tuple[int, str], Tuple[str, int], dict]]
|
||||
|
||||
class TemplateProcessing(PostProcessor):
    """ TemplateProcessing

    Provides a way to specify templates in order to add the special tokens to each
    input sequence as relevant.

    Let's take `BERT` tokenizer as an example. It uses two special tokens, used to
    delimitate each sequence. `[CLS]` is always used at the beginning of the first
    sequence, and `[SEP]` is added at the end of both the first, and the pair
    sequences. The final result looks like this:
        - Single sequence: `[CLS] Hello there [SEP]`
        - Pair sequences: `[CLS] My name is Anthony [SEP] What is my name? [SEP]`

    You can achieve such behavior using a TemplateProcessing:
    ```
    TemplateProcessing(
        seq_a="[CLS] $0 [SEP]",
        seq_b="$1 [SEP]",
        special_tokens=[("[CLS]", 1), ("[SEP]", 0)],
    )
    ```

    In this example, $0 and $1 both represent the input sequences. The number in this
    identifier is actually the default type_id that will be used for each sequence. So,
    in this case, the first sequence will use 0, while the pair sequence will use 1.

    Note that we are saying the "default" type_id because each SpecialToken can define
    its own type_id which would override the provided default.
    """

    def __init__(self, seq_a: Template, seq_b: Template, special_tokens: Tokens) -> None:
        """ Instantiate a new TemplateProcessing

        Args:
            seq_a: Template
                The template for the first sequence.

            seq_b: Template:
                The template for the pair sequence.

            special_tokens: Tokens:
                The list of special tokens used in each sequences

            Template: Union[str, List[str]]:
                - If a `str` is provided, the whitespace is used as delimiter between tokens
                - If a `List[str]` is provided, a list of tokens

            Tokens: List[Union[Tuple[int, str], Tuple[str, int], dict]]:
                - A Tuple with both a token and its associated ID, in any order
                - A dict with the following keys:
                    - "id": str => The special token id, as specified in the Template
                    - "ids": List[int] => The associated IDs
                    - "tokens": List[str] => The associated tokens
                    - "type_ids": Optional[List[Optional[int]]] => If specified, a list of optional
                      type_ids. If the `type_id` is not specified, the one from the input sequence
                      will be used.
                  The given dict expects the provided `ids`, `tokens` and `type_ids` lists to have
                  the same length.
        """
        # NOTE(review): stub signature only — the implementation appears to come
        # from the native `processors` module (see the module-level rebinding
        # `TemplateProcessing = processors.TemplateProcessing`); confirm against
        # the binding.
        pass
|
||||
|
||||
Reference in New Issue
Block a user