Python - Improve typings for new encode/encode_batch
@@ -3,8 +3,16 @@ __version__ = "0.7.0"
 from typing import Tuple, Union, Tuple, List
 
 Offsets = Tuple[int, int]
-InputSequence = Union[str, List[str]]
-EncodeInput = Union[InputSequence, Tuple[InputSequence, InputSequence]]
+
+TextInputSequence = str
+PreTokenizedInputSequence = Union[List[str], Tuple[str]]
+TextEncodeInput = Union[TextInputSequence, Tuple[TextInputSequence, TextInputSequence]]
+PreTokenizedEncodeInput = Union[
+    PreTokenizedInputSequence, Tuple[PreTokenizedInputSequence, PreTokenizedInputSequence]
+]
+
+InputSequence = Union[TextInputSequence, PreTokenizedInputSequence]
+EncodeInput = Union[TextEncodeInput, PreTokenizedEncodeInput]
 
 from .tokenizers import Tokenizer, Encoding, AddedToken
 from .tokenizers import decoders
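
Note (not part of the diff): the split into text vs. pre-tokenized aliases is easiest to read with concrete values. A minimal sketch of the inputs each new alias is meant to describe, using hypothetical variable names:

    # Illustrative values only; the aliases come from the hunk above.
    text_seq = "Hello, y'all!"                         # TextInputSequence
    pretok_seq = ["Hello", ",", "y'all", "!"]          # PreTokenizedInputSequence
    text_pair = ("Hello, y'all!", "How are you?")      # TextEncodeInput (a pair)
    pretok_pair = (["Hello", "!"], ["How", "are", "you", "?"])  # PreTokenizedEncodeInput (a pair)
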
@@ -15,8 +15,16 @@ from .implementations import (
 from typing import Optional, Union, List, Tuple
 
 Offsets = Tuple[int, int]
-InputSequence = Union[str, List[str]]
-EncodeInput = Union[InputSequence, Tuple[InputSequence, InputSequence]]
+
+TextInputSequence = str
+PreTokenizedInputSequence = Union[List[str], Tuple[str]]
+TextEncodeInput = Union[TextInputSequence, Tuple[TextInputSequence, TextInputSequence]]
+PreTokenizedEncodeInput = Union[
+    PreTokenizedInputSequence, Tuple[PreTokenizedInputSequence, PreTokenizedInputSequence],
+]
+
+InputSequence = Union[TextInputSequence, PreTokenizedInputSequence]
+EncodeInput = Union[TextEncodeInput, PreTokenizedEncodeInput]
 
 class Encoding:
     """ An Encoding as returned by the Tokenizer """
@@ -192,7 +200,7 @@ class AddedToken:
     """
 
     def __new__(
-        cls, content: str, single_word: bool = False, lstrip: bool = False, rstrip: bool = False
+        cls, content: str, single_word: bool = False, lstrip: bool = False, rstrip: bool = False,
    ) -> AddedToken:
        """ Instantiate a new AddedToken
 
@@ -371,18 +379,27 @@ class Tokenizer:
             The normalized string
         """
         pass
-    def encode(self, input: EncodeInput, add_special_tokens: bool = True) -> Encoding:
-        """ Encode the given input. This method accept both string sequences and already
-        pre-tokenized sequences.
+    def encode(
+        self,
+        sequence: InputSequence,
+        pair: Optional[InputSequence],
+        is_pretokenized: bool = False,
+        add_special_tokens: bool = True,
+    ) -> Encoding:
+        """ Encode the given sequence and pair. This method can process raw text sequences as well
+        as already pre-tokenized sequences.
 
         Args:
-            input: EncodeInput:
-                This content to encode. This can be either:
-                    - A single sequence: InputSequence
-                    - A pair of sequences: Tuple[InputSequence, InputSequence]
-                And a InputSequence can be either:
-                    - A string: str
-                    - A pre-tokenized string: List[str]
+            sequence: InputSequence:
+                The sequence we want to encode. This sequence can be either raw text or
+                pre-tokenized, according to the `is_pretokenized` argument:
+
+                - If `is_pretokenized=False`: `InputSequence` is expected to be `str`
+                - If `is_pretokenized=True`: `InputSequence` is expected to be
+                    `Union[List[str], Tuple[str]]`
+
+            is_pretokenized: bool:
+                Whether the input is already pre-tokenized
 
             add_special_tokens: bool:
                 Whether to add the special tokens while encoding.
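
Usage sketch (not part of the commit): with the new signature, pre-tokenized input goes through the same `encode` method, selected by the `is_pretokenized` flag. Assuming `tokenizer` is any configured `Tokenizer` instance:

    # Raw text, single sequence
    encoding = tokenizer.encode("Hello, y'all!")
    # Raw text pair, e.g. for sequence-pair tasks
    encoding = tokenizer.encode("Hello, y'all!", "How are you?")
    # Pre-tokenized input: same method, a flag instead of a separate entry point
    encoding = tokenizer.encode(["Hello", ",", "y'all", "!"], is_pretokenized=True)
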
@@ -392,19 +409,29 @@ class Tokenizer:
         """
         pass
     def encode_batch(
-        self, inputs: List[EncodeInput], add_special_tokens: bool = True
+        self,
+        inputs: List[EncodeInput],
+        is_pretokenized: bool = False,
+        add_special_tokens: bool = True,
     ) -> List[Encoding]:
-        """ Encode the given inputs. This method accept both string sequences and already
+        """ Encode the given inputs. This method accept both raw text sequences as well as already
         pre-tokenized sequences.
 
         Args:
             inputs: List[EncodeInput]:
-                A list of inputs to encode. Each input can be either:
-                    - A single sequence: InputSequence
-                    - A pair of sequences: Tuple[InputSequence, InputSequence]
-                And a InputSequence can be either:
-                    - A string: str
-                    - A pre-tokenized string: List[str]
+                A list of single sequences or pair sequences to encode. Each `EncodeInput` is
+                expected to be of the following form:
+                `Union[InputSequence, Tuple[InputSequence, InputSequence]]`
+
+                Each `InputSequence` can either be raw text or pre-tokenized,
+                according to the `is_pretokenized` argument:
+
+                - If `is_pretokenized=False`: `InputSequence` is expected to be `str`
+                - If `is_pretokenized=True`: `InputSequence` is expected to be
+                    `Union[List[str], Tuple[str]]`
+
+            is_pretokenized: bool:
+                Whether the input is already pre-tokenized.
 
             add_special_tokens: bool:
                 Whether to add the special tokens while encoding.
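
Similarly for batches (again a sketch, not part of the commit): each batch item is a single sequence or a pair, and `is_pretokenized` applies to the whole batch:

    # A batch mixing a single sequence and a pair, all raw text
    encodings = tokenizer.encode_batch(["Hello, y'all!", ("A premise", "A hypothesis")])
    # A pre-tokenized batch: every item must then be a list/tuple of str
    encodings = tokenizer.encode_batch([["Hello", "!"], ["How", "are", "you", "?"]], is_pretokenized=True)
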
@@ -493,7 +520,7 @@ class Tokenizer:
         """
         pass
     def post_process(
-        self, encoding: Encoding, pair: Optional[Encoding] = None, add_special_tokens: bool = True
+        self, encoding: Encoding, pair: Optional[Encoding] = None, add_special_tokens: bool = True,
     ) -> Encoding:
         """ Apply all the post-processing steps to the given encodings.
 
@@ -151,18 +151,27 @@ class BaseTokenizer:
         """
         return self._tokenizer.normalize(sequence)
 
-    def encode(self, input: EncodeInput, add_special_tokens: bool = True) -> Encoding:
-        """ Encode the given input. This method accept both string sequences and already
-        pre-tokenized sequences.
+    def encode(
+        self,
+        sequence: InputSequence,
+        pair: Optional[InputSequence] = None,
+        is_pretokenized: bool = False,
+        add_special_tokens: bool = True,
+    ) -> Encoding:
+        """ Encode the given sequence and pair. This method can process raw text sequences as well
+        as already pre-tokenized sequences.
 
         Args:
-            input: EncodeInput:
-                This content to encode. This can be either:
-                    - A single sequence: InputSequence
-                    - A pair of sequences: Tuple[InputSequence, InputSequence]
-                And a InputSequence can be either:
-                    - A string: str
-                    - A pre-tokenized string: List[str]
+            sequence: InputSequence:
+                The sequence we want to encode. This sequence can be either raw text or
+                pre-tokenized, according to the `is_pretokenized` argument:
+
+                - If `is_pretokenized=False`: `InputSequence` is expected to be `str`
+                - If `is_pretokenized=True`: `InputSequence` is expected to be
+                    `Union[List[str], Tuple[str]]`
+
+            is_pretokenized: bool:
+                Whether the input is already pre-tokenized.
 
             add_special_tokens: bool:
                 Whether to add the special tokens while encoding.
@@ -170,25 +179,35 @@ class BaseTokenizer:
         Returns:
             An Encoding
         """
-        if input is None:
-            raise ValueError("None input is not valid. Should be a string.")
+        if sequence is None:
+            raise ValueError("encode: `sequence` can't be `None`")
 
-        return self._tokenizer.encode(input, add_special_tokens)
+        return self._tokenizer.encode(sequence, pair, is_pretokenized, add_special_tokens)
 
     def encode_batch(
-        self, inputs: List[EncodeInput], add_special_tokens: bool = True
+        self,
+        inputs: List[EncodeInput],
+        is_pretokenized: bool = False,
+        add_special_tokens: bool = True,
     ) -> List[Encoding]:
-        """ Encode the given inputs. This method accept both string sequences and already
+        """ Encode the given inputs. This method accept both raw text sequences as well as already
         pre-tokenized sequences.
 
         Args:
             inputs: List[EncodeInput]:
-                A list of inputs to encode. Each input can be either:
-                    - A single sequence: InputSequence
-                    - A pair of sequences: Tuple[InputSequence, InputSequence]
-                And a InputSequence can be either:
-                    - A string: str
-                    - A pre-tokenized string: List[str]
+                A list of single sequences or pair sequences to encode. Each `EncodeInput` is
+                expected to be of the following form:
+                `Union[InputSequence, Tuple[InputSequence, InputSequence]]`
+
+                Each `InputSequence` can either be raw text or pre-tokenized,
+                according to the `is_pretokenized` argument:
+
+                - If `is_pretokenized=False`: `InputSequence` is expected to be `str`
+                - If `is_pretokenized=True`: `InputSequence` is expected to be
+                    `Union[List[str], Tuple[str]]`
+
+            is_pretokenized: bool:
+                Whether the input is already pre-tokenized.
 
             add_special_tokens: bool:
                 Whether to add the special tokens while encoding.
@@ -198,11 +217,9 @@ class BaseTokenizer:
         """
 
         if inputs is None:
-            raise ValueError(
-                "None input is not valid. Should be a list of strings or a list of tuple of strings."
-            )
+            raise ValueError("encode_batch: `inputs` can't be `None`")
 
-        return self._tokenizer.encode_batch(inputs, add_special_tokens)
+        return self._tokenizer.encode_batch(inputs, is_pretokenized, add_special_tokens)
 
     def decode(self, ids: List[int], skip_special_tokens: Optional[bool] = True) -> str:
         """ Decode the given list of ids to a string sequence
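
A side effect of the new guards (sketch, assuming `tokenizer` is a `BaseTokenizer` implementation such as `BertWordPieceTokenizer`): `None` inputs now fail fast in Python with a clearer message instead of reaching the Rust layer:

    try:
        tokenizer.encode(None)
    except ValueError as err:
        print(err)  # encode: `sequence` can't be `None`
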