Mirror of https://github.com/mii443/tokenizers.git
Python - Update typings for new encode
@@ -1,8 +1,10 @@
 __version__ = "0.7.0"
 
-from typing import Tuple
+from typing import Tuple, Union, List
 
 Offsets = Tuple[int, int]
+InputSequence = Union[str, List[str]]
+EncodeInput = Union[InputSequence, Tuple[InputSequence, InputSequence]]
 
 from .tokenizers import Tokenizer, Encoding, AddedToken
 from .tokenizers import decoders
@@ -15,6 +15,8 @@ from .implementations import (
 from typing import Optional, Union, List, Tuple
 
 Offsets = Tuple[int, int]
+InputSequence = Union[str, List[str]]
+EncodeInput = Union[InputSequence, Tuple[InputSequence, InputSequence]]
 
 class Encoding:
     """ An Encoding as returned by the Tokenizer """
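As an aside, a minimal sketch of the values these new aliases admit; the example data is illustrative and not part of the commit:

from typing import List, Tuple, Union

# The same aliases as introduced above.
InputSequence = Union[str, List[str]]
EncodeInput = Union[InputSequence, Tuple[InputSequence, InputSequence]]

# A raw string is a valid InputSequence...
single: EncodeInput = "Hello, world!"
# ...and so is an already pre-tokenized list of words.
pre_tokenized: EncodeInput = ["Hello", ",", "world", "!"]
# A pair combines any two InputSequences, mixed forms included.
pair: EncodeInput = ("Hello, world!", ["How", "are", "you", "?"])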
@@ -369,37 +371,43 @@ class Tokenizer:
             The normalized string
         """
         pass
-    def encode(
-        self, sequence: str, pair: Optional[str] = None, add_special_tokens: bool = True
-    ) -> Encoding:
-        """ Encode the given sequence
+    def encode(self, input: EncodeInput, add_special_tokens: bool = True) -> Encoding:
+        """ Encode the given input. This method accepts both string sequences and already
+        pre-tokenized sequences.
 
         Args:
-            sequence: str:
-                The sequence to encode
-
-            pair: (`optional`) Optional[str]:
-                The optional pair sequence
+            input: EncodeInput:
+                The content to encode. This can be either:
+                    - A single sequence: InputSequence
+                    - A pair of sequences: Tuple[InputSequence, InputSequence]
+                An InputSequence can be either:
+                    - A string: str
+                    - A pre-tokenized string: List[str]
 
             add_special_tokens: bool:
-                Whether to add the special tokens while encoding
+                Whether to add the special tokens while encoding.
 
         Returns:
             An Encoding
         """
         pass
     def encode_batch(
-        self, sequences: List[Union[str, Tuple[str, str]]], add_special_tokens: bool = True
+        self, inputs: List[EncodeInput], add_special_tokens: bool = True
     ) -> List[Encoding]:
-        """ Encode the given sequences or pair of sequences
+        """ Encode the given inputs. This method accepts both string sequences and already
+        pre-tokenized sequences.
 
         Args:
-            sequences: List[Union[str, Tuple[str, str]]]:
-                A list of sequences or pair of sequences. The list can contain both
-                at the same time.
+            inputs: List[EncodeInput]:
+                A list of inputs to encode. Each input can be either:
+                    - A single sequence: InputSequence
+                    - A pair of sequences: Tuple[InputSequence, InputSequence]
+                An InputSequence can be either:
+                    - A string: str
+                    - A pre-tokenized string: List[str]
 
             add_special_tokens: bool:
-                Whether to add the special tokens while encoding
+                Whether to add the special tokens while encoding.
 
         Returns:
             A list of Encoding
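To make the signature change concrete, here is a sketch of the call sites it enables. It assumes a `Tokenizer` instance built elsewhere (construction is untouched by this commit) and targets the API as defined in this diff, not necessarily any later release:

from tokenizers import Tokenizer, Encoding

def encode_examples(tokenizer: Tokenizer) -> Encoding:
    # Previously: tokenizer.encode("Hello, world!", pair="How are you?")
    # Now a single `input` argument covers every case.

    # 1. A plain string, exactly as before.
    enc: Encoding = tokenizer.encode("Hello, world!")

    # 2. A pre-tokenized sequence, passed as a list of words.
    enc = tokenizer.encode(["Hello", ",", "world", "!"])

    # 3. A pair of sequences, given as one tuple instead of a `pair`
    #    argument; the two members may mix raw and pre-tokenized forms.
    enc = tokenizer.encode(("Hello, world!", ["How", "are", "you", "?"]))
    return enc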
@ -1,4 +1,4 @@
|
|||||||
from tokenizers import Tokenizer, Encoding, AddedToken
|
from tokenizers import Tokenizer, Encoding, AddedToken, InputSequence, EncodeInput
|
||||||
from tokenizers.models import TokenizedSequence, TokenizedSequenceWithOffsets
|
from tokenizers.models import TokenizedSequence, TokenizedSequenceWithOffsets
|
||||||
|
|
||||||
from typing import List, Union, Tuple, Optional, Dict
|
from typing import List, Union, Tuple, Optional, Dict
|
||||||
@@ -151,72 +151,18 @@ class BaseTokenizer:
         """
         return self._tokenizer.normalize(sequence)
 
-    def encode_tokenized(
-        self, sequence: Union[TokenizedSequence, TokenizedSequenceWithOffsets], type_id: int = 0
-    ) -> Encoding:
-        """ Encode the given sequence. Let us skip the Normalizer and PreTokenizer by providing
-        already tokenized substrings.
-
-        A sequence can either be:
-            - `TokenizedSequence`: (`List[str]`)
-            - `TokenizedSequenceWithOffsets`: (`List[Tuple[str, Offsets]]`) where Offsets is
-              a Tuple[int, int].
-
-        If the Offsets are not provided, they will be automatically generated, making the hypothesis
-        that all the tokens in the `TokenizedSequence` are contiguous in the original string.
-
-        Args:
-            sequence: Union[TokenizedSequence, TokenizedSequenceWithOffsets]
-                Either a TokenizedSequence or a TokenizedSequenceWithOffsets
-
-            type_id: int:
-                The type id of the given sequence
-
-        Returns:
-            An Encoding
-        """
-        return self._tokenizer.model.encode(sequence, type_id)
-
-    def encode_tokenized_batch(
-        self,
-        sequences: Union[List[TokenizedSequence], List[TokenizedSequenceWithOffsets]],
-        type_id: int = 0,
-    ) -> List[Encoding]:
-        """ Encode the given batch of sequences. Let us skip the Normalizer and PreTokenizer by
-        providing already tokenized substrings.
-
-        A sequence can either be:
-            - `TokenizedSequence`: (`List[str]`)
-            - `TokenizedSequenceWithOffsets`: (`List[Tuple[str, Offsets]]`) where Offsets is
-              a Tuple[int, int].
-
-        If the Offsets are not provided, they will be automatically generated, making the hypothesis
-        that all the tokens in the `TokenizedSequence` are contiguous in the original string.
-
-        Args:
-            sequences: Union[List[TokenizedSequence], List[TokenizedSequenceWithOffsets]]
-                A list of sequences. Each sequence is either a TokenizedSequence or a
-                TokenizedSequenceWithOffsets
-
-            type_id: int:
-                The type id of the given sequence
-
-        Returns:
-            A list of Encoding
-        """
-        return self._tokenizer.model.encode_batch(sequences, type_id)
-
-    def encode(
-        self, sequence: str, pair: Optional[str] = None, add_special_tokens: bool = True
-    ) -> Encoding:
-        """ Encode the given sequence
-
-        Args:
-            sequence: str:
-                The sequence to encode
-
-            pair: (`optional`) Optional[str]:
-                The optional pair sequence
+    def encode(self, input: EncodeInput, add_special_tokens: bool = True) -> Encoding:
+        """ Encode the given input. This method accepts both string sequences and already
+        pre-tokenized sequences.
+
+        Args:
+            input: EncodeInput:
+                The content to encode. This can be either:
+                    - A single sequence: InputSequence
+                    - A pair of sequences: Tuple[InputSequence, InputSequence]
+                An InputSequence can be either:
+                    - A string: str
+                    - A pre-tokenized string: List[str]
 
             add_special_tokens: bool:
                 Whether to add the special tokens while encoding.
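Since `encode_tokenized` is deleted here, a short migration sketch may help. Note that the new `encode` has no `type_id` parameter, so that part of the old call has no direct counterpart in this diff; the function name below is hypothetical:

from typing import List
from tokenizers import Tokenizer, Encoding

def migrate_encode_tokenized(tokenizer: Tokenizer, words: List[str]) -> Encoding:
    # Before this commit (method removed above):
    #     return tokenizer.encode_tokenized(words, type_id=0)
    # After: pre-tokenized input is just another EncodeInput.
    return tokenizer.encode(words)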
@@ -224,20 +170,25 @@ class BaseTokenizer:
         Returns:
             An Encoding
         """
-        if sequence is None:
+        if input is None:
             raise ValueError("None input is not valid. Should be a string.")
 
-        return self._tokenizer.encode(sequence, pair, add_special_tokens)
+        return self._tokenizer.encode(input, add_special_tokens)
 
     def encode_batch(
-        self, sequences: List[Union[str, Tuple[str, str]]], add_special_tokens: bool = True
+        self, inputs: List[EncodeInput], add_special_tokens: bool = True
     ) -> List[Encoding]:
-        """ Encode the given sequences or pair of sequences
+        """ Encode the given inputs. This method accepts both string sequences and already
+        pre-tokenized sequences.
 
         Args:
-            sequences: List[Union[str, Tuple[str, str]]]:
-                A list of sequences or pair of sequences. The list can contain both
-                at the same time.
+            inputs: List[EncodeInput]:
+                A list of inputs to encode. Each input can be either:
+                    - A single sequence: InputSequence
+                    - A pair of sequences: Tuple[InputSequence, InputSequence]
+                An InputSequence can be either:
+                    - A string: str
+                    - A pre-tokenized string: List[str]
 
             add_special_tokens: bool:
                 Whether to add the special tokens while encoding.
@@ -246,12 +197,12 @@ class BaseTokenizer:
             A list of Encoding
         """
 
-        if sequences is None:
+        if inputs is None:
             raise ValueError(
                 "None input is not valid. Should be a list of strings or a list of tuple of strings."
             )
 
-        return self._tokenizer.encode_batch(sequences, add_special_tokens)
+        return self._tokenizer.encode_batch(inputs, add_special_tokens)
 
     def decode(self, ids: List[int], skip_special_tokens: Optional[bool] = True) -> str:
         """ Decode the given list of ids to a string sequence
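Finally, a sketch of `encode_batch` under the new typing, again assuming an existing `Tokenizer` and the API exactly as defined in this diff:

from typing import List
from tokenizers import Tokenizer, Encoding

def batch_examples(tokenizer: Tokenizer) -> List[Encoding]:
    # A single batch may now mix raw, pre-tokenized, and paired inputs.
    return tokenizer.encode_batch(
        [
            "Hello, world!",                    # single raw sequence
            ["Hello", ",", "world", "!"],       # pre-tokenized sequence
            ("Hello, world!", "How are you?"),  # pair of sequences
        ]
    )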