Doc - Updated API Reference for encode/encode_batch

2025-12-13 05:48:36 +00:00 · 2020-10-06 17:02:21 -04:00
parent f2f3ec51bd
commit 79f02bb7f0
4 changed files with 152 additions and 38 deletions
--- a/bindings/python/py_src/tokenizers/init.py
+++ b/bindings/python/py_src/tokenizers/init.py
@@ -6,15 +6,56 @@ from enum import Enum
 Offsets = Tuple[int, int]
 TextInputSequence = str
 """A :obj:`str` that represents an input sequence """
 PreTokenizedInputSequence = Union[List[str], Tuple[str]]
-TextEncodeInput = Union[TextInputSequence, Tuple[TextInputSequence, TextInputSequence]]
+"""A pre-tokenized input sequence. Can be one of:
    - A :obj:`List` of :obj:`str`
    - A :obj:`Tuple` of :obj:`str`
 """
 TextEncodeInput = Union[
    TextInputSequence,
    Tuple[TextInputSequence, TextInputSequence],
    List[TextInputSequence],
 ]
 """Represents a textual input for encoding. Can be either:
    - A single sequence: :data:`~tokenizers.TextInputSequence`
    - A pair of sequences:
      - A :obj:`Tuple` of :data:`~tokenizers.TextInputSequence`
      - Or a :obj:`List` of :data:`~tokenizers.TextInputSequence` of size 2
 """
 PreTokenizedEncodeInput = Union[
    PreTokenizedInputSequence,
    Tuple[PreTokenizedInputSequence, PreTokenizedInputSequence],
    List[PreTokenizedInputSequence],
 ]
 """Represents a pre-tokenized input for encoding. Can be either:
    - A single sequence: :data:`~tokenizers.PreTokenizedInputSequence`
    - A pair of sequences:
      - A :obj:`Tuple` of :data:`~tokenizers.PreTokenizedInputSequence`
      - Or a :obj:`List` of :data:`~tokenizers.PreTokenizedInputSequence` of size 2
 """
 InputSequence = Union[TextInputSequence, PreTokenizedInputSequence]
 """Represents all the possible types of input sequences for encoding. Can be:
    - When ``is_pretokenized=False``: :data:`~tokenizers.TextInputSequence`
    - When ``is_pretokenized=True``: :data:`~tokenizers.PreTokenizedInputSequence`
 """
 EncodeInput = Union[TextEncodeInput, PreTokenizedEncodeInput]
 """Represents all the possible types of input for encoding. Can be:
    - When ``is_pretokenized=False``: :data:`~tokenizers.TextEncodeInput`
    - When ``is_pretokenized=True``: :data:`~tokenizers.PreTokenizedEncodeInput`
 """
 class OffsetReferential(Enum):
--- a/bindings/python/py_src/tokenizers/init.pyi
+++ b/bindings/python/py_src/tokenizers/init.pyi
@@ -694,26 +694,40 @@ class Tokenizer:
        is_pretokenized: bool = False,
        add_special_tokens: bool = True,
    ) -> Encoding:
-        """Encode the given sequence and pair. This method can process raw text sequences as well
+        """
-        as already pre-tokenized sequences.
+        Encode the given sequence and pair. This method can process raw text sequences
        as well as already pre-tokenized sequences.
        Example:
            Here are some examples of the inputs that are accepted::
                encode("A single sequence")
                encode("A sequence", "And its pair")
                encode([ "A", "pre", "tokenized", "sequence" ], is_pretokenized=True)
                encode(
                    [ "A", "pre", "tokenized", "sequence" ], [ "And", "its", "pair" ],
                    is_pretokenized=True
                )
        Args:
-            sequence: InputSequence:
+            sequence (:obj:`~tokenizers.InputSequence`):
-                The sequence we want to encode. This sequence can be either raw text or
+                The main input sequence we want to encode. This sequence can be either raw
-                pre-tokenized, according to the `is_pretokenized` argument:
+                text or pre-tokenized, according to the ``is_pretokenized`` argument:
-                - If `is_pretokenized=False`: `InputSequence` is expected to be `str`
+                - If ``is_pretokenized=False``: :class:`~tokenizers.TextInputSequence`
-                - If `is_pretokenized=True`: `InputSequence` is expected to be
+                - If ``is_pretokenized=True``: :class:`~tokenizers.PreTokenizedInputSequence`
                    `Union[List[str], Tuple[str]]`
-            is_pretokenized: bool:
+            pair (:obj:`~tokenizers.InputSequence`, `optional`):
                An optional input sequence. The expected format is the same as for ``sequence``.
            is_pretokenized (:obj:`bool`, defaults to :obj:`False`):
                Whether the input is already pre-tokenized
-            add_special_tokens: bool:
+            add_special_tokens (:obj:`bool`, defaults to :obj:`True`):
-                Whether to add the special tokens while encoding.
+                Whether to add the special tokens
        Returns:
-            An Encoding
+            :class:`~tokenizers.Encoding`: The encoded result
        """
        pass
    def encode_batch(
@@ -722,30 +736,38 @@ class Tokenizer:
        is_pretokenized: bool = False,
        add_special_tokens: bool = True,
    ) -> List[Encoding]:
-        """Encode the given inputs. This method accept both raw text sequences as well as already
+        """
-        pre-tokenized sequences.
+        Encode the given batch of inputs. This method accepts both raw text sequences
        as well as already pre-tokenized sequences.
        Example:
            Here are some examples of the inputs that are accepted::
                encode_batch([
                    "A single sequence",
                    ("A tuple with a sequence", "And its pair"),
                    [ "A", "pre", "tokenized", "sequence" ],
                    ([ "A", "pre", "tokenized", "sequence" ], "And its pair")
                ])
        Args:
-            inputs: List[EncodeInput]:
+            input (A :obj:`List`/:obj:`Tuple` of :obj:`~tokenizers.EncodeInput`):
-                A list of single sequences or pair sequences to encode. Each `EncodeInput` is
+                A list of single sequences or pair sequences to encode. Each sequence
-                expected to be of the following form:
+                can be either raw text or pre-tokenized, according to the ``is_pretokenized``
-                    `Union[InputSequence, Tuple[InputSequence, InputSequence]]`
+                argument:
-                Each `InputSequence` can either be raw text or pre-tokenized,
+                - If ``is_pretokenized=False``: :class:`~tokenizers.TextEncodeInput`
-                according to the `is_pretokenized` argument:
+                - If ``is_pretokenized=True``: :class:`~tokenizers.PreTokenizedEncodeInput`
-                - If `is_pretokenized=False`: `InputSequence` is expected to be `str`
+            is_pretokenized (:obj:`bool`, defaults to :obj:`False`):
-                - If `is_pretokenized=True`: `InputSequence` is expected to be
+                Whether the input is already pre-tokenized
                    `Union[List[str], Tuple[str]]`
-            is_pretokenized: bool:
+            add_special_tokens (:obj:`bool`, defaults to :obj:`True`):
-                Whether the input is already pre-tokenized.
+                Whether to add the special tokens
            add_special_tokens: bool:
                Whether to add the special tokens while encoding.
        Returns:
-            A list of Encoding
+            A :obj:`List` of :class:`~tokenizers.Encoding`: The encoded batch
        """
        pass
    def decode(self, ids: List[int], skip_special_tokens: Optional[bool] = True) -> str:
--- a/bindings/python/src/tokenizer.rs
+++ b/bindings/python/src/tokenizer.rs
@@ -617,10 +617,21 @@ impl PyTokenizer {
    ///         )
    ///
    /// Args:
-    ///     sequence (:obj:`~tokenizers.InputSequence`): The main input sequence
+    ///     sequence (:obj:`~tokenizers.InputSequence`):
-    ///     pair: (:obj:`~tokenizers.InputSequence`): An optional input sequence
+    ///         The main input sequence we want to encode. This sequence can be either raw
-    ///     is_pretokenized (:obj:`bool`): Whether the input is already pre-tokenized
+    ///         text or pre-tokenized, according to the ``is_pretokenized`` argument:
-    ///     add_special_tokens (:obj:`bool`): Whether to add the special tokens
+    ///
    ///         - If ``is_pretokenized=False``: :class:`~tokenizers.TextInputSequence`
    ///         - If ``is_pretokenized=True``: :class:`~tokenizers.PreTokenizedInputSequence`
    ///
    ///     pair (:obj:`~tokenizers.InputSequence`, `optional`):
    ///         An optional input sequence. The expected format is the same as for ``sequence``.
    ///
    ///     is_pretokenized (:obj:`bool`, defaults to :obj:`False`):
    ///         Whether the input is already pre-tokenized
    ///
    ///     add_special_tokens (:obj:`bool`, defaults to :obj:`True`):
    ///         Whether to add the special tokens
    ///
    /// Returns:
    ///     :class:`~tokenizers.Encoding`: The encoded result
@@ -673,12 +684,22 @@ impl PyTokenizer {
    ///         ])
    ///
    /// Args:
-    ///     input (:obj:`~tokenizers.EncodeInput`): The batch inputs
+    ///     input (A :obj:`List`/:obj:`Tuple` of :obj:`~tokenizers.EncodeInput`):
-    ///     is_pretokenized (:obj:`bool`): Whether the input is already pre-tokenized
+    ///         A list of single sequences or pair sequences to encode. Each sequence
-    ///     add_special_tokens (:obj:`bool`): Whether to add the special tokens
+    ///         can be either raw text or pre-tokenized, according to the ``is_pretokenized``
    ///         argument:
    ///
    ///         - If ``is_pretokenized=False``: :class:`~tokenizers.TextEncodeInput`
    ///         - If ``is_pretokenized=True``: :class:`~tokenizers.PreTokenizedEncodeInput`
    ///
    ///     is_pretokenized (:obj:`bool`, defaults to :obj:`False`):
    ///         Whether the input is already pre-tokenized
    ///
    ///     add_special_tokens (:obj:`bool`, defaults to :obj:`True`):
    ///         Whether to add the special tokens
    ///
    /// Returns:
-    ///     :obj:`List[:class:`~tokenizers.Encoding`]`: The encoded batch
+    ///     A :obj:`List` of :class:`~tokenizers.Encoding`: The encoded batch
    ///
    #[args(is_pretokenized = "false", add_special_tokens = "true")]
    #[text_signature = "($self, input, is_pretokenized=False, add_special_tokens=True, /)"]
--- a/docs/source/api/python_reference.inc
+++ b/docs/source/api/python_reference.inc
@@ -1,2 +1,32 @@
 Input sequences
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 These types represent all the different kinds of sequence that can be used as input of a Tokenizer.
 Globally, any sequence can be either a string or a list of strings, according to the operating
 mode of the tokenizer: ``raw text`` vs ``pre-tokenized``.
 .. autodata:: tokenizers.TextInputSequence
 .. autodata:: tokenizers.PreTokenizedInputSequence
 .. autodata:: tokenizers.InputSequence
 Encode inputs
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 These types represent all the different kinds of input that a :class:`~tokenizers.Tokenizer` accepts
 when using :meth:`~tokenizers.Tokenizer.encode_batch`.
 .. autodata:: tokenizers.TextEncodeInput
 .. autodata:: tokenizers.PreTokenizedEncodeInput
 .. autodata:: tokenizers.EncodeInput
 Tokenizer
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 .. autoclass:: tokenizers.Tokenizer
    :members: