Doc - Updated API Reference for encode/encode_batch

2025-08-22 16:25:30 +00:00 · 2020-10-06 17:02:21 -04:00
parent f2f3ec51bd
commit 79f02bb7f0
4 changed files with 152 additions and 38 deletions
--- a/bindings/python/py_src/tokenizers/init.py
+++ b/bindings/python/py_src/tokenizers/init.py
@ -6,15 +6,56 @@ from enum import Enum
 Offsets = Tuple[int, int]

 TextInputSequence = str
+"""A :obj:`str` that represents an input sequence """
+
 PreTokenizedInputSequence = Union[List[str], Tuple[str]]
-TextEncodeInput = Union[TextInputSequence, Tuple[TextInputSequence, TextInputSequence]]
+"""A pre-tokenized input sequence. Can be one of:
+
+    - A :obj:`List` of :obj:`str`
+    - A :obj:`Tuple` of :obj:`str`
+"""
+
+TextEncodeInput = Union[
+    TextInputSequence,
+    Tuple[TextInputSequence, TextInputSequence],
+    List[TextInputSequence],
+]
+"""Represents a textual input for encoding. Can be either:
+
+    - A single sequence: :data:`~tokenizers.TextInputSequence`
+    - A pair of sequences:
+
+      - A :obj:`Tuple` of :data:`~tokenizers.TextInputSequence`
+      - Or a :obj:`List` of :data:`~tokenizers.TextInputSequence` of size 2
+"""
+
 PreTokenizedEncodeInput = Union[
    PreTokenizedInputSequence,
    Tuple[PreTokenizedInputSequence, PreTokenizedInputSequence],
+    List[PreTokenizedInputSequence],
 ]
+"""Represents a pre-tokenized input for encoding. Can be either:
+
+    - A single sequence: :data:`~tokenizers.PreTokenizedInputSequence`
+    - A pair of sequences:
+
+      - A :obj:`Tuple` of :data:`~tokenizers.PreTokenizedInputSequence`
+      - Or a :obj:`List` of :data:`~tokenizers.PreTokenizedInputSequence` of size 2
+"""

 InputSequence = Union[TextInputSequence, PreTokenizedInputSequence]
+"""Represents all the possible types of input sequences for encoding. Can be:
+
+    - When ``is_pretokenized=False``: :data:`~tokenizers.TextInputSequence`
+    - When ``is_pretokenized=True``: :data:`~tokenizers.PreTokenizedInputSequence`
+"""
+
 EncodeInput = Union[TextEncodeInput, PreTokenizedEncodeInput]
+"""Represents all the possible types of input for encoding. Can be:
+
+    - When ``is_pretokenized=False``: :data:`~tokenizers.TextEncodeInput`
+    - When ``is_pretokenized=True``: :data:`~tokenizers.PreTokenizedEncodeInput`
+"""


 class OffsetReferential(Enum):
--- a/bindings/python/py_src/tokenizers/init.pyi
+++ b/bindings/python/py_src/tokenizers/init.pyi
@ -694,26 +694,40 @@ class Tokenizer:
        is_pretokenized: bool = False,
        add_special_tokens: bool = True,
    ) -> Encoding:
-        """Encode the given sequence and pair. This method can process raw text sequences as well
-        as already pre-tokenized sequences.
+        """
+        Encode the given sequence and pair. This method can process raw text sequences
+        as well as already pre-tokenized sequences.
+
+        Example:
+            Here are some examples of the inputs that are accepted::
+
+                encode("A single sequence")
+                encode("A sequence", "And its pair")
+                encode([ "A", "pre", "tokenized", "sequence" ], is_pretokenized=True)
+                encode(
+                    [ "A", "pre", "tokenized", "sequence" ], [ "And", "its", "pair" ],
+                    is_pretokenized=True
+                )

        Args:
-            sequence: InputSequence:
-                The sequence we want to encode. This sequence can be either raw text or
-                pre-tokenized, according to the `is_pretokenized` argument:
+            sequence (:obj:`~tokenizers.InputSequence`):
+                The main input sequence we want to encode. This sequence can be either raw
+                text or pre-tokenized, according to the ``is_pretokenized`` argument:

-                - If `is_pretokenized=False`: `InputSequence` is expected to be `str`
-                - If `is_pretokenized=True`: `InputSequence` is expected to be
-                    `Union[List[str], Tuple[str]]`
+                - If ``is_pretokenized=False``: :class:`~tokenizers.TextInputSequence`
+                - If ``is_pretokenized=True``: :class:`~tokenizers.PreTokenizedInputSequence`

-            is_pretokenized: bool:
+            pair (:obj:`~tokenizers.InputSequence`, `optional`):
+                An optional input sequence. The expected format is the same as for ``sequence``.
+
+            is_pretokenized (:obj:`bool`, defaults to :obj:`False`):
                Whether the input is already pre-tokenized

-            add_special_tokens: bool:
-                Whether to add the special tokens while encoding.
+            add_special_tokens (:obj:`bool`, defaults to :obj:`True`):
+                Whether to add the special tokens

        Returns:
-            An Encoding
+            :class:`~tokenizers.Encoding`: The encoded result
        """
        pass
    def encode_batch(
@ -722,30 +736,38 @@ class Tokenizer:
        is_pretokenized: bool = False,
        add_special_tokens: bool = True,
    ) -> List[Encoding]:
-        """Encode the given inputs. This method accept both raw text sequences as well as already
-        pre-tokenized sequences.
+        """
+        Encode the given batch of inputs. This method accepts both raw text sequences
+        and already pre-tokenized sequences.
+
+        Example:
+            Here are some examples of the inputs that are accepted::
+
+                encode_batch([
+                    "A single sequence",
+                    ("A tuple with a sequence", "And its pair"),
+                    [ "A", "pre", "tokenized", "sequence" ],
+                    ([ "A", "pre", "tokenized", "sequence" ], "And its pair")
+                ])

        Args:
-            inputs: List[EncodeInput]:
-                A list of single sequences or pair sequences to encode. Each `EncodeInput` is
-                expected to be of the following form:
-                    `Union[InputSequence, Tuple[InputSequence, InputSequence]]`
+            input (A :obj:`List`/:obj:`Tuple` of :obj:`~tokenizers.EncodeInput`):
+                A list of single sequences or pair sequences to encode. Each sequence
+                can be either raw text or pre-tokenized, according to the ``is_pretokenized``
+                argument:

-                Each `InputSequence` can either be raw text or pre-tokenized,
-                according to the `is_pretokenized` argument:
+                - If ``is_pretokenized=False``: :class:`~tokenizers.TextEncodeInput`
+                - If ``is_pretokenized=True``: :class:`~tokenizers.PreTokenizedEncodeInput`

-                - If `is_pretokenized=False`: `InputSequence` is expected to be `str`
-                - If `is_pretokenized=True`: `InputSequence` is expected to be
-                    `Union[List[str], Tuple[str]]`
+            is_pretokenized (:obj:`bool`, defaults to :obj:`False`):
+                Whether the input is already pre-tokenized

-            is_pretokenized: bool:
-                Whether the input is already pre-tokenized.
-
-            add_special_tokens: bool:
-                Whether to add the special tokens while encoding.
+            add_special_tokens (:obj:`bool`, defaults to :obj:`True`):
+                Whether to add the special tokens

        Returns:
-            A list of Encoding
+            A :obj:`List` of :class:`~tokenizers.Encoding`: The encoded batch
+
        """
        pass
    def decode(self, ids: List[int], skip_special_tokens: Optional[bool] = True) -> str:
--- a/bindings/python/src/tokenizer.rs
+++ b/bindings/python/src/tokenizer.rs
@ -617,10 +617,21 @@ impl PyTokenizer {
    ///         )
    ///
    /// Args:
-    ///     sequence (:obj:`~tokenizers.InputSequence`): The main input sequence
-    ///     pair: (:obj:`~tokenizers.InputSequence`): An optional input sequence
-    ///     is_pretokenized (:obj:`bool`): Whether the input is already pre-tokenized
-    ///     add_special_tokens (:obj:`bool`): Whether to add the special tokens
+    ///     sequence (:obj:`~tokenizers.InputSequence`):
+    ///         The main input sequence we want to encode. This sequence can be either raw
+    ///         text or pre-tokenized, according to the ``is_pretokenized`` argument:
+    ///
+    ///         - If ``is_pretokenized=False``: :class:`~tokenizers.TextInputSequence`
+    ///         - If ``is_pretokenized=True``: :class:`~tokenizers.PreTokenizedInputSequence`
+    ///
+    ///     pair (:obj:`~tokenizers.InputSequence`, `optional`):
+    ///         An optional input sequence. The expected format is the same as for ``sequence``.
+    ///
+    ///     is_pretokenized (:obj:`bool`, defaults to :obj:`False`):
+    ///         Whether the input is already pre-tokenized
+    ///
+    ///     add_special_tokens (:obj:`bool`, defaults to :obj:`True`):
+    ///         Whether to add the special tokens
    ///
    /// Returns:
    ///     :class:`~tokenizers.Encoding`: The encoded result
@ -673,12 +684,22 @@ impl PyTokenizer {
    ///         ])
    ///
    /// Args:
-    ///     input (:obj:`~tokenizers.EncodeInput`): The batch inputs
-    ///     is_pretokenized (:obj:`bool`): Whether the input is already pre-tokenized
-    ///     add_special_tokens (:obj:`bool`): Whether to add the special tokens
+    ///     input (A :obj:`List`/:obj:`Tuple` of :obj:`~tokenizers.EncodeInput`):
+    ///         A list of single sequences or pair sequences to encode. Each sequence
+    ///         can be either raw text or pre-tokenized, according to the ``is_pretokenized``
+    ///         argument:
+    ///
+    ///         - If ``is_pretokenized=False``: :class:`~tokenizers.TextEncodeInput`
+    ///         - If ``is_pretokenized=True``: :class:`~tokenizers.PreTokenizedEncodeInput`
+    ///
+    ///     is_pretokenized (:obj:`bool`, defaults to :obj:`False`):
+    ///         Whether the input is already pre-tokenized
+    ///
+    ///     add_special_tokens (:obj:`bool`, defaults to :obj:`True`):
+    ///         Whether to add the special tokens
    ///
    /// Returns:
-    ///     :obj:`List[:class:`~tokenizers.Encoding`]`: The encoded batch
+    ///     A :obj:`List` of :class:`~tokenizers.Encoding`: The encoded batch
    ///
    #[args(is_pretokenized = "false", add_special_tokens = "true")]
    #[text_signature = "($self, input, is_pretokenized=False, add_special_tokens=True, /)"]
--- a/docs/source/api/python_reference.inc
+++ b/docs/source/api/python_reference.inc
@ -1,2 +1,32 @@
+Input sequences
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+These types represent all the different kinds of sequence that can be used as input to a Tokenizer.
+In general, any sequence can be either a string or a list of strings, depending on the operating
+mode of the tokenizer: ``raw text`` vs ``pre-tokenized``.
+
+.. autodata:: tokenizers.TextInputSequence
+
+.. autodata:: tokenizers.PreTokenizedInputSequence
+
+.. autodata:: tokenizers.InputSequence
+
+
+Encode inputs
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+These types represent all the different kinds of input that a :class:`~tokenizers.Tokenizer` accepts
+when using :meth:`~tokenizers.Tokenizer.encode_batch`.
+
+.. autodata:: tokenizers.TextEncodeInput
+
+.. autodata:: tokenizers.PreTokenizedEncodeInput
+
+.. autodata:: tokenizers.EncodeInput
+
+
+Tokenizer
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
 .. autoclass:: tokenizers.Tokenizer
    :members: