diff --git a/bindings/python/py_src/tokenizers/__init__.py b/bindings/python/py_src/tokenizers/__init__.py
index 8b62e769..6404695b 100644
--- a/bindings/python/py_src/tokenizers/__init__.py
+++ b/bindings/python/py_src/tokenizers/__init__.py
@@ -6,15 +6,56 @@ from enum import Enum

 Offsets = Tuple[int, int]

 TextInputSequence = str
+"""A :obj:`str` that represents an input sequence"""
+
 PreTokenizedInputSequence = Union[List[str], Tuple[str]]
-TextEncodeInput = Union[TextInputSequence, Tuple[TextInputSequence, TextInputSequence]]
+"""A pre-tokenized input sequence. Can be one of:
+
+    - A :obj:`List` of :obj:`str`
+    - A :obj:`Tuple` of :obj:`str`
+"""
+
+TextEncodeInput = Union[
+    TextInputSequence,
+    Tuple[TextInputSequence, TextInputSequence],
+    List[TextInputSequence],
+]
+"""Represents a textual input for encoding. Can be either:
+
+    - A single sequence: :data:`~tokenizers.TextInputSequence`
+    - A pair of sequences:
+
+      - A :obj:`Tuple` of :data:`~tokenizers.TextInputSequence`
+      - Or a :obj:`List` of :data:`~tokenizers.TextInputSequence` of size 2
+"""
+
 PreTokenizedEncodeInput = Union[
     PreTokenizedInputSequence,
     Tuple[PreTokenizedInputSequence, PreTokenizedInputSequence],
+    List[PreTokenizedInputSequence],
 ]
+"""Represents a pre-tokenized input for encoding. Can be either:
+
+    - A single sequence: :data:`~tokenizers.PreTokenizedInputSequence`
+    - A pair of sequences:
+
+      - A :obj:`Tuple` of :data:`~tokenizers.PreTokenizedInputSequence`
+      - Or a :obj:`List` of :data:`~tokenizers.PreTokenizedInputSequence` of size 2
+"""

 InputSequence = Union[TextInputSequence, PreTokenizedInputSequence]
+"""Represents all the possible types of input sequences for encoding. Can be:
+
+    - When ``is_pretokenized=False``: :data:`~tokenizers.TextInputSequence`
+    - When ``is_pretokenized=True``: :data:`~tokenizers.PreTokenizedInputSequence`
+"""
+
 EncodeInput = Union[TextEncodeInput, PreTokenizedEncodeInput]
+"""Represents all the possible types of input for encoding. Can be:
+
+    - When ``is_pretokenized=False``: :data:`~tokenizers.TextEncodeInput`
+    - When ``is_pretokenized=True``: :data:`~tokenizers.PreTokenizedEncodeInput`
+"""


 class OffsetReferential(Enum):
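To make the new aliases concrete, here is a minimal sketch of values that match each of
them (illustrative only; the variable names are hypothetical)::

    from tokenizers import (
        TextInputSequence,
        PreTokenizedInputSequence,
        TextEncodeInput,
        PreTokenizedEncodeInput,
    )

    text: TextInputSequence = "A single sequence"
    pretok: PreTokenizedInputSequence = ["A", "pre", "tokenized", "sequence"]

    # A TextEncodeInput is a single sequence, or a pair as a tuple or 2-element list
    single: TextEncodeInput = "A single sequence"
    pair_as_tuple: TextEncodeInput = ("A sequence", "And its pair")
    pair_as_list: TextEncodeInput = ["A sequence", "And its pair"]

    # A PreTokenizedEncodeInput has the same shapes, built from pre-tokenized sequences
    pretok_pair: PreTokenizedEncodeInput = (
        ["A", "pre", "tokenized", "sequence"],
        ["And", "its", "pair"],
    )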
diff --git a/bindings/python/py_src/tokenizers/__init__.pyi b/bindings/python/py_src/tokenizers/__init__.pyi
index 7bb38821..61ac6ee0 100644
--- a/bindings/python/py_src/tokenizers/__init__.pyi
+++ b/bindings/python/py_src/tokenizers/__init__.pyi
@@ -694,26 +694,40 @@ class Tokenizer:
         is_pretokenized: bool = False,
         add_special_tokens: bool = True,
     ) -> Encoding:
-        """Encode the given sequence and pair. This method can process raw text sequences as well
-        as already pre-tokenized sequences.
+        """
+        Encode the given sequence and pair. This method can process raw text sequences
+        as well as already pre-tokenized sequences.
+
+        Example:
+            Here are some examples of the inputs that are accepted::
+
+                encode("A single sequence")
+                encode("A sequence", "And its pair")
+                encode([ "A", "pre", "tokenized", "sequence" ], is_pretokenized=True)
+                encode(
+                    [ "A", "pre", "tokenized", "sequence" ], [ "And", "its", "pair" ],
+                    is_pretokenized=True
+                )

         Args:
-            sequence: InputSequence:
-                The sequence we want to encode. This sequence can be either raw text or
-                pre-tokenized, according to the `is_pretokenized` argument:
+            sequence (:obj:`~tokenizers.InputSequence`):
+                The main input sequence we want to encode. This sequence can be either raw
+                text or pre-tokenized, according to the ``is_pretokenized`` argument:

-                - If `is_pretokenized=False`: `InputSequence` is expected to be `str`
-                - If `is_pretokenized=True`: `InputSequence` is expected to be
-                  `Union[List[str], Tuple[str]]`
+                - If ``is_pretokenized=False``: :class:`~tokenizers.TextInputSequence`
+                - If ``is_pretokenized=True``: :class:`~tokenizers.PreTokenizedInputSequence`

-            is_pretokenized: bool:
+            pair (:obj:`~tokenizers.InputSequence`, `optional`):
+                An optional input sequence. The expected format is the same as for ``sequence``.
+
+            is_pretokenized (:obj:`bool`, defaults to :obj:`False`):
                 Whether the input is already pre-tokenized

-            add_special_tokens: bool:
-                Whether to add the special tokens while encoding.
+            add_special_tokens (:obj:`bool`, defaults to :obj:`True`):
+                Whether to add the special tokens

         Returns:
-            An Encoding
+            :class:`~tokenizers.Encoding`: The encoded result
         """
         pass
     def encode_batch(
@@ -722,30 +736,38 @@ class Tokenizer:
         is_pretokenized: bool = False,
         add_special_tokens: bool = True,
     ) -> List[Encoding]:
-        """Encode the given inputs. This method accept both raw text sequences as well as already
-        pre-tokenized sequences.
+        """
+        Encode the given batch of inputs. This method accepts both raw text sequences
+        as well as already pre-tokenized sequences.
+
+        Example:
+            Here are some examples of the inputs that are accepted::
+
+                encode_batch([
+                    "A single sequence",
+                    ("A tuple with a sequence", "And its pair"),
+                    [ "A", "pre", "tokenized", "sequence" ],
+                    ([ "A", "pre", "tokenized", "sequence" ], "And its pair")
+                ])

         Args:
-            inputs: List[EncodeInput]:
-                A list of single sequences or pair sequences to encode. Each `EncodeInput` is
-                expected to be of the following form:
-                `Union[InputSequence, Tuple[InputSequence, InputSequence]]`
+            input (A :obj:`List`/:obj:`Tuple` of :obj:`~tokenizers.EncodeInput`):
+                A list of single sequences or pair sequences to encode. Each sequence
+                can be either raw text or pre-tokenized, according to the ``is_pretokenized``
+                argument:

-                Each `InputSequence` can either be raw text or pre-tokenized,
-                according to the `is_pretokenized` argument:
+                - If ``is_pretokenized=False``: :class:`~tokenizers.TextEncodeInput`
+                - If ``is_pretokenized=True``: :class:`~tokenizers.PreTokenizedEncodeInput`

-                - If `is_pretokenized=False`: `InputSequence` is expected to be `str`
-                - If `is_pretokenized=True`: `InputSequence` is expected to be
-                  `Union[List[str], Tuple[str]]`
+            is_pretokenized (:obj:`bool`, defaults to :obj:`False`):
+                Whether the input is already pre-tokenized

-            is_pretokenized: bool:
-                Whether the input is already pre-tokenized.
-
-            add_special_tokens: bool:
-                Whether to add the special tokens while encoding.
+            add_special_tokens (:obj:`bool`, defaults to :obj:`True`):
+                Whether to add the special tokens

         Returns:
-            A list of Encoding
+            A :obj:`List` of :class:`~tokenizers.Encoding`: The encoded batch
+
         """
         pass
     def decode(self, ids: List[int], skip_special_tokens: Optional[bool] = True) -> str:
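A minimal usage sketch for ``encode``, assuming a trained tokenizer saved locally as
``tokenizer.json`` (the path is a placeholder)::

    from tokenizers import Tokenizer

    tokenizer = Tokenizer.from_file("tokenizer.json")

    # Raw text: a single sequence, then a pair
    encoding = tokenizer.encode("A single sequence")
    pair_encoding = tokenizer.encode("A sequence", "And its pair")

    # Pre-tokenized input goes through the same method with is_pretokenized=True
    pretok_encoding = tokenizer.encode(
        ["A", "pre", "tokenized", "sequence"], is_pretokenized=True
    )

    print(encoding.tokens)
    print(encoding.ids)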
diff --git a/bindings/python/src/tokenizer.rs b/bindings/python/src/tokenizer.rs
index 9c3f9a95..7c9c3523 100644
--- a/bindings/python/src/tokenizer.rs
+++ b/bindings/python/src/tokenizer.rs
@@ -617,10 +617,21 @@ impl PyTokenizer {
     ///         )
     ///
     /// Args:
-    ///     sequence (:obj:`~tokenizers.InputSequence`): The main input sequence
-    ///     pair: (:obj:`~tokenizers.InputSequence`): An optional input sequence
-    ///     is_pretokenized (:obj:`bool`): Whether the input is already pre-tokenized
-    ///     add_special_tokens (:obj:`bool`): Whether to add the special tokens
+    ///     sequence (:obj:`~tokenizers.InputSequence`):
+    ///         The main input sequence we want to encode. This sequence can be either raw
+    ///         text or pre-tokenized, according to the ``is_pretokenized`` argument:
+    ///
+    ///         - If ``is_pretokenized=False``: :class:`~tokenizers.TextInputSequence`
+    ///         - If ``is_pretokenized=True``: :class:`~tokenizers.PreTokenizedInputSequence`
+    ///
+    ///     pair (:obj:`~tokenizers.InputSequence`, `optional`):
+    ///         An optional input sequence. The expected format is the same as for ``sequence``.
+    ///
+    ///     is_pretokenized (:obj:`bool`, defaults to :obj:`False`):
+    ///         Whether the input is already pre-tokenized
+    ///
+    ///     add_special_tokens (:obj:`bool`, defaults to :obj:`True`):
+    ///         Whether to add the special tokens
     ///
     /// Returns:
     ///     :class:`~tokenizers.Encoding`: The encoded result
@@ -673,12 +684,22 @@ impl PyTokenizer {
     ///         ])
     ///
     /// Args:
-    ///     input (:obj:`~tokenizers.EncodeInput`): The batch inputs
-    ///     is_pretokenized (:obj:`bool`): Whether the input is already pre-tokenized
-    ///     add_special_tokens (:obj:`bool`): Whether to add the special tokens
+    ///     input (A :obj:`List`/:obj:`Tuple` of :obj:`~tokenizers.EncodeInput`):
+    ///         A list of single sequences or pair sequences to encode. Each sequence
+    ///         can be either raw text or pre-tokenized, according to the ``is_pretokenized``
+    ///         argument:
+    ///
+    ///         - If ``is_pretokenized=False``: :class:`~tokenizers.TextEncodeInput`
+    ///         - If ``is_pretokenized=True``: :class:`~tokenizers.PreTokenizedEncodeInput`
+    ///
+    ///     is_pretokenized (:obj:`bool`, defaults to :obj:`False`):
+    ///         Whether the input is already pre-tokenized
+    ///
+    ///     add_special_tokens (:obj:`bool`, defaults to :obj:`True`):
+    ///         Whether to add the special tokens
     ///
     /// Returns:
-    ///     :obj:`List[:class:`~tokenizers.Encoding`]`: The encoded batch
+    ///     A :obj:`List` of :class:`~tokenizers.Encoding`: The encoded batch
    ///
     #[args(is_pretokenized = "false", add_special_tokens = "true")]
     #[text_signature = "($self, input, is_pretokenized=False, add_special_tokens=True, /)"]
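Under the same assumption as above (a ``tokenizer`` loaded from ``tokenizer.json``), the
batch entry point mirrors these docstrings; here raw and pre-tokenized batches are kept in
separate calls, since ``is_pretokenized`` applies to the whole batch::

    # Raw text: single sequences and pairs can be mixed in one batch
    encodings = tokenizer.encode_batch([
        "A single sequence",
        ("A tuple with a sequence", "And its pair"),
    ])

    # Pre-tokenized sequences are batched the same way, with is_pretokenized=True
    pretok_encodings = tokenizer.encode_batch(
        [
            ["A", "pre", "tokenized", "sequence"],
            (["A", "pre", "tokenized", "sequence"], ["And", "its", "pair"]),
        ],
        is_pretokenized=True,
    )

    assert len(encodings) == 2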
diff --git a/docs/source/api/python_reference.inc b/docs/source/api/python_reference.inc
index 11bc82ef..8ae35cb9 100644
--- a/docs/source/api/python_reference.inc
+++ b/docs/source/api/python_reference.inc
@@ -1,2 +1,32 @@
+Input sequences
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+These types represent all the different kinds of sequences that can be used as input to a
+Tokenizer. In general, any sequence can be either a string or a list of strings, depending on
+the operating mode of the tokenizer: ``raw text`` vs ``pre-tokenized``.
+
+.. autodata:: tokenizers.TextInputSequence
+
+.. autodata:: tokenizers.PreTokenizedInputSequence
+
+.. autodata:: tokenizers.InputSequence
+
+
+Encode inputs
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+These types represent all the different kinds of input that a :class:`~tokenizers.Tokenizer` accepts
+when using :meth:`~tokenizers.Tokenizer.encode_batch`.
+
+.. autodata:: tokenizers.TextEncodeInput
+
+.. autodata:: tokenizers.PreTokenizedEncodeInput
+
+.. autodata:: tokenizers.EncodeInput
+
+
+Tokenizer
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
 .. autoclass:: tokenizers.Tokenizer
     :members:
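Because the documented aliases are importable from ``tokenizers``, downstream code can also
reuse them in type annotations; a small sketch (``encode_corpus`` is a hypothetical helper)::

    from typing import List

    from tokenizers import EncodeInput, Tokenizer

    def encode_corpus(tokenizer: Tokenizer, batch: List[EncodeInput]):
        # EncodeInput covers every input shape encode_batch accepts, so
        # annotating thin wrappers keeps their call sites type-checked.
        return tokenizer.encode_batch(batch)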