Doc - Updated API Reference for encode/encode_batch

This commit is contained in:
Anthony MOI
2020-10-06 17:02:21 -04:00
committed by Anthony MOI
parent f2f3ec51bd
commit 79f02bb7f0
4 changed files with 152 additions and 38 deletions

View File

@ -6,15 +6,56 @@ from enum import Enum
Offsets = Tuple[int, int] Offsets = Tuple[int, int]
TextInputSequence = str TextInputSequence = str
"""A :obj:`str` that represents an input sequence """
PreTokenizedInputSequence = Union[List[str], Tuple[str]] PreTokenizedInputSequence = Union[List[str], Tuple[str]]
TextEncodeInput = Union[TextInputSequence, Tuple[TextInputSequence, TextInputSequence]] """A pre-tokenized input sequence. Can be one of:
- A :obj:`List` of :obj:`str`
- A :obj:`Tuple` of :obj:`str`
"""
TextEncodeInput = Union[
TextInputSequence,
Tuple[TextInputSequence, TextInputSequence],
List[TextInputSequence],
]
"""Represents a textual input for encoding. Can be either:
- A single sequence: :data:`~tokenizers.TextInputSequence`
- A pair of sequences:
- A :obj:`Tuple` of :data:`~tokenizers.TextInputSequence`
- Or a :obj:`List` of :data:`~tokenizers.TextInputSequence` of size 2
"""
PreTokenizedEncodeInput = Union[ PreTokenizedEncodeInput = Union[
PreTokenizedInputSequence, PreTokenizedInputSequence,
Tuple[PreTokenizedInputSequence, PreTokenizedInputSequence], Tuple[PreTokenizedInputSequence, PreTokenizedInputSequence],
List[PreTokenizedInputSequence],
] ]
"""Represents a pre-tokenized input for encoding. Can be either:
- A single sequence: :data:`~tokenizers.PreTokenizedInputSequence`
- A pair of sequences:
- A :obj:`Tuple` of :data:`~tokenizers.PreTokenizedInputSequence`
- Or a :obj:`List` of :data:`~tokenizers.PreTokenizedInputSequence` of size 2
"""
InputSequence = Union[TextInputSequence, PreTokenizedInputSequence] InputSequence = Union[TextInputSequence, PreTokenizedInputSequence]
"""Represents all the possible types of input sequences for encoding. Can be:
- When ``is_pretokenized=False``: :data:`~tokenizers.TextInputSequence`
- When ``is_pretokenized=True``: :data:`~tokenizers.PreTokenizedInputSequence`
"""
EncodeInput = Union[TextEncodeInput, PreTokenizedEncodeInput] EncodeInput = Union[TextEncodeInput, PreTokenizedEncodeInput]
"""Represents all the possible types of input for encoding. Can be:
- When ``is_pretokenized=False``: :data:`~tokenizers.TextEncodeInput`
- When ``is_pretokenized=True``: :data:`~tokenizers.PreTokenizedEncodeInput`
"""
class OffsetReferential(Enum): class OffsetReferential(Enum):

View File

@ -694,26 +694,40 @@ class Tokenizer:
is_pretokenized: bool = False, is_pretokenized: bool = False,
add_special_tokens: bool = True, add_special_tokens: bool = True,
) -> Encoding: ) -> Encoding:
"""Encode the given sequence and pair. This method can process raw text sequences as well """
as already pre-tokenized sequences. Encode the given sequence and pair. This method can process raw text sequences
as well as already pre-tokenized sequences.
Example:
Here are some examples of the inputs that are accepted::
encode("A single sequence")
encode("A sequence", "And its pair")
encode([ "A", "pre", "tokenized", "sequence" ], is_pretokenized=True)
encode(
[ "A", "pre", "tokenized", "sequence" ], [ "And", "its", "pair" ],
is_pretokenized=True
)
Args: Args:
sequence: InputSequence: sequence (:obj:`~tokenizers.InputSequence`):
The sequence we want to encode. This sequence can be either raw text or The main input sequence we want to encode. This sequence can be either raw
pre-tokenized, according to the `is_pretokenized` argument: text or pre-tokenized, according to the ``is_pretokenized`` argument:
- If `is_pretokenized=False`: `InputSequence` is expected to be `str` - If ``is_pretokenized=False``: :class:`~tokenizers.TextInputSequence`
- If `is_pretokenized=True`: `InputSequence` is expected to be - If ``is_pretokenized=True``: :class:`~tokenizers.PreTokenizedInputSequence`
`Union[List[str], Tuple[str]]`
is_pretokenized: bool: pair (:obj:`~tokenizers.InputSequence`, `optional`):
An optional input sequence. The expected format is the same as for ``sequence``.
is_pretokenized (:obj:`bool`, defaults to :obj:`False`):
Whether the input is already pre-tokenized Whether the input is already pre-tokenized
add_special_tokens: bool: add_special_tokens (:obj:`bool`, defaults to :obj:`True`):
Whether to add the special tokens while encoding. Whether to add the special tokens
Returns: Returns:
An Encoding :class:`~tokenizers.Encoding`: The encoded result
""" """
pass pass
def encode_batch( def encode_batch(
@ -722,30 +736,38 @@ class Tokenizer:
is_pretokenized: bool = False, is_pretokenized: bool = False,
add_special_tokens: bool = True, add_special_tokens: bool = True,
) -> List[Encoding]: ) -> List[Encoding]:
"""Encode the given inputs. This method accepts both raw text sequences as well as already """
pre-tokenized sequences. Encode the given batch of inputs. This method accepts both raw text sequences
as well as already pre-tokenized sequences.
Example:
Here are some examples of the inputs that are accepted::
encode_batch([
"A single sequence",
("A tuple with a sequence", "And its pair"),
[ "A", "pre", "tokenized", "sequence" ],
([ "A", "pre", "tokenized", "sequence" ], "And its pair")
])
Args: Args:
inputs: List[EncodeInput]: input (A :obj:`List`/:obj:`Tuple` of :obj:`~tokenizers.EncodeInput`):
A list of single sequences or pair sequences to encode. Each `EncodeInput` is A list of single sequences or pair sequences to encode. Each sequence
expected to be of the following form: can be either raw text or pre-tokenized, according to the ``is_pretokenized``
`Union[InputSequence, Tuple[InputSequence, InputSequence]]` argument:
Each `InputSequence` can either be raw text or pre-tokenized, - If ``is_pretokenized=False``: :class:`~tokenizers.TextEncodeInput`
according to the `is_pretokenized` argument: - If ``is_pretokenized=True``: :class:`~tokenizers.PreTokenizedEncodeInput`
- If `is_pretokenized=False`: `InputSequence` is expected to be `str` is_pretokenized (:obj:`bool`, defaults to :obj:`False`):
- If `is_pretokenized=True`: `InputSequence` is expected to be Whether the input is already pre-tokenized
`Union[List[str], Tuple[str]]`
is_pretokenized: bool: add_special_tokens (:obj:`bool`, defaults to :obj:`True`):
Whether the input is already pre-tokenized. Whether to add the special tokens
add_special_tokens: bool:
Whether to add the special tokens while encoding.
Returns: Returns:
A list of Encoding A :obj:`List` of :class:`~tokenizers.Encoding`: The encoded batch
""" """
pass pass
def decode(self, ids: List[int], skip_special_tokens: Optional[bool] = True) -> str: def decode(self, ids: List[int], skip_special_tokens: Optional[bool] = True) -> str:

View File

@ -617,10 +617,21 @@ impl PyTokenizer {
/// ) /// )
/// ///
/// Args: /// Args:
/// sequence (:obj:`~tokenizers.InputSequence`): The main input sequence /// sequence (:obj:`~tokenizers.InputSequence`):
/// pair: (:obj:`~tokenizers.InputSequence`): An optional input sequence /// The main input sequence we want to encode. This sequence can be either raw
/// is_pretokenized (:obj:`bool`): Whether the input is already pre-tokenized /// text or pre-tokenized, according to the ``is_pretokenized`` argument:
/// add_special_tokens (:obj:`bool`): Whether to add the special tokens ///
/// - If ``is_pretokenized=False``: :class:`~tokenizers.TextInputSequence`
/// - If ``is_pretokenized=True``: :class:`~tokenizers.PreTokenizedInputSequence`
///
/// pair (:obj:`~tokenizers.InputSequence`, `optional`):
/// An optional input sequence. The expected format is the same as for ``sequence``.
///
/// is_pretokenized (:obj:`bool`, defaults to :obj:`False`):
/// Whether the input is already pre-tokenized
///
/// add_special_tokens (:obj:`bool`, defaults to :obj:`True`):
/// Whether to add the special tokens
/// ///
/// Returns: /// Returns:
/// :class:`~tokenizers.Encoding`: The encoded result /// :class:`~tokenizers.Encoding`: The encoded result
@ -673,12 +684,22 @@ impl PyTokenizer {
/// ]) /// ])
/// ///
/// Args: /// Args:
/// input (:obj:`~tokenizers.EncodeInput`): The batch inputs /// input (A :obj:`List`/:obj:`Tuple` of :obj:`~tokenizers.EncodeInput`):
/// is_pretokenized (:obj:`bool`): Whether the input is already pre-tokenized /// A list of single sequences or pair sequences to encode. Each sequence
/// add_special_tokens (:obj:`bool`): Whether to add the special tokens /// can be either raw text or pre-tokenized, according to the ``is_pretokenized``
/// argument:
///
/// - If ``is_pretokenized=False``: :class:`~tokenizers.TextEncodeInput`
/// - If ``is_pretokenized=True``: :class:`~tokenizers.PreTokenizedEncodeInput`
///
/// is_pretokenized (:obj:`bool`, defaults to :obj:`False`):
/// Whether the input is already pre-tokenized
///
/// add_special_tokens (:obj:`bool`, defaults to :obj:`True`):
/// Whether to add the special tokens
/// ///
/// Returns: /// Returns:
/// :obj:`List[:class:`~tokenizers.Encoding`]`: The encoded batch /// A :obj:`List` of :class:`~tokenizers.Encoding`: The encoded batch
/// ///
#[args(is_pretokenized = "false", add_special_tokens = "true")] #[args(is_pretokenized = "false", add_special_tokens = "true")]
#[text_signature = "($self, input, is_pretokenized=False, add_special_tokens=True, /)"] #[text_signature = "($self, input, is_pretokenized=False, add_special_tokens=True, /)"]

View File

@ -1,2 +1,32 @@
Input sequences
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
These types represent all the different kinds of sequence that can be used as input of a Tokenizer.
Globally, any sequence can be either a string or a list of strings, according to the operating
mode of the tokenizer: ``raw text`` vs ``pre-tokenized``.
.. autodata:: tokenizers.TextInputSequence
.. autodata:: tokenizers.PreTokenizedInputSequence
.. autodata:: tokenizers.InputSequence
Encode inputs
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
These types represent all the different kinds of input that a :class:`~tokenizers.Tokenizer` accepts
when using :meth:`~tokenizers.Tokenizer.encode_batch`.
.. autodata:: tokenizers.TextEncodeInput
.. autodata:: tokenizers.PreTokenizedEncodeInput
.. autodata:: tokenizers.EncodeInput
Tokenizer
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: tokenizers.Tokenizer .. autoclass:: tokenizers.Tokenizer
:members: :members: