Doc - Updated API Reference for encode/encode_batch

This commit is contained in:
Anthony MOI
2020-10-06 17:02:21 -04:00
committed by Anthony MOI
parent f2f3ec51bd
commit 79f02bb7f0
4 changed files with 152 additions and 38 deletions

View File

@ -6,15 +6,56 @@ from enum import Enum
Offsets = Tuple[int, int]
TextInputSequence = str
"""A :obj:`str` that represents an input sequence """
PreTokenizedInputSequence = Union[List[str], Tuple[str]]
TextEncodeInput = Union[TextInputSequence, Tuple[TextInputSequence, TextInputSequence]]
"""A pre-tokenized input sequence. Can be one of:
- A :obj:`List` of :obj:`str`
- A :obj:`Tuple` of :obj:`str`
"""
TextEncodeInput = Union[
TextInputSequence,
Tuple[TextInputSequence, TextInputSequence],
List[TextInputSequence],
]
"""Represents a textual input for encoding. Can be either:
- A single sequence: :data:`~tokenizers.TextInputSequence`
- A pair of sequences:
- A :obj:`Tuple` of :data:`~tokenizers.TextInputSequence`
- Or a :obj:`List` of :data:`~tokenizers.TextInputSequence` of size 2
"""
PreTokenizedEncodeInput = Union[
PreTokenizedInputSequence,
Tuple[PreTokenizedInputSequence, PreTokenizedInputSequence],
List[PreTokenizedInputSequence],
]
"""Represents a pre-tokenized input for encoding. Can be either:
- A single sequence: :data:`~tokenizers.PreTokenizedInputSequence`
- A pair of sequences:
- A :obj:`Tuple` of :data:`~tokenizers.PreTokenizedInputSequence`
- Or a :obj:`List` of :data:`~tokenizers.PreTokenizedInputSequence` of size 2
"""
InputSequence = Union[TextInputSequence, PreTokenizedInputSequence]
"""Represents all the possible types of input sequences for encoding. Can be:
- When ``is_pretokenized=False``: :data:`~TextInputSequence`
- When ``is_pretokenized=True``: :data:`~PreTokenizedInputSequence`
"""
EncodeInput = Union[TextEncodeInput, PreTokenizedEncodeInput]
"""Represents all the possible types of input for encoding. Can be:
- When ``is_pretokenized=False``: :data:`~TextEncodeInput`
- When ``is_pretokenized=True``: :data:`~PreTokenizedEncodeInput`
"""
class OffsetReferential(Enum):

View File

@ -694,26 +694,40 @@ class Tokenizer:
is_pretokenized: bool = False,
add_special_tokens: bool = True,
) -> Encoding:
"""Encode the given sequence and pair. This method can process raw text sequences as well
as already pre-tokenized sequences.
"""
Encode the given sequence and pair. This method can process raw text sequences
as well as already pre-tokenized sequences.
Example:
Here are some examples of the inputs that are accepted::
encode("A single sequence")`
encode("A sequence", "And its pair")`
encode([ "A", "pre", "tokenized", "sequence" ], is_pretokenized=True)`
encode(
[ "A", "pre", "tokenized", "sequence" ], [ "And", "its", "pair" ],
is_pretokenized=True
)
Args:
sequence: InputSequence:
The sequence we want to encode. This sequence can be either raw text or
pre-tokenized, according to the `is_pretokenized` argument:
sequence (:obj:`~tokenizers.InputSequence`):
The main input sequence we want to encode. This sequence can be either raw
text or pre-tokenized, according to the ``is_pretokenized`` argument:
- If `is_pretokenized=False`: `InputSequence` is expected to be `str`
- If `is_pretokenized=True`: `InputSequence` is expected to be
`Union[List[str], Tuple[str]]`
- If ``is_pretokenized=False``: :class:`~tokenizers.TextInputSequence`
- If ``is_pretokenized=True``: :class:`~tokenizers.PreTokenizedInputSequence`
is_pretokenized: bool:
pair (:obj:`~tokenizers.InputSequence`, `optional`):
An optional input sequence. The expected format is the same that for ``sequence``.
is_pretokenized (:obj:`bool`, defaults to :obj:`False`):
Whether the input is already pre-tokenized
add_special_tokens: bool:
Whether to add the special tokens while encoding.
add_special_tokens (:obj:`bool`, defaults to :obj:`True`):
Whether to add the special tokens
Returns:
An Encoding
:class:`~tokenizers.Encoding`: The encoded result
"""
pass
def encode_batch(
@ -722,30 +736,38 @@ class Tokenizer:
is_pretokenized: bool = False,
add_special_tokens: bool = True,
) -> List[Encoding]:
"""Encode the given inputs. This method accept both raw text sequences as well as already
pre-tokenized sequences.
"""
Encode the given batch of inputs. This method accept both raw text sequences
as well as already pre-tokenized sequences.
Example:
Here are some examples of the inputs that are accepted::
encode_batch([
"A single sequence",
("A tuple with a sequence", "And its pair"),
[ "A", "pre", "tokenized", "sequence" ],
([ "A", "pre", "tokenized", "sequence" ], "And its pair")
])
Args:
inputs: List[EncodeInput]:
A list of single sequences or pair sequences to encode. Each `EncodeInput` is
expected to be of the following form:
`Union[InputSequence, Tuple[InputSequence, InputSequence]]`
input (A :obj:`List`/:obj:`Tuple` of :obj:`~tokenizers.EncodeInput`):
A list of single sequences or pair sequences to encode. Each sequence
can be either raw text or pre-tokenized, according to the ``is_pretokenized``
argument:
Each `InputSequence` can either be raw text or pre-tokenized,
according to the `is_pretokenized` argument:
- If ``is_pretokenized=False``: :class:`~tokenizers.TextEncodeInput`
- If ``is_pretokenized=True``: :class:`~tokenizers.PreTokenizedEncodeInput`
- If `is_pretokenized=False`: `InputSequence` is expected to be `str`
- If `is_pretokenized=True`: `InputSequence` is expected to be
`Union[List[str], Tuple[str]]`
is_pretokenized (:obj:`bool`, defaults to :obj:`False`):
Whether the input is already pre-tokenized
is_pretokenized: bool:
Whether the input is already pre-tokenized.
add_special_tokens: bool:
Whether to add the special tokens while encoding.
add_special_tokens (:obj:`bool`, defaults to :obj:`True`):
Whether to add the special tokens
Returns:
A list of Encoding
A :obj:`List` of :class:`~tokenizers.Encoding`: The encoded batch
"""
pass
def decode(self, ids: List[int], skip_special_tokens: Optional[bool] = True) -> str:

View File

@ -617,10 +617,21 @@ impl PyTokenizer {
/// )
///
/// Args:
/// sequence (:obj:`~tokenizers.InputSequence`): The main input sequence
/// pair: (:obj:`~tokenizers.InputSequence`): An optional input sequence
/// is_pretokenized (:obj:`bool`): Whether the input is already pre-tokenized
/// add_special_tokens (:obj:`bool`): Whether to add the special tokens
/// sequence (:obj:`~tokenizers.InputSequence`):
/// The main input sequence we want to encode. This sequence can be either raw
/// text or pre-tokenized, according to the ``is_pretokenized`` argument:
///
/// - If ``is_pretokenized=False``: :class:`~tokenizers.TextInputSequence`
/// - If ``is_pretokenized=True``: :class:`~tokenizers.PreTokenizedInputSequence`
///
/// pair (:obj:`~tokenizers.InputSequence`, `optional`):
/// An optional input sequence. The expected format is the same that for ``sequence``.
///
/// is_pretokenized (:obj:`bool`, defaults to :obj:`False`):
/// Whether the input is already pre-tokenized
///
/// add_special_tokens (:obj:`bool`, defaults to :obj:`True`):
/// Whether to add the special tokens
///
/// Returns:
/// :class:`~tokenizers.Encoding`: The encoded result
@ -673,12 +684,22 @@ impl PyTokenizer {
/// ])
///
/// Args:
/// input (:obj:`~tokenizers.EncodeInput`): The batch inputs
/// is_pretokenized (:obj:`bool`): Whether the input is already pre-tokenized
/// add_special_tokens (:obj:`bool`): Whether to add the special tokens
/// input (A :obj:`List`/:obj:`Tuple` of :obj:`~tokenizers.EncodeInput`):
/// A list of single sequences or pair sequences to encode. Each sequence
/// can be either raw text or pre-tokenized, according to the ``is_pretokenized``
/// argument:
///
/// - If ``is_pretokenized=False``: :class:`~tokenizers.TextEncodeInput`
/// - If ``is_pretokenized=True``: :class:`~tokenizers.PreTokenizedEncodeInput`
///
/// is_pretokenized (:obj:`bool`, defaults to :obj:`False`):
/// Whether the input is already pre-tokenized
///
/// add_special_tokens (:obj:`bool`, defaults to :obj:`True`):
/// Whether to add the special tokens
///
/// Returns:
/// :obj:`List[:class:`~tokenizers.Encoding`]`: The encoded batch
/// A :obj:`List` of :class:`~tokenizers.Encoding`: The encoded batch
///
#[args(is_pretokenized = "false", add_special_tokens = "true")]
#[text_signature = "($self, input, is_pretokenized=False, add_special_tokens=True, /)"]

View File

@ -1,2 +1,32 @@
Input sequences
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
These types represent all the different kinds of sequence that can be used as input of a Tokenizer.
Globally, any sequence can be either a string or a list of strings, according to the operating
mode of the tokenizer: ``raw text`` vs ``pre-tokenized``.
.. autodata:: tokenizers.TextInputSequence
.. autodata:: tokenizers.PreTokenizedInputSequence
.. autodata:: tokenizers.InputSequence
Encode inputs
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
These types represent all the different kinds of input that a :class:`~tokenizers.Tokenizer` accepts
when using :meth:`~tokenizers.Tokenizer.encode_batch`.
.. autodata:: tokenizers.TextEncodeInput
.. autodata:: tokenizers.PreTokenizedEncodeInput
.. autodata:: tokenizers.EncodeInput
Tokenizer
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: tokenizers.Tokenizer
:members: