mirror of
https://github.com/mii443/tokenizers.git
synced 2025-08-22 16:25:30 +00:00
Doc - Updated API Reference for encode/encode_batch
This commit is contained in:
@ -6,15 +6,56 @@ from enum import Enum
|
||||
Offsets = Tuple[int, int]
|
||||
|
||||
TextInputSequence = str
|
||||
"""A :obj:`str` that represents an input sequence """
|
||||
|
||||
PreTokenizedInputSequence = Union[List[str], Tuple[str]]
|
||||
TextEncodeInput = Union[TextInputSequence, Tuple[TextInputSequence, TextInputSequence]]
|
||||
"""A pre-tokenized input sequence. Can be one of:
|
||||
|
||||
- A :obj:`List` of :obj:`str`
|
||||
- A :obj:`Tuple` of :obj:`str`
|
||||
"""
|
||||
|
||||
TextEncodeInput = Union[
|
||||
TextInputSequence,
|
||||
Tuple[TextInputSequence, TextInputSequence],
|
||||
List[TextInputSequence],
|
||||
]
|
||||
"""Represents a textual input for encoding. Can be either:
|
||||
|
||||
- A single sequence: :data:`~tokenizers.TextInputSequence`
|
||||
- A pair of sequences:
|
||||
|
||||
- A :obj:`Tuple` of :data:`~tokenizers.TextInputSequence`
|
||||
- Or a :obj:`List` of :data:`~tokenizers.TextInputSequence` of size 2
|
||||
"""
|
||||
|
||||
PreTokenizedEncodeInput = Union[
|
||||
PreTokenizedInputSequence,
|
||||
Tuple[PreTokenizedInputSequence, PreTokenizedInputSequence],
|
||||
List[PreTokenizedInputSequence],
|
||||
]
|
||||
"""Represents a pre-tokenized input for encoding. Can be either:
|
||||
|
||||
- A single sequence: :data:`~tokenizers.PreTokenizedInputSequence`
|
||||
- A pair of sequences:
|
||||
|
||||
- A :obj:`Tuple` of :data:`~tokenizers.PreTokenizedInputSequence`
|
||||
- Or a :obj:`List` of :data:`~tokenizers.PreTokenizedInputSequence` of size 2
|
||||
"""
|
||||
|
||||
InputSequence = Union[TextInputSequence, PreTokenizedInputSequence]
|
||||
"""Represents all the possible types of input sequences for encoding. Can be:
|
||||
|
||||
- When ``is_pretokenized=False``: :data:`~TextInputSequence`
|
||||
- When ``is_pretokenized=True``: :data:`~PreTokenizedInputSequence`
|
||||
"""
|
||||
|
||||
EncodeInput = Union[TextEncodeInput, PreTokenizedEncodeInput]
|
||||
"""Represents all the possible types of input for encoding. Can be:
|
||||
|
||||
- When ``is_pretokenized=False``: :data:`~TextEncodeInput`
|
||||
- When ``is_pretokenized=True``: :data:`~PreTokenizedEncodeInput`
|
||||
"""
|
||||
|
||||
|
||||
class OffsetReferential(Enum):
|
||||
|
@ -694,26 +694,40 @@ class Tokenizer:
|
||||
is_pretokenized: bool = False,
|
||||
add_special_tokens: bool = True,
|
||||
) -> Encoding:
|
||||
"""Encode the given sequence and pair. This method can process raw text sequences as well
|
||||
as already pre-tokenized sequences.
|
||||
"""
|
||||
Encode the given sequence and pair. This method can process raw text sequences
|
||||
as well as already pre-tokenized sequences.
|
||||
|
||||
Example:
|
||||
Here are some examples of the inputs that are accepted::
|
||||
|
||||
encode("A single sequence")
|
||||
encode("A sequence", "And its pair")
|
||||
encode([ "A", "pre", "tokenized", "sequence" ], is_pretokenized=True)
|
||||
encode(
|
||||
[ "A", "pre", "tokenized", "sequence" ], [ "And", "its", "pair" ],
|
||||
is_pretokenized=True
|
||||
)
|
||||
|
||||
Args:
|
||||
sequence: InputSequence:
|
||||
The sequence we want to encode. This sequence can be either raw text or
|
||||
pre-tokenized, according to the `is_pretokenized` argument:
|
||||
sequence (:obj:`~tokenizers.InputSequence`):
|
||||
The main input sequence we want to encode. This sequence can be either raw
|
||||
text or pre-tokenized, according to the ``is_pretokenized`` argument:
|
||||
|
||||
- If `is_pretokenized=False`: `InputSequence` is expected to be `str`
|
||||
- If `is_pretokenized=True`: `InputSequence` is expected to be
|
||||
`Union[List[str], Tuple[str]]`
|
||||
- If ``is_pretokenized=False``: :class:`~tokenizers.TextInputSequence`
|
||||
- If ``is_pretokenized=True``: :class:`~tokenizers.PreTokenizedInputSequence`
|
||||
|
||||
is_pretokenized: bool:
|
||||
pair (:obj:`~tokenizers.InputSequence`, `optional`):
|
||||
An optional input sequence. The expected format is the same as for ``sequence``.
|
||||
|
||||
is_pretokenized (:obj:`bool`, defaults to :obj:`False`):
|
||||
Whether the input is already pre-tokenized
|
||||
|
||||
add_special_tokens: bool:
|
||||
Whether to add the special tokens while encoding.
|
||||
add_special_tokens (:obj:`bool`, defaults to :obj:`True`):
|
||||
Whether to add the special tokens
|
||||
|
||||
Returns:
|
||||
An Encoding
|
||||
:class:`~tokenizers.Encoding`: The encoded result
|
||||
"""
|
||||
pass
|
||||
def encode_batch(
|
||||
@ -722,30 +736,38 @@ class Tokenizer:
|
||||
is_pretokenized: bool = False,
|
||||
add_special_tokens: bool = True,
|
||||
) -> List[Encoding]:
|
||||
"""Encode the given inputs. This method accept both raw text sequences as well as already
|
||||
pre-tokenized sequences.
|
||||
"""
|
||||
Encode the given batch of inputs. This method accepts both raw text sequences
|
||||
as well as already pre-tokenized sequences.
|
||||
|
||||
Example:
|
||||
Here are some examples of the inputs that are accepted::
|
||||
|
||||
encode_batch([
|
||||
"A single sequence",
|
||||
("A tuple with a sequence", "And its pair"),
|
||||
[ "A", "pre", "tokenized", "sequence" ],
|
||||
([ "A", "pre", "tokenized", "sequence" ], "And its pair")
|
||||
])
|
||||
|
||||
Args:
|
||||
inputs: List[EncodeInput]:
|
||||
A list of single sequences or pair sequences to encode. Each `EncodeInput` is
|
||||
expected to be of the following form:
|
||||
`Union[InputSequence, Tuple[InputSequence, InputSequence]]`
|
||||
input (A :obj:`List`/:obj:`Tuple` of :obj:`~tokenizers.EncodeInput`):
|
||||
A list of single sequences or pair sequences to encode. Each sequence
|
||||
can be either raw text or pre-tokenized, according to the ``is_pretokenized``
|
||||
argument:
|
||||
|
||||
Each `InputSequence` can either be raw text or pre-tokenized,
|
||||
according to the `is_pretokenized` argument:
|
||||
- If ``is_pretokenized=False``: :class:`~tokenizers.TextEncodeInput`
|
||||
- If ``is_pretokenized=True``: :class:`~tokenizers.PreTokenizedEncodeInput`
|
||||
|
||||
- If `is_pretokenized=False`: `InputSequence` is expected to be `str`
|
||||
- If `is_pretokenized=True`: `InputSequence` is expected to be
|
||||
`Union[List[str], Tuple[str]]`
|
||||
is_pretokenized (:obj:`bool`, defaults to :obj:`False`):
|
||||
Whether the input is already pre-tokenized
|
||||
|
||||
is_pretokenized: bool:
|
||||
Whether the input is already pre-tokenized.
|
||||
|
||||
add_special_tokens: bool:
|
||||
Whether to add the special tokens while encoding.
|
||||
add_special_tokens (:obj:`bool`, defaults to :obj:`True`):
|
||||
Whether to add the special tokens
|
||||
|
||||
Returns:
|
||||
A list of Encoding
|
||||
A :obj:`List` of :class:`~tokenizers.Encoding`: The encoded batch
|
||||
|
||||
"""
|
||||
pass
|
||||
def decode(self, ids: List[int], skip_special_tokens: Optional[bool] = True) -> str:
|
||||
|
@ -617,10 +617,21 @@ impl PyTokenizer {
|
||||
/// )
|
||||
///
|
||||
/// Args:
|
||||
/// sequence (:obj:`~tokenizers.InputSequence`): The main input sequence
|
||||
/// pair: (:obj:`~tokenizers.InputSequence`): An optional input sequence
|
||||
/// is_pretokenized (:obj:`bool`): Whether the input is already pre-tokenized
|
||||
/// add_special_tokens (:obj:`bool`): Whether to add the special tokens
|
||||
/// sequence (:obj:`~tokenizers.InputSequence`):
|
||||
/// The main input sequence we want to encode. This sequence can be either raw
|
||||
/// text or pre-tokenized, according to the ``is_pretokenized`` argument:
|
||||
///
|
||||
/// - If ``is_pretokenized=False``: :class:`~tokenizers.TextInputSequence`
|
||||
/// - If ``is_pretokenized=True``: :class:`~tokenizers.PreTokenizedInputSequence`
|
||||
///
|
||||
/// pair (:obj:`~tokenizers.InputSequence`, `optional`):
|
||||
/// An optional input sequence. The expected format is the same as for ``sequence``.
|
||||
///
|
||||
/// is_pretokenized (:obj:`bool`, defaults to :obj:`False`):
|
||||
/// Whether the input is already pre-tokenized
|
||||
///
|
||||
/// add_special_tokens (:obj:`bool`, defaults to :obj:`True`):
|
||||
/// Whether to add the special tokens
|
||||
///
|
||||
/// Returns:
|
||||
/// :class:`~tokenizers.Encoding`: The encoded result
|
||||
@ -673,12 +684,22 @@ impl PyTokenizer {
|
||||
/// ])
|
||||
///
|
||||
/// Args:
|
||||
/// input (:obj:`~tokenizers.EncodeInput`): The batch inputs
|
||||
/// is_pretokenized (:obj:`bool`): Whether the input is already pre-tokenized
|
||||
/// add_special_tokens (:obj:`bool`): Whether to add the special tokens
|
||||
/// input (A :obj:`List`/:obj:`Tuple` of :obj:`~tokenizers.EncodeInput`):
|
||||
/// A list of single sequences or pair sequences to encode. Each sequence
|
||||
/// can be either raw text or pre-tokenized, according to the ``is_pretokenized``
|
||||
/// argument:
|
||||
///
|
||||
/// - If ``is_pretokenized=False``: :class:`~tokenizers.TextEncodeInput`
|
||||
/// - If ``is_pretokenized=True``: :class:`~tokenizers.PreTokenizedEncodeInput`
|
||||
///
|
||||
/// is_pretokenized (:obj:`bool`, defaults to :obj:`False`):
|
||||
/// Whether the input is already pre-tokenized
|
||||
///
|
||||
/// add_special_tokens (:obj:`bool`, defaults to :obj:`True`):
|
||||
/// Whether to add the special tokens
|
||||
///
|
||||
/// Returns:
|
||||
/// :obj:`List[:class:`~tokenizers.Encoding`]`: The encoded batch
|
||||
/// A :obj:`List` of :class:`~tokenizers.Encoding`: The encoded batch
|
||||
///
|
||||
#[args(is_pretokenized = "false", add_special_tokens = "true")]
|
||||
#[text_signature = "($self, input, is_pretokenized=False, add_special_tokens=True, /)"]
|
||||
|
@ -1,2 +1,32 @@
|
||||
Input sequences
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
These types represent all the different kinds of sequence that can be used as input of a Tokenizer.
|
||||
Globally, any sequence can be either a string or a list of strings, according to the operating
|
||||
mode of the tokenizer: ``raw text`` vs ``pre-tokenized``.
|
||||
|
||||
.. autodata:: tokenizers.TextInputSequence
|
||||
|
||||
.. autodata:: tokenizers.PreTokenizedInputSequence
|
||||
|
||||
.. autodata:: tokenizers.InputSequence
|
||||
|
||||
|
||||
Encode inputs
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
These types represent all the different kinds of input that a :class:`~tokenizers.Tokenizer` accepts
|
||||
when using :meth:`~tokenizers.Tokenizer.encode_batch`.
|
||||
|
||||
.. autodata:: tokenizers.TextEncodeInput
|
||||
|
||||
.. autodata:: tokenizers.PreTokenizedEncodeInput
|
||||
|
||||
.. autodata:: tokenizers.EncodeInput
|
||||
|
||||
|
||||
Tokenizer
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: tokenizers.Tokenizer
|
||||
:members:
|
||||
|
Reference in New Issue
Block a user