mirror of
https://github.com/mii443/tokenizers.git
synced 2025-09-02 15:29:21 +00:00
Doc - Updated API Reference for encode/encode_batch
This commit is contained in:
@ -6,15 +6,56 @@ from enum import Enum
|
|||||||
Offsets = Tuple[int, int]
|
Offsets = Tuple[int, int]
|
||||||
|
|
||||||
TextInputSequence = str
|
TextInputSequence = str
|
||||||
|
"""A :obj:`str` that represents an input sequence """
|
||||||
|
|
||||||
PreTokenizedInputSequence = Union[List[str], Tuple[str]]
|
PreTokenizedInputSequence = Union[List[str], Tuple[str]]
|
||||||
TextEncodeInput = Union[TextInputSequence, Tuple[TextInputSequence, TextInputSequence]]
|
"""A pre-tokenized input sequence. Can be one of:
|
||||||
|
|
||||||
|
- A :obj:`List` of :obj:`str`
|
||||||
|
- A :obj:`Tuple` of :obj:`str`
|
||||||
|
"""
|
||||||
|
|
||||||
|
TextEncodeInput = Union[
|
||||||
|
TextInputSequence,
|
||||||
|
Tuple[TextInputSequence, TextInputSequence],
|
||||||
|
List[TextInputSequence],
|
||||||
|
]
|
||||||
|
"""Represents a textual input for encoding. Can be either:
|
||||||
|
|
||||||
|
- A single sequence: :data:`~tokenizers.TextInputSequence`
|
||||||
|
- A pair of sequences:
|
||||||
|
|
||||||
|
- A :obj:`Tuple` of :data:`~tokenizers.TextInputSequence`
|
||||||
|
- Or a :obj:`List` of :data:`~tokenizers.TextInputSequence` of size 2
|
||||||
|
"""
|
||||||
|
|
||||||
PreTokenizedEncodeInput = Union[
|
PreTokenizedEncodeInput = Union[
|
||||||
PreTokenizedInputSequence,
|
PreTokenizedInputSequence,
|
||||||
Tuple[PreTokenizedInputSequence, PreTokenizedInputSequence],
|
Tuple[PreTokenizedInputSequence, PreTokenizedInputSequence],
|
||||||
|
List[PreTokenizedInputSequence],
|
||||||
]
|
]
|
||||||
|
"""Represents a pre-tokenized input for encoding. Can be either:
|
||||||
|
|
||||||
|
- A single sequence: :data:`~tokenizers.PreTokenizedInputSequence`
|
||||||
|
- A pair of sequences:
|
||||||
|
|
||||||
|
- A :obj:`Tuple` of :data:`~tokenizers.PreTokenizedInputSequence`
|
||||||
|
- Or a :obj:`List` of :data:`~tokenizers.PreTokenizedInputSequence` of size 2
|
||||||
|
"""
|
||||||
|
|
||||||
InputSequence = Union[TextInputSequence, PreTokenizedInputSequence]
|
InputSequence = Union[TextInputSequence, PreTokenizedInputSequence]
|
||||||
|
"""Represents all the possible types of input sequences for encoding. Can be:
|
||||||
|
|
||||||
|
- When ``is_pretokenized=False``: :data:`~TextInputSequence`
|
||||||
|
- When ``is_pretokenized=True``: :data:`~PreTokenizedInputSequence`
|
||||||
|
"""
|
||||||
|
|
||||||
EncodeInput = Union[TextEncodeInput, PreTokenizedEncodeInput]
|
EncodeInput = Union[TextEncodeInput, PreTokenizedEncodeInput]
|
||||||
|
"""Represents all the possible types of input for encoding. Can be:
|
||||||
|
|
||||||
|
- When ``is_pretokenized=False``: :data:`~TextEncodeInput`
|
||||||
|
- When ``is_pretokenized=True``: :data:`~PreTokenizedEncodeInput`
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
class OffsetReferential(Enum):
|
class OffsetReferential(Enum):
|
||||||
|
@ -694,26 +694,40 @@ class Tokenizer:
|
|||||||
is_pretokenized: bool = False,
|
is_pretokenized: bool = False,
|
||||||
add_special_tokens: bool = True,
|
add_special_tokens: bool = True,
|
||||||
) -> Encoding:
|
) -> Encoding:
|
||||||
"""Encode the given sequence and pair. This method can process raw text sequences as well
|
"""
|
||||||
as already pre-tokenized sequences.
|
Encode the given sequence and pair. This method can process raw text sequences
|
||||||
|
as well as already pre-tokenized sequences.
|
||||||
|
|
||||||
|
Example:
|
||||||
|
Here are some examples of the inputs that are accepted::
|
||||||
|
|
||||||
|
encode("A single sequence")
|
||||||
|
encode("A sequence", "And its pair")
|
||||||
|
encode([ "A", "pre", "tokenized", "sequence" ], is_pretokenized=True)
|
||||||
|
encode(
|
||||||
|
[ "A", "pre", "tokenized", "sequence" ], [ "And", "its", "pair" ],
|
||||||
|
is_pretokenized=True
|
||||||
|
)
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
sequence: InputSequence:
|
sequence (:obj:`~tokenizers.InputSequence`):
|
||||||
The sequence we want to encode. This sequence can be either raw text or
|
The main input sequence we want to encode. This sequence can be either raw
|
||||||
pre-tokenized, according to the `is_pretokenized` argument:
|
text or pre-tokenized, according to the ``is_pretokenized`` argument:
|
||||||
|
|
||||||
- If `is_pretokenized=False`: `InputSequence` is expected to be `str`
|
- If ``is_pretokenized=False``: :class:`~tokenizers.TextInputSequence`
|
||||||
- If `is_pretokenized=True`: `InputSequence` is expected to be
|
- If ``is_pretokenized=True``: :class:`~tokenizers.PreTokenizedInputSequence`
|
||||||
`Union[List[str], Tuple[str]]`
|
|
||||||
|
|
||||||
is_pretokenized: bool:
|
pair (:obj:`~tokenizers.InputSequence`, `optional`):
|
||||||
|
An optional input sequence. The expected format is the same as for ``sequence``.
|
||||||
|
|
||||||
|
is_pretokenized (:obj:`bool`, defaults to :obj:`False`):
|
||||||
Whether the input is already pre-tokenized
|
Whether the input is already pre-tokenized
|
||||||
|
|
||||||
add_special_tokens: bool:
|
add_special_tokens (:obj:`bool`, defaults to :obj:`True`):
|
||||||
Whether to add the special tokens while encoding.
|
Whether to add the special tokens
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
An Encoding
|
:class:`~tokenizers.Encoding`: The encoded result
|
||||||
"""
|
"""
|
||||||
pass
|
pass
|
||||||
def encode_batch(
|
def encode_batch(
|
||||||
@ -722,30 +736,38 @@ class Tokenizer:
|
|||||||
is_pretokenized: bool = False,
|
is_pretokenized: bool = False,
|
||||||
add_special_tokens: bool = True,
|
add_special_tokens: bool = True,
|
||||||
) -> List[Encoding]:
|
) -> List[Encoding]:
|
||||||
"""Encode the given inputs. This method accept both raw text sequences as well as already
|
"""
|
||||||
pre-tokenized sequences.
|
Encode the given batch of inputs. This method accepts both raw text sequences
|
||||||
|
as well as already pre-tokenized sequences.
|
||||||
|
|
||||||
|
Example:
|
||||||
|
Here are some examples of the inputs that are accepted::
|
||||||
|
|
||||||
|
encode_batch([
|
||||||
|
"A single sequence",
|
||||||
|
("A tuple with a sequence", "And its pair"),
|
||||||
|
[ "A", "pre", "tokenized", "sequence" ],
|
||||||
|
([ "A", "pre", "tokenized", "sequence" ], "And its pair")
|
||||||
|
])
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
inputs: List[EncodeInput]:
|
input (A :obj:`List`/:obj:`Tuple` of :obj:`~tokenizers.EncodeInput`):
|
||||||
A list of single sequences or pair sequences to encode. Each `EncodeInput` is
|
A list of single sequences or pair sequences to encode. Each sequence
|
||||||
expected to be of the following form:
|
can be either raw text or pre-tokenized, according to the ``is_pretokenized``
|
||||||
`Union[InputSequence, Tuple[InputSequence, InputSequence]]`
|
argument:
|
||||||
|
|
||||||
Each `InputSequence` can either be raw text or pre-tokenized,
|
- If ``is_pretokenized=False``: :class:`~tokenizers.TextEncodeInput`
|
||||||
according to the `is_pretokenized` argument:
|
- If ``is_pretokenized=True``: :class:`~tokenizers.PreTokenizedEncodeInput`
|
||||||
|
|
||||||
- If `is_pretokenized=False`: `InputSequence` is expected to be `str`
|
is_pretokenized (:obj:`bool`, defaults to :obj:`False`):
|
||||||
- If `is_pretokenized=True`: `InputSequence` is expected to be
|
Whether the input is already pre-tokenized
|
||||||
`Union[List[str], Tuple[str]]`
|
|
||||||
|
|
||||||
is_pretokenized: bool:
|
add_special_tokens (:obj:`bool`, defaults to :obj:`True`):
|
||||||
Whether the input is already pre-tokenized.
|
Whether to add the special tokens
|
||||||
|
|
||||||
add_special_tokens: bool:
|
|
||||||
Whether to add the special tokens while encoding.
|
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
A list of Encoding
|
A :obj:`List` of :class:`~tokenizers.Encoding`: The encoded batch
|
||||||
|
|
||||||
"""
|
"""
|
||||||
pass
|
pass
|
||||||
def decode(self, ids: List[int], skip_special_tokens: Optional[bool] = True) -> str:
|
def decode(self, ids: List[int], skip_special_tokens: Optional[bool] = True) -> str:
|
||||||
|
@ -617,10 +617,21 @@ impl PyTokenizer {
|
|||||||
/// )
|
/// )
|
||||||
///
|
///
|
||||||
/// Args:
|
/// Args:
|
||||||
/// sequence (:obj:`~tokenizers.InputSequence`): The main input sequence
|
/// sequence (:obj:`~tokenizers.InputSequence`):
|
||||||
/// pair: (:obj:`~tokenizers.InputSequence`): An optional input sequence
|
/// The main input sequence we want to encode. This sequence can be either raw
|
||||||
/// is_pretokenized (:obj:`bool`): Whether the input is already pre-tokenized
|
/// text or pre-tokenized, according to the ``is_pretokenized`` argument:
|
||||||
/// add_special_tokens (:obj:`bool`): Whether to add the special tokens
|
///
|
||||||
|
/// - If ``is_pretokenized=False``: :class:`~tokenizers.TextInputSequence`
|
||||||
|
/// - If ``is_pretokenized=True``: :class:`~tokenizers.PreTokenizedInputSequence`
|
||||||
|
///
|
||||||
|
/// pair (:obj:`~tokenizers.InputSequence`, `optional`):
|
||||||
|
/// An optional input sequence. The expected format is the same as for ``sequence``.
|
||||||
|
///
|
||||||
|
/// is_pretokenized (:obj:`bool`, defaults to :obj:`False`):
|
||||||
|
/// Whether the input is already pre-tokenized
|
||||||
|
///
|
||||||
|
/// add_special_tokens (:obj:`bool`, defaults to :obj:`True`):
|
||||||
|
/// Whether to add the special tokens
|
||||||
///
|
///
|
||||||
/// Returns:
|
/// Returns:
|
||||||
/// :class:`~tokenizers.Encoding`: The encoded result
|
/// :class:`~tokenizers.Encoding`: The encoded result
|
||||||
@ -673,12 +684,22 @@ impl PyTokenizer {
|
|||||||
/// ])
|
/// ])
|
||||||
///
|
///
|
||||||
/// Args:
|
/// Args:
|
||||||
/// input (:obj:`~tokenizers.EncodeInput`): The batch inputs
|
/// input (A :obj:`List`/:obj:`Tuple` of :obj:`~tokenizers.EncodeInput`):
|
||||||
/// is_pretokenized (:obj:`bool`): Whether the input is already pre-tokenized
|
/// A list of single sequences or pair sequences to encode. Each sequence
|
||||||
/// add_special_tokens (:obj:`bool`): Whether to add the special tokens
|
/// can be either raw text or pre-tokenized, according to the ``is_pretokenized``
|
||||||
|
/// argument:
|
||||||
|
///
|
||||||
|
/// - If ``is_pretokenized=False``: :class:`~tokenizers.TextEncodeInput`
|
||||||
|
/// - If ``is_pretokenized=True``: :class:`~tokenizers.PreTokenizedEncodeInput`
|
||||||
|
///
|
||||||
|
/// is_pretokenized (:obj:`bool`, defaults to :obj:`False`):
|
||||||
|
/// Whether the input is already pre-tokenized
|
||||||
|
///
|
||||||
|
/// add_special_tokens (:obj:`bool`, defaults to :obj:`True`):
|
||||||
|
/// Whether to add the special tokens
|
||||||
///
|
///
|
||||||
/// Returns:
|
/// Returns:
|
||||||
/// :obj:`List[:class:`~tokenizers.Encoding`]`: The encoded batch
|
/// A :obj:`List` of :class:`~tokenizers.Encoding`: The encoded batch
|
||||||
///
|
///
|
||||||
#[args(is_pretokenized = "false", add_special_tokens = "true")]
|
#[args(is_pretokenized = "false", add_special_tokens = "true")]
|
||||||
#[text_signature = "($self, input, is_pretokenized=False, add_special_tokens=True, /)"]
|
#[text_signature = "($self, input, is_pretokenized=False, add_special_tokens=True, /)"]
|
||||||
|
@ -1,2 +1,32 @@
|
|||||||
|
Input sequences
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
These types represent all the different kinds of sequence that can be used as input of a Tokenizer.
|
||||||
|
Globally, any sequence can be either a string or a list of strings, according to the operating
|
||||||
|
mode of the tokenizer: ``raw text`` vs ``pre-tokenized``.
|
||||||
|
|
||||||
|
.. autodata:: tokenizers.TextInputSequence
|
||||||
|
|
||||||
|
.. autodata:: tokenizers.PreTokenizedInputSequence
|
||||||
|
|
||||||
|
.. autodata:: tokenizers.InputSequence
|
||||||
|
|
||||||
|
|
||||||
|
Encode inputs
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
These types represent all the different kinds of input that a :class:`~tokenizers.Tokenizer` accepts
|
||||||
|
when using :meth:`~tokenizers.Tokenizer.encode_batch`.
|
||||||
|
|
||||||
|
.. autodata:: tokenizers.TextEncodeInput
|
||||||
|
|
||||||
|
.. autodata:: tokenizers.PreTokenizedEncodeInput
|
||||||
|
|
||||||
|
.. autodata:: tokenizers.EncodeInput
|
||||||
|
|
||||||
|
|
||||||
|
Tokenizer
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. autoclass:: tokenizers.Tokenizer
|
.. autoclass:: tokenizers.Tokenizer
|
||||||
:members:
|
:members:
|
||||||
|
Reference in New Issue
Block a user