mirror of
https://github.com/mii443/tokenizers.git
synced 2025-08-22 16:25:30 +00:00
Doc - API Reference for most Tokenizer methods/attributes
This commit is contained in:
@@ -422,9 +422,37 @@ class Encoding:
pass

class AddedToken:
"""AddedToken represents a token to be added to a Tokenizer
"""AddedToken

An AddedToken can have special options defining the way it should behave.
Represents a token that can be added to a :class:`~tokenizers.Tokenizer`.
It can have special options that define the way it should behave.

Args:
content (:obj:`str`): The content of the token

single_word (:obj:`bool`, defaults to :obj:`False`):
Defines whether this token should only match single words. If :obj:`True`, this
token will never match inside of a word. For example the token ``ing`` would match
on ``tokenizing`` if this option is :obj:`False`, but not if it is :obj:`True`.
The notion of "`inside of a word`" is defined by the word boundaries pattern in
regular expressions (i.e. the token should start and end with word boundaries).

lstrip (:obj:`bool`, defaults to :obj:`False`):
Defines whether this token should strip all potential whitespaces on its left side.
If :obj:`True`, this token will greedily match any whitespace on its left. For
example if we try to match the token ``[MASK]`` with ``lstrip=True``, in the text
``"I saw a [MASK]"``, we would match on ``" [MASK]"``. (Note the space on the left).

rstrip (:obj:`bool`, defaults to :obj:`False`):
Defines whether this token should strip all potential whitespaces on its right
side. If :obj:`True`, this token will greedily match any whitespace on its right.
It works just like :obj:`lstrip` but on the right.

normalized (:obj:`bool`, defaults to :obj:`True` with :meth:`~tokenizers.Tokenizer.add_tokens` and :obj:`False` with :meth:`~tokenizers.Tokenizer.add_special_tokens`):
Defines whether this token should match against the normalized version of the input
text. For example, with the added token ``"yesterday"``, and a normalizer in charge of
lowercasing the text, the token could be extracted from the input ``"I saw a lion
Yesterday"``.
"""

def __new__(
@@ -438,55 +466,54 @@ class AddedToken:
"""Instantiate a new AddedToken

Args:
content: str:
The content of the token
content (:obj:`str`): The content of the token

single_word: bool
Whether this token should only match against single words. If True,
this token will never match inside of a word. For example the token `ing` would
match on `tokenizing` if this option is False, but not if this option is True.
single_word (:obj:`bool`, defaults to :obj:`False`):
Defines whether this token should only match single words. If :obj:`True`, this
token will never match inside of a word. For example the token ``ing`` would match
on ``tokenizing`` if this option is :obj:`False`, but not if it is :obj:`True`.
The notion of "`inside of a word`" is defined by the word boundaries pattern in
regular expressions (i.e. the token should start and end with word boundaries).

lstrip: bool
Whether this token should strip all potential whitespaces on the left side.
If True, this token will greedily match any whitespace on the left. For example,
if we try to match the token `[MASK]` with lstrip=True, in the text `I saw a [MASK]`
we will match on ` [MASK]`.
lstrip (:obj:`bool`, defaults to :obj:`False`):
Defines whether this token should strip all potential whitespaces on its left side.
If :obj:`True`, this token will greedily match any whitespace on its left. For
example if we try to match the token ``[MASK]`` with ``lstrip=True``, in the text
``"I saw a [MASK]"``, we would match on ``" [MASK]"``. (Note the space on the left).

rstrip: bool
Whether this token should strip all potential whitespaces on the right side.
If True, this token will greedily match any whitespace on the right. It works just
like lstrip, but on the right.
rstrip (:obj:`bool`, defaults to :obj:`False`):
Defines whether this token should strip all potential whitespaces on its right
side. If :obj:`True`, this token will greedily match any whitespace on its right.
It works just like :obj:`lstrip` but on the right.

normalized: bool:
Whether this token should match the normalized version of the input text. For
example, with the added token `yesterday` and a normalizer in charge of lowercasing
the text, the token could be extracted from the input `I saw a lion Yesterday`.
normalized (:obj:`bool`, defaults to :obj:`True` with :meth:`~tokenizers.Tokenizer.add_tokens` and :obj:`False` with :meth:`~tokenizers.Tokenizer.add_special_tokens`):
Defines whether this token should match against the normalized version of the input
text. For example, with the added token ``"yesterday"``, and a normalizer in charge of
lowercasing the text, the token could be extracted from the input ``"I saw a lion
Yesterday"``.
"""
pass

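Illustrative usage (not part of the diff): a minimal sketch of the `AddedToken` options documented above, assuming only what these docstrings describe.

```python
from tokenizers import AddedToken

# A token that swallows whitespace on its left and only matches whole words;
# with lstrip=True, matching "[MASK]" in "I saw a [MASK]" also captures the
# leading space, as described in the docstring above.
mask = AddedToken("[MASK]", single_word=True, lstrip=True, rstrip=False)

# A regular added token matched against the normalized (e.g. lowercased) text.
yesterday = AddedToken("yesterday", normalized=True)
```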
class Tokenizer:
"""Tokenizer

A Tokenizer works as a pipeline, it processes some raw text as input and outputs
an `Encoding`.
A :obj:`Tokenizer` works as a pipeline. It processes some raw text as input
and outputs an :class:`~tokenizers.Encoding`.

The various steps of the pipeline are:
1. The `Normalizer`: in charge of normalizing the text. Common examples of
normalization are the unicode normalization standards, such as NFD or NFKC.
2. The `PreTokenizer`: in charge of creating initial word splits in the text.
The most common way of splitting text is simply on whitespace.
3. The `Model`: in charge of doing the actual tokenization. An example of a
`Model` would be `BPE` or `WordPiece`.
4. The `PostProcessor`: in charge of post-processing the `Encoding` to add anything
relevant that, for example, a language model would need, such as special tokens.
Args:
model (:class:`~tokenizers.models.Model`):
The core algorithm that this :obj:`Tokenizer` should be using.
"""

def __new__(cls, model: models.Model) -> Tokenizer:
"""Instantiate a new Tokenizer using the given Model

A :obj:`Tokenizer` works as a pipeline. It processes some raw text as input
and outputs an :class:`~tokenizers.Encoding`.

Args:
model: models.Model:
The model to be used with this Tokenizer
model (:class:`~tokenizers.models.Model`):
The core algorithm that this :obj:`Tokenizer` should be using.

Returns:
Tokenizer
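As a hedged sketch of the pipeline described above: only `Tokenizer(model)` is documented in this hunk; the `models.BPE`, `normalizers.NFKC` and `pre_tokenizers.Whitespace` constructors are assumptions.

```python
from tokenizers import Tokenizer, models, normalizers, pre_tokenizers

# The Model is the only mandatory piece; the other pipeline steps are optional.
tokenizer = Tokenizer(models.BPE())                     # assumed empty-BPE constructor
tokenizer.normalizer = normalizers.NFKC()               # 1. normalization
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()   # 2. initial word splits
# 3. the model performs the actual tokenization;
# 4. a PostProcessor could be assigned the same way to add special tokens.
```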
@@ -494,57 +521,62 @@ class Tokenizer:
pass
@staticmethod
def from_str(s: str) -> Tokenizer:
"""Instantiate a new Tokenizer from the given JSON string
"""Instantiate a new :class:`~tokenizers.Tokenizer` from the given JSON string.

Args:
s: str:
A JSON string representation of the Tokenizer
json (:obj:`str`):
A valid JSON string representing a previously serialized
:class:`~tokenizers.Tokenizer`

Returns:
Tokenizer
:class:`~tokenizers.Tokenizer`: The new tokenizer
"""
pass
@staticmethod
def from_file(path: str) -> Tokenizer:
"""Instantiate a new Tokenizer from the given file
"""Instantiate a new :class:`~tokenizers.Tokenizer` from the file at the given path.

Args:
path: str:
Path to a file containing a Tokenizer
path (:obj:`str`):
A path to a local JSON file representing a previously serialized
:class:`~tokenizers.Tokenizer`

Returns:
Tokenizer
:class:`~tokenizers.Tokenizer`: The new tokenizer
"""
pass
@staticmethod
def from_buffer(buffer: bytes) -> Tokenizer:
"""Instantiate a new Tokenizer from the given buffer
"""Instantiate a new :class:`~tokenizers.Tokenizer` from the given buffer.

Args:
buffer: bytes:
A buffer used to instantiate a new Tokenizer
buffer (:obj:`bytes`):
A buffer containing a previously serialized :class:`~tokenizers.Tokenizer`

Returns:
Tokenizer
:class:`~tokenizers.Tokenizer`: The new tokenizer
"""
pass
def to_str(self, pretty: bool = False) -> str:
"""Get a serialized JSON version of the Tokenizer as a str
"""Gets a serialized string representing this :class:`~tokenizers.Tokenizer`.

Args:
pretty: bool:
Whether the JSON string should be prettified
pretty (:obj:`bool`, defaults to :obj:`False`):
Whether the JSON string should be pretty formatted.

Returns:
str
:obj:`str`: A string representing the serialized Tokenizer
"""
pass
def save(self, path: str, pretty: bool = False):
"""Save the Tokenizer as JSON to the given path
"""Save the :class:`~tokenizers.Tokenizer` to the file at the given path.

Args:
pretty: bool:
Whether the JSON string should be prettified
path (:obj:`str`):
A path to a file in which to save the serialized tokenizer.

pretty (:obj:`bool`, defaults to :obj:`False`):
Whether the JSON file should be pretty formatted.
"""
pass
@property
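The constructors and serialization helpers above compose into a simple round trip; a sketch, where the file name `tok.json` and the empty `models.BPE()` constructor are only illustrative assumptions:

```python
from tokenizers import Tokenizer, models

tokenizer = Tokenizer(models.BPE())              # assumed empty-BPE constructor
tokenizer.save("tok.json", pretty=True)          # serialize to a JSON file
reloaded = Tokenizer.from_file("tok.json")       # reload from that file

as_json = tokenizer.to_str()                     # or keep the JSON in memory
from_string = Tokenizer.from_str(as_json)
from_bytes = Tokenizer.from_buffer(as_json.encode("utf-8"))
```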
@@ -593,40 +625,41 @@ class Tokenizer:
"""
pass
def get_vocab(self, with_added_tokens: bool = True) -> Dict[str, int]:
"""Returns the vocabulary
"""Get the underlying vocabulary

Args:
with_added_tokens: boolean:
Whether to include the added tokens in the vocabulary
with_added_tokens (:obj:`bool`, defaults to :obj:`True`):
Whether to include the added tokens

Returns:
The vocabulary
:obj:`Dict[str, int]`: The vocabulary
"""
pass
def get_vocab_size(self, with_added_tokens: bool = True) -> int:
"""Returns the size of the vocabulary
"""Get the size of the underlying vocabulary

Args:
with_added_tokens: boolean:
Whether to include the added tokens in the vocabulary's size
with_added_tokens (:obj:`bool`, defaults to :obj:`True`):
Whether to include the added tokens

Returns:
The size of the vocabulary
:obj:`int`: The size of the vocabulary
"""
pass
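A small sketch of the two vocabulary accessors on a toy tokenizer; the empty `models.BPE()` constructor and the exact ids shown in comments are assumptions, not part of this diff.

```python
from tokenizers import Tokenizer, models

tokenizer = Tokenizer(models.BPE())               # assumed constructor; empty vocab
tokenizer.add_tokens(["hello", "world"])          # added tokens get fresh ids

print(tokenizer.get_vocab_size())                          # includes added tokens -> 2
print(tokenizer.get_vocab_size(with_added_tokens=False))   # model vocabulary only -> 0
print(tokenizer.get_vocab())                               # e.g. {"hello": 0, "world": 1}
```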
def enable_truncation(self, max_length: int, stride: Optional[int], strategy: Optional[str]):
"""Enable the truncation
"""Enable truncation

Args:
max_length: unsigned int:
The maximum length at which to truncate
max_length (:obj:`int`):
The max length at which to truncate

stride: (`optional`) unsigned int:
The length of the previous first sequence to be included
in the overflowing sequence
stride (:obj:`int`, `optional`):
The length of the previous first sequence to be included in the overflowing
sequence

strategy: (`optional`) str:
Can be one of `longest_first`, `only_first` or `only_second`
strategy (:obj:`str`, `optional`, defaults to :obj:`longest_first`):
The strategy used for truncation. Can be one of ``longest_first``, ``only_first`` or
``only_second``.
"""
pass
def no_truncation(self):
@@ -634,11 +667,13 @@ class Tokenizer:
pass
@property
def truncation(self) -> Optional[dict]:
"""Get the current truncation parameters
"""Get the currently set truncation parameters

`Cannot set, use` :meth:`~tokenizers.Tokenizer.enable_truncation` `instead`

Returns:
None if truncation is disabled, a dict with the current truncation parameters if
truncation is enabled
(:obj:`dict`, `optional`):
A dict with the current truncation parameters if truncation is enabled
"""
pass
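The truncation controls documented in the two hunks above work together; a sketch (the toy `models.BPE()` constructor is an assumption):

```python
from tokenizers import Tokenizer, models

tokenizer = Tokenizer(models.BPE())       # assumed constructor

tokenizer.enable_truncation(max_length=512, stride=32, strategy="longest_first")
print(tokenizer.truncation)               # read-only dict of the current parameters

tokenizer.no_truncation()                 # disable truncation again
print(tokenizer.truncation)               # None once truncation is disabled
```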
def enable_padding(
@@ -653,26 +688,26 @@ class Tokenizer:
"""Enable the padding

Args:
direction: (`optional`) str:
Can be one of: `right` or `left`
direction (:obj:`str`, `optional`, defaults to :obj:`right`):
The direction in which to pad. Can be either ``right`` or ``left``

pad_to_multiple_of: (`optional`) unsigned int:
If specified, the padding length should always snap to the next multiple of
the given value. For example if we were going to pad with a length of 250 but
`pad_to_multiple_of=8` then we will pad to 256.
pad_to_multiple_of (:obj:`int`, `optional`):
If specified, the padding length should always snap to the next multiple of the
given value. For example if we were going to pad with a length of 250 but
``pad_to_multiple_of=8`` then we will pad to 256.

pad_id: (`optional`) unsigned int:
The index to be used when padding
pad_id (:obj:`int`, defaults to 0):
The id to be used when padding

pad_type_id: (`optional`) unsigned int:
The type index to be used when padding
pad_type_id (:obj:`int`, defaults to 0):
The type id to be used when padding

pad_token: (`optional`) str:
pad_token (:obj:`str`, defaults to :obj:`[PAD]`):
The pad token to be used when padding

length: (`optional`) unsigned int:
If specified, the length at which to pad. If not specified
we pad using the size of the longest sequence in a batch
length (:obj:`int`, `optional`):
If specified, the length at which to pad. If not specified we pad using the size of
the longest sequence in a batch.
"""
pass
def no_padding(self):
@@ -682,9 +717,11 @@ class Tokenizer:
def padding(self) -> Optional[dict]:
"""Get the current padding parameters

`Cannot be set, use` :meth:`~tokenizers.Tokenizer.enable_padding` `instead`

Returns:
None if padding is disabled, a dict with the currently set parameters
if the padding is enabled.
(:obj:`dict`, `optional`):
A dict with the current padding parameters if padding is enabled
"""
pass
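And the matching padding controls, again sketched on an assumed toy setup:

```python
from tokenizers import Tokenizer, models

tokenizer = Tokenizer(models.BPE())               # assumed constructor
tokenizer.add_special_tokens(["[PAD]"])           # give the pad token an id

tokenizer.enable_padding(
    direction="right",
    pad_id=tokenizer.token_to_id("[PAD]"),
    pad_token="[PAD]",
    pad_to_multiple_of=8,                         # e.g. a length of 250 becomes 256
)
print(tokenizer.padding)                          # dict of the current parameters
tokenizer.no_padding()                            # afterwards, tokenizer.padding is None
```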
def encode(
@@ -694,8 +731,7 @@ class Tokenizer:
is_pretokenized: bool = False,
add_special_tokens: bool = True,
) -> Encoding:
"""
Encode the given sequence and pair. This method can process raw text sequences
"""Encode the given sequence and pair. This method can process raw text sequences
as well as already pre-tokenized sequences.

Example:
@@ -736,8 +772,7 @@ class Tokenizer:
is_pretokenized: bool = False,
add_special_tokens: bool = True,
) -> List[Encoding]:
"""
Encode the given batch of inputs. This method accepts both raw text sequences
"""Encode the given batch of inputs. This method accepts both raw text sequences
as well as already pre-tokenized sequences.

Example:
@@ -771,82 +806,91 @@ class Tokenizer:
"""
pass
def decode(self, ids: List[int], skip_special_tokens: Optional[bool] = True) -> str:
"""Decode the given list of ids to a string sequence
"""Decode the given list of ids back to a string

This is used to decode anything coming back from a Language Model

Args:
ids: List[unsigned int]:
A list of ids to be decoded
ids (A :obj:`List/Tuple` of :obj:`int`):
The list of ids that we want to decode

skip_special_tokens: (`optional`) boolean:
Whether to remove all the special tokens from the output string
skip_special_tokens (:obj:`bool`, defaults to :obj:`True`):
Whether the special tokens should be removed from the decoded string

Returns:
The decoded string
:obj:`str`: The decoded string
"""
pass
def decode_batch(
self, sequences: List[List[int]], skip_special_tokens: Optional[bool] = True
) -> str:
"""Decode the list of sequences to a list of string sequences
"""Decode a batch of ids back to their corresponding string

Args:
sequences: List[List[unsigned int]]:
A list of sequence of ids to be decoded
sequences (:obj:`List` of :obj:`List[int]`):
The batch of sequences we want to decode

skip_special_tokens: (`optional`) boolean:
Whether to remove all the special tokens from the output strings
skip_special_tokens (:obj:`bool`, defaults to :obj:`True`):
Whether the special tokens should be removed from the decoded strings

Returns:
A list of decoded strings
:obj:`List[str]`: A list of decoded strings
"""
pass
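A hedged sketch of an encode/decode round trip. The toy `models.BPE()` / `pre_tokenizers.Whitespace()` setup and the `Encoding.ids` / `Encoding.tokens` attributes are assumptions not shown in this diff, and the outputs in the comments are indicative only.

```python
from tokenizers import Tokenizer, models, pre_tokenizers

tokenizer = Tokenizer(models.BPE())                      # assumed constructors; toy setup
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
tokenizer.add_tokens(["hello", "world"])                 # so encode has something to find

encoding = tokenizer.encode("hello world")               # -> Encoding
print(encoding.tokens, encoding.ids)                     # e.g. ['hello', 'world'] [0, 1]

print(tokenizer.decode(encoding.ids))                    # e.g. 'hello world'
print(tokenizer.decode_batch([encoding.ids, encoding.ids]))
```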
def token_to_id(self, token: str) -> Optional[int]:
"""Convert the given token to its corresponding id
"""Convert the given token to its corresponding id if it exists

Args:
token: str:
token (:obj:`str`):
The token to convert

Returns:
The corresponding id if it exists, None otherwise
:obj:`Optional[int]`: An optional id, :obj:`None` if out of vocabulary
"""
pass
def id_to_token(self, id: int) -> Optional[str]:
"""Convert the given token id to its corresponding string
"""Convert the given id to its corresponding token if it exists

Args:
token: id:
The token id to convert
id (:obj:`int`):
The id to convert

Returns:
The corresponding string if it exists, None otherwise
:obj:`Optional[str]`: An optional token, :obj:`None` if out of vocabulary
"""
pass
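The two lookup helpers above are symmetric; a small sketch on an assumed toy tokenizer:

```python
from tokenizers import Tokenizer, models

tokenizer = Tokenizer(models.BPE())          # assumed constructor
tokenizer.add_tokens(["hello"])

idx = tokenizer.token_to_id("hello")         # e.g. 0
print(tokenizer.id_to_token(idx))            # 'hello'
print(tokenizer.token_to_id("missing"))      # None for an out-of-vocabulary token
print(tokenizer.id_to_token(10_000))         # None for an unknown id
```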
def add_tokens(self, tokens: List[Union[str, AddedToken]]) -> int:
"""Add the given tokens to the vocabulary

The given tokens are added only if they don't already exist in the vocabulary.
Each token is then given a new id.

Args:
tokens: List[Union[str, AddedToken]]:
A list of tokens to add to the vocabulary. Each token can either be
a string, or an instance of AddedToken
tokens (A :obj:`List` of :class:`~tokenizers.AddedToken` or :obj:`str`):
The list of tokens we want to add to the vocabulary. Each token can be either a
string or an instance of :class:`~tokenizers.AddedToken` for more customization.

Returns:
The number of tokens that were added to the vocabulary
:obj:`int`: The number of tokens that were created in the vocabulary
"""
pass
def add_special_tokens(self, tokens: List[Union[str, AddedToken]]) -> int:
"""Add the given special tokens to the vocabulary, and treat them as special tokens.
"""Add the given special tokens to the Tokenizer.

The special tokens will never be processed by the model, and will be
removed while decoding.
If these tokens are already part of the vocabulary, it just lets the Tokenizer know about
them. If they don't exist, the Tokenizer creates them, giving them a new id.

These special tokens will never be processed by the model (i.e. won't be split into
multiple tokens), and they can be removed from the output when decoding.

Args:
tokens: List[Union[str, AddedToken]]:
The list of special tokens to add. Each token can either be a string
or an instance of AddedToken
tokens (A :obj:`List` of :class:`~tokenizers.AddedToken` or :obj:`str`):
The list of special tokens we want to add to the vocabulary. Each token can either
be a string or an instance of :class:`~tokenizers.AddedToken` for more
customization.

Returns:
The number of tokens that were added to the vocabulary
:obj:`int`: The number of tokens that were created in the vocabulary
"""
pass
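A sketch contrasting the two methods, using the `AddedToken` options documented earlier in this diff; the empty `models.BPE()` constructor is an assumption.

```python
from tokenizers import Tokenizer, AddedToken, models

tokenizer = Tokenizer(models.BPE())                      # assumed constructor

# Regular added tokens: matched in the text, kept when decoding.
n_added = tokenizer.add_tokens(["hello", AddedToken("ing", single_word=False)])

# Special tokens: never split by the model, and removable at decode time
# with skip_special_tokens=True.
n_special = tokenizer.add_special_tokens(
    ["[CLS]", "[SEP]", AddedToken("[MASK]", lstrip=True)]
)

print(n_added, n_special)   # number of tokens actually created in the vocabulary
```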
def post_process(
@@ -858,21 +902,24 @@ class Tokenizer:
"""Apply all the post-processing steps to the given encodings.

The various steps are:
1. Truncate according to global params (provided to `enable_truncation`)
2. Apply the PostProcessor
3. Pad according to global params. (provided to `enable_padding`)

1. Truncate according to the set truncation params (provided with
:meth:`~tokenizers.Tokenizer.enable_truncation`)
2. Apply the :class:`~tokenizers.processors.PostProcessor`
3. Pad according to the set padding params (provided with
:meth:`~tokenizers.Tokenizer.enable_padding`)

Args:
encoding: Encoding:
The main Encoding to post process
encoding (:class:`~tokenizers.Encoding`):
The :class:`~tokenizers.Encoding` corresponding to the main sequence.

pair: Optional[Encoding]:
An optional pair Encoding
pair (:class:`~tokenizers.Encoding`, `optional`):
An optional :class:`~tokenizers.Encoding` corresponding to the pair sequence.

add_special_tokens: bool:
Whether to add special tokens
add_special_tokens (:obj:`bool`):
Whether to add the special tokens

Returns:
The resulting Encoding
:class:`~tokenizers.Encoding`: The final post-processed encoding
"""
pass
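A hedged sketch of `post_process`. The `processors.TemplateProcessing` post-processor used here is an assumption not shown in this diff, so treat the whole block as illustrative rather than as the documented API.

```python
from tokenizers import Tokenizer, models, processors

tokenizer = Tokenizer(models.BPE())                       # assumed constructor
tokenizer.add_tokens(["hello"])
tokenizer.add_special_tokens(["[CLS]", "[SEP]"])

# Assumed TemplateProcessing API from tokenizers.processors
tokenizer.post_processor = processors.TemplateProcessing(
    single="[CLS] $A [SEP]",
    special_tokens=[("[CLS]", tokenizer.token_to_id("[CLS]")),
                    ("[SEP]", tokenizer.token_to_id("[SEP]"))],
)

raw = tokenizer.encode("hello", add_special_tokens=False)  # bare Encoding
final = tokenizer.post_process(raw)                        # truncate -> post-process -> pad
print(final.tokens)                                        # e.g. ['[CLS]', 'hello', '[SEP]']
```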
@@ -53,7 +53,7 @@ use crate::processors::PyPostProcessor;
/// Yesterday"``.
///
#[pyclass(dict, module = "tokenizers", name=AddedToken)]
#[text_signature = "(content, **kwargs)"]
#[text_signature = "(content, single_word=False, lstrip=False, rstrip=False, normalized=True)"]
pub struct PyAddedToken {
pub content: String,
pub is_special_token: bool,
@@ -408,6 +408,7 @@ type Tokenizer = TokenizerImpl<PyModel, PyNormalizer, PyPreTokenizer, PyPostProc
/// The core algorithm that this :obj:`Tokenizer` should be using.
///
#[pyclass(dict, module = "tokenizers", name=Tokenizer)]
#[text_signature = "(model)"]
#[derive(Clone)]
pub struct PyTokenizer {
tokenizer: Tokenizer,
@@ -533,7 +534,7 @@ impl PyTokenizer {
/// path (:obj:`str`):
/// A path to a file in which to save the serialized tokenizer.
///
/// pretty (:obj:`bool`, defaults to :obj:`False):
/// pretty (:obj:`bool`, defaults to :obj:`False`):
/// Whether the JSON file should be pretty formatted.
#[args(pretty = false)]
#[text_signature = "($self, pretty=False)"]
@@ -551,7 +552,7 @@ impl PyTokenizer {
/// Get the underlying vocabulary
///
/// Args:
/// with_added_tokens (:obj:`bool, defaults to :obj:`True`):
/// with_added_tokens (:obj:`bool`, defaults to :obj:`True`):
/// Whether to include the added tokens
///
/// Returns:
@@ -565,7 +566,7 @@ impl PyTokenizer {
/// Get the size of the underlying vocabulary
///
/// Args:
/// with_added_tokens (:obj:`bool, defaults to :obj:`True`):
/// with_added_tokens (:obj:`bool`, defaults to :obj:`True`):
/// Whether to include the added tokens
///
/// Returns:
@@ -632,6 +633,8 @@ impl PyTokenizer {

/// Get the currently set truncation parameters
///
/// `Cannot set, use` :meth:`~tokenizers.Tokenizer.enable_truncation` `instead`
///
/// Returns:
/// (:obj:`dict`, `optional`):
/// A dict with the current truncation parameters if truncation is enabled
@@ -737,6 +740,8 @@ impl PyTokenizer {

/// Get the current padding parameters
///
/// `Cannot be set, use` :meth:`~tokenizers.Tokenizer.enable_padding` `instead`
///
/// Returns:
/// (:obj:`dict`, `optional`):
/// A dict with the current padding parameters if padding is enabled
@@ -797,7 +802,7 @@ impl PyTokenizer {
/// :class:`~tokenizers.Encoding`: The encoded result
///
#[args(pair = "None", is_pretokenized = "false", add_special_tokens = "true")]
#[text_signature = "($self, sequence, pair=None, is_pretokenized=False, add_special_tokens=True, /)"]
#[text_signature = "($self, sequence, pair=None, is_pretokenized=False, add_special_tokens=True)"]
fn encode(
&self,
sequence: &PyAny,
@@ -862,7 +867,7 @@ impl PyTokenizer {
/// A :obj:`List` of :class:`~tokenizers.Encoding`: The encoded batch
///
#[args(is_pretokenized = "false", add_special_tokens = "true")]
#[text_signature = "($self, input, is_pretokenized=False, add_special_tokens=True, /)"]
#[text_signature = "($self, input, is_pretokenized=False, add_special_tokens=True)"]
fn encode_batch(
&self,
input: Vec<&PyAny>,
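These bindings expose the batched counterparts of encode/decode; a sketch of the corresponding Python usage, on the same assumed toy setup as the earlier sketches:

```python
from tokenizers import Tokenizer, models, pre_tokenizers

tokenizer = Tokenizer(models.BPE())                      # assumed constructor
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
tokenizer.add_tokens(["hello", "world"])

encodings = tokenizer.encode_batch(["hello world", "world hello"])
ids = [e.ids for e in encodings]                         # e.g. [[0, 1], [1, 0]]
print(tokenizer.decode_batch(ids, skip_special_tokens=True))

# Already pre-tokenized input is also accepted when is_pretokenized=True
pretok = tokenizer.encode_batch([["hello", "world"]], is_pretokenized=True)
```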
@@ -891,37 +896,88 @@ impl PyTokenizer {
})
}

fn decode(&self, ids: Vec<u32>, skip_special_tokens: Option<bool>) -> PyResult<String> {
ToPyResult(
self.tokenizer
.decode(ids, skip_special_tokens.unwrap_or(true)),
)
.into()
/// Decode the given list of ids back to a string
///
/// This is used to decode anything coming back from a Language Model
///
/// Args:
/// ids (A :obj:`List/Tuple` of :obj:`int`):
/// The list of ids that we want to decode
///
/// skip_special_tokens (:obj:`bool`, defaults to :obj:`True`):
/// Whether the special tokens should be removed from the decoded string
///
/// Returns:
/// :obj:`str`: The decoded string
#[args(skip_special_tokens = true)]
#[text_signature = "($self, ids, skip_special_tokens=True)"]
fn decode(&self, ids: Vec<u32>, skip_special_tokens: bool) -> PyResult<String> {
ToPyResult(self.tokenizer.decode(ids, skip_special_tokens)).into()
}

/// Decode a batch of ids back to their corresponding string
///
/// Args:
/// sequences (:obj:`List` of :obj:`List[int]`):
/// The batch of sequences we want to decode
///
/// skip_special_tokens (:obj:`bool`, defaults to :obj:`True`):
/// Whether the special tokens should be removed from the decoded strings
///
/// Returns:
/// :obj:`List[str]`: A list of decoded strings
#[args(skip_special_tokens = true)]
#[text_signature = "($self, sequences, skip_special_tokens=True)"]
fn decode_batch(
&self,
sentences: Vec<Vec<u32>>,
skip_special_tokens: Option<bool>,
sequences: Vec<Vec<u32>>,
skip_special_tokens: bool,
) -> PyResult<Vec<String>> {
let gil = Python::acquire_gil();
gil.python().allow_threads(|| {
ToPyResult(
self.tokenizer
.decode_batch(sentences, skip_special_tokens.unwrap_or(true)),
)
.into()
ToPyResult(self.tokenizer.decode_batch(sequences, skip_special_tokens)).into()
})
}

/// Convert the given token to its corresponding id if it exists
///
/// Args:
/// token (:obj:`str`):
/// The token to convert
///
/// Returns:
/// :obj:`Optional[int]`: An optional id, :obj:`None` if out of vocabulary
#[text_signature = "($self, token)"]
fn token_to_id(&self, token: &str) -> Option<u32> {
self.tokenizer.token_to_id(token)
}

/// Convert the given id to its corresponding token if it exists
///
/// Args:
/// id (:obj:`int`):
/// The id to convert
///
/// Returns:
/// :obj:`Optional[str]`: An optional token, :obj:`None` if out of vocabulary
#[text_signature = "($self, id)"]
fn id_to_token(&self, id: u32) -> Option<&str> {
self.tokenizer.id_to_token(id)
}

/// Add the given tokens to the vocabulary
///
/// The given tokens are added only if they don't already exist in the vocabulary.
/// Each token is then given a new id.
///
/// Args:
/// tokens (A :obj:`List` of :class:`~tokenizers.AddedToken` or :obj:`str`):
/// The list of tokens we want to add to the vocabulary. Each token can be either a
/// string or an instance of :class:`~tokenizers.AddedToken` for more customization.
///
/// Returns:
/// :obj:`int`: The number of tokens that were created in the vocabulary
#[text_signature = "($self, tokens)"]
fn add_tokens(&mut self, tokens: &PyList) -> PyResult<usize> {
let tokens = tokens
.into_iter()
@@ -942,6 +998,23 @@ impl PyTokenizer {
Ok(self.tokenizer.add_tokens(&tokens))
}

/// Add the given special tokens to the Tokenizer.
///
/// If these tokens are already part of the vocabulary, it just lets the Tokenizer know about
/// them. If they don't exist, the Tokenizer creates them, giving them a new id.
///
/// These special tokens will never be processed by the model (i.e. won't be split into
/// multiple tokens), and they can be removed from the output when decoding.
///
/// Args:
/// tokens (A :obj:`List` of :class:`~tokenizers.AddedToken` or :obj:`str`):
/// The list of special tokens we want to add to the vocabulary. Each token can either
/// be a string or an instance of :class:`~tokenizers.AddedToken` for more
/// customization.
///
/// Returns:
/// :obj:`int`: The number of tokens that were created in the vocabulary
#[text_signature = "($self, tokens)"]
fn add_special_tokens(&mut self, tokens: &PyList) -> PyResult<usize> {
let tokens = tokens
.into_iter()
@@ -968,7 +1041,30 @@ impl PyTokenizer {
.allow_threads(|| ToPyResult(self.tokenizer.train_and_replace(trainer, files)).into())
}

/// Apply all the post-processing steps to the given encodings.
///
/// The various steps are:
///
/// 1. Truncate according to the set truncation params (provided with
/// :meth:`~tokenizers.Tokenizer.enable_truncation`)
/// 2. Apply the :class:`~tokenizers.processors.PostProcessor`
/// 3. Pad according to the set padding params (provided with
/// :meth:`~tokenizers.Tokenizer.enable_padding`)
///
/// Args:
/// encoding (:class:`~tokenizers.Encoding`):
/// The :class:`~tokenizers.Encoding` corresponding to the main sequence.
///
/// pair (:class:`~tokenizers.Encoding`, `optional`):
/// An optional :class:`~tokenizers.Encoding` corresponding to the pair sequence.
///
/// add_special_tokens (:obj:`bool`):
/// Whether to add the special tokens
///
/// Returns:
/// :class:`~tokenizers.Encoding`: The final post-processed encoding
#[args(pair = "None", add_special_tokens = true)]
#[text_signature = "($self, encoding, pair=None, add_special_tokens=True)"]
fn post_process(
&self,
encoding: &PyEncoding,
@@ -987,16 +1083,19 @@ impl PyTokenizer {
.into()
}

/// The :class:`~tokenizers.models.Model` in use by the Tokenizer
#[getter]
fn get_model(&self) -> PyResult<PyObject> {
self.tokenizer.get_model().get_as_subtype()
}

/// Set the :class:`~tokenizers.models.Model`
#[setter]
fn set_model(&mut self, model: PyRef<PyModel>) {
self.tokenizer.with_model(model.clone());
}

/// The `optional` :class:`~tokenizers.normalizers.Normalizer` in use by the Tokenizer
#[getter]
fn get_normalizer(&self) -> PyResult<PyObject> {
if let Some(n) = self.tokenizer.get_normalizer() {
@@ -1006,11 +1105,13 @@ impl PyTokenizer {
}
}

/// Set the :class:`~tokenizers.normalizers.Normalizer`
#[setter]
fn set_normalizer(&mut self, normalizer: PyRef<PyNormalizer>) {
self.tokenizer.with_normalizer(normalizer.clone());
}

/// The `optional` :class:`~tokenizers.pre_tokenizers.PreTokenizer` in use by the Tokenizer
#[getter]
fn get_pre_tokenizer(&self) -> PyResult<PyObject> {
if let Some(pt) = self.tokenizer.get_pre_tokenizer() {
@@ -1020,11 +1121,13 @@ impl PyTokenizer {
}
}

/// Set the :class:`~tokenizers.pre_tokenizers.PreTokenizer`
#[setter]
fn set_pre_tokenizer(&mut self, pretok: PyRef<PyPreTokenizer>) {
self.tokenizer.with_pre_tokenizer(pretok.clone());
}

/// The `optional` :class:`~tokenizers.processors.PostProcessor` in use by the Tokenizer
#[getter]
fn get_post_processor(&self) -> PyResult<PyObject> {
if let Some(n) = self.tokenizer.get_post_processor() {
@@ -1034,11 +1137,13 @@ impl PyTokenizer {
}
}

/// Set the :class:`~tokenizers.processors.PostProcessor`
#[setter]
fn set_post_processor(&mut self, processor: PyRef<PyPostProcessor>) {
self.tokenizer.with_post_processor(processor.clone());
}

/// The `optional` :class:`~tokenizers.decoders.Decoder` in use by the Tokenizer
#[getter]
fn get_decoder(&self) -> PyResult<PyObject> {
if let Some(dec) = self.tokenizer.get_decoder() {
@@ -1048,6 +1153,7 @@ impl PyTokenizer {
}
}

/// Set the :class:`~tokenizers.decoders.Decoder`
#[setter]
fn set_decoder(&mut self, decoder: PyRef<PyDecoder>) {
self.tokenizer.with_decoder(decoder.clone());
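The getters and setters above surface each pipeline component as a plain Python attribute. A sketch of what that looks like from the Python side; the concrete `normalizers.Lowercase`, `pre_tokenizers.Whitespace` and `decoders.BPEDecoder` classes are assumptions, not part of this diff.

```python
from tokenizers import Tokenizer, models, normalizers, pre_tokenizers, decoders

tokenizer = Tokenizer(models.BPE())                # assumed constructor

# Each pipeline component is both readable and assignable:
tokenizer.normalizer = normalizers.Lowercase()
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
tokenizer.decoder = decoders.BPEDecoder()

print(tokenizer.model)            # the Model currently in use
print(tokenizer.normalizer)       # the optional Normalizer
print(tokenizer.post_processor)   # None here, since no PostProcessor was assigned
```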