Doc - API Reference for most Tokenizer methods/attributes

Anthony MOI
2020-10-07 13:12:07 -04:00
committed by Anthony MOI
parent 8c0370657e
commit a86d49634c
2 changed files with 304 additions and 151 deletions


@ -422,9 +422,37 @@ class Encoding:
pass
class AddedToken:
"""AddedToken represents a token to be added to a Tokenizer
"""AddedToken
An AddedToken can have special options defining the way it should behave.
Represents a token that can be added to a :class:`~tokenizers.Tokenizer`.
It can have special options that define the way it should behave.
Args:
content (:obj:`str`): The content of the token
single_word (:obj:`bool`, defaults to :obj:`False`):
Defines whether this token should only match single words. If :obj:`True`, this
token will never match inside of a word. For example the token ``ing`` would match
on ``tokenizing`` if this option is :obj:`False`, but not if it is :obj:`True`.
The notion of "`inside of a word`" is defined by the word boundaries pattern in
regular expressions (ie. the token should start and end with word boundaries).
lstrip (:obj:`bool`, defaults to :obj:`False`):
Defines whether this token should strip all potential whitespaces on its left side.
If :obj:`True`, this token will greedily match any whitespace on its left. For
example if we try to match the token ``[MASK]`` with ``lstrip=True``, in the text
``"I saw a [MASK]"``, we would match on ``" [MASK]"``. (Note the space on the left).
rstrip (:obj:`bool`, defaults to :obj:`False`):
Defines whether this token should strip all potential whitespaces on its right
side. If :obj:`True`, this token will greedily match any whitespace on its right.
It works just like :obj:`lstrip` but on the right.
normalized (:obj:`bool`, defaults to :obj:`True` with :meth:`~tokenizers.Tokenizer.add_tokens` and :obj:`False` with :meth:`~tokenizers.Tokenizer.add_special_tokens`):
Defines whether this token should match against the normalized version of the input
text. For example, with the added token ``"yesterday"``, and a normalizer in charge of
lowercasing the text, the token could be extracted from the input ``"I saw a lion
Yesterday"``.
"""
def __new__(
@ -438,55 +466,54 @@ class AddedToken:
"""Instantiate a new AddedToken
Args:
content: str:
The content of the token
content (:obj:`str`): The content of the token
single_word: bool
Whether this token should only match against single words. If True,
this token will never match inside of a word. For example the token `ing` would
match on `tokenizing` if this option is False, but not if this option is True.
single_word (:obj:`bool`, defaults to :obj:`False`):
Defines whether this token should only match single words. If :obj:`True`, this
token will never match inside of a word. For example the token ``ing`` would match
on ``tokenizing`` if this option is :obj:`False`, but not if it is :obj:`True`.
The notion of "`inside of a word`" is defined by the word boundaries pattern in
regular expressions (ie. the token should start and end with word boundaries).
lstrip: bool
Whether this token should strip all potential whitespaces on the left side.
If True, this token will greedily match any whitespace on the left. For example,
if we try to match the token `[MASK]` with lstrip=True, in the text `I saw a [MASK]`
we will match on ` [MASK]`.
lstrip (:obj:`bool`, defaults to :obj:`False`):
Defines whether this token should strip all potential whitespaces on its left side.
If :obj:`True`, this token will greedily match any whitespace on its left. For
example if we try to match the token ``[MASK]`` with ``lstrip=True``, in the text
``"I saw a [MASK]"``, we would match on ``" [MASK]"``. (Note the space on the left).
rstrip: bool
Whether this token should strip all potential whitespaces on the right side.
If True, this token will greedily match any whitespace on the right. It works just
like lstrip, but on the right.
rstrip (:obj:`bool`, defaults to :obj:`False`):
Defines whether this token should strip all potential whitespaces on its right
side. If :obj:`True`, this token will greedily match any whitespace on its right.
It works just like :obj:`lstrip` but on the right.
normalized: bool:
Whether this token should be match the normalized version of the input text. For
example, with the added token `yesterday` and a normalizer in charge of lowercasing
the text, the token could be extracted from the input `I saw a lion Yesterday`.
normalized (:obj:`bool`, defaults to :obj:`True` with :meth:`~tokenizers.Tokenizer.add_tokens` and :obj:`False` with :meth:`~tokenizers.Tokenizer.add_special_tokens`):
Defines whether this token should match against the normalized version of the input
text. For example, with the added token ``"yesterday"``, and a normalizer in charge of
lowercasing the text, the token could be extracted from the input ``"I saw a lion
Yesterday"``.
"""
pass
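As an illustration of the options above, a minimal sketch (the token contents are arbitrary examples):

from tokenizers import AddedToken

# A mask-like token that also swallows surrounding whitespace
mask = AddedToken("[MASK]", lstrip=True, rstrip=True)
# A token that only matches whole words, against the normalized text
word = AddedToken("yesterday", single_word=True, normalized=True)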
class Tokenizer:
"""Tokenizer
A Tokenizer works as a pipeline, it processes some raw text as input and outputs
an `Encoding`.
A :obj:`Tokenizer` works as a pipeline. It processes some raw text as input
and outputs an :class:`~tokenizers.Encoding`.
The various steps of the pipeline are:
1. The `Normalizer`: in charge of normalizing the text. Common examples of
normalization are the unicode normalization standards, such as NFD or NFKC.
2. The `PreTokenizer`: in charge of creating initial words splits in the text.
The most common way of splitting text is simply on whitespace.
3. The `Model`: in charge of doing the actual tokenization. An example of a
`Model` would be `BPE` or `WordPiece`.
4. The `PostProcessor`: in charge of post-processing the `Encoding` to add anything
relevant that, for example, a language model would need, such as special tokens.
Args:
model (:class:`~tokenizers.models.Model`):
The core algorithm that this :obj:`Tokenizer` should be using.
"""
def __new__(cls, model: models.Model) -> Tokenizer:
"""Instantiate a new Tokenizer using the given Model
A :obj:`Tokenizer` works as a pipeline. It processes some raw text as input
and outputs an :class:`~tokenizers.Encoding`.
Args:
model: models.Model:
The model to be used with this Tokenizer
model (:class:`~tokenizers.models.Model`):
The core algorithm that this :obj:`Tokenizer` should be using.
Returns:
Tokenizer
@ -494,57 +521,62 @@ class Tokenizer:
pass
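A minimal construction sketch; BPE is just one example, any :class:`~tokenizers.models.Model` works here:

from tokenizers import Tokenizer
from tokenizers.models import BPE

# An empty BPE model; it would normally be trained or loaded with a vocabulary
tokenizer = Tokenizer(BPE(unk_token="[UNK]"))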
@staticmethod
def from_str(s: str) -> Tokenizer:
"""Instantiate a new Tokenizer from the given JSON string
"""Instantiate a new :class:`~tokenizers.Tokenizer` from the given JSON string.
Args:
s: str:
A JSON string representation of the Tokenizer
json (:obj:`str`):
A valid JSON string representing a previously serialized
:class:`~tokenizers.Tokenizer`
Returns:
Tokenizer
:class:`~tokenizers.Tokenizer`: The new tokenizer
"""
pass
@staticmethod
def from_file(path: str) -> Tokenizer:
"""Instantiate a new Tokenizer from the given file
"""Instantiate a new :class:`~tokenizers.Tokenizer` from the file at the given path.
Args:
path: str:
Path to a file containing a Tokenizer
path (:obj:`str`):
A path to a local JSON file representing a previously serialized
:class:`~tokenizers.Tokenizer`
Returns:
Tokenizer
:class:`~tokenizers.Tokenizer`: The new tokenizer
"""
pass
@staticmethod
def from_buffer(buffer: bytes) -> Tokenizer:
"""Instantiate a new Tokenizer from the given buffer
"""Instantiate a new :class:`~tokenizers.Tokenizer` from the given buffer.
Args:
buffer: bytes:
A buffer used to instantiate a new Tokenizer
buffer (:obj:`bytes`):
A buffer containing a previously serialized :class:`~tokenizers.Tokenizer`
Returns:
Tokenizer
:class:`~tokenizers.Tokenizer`: The new tokenizer
"""
pass
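The three constructors above read the same serialized form; a sketch assuming a previously saved ./tokenizer.json exists:

from tokenizers import Tokenizer

tokenizer = Tokenizer.from_file("tokenizer.json")

# Equivalent, going through the raw content instead of the path
with open("tokenizer.json", "rb") as f:
    data = f.read()
same_from_buffer = Tokenizer.from_buffer(data)
same_from_str = Tokenizer.from_str(data.decode("utf-8"))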
def to_str(self, pretty: bool = False) -> str:
"""Get a serialized JSON version of the Tokenizer as a str
"""Gets a serialized string representing this :class:`~tokenizers.Tokenizer`.
Args:
pretty: bool:
Whether the JSON string should be prettified
pretty (:obj:`bool`, defaults to :obj:`False`):
Whether the JSON string should be pretty formatted.
Returns:
str
:obj:`str`: A string representing the serialized Tokenizer
"""
pass
def save(self, path: str, pretty: bool = False):
"""Save the Tokenizer as JSON to the given path
"""Save the :class:`~tokenizers.Tokenizer` to the file at the given path.
Args:
pretty: bool:
Whether the JSON string should be prettified
path (:obj:`str`):
A path to a file in which to save the serialized tokenizer.
pretty (:obj:`bool`, defaults to :obj:`False`):
Whether the JSON file should be pretty formatted.
"""
pass
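Conversely, serialization can target a string or a file; a sketch assuming `tokenizer` is any Tokenizer instance and "tokenizer.json" is an arbitrary path:

json_str = tokenizer.to_str(pretty=True)   # human-readable JSON string
tokenizer.save("tokenizer.json")           # compact JSON file by default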
@property
@ -593,40 +625,41 @@ class Tokenizer:
"""
pass
def get_vocab(self, with_added_tokens: bool = True) -> Dict[str, int]:
"""Returns the vocabulary
"""Get the underlying vocabulary
Args:
with_added_tokens: boolean:
Whether to include the added tokens in the vocabulary
with_added_tokens (:obj:`bool`, defaults to :obj:`True`):
Whether to include the added tokens
Returns:
The vocabulary
:obj:`Dict[str, int]`: The vocabulary
"""
pass
def get_vocab_size(self, with_added_tokens: bool = True) -> int:
"""Returns the size of the vocabulary
"""Get the size of the underlying vocabulary
Args:
with_added_tokens: boolean:
Whether to include the added tokens in the vocabulary's size
with_added_tokens (:obj:`bool`, defaults to :obj:`True`):
Whether to include the added tokens
Returns:
The size of the vocabulary
:obj:`int`: The size of the vocabulary
"""
pass
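For example, assuming `tokenizer` is a Tokenizer instance:

vocab = tokenizer.get_vocab()                                  # Dict[str, int], added tokens included
base_size = tokenizer.get_vocab_size(with_added_tokens=False)  # model vocabulary only
full_size = tokenizer.get_vocab_size()                         # defaults to with_added_tokens=True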
def enable_truncation(self, max_length: int, stride: Optional[int], strategy: Optional[str]):
"""Enable the truncation
"""Enable truncation
Args:
max_length: unsigned int:
The maximum length at which to truncate
max_length (:obj:`int`):
The max length at which to truncate
stride: (`optional`) unsigned int:
The length of the previous first sequence to be included
in the overflowing sequence
stride (:obj:`int`, `optional`):
The length of the previous first sequence to be included in the overflowing
sequence
strategy: (`optional) str:
Can be one of `longest_first`, `only_first` or `only_second`
strategy (:obj:`str`, `optional`, defaults to :obj:`longest_first`):
The strategy used for truncation. Can be one of ``longest_first``, ``only_first`` or
``only_second``.
"""
pass
def no_truncation(self):
@ -634,11 +667,13 @@ class Tokenizer:
pass
@property
def truncation(self) -> Optional[dict]:
"""Get the current truncation parameters
"""Get the currently set truncation parameters
`Cannot be set, use` :meth:`~tokenizers.Tokenizer.enable_truncation` `instead`
Returns:
None if truncation is disabled, a dict with the current truncation parameters if
truncation is enabled
(:obj:`dict`, `optional`):
A dict with the current truncation parameters if truncation is enabled
"""
pass
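A sketch of the truncation controls described above, assuming `tokenizer` is a Tokenizer instance (parameter values are arbitrary):

tokenizer.enable_truncation(max_length=512, stride=32, strategy="longest_first")
print(tokenizer.truncation)   # dict with the parameters set above
tokenizer.no_truncation()
print(tokenizer.truncation)   # None once truncation is disabled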
def enable_padding(
@ -653,26 +688,26 @@ class Tokenizer:
"""Enable the padding
Args:
direction: (`optional`) str:
Can be one of: `right` or `left`
direction (:obj:`str`, `optional`, defaults to :obj:`right`):
The direction in which to pad. Can be either ``right`` or ``left``
pad_to_multiple_of: (`optional`) unsigned int:
If specified, the padding length should always snap to the next multiple of
the given value. For example if we were going to pad with a length of 250 but
`pad_to_multiple_of=8` then we will pad to 256.
pad_to_multiple_of (:obj:`int`, `optional`):
If specified, the padding length should always snap to the next multiple of the
given value. For example if we were going to pad with a length of 250 but
``pad_to_multiple_of=8`` then we will pad to 256.
pad_id: (`optional`) unsigned int:
The index to be used when padding
pad_id (:obj:`int`, defaults to 0):
The id to be used when padding
pad_type_id: (`optional`) unsigned int:
The type index to be used when padding
pad_type_id (:obj:`int`, defaults to 0):
The type id to be used when padding
pad_token: (`optional`) str:
pad_token (:obj:`str`, defaults to :obj:`[PAD]`):
The pad token to be used when padding
length: (`optional`) unsigned int:
If specified, the length at which to pad. If not specified
we pad using the size of the longest sequence in a batch
length (:obj:`int`, `optional`):
If specified, the length at which to pad. If not specified we pad using the size of
the longest sequence in a batch.
"""
pass
def no_padding(self):
@ -682,9 +717,11 @@ class Tokenizer:
def padding(self) -> Optional[dict]:
"""Get the current padding parameters
`Cannot be set, use` :meth:`~tokenizers.Tokenizer.enable_padding` `instead`
Returns:
None if padding is disabled, a dict with the currently set parameters
if the padding is enabled.
(:obj:`dict`, `optional`):
A dict with the current padding parameters if padding is enabled
"""
pass
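The padding controls mirror the truncation ones (values are arbitrary examples):

tokenizer.enable_padding(pad_id=3, pad_token="[PAD]", pad_to_multiple_of=8)
print(tokenizer.padding)   # dict with the current padding parameters
tokenizer.no_padding()     # tokenizer.padding becomes None again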
def encode(
@ -694,8 +731,7 @@ class Tokenizer:
is_pretokenized: bool = False,
add_special_tokens: bool = True,
) -> Encoding:
"""
Encode the given sequence and pair. This method can process raw text sequences
"""Encode the given sequence and pair. This method can process raw text sequences
as well as already pre-tokenized sequences.
Example:
@ -736,8 +772,7 @@ class Tokenizer:
is_pretokenized: bool = False,
add_special_tokens: bool = True,
) -> List[Encoding]:
"""
Encode the given batch of inputs. This method accepts both raw text sequences
"""Encode the given batch of inputs. This method accepts both raw text sequences
as well as already pre-tokenized sequences.
Example:
@ -771,82 +806,91 @@ class Tokenizer:
"""
pass
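A usage sketch, assuming a trained tokenizer serialized at ./tokenizer.json (inputs are arbitrary examples):

from tokenizers import Tokenizer

tokenizer = Tokenizer.from_file("tokenizer.json")

# Single sequence, optionally with its pair
encoding = tokenizer.encode("Hello, y'all!", "And its pair")
print(encoding.tokens, encoding.ids)

# Batch of single sequences or of (sequence, pair) tuples
encodings = tokenizer.encode_batch([("Hello, y'all!", "And its pair"), ("Another one", "Its pair")])

# Already pre-tokenized input
pretok = tokenizer.encode(["Hello", ",", "y'all", "!"], is_pretokenized=True)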
def decode(self, ids: List[int], skip_special_tokens: Optional[bool] = True) -> str:
"""Decode the given list of ids to a string sequence
"""Decode the given list of ids back to a string
This is used to decode anything coming back from a Language Model
Args:
ids: List[unsigned int]:
A list of ids to be decoded
ids (A :obj:`List/Tuple` of :obj:`int`):
The list of ids that we want to decode
skip_special_tokens: (`optional`) boolean:
Whether to remove all the special tokens from the output string
skip_special_tokens (:obj:`bool`, defaults to :obj:`True`):
Whether the special tokens should be removed from the decoded string
Returns:
The decoded string
:obj:`str`: The decoded string
"""
pass
def decode_batch(
self, sequences: List[List[int]], skip_special_tokens: Optional[bool] = True
) -> List[str]:
"""Decode the list of sequences to a list of string sequences
"""Decode a batch of ids back to their corresponding string
Args:
sequences: List[List[unsigned int]]:
A list of sequence of ids to be decoded
sequences (:obj:`List` of :obj:`List[int]`):
The batch of sequences we want to decode
skip_special_tokens: (`optional`) boolean:
Whether to remove all the special tokens from the output strings
skip_special_tokens (:obj:`bool`, defaults to :obj:`True`):
Whether the special tokens should be removed from the decoded strings
Returns:
A list of decoded strings
:obj:`List[str]`: A list of decoded strings
"""
pass
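For example, round-tripping ids back to text (the ids below are placeholders assumed to exist in the vocabulary):

text = tokenizer.decode([1, 25, 66, 2])                        # special tokens skipped by default
raw = tokenizer.decode([1, 25, 66, 2], skip_special_tokens=False)
texts = tokenizer.decode_batch([[1, 25, 66, 2], [1, 42, 2]])   # one string per inner list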
def token_to_id(self, token: str) -> Optional[int]:
"""Convert the given token to its corresponding id
"""Convert the given token to its corresponding id if it exists
Args:
token: str:
token (:obj:`str`):
The token to convert
Returns:
The corresponding id if it exists, None otherwise
:obj:`Optional[int]`: An optional id, :obj:`None` if out of vocabulary
"""
pass
def id_to_token(self, id: int) -> Optional[str]:
"""Convert the given token id to its corresponding string
"""Convert the given id to its corresponding token if it exists
Args:
token: id:
The token id to convert
id (:obj:`int`):
The id to convert
Returns:
The corresponding string if it exists, None otherwise
:obj:`Optional[str]`: An optional token, :obj:`None` if out of vocabulary
"""
pass
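Both lookups return None for out-of-vocabulary values, for example:

cls_id = tokenizer.token_to_id("[CLS]")   # None if "[CLS]" was never added
token = tokenizer.id_to_token(0)          # None if id 0 does not exist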
def add_tokens(self, tokens: List[Union[str, AddedToken]]) -> int:
"""Add the given tokens to the vocabulary
The given tokens are added only if they don't already exist in the vocabulary.
Each token is then attributed a new id.
Args:
tokens: List[Union[str, AddedToken]]:
A list of tokens to add to the vocabulary. Each token can either be
a string, or an instance of AddedToken
tokens (A :obj:`List` of :class:`~tokenizers.AddedToken` or :obj:`str`):
The list of tokens we want to add to the vocabulary. Each token can be either a
string or an instance of :class:`~tokenizers.AddedToken` for more customization.
Returns:
The number of tokens that were added to the vocabulary
:obj:`int`: The number of tokens that were created in the vocabulary
"""
pass
def add_special_tokens(self, tokens: List[Union[str, AddedToken]]) -> int:
"""Add the given special tokens to the vocabulary, and treat them as special tokens.
"""Add the given special tokens to the Tokenizer.
The special tokens will never be processed by the model, and will be
removed while decoding.
If these tokens are already part of the vocabulary, it just lets the Tokenizer know about
them. If they don't exist, the Tokenizer creates them, giving them a new id.
These special tokens will never be processed by the model (ie won't be split into
multiple tokens), and they can be removed from the output when decoding.
Args:
tokens: List[Union[str, AddedToken]]:
The list of special tokens to add. Each token can either be a string
or an instance of AddedToken
tokens (A :obj:`List` of :class:`~tokenizers.AddedToken` or :obj:`str`):
The list of special tokens we want to add to the vocabulary. Each token can either
be a string or an instance of :class:`~tokenizers.AddedToken` for more
customization.
Returns:
The number of tokens that were added to the vocabulary
:obj:`int`: The number of tokens that were created in the vocabulary
"""
pass
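A sketch mixing plain strings and AddedToken instances (token contents are arbitrary):

from tokenizers import AddedToken

n_added = tokenizer.add_tokens(["new_word", AddedToken("other_word", single_word=True)])
n_special = tokenizer.add_special_tokens(["[CLS]", "[SEP]", AddedToken("[MASK]", lstrip=True)])
print(n_added, n_special)   # number of tokens actually created in the vocabulary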
def post_process(
@ -858,21 +902,24 @@ class Tokenizer:
"""Apply all the post-processing steps to the given encodings.
The various steps are:
1. Truncate according to global params (provided to `enable_truncation`)
2. Apply the PostProcessor
3. Pad according to global params. (provided to `enable_padding`)
1. Truncate according to the set truncation params (provided with
:meth:`~tokenizers.Tokenizer.enable_truncation`)
2. Apply the :class:`~tokenizers.processors.PostProcessor`
3. Pad according to the set padding params (provided with
:meth:`~tokenizers.Tokenizer.enable_padding`)
Args:
encoding: Encoding:
The main Encoding to post process
encoding (:class:`~tokenizers.Encoding`):
The :class:`~tokenizers.Encoding` corresponding to the main sequence.
pair: Optional[Encoding]:
An optional pair Encoding
pair (:class:`~tokenizers.Encoding`, `optional`):
An optional :class:`~tokenizers.Encoding` corresponding to the pair sequence.
add_special_tokens: bool:
Whether to add special tokens
add_special_tokens (:obj:`bool`):
Whether to add the special tokens
Returns:
The resulting Encoding
:class:`~tokenizers.Encoding`: The final post-processed encoding
"""
pass
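A sketch of driving these steps manually, assuming truncation/padding were configured and the encodings were produced without special tokens:

main = tokenizer.encode("A first sequence", add_special_tokens=False)
pair = tokenizer.encode("And its pair", add_special_tokens=False)
final = tokenizer.post_process(main, pair, add_special_tokens=True)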


@ -53,7 +53,7 @@ use crate::processors::PyPostProcessor;
/// Yesterday"``.
///
#[pyclass(dict, module = "tokenizers", name=AddedToken)]
#[text_signature = "(content, **kwargs)"]
#[text_signature = "(content, single_word=False, lstrip=False, rstrip=False, normalized=True)"]
pub struct PyAddedToken {
pub content: String,
pub is_special_token: bool,
@ -408,6 +408,7 @@ type Tokenizer = TokenizerImpl<PyModel, PyNormalizer, PyPreTokenizer, PyPostProc
/// The core algorithm that this :obj:`Tokenizer` should be using.
///
#[pyclass(dict, module = "tokenizers", name=Tokenizer)]
#[text_signature = "(model)"]
#[derive(Clone)]
pub struct PyTokenizer {
tokenizer: Tokenizer,
@ -533,7 +534,7 @@ impl PyTokenizer {
/// path (:obj:`str`):
/// A path to a file in which to save the serialized tokenizer.
///
/// pretty (:obj:`bool`, defaults to :obj:`False):
/// pretty (:obj:`bool`, defaults to :obj:`False`):
/// Whether the JSON file should be pretty formatted.
#[args(pretty = false)]
#[text_signature = "($self, pretty=False)"]
@ -551,7 +552,7 @@ impl PyTokenizer {
/// Get the underlying vocabulary
///
/// Args:
/// with_added_tokens (:obj:`bool, defaults to :obj:`True`):
/// with_added_tokens (:obj:`bool`, defaults to :obj:`True`):
/// Whether to include the added tokens
///
/// Returns:
@ -565,7 +566,7 @@ impl PyTokenizer {
/// Get the size of the underlying vocabulary
///
/// Args:
/// with_added_tokens (:obj:`bool, defaults to :obj:`True`):
/// with_added_tokens (:obj:`bool`, defaults to :obj:`True`):
/// Whether to include the added tokens
///
/// Returns:
@ -632,6 +633,8 @@ impl PyTokenizer {
/// Get the currently set truncation parameters
///
/// `Cannot be set, use` :meth:`~tokenizers.Tokenizer.enable_truncation` `instead`
///
/// Returns:
/// (:obj:`dict`, `optional`):
/// A dict with the current truncation parameters if truncation is enabled
@ -737,6 +740,8 @@ impl PyTokenizer {
/// Get the current padding parameters
///
/// `Cannot be set, use` :meth:`~tokenizers.Tokenizer.enable_padding` `instead`
///
/// Returns:
/// (:obj:`dict`, `optional`):
/// A dict with the current padding parameters if padding is enabled
@ -797,7 +802,7 @@ impl PyTokenizer {
/// :class:`~tokenizers.Encoding`: The encoded result
///
#[args(pair = "None", is_pretokenized = "false", add_special_tokens = "true")]
#[text_signature = "($self, sequence, pair=None, is_pretokenized=False, add_special_tokens=True, /)"]
#[text_signature = "($self, sequence, pair=None, is_pretokenized=False, add_special_tokens=True)"]
fn encode(
&self,
sequence: &PyAny,
@ -862,7 +867,7 @@ impl PyTokenizer {
/// A :obj:`List` of :class:`~tokenizers.Encoding`: The encoded batch
///
#[args(is_pretokenized = "false", add_special_tokens = "true")]
#[text_signature = "($self, input, is_pretokenized=False, add_special_tokens=True, /)"]
#[text_signature = "($self, input, is_pretokenized=False, add_special_tokens=True)"]
fn encode_batch(
&self,
input: Vec<&PyAny>,
@ -891,37 +896,88 @@ impl PyTokenizer {
})
}
fn decode(&self, ids: Vec<u32>, skip_special_tokens: Option<bool>) -> PyResult<String> {
ToPyResult(
self.tokenizer
.decode(ids, skip_special_tokens.unwrap_or(true)),
)
.into()
/// Decode the given list of ids back to a string
///
/// This is used to decode anything coming back from a Language Model
///
/// Args:
/// ids (A :obj:`List/Tuple` of :obj:`int`):
/// The list of ids that we want to decode
///
/// skip_special_tokens (:obj:`bool`, defaults to :obj:`True`):
/// Whether the special tokens should be removed from the decoded string
///
/// Returns:
/// :obj:`str`: The decoded string
#[args(skip_special_tokens = true)]
#[text_signature = "($self, ids, skip_special_tokens=True)"]
fn decode(&self, ids: Vec<u32>, skip_special_tokens: bool) -> PyResult<String> {
ToPyResult(self.tokenizer.decode(ids, skip_special_tokens)).into()
}
/// Decode a batch of ids back to their corresponding strings
///
/// Args:
/// sequences (:obj:`List` of :obj:`List[int]`):
/// The batch of sequences we want to decode
///
/// skip_special_tokens (:obj:`bool`, defaults to :obj:`True`):
/// Whether the special tokens should be removed from the decoded strings
///
/// Returns:
/// :obj:`List[str]`: A list of decoded strings
#[args(skip_special_tokens = true)]
#[text_signature = "($self, sequences, skip_special_tokens=True)"]
fn decode_batch(
&self,
sentences: Vec<Vec<u32>>,
skip_special_tokens: Option<bool>,
sequences: Vec<Vec<u32>>,
skip_special_tokens: bool,
) -> PyResult<Vec<String>> {
let gil = Python::acquire_gil();
gil.python().allow_threads(|| {
ToPyResult(
self.tokenizer
.decode_batch(sentences, skip_special_tokens.unwrap_or(true)),
)
.into()
ToPyResult(self.tokenizer.decode_batch(sequences, skip_special_tokens)).into()
})
}
/// Convert the given token to its corresponding id if it exists
///
/// Args:
/// token (:obj:`str`):
/// The token to convert
///
/// Returns:
/// :obj:`Optional[int]`: An optional id, :obj:`None` if out of vocabulary
#[text_signature = "($self, token)"]
fn token_to_id(&self, token: &str) -> Option<u32> {
self.tokenizer.token_to_id(token)
}
/// Convert the given id to its corresponding token if it exists
///
/// Args:
/// id (:obj:`int`):
/// The id to convert
///
/// Returns:
/// :obj:`Optional[str]`: An optional token, :obj:`None` if out of vocabulary
#[text_signature = "($self, id)"]
fn id_to_token(&self, id: u32) -> Option<&str> {
self.tokenizer.id_to_token(id)
}
/// Add the given tokens to the vocabulary
///
/// The given tokens are added only if they don't already exist in the vocabulary.
/// Each token is then attributed a new id.
///
/// Args:
/// tokens (A :obj:`List` of :class:`~tokenizers.AddedToken` or :obj:`str`):
/// The list of tokens we want to add to the vocabulary. Each token can be either a
/// string or an instance of :class:`~tokenizers.AddedToken` for more customization.
///
/// Returns:
/// :obj:`int`: The number of tokens that were created in the vocabulary
#[text_signature = "($self, tokens)"]
fn add_tokens(&mut self, tokens: &PyList) -> PyResult<usize> {
let tokens = tokens
.into_iter()
@ -942,6 +998,23 @@ impl PyTokenizer {
Ok(self.tokenizer.add_tokens(&tokens))
}
/// Add the given special tokens to the Tokenizer.
///
/// If these tokens are already part of the vocabulary, it just lets the Tokenizer know about
/// them. If they don't exist, the Tokenizer creates them, giving them a new id.
///
/// These special tokens will never be processed by the model (ie won't be split into
/// multiple tokens), and they can be removed from the output when decoding.
///
/// Args:
/// tokens (A :obj:`List` of :class:`~tokenizers.AddedToken` or :obj:`str`):
/// The list of special tokens we want to add to the vocabulary. Each token can either
/// be a string or an instance of :class:`~tokenizers.AddedToken` for more
/// customization.
///
/// Returns:
/// :obj:`int`: The number of tokens that were created in the vocabulary
#[text_signature = "($self, tokens)"]
fn add_special_tokens(&mut self, tokens: &PyList) -> PyResult<usize> {
let tokens = tokens
.into_iter()
@ -968,7 +1041,30 @@ impl PyTokenizer {
.allow_threads(|| ToPyResult(self.tokenizer.train_and_replace(trainer, files)).into())
}
/// Apply all the post-processing steps to the given encodings.
///
/// The various steps are:
///
/// 1. Truncate according to the set truncation params (provided with
/// :meth:`~tokenizers.Tokenizer.enable_truncation`)
/// 2. Apply the :class:`~tokenizers.processors.PostProcessor`
/// 3. Pad according to the set padding params (provided with
/// :meth:`~tokenizers.Tokenizer.enable_padding`)
///
/// Args:
/// encoding (:class:`~tokenizers.Encoding`):
/// The :class:`~tokenizers.Encoding` corresponding to the main sequence.
///
/// pair (:class:`~tokenizers.Encoding`, `optional`):
/// An optional :class:`~tokenizers.Encoding` corresponding to the pair sequence.
///
/// add_special_tokens (:obj:`bool`):
/// Whether to add the special tokens
///
/// Returns:
/// :class:`~tokenizers.Encoding`: The final post-processed encoding
#[args(pair = "None", add_special_tokens = true)]
#[text_signature = "($self, encoding, pair=None, add_special_tokens=True)"]
fn post_process(
&self,
encoding: &PyEncoding,
@ -987,16 +1083,19 @@ impl PyTokenizer {
.into()
}
/// The :class:`~tokenizers.models.Model` in use by the Tokenizer
#[getter]
fn get_model(&self) -> PyResult<PyObject> {
self.tokenizer.get_model().get_as_subtype()
}
/// Set the :class:`~tokenizers.models.Model`
#[setter]
fn set_model(&mut self, model: PyRef<PyModel>) {
self.tokenizer.with_model(model.clone());
}
/// The `optional` :class:`~tokenizers.normalizers.Normalizer` in use by the Tokenizer
#[getter]
fn get_normalizer(&self) -> PyResult<PyObject> {
if let Some(n) = self.tokenizer.get_normalizer() {
@ -1006,11 +1105,13 @@ impl PyTokenizer {
}
}
/// Set the :class:`~tokenizers.normalizers.Normalizer`
#[setter]
fn set_normalizer(&mut self, normalizer: PyRef<PyNormalizer>) {
self.tokenizer.with_normalizer(normalizer.clone());
}
/// The `optional` :class:`~tokenizers.pre_tokenizers.PreTokenizer` in use by the Tokenizer
#[getter]
fn get_pre_tokenizer(&self) -> PyResult<PyObject> {
if let Some(pt) = self.tokenizer.get_pre_tokenizer() {
@ -1020,11 +1121,13 @@ impl PyTokenizer {
}
}
/// Set the :class:`~tokenizers.pre_tokenizers.PreTokenizer`
#[setter]
fn set_pre_tokenizer(&mut self, pretok: PyRef<PyPreTokenizer>) {
self.tokenizer.with_pre_tokenizer(pretok.clone());
}
/// The `optional` :class:`~tokenizers.processors.PostProcessor` in use by the Tokenizer
#[getter]
fn get_post_processor(&self) -> PyResult<PyObject> {
if let Some(n) = self.tokenizer.get_post_processor() {
@ -1034,11 +1137,13 @@ impl PyTokenizer {
}
}
/// Set the :class:`~tokenizers.processors.PostProcessor`
#[setter]
fn set_post_processor(&mut self, processor: PyRef<PyPostProcessor>) {
self.tokenizer.with_post_processor(processor.clone());
}
/// The `optional` :class:`~tokenizers.decoders.Decoder` in use by the Tokenizer
#[getter]
fn get_decoder(&self) -> PyResult<PyObject> {
if let Some(dec) = self.tokenizer.get_decoder() {
@ -1048,6 +1153,7 @@ impl PyTokenizer {
}
}
/// Set the :class:`~tokenizers.decoders.Decoder`
#[setter]
fn set_decoder(&mut self, decoder: PyRef<PyDecoder>) {
self.tokenizer.with_decoder(decoder.clone());
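On the Python side these getters/setters surface as plain attributes; a sketch (the normalizer and pre-tokenizer choices are arbitrary examples from the library):

from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.normalizers import Lowercase
from tokenizers.pre_tokenizers import Whitespace

tokenizer = Tokenizer(BPE())
tokenizer.normalizer = Lowercase()       # optional, None by default
tokenizer.pre_tokenizer = Whitespace()   # optional, None by default
print(tokenizer.model, tokenizer.normalizer, tokenizer.pre_tokenizer)

The post_processor and decoder attributes behave the same way.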