mirror of https://github.com/mii443/tokenizers.git
Doc - Update API Reference for Encoding
@@ -255,130 +255,184 @@ class Regex:
         pass

 class Encoding:
-    """ An Encoding as returned by the Tokenizer """
+    """
+    The :class:`~tokenizers.Encoding` represents the output of a :class:`~tokenizers.Tokenizer`.
+    """

     @staticmethod
     def merge(encodings: List[Encoding], growing_offsets: bool = True) -> Encoding:
-        """Merge the list of Encoding into one final Encoding
+        """Merge the list of encodings into one final :class:`~tokenizers.Encoding`

         Args:
-            encodings: List[Encoding]:
-                The list of encodings
+            encodings (A :obj:`List` of :class:`~tokenizers.Encoding`):
+                The list of encodings that should be merged into one

-            growing_offsets: bool:
+            growing_offsets (:obj:`bool`, defaults to :obj:`True`):
                 Whether the offsets should accumulate while merging

         Returns:
-            The resulting Encoding
+            :class:`~tokenizers.Encoding`: The resulting Encoding
         """
         pass
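
As a usage sketch (not taken from this diff), merging two encodings might look like the following, assuming a tokenizer serialized to a hypothetical tokenizer.json and loaded with Tokenizer.from_file:

    from tokenizers import Tokenizer, Encoding

    tokenizer = Tokenizer.from_file("tokenizer.json")  # hypothetical path
    first = tokenizer.encode("Hello there")
    second = tokenizer.encode("General Kenobi")

    # With growing_offsets=True (the default), the offsets of `second`
    # keep increasing past the end of `first` instead of restarting at 0
    merged = Encoding.merge([first, second], growing_offsets=True)
    print(merged.tokens)
    print(merged.offsets)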
     @property
     def ids(self) -> List[int]:
-        """ The tokenized ids """
+        """The generated IDs
+
+        The IDs are the main input to a Language Model. They are the token indices,
+        the numerical representations that a LM understands.
+
+        Returns:
+            :obj:`List[int]`: The list of IDs
+        """
         pass
     @property
     def tokens(self) -> List[str]:
-        """ The tokenized strings """
+        """The generated tokens
+
+        They are the string representation of the IDs.
+
+        Returns:
+            :obj:`List[str]`: The list of tokens
+        """
         pass
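
A short sketch of how ids and tokens line up, reusing the tokenizer loaded in the sketch above:

    encoding = tokenizer.encode("Hello there")

    # The two lists are parallel: tokens[i] is the string form of ids[i]
    for token_id, token in zip(encoding.ids, encoding.tokens):
        print(token_id, token)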
     @property
     def words(self) -> List[Optional[int]]:
-        """ The tokenized words index """
+        """The generated word indices.
+
+        They represent the index of the word associated with each token.
+        When the input is pre-tokenized, they correspond to the ID of the given input label,
+        otherwise they correspond to the word indices as defined by the
+        :class:`~tokenizers.pre_tokenizers.PreTokenizer` that was used.
+
+        For special tokens and such (any token that was generated from something that was
+        not part of the input), the output is :obj:`None`
+
+        Returns:
+            A :obj:`List` of :obj:`Optional[int]`: A list of optional word indices.
+        """
         pass
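
A sketch of the pre-tokenized case, where each word index points back into the input list (the exact tokens depend on the vocabulary in use):

    encoding = tokenizer.encode(["Hello", "there"], is_pretokenized=True)

    # Every token points back to the word it came from; tokens added by a
    # post-processor (e.g. [CLS]/[SEP]) would show up as None
    print(encoding.words)  # e.g. [0, 1]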
     @property
     def type_ids(self) -> List[int]:
-        """ The type ids """
+        """The generated type IDs
+
+        Generally used for tasks like sequence classification or question answering,
+        these tokens let the LM know which input sequence corresponds to each token.
+
+        Returns:
+            :obj:`List[int]`: The list of type ids
+        """
         pass
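
A sketch with a pair of sequences; with a BERT-like setup, tokens from the first sequence get type ID 0 and tokens from the second get 1:

    pair = tokenizer.encode("What is the capital?", "Paris is the capital.")
    print(pair.type_ids)  # e.g. [0, 0, ..., 1, 1, ...]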
     @property
     def offsets(self) -> List[Offsets]:
-        """The offsets.
-        These offsets can be used to index any `IndexableString` directly. If you want to
-        index the original `str`, make sure to retrieve the converted offsets using the `.offsets`
-        method on the `original_str`.
+        """The offsets associated with each token
+
+        These offsets let you slice the input string, and thus retrieve the original
+        part that led to producing the corresponding token.
+
+        Returns:
+            A :obj:`List` of :obj:`Tuple[int, int]`: The list of offsets
         """
         pass
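
A sketch showing how each offset pair slices the original input back out:

    text = "Hello there"
    encoding = tokenizer.encode(text)

    # Each (start, end) pair indexes directly into the input string
    for (start, end), token in zip(encoding.offsets, encoding.tokens):
        print(repr(text[start:end]), "->", token)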
     @property
     def special_tokens_mask(self) -> List[int]:
-        """ The special tokens mask """
+        """The special token mask
+
+        This indicates which tokens are special tokens, and which are not.
+
+        Returns:
+            :obj:`List[int]`: The special tokens mask
+        """
         pass
     @property
     def attention_mask(self) -> List[int]:
-        """ The attention mask """
+        """The attention mask
+
+        This indicates to the LM which tokens should be attended to, and which should not.
+        This is especially important when batching sequences, where we need to apply
+        padding.
+
+        Returns:
+            :obj:`List[int]`: The attention mask
+        """
         pass
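
A sketch of both masks after padding; which tokens count as special depends on the post-processor in use:

    encoding = tokenizer.encode("Hello there")
    encoding.pad(12)

    # Padded positions get attention_mask == 0; real tokens get 1.
    # special_tokens_mask is 1 for padding and any added special tokens
    print(encoding.attention_mask)
    print(encoding.special_tokens_mask)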
     @property
     def overflowing(self) -> Optional[Encoding]:
-        """ The overflowing encoding, after truncation """
+        """A :obj:`List` of overflowing :class:`~tokenizers.Encoding`
+
+        When using truncation, the :class:`~tokenizers.Tokenizer` takes care of splitting
+        the output into as many pieces as required to match the specified maximum length.
+        This field lets you retrieve all the subsequent pieces.
+
+        When you use pairs of sequences, the overflowing pieces will contain enough
+        variations to cover all the possible combinations, while respecting the provided
+        maximum length.
+        """
         pass
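
A sketch using truncation at the tokenizer level, which fills overflowing with the remaining pieces:

    tokenizer.enable_truncation(max_length=8, stride=2)
    encoding = tokenizer.encode("a fairly long sentence that will not fit in eight tokens")

    # The main encoding holds the first piece; the rest land here,
    # each repeating `stride` tokens of previous content
    for piece in encoding.overflowing:
        print(piece.tokens)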
     def word_to_tokens(self, word_index: int) -> Optional[Tuple[int, int]]:
-        """
-        Get the encoded tokens corresponding to the word at the given index in the input
-        sequence, with the form [start_token, end_token + 1]
+        """Get the encoded tokens corresponding to the word at the given index
+        in the input sequence.

         Args:
-            word_index: int:
-                The index of the word in the input sequence.
+            word_index (:obj:`int`):
+                The index of a word in the input sequence.

         Returns:
-            The range of tokens with the form [start_token, end_token + 1]
+            :obj:`Tuple[int, int]`: The range of tokens: :obj:`(first, last + 1)`
         """
         pass
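
A sketch of the half-open token range returned for a word; it can be used directly as a slice:

    encoding = tokenizer.encode("Hello there")

    span = encoding.word_to_tokens(1)  # second word of the input
    if span is not None:
        start, end = span
        print(encoding.tokens[start:end])  # all tokens produced by that word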
     def word_to_chars(self, word_index: int) -> Optional[Offsets]:
-        """
-        Get the offsets of the word at the given index in the input sequence.
+        """Get the offsets of the word at the given index in the input sequence.

         Args:
-            word_index: int:
-                The index of the word in the input sequence.
+            word_index (:obj:`int`):
+                The index of a word in the input sequence.

         Returns:
-            The word offsets
+            :obj:`Tuple[int, int]`: The range of characters (span) :obj:`(first, last + 1)`
         """
         pass
     def token_to_chars(self, token_index: int) -> Optional[Offsets]:
-        """
-        Get the offsets of the token at the given index
+        """Get the offsets of the token at the given index

         Args:
-            token_index: int:
-                The index of the token in the encoded sequence.
+            token_index (:obj:`int`):
+                The index of a token in the encoded sequence.

         Returns:
-            The token offsets
+            :obj:`Tuple[int, int]`: The token offsets :obj:`(first, last + 1)`
         """
         pass
     def token_to_word(self, token_index: int) -> Optional[int]:
-        """
-        Get the word that contains the token at the given index
+        """Get the word that contains the token at the given index

         Args:
-            token_index: int:
-                The index of the token in the encoded sequence.
+            token_index (:obj:`int`):
+                The index of a token in the encoded sequence.

         Returns:
-            The index of the word in the input sequence.
+            :obj:`int`: The index of the word in the input sequence.
         """
         pass
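
A sketch mapping a token back to its word and character span (exact indices depend on the tokenizer):

    encoding = tokenizer.encode("Hello there")

    print(encoding.token_to_word(0))   # word index, or None for a special token
    print(encoding.token_to_chars(0))  # (start, end) character offsets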
     def char_to_token(self, pos: int) -> Optional[int]:
-        """
-        Get the token that contains the char at the given position
+        """Get the token that contains the char at the given position

         Args:
-            pos: int:
+            char_pos (:obj:`int`):
                 The position of a char in the input string

         Returns:
-            The index of the token that contains this char
+            :obj:`int`: The index of the token that contains this char in the encoded sequence
         """
         pass
     def char_to_word(self, pos: int) -> Optional[int]:
-        """
-        Get the word that contains the given char.
+        """Get the word that contains the char at the given position

         Args:
-            pos: int:
+            char_pos (:obj:`int`):
                 The position of a char in the input string

         Returns:
-            The index of the word that contains this char
+            :obj:`int`: The index of the word that contains this char in the input sequence
         """
         pass
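
A sketch of the char-to-token and char-to-word direction, assuming a whitespace-splitting pre-tokenizer:

    text = "Hello there"
    encoding = tokenizer.encode(text)

    # Position 6 is the 't' of "there"
    print(encoding.char_to_token(6))  # index of the covering token
    print(encoding.char_to_word(6))   # 1, the second word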
     def pad(
@@ -389,35 +443,34 @@ class Encoding:
         pad_token: Optional[str] = "[PAD]",
         direction: Optional[str] = "right",
     ):
-        """Pad the current Encoding at the given length
+        """Pad the :class:`~tokenizers.Encoding` at the given length

         Args:
-            length: int:
-                The length at which to pad
+            length (:obj:`int`):
+                The desired length

-            direction: (`optional`) str:
-                Can be one of: `right` or `left`
+            direction (:obj:`str`, defaults to :obj:`right`):
+                The expected padding direction. Can be either :obj:`right` or :obj:`left`

-            pad_id: (`optional`) unsigned int:
-                The indice to be used when padding
+            pad_id (:obj:`int`, defaults to :obj:`0`):
+                The ID corresponding to the padding token

-            pad_type_id: (`optional`) unsigned int:
-                The type indice to be used when padding
+            pad_type_id (:obj:`int`, defaults to :obj:`0`):
+                The type ID corresponding to the padding token

-            pad_token: (`optional`) str:
-                The pad token to be used when padding
+            pad_token (:obj:`str`, defaults to :obj:`[PAD]`):
+                The pad token to use
         """
         pass
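
A sketch of left padding with explicit values (all of these keyword arguments are optional):

    encoding = tokenizer.encode("Hello there")
    encoding.pad(12, direction="left", pad_id=0, pad_type_id=0, pad_token="[PAD]")
    print(encoding.tokens)  # "[PAD]" entries now fill the left side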
     def truncate(self, max_length: int, stride: Optional[int] = 0):
-        """Truncate the current Encoding at the given max_length
+        """Truncate the :class:`~tokenizers.Encoding` at the given length

         Args:
-            max_length: int:
-                The maximum length to be kept
+            max_length (:obj:`int`):
+                The desired length

-            stride: (`optional`) unsigned int:
-                The length of the previous first sequence to be included
-                in the overflowing sequence
+            stride (:obj:`int`, defaults to :obj:`0`):
+                The length of previous content to be included in each overflowing piece
         """
         pass
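
A sketch of truncating an existing encoding directly; the cut-off content should move to overflowing:

    encoding = tokenizer.encode("a somewhat longer input sentence")
    encoding.truncate(max_length=4, stride=1)

    print(encoding.tokens)            # only the first 4 tokens remain
    print(len(encoding.overflowing))  # the rest is kept as overflowing pieces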

@@ -7,6 +7,7 @@ use tokenizers as tk;

 use crate::error::PyError;

+/// The :class:`~tokenizers.Encoding` represents the output of a :class:`~tokenizers.Tokenizer`.
 #[pyclass(dict, module = "tokenizers", name=Encoding)]
 #[repr(transparent)]
 pub struct PyEncoding {
@@ -71,8 +72,20 @@ impl PyEncoding {
     }
 }

+    /// Merge the list of encodings into one final :class:`~tokenizers.Encoding`
+    ///
+    /// Args:
+    ///     encodings (A :obj:`List` of :class:`~tokenizers.Encoding`):
+    ///         The list of encodings that should be merged into one
+    ///
+    ///     growing_offsets (:obj:`bool`, defaults to :obj:`True`):
+    ///         Whether the offsets should accumulate while merging
+    ///
+    /// Returns:
+    ///     :class:`~tokenizers.Encoding`: The resulting Encoding
     #[staticmethod]
     #[args(growing_offsets = true)]
+    #[text_signature = "(encodings, growing_offsets=True)"]
     fn merge(encodings: Vec<PyRef<PyEncoding>>, growing_offsets: bool) -> PyEncoding {
         tk::tokenizer::Encoding::merge(
             encodings.into_iter().map(|e| e.encoding.clone()),
@@ -81,41 +94,103 @@ impl PyEncoding {
         .into()
     }

+    /// The generated IDs
+    ///
+    /// The IDs are the main input to a Language Model. They are the token indices,
+    /// the numerical representations that a LM understands.
+    ///
+    /// Returns:
+    ///     :obj:`List[int]`: The list of IDs
     #[getter]
     fn get_ids(&self) -> Vec<u32> {
         self.encoding.get_ids().to_vec()
     }

+    /// The generated tokens
+    ///
+    /// They are the string representation of the IDs.
+    ///
+    /// Returns:
+    ///     :obj:`List[str]`: The list of tokens
     #[getter]
     fn get_tokens(&self) -> Vec<String> {
         self.encoding.get_tokens().to_vec()
     }

+    /// The generated word indices.
+    ///
+    /// They represent the index of the word associated with each token.
+    /// When the input is pre-tokenized, they correspond to the ID of the given input label,
+    /// otherwise they correspond to the word indices as defined by the
+    /// :class:`~tokenizers.pre_tokenizers.PreTokenizer` that was used.
+    ///
+    /// For special tokens and such (any token that was generated from something that was
+    /// not part of the input), the output is :obj:`None`
+    ///
+    /// Returns:
+    ///     A :obj:`List` of :obj:`Optional[int]`: A list of optional word indices.
     #[getter]
     fn get_words(&self) -> Vec<Option<u32>> {
         self.encoding.get_words().to_vec()
     }

+    /// The generated type IDs
+    ///
+    /// Generally used for tasks like sequence classification or question answering,
+    /// these tokens let the LM know which input sequence corresponds to each token.
+    ///
+    /// Returns:
+    ///     :obj:`List[int]`: The list of type ids
     #[getter]
     fn get_type_ids(&self) -> Vec<u32> {
         self.encoding.get_type_ids().to_vec()
     }

+    /// The offsets associated with each token
+    ///
+    /// These offsets let you slice the input string, and thus retrieve the original
+    /// part that led to producing the corresponding token.
+    ///
+    /// Returns:
+    ///     A :obj:`List` of :obj:`Tuple[int, int]`: The list of offsets
     #[getter]
     fn get_offsets(&self) -> Vec<(usize, usize)> {
         self.encoding.get_offsets().to_vec()
     }

+    /// The special token mask
+    ///
+    /// This indicates which tokens are special tokens, and which are not.
+    ///
+    /// Returns:
+    ///     :obj:`List[int]`: The special tokens mask
     #[getter]
     fn get_special_tokens_mask(&self) -> Vec<u32> {
         self.encoding.get_special_tokens_mask().to_vec()
     }

+    /// The attention mask
+    ///
+    /// This indicates to the LM which tokens should be attended to, and which should not.
+    /// This is especially important when batching sequences, where we need to apply
+    /// padding.
+    ///
+    /// Returns:
+    ///     :obj:`List[int]`: The attention mask
     #[getter]
     fn get_attention_mask(&self) -> Vec<u32> {
         self.encoding.get_attention_mask().to_vec()
     }

+    /// A :obj:`List` of overflowing :class:`~tokenizers.Encoding`
+    ///
+    /// When using truncation, the :class:`~tokenizers.Tokenizer` takes care of splitting
+    /// the output into as many pieces as required to match the specified maximum length.
+    /// This field lets you retrieve all the subsequent pieces.
+    ///
+    /// When you use pairs of sequences, the overflowing pieces will contain enough
+    /// variations to cover all the possible combinations, while respecting the provided
+    /// maximum length.
     #[getter]
     fn get_overflowing(&self) -> Vec<PyEncoding> {
         self.encoding
@@ -126,31 +201,104 @@ impl PyEncoding {
         .collect()
     }

+    /// Get the encoded tokens corresponding to the word at the given index
+    /// in the input sequence.
+    ///
+    /// Args:
+    ///     word_index (:obj:`int`):
+    ///         The index of a word in the input sequence.
+    ///
+    /// Returns:
+    ///     :obj:`Tuple[int, int]`: The range of tokens: :obj:`(first, last + 1)`
+    #[text_signature = "($self, word_index)"]
     fn word_to_tokens(&self, word_index: u32) -> Option<(usize, usize)> {
         self.encoding.word_to_tokens(word_index)
     }

+    /// Get the offsets of the word at the given index in the input sequence.
+    ///
+    /// Args:
+    ///     word_index (:obj:`int`):
+    ///         The index of a word in the input sequence.
+    ///
+    /// Returns:
+    ///     :obj:`Tuple[int, int]`: The range of characters (span) :obj:`(first, last + 1)`
+    #[text_signature = "($self, word_index)"]
     fn word_to_chars(&self, word_index: u32) -> Option<Offsets> {
         self.encoding.word_to_chars(word_index)
     }

+    /// Get the offsets of the token at the given index
+    ///
+    /// Args:
+    ///     token_index (:obj:`int`):
+    ///         The index of a token in the encoded sequence.
+    ///
+    /// Returns:
+    ///     :obj:`Tuple[int, int]`: The token offsets :obj:`(first, last + 1)`
+    #[text_signature = "($self, token_index)"]
     fn token_to_chars(&self, token_index: usize) -> Option<Offsets> {
         self.encoding.token_to_chars(token_index)
     }

+    /// Get the word that contains the token at the given index
+    ///
+    /// Args:
+    ///     token_index (:obj:`int`):
+    ///         The index of a token in the encoded sequence.
+    ///
+    /// Returns:
+    ///     :obj:`int`: The index of the word in the input sequence.
+    #[text_signature = "($self, token_index)"]
     fn token_to_word(&self, token_index: usize) -> Option<u32> {
         self.encoding.token_to_word(token_index)
     }

+    /// Get the token that contains the char at the given position
+    ///
+    /// Args:
+    ///     char_pos (:obj:`int`):
+    ///         The position of a char in the input string
+    ///
+    /// Returns:
+    ///     :obj:`int`: The index of the token that contains this char in the encoded sequence
+    #[text_signature = "($self, char_pos)"]
     fn char_to_token(&self, char_pos: usize) -> Option<usize> {
         self.encoding.char_to_token(char_pos)
     }

+    /// Get the word that contains the char at the given position
+    ///
+    /// Args:
+    ///     char_pos (:obj:`int`):
+    ///         The position of a char in the input string
+    ///
+    /// Returns:
+    ///     :obj:`int`: The index of the word that contains this char in the input sequence
+    #[text_signature = "($self, char_pos)"]
     fn char_to_word(&self, char_pos: usize) -> Option<u32> {
         self.encoding.char_to_word(char_pos)
     }

+    /// Pad the :class:`~tokenizers.Encoding` at the given length
+    ///
+    /// Args:
+    ///     length (:obj:`int`):
+    ///         The desired length
+    ///
+    ///     direction (:obj:`str`, defaults to :obj:`right`):
+    ///         The expected padding direction. Can be either :obj:`right` or :obj:`left`
+    ///
+    ///     pad_id (:obj:`int`, defaults to :obj:`0`):
+    ///         The ID corresponding to the padding token
+    ///
+    ///     pad_type_id (:obj:`int`, defaults to :obj:`0`):
+    ///         The type ID corresponding to the padding token
+    ///
+    ///     pad_token (:obj:`str`, defaults to :obj:`[PAD]`):
+    ///         The pad token to use
     #[args(kwargs = "**")]
+    #[text_signature = "($self, length, direction='right', pad_id=0, pad_type_id=0, pad_token='[PAD]')"]
     fn pad(&mut self, length: usize, kwargs: Option<&PyDict>) -> PyResult<()> {
         let mut pad_id = 0;
         let mut pad_type_id = 0;
@@ -186,19 +334,17 @@ impl PyEncoding {
         Ok(())
     }

-    #[args(kwargs = "**")]
-    fn truncate(&mut self, max_length: usize, kwargs: Option<&PyDict>) -> PyResult<()> {
-        let mut stride = 0;
-
-        if let Some(kwargs) = kwargs {
-            for (key, value) in kwargs {
-                let key: &str = key.extract()?;
-                match key {
-                    "stride" => stride = value.extract()?,
-                    _ => println!("Ignored unknown kwarg option {}", key),
-                }
-            }
-        }
+    /// Truncate the :class:`~tokenizers.Encoding` at the given length
+    ///
+    /// Args:
+    ///     max_length (:obj:`int`):
+    ///         The desired length
+    ///
+    ///     stride (:obj:`int`, defaults to :obj:`0`):
+    ///         The length of previous content to be included in each overflowing piece
+    #[args(stride = "0")]
+    #[text_signature = "($self, max_length, stride=0)"]
+    fn truncate(&mut self, max_length: usize, stride: usize) -> PyResult<()> {
         self.encoding.truncate(max_length, stride);
         Ok(())
     }
@@ -30,7 +30,13 @@ Tokenizer

 .. autoclass:: tokenizers.Tokenizer
     :members:
-    :undoc-members:
+
+Encoding
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: tokenizers.Encoding
+    :members:
+

 Added Tokens