Python - Encoding mappings handle sequence_id

This commit is contained in:
Anthony MOI
2020-11-04 11:38:46 -05:00
committed by Anthony MOI
parent f3b970e281
commit dce218ca28
2 changed files with 183 additions and 40 deletions

View File

@ -275,6 +275,21 @@ class Encoding:
""" """
pass pass
@property @property
def n_sequences(self) -> int:
    """The number of sequences represented

    Returns:
        :obj:`int`: The number of sequences in this :class:`~tokenizers.Encoding`
    """
    # Declaration only: the value is provided by the native binding's
    # `get_n_sequences` getter.
    pass
def set_sequence_id(self, sequence_id: int):
    """Set the given sequence index

    Set the given sequence index for the whole range of tokens contained in this
    :class:`~tokenizers.Encoding`.

    Args:
        sequence_id (:obj:`int`):
            The sequence index to set for every token of this
            :class:`~tokenizers.Encoding`
    """
    # Declaration only. The parameter is named `sequence_id` to match the
    # native binding's signature `($self, sequence_id)`, so keyword calls
    # written against this stub are accepted by the real method.
    pass
@property
def ids(self) -> List[int]: def ids(self) -> List[int]:
"""The generated IDs """The generated IDs
@ -368,68 +383,110 @@ class Encoding:
maximum length. maximum length.
""" """
pass pass
def word_to_tokens(self, word_index: int) -> Optional[Tuple[int, int]]: def word_to_tokens(self, word_index: int, sequence_index: int = 0) -> Optional[Tuple[int, int]]:
"""Get the encoded tokens corresponding to the word at the given index """Get the encoded tokens corresponding to the word at the given index
in the input sequence. in one of the input sequences.
Args: Args:
word_index (:obj:`int`): word_index (:obj:`int`):
The index of a word in the input sequence. The index of a word in one of the input sequences.
sequence_index (:obj:`int`, defaults to :obj:`0`):
The index of the sequence that contains the target word
Returns: Returns:
:obj:`Tuple[int, int]`: The range of tokens: :obj:`(first, last + 1)` :obj:`Tuple[int, int]`: The range of tokens: :obj:`(first, last + 1)`
""" """
pass pass
def word_to_chars(self, word_index: int) -> Optional[Offsets]: def word_to_chars(self, word_index: int, sequence_index: int = 0) -> Optional[Offsets]:
"""Get the offsets of the word at the given index in the input sequence. """Get the offsets of the word at the given index in one of the input sequences.
Args: Args:
word_index (:obj:`int`): word_index (:obj:`int`):
The index of a word in the input sequence. The index of a word in one of the input sequences.
sequence_index (:obj:`int`, defaults to :obj:`0`):
The index of the sequence that contains the target word
Returns: Returns:
:obj:`Tuple[int, int]`: The range of characters (span) :obj:`(first, last + 1)` :obj:`Tuple[int, int]`: The range of characters (span) :obj:`(first, last + 1)`
""" """
pass pass
def token_to_chars(self, token_index: int) -> Optional[Offsets]: def token_to_sequence(self, token_index: int) -> Optional[int]:
"""Get the offsets of the token at the given index """Get the index of the sequence represented by the given token.
In the general use case, this method returns :obj:`0` for a single sequence or
the first sequence of a pair, and :obj:`1` for the second sequence of a pair
Args: Args:
token_index (:obj:`int`): token_index (:obj:`int`):
The index of a token in the encoded sequence. The index of a token in the encoded sequence.
Returns: Returns:
:obj:`Tuple[int, int]`: The token offsets :obj:`(first, last + 1)` :obj:`int`: The sequence id of the given token
""" """
pass pass
def token_to_word(self, token_index: int) -> Optional[int]: def token_to_chars(self, token_index: int) -> Optional[Union[Offsets, Tuple[int, Offsets]]]:
"""Get the offsets of the token at the given index.
If the :class:`~tokenizers.Encoding` represents multiple sequences (namely
a pair of sequences), then this method returns a Tuple with both the relevant
sequence index, and the offsets.
Args:
token_index (:obj:`int`):
The index of a token in the encoded sequence.
Returns:
:obj:`Tuple[int, int]` or :obj:`Tuple[int, Tuple[int, int]]`:
- For a single sequence: the token offsets:
:obj:`Tuple[int, int]` of the form :obj:`(first, last + 1)`
- For pairs of sequences: A tuple with the sequence index, and the token offsets:
:obj:`Tuple[int, Tuple[int, int]]` with offsets of the form :obj:`(first, last + 1)`
"""
pass
def token_to_word(self, token_index: int) -> Optional[Union[int, Tuple[int, int]]]:
"""Get the word that contains the token at the given index """Get the word that contains the token at the given index
If the :class:`~tokenizers.Encoding` represents multiple sequences (namely
a pair of sequences), then this method returns a Tuple with both the relevant
sequence index, and the word index.
Args: Args:
token_index (:obj:`int`): token_index (:obj:`int`):
The index of a token in the encoded sequence. The index of a token in the encoded sequence.
Returns: Returns:
:obj:`int`: The index of the word in the input sequence. :obj:`int` or :obj:`Tuple[int, int]`:
- For a single sequence: The index of the word in the input sequence: :obj:`int`
- For pairs of sequences: A tuple with the sequence index, and the index of the word
in the said sequence: :obj:`Tuple[int, int]`
""" """
pass pass
def char_to_token(self, pos: int) -> Optional[int]: def char_to_token(self, pos: int, sequence_index: int = 0) -> Optional[int]:
"""Get the token that contains the char at the given position """Get the token that contains the char at the given position in the input sequence.
Args: Args:
char_pos (:obj:`int`): char_pos (:obj:`int`):
The position of a char in the input string The position of a char in the input string
sequence_index (:obj:`int`, defaults to :obj:`0`):
The index of the sequence that contains the target char
Returns: Returns:
:obj:`int`: The index of the token that contains this char in the encoded sequence :obj:`int`: The index of the token that contains this char in the encoded sequence
""" """
pass pass
def char_to_word(self, pos: int) -> Optional[int]: def char_to_word(self, pos: int, sequence_index: int = 0) -> Optional[int]:
"""Get the word that contains the char at the given position """Get the word that contains the char at the given position in the input sequence.
Args: Args:
char_pos (:obj:`int`): char_pos (:obj:`int`):
The position of a char in the input string The position of a char in the input string
sequence_index (:obj:`int`, defaults to :obj:`0`):
The index of the sequence that contains the target char
Returns: Returns:
:obj:`int`: The index of the word that contains this char in the input sequence :obj:`int`: The index of the word that contains this char in the input sequence
@ -465,6 +522,9 @@ class Encoding:
def truncate(self, max_length: int, stride: Optional[int] = 0): def truncate(self, max_length: int, stride: Optional[int] = 0):
"""Truncate the :class:`~tokenizers.Encoding` at the given length """Truncate the :class:`~tokenizers.Encoding` at the given length
If this :class:`~tokenizers.Encoding` represents multiple sequences, this information
is lost when truncating: the result is considered as representing a single sequence.
Args: Args:
max_length (:obj:`int`): max_length (:obj:`int`):
The desired length The desired length

View File

@ -94,6 +94,24 @@ impl PyEncoding {
.into() .into()
} }
/// The number of sequences represented
///
/// Returns:
///     :obj:`int`: The number of sequences in this :class:`~tokenizers.Encoding`
// Registered as a Python property getter; the Python stub declares it as the
// `n_sequences` property.
#[getter]
fn get_n_sequences(&self) -> usize {
    // Pure delegation to the wrapped encoding.
    self.encoding.n_sequences()
}
/// Set the given sequence index
///
/// Set the given sequence index for the whole range of tokens contained in this
/// :class:`~tokenizers.Encoding`.
///
/// Args:
///     sequence_id (:obj:`int`):
///         The sequence index to set for every token of this
///         :class:`~tokenizers.Encoding`
#[text_signature = "($self, sequence_id)"]
fn set_sequence_id(&mut self, sequence_id: usize) {
    // Pure delegation: the wrapped encoding performs the update in place.
    self.encoding.set_sequence_id(sequence_id);
}
/// The generated IDs /// The generated IDs
/// ///
/// The IDs are the main input to a Language Model. They are the token indices, /// The IDs are the main input to a Language Model. They are the token indices,
@ -202,82 +220,144 @@ impl PyEncoding {
} }
/// Get the encoded tokens corresponding to the word at the given index /// Get the encoded tokens corresponding to the word at the given index
/// in the input sequence. /// in one of the input sequences.
/// ///
/// Args: /// Args:
/// word_index (:obj:`int`): /// word_index (:obj:`int`):
/// The index of a word in the input sequence. /// The index of a word in one of the input sequences.
/// sequence_index (:obj:`int`, defaults to :obj:`0`):
/// The index of the sequence that contains the target word
/// ///
/// Returns: /// Returns:
/// :obj:`Tuple[int, int]`: The range of tokens: :obj:`(first, last + 1)` /// :obj:`Tuple[int, int]`: The range of tokens: :obj:`(first, last + 1)`
#[text_signature = "($self, word_index)"] #[args(sequence_index = 0)]
fn word_to_tokens(&self, word_index: u32) -> Option<(usize, usize)> { #[text_signature = "($self, word_index, sequence_index=0)"]
self.encoding.word_to_tokens(word_index) fn word_to_tokens(&self, word_index: u32, sequence_index: usize) -> Option<(usize, usize)> {
self.encoding.word_to_tokens(word_index, sequence_index)
} }
/// Get the offsets of the word at the given index in the input sequence. /// Get the offsets of the word at the given index in one of the input sequences.
/// ///
/// Args: /// Args:
/// word_index (:obj:`int`): /// word_index (:obj:`int`):
/// The index of a word in the input sequence. /// The index of a word in one of the input sequences.
/// sequence_index (:obj:`int`, defaults to :obj:`0`):
/// The index of the sequence that contains the target word
/// ///
/// Returns: /// Returns:
/// :obj:`Tuple[int, int]`: The range of characters (span) :obj:`(first, last + 1)` /// :obj:`Tuple[int, int]`: The range of characters (span) :obj:`(first, last + 1)`
#[text_signature = "($self, word_index)"] #[args(sequence_index = 0)]
fn word_to_chars(&self, word_index: u32) -> Option<Offsets> { #[text_signature = "($self, word_index, sequence_index=0)"]
self.encoding.word_to_chars(word_index) fn word_to_chars(&self, word_index: u32, sequence_index: usize) -> Option<Offsets> {
self.encoding.word_to_chars(word_index, sequence_index)
} }
/// Get the offsets of the token at the given index /// Get the index of the sequence represented by the given token.
///
/// In the general use case, this method returns :obj:`0` for a single sequence or
/// the first sequence of a pair, and :obj:`1` for the second sequence of a pair
/// ///
/// Args: /// Args:
/// token_index (:obj:`int`): /// token_index (:obj:`int`):
/// The index of a token in the encoded sequence. /// The index of a token in the encoded sequence.
/// ///
/// Returns: /// Returns:
/// :obj:`Tuple[int, int]`: The token offsets :obj:`(first, last + 1)` /// :obj:`int`: The sequence id of the given token
#[text_signature = "($self, token_index)"] #[text_signature = "($self, token_index)"]
fn token_to_chars(&self, token_index: usize) -> Option<Offsets> { fn token_to_sequence(&self, token_index: usize) -> Option<usize> {
self.encoding.token_to_chars(token_index) self.encoding.token_to_sequence(token_index)
}
/// Get the offsets of the token at the given index.
///
/// If the :class:`~tokenizers.Encoding` represents multiple sequences (namely
/// a pair of sequences), then this method returns a Tuple with both the relevant
/// sequence index, and the offsets.
///
/// Args:
/// token_index (:obj:`int`):
/// The index of a token in the encoded sequence.
///
/// Returns:
/// :obj:`Tuple[int, int]` or :obj:`Tuple[int, Tuple[int, int]]`:
///
/// - For a single sequence: the token offsets:
/// :obj:`Tuple[int, int]` of the form :obj:`(first, last + 1)`
///
/// - For pairs of sequence: A tuple with the sequence index, and the token offsets:
/// :obj:`Tuple[int, Tuple[int, int]]` with offsets of the form :obj:`(first, last + 1)`
///
#[text_signature = "($self, token_index)"]
fn token_to_chars(&self, token_index: usize) -> Option<PyObject> {
let (seq_idx, offsets) = self.encoding.token_to_chars(token_index)?;
Python::with_gil(|py| {
if self.encoding.n_sequences() > 1 {
Some((seq_idx, offsets).to_object(py))
} else {
Some(offsets.to_object(py))
}
})
} }
/// Get the word that contains the token at the given index /// Get the word that contains the token at the given index
/// ///
/// If the :class:`~tokenizers.Encoding` represents multiple sequences (namely
/// a pair of sequences), then this method returns a Tuple with both the relevant
/// sequence index, and the word index.
///
/// Args: /// Args:
/// token_index (:obj:`int`): /// token_index (:obj:`int`):
/// The index of a token in the encoded sequence. /// The index of a token in the encoded sequence.
/// ///
/// Returns: /// Returns:
/// :obj:`int`: The index of the word in the input sequence. /// :obj:`int` or :obj:`Tuple[int, int]`:
///
/// - For a single sequence: The index of the word in the input sequence: :obj:`int`
/// - For pairs of sequences: A tuple with the sequence index, and the index of the word
/// in the said sequence: :obj:`Tuple[int, int]`
///
#[text_signature = "($self, token_index)"] #[text_signature = "($self, token_index)"]
fn token_to_word(&self, token_index: usize) -> Option<u32> { fn token_to_word(&self, token_index: usize) -> Option<PyObject> {
self.encoding.token_to_word(token_index) let (seq_idx, word_idx) = self.encoding.token_to_word(token_index)?;
Python::with_gil(|py| {
if self.encoding.n_sequences() > 1 {
Some((seq_idx, word_idx).to_object(py))
} else {
Some(word_idx.to_object(py))
}
})
} }
/// Get the token that contains the char at the given position /// Get the token that contains the char at the given position in the input sequence.
/// ///
/// Args: /// Args:
/// char_pos (:obj:`int`): /// char_pos (:obj:`int`):
/// The position of a char in the input string /// The position of a char in the input string
/// sequence_index (:obj:`int`, defaults to :obj:`0`):
/// The index of the sequence that contains the target char
/// ///
/// Returns: /// Returns:
/// :obj:`int`: The index of the token that contains this char in the encoded sequence /// :obj:`int`: The index of the token that contains this char in the encoded sequence
#[text_signature = "($self, char_pos)"] #[args(sequence_index = 0)]
fn char_to_token(&self, char_pos: usize) -> Option<usize> { #[text_signature = "($self, char_pos, sequence_index=0)"]
self.encoding.char_to_token(char_pos) fn char_to_token(&self, char_pos: usize, sequence_index: usize) -> Option<usize> {
self.encoding.char_to_token(char_pos, sequence_index)
} }
/// Get the word that contains the char at the given position /// Get the word that contains the char at the given position in the input sequence.
/// ///
/// Args: /// Args:
/// char_pos (:obj:`int`): /// char_pos (:obj:`int`):
/// The position of a char in the input string /// The position of a char in the input string
/// sequence_index (:obj:`int`, defaults to :obj:`0`):
/// The index of the sequence that contains the target char
/// ///
/// Returns: /// Returns:
/// :obj:`int`: The index of the word that contains this char in the input sequence /// :obj:`int`: The index of the word that contains this char in the input sequence
#[text_signature = "($self, char_pos)"] #[args(sequence_index = 0)]
fn char_to_word(&self, char_pos: usize) -> Option<u32> { #[text_signature = "($self, char_pos, sequence_index=0)"]
self.encoding.char_to_word(char_pos) fn char_to_word(&self, char_pos: usize, sequence_index: usize) -> Option<u32> {
self.encoding.char_to_word(char_pos, sequence_index)
} }
/// Pad the :class:`~tokenizers.Encoding` at the given length /// Pad the :class:`~tokenizers.Encoding` at the given length
@ -336,6 +416,9 @@ impl PyEncoding {
/// Truncate the :class:`~tokenizers.Encoding` at the given length /// Truncate the :class:`~tokenizers.Encoding` at the given length
/// ///
/// If this :class:`~tokenizers.Encoding` represents multiple sequences, this information
/// is lost when truncating: the result is considered as representing a single sequence.
///
/// Args: /// Args:
/// max_length (:obj:`int`): /// max_length (:obj:`int`):
/// The desired length /// The desired length