mirror of
https://github.com/mii443/tokenizers.git
synced 2025-09-03 15:59:25 +00:00
Simplify the API for Encoding.token_to_XXX
This commit is contained in:
@ -424,46 +424,34 @@ class Encoding:
|
||||
:obj:`int`: The sequence id of the given token
|
||||
"""
|
||||
pass
|
||||
def token_to_chars(self, token_index: int) -> Optional[Union[Offsets, Tuple[int, Offsets]]]:
|
||||
def token_to_chars(self, token_index: int) -> Optional[Offsets]:
|
||||
"""Get the offsets of the token at the given index.
|
||||
|
||||
If the :class:`~tokenizers.Encoding` represents multiple sequences (namely
|
||||
a pair of sequences), then this method returns a Tuple with both the relevant
|
||||
sequence index, and the offsets.
|
||||
The returned offsets are relative to the input sequence that contains the
|
||||
token. In order to determine in which input sequence it belongs, you
|
||||
must call :meth:`~tokenizers.Encoding.token_to_sequence()`.
|
||||
|
||||
Args:
|
||||
token_index (:obj:`int`):
|
||||
The index of a token in the encoded sequence.
|
||||
|
||||
Returns:
|
||||
:obj:`Tuple[int, int]` or :obj:`Tuple[int, Tuple[int, int]]`:
|
||||
|
||||
- For a single sequence: the token offsets:
|
||||
:obj:`Tuple[int, int]` of the form :obj:`(first, last + 1)`
|
||||
|
||||
- For pairs of sequence: A tuple with the sequence index, and the token offsets:
|
||||
:obj:`Tuple[int, Tuple[int, int]]` with offsets of the form :obj:`(first, last + 1)`
|
||||
|
||||
:obj:`Tuple[int, int]`: The token offsets :obj:`(first, last + 1)`
|
||||
"""
|
||||
pass
|
||||
def token_to_word(self, token_index: int) -> Optional[Union[int, Tuple[int, int]]]:
|
||||
"""Get the word that contains the token at the given index
|
||||
def token_to_word(self, token_index: int) -> Optional[int]:
|
||||
"""Get the index of the word that contains the token in one of the input sequences.
|
||||
|
||||
If the :class:`~tokenizers.Encoding` represents multiple sequences (namely
|
||||
a pair of sequences), then this method returns a Tuple with both the relevant
|
||||
sequence index, and the word index.
|
||||
The returned word index is relative to the input sequence that contains
|
||||
the token. In order to determine in which input sequence it belongs, you
|
||||
must call :meth:`~tokenizers.Encoding.token_to_sequence()`.
|
||||
|
||||
Args:
|
||||
token_index (:obj:`int`):
|
||||
The index of a token in the encoded sequence.
|
||||
|
||||
Returns:
|
||||
:obj:`int` or :obj:`Tuple[int, int]`:
|
||||
|
||||
- For a single sequence: The index of the word in the input sequence: :obj:`int`
|
||||
- For pairs of sequence: A tuple with the sequence index, and the index of the word
|
||||
in the said sequence: :obj:`Tuple[int, int]`
|
||||
|
||||
:obj:`int`: The index of the word in the relevant input sequence.
|
||||
"""
|
||||
pass
|
||||
def char_to_token(self, pos: int, sequence_index: int = 0) -> Optional[int]:
|
||||
|
@ -270,62 +270,38 @@ impl PyEncoding {
|
||||
|
||||
/// Get the offsets of the token at the given index.
|
||||
///
|
||||
/// If the :class:`~tokenizers.Encoding` represents multiple sequences (namely
|
||||
/// a pair of sequences), then this method returns a Tuple with both the relevant
|
||||
/// sequence index, and the offsets.
|
||||
/// The returned offsets are relative to the input sequence that contains the
|
||||
/// token. In order to determine in which input sequence it belongs, you
|
||||
/// must call :meth:`~tokenizers.Encoding.token_to_sequence()`.
|
||||
///
|
||||
/// Args:
|
||||
/// token_index (:obj:`int`):
|
||||
/// The index of a token in the encoded sequence.
|
||||
///
|
||||
/// Returns:
|
||||
/// :obj:`Tuple[int, int]` or :obj:`Tuple[int, Tuple[int, int]]`:
|
||||
///
|
||||
/// - For a single sequence: the token offsets:
|
||||
/// :obj:`Tuple[int, int]` of the form :obj:`(first, last + 1)`
|
||||
///
|
||||
/// - For pairs of sequence: A tuple with the sequence index, and the token offsets:
|
||||
/// :obj:`Tuple[int, Tuple[int, int]]` with offsets of the form :obj:`(first, last + 1)`
|
||||
///
|
||||
/// :obj:`Tuple[int, int]`: The token offsets :obj:`(first, last + 1)`
|
||||
#[text_signature = "($self, token_index)"]
|
||||
fn token_to_chars(&self, token_index: usize) -> Option<PyObject> {
|
||||
let (seq_idx, offsets) = self.encoding.token_to_chars(token_index)?;
|
||||
Python::with_gil(|py| {
|
||||
if self.encoding.n_sequences() > 1 {
|
||||
Some((seq_idx, offsets).to_object(py))
|
||||
} else {
|
||||
Some(offsets.to_object(py))
|
||||
}
|
||||
})
|
||||
fn token_to_chars(&self, token_index: usize) -> Option<Offsets> {
|
||||
let (_, offsets) = self.encoding.token_to_chars(token_index)?;
|
||||
Some(offsets)
|
||||
}
|
||||
|
||||
/// Get the word that contains the token at the given index
|
||||
/// Get the index of the word that contains the token in one of the input sequences.
|
||||
///
|
||||
/// If the :class:`~tokenizers.Encoding` represents multiple sequences (namely
|
||||
/// a pair of sequences), then this method returns a Tuple with both the relevant
|
||||
/// sequence index, and the word index.
|
||||
/// The returned word index is relative to the input sequence that contains
|
||||
/// the token. In order to determine in which input sequence it belongs, you
|
||||
/// must call :meth:`~tokenizers.Encoding.token_to_sequence()`.
|
||||
///
|
||||
/// Args:
|
||||
/// token_index (:obj:`int`):
|
||||
/// The index of a token in the encoded sequence.
|
||||
///
|
||||
/// Returns:
|
||||
/// :obj:`int` or :obj:`Tuple[int, int]`:
|
||||
///
|
||||
/// - For a single sequence: The index of the word in the input sequence: :obj:`int`
|
||||
/// - For pairs of sequence: A tuple with the sequence index, and the index of the word
|
||||
/// in the said sequence: :obj:`Tuple[int, int]`
|
||||
///
|
||||
/// :obj:`int`: The index of the word in the relevant input sequence.
|
||||
#[text_signature = "($self, token_index)"]
|
||||
fn token_to_word(&self, token_index: usize) -> Option<PyObject> {
|
||||
let (seq_idx, word_idx) = self.encoding.token_to_word(token_index)?;
|
||||
Python::with_gil(|py| {
|
||||
if self.encoding.n_sequences() > 1 {
|
||||
Some((seq_idx, word_idx).to_object(py))
|
||||
} else {
|
||||
Some(word_idx.to_object(py))
|
||||
}
|
||||
})
|
||||
fn token_to_word(&self, token_index: usize) -> Option<u32> {
|
||||
let (_, word_idx) = self.encoding.token_to_word(token_index)?;
|
||||
Some(word_idx)
|
||||
}
|
||||
|
||||
/// Get the token that contains the char at the given position in the input sequence.
|
||||
|
@ -65,9 +65,9 @@ class TestEncoding:
|
||||
|
||||
assert single.token_to_chars(0) == None
|
||||
assert single.token_to_chars(2) == (2, 6)
|
||||
assert pair.token_to_chars(2) == (0, (2, 6))
|
||||
assert pair.token_to_chars(2) == (2, 6)
|
||||
assert pair.token_to_chars(5) == None
|
||||
assert pair.token_to_chars(6) == (1, (0, 2))
|
||||
assert pair.token_to_chars(6) == (0, 2)
|
||||
|
||||
def test_token_to_word(self, encodings):
|
||||
single, pair = encodings
|
||||
@ -75,11 +75,11 @@ class TestEncoding:
|
||||
assert single.token_to_word(0) == None
|
||||
assert single.token_to_word(1) == 0
|
||||
assert single.token_to_word(4) == 2
|
||||
assert pair.token_to_word(1) == (0, 0)
|
||||
assert pair.token_to_word(4) == (0, 2)
|
||||
assert pair.token_to_word(1) == 0
|
||||
assert pair.token_to_word(4) == 2
|
||||
assert pair.token_to_word(5) == None
|
||||
assert pair.token_to_word(6) == (1, 0)
|
||||
assert pair.token_to_word(7) == (1, 1)
|
||||
assert pair.token_to_word(6) == 0
|
||||
assert pair.token_to_word(7) == 1
|
||||
|
||||
def test_char_to_token(self, encodings):
|
||||
single, pair = encodings
|
||||
|
Reference in New Issue
Block a user