Simplify the API for Encoding.token_to_XXX

2025-09-03 15:59:25 +00:00 · 2020-11-05 12:11:18 -05:00
parent 51dbf0b6df
commit 385d25720a
7 changed files with 70 additions and 116 deletions
--- a/bindings/python/py_src/tokenizers/init.pyi
+++ b/bindings/python/py_src/tokenizers/init.pyi
@ -424,46 +424,34 @@ class Encoding:
            :obj:`int`: The sequence id of the given token
        """
        pass
-    def token_to_chars(self, token_index: int) -> Optional[Union[Offsets, Tuple[int, Offsets]]]:
+    def token_to_chars(self, token_index: int) -> Optional[Offsets]:
        """Get the offsets of the token at the given index.

-        If the :class:`~tokenizers.Encoding` represents multiple sequences (namely
-        a pair of sequences), then this method returns a Tuple with both the relevant
-        sequence index, and the offsets.
+        The returned offsets are relative to the input sequence that contains the
+        token.  To determine which input sequence that is, call
+        :meth:`~tokenizers.Encoding.token_to_sequence()`.

        Args:
            token_index (:obj:`int`):
                The index of a token in the encoded sequence.

        Returns:
-            :obj:`Tuple[int, int]` or :obj:`Tuple[int, Tuple[int, int]]`:
-
-            - For a single sequence: the token offsets:
-              :obj:`Tuple[int, int]` of the form :obj:`(first, last + 1)`
-
-            - For pairs of sequence: A tuple with the sequence index, and the token offsets:
-              :obj:`Tuple[int, Tuple[int, int]]` with offsets of the form :obj:`(first, last + 1)`
-
+            :obj:`Tuple[int, int]`: The token offsets :obj:`(first, last + 1)`
        """
        pass
-    def token_to_word(self, token_index: int) -> Optional[Union[int, Tuple[int, int]]]:
-        """Get the word that contains the token at the given index
+    def token_to_word(self, token_index: int) -> Optional[int]:
+        """Get the index of the word that contains the token in one of the input sequences.

-        If the :class:`~tokenizers.Encoding` represents multiple sequences (namely
-        a pair of sequences), then this method returns a Tuple with both the relevant
-        sequence index, and the word index.
+        The returned word index is relative to the input sequence that contains
+        the token.  To determine which input sequence that is, call
+        :meth:`~tokenizers.Encoding.token_to_sequence()`.

        Args:
            token_index (:obj:`int`):
                The index of a token in the encoded sequence.

        Returns:
-            :obj:`int` or :obj:`Tuple[int, int]`:
-
-            - For a single sequence: The index of the word in the input sequence: :obj:`int`
-            - For pairs of sequence: A tuple with the sequence index, and the index of the word
-              in the said sequence: :obj:`Tuple[int, int]`
-
+            :obj:`int`: The index of the word in the relevant input sequence.
        """
        pass
    def char_to_token(self, pos: int, sequence_index: int = 0) -> Optional[int]:
--- a/bindings/python/src/encoding.rs
+++ b/bindings/python/src/encoding.rs
@ -270,62 +270,38 @@ impl PyEncoding {

    /// Get the offsets of the token at the given index.
    ///
-    /// If the :class:`~tokenizers.Encoding` represents multiple sequences (namely
-    /// a pair of sequences), then this method returns a Tuple with both the relevant
-    /// sequence index, and the offsets.
+    /// The returned offsets are relative to the input sequence that contains the
+    /// token.  To determine which input sequence that is, call
+    /// :meth:`~tokenizers.Encoding.token_to_sequence()`.
    ///
    /// Args:
    ///     token_index (:obj:`int`):
    ///         The index of a token in the encoded sequence.
    ///
    /// Returns:
-    ///     :obj:`Tuple[int, int]` or :obj:`Tuple[int, Tuple[int, int]]`:
-    ///
-    ///     - For a single sequence: the token offsets:
-    ///       :obj:`Tuple[int, int]` of the form :obj:`(first, last + 1)`
-    ///
-    ///     - For pairs of sequence: A tuple with the sequence index, and the token offsets:
-    ///       :obj:`Tuple[int, Tuple[int, int]]` with offsets of the form :obj:`(first, last + 1)`
-    ///
+    ///     :obj:`Tuple[int, int]`: The token offsets :obj:`(first, last + 1)`
    #[text_signature = "($self, token_index)"]
-    fn token_to_chars(&self, token_index: usize) -> Option<PyObject> {
-        let (seq_idx, offsets) = self.encoding.token_to_chars(token_index)?;
-        Python::with_gil(|py| {
-            if self.encoding.n_sequences() > 1 {
-                Some((seq_idx, offsets).to_object(py))
-            } else {
-                Some(offsets.to_object(py))
-            }
-        })
+    fn token_to_chars(&self, token_index: usize) -> Option<Offsets> {
+        let (_, offsets) = self.encoding.token_to_chars(token_index)?;
+        Some(offsets)
    }

-    /// Get the word that contains the token at the given index
+    /// Get the index of the word that contains the token in one of the input sequences.
    ///
-    /// If the :class:`~tokenizers.Encoding` represents multiple sequences (namely
-    /// a pair of sequences), then this method returns a Tuple with both the relevant
-    /// sequence index, and the word index.
+    /// The returned word index is relative to the input sequence that contains
+    /// the token.  To determine which input sequence that is, call
+    /// :meth:`~tokenizers.Encoding.token_to_sequence()`.
    ///
    /// Args:
    ///     token_index (:obj:`int`):
    ///         The index of a token in the encoded sequence.
    ///
    /// Returns:
-    ///     :obj:`int` or :obj:`Tuple[int, int]`:
-    ///
-    ///     - For a single sequence: The index of the word in the input sequence: :obj:`int`
-    ///     - For pairs of sequence: A tuple with the sequence index, and the index of the word
-    ///       in the said sequence: :obj:`Tuple[int, int]`
-    ///
+    ///     :obj:`int`: The index of the word in the relevant input sequence.
    #[text_signature = "($self, token_index)"]
-    fn token_to_word(&self, token_index: usize) -> Option<PyObject> {
-        let (seq_idx, word_idx) = self.encoding.token_to_word(token_index)?;
-        Python::with_gil(|py| {
-            if self.encoding.n_sequences() > 1 {
-                Some((seq_idx, word_idx).to_object(py))
-            } else {
-                Some(word_idx.to_object(py))
-            }
-        })
+    fn token_to_word(&self, token_index: usize) -> Option<u32> {
+        let (_, word_idx) = self.encoding.token_to_word(token_index)?;
+        Some(word_idx)
    }

    /// Get the token that contains the char at the given position in the input sequence.
--- a/bindings/python/tests/bindings/test_encoding.py
+++ b/bindings/python/tests/bindings/test_encoding.py
@ -65,9 +65,9 @@ class TestEncoding:

        assert single.token_to_chars(0) == None
        assert single.token_to_chars(2) == (2, 6)
-        assert pair.token_to_chars(2) == (0, (2, 6))
+        assert pair.token_to_chars(2) == (2, 6)
        assert pair.token_to_chars(5) == None
-        assert pair.token_to_chars(6) == (1, (0, 2))
+        assert pair.token_to_chars(6) == (0, 2)

    def test_token_to_word(self, encodings):
        single, pair = encodings
@ -75,11 +75,11 @@ class TestEncoding:
        assert single.token_to_word(0) == None
        assert single.token_to_word(1) == 0
        assert single.token_to_word(4) == 2
-        assert pair.token_to_word(1) == (0, 0)
-        assert pair.token_to_word(4) == (0, 2)
+        assert pair.token_to_word(1) == 0
+        assert pair.token_to_word(4) == 2
        assert pair.token_to_word(5) == None
-        assert pair.token_to_word(6) == (1, 0)
-        assert pair.token_to_word(7) == (1, 1)
+        assert pair.token_to_word(6) == 0
+        assert pair.token_to_word(7) == 1

    def test_char_to_token(self, encodings):
        single, pair = encodings