Python - Encoding mappings handle sequence_id

2025-09-03 15:59:25 +00:00 · 2020-11-04 11:38:46 -05:00
parent f3b970e281
commit dce218ca28
2 changed files with 183 additions and 40 deletions
--- a/bindings/python/py_src/tokenizers/init.pyi
+++ b/bindings/python/py_src/tokenizers/init.pyi
@ -275,6 +275,21 @@ class Encoding:
        """
        pass
    @property
+    def n_sequences(self) -> int:
+        """The number of sequences represented
+
+        Returns:
+            :obj:`int`: The number of sequences in this :class:`~tokenizers.Encoding`
+        """
+        pass
+    def set_sequence_id(self, sequence_index: int):
+        """Set the given sequence index
+
+        Set the given sequence index for the whole range of tokens contained in this
+        :class:`~tokenizers.Encoding`.
+        """
+        pass
+    @property
    def ids(self) -> List[int]:
        """The generated IDs

@ -368,68 +383,110 @@ class Encoding:
        maximum length.
        """
        pass
-    def word_to_tokens(self, word_index: int) -> Optional[Tuple[int, int]]:
+    def word_to_tokens(self, word_index: int, sequence_index: int = 0) -> Optional[Tuple[int, int]]:
        """Get the encoded tokens corresponding to the word at the given index
-        in the input sequence.
+        in one of the input sequences.

        Args:
            word_index (:obj:`int`):
-                The index of a word in the input sequence.
+                The index of a word in one of the input sequences.
+            sequence_index (:obj:`int`, defaults to :obj:`0`):
+                The index of the sequence that contains the target word

        Returns:
            :obj:`Tuple[int, int]`: The range of tokens: :obj:`(first, last + 1)`
        """
        pass
-    def word_to_chars(self, word_index: int) -> Optional[Offsets]:
-        """Get the offsets of the word at the given index in the input sequence.
+    def word_to_chars(self, word_index: int, sequence_index: int = 0) -> Optional[Offsets]:
+        """Get the offsets of the word at the given index in one of the input sequences.

        Args:
            word_index (:obj:`int`):
-                The index of a word in the input sequence.
+                The index of a word in one of the input sequences.
+            sequence_index (:obj:`int`, defaults to :obj:`0`):
+                The index of the sequence that contains the target word

        Returns:
            :obj:`Tuple[int, int]`: The range of characters (span) :obj:`(first, last + 1)`
        """
        pass
-    def token_to_chars(self, token_index: int) -> Optional[Offsets]:
-        """Get the offsets of the token at the given index
+    def token_to_sequence(self, token_index: int) -> Optional[int]:
+        """Get the index of the sequence represented by the given token.
+
+        In the general use case, this method returns :obj:`0` for a single sequence or
+        the first sequence of a pair, and :obj:`1` for the second sequence of a pair

        Args:
            token_index (:obj:`int`):
                The index of a token in the encoded sequence.

        Returns:
-            :obj:`Tuple[int, int]`: The token offsets :obj:`(first, last + 1)`
+            :obj:`int`: The sequence id of the given token
        """
        pass
-    def token_to_word(self, token_index: int) -> Optional[int]:
+    def token_to_chars(self, token_index: int) -> Optional[Union[Offsets, Tuple[int, Offsets]]]:
+        """Get the offsets of the token at the given index.
+
+        If the :class:`~tokenizers.Encoding` represents multiple sequences (namely
+        a pair of sequences), then this method returns a Tuple with both the relevant
+        sequence index, and the offsets.
+
+        Args:
+            token_index (:obj:`int`):
+                The index of a token in the encoded sequence.
+
+        Returns:
+            :obj:`Tuple[int, int]` or :obj:`Tuple[int, Tuple[int, int]]`:
+
+            - For a single sequence: the token offsets:
+              :obj:`Tuple[int, int]` of the form :obj:`(first, last + 1)`
+
+            - For pairs of sequence: A tuple with the sequence index, and the token offsets:
+              :obj:`Tuple[int, Tuple[int, int]]` with offsets of the form :obj:`(first, last + 1)`
+
+        """
+        pass
+    def token_to_word(self, token_index: int) -> Optional[Union[int, Tuple[int, int]]]:
        """Get the word that contains the token at the given index

+        If the :class:`~tokenizers.Encoding` represents multiple sequences (namely
+        a pair of sequences), then this method returns a Tuple with both the relevant
+        sequence index, and the word index.
+
        Args:
            token_index (:obj:`int`):
                The index of a token in the encoded sequence.

        Returns:
-            :obj:`int`: The index of the word in the input sequence.
+            :obj:`int` or :obj:`Tuple[int, int]`:
+
+            - For a single sequence: The index of the word in the input sequence: :obj:`int`
+            - For pairs of sequence: A tuple with the sequence index, and the index of the word
+              in the said sequence: :obj:`Tuple[int, int]`
+
        """
        pass
-    def char_to_token(self, pos: int) -> Optional[int]:
-        """Get the token that contains the char at the given position
+    def char_to_token(self, pos: int, sequence_index: int = 0) -> Optional[int]:
+        """Get the token that contains the char at the given position in the input sequence.

        Args:
            char_pos (:obj:`int`):
                The position of a char in the input string
+            sequence_index (:obj:`int`, defaults to :obj:`0`):
+                The index of the sequence that contains the target char

        Returns:
            :obj:`int`: The index of the token that contains this char in the encoded sequence
        """
        pass
-    def char_to_word(self, pos: int) -> Optional[int]:
-        """Get the word that contains the char at the given position
+    def char_to_word(self, pos: int, sequence_index: int = 0) -> Optional[int]:
+        """Get the word that contains the char at the given position in the input sequence.

        Args:
            char_pos (:obj:`int`):
                The position of a char in the input string
+            sequence_index (:obj:`int`, defaults to :obj:`0`):
+                The index of the sequence that contains the target char

        Returns:
            :obj:`int`: The index of the word that contains this char in the input sequence
@ -465,6 +522,9 @@ class Encoding:
    def truncate(self, max_length: int, stride: Optional[int] = 0):
        """Truncate the :class:`~tokenizers.Encoding` at the given length

+        If this :class:`~tokenizers.Encoding` represents multiple sequences, when truncating
+        this information is lost. It will be considered as representing a single sequence.
+
        Args:
            max_length (:obj:`int`):
                The desired length
--- a/bindings/python/src/encoding.rs
+++ b/bindings/python/src/encoding.rs
@ -94,6 +94,24 @@ impl PyEncoding {
        .into()
    }

+    /// The number of sequences represented
+    ///
+    /// Returns:
+    ///     :obj:`int`: The number of sequences in this :class:`~tokenizers.Encoding`
+    #[getter]
+    fn get_n_sequences(&self) -> usize {
+        self.encoding.n_sequences()
+    }
+
+    /// Set the given sequence index
+    ///
+    /// Set the given sequence index for the whole range of tokens contained in this
+    /// :class:`~tokenizers.Encoding`.
+    #[text_signature = "($self, sequence_id)"]
+    fn set_sequence_id(&mut self, sequence_id: usize) {
+        self.encoding.set_sequence_id(sequence_id);
+    }
+
    /// The generated IDs
    ///
    /// The IDs are the main input to a Language Model. They are the token indices,
@ -202,82 +220,144 @@ impl PyEncoding {
    }

    /// Get the encoded tokens corresponding to the word at the given index
-    /// in the input sequence.
+    /// in one of the input sequences.
    ///
    /// Args:
    ///     word_index (:obj:`int`):
-    ///         The index of a word in the input sequence.
+    ///         The index of a word in one of the input sequences.
+    ///     sequence_index (:obj:`int`, defaults to :obj:`0`):
+    ///         The index of the sequence that contains the target word
    ///
    /// Returns:
    ///     :obj:`Tuple[int, int]`: The range of tokens: :obj:`(first, last + 1)`
-    #[text_signature = "($self, word_index)"]
-    fn word_to_tokens(&self, word_index: u32) -> Option<(usize, usize)> {
-        self.encoding.word_to_tokens(word_index)
+    #[args(sequence_index = 0)]
+    #[text_signature = "($self, word_index, sequence_index=0)"]
+    fn word_to_tokens(&self, word_index: u32, sequence_index: usize) -> Option<(usize, usize)> {
+        self.encoding.word_to_tokens(word_index, sequence_index)
    }

-    /// Get the offsets of the word at the given index in the input sequence.
+    /// Get the offsets of the word at the given index in one of the input sequences.
    ///
    /// Args:
    ///     word_index (:obj:`int`):
-    ///         The index of a word in the input sequence.
+    ///         The index of a word in one of the input sequences.
+    ///     sequence_index (:obj:`int`, defaults to :obj:`0`):
+    ///         The index of the sequence that contains the target word
    ///
    /// Returns:
    ///     :obj:`Tuple[int, int]`: The range of characters (span) :obj:`(first, last + 1)`
-    #[text_signature = "($self, word_index)"]
-    fn word_to_chars(&self, word_index: u32) -> Option<Offsets> {
-        self.encoding.word_to_chars(word_index)
+    #[args(sequence_index = 0)]
+    #[text_signature = "($self, word_index, sequence_index=0)"]
+    fn word_to_chars(&self, word_index: u32, sequence_index: usize) -> Option<Offsets> {
+        self.encoding.word_to_chars(word_index, sequence_index)
    }

-    /// Get the offsets of the token at the given index
+    /// Get the index of the sequence represented by the given token.
+    ///
+    /// In the general use case, this method returns :obj:`0` for a single sequence or
+    /// the first sequence of a pair, and :obj:`1` for the second sequence of a pair
    ///
    /// Args:
    ///     token_index (:obj:`int`):
    ///         The index of a token in the encoded sequence.
    ///
    /// Returns:
-    ///     :obj:`Tuple[int, int]`: The token offsets :obj:`(first, last + 1)`
+    ///     :obj:`int`: The sequence id of the given token
    #[text_signature = "($self, token_index)"]
-    fn token_to_chars(&self, token_index: usize) -> Option<Offsets> {
-        self.encoding.token_to_chars(token_index)
+    fn token_to_sequence(&self, token_index: usize) -> Option<usize> {
+        self.encoding.token_to_sequence(token_index)
+    }
+
+    /// Get the offsets of the token at the given index.
+    ///
+    /// If the :class:`~tokenizers.Encoding` represents multiple sequences (namely
+    /// a pair of sequences), then this method returns a Tuple with both the relevant
+    /// sequence index, and the offsets.
+    ///
+    /// Args:
+    ///     token_index (:obj:`int`):
+    ///         The index of a token in the encoded sequence.
+    ///
+    /// Returns:
+    ///     :obj:`Tuple[int, int]` or :obj:`Tuple[int, Tuple[int, int]]`:
+    ///
+    ///     - For a single sequence: the token offsets:
+    ///       :obj:`Tuple[int, int]` of the form :obj:`(first, last + 1)`
+    ///
+    ///     - For pairs of sequence: A tuple with the sequence index, and the token offsets:
+    ///       :obj:`Tuple[int, Tuple[int, int]]` with offsets of the form :obj:`(first, last + 1)`
+    ///
+    #[text_signature = "($self, token_index)"]
+    fn token_to_chars(&self, token_index: usize) -> Option<PyObject> {
+        let (seq_idx, offsets) = self.encoding.token_to_chars(token_index)?;
+        Python::with_gil(|py| {
+            if self.encoding.n_sequences() > 1 {
+                Some((seq_idx, offsets).to_object(py))
+            } else {
+                Some(offsets.to_object(py))
+            }
+        })
    }

    /// Get the word that contains the token at the given index
    ///
+    /// If the :class:`~tokenizers.Encoding` represents multiple sequences (namely
+    /// a pair of sequences), then this method returns a Tuple with both the relevant
+    /// sequence index, and the word index.
+    ///
    /// Args:
    ///     token_index (:obj:`int`):
    ///         The index of a token in the encoded sequence.
    ///
    /// Returns:
-    ///     :obj:`int`: The index of the word in the input sequence.
+    ///     :obj:`int` or :obj:`Tuple[int, int]`:
+    ///
+    ///     - For a single sequence: The index of the word in the input sequence: :obj:`int`
+    ///     - For pairs of sequence: A tuple with the sequence index, and the index of the word
+    ///       in the said sequence: :obj:`Tuple[int, int]`
+    ///
    #[text_signature = "($self, token_index)"]
-    fn token_to_word(&self, token_index: usize) -> Option<u32> {
-        self.encoding.token_to_word(token_index)
+    fn token_to_word(&self, token_index: usize) -> Option<PyObject> {
+        let (seq_idx, word_idx) = self.encoding.token_to_word(token_index)?;
+        Python::with_gil(|py| {
+            if self.encoding.n_sequences() > 1 {
+                Some((seq_idx, word_idx).to_object(py))
+            } else {
+                Some(word_idx.to_object(py))
+            }
+        })
    }

-    /// Get the token that contains the char at the given position
+    /// Get the token that contains the char at the given position in the input sequence.
    ///
    /// Args:
    ///     char_pos (:obj:`int`):
    ///         The position of a char in the input string
+    ///     sequence_index (:obj:`int`, defaults to :obj:`0`):
+    ///         The index of the sequence that contains the target char
    ///
    /// Returns:
    ///     :obj:`int`: The index of the token that contains this char in the encoded sequence
-    #[text_signature = "($self, char_pos)"]
-    fn char_to_token(&self, char_pos: usize) -> Option<usize> {
-        self.encoding.char_to_token(char_pos)
+    #[args(sequence_index = 0)]
+    #[text_signature = "($self, char_pos, sequence_index=0)"]
+    fn char_to_token(&self, char_pos: usize, sequence_index: usize) -> Option<usize> {
+        self.encoding.char_to_token(char_pos, sequence_index)
    }

-    /// Get the word that contains the char at the given position
+    /// Get the word that contains the char at the given position in the input sequence.
    ///
    /// Args:
    ///     char_pos (:obj:`int`):
    ///         The position of a char in the input string
+    ///     sequence_index (:obj:`int`, defaults to :obj:`0`):
+    ///         The index of the sequence that contains the target char
    ///
    /// Returns:
    ///     :obj:`int`: The index of the word that contains this char in the input sequence
-    #[text_signature = "($self, char_pos)"]
-    fn char_to_word(&self, char_pos: usize) -> Option<u32> {
-        self.encoding.char_to_word(char_pos)
+    #[args(sequence_index = 0)]
+    #[text_signature = "($self, char_pos, sequence_index=0)"]
+    fn char_to_word(&self, char_pos: usize, sequence_index: usize) -> Option<u32> {
+        self.encoding.char_to_word(char_pos, sequence_index)
    }

    /// Pad the :class:`~tokenizers.Encoding` at the given length
@ -336,6 +416,9 @@ impl PyEncoding {

    /// Truncate the :class:`~tokenizers.Encoding` at the given length
    ///
+    /// If this :class:`~tokenizers.Encoding` represents multiple sequences, when truncating
+    /// this information is lost. It will be considered as representing a single sequence.
+    ///
    /// Args:
    ///     max_length (:obj:`int`):
    ///         The desired length