words -> word_ids & sequences -> sequence_ids

Anthony MOI
2020-11-09 15:14:14 -05:00
committed by Anthony MOI
parent 57d162b269
commit d3d9f2c76b
11 changed files with 70 additions and 44 deletions


@@ -5,7 +5,7 @@ use pyo3::{PyObjectProtocol, PySequenceProtocol};
 use tk::tokenizer::{Offsets, PaddingDirection};
 use tokenizers as tk;
 
-use crate::error::PyError;
+use crate::error::{deprecation_warning, PyError};
 
 /// The :class:`~tokenizers.Encoding` represents the output of a :class:`~tokenizers.Tokenizer`.
 #[pyclass(dict, module = "tokenizers", name=Encoding)]
@@ -137,6 +137,10 @@ impl PyEncoding {
     /// The generated word indices.
     ///
+    /// .. warning::
+    ///     This is deprecated and will be removed in a future version.
+    ///     Please use :obj:`~tokenizers.Encoding.word_ids` instead.
+    ///
     /// They represent the index of the word associated to each token.
     /// When the input is pre-tokenized, they correspond to the ID of the given input label,
     /// otherwise they correspond to the words indices as defined by the
@@ -148,8 +152,29 @@ impl PyEncoding {
     /// Returns:
     ///     A :obj:`List` of :obj:`Optional[int]`: A list of optional word index.
     #[getter]
-    fn get_words(&self) -> Vec<Option<u32>> {
-        self.encoding.get_words().to_vec()
+    fn get_words(&self) -> PyResult<Vec<Option<u32>>> {
+        deprecation_warning(
+            "0.9.4",
+            "Encoding.words is deprecated, please use Encoding.word_ids instead.",
+        )?;
+        Ok(self.get_word_ids())
+    }
+
+    /// The generated word indices.
+    ///
+    /// They represent the index of the word associated to each token.
+    /// When the input is pre-tokenized, they correspond to the ID of the given input label,
+    /// otherwise they correspond to the words indices as defined by the
+    /// :class:`~tokenizers.pre_tokenizers.PreTokenizer` that was used.
+    ///
+    /// For special tokens and such (any token that was generated from something that was
+    /// not part of the input), the output is :obj:`None`
+    ///
+    /// Returns:
+    ///     A :obj:`List` of :obj:`Optional[int]`: A list of optional word index.
+    #[getter]
+    fn get_word_ids(&self) -> Vec<Option<u32>> {
+        self.encoding.get_word_ids().to_vec()
     }
 
     /// The generated sequence indices.
@@ -161,8 +186,8 @@ impl PyEncoding {
     /// Returns:
     ///     A :obj:`List` of :obj:`Optional[int]`: A list of optional sequence index.
     #[getter]
-    fn get_sequences(&self) -> Vec<Option<usize>> {
-        self.encoding.get_sequences()
+    fn get_sequence_ids(&self) -> Vec<Option<usize>> {
+        self.encoding.get_sequence_ids()
     }
 
     /// The generated type IDs
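
In effect, the bindings now expose `word_ids` and `sequence_ids`, keep `words` as a deprecated shim, and drop `sequences` outright. A minimal Python-side sketch of the resulting behavior (assumptions: `tokenizer` is built as in the test fixture below, `deprecation_warning` surfaces as a Python `DeprecationWarning`, and the word index values shown are illustrative):

import warnings

# `tokenizer` is assumed to be set up as in the test fixture below.
encoding = tokenizer.encode("I love HuggingFace", "Do you?")

# New accessors introduced by this commit:
word_ids = encoding.word_ids          # per-token word index, None for special tokens
sequence_ids = encoding.sequence_ids  # [None, 0, 0, 0, 0, None, 1, 1, 1, None]

# The old `words` property still works but now warns before delegating:
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    assert encoding.words == word_ids
assert any(issubclass(w.category, DeprecationWarning) for w in caught)

# Note: `sequences` gets no such shim; it is renamed outright to `sequence_ids`.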


@@ -12,11 +12,11 @@ class TestEncoding:
         pair_encoding = tokenizer.encode("I love HuggingFace", "Do you?")
         return single_encoding, pair_encoding
 
-    def test_sequences(self, encodings):
+    def test_sequence_ids(self, encodings):
         single, pair = encodings
-        assert single.sequences == [None, 0, 0, 0, 0, None]
-        assert pair.sequences == [None, 0, 0, 0, 0, None, 1, 1, 1, None]
+        assert single.sequence_ids == [None, 0, 0, 0, 0, None]
+        assert pair.sequence_ids == [None, 0, 0, 0, 0, None, 1, 1, 1, None]
 
     def test_n_sequences(self, encodings):
         single, pair = encodings
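
The diff only renames the existing sequences test; a companion check for the deprecated `words` shim could look like the following sketch (the test name is hypothetical and not part of this commit; it assumes `import pytest` at module level and reuses the `encodings` fixture above):

    def test_words_deprecated(self, encodings):
        single, _ = encodings
        # The deprecated `words` property should emit a DeprecationWarning
        # and return the same values as the new `word_ids` accessor.
        with pytest.deprecated_call():
            words = single.words
        assert words == single.word_ids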