Fixing the stream by removing the read_index altogether. (#1716)

* Fixing the stream by removing the read_index altogether.

* Moving the test location because... Windows.

* Ok whatever.

* Rust 1.84

* Fmt.
Author: Nicolas Patry
Date: 2025-01-09 17:41:15 +01:00
Committed by: GitHub
Parent: 862d1a346a
Commit: 0ff2ab0f64
8 changed files with 89 additions and 132 deletions


@@ -14,8 +14,8 @@ style:
 # Check the source code is formatted correctly
 check-style:
 	python stub.py --check
-	ruff check examples py_src/tokenizers tests
-	ruff format --check examples py_src/tokenizers tests
+	ruff check $(check_dirs)
+	ruff format --check $(check_dirs)
 
 TESTS_RESOURCES = $(DATA_DIR)/small.txt $(DATA_DIR)/roberta.json
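(`$(check_dirs)` is not defined in this hunk; presumably it is a variable introduced earlier in the Makefile and set to the old literal list, `examples py_src/tokenizers tests`, so the lint and format targets share one source of truth.)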


@@ -241,7 +241,7 @@ class EncodingVisualizer:
             # In this case we are looking at a group/single char that is not tokenized.
             # e.g. white space
             css_classes.append("non-token")
-        css = f'''class="{' '.join(css_classes)}"'''
+        css = f'''class="{" ".join(css_classes)}"'''
         data = ""
         for key, val in data_items.items():
             data += f' data-{key}="{val}"'
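The quote flip inside the f-string replacement field is behavior-neutral (the outer triple quotes already avoid any delimiter clash); it matches ruff format's default preference for double quotes, as checked by the `ruff format --check` target above.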


@@ -646,11 +646,6 @@ pub struct PyDecodeStream {
     /// The index within the ids corresponding to the prefix so we can drain
     /// correctly
     prefix_index: usize,
-    /// We need to keep 2 prefixes.
-    /// Prefix is the second one that was already emitted to discard the part
-    /// of the text of all the ids
-    /// read is the prefix kept only for starting side effects of the prefix
-    read_index: usize,
 }
 
 #[pymethods]

@@ -663,7 +658,6 @@ impl PyDecodeStream {
             ids: vec![],
             prefix: "".to_string(),
             prefix_index: 0,
-            read_index: 0,
         }
     }

@@ -676,7 +670,6 @@ impl PyDecodeStream {
             &mut self.ids,
             &mut self.prefix,
             &mut self.prefix_index,
-            &mut self.read_index,
         ))
         .into()
     }
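
Taken together, the diff leaves `PyDecodeStream` with only three pieces of state: the sliding window of `ids`, the decoded `prefix` of that window, and `prefix_index`, the point at which the window is cut when it slides. The sketch below is a self-contained illustration of why that triple is enough for streaming decode; it is not the tokenizers implementation: `decode` is a toy stand-in for `Tokenizer::decode`, and the real `step_decode_stream` additionally checks that the new text really extends the prefix and holds back incomplete UTF-8 (a trailing '�').

// Toy stand-in for Tokenizer::decode: joins the piece for each id.
fn decode(ids: &[u32]) -> String {
    ids.iter()
        .map(|&id| match id {
            0 => "Hello",
            1 => ",",
            2 => " world",
            3 => "!",
            _ => "?",
        })
        .collect()
}

#[derive(Default)]
struct DecodeStream {
    ids: Vec<u32>,       // sliding window of ids still needed as context
    prefix: String,      // decoded text of the window, already emitted
    prefix_index: usize, // how many leading ids to drop on the next slide
}

impl DecodeStream {
    /// Push one id, return any newly produced text.
    fn step(&mut self, id: u32) -> Option<String> {
        self.ids.push(id);
        let string = decode(&self.ids);
        // Emit only the part of the decoded window that extends `prefix`.
        if string.len() > self.prefix.len() {
            let new_text = string[self.prefix.len()..].to_string();
            // Slide the window: drop the ids that produced the old prefix,
            // then recompute prefix/prefix_index for the shrunk window.
            let new_prefix_index = self.ids.len() - self.prefix_index;
            self.ids.drain(..self.prefix_index);
            self.prefix = decode(&self.ids);
            self.prefix_index = new_prefix_index;
            Some(new_text)
        } else {
            None
        }
    }
}

fn main() {
    let mut stream = DecodeStream::default();
    for id in [0, 1, 2, 3] {
        if let Some(chunk) = stream.step(id) {
            print!("{chunk}"); // prints "Hello, world!" piece by piece
        }
    }
    println!();
}

Feeding ids one at a time prints the text incrementally while the window stays bounded; the emitted-text boundary is fully determined by `prefix` and `prefix_index` alone, which is presumably why the extra `read_index` bookkeeping could be dropped.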