Fixing the stream by removing the read_index altogether. (#1716)

* Fixing the stream by removing the read_index altogether.

* Moving the test location because... Windows.

* Ok whatever.

* Rust 1.84

* Fmt.
Author: Nicolas Patry
Date: 2025-01-09 17:41:15 +01:00
Committed by: GitHub
Parent: 862d1a346a
Commit: 0ff2ab0f64
8 changed files with 89 additions and 132 deletions


@@ -14,8 +14,8 @@ style:
 # Check the source code is formatted correctly
 check-style:
 	python stub.py --check
-	ruff check examples py_src/tokenizers tests
-	ruff format --check examples py_src/tokenizers tests
+	ruff check $(check_dirs)
+	ruff format --check $(check_dirs)
 
 TESTS_RESOURCES = $(DATA_DIR)/small.txt $(DATA_DIR)/roberta.json
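(`$(check_dirs)` is not defined in this hunk; presumably it is a variable introduced earlier in the Makefile and set to the old literal list, `examples py_src/tokenizers tests`, so the lint and format targets share one source of truth.)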


@@ -241,7 +241,7 @@ class EncodingVisualizer:
             # In this case we are looking at a group/single char that is not tokenized.
             # e.g. white space
             css_classes.append("non-token")
-        css = f'''class="{' '.join(css_classes)}"'''
+        css = f'''class="{" ".join(css_classes)}"'''
         data = ""
         for key, val in data_items.items():
             data += f' data-{key}="{val}"'
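The quote flip inside the f-string replacement field is behavior-neutral (the outer triple quotes already avoid any delimiter clash); it matches ruff format's default preference for double quotes, as checked by the `ruff format --check` target above.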


@@ -646,11 +646,6 @@ pub struct PyDecodeStream {
     /// The index within the ids corresponding to the prefix so we can drain
     /// correctly
     prefix_index: usize,
-    /// We need to keep 2 prefixes.
-    /// Prefix is the second one that was already emitted to discard the part
-    /// of the text of all the ids
-    /// read is the prefix kept only for starting side effects of the prefix
-    read_index: usize,
 }
 
 #[pymethods]

@@ -663,7 +658,6 @@ impl PyDecodeStream {
             ids: vec![],
             prefix: "".to_string(),
             prefix_index: 0,
-            read_index: 0,
         }
     }

@@ -676,7 +670,6 @@ impl PyDecodeStream {
             &mut self.ids,
             &mut self.prefix,
             &mut self.prefix_index,
-            &mut self.read_index,
         ))
         .into()
     }
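
Taken together, the diff leaves `PyDecodeStream` with only three pieces of state: the sliding window of `ids`, the decoded `prefix` of that window, and `prefix_index`, the point at which the window is cut when it slides. The sketch below is a self-contained illustration of why that triple is enough for streaming decode; it is not the tokenizers implementation: `decode` is a toy stand-in for `Tokenizer::decode`, and the real `step_decode_stream` additionally checks that the new text really extends the prefix and holds back incomplete UTF-8 (a trailing '�').

// Toy stand-in for Tokenizer::decode: joins the piece for each id.
fn decode(ids: &[u32]) -> String {
    ids.iter()
        .map(|&id| match id {
            0 => "Hello",
            1 => ",",
            2 => " world",
            3 => "!",
            _ => "?",
        })
        .collect()
}

#[derive(Default)]
struct DecodeStream {
    ids: Vec<u32>,       // sliding window of ids still needed as context
    prefix: String,      // decoded text of the window, already emitted
    prefix_index: usize, // how many leading ids to drop on the next slide
}

impl DecodeStream {
    /// Push one id, return any newly produced text.
    fn step(&mut self, id: u32) -> Option<String> {
        self.ids.push(id);
        let string = decode(&self.ids);
        // Emit only the part of the decoded window that extends `prefix`.
        if string.len() > self.prefix.len() {
            let new_text = string[self.prefix.len()..].to_string();
            // Slide the window: drop the ids that produced the old prefix,
            // then recompute prefix/prefix_index for the shrunk window.
            let new_prefix_index = self.ids.len() - self.prefix_index;
            self.ids.drain(..self.prefix_index);
            self.prefix = decode(&self.ids);
            self.prefix_index = new_prefix_index;
            Some(new_text)
        } else {
            None
        }
    }
}

fn main() {
    let mut stream = DecodeStream::default();
    for id in [0, 1, 2, 3] {
        if let Some(chunk) = stream.step(id) {
            print!("{chunk}"); // prints "Hello, world!" piece by piece
        }
    }
    println!();
}

Feeding ids one at a time prints the text incrementally while the window stays bounded; the emitted-text boundary is fully determined by `prefix` and `prefix_index` alone, which is presumably why the extra `read_index` bookkeeping could be dropped.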