Mirror of https://github.com/mii443/tokenizers.git
Fixing the stream by removing the read_index altogether. (#1716)
* Fixing the stream by removing the read_index altogether.
* Moving the test location because... Windows.
* Ok whatever.
* Rust 1.84
* Fmt.
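For context, the stream being fixed here is PyDecodeStream, which backs streaming detokenization in the Python bindings: ids are fed in one at a time and text is emitted as soon as it can be decoded unambiguously. A minimal usage sketch, assuming the DecodeStream class exposed via tokenizers.decoders, its step(tokenizer, id) signature, and an illustrative "gpt2" model:

    from tokenizers import Tokenizer
    from tokenizers.decoders import DecodeStream

    tokenizer = Tokenizer.from_pretrained("gpt2")  # illustrative model choice
    stream = DecodeStream(skip_special_tokens=False)

    # Feed ids one by one; step() returns None while it is still buffering
    # (e.g. an incomplete UTF-8 sequence) and a text chunk once it can flush.
    for token_id in tokenizer.encode("Hello, streaming world!").ids:
        chunk = stream.step(tokenizer, token_id)
        if chunk is not None:
            print(chunk, end="", flush=True)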
@@ -14,8 +14,8 @@ style:

 # Check the source code is formatted correctly
 check-style:
 	python stub.py --check
-	ruff check examples py_src/tokenizers tests
-	ruff format --check examples py_src/tokenizers tests
+	ruff check $(check_dirs)
+	ruff format --check $(check_dirs)

 TESTS_RESOURCES = $(DATA_DIR)/small.txt $(DATA_DIR)/roberta.json
@@ -241,7 +241,7 @@ class EncodingVisualizer:
             # In this case we are looking at a group/single char that is not tokenized.
             # e.g. white space
             css_classes.append("non-token")
-        css = f'''class="{' '.join(css_classes)}"'''
+        css = f'''class="{" ".join(css_classes)}"'''
         data = ""
         for key, val in data_items.items():
             data += f' data-{key}="{val}"'
@@ -646,11 +646,6 @@ pub struct PyDecodeStream {
     /// The index within the ids corresponding to the prefix so we can drain
     /// correctly
     prefix_index: usize,
-    /// We need to keep 2 prefixes.
-    /// Prefix is the second one that was already emitted to discard the part
-    /// of the text of all the ids
-    /// read is the prefix kept only for starting side effects of the prefix
-    read_index: usize,
 }

 #[pymethods]
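The commit's premise is that prefix and prefix_index alone suffice to track what has already been emitted, so read_index carried redundant state. Roughly, each step decodes the buffered ids, flushes whatever extends the prefix once the text no longer ends mid-character, and then re-anchors the prefix. A Python sketch of that loop (names are illustrative, not the bindings' API):

    def step_decode(tokenizer, ids, prefix, prefix_index, new_id):
        """One streaming-decode step over the state (ids, prefix, prefix_index)."""
        ids.append(new_id)
        text = tokenizer.decode(ids, skip_special_tokens=False)
        # Only flush when the text grew past the prefix and does not end in an
        # incomplete UTF-8 sequence (rendered as the replacement character).
        if len(text) > len(prefix) and not text.endswith("\ufffd"):
            new_text = text[len(prefix):]
            new_prefix_index = len(ids) - prefix_index
            del ids[:prefix_index]  # drop ids fully covered by the old prefix
            prefix = tokenizer.decode(ids, skip_special_tokens=False)
            return new_text, ids, prefix, new_prefix_index
        return None, ids, prefix, prefix_index

The sketch returns the updated state to stay functional; the Rust code mutates the same three fields through &mut references, as the hunks below show.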
@@ -663,7 +658,6 @@ impl PyDecodeStream {
             ids: vec![],
             prefix: "".to_string(),
             prefix_index: 0,
-            read_index: 0,
         }
     }

@@ -676,7 +670,6 @@ impl PyDecodeStream {
             &mut self.ids,
             &mut self.prefix,
             &mut self.prefix_index,
-            &mut self.read_index,
         ))
         .into()
     }