Support None to reset pre_tokenizers and normalizers, and index sequences (#1590)

* initial commit

* support None

* fix clippy

* cleanup

* clean?

* propagate to pre_tokenizer

* fix test

* fix rust tests

* fix node

* propagate to decoder and post processor

* fix calls

* lint

* fmt

* node be happy I am fixing you

* add a small test

* styling

* style merge

* fix merge test

* fmt

* nits

* update test
Author: Arthur
Date: 2024-08-07 12:52:35 +02:00 (committed via GitHub)
Parent: eea8e1ae6f
Commit: bded212356
13 changed files with 134 additions and 67 deletions
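
In Python terms, the two behaviours named in the title look roughly like the sketch below. This is a hedged illustration, not code from the diff: the BPE model and the particular normalizers are arbitrary choices, but setting a component to None and indexing a Sequence are exactly what this commit adds.

from tokenizers import Tokenizer, normalizers
from tokenizers.models import BPE

tokenizer = Tokenizer(BPE())
tokenizer.normalizer = normalizers.Sequence(
    [normalizers.NFC(), normalizers.Lowercase()]
)

# Sequence normalizers are now indexable, each element coming back
# as its concrete subtype.
first = tokenizer.normalizer[0]   # the NFC normalizer
second = tokenizer.normalizer[1]  # the Lowercase normalizer

# Assigning None resets the component instead of raising; per the
# commit list above this was also propagated to pre_tokenizer,
# decoder, and post_processor.
tokenizer.normalizer = None
tokenizer.pre_tokenizer = None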

@@ -1,8 +1,6 @@
-use std::sync::{Arc, RwLock};
-
-use pyo3::exceptions;
-use pyo3::prelude::*;
-use pyo3::types::*;
+use pyo3::{exceptions, prelude::*};
+use std::sync::{Arc, RwLock};
 
 use crate::error::ToPyResult;
 use crate::utils::{PyNormalizedString, PyNormalizedStringRefMut, PyPattern};
@@ -354,6 +352,7 @@ impl PyNFKC
 /// A list of Normalizer to be run as a sequence
 #[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name = "Sequence")]
 pub struct PySequence {}
+
 #[pymethods]
 impl PySequence {
     #[new]
@@ -380,6 +379,22 @@ impl PySequence
     fn __len__(&self) -> usize {
         0
     }
+
+    fn __getitem__(self_: PyRef<'_, Self>, py: Python<'_>, index: usize) -> PyResult<Py<PyAny>> {
+        match &self_.as_ref().normalizer {
+            PyNormalizerTypeWrapper::Sequence(inner) => match inner.get(index) {
+                Some(item) => PyNormalizer::new(PyNormalizerTypeWrapper::Single(Arc::clone(item)))
+                    .get_as_subtype(py),
+                _ => Err(PyErr::new::<pyo3::exceptions::PyIndexError, _>(
+                    "Index not found",
+                )),
+            },
+            PyNormalizerTypeWrapper::Single(inner) => {
+                PyNormalizer::new(PyNormalizerTypeWrapper::Single(Arc::clone(inner)))
+                    .get_as_subtype(py)
+            }
+        }
+    }
 }
 
 /// Lowercase Normalizer
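
The __getitem__ above hands the selected element back wrapped as its concrete Python subtype via get_as_subtype; an out-of-range index on a Sequence raises IndexError ("Index not found"), and a normalizer wrapped as Single returns itself for any index. A short sketch of the resulting Python behaviour (the concrete normalizer classes are illustrative, not taken from the diff):

from tokenizers import normalizers

seq = normalizers.Sequence([normalizers.NFD(), normalizers.StripAccents()])
assert isinstance(seq[0], normalizers.NFD)
assert isinstance(seq[1], normalizers.StripAccents)

try:
    seq[5]            # out of range for a two-element Sequence
except IndexError:
    pass              # "Index not found", raised from the Rust match above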