mirror of
https://github.com/mii443/tokenizers.git
synced 2025-12-07 05:08:24 +00:00
Adding a new "Replace" normalizer that takes a string and replaces it
with another String (for now).
This commit is contained in:
@@ -12,6 +12,7 @@ Strip = normalizers.Strip
|
||||
StripAccents = normalizers.StripAccents
|
||||
Nmt = normalizers.Nmt
|
||||
Precompiled = normalizers.Precompiled
|
||||
Replace = normalizers.Replace
|
||||
|
||||
|
||||
NORMALIZERS = {"nfc": NFC, "nfd": NFD, "nfkc": NFKC, "nfkd": NFKD}
|
||||
|
||||
@@ -112,11 +112,17 @@ class Nmt(Normalizer):
|
||||
pass
|
||||
|
||||
class Precompiled(Normalizer):
    """ Precompiled normalizer

    Normalizer built from a precompiled character map.
    """

    def __init__(self, precompiled_charsmap: bytes) -> Normalizer:
        """ Instantiate a new Precompiled normalizer

        Args:
            precompiled_charsmap: bytes:
                The precompiled character map, passed as raw bytes.
                NOTE(review): the expected byte format is not visible from this
                stub - confirm against the Rust implementation.
        """
        pass
|
||||
|
||||
class Replace(Normalizer):
    """ Replace normalizer

    Takes a string pattern and replaces it with another string.
    """

    def __init__(self, pattern: str, content: str) -> Normalizer:
        """ Instantiate a new Replace normalizer

        Args:
            pattern: str:
                The string to look for.
            content: str:
                The string that is substituted for `pattern`.
        """
        pass
|
||||
|
||||
def unicode_normalizer_from_str(normalizer: str) -> Normalizer:
|
||||
"""
|
||||
Instantiate unicode normalizer from the normalizer name
|
||||
|
||||
@@ -111,6 +111,7 @@ fn normalizers(_py: Python, m: &PyModule) -> PyResult<()> {
|
||||
m.add_class::<normalizers::PyStripAccents>()?;
|
||||
m.add_class::<normalizers::PyNmt>()?;
|
||||
m.add_class::<normalizers::PyPrecompiled>()?;
|
||||
m.add_class::<normalizers::PyReplace>()?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
||||
@@ -8,8 +8,13 @@ use crate::error::ToPyResult;
|
||||
use serde::ser::SerializeStruct;
|
||||
use serde::{Deserialize, Serialize, Serializer};
|
||||
use tk::normalizers::{
|
||||
<<<<<<< HEAD
|
||||
BertNormalizer, Lowercase, Nmt, NormalizerWrapper, Precompiled, Strip, StripAccents, NFC, NFD,
|
||||
NFKC, NFKD,
|
||||
=======
|
||||
BertNormalizer, Lowercase, Nmt, NormalizerWrapper, Precompiled, Replace, Strip, NFC, NFD, NFKC,
|
||||
NFKD,
|
||||
>>>>>>> Adding a new "Replace" normalizer that takes a string and replaces it
|
||||
};
|
||||
use tk::{NormalizedString, Normalizer};
|
||||
use tokenizers as tk;
|
||||
@@ -54,6 +59,7 @@ impl PyNormalizer {
|
||||
NormalizerWrapper::Precompiled(_) => {
|
||||
Py::new(py, (PyPrecompiled {}, base)).map(Into::into)
|
||||
}
|
||||
NormalizerWrapper::Replace(_) => Py::new(py, (PyReplace {}, base)).map(Into::into),
|
||||
NormalizerWrapper::Nmt(_) => Py::new(py, (PyNmt {}, base)).map(Into::into),
|
||||
},
|
||||
}
|
||||
@@ -324,6 +330,18 @@ impl PyPrecompiled {
|
||||
}
|
||||
}
|
||||
|
||||
#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name=Replace)]
|
||||
pub struct PyReplace {}
|
||||
#[pymethods]
|
||||
impl PyReplace {
|
||||
#[new]
|
||||
fn new(pattern: &PyString, content: &PyString) -> PyResult<(Self, PyNormalizer)> {
|
||||
let pattern: String = (*pattern).to_string()?.to_string();
|
||||
let content: String = (*content).to_string()?.to_string();
|
||||
Ok((PyReplace {}, Replace::new(pattern, content).into()))
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use pyo3::{AsPyRef, Python};
|
||||
|
||||
Reference in New Issue
Block a user