Adding a new "Replace" normalizer that takes a string and replaces it

with another String (for now).
2025-09-02 07:19:24 +00:00 · 2020-09-16 11:13:37 +02:00
parent 1a4a4649c3
commit 792d618006
6 changed files with 75 additions and 1 deletions
--- a/bindings/python/py_src/tokenizers/normalizers/init.py
+++ b/bindings/python/py_src/tokenizers/normalizers/init.py
@ -12,6 +12,7 @@ Strip = normalizers.Strip
 StripAccents = normalizers.StripAccents
 Nmt = normalizers.Nmt
 Precompiled = normalizers.Precompiled
 Replace = normalizers.Replace
 NORMALIZERS = {"nfc": NFC, "nfd": NFD, "nfkc": NFKC, "nfkd": NFKD}
--- a/bindings/python/py_src/tokenizers/normalizers/init.pyi
+++ b/bindings/python/py_src/tokenizers/normalizers/init.pyi
@ -112,11 +112,17 @@ class Nmt(Normalizer):
        pass
 class Precompiled(Normalizer):
-    """ SpmNmtNfkc normalizer """
+    """ Precompiled normalizer """
    def __init__(self, precompiled_charsmap: bytes) -> Normalizer:
        pass
 class Replace(Normalizer):
    """ Replace normalizer

    Replaces every occurrence of `pattern` with `content`.
    For now `pattern` can only be a plain string.
    """
    def __init__(self, pattern: str, content: str) -> Normalizer:
        """
        Args:
            pattern: the string to look for
            content: the string that replaces each occurrence of `pattern`
        """
        pass
 def unicode_normalizer_from_str(normalizer: str) -> Normalizer:
    """
    Instantiate unicode normalizer from the normalizer name
--- a/bindings/python/src/lib.rs
+++ b/bindings/python/src/lib.rs
@ -111,6 +111,7 @@ fn normalizers(_py: Python, m: &PyModule) -> PyResult<()> {
    m.add_class::<normalizers::PyStripAccents>()?;
    m.add_class::<normalizers::PyNmt>()?;
    m.add_class::<normalizers::PyPrecompiled>()?;
    m.add_class::<normalizers::PyReplace>()?;
    Ok(())
 }
--- a/bindings/python/src/normalizers.rs
+++ b/bindings/python/src/normalizers.rs
@ -8,8 +8,13 @@ use crate::error::ToPyResult;
 use serde::ser::SerializeStruct;
 use serde::{Deserialize, Serialize, Serializer};
 use tk::normalizers::{
 <<<<<<< HEAD
    BertNormalizer, Lowercase, Nmt, NormalizerWrapper, Precompiled, Strip, StripAccents, NFC, NFD,
    NFKC, NFKD,
 =======
    BertNormalizer, Lowercase, Nmt, NormalizerWrapper, Precompiled, Replace, Strip, NFC, NFD, NFKC,
    NFKD,
 >>>>>>> Adding a new "Replace" normalizer that takes a string and replaces it
 };
 use tk::{NormalizedString, Normalizer};
 use tokenizers as tk;
@ -54,6 +59,7 @@ impl PyNormalizer {
                NormalizerWrapper::Precompiled(_) => {
                    Py::new(py, (PyPrecompiled {}, base)).map(Into::into)
                }
                NormalizerWrapper::Replace(_) => Py::new(py, (PyReplace {}, base)).map(Into::into),
                NormalizerWrapper::Nmt(_) => Py::new(py, (PyNmt {}, base)).map(Into::into),
            },
        }
@ -324,6 +330,18 @@ impl PyPrecompiled {
    }
 }
 #[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name=Replace)]
 pub struct PyReplace {}
 #[pymethods]
 impl PyReplace {
    #[new]
    fn new(pattern: &PyString, content: &PyString) -> PyResult<(Self, PyNormalizer)> {
        let pattern: String = (*pattern).to_string()?.to_string();
        let content: String = (*content).to_string()?.to_string();
        Ok((PyReplace {}, Replace::new(pattern, content).into()))
    }
 }
 #[cfg(test)]
 mod test {
    use pyo3::{AsPyRef, Python};
--- a/tokenizers/src/normalizers/mod.rs
+++ b/tokenizers/src/normalizers/mod.rs
@ -1,11 +1,13 @@
 pub mod bert;
 pub mod precompiled;
 pub mod replace;
 pub mod strip;
 pub mod unicode;
 pub mod utils;
 pub use crate::normalizers::bert::BertNormalizer;
 pub use crate::normalizers::precompiled::Precompiled;
 pub use crate::normalizers::replace::Replace;
 pub use crate::normalizers::strip::{Strip, StripAccents};
 pub use crate::normalizers::unicode::{Nmt, NFC, NFD, NFKC, NFKD};
 pub use crate::normalizers::utils::{Lowercase, Sequence};
@ -29,6 +31,7 @@ pub enum NormalizerWrapper {
    Lowercase(Lowercase),
    Nmt(Nmt),
    Precompiled(Precompiled),
    Replace(Replace),
 }
 impl Normalizer for NormalizerWrapper {
@ -45,6 +48,7 @@ impl Normalizer for NormalizerWrapper {
            NormalizerWrapper::Lowercase(lc) => lc.normalize(normalized),
            NormalizerWrapper::Nmt(lc) => lc.normalize(normalized),
            NormalizerWrapper::Precompiled(lc) => lc.normalize(normalized),
            NormalizerWrapper::Replace(lc) => lc.normalize(normalized),
        }
    }
 }
@ -60,3 +64,4 @@ impl_enum_from!(Sequence, NormalizerWrapper, Sequence);
 impl_enum_from!(Lowercase, NormalizerWrapper, Lowercase);
 impl_enum_from!(Nmt, NormalizerWrapper, Nmt);
 impl_enum_from!(Precompiled, NormalizerWrapper, Precompiled);
 impl_enum_from!(Replace, NormalizerWrapper, Replace);
--- a/tokenizers/src/normalizers/replace.rs
+++ b/tokenizers/src/normalizers/replace.rs
@ -0,0 +1,43 @@
 use crate::tokenizer::{NormalizedString, Normalizer, Result};
 use serde::{Deserialize, Serialize};
 /// Normalizer that swaps every occurrence of `pattern` for `content`.
 ///
 /// For the time being `pattern` can only be a `String`.
 #[derive(Clone, Debug, Deserialize, Serialize)]
 #[serde(tag = "type")]
 pub struct Replace {
    /// Text searched for in the string being normalized.
    pattern: String,
    /// Text inserted in place of each occurrence of `pattern`.
    content: String,
 }
 impl Replace {
    /// Builds a normalizer that replaces every occurrence of `pattern` with `content`.
    ///
    /// Both arguments accept anything convertible into a `String`, so existing callers
    /// passing owned `String`s keep working while `&str` literals can now be passed
    /// directly.
    pub fn new(pattern: impl Into<String>, content: impl Into<String>) -> Self {
        Self {
            pattern: pattern.into(),
            content: content.into(),
        }
    }
 }
 impl Normalizer for Replace {
    /// Applies the replacement to `normalized` in place by delegating to
    /// `NormalizedString::replace`; an error from that call is propagated.
    fn normalize(&self, normalized: &mut NormalizedString) -> Result<()> {
        let Self { pattern, content } = self;
        normalized.replace(pattern.as_str(), content)?;
        Ok(())
    }
 }
 #[cfg(test)]
 mod tests {
    use super::*;
    // Each pair of single quotes in the input must come out as one double quote.
    #[test]
    fn test_replace() {
        let replacer = Replace::new(String::from("''"), String::from("\""));
        let mut input = NormalizedString::from("This is a ''test''");
        replacer.normalize(&mut input).unwrap();
        let expected = "This is a \"test\"";
        assert_eq!(&input.get(), &expected);
    }
 }