mirror of
https://github.com/mii443/tokenizers.git
synced 2025-09-01 14:59:20 +00:00
Adding a new "Replace" normalizer that takes a string and replaces it
with another string (for now).
This commit is contained in:
@ -12,6 +12,7 @@ Strip = normalizers.Strip
|
||||
StripAccents = normalizers.StripAccents
|
||||
Nmt = normalizers.Nmt
|
||||
Precompiled = normalizers.Precompiled
|
||||
Replace = normalizers.Replace
|
||||
|
||||
|
||||
NORMALIZERS = {"nfc": NFC, "nfd": NFD, "nfkc": NFKC, "nfkd": NFKD}
|
||||
|
@ -112,11 +112,17 @@ class Nmt(Normalizer):
|
||||
pass
|
||||
|
||||
class Precompiled(Normalizer):
    """ Precompiled normalizer """

    # NOTE: the stale duplicate docstring (" SpmNmtNfkc normalizer ") was
    # removed — only the FIRST string literal in a class body acts as its
    # docstring, so the class was previously documented under the wrong name.
    # `precompiled_charsmap` is presumably the SentencePiece precompiled
    # normalization charsmap — TODO confirm against the Rust `Precompiled`.
    def __init__(self, precompiled_charsmap: bytes) -> Normalizer:
        pass
|
||||
|
||||
class Replace(Normalizer):
    """ Replace normalizer """

    # Replaces every occurrence of `pattern` in the input with `content`.
    # For now `pattern` is a plain string (no regex) — see the Rust
    # `Replace` normalizer for the actual implementation.
    def __init__(self, pattern: str, content: str) -> Normalizer:
        pass
|
||||
|
||||
def unicode_normalizer_from_str(normalizer: str) -> Normalizer:
|
||||
"""
|
||||
Instantiate unicode normalizer from the normalizer name
|
||||
|
@ -111,6 +111,7 @@ fn normalizers(_py: Python, m: &PyModule) -> PyResult<()> {
|
||||
m.add_class::<normalizers::PyStripAccents>()?;
|
||||
m.add_class::<normalizers::PyNmt>()?;
|
||||
m.add_class::<normalizers::PyPrecompiled>()?;
|
||||
m.add_class::<normalizers::PyReplace>()?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
@ -8,8 +8,13 @@ use crate::error::ToPyResult;
|
||||
use serde::ser::SerializeStruct;
|
||||
use serde::{Deserialize, Serialize, Serializer};
|
||||
use tk::normalizers::{
|
||||
<<<<<<< HEAD
|
||||
BertNormalizer, Lowercase, Nmt, NormalizerWrapper, Precompiled, Strip, StripAccents, NFC, NFD,
|
||||
NFKC, NFKD,
|
||||
=======
|
||||
BertNormalizer, Lowercase, Nmt, NormalizerWrapper, Precompiled, Replace, Strip, NFC, NFD, NFKC,
|
||||
NFKD,
|
||||
>>>>>>> Adding a new "Replace" normalizer that takes a string and replaces it
|
||||
};
|
||||
use tk::{NormalizedString, Normalizer};
|
||||
use tokenizers as tk;
|
||||
@ -54,6 +59,7 @@ impl PyNormalizer {
|
||||
NormalizerWrapper::Precompiled(_) => {
|
||||
Py::new(py, (PyPrecompiled {}, base)).map(Into::into)
|
||||
}
|
||||
NormalizerWrapper::Replace(_) => Py::new(py, (PyReplace {}, base)).map(Into::into),
|
||||
NormalizerWrapper::Nmt(_) => Py::new(py, (PyNmt {}, base)).map(Into::into),
|
||||
},
|
||||
}
|
||||
@ -324,6 +330,18 @@ impl PyPrecompiled {
|
||||
}
|
||||
}
|
||||
|
||||
#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name=Replace)]
|
||||
pub struct PyReplace {}
|
||||
#[pymethods]
|
||||
impl PyReplace {
|
||||
#[new]
|
||||
fn new(pattern: &PyString, content: &PyString) -> PyResult<(Self, PyNormalizer)> {
|
||||
let pattern: String = (*pattern).to_string()?.to_string();
|
||||
let content: String = (*content).to_string()?.to_string();
|
||||
Ok((PyReplace {}, Replace::new(pattern, content).into()))
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use pyo3::{AsPyRef, Python};
|
||||
|
@ -1,11 +1,13 @@
|
||||
pub mod bert;
|
||||
pub mod precompiled;
|
||||
pub mod replace;
|
||||
pub mod strip;
|
||||
pub mod unicode;
|
||||
pub mod utils;
|
||||
|
||||
pub use crate::normalizers::bert::BertNormalizer;
|
||||
pub use crate::normalizers::precompiled::Precompiled;
|
||||
pub use crate::normalizers::replace::Replace;
|
||||
pub use crate::normalizers::strip::{Strip, StripAccents};
|
||||
pub use crate::normalizers::unicode::{Nmt, NFC, NFD, NFKC, NFKD};
|
||||
pub use crate::normalizers::utils::{Lowercase, Sequence};
|
||||
@ -29,6 +31,7 @@ pub enum NormalizerWrapper {
|
||||
Lowercase(Lowercase),
|
||||
Nmt(Nmt),
|
||||
Precompiled(Precompiled),
|
||||
Replace(Replace),
|
||||
}
|
||||
|
||||
impl Normalizer for NormalizerWrapper {
|
||||
@ -45,6 +48,7 @@ impl Normalizer for NormalizerWrapper {
|
||||
NormalizerWrapper::Lowercase(lc) => lc.normalize(normalized),
|
||||
NormalizerWrapper::Nmt(lc) => lc.normalize(normalized),
|
||||
NormalizerWrapper::Precompiled(lc) => lc.normalize(normalized),
|
||||
NormalizerWrapper::Replace(lc) => lc.normalize(normalized),
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -60,3 +64,4 @@ impl_enum_from!(Sequence, NormalizerWrapper, Sequence);
|
||||
impl_enum_from!(Lowercase, NormalizerWrapper, Lowercase);
|
||||
impl_enum_from!(Nmt, NormalizerWrapper, Nmt);
|
||||
impl_enum_from!(Precompiled, NormalizerWrapper, Precompiled);
|
||||
impl_enum_from!(Replace, NormalizerWrapper, Replace);
|
||||
|
43
tokenizers/src/normalizers/replace.rs
Normal file
43
tokenizers/src/normalizers/replace.rs
Normal file
@ -0,0 +1,43 @@
|
||||
use crate::tokenizer::{NormalizedString, Normalizer, Result};
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
/// This normalizer will take a `pattern` (for now only a String)
|
||||
/// and replace every occurrence with `content`.
|
||||
#[derive(Deserialize, Serialize, Clone, Debug)]
|
||||
#[serde(tag = "type")]
|
||||
pub struct Replace {
|
||||
pattern: String,
|
||||
content: String,
|
||||
}
|
||||
|
||||
impl Replace {
|
||||
pub fn new(pattern: String, content: String) -> Self {
|
||||
Self { pattern, content }
|
||||
}
|
||||
}
|
||||
|
||||
impl Normalizer for Replace {
|
||||
fn normalize(&self, normalized: &mut NormalizedString) -> Result<()> {
|
||||
let pattern: &str = &self.pattern;
|
||||
normalized.replace(pattern, &self.content)?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Doubled single quotes should be rewritten as a standard double quote.
    #[test]
    fn test_replace() {
        let mut input = NormalizedString::from("This is a ''test''");

        let replacer = Replace::new("''".to_string(), "\"".to_string());
        replacer.normalize(&mut input).unwrap();

        assert_eq!(input.get(), "This is a \"test\"");
    }
}
|
Reference in New Issue
Block a user