Adding a new normalizer that strips accents by removing combining characters (#416)

* Adding a new normalizer that strips accents by removing combining characters in Unicode strings.
* Adding Node bindings + better normalizer impl.
* Doc comment -> Regular comment.
2025-08-22 16:25:30 +00:00 · 2020-09-17 09:49:41 +02:00
parent 330876ae02
commit 75464734df
10 changed files with 130 additions and 4 deletions
--- a/bindings/node/lib/bindings/normalizers.d.ts
+++ b/bindings/node/lib/bindings/normalizers.d.ts
@ -75,3 +75,8 @@ export function sequenceNormalizer(normalizers: Normalizer[]): Normalizer;
 * @param [right=true] Whether or not to strip on the right (defaults to `true`)
 */
 export function stripNormalizer(left?: boolean, right?: boolean): Normalizer;
+
+/**
+ * Returns a new StripAccents Normalizer, which removes Unicode combining marks (accents) from the input
+ */
+export function stripAccentsNormalizer(): Normalizer;
--- a/bindings/node/lib/bindings/normalizers.js
+++ b/bindings/node/lib/bindings/normalizers.js
@ -9,4 +9,5 @@ module.exports = {
  nfkdNormalizer: native.normalizers_NFKD,
  sequenceNormalizer: native.normalizers_Sequence,
  stripNormalizer: native.normalizers_Strip,
+  stripAccentsNormalizer: native.normalizers_StripAccents,
 };
--- a/bindings/node/lib/bindings/normalizers.test.ts
+++ b/bindings/node/lib/bindings/normalizers.test.ts
@ -1,4 +1,4 @@
-import { stripNormalizer } from "./normalizers";
+import { stripAccentsNormalizer, stripNormalizer } from "./normalizers";

 describe("stripNormalizer", () => {
  it("instantiates with no parameters", () => {
@ -24,3 +24,10 @@ describe("stripNormalizer", () => {
    expect(normalizer.constructor.name).toEqual("Normalizer");
  });
 });
+
+describe("stripAccentsNormalizer", () => {
+  it("initialize", () => {
+    const normalizer = stripAccentsNormalizer();
+    expect(normalizer.constructor.name).toEqual("Normalizer");
+  });
+});
--- a/bindings/node/native/src/normalizers.rs
+++ b/bindings/node/native/src/normalizers.rs
@ -160,6 +160,14 @@ fn strip(mut cx: FunctionContext) -> JsResult<JsNormalizer> {

    Ok(normalizer)
 }
+/// strip_accents() -- takes no arguments; returns a Normalizer wrapping `StripAccents`
+fn strip_accents(mut cx: FunctionContext) -> JsResult<JsNormalizer> {
+    let mut normalizer = JsNormalizer::new::<_, JsNormalizer, _>(&mut cx, vec![])?;
+    let guard = cx.lock();
+    normalizer.borrow_mut(&guard).normalizer = Some(tk::normalizers::strip::StripAccents.into());
+
+    Ok(normalizer)
+}

 /// sequence(normalizers: Normalizer[])
 fn sequence(mut cx: FunctionContext) -> JsResult<JsNormalizer> {
@ -212,6 +220,7 @@ pub fn register(m: &mut ModuleContext, prefix: &str) -> NeonResult<()> {
    m.export_function(&format!("{}_Sequence", prefix), sequence)?;
    m.export_function(&format!("{}_Lowercase", prefix), lowercase)?;
    m.export_function(&format!("{}_Strip", prefix), strip)?;
+    m.export_function(&format!("{}_StripAccents", prefix), strip_accents)?;
    Ok(())
 }

--- a/bindings/python/py_src/tokenizers/normalizers/init.py
+++ b/bindings/python/py_src/tokenizers/normalizers/init.py
@ -9,6 +9,7 @@ NFKC = normalizers.NFKC
 Sequence = normalizers.Sequence
 Lowercase = normalizers.Lowercase
 Strip = normalizers.Strip
+StripAccents = normalizers.StripAccents
 Nmt = normalizers.Nmt
 Precompiled = normalizers.Precompiled

--- a/bindings/python/py_src/tokenizers/normalizers/init.pyi
+++ b/bindings/python/py_src/tokenizers/normalizers/init.pyi
@ -99,6 +99,12 @@ class Strip(Normalizer):
    def __init__(self, left: bool = True, right: bool = True) -> Normalizer:
        pass

+class StripAccents(Normalizer):
+    """ StripAccents normalizer: removes Unicode combining marks (accents) from the input """
+
+    def __init__(self) -> Normalizer:
+        pass
+
 class Nmt(Normalizer):
    """ Nmt normalizer """

--- a/bindings/python/src/lib.rs
+++ b/bindings/python/src/lib.rs
@ -108,6 +108,7 @@ fn normalizers(_py: Python, m: &PyModule) -> PyResult<()> {
    m.add_class::<normalizers::PySequence>()?;
    m.add_class::<normalizers::PyLowercase>()?;
    m.add_class::<normalizers::PyStrip>()?;
+    m.add_class::<normalizers::PyStripAccents>()?;
    m.add_class::<normalizers::PyNmt>()?;
    m.add_class::<normalizers::PyPrecompiled>()?;
    Ok(())
--- a/bindings/python/src/normalizers.rs
+++ b/bindings/python/src/normalizers.rs
@ -8,7 +8,8 @@ use crate::error::ToPyResult;
 use serde::ser::SerializeStruct;
 use serde::{Deserialize, Serialize, Serializer};
 use tk::normalizers::{
-    BertNormalizer, Lowercase, Nmt, NormalizerWrapper, Precompiled, Strip, NFC, NFD, NFKC, NFKD,
+    BertNormalizer, Lowercase, Nmt, NormalizerWrapper, Precompiled, Strip, StripAccents, NFC, NFD,
+    NFKC, NFKD,
 };
 use tk::{NormalizedString, Normalizer};
 use tokenizers as tk;
@ -40,6 +41,9 @@ impl PyNormalizer {
                NormalizerWrapper::StripNormalizer(_) => {
                    Py::new(py, (PyBertNormalizer {}, base)).map(Into::into)
                }
+                NormalizerWrapper::StripAccents(_) => {
+                    Py::new(py, (PyStripAccents {}, base)).map(Into::into)
+                }
                NormalizerWrapper::NFC(_) => Py::new(py, (PyNFC {}, base)).map(Into::into),
                NormalizerWrapper::NFD(_) => Py::new(py, (PyNFD {}, base)).map(Into::into),
                NormalizerWrapper::NFKC(_) => Py::new(py, (PyNFKC {}, base)).map(Into::into),
@ -224,6 +228,16 @@ impl PyStrip {
    }
 }

+#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name=StripAccents)]
+pub struct PyStripAccents {}
+#[pymethods]
+impl PyStripAccents {
+    #[new]
+    fn new() -> PyResult<(Self, PyNormalizer)> {
+        Ok((PyStripAccents {}, StripAccents.into()))
+    }
+}
+
 #[derive(Clone, Deserialize)]
 #[serde(untagged)]
 pub(crate) enum PyNormalizerWrapper {
--- a/tokenizers/src/normalizers/mod.rs
+++ b/tokenizers/src/normalizers/mod.rs
@ -6,7 +6,7 @@ pub mod utils;

 pub use crate::normalizers::bert::BertNormalizer;
 pub use crate::normalizers::precompiled::Precompiled;
-pub use crate::normalizers::strip::Strip;
+pub use crate::normalizers::strip::{Strip, StripAccents};
 pub use crate::normalizers::unicode::{Nmt, NFC, NFD, NFKC, NFKD};
 pub use crate::normalizers::utils::{Lowercase, Sequence};

@ -20,6 +20,7 @@ use crate::{NormalizedString, Normalizer};
 pub enum NormalizerWrapper {
    BertNormalizer(BertNormalizer),
    StripNormalizer(Strip),
+    StripAccents(StripAccents),
    NFC(NFC),
    NFD(NFD),
    NFKC(NFKC),
@ -35,6 +36,7 @@ impl Normalizer for NormalizerWrapper {
        match self {
            NormalizerWrapper::BertNormalizer(bn) => bn.normalize(normalized),
            NormalizerWrapper::StripNormalizer(sn) => sn.normalize(normalized),
+            NormalizerWrapper::StripAccents(sn) => sn.normalize(normalized),
            NormalizerWrapper::NFC(nfc) => nfc.normalize(normalized),
            NormalizerWrapper::NFD(nfd) => nfd.normalize(normalized),
            NormalizerWrapper::NFKC(nfkc) => nfkc.normalize(normalized),
@ -53,6 +55,7 @@ impl_enum_from!(NFKC, NormalizerWrapper, NFKC);
 impl_enum_from!(NFC, NormalizerWrapper, NFC);
 impl_enum_from!(NFD, NormalizerWrapper, NFD);
 impl_enum_from!(Strip, NormalizerWrapper, StripNormalizer);
+impl_enum_from!(StripAccents, NormalizerWrapper, StripAccents);
 impl_enum_from!(Sequence, NormalizerWrapper, Sequence);
 impl_enum_from!(Lowercase, NormalizerWrapper, Lowercase);
 impl_enum_from!(Nmt, NormalizerWrapper, Nmt);
--- a/tokenizers/src/normalizers/strip.rs
+++ b/tokenizers/src/normalizers/strip.rs
@ -1,5 +1,6 @@
 use crate::tokenizer::{NormalizedString, Normalizer, Result};
 use serde::{Deserialize, Serialize};
+use unicode_normalization_alignments::char::is_combining_mark;

 #[derive(Copy, Clone, Debug, Deserialize, Serialize)]
 #[serde(tag = "type")]
@ -10,7 +11,7 @@ pub struct Strip {

 impl Strip {
    pub fn new(strip_left: bool, strip_right: bool) -> Self {
-        Strip {
+        Self {
            strip_left,
            strip_right,
        }
@ -36,3 +37,81 @@ impl Normalizer for Strip {
        Ok(())
    }
 }
+
+// Removes Unicode combining marks (accents) from a normalized string. Input
+// should already be decomposed (e.g. NFD/NFKD) so accents are separate chars.
+// Unlike unidecode, it does not attempt to modify non-ASCII scripts.
+#[derive(Copy, Clone, Debug)]
+pub struct StripAccents;
+impl_serde_unit_struct!(StripAccentsVisitor, StripAccents);
+
+impl Normalizer for StripAccents {
+    /// Drops every combining mark from `normalized`, in place
+    fn normalize(&self, normalized: &mut NormalizedString) -> Result<()> {
+        normalized.filter(|c| !is_combining_mark(c));
+        Ok(())
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::normalizer::NormalizedString;
+    use unicode_normalization_alignments::UnicodeNormalization;
+
+    #[test]
+    fn test_strip_accents() {
+        // NFKD-decomposed accent: the combining mark is removed, the base letter stays
+        let original: String = "Me llamó".nfkd().map(|(c, _)| c).collect();
+        let normalized = "Me llamo";
+        assert_ne!(original, normalized);
+        let mut n = NormalizedString::from(original);
+        StripAccents.normalize(&mut n).unwrap();
+        assert_eq!(&n.get(), &normalized);
+
+        // Plain ASCII passes through unchanged
+        let original = "Me llamo";
+        let normalized = "Me llamo";
+        assert_eq!(original, normalized);
+        let mut n = NormalizedString::from(original);
+        StripAccents.normalize(&mut n).unwrap();
+        assert_eq!(&n.get(), &normalized);
+
+        // Chinese text is left untouched (NFKD does not alter it either)
+        let original: String = "这很简单".nfkd().map(|(c, _)| c).collect();
+        let normalized = "这很简单";
+        assert_eq!(original, normalized);
+        let mut n = NormalizedString::from(original);
+        StripAccents.normalize(&mut n).unwrap();
+        assert_eq!(&n.get(), &normalized);
+    }
+
+    #[test]
+    fn test_strip_accents_multiple() {
+        let original = "e\u{304}\u{304}\u{304}o";
+        let normalized = "eo";
+        assert_ne!(original, normalized);
+        let mut n = NormalizedString::from(original);
+        StripAccents.normalize(&mut n).unwrap();
+        assert_eq!(&n.get(), &normalized);
+        assert_eq!(
+            n,
+            NormalizedString::new(
+                original.to_string(),
+                normalized.to_string(),
+                vec![(0, 1), (7, 8)],
+                vec![
+                    (0, 1),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                    (1, 1),
+                    (1, 2)
+                ],
+                0
+            )
+        );
+    }
+}