Adding a new normalizer that strips accents by removing combining characters (#416)

* Adding a new normalizer that strips accents by removing combining characters in unicode strings. * Adding Node bindings + better normalizer impl. * Doc comment -> Regular comment.
2025-12-05 12:18:20 +00:00 · 2020-09-17 09:49:41 +02:00
parent 330876ae02
commit 75464734df
10 changed files with 130 additions and 4 deletions
--- a/bindings/node/lib/bindings/normalizers.d.ts
+++ b/bindings/node/lib/bindings/normalizers.d.ts
@@ -75,3 +75,8 @@ export function sequenceNormalizer(normalizers: Normalizer[]): Normalizer;
 * @param [right=true] Whether or not to strip on the right (defaults to `true`)
 */
 export function stripNormalizer(left?: boolean, right?: boolean): Normalizer;
 /**
 * Returns a new StripAccents Normalizer, which strips accents by removing
 * combining characters (combining marks) from the string being normalized.
 * It takes no parameters.
 */
 export function stripAccentsNormalizer(): Normalizer;
--- a/bindings/node/lib/bindings/normalizers.js
+++ b/bindings/node/lib/bindings/normalizers.js
@@ -9,4 +9,5 @@ module.exports = {
  nfkdNormalizer: native.normalizers_NFKD,
  sequenceNormalizer: native.normalizers_Sequence,
  stripNormalizer: native.normalizers_Strip,
  stripAccentsNormalizer: native.normalizers_StripAccents,
 };
--- a/bindings/node/lib/bindings/normalizers.test.ts
+++ b/bindings/node/lib/bindings/normalizers.test.ts
@@ -1,4 +1,4 @@
-import { stripNormalizer } from "./normalizers";
+import { stripAccentsNormalizer, stripNormalizer } from "./normalizers";
 describe("stripNormalizer", () => {
  it("instantiates with no parameters", () => {
@@ -24,3 +24,10 @@ describe("stripNormalizer", () => {
    expect(normalizer.constructor.name).toEqual("Normalizer");
  });
 });
 // Smoke test: the factory must hand back an object whose class is `Normalizer`.
 describe("stripAccentsNormalizer", () => {
  it("initialize", () => {
    const created = stripAccentsNormalizer();
    const className = created.constructor.name;
    expect(className).toEqual("Normalizer");
  });
 });
--- a/bindings/node/native/src/normalizers.rs
+++ b/bindings/node/native/src/normalizers.rs
@@ -160,6 +160,14 @@ fn strip(mut cx: FunctionContext) -> JsResult<JsNormalizer> {
    Ok(normalizer)
 }
 /// strip_accents()
 ///
 /// Builds a fresh JS `Normalizer` object and stores the core library's
 /// `StripAccents` normalizer inside it. Takes no JS arguments.
 fn strip_accents(mut cx: FunctionContext) -> JsResult<JsNormalizer> {
    let accents_stripper = tk::normalizers::strip::StripAccents;
    let mut js_normalizer = JsNormalizer::new::<_, JsNormalizer, _>(&mut cx, vec![])?;
    let lock = cx.lock();
    // Install the wrapped normalizer on the freshly created JS object.
    js_normalizer.borrow_mut(&lock).normalizer = Some(accents_stripper.into());
    Ok(js_normalizer)
 }
 /// sequence(normalizers: Normalizer[])
 fn sequence(mut cx: FunctionContext) -> JsResult<JsNormalizer> {
@@ -212,6 +220,7 @@ pub fn register(m: &mut ModuleContext, prefix: &str) -> NeonResult<()> {
    m.export_function(&format!("{}_Sequence", prefix), sequence)?;
    m.export_function(&format!("{}_Lowercase", prefix), lowercase)?;
    m.export_function(&format!("{}_Strip", prefix), strip)?;
    m.export_function(&format!("{}_StripAccents", prefix), strip_accents)?;
    Ok(())
 }
--- a/bindings/python/py_src/tokenizers/normalizers/init.py
+++ b/bindings/python/py_src/tokenizers/normalizers/init.py
@@ -9,6 +9,7 @@ NFKC = normalizers.NFKC
 Sequence = normalizers.Sequence
 Lowercase = normalizers.Lowercase
 Strip = normalizers.Strip
 StripAccents = normalizers.StripAccents
 Nmt = normalizers.Nmt
 Precompiled = normalizers.Precompiled
--- a/bindings/python/py_src/tokenizers/normalizers/init.pyi
+++ b/bindings/python/py_src/tokenizers/normalizers/init.pyi
@@ -99,6 +99,12 @@ class Strip(Normalizer):
    def __init__(self, left: bool = True, right: bool = True) -> Normalizer:
        pass
 class StripAccents(Normalizer):
    """ StripAccents normalizer

    Strips accents by removing Unicode combining characters from the
    string being normalized. Takes no parameters.
    """
    def __init__(self) -> Normalizer:
        pass
 class Nmt(Normalizer):
    """ Nmt normalizer """
--- a/bindings/python/src/lib.rs
+++ b/bindings/python/src/lib.rs
@@ -108,6 +108,7 @@ fn normalizers(_py: Python, m: &PyModule) -> PyResult<()> {
    m.add_class::<normalizers::PySequence>()?;
    m.add_class::<normalizers::PyLowercase>()?;
    m.add_class::<normalizers::PyStrip>()?;
    m.add_class::<normalizers::PyStripAccents>()?;
    m.add_class::<normalizers::PyNmt>()?;
    m.add_class::<normalizers::PyPrecompiled>()?;
    Ok(())
--- a/bindings/python/src/normalizers.rs
+++ b/bindings/python/src/normalizers.rs
@@ -8,7 +8,8 @@ use crate::error::ToPyResult;
 use serde::ser::SerializeStruct;
 use serde::{Deserialize, Serialize, Serializer};
 use tk::normalizers::{
-    BertNormalizer, Lowercase, Nmt, NormalizerWrapper, Precompiled, Strip, NFC, NFD, NFKC, NFKD,
+    BertNormalizer, Lowercase, Nmt, NormalizerWrapper, Precompiled, Strip, StripAccents, NFC, NFD,
    NFKC, NFKD,
 };
 use tk::{NormalizedString, Normalizer};
 use tokenizers as tk;
@@ -40,6 +41,9 @@ impl PyNormalizer {
                NormalizerWrapper::StripNormalizer(_) => {
                    Py::new(py, (PyBertNormalizer {}, base)).map(Into::into)
                }
                NormalizerWrapper::StripAccents(_) => {
                    Py::new(py, (PyStripAccents {}, base)).map(Into::into)
                }
                NormalizerWrapper::NFC(_) => Py::new(py, (PyNFC {}, base)).map(Into::into),
                NormalizerWrapper::NFD(_) => Py::new(py, (PyNFD {}, base)).map(Into::into),
                NormalizerWrapper::NFKC(_) => Py::new(py, (PyNFKC {}, base)).map(Into::into),
@@ -224,6 +228,16 @@ impl PyStrip {
    }
 }
 // Python-facing `StripAccents` class. It holds no fields of its own and
 // extends the `PyNormalizer` base class.
 #[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name=StripAccents)]
 pub struct PyStripAccents {}
 #[pymethods]
 impl PyStripAccents {
    // Constructor exposed to Python: pairs the empty subclass value with a
    // base converted from the core `StripAccents` normalizer.
    #[new]
    fn new() -> PyResult<(Self, PyNormalizer)> {
        let base: PyNormalizer = StripAccents.into();
        Ok((Self {}, base))
    }
 }
 #[derive(Clone, Deserialize)]
 #[serde(untagged)]
 pub(crate) enum PyNormalizerWrapper {
--- a/tokenizers/src/normalizers/mod.rs
+++ b/tokenizers/src/normalizers/mod.rs
@@ -6,7 +6,7 @@ pub mod utils;
 pub use crate::normalizers::bert::BertNormalizer;
 pub use crate::normalizers::precompiled::Precompiled;
-pub use crate::normalizers::strip::Strip;
+pub use crate::normalizers::strip::{Strip, StripAccents};
 pub use crate::normalizers::unicode::{Nmt, NFC, NFD, NFKC, NFKD};
 pub use crate::normalizers::utils::{Lowercase, Sequence};
@@ -20,6 +20,7 @@ use crate::{NormalizedString, Normalizer};
 pub enum NormalizerWrapper {
    BertNormalizer(BertNormalizer),
    StripNormalizer(Strip),
    StripAccents(StripAccents),
    NFC(NFC),
    NFD(NFD),
    NFKC(NFKC),
@@ -35,6 +36,7 @@ impl Normalizer for NormalizerWrapper {
        match self {
            NormalizerWrapper::BertNormalizer(bn) => bn.normalize(normalized),
            NormalizerWrapper::StripNormalizer(sn) => sn.normalize(normalized),
            NormalizerWrapper::StripAccents(sn) => sn.normalize(normalized),
            NormalizerWrapper::NFC(nfc) => nfc.normalize(normalized),
            NormalizerWrapper::NFD(nfd) => nfd.normalize(normalized),
            NormalizerWrapper::NFKC(nfkc) => nfkc.normalize(normalized),
@@ -53,6 +55,7 @@ impl_enum_from!(NFKC, NormalizerWrapper, NFKC);
 impl_enum_from!(NFC, NormalizerWrapper, NFC);
 impl_enum_from!(NFD, NormalizerWrapper, NFD);
 impl_enum_from!(Strip, NormalizerWrapper, StripNormalizer);
 impl_enum_from!(StripAccents, NormalizerWrapper, StripAccents);
 impl_enum_from!(Sequence, NormalizerWrapper, Sequence);
 impl_enum_from!(Lowercase, NormalizerWrapper, Lowercase);
 impl_enum_from!(Nmt, NormalizerWrapper, Nmt);
--- a/tokenizers/src/normalizers/strip.rs
+++ b/tokenizers/src/normalizers/strip.rs
@@ -1,5 +1,6 @@
 use crate::tokenizer::{NormalizedString, Normalizer, Result};
 use serde::{Deserialize, Serialize};
 use unicode_normalization_alignments::char::is_combining_mark;
 #[derive(Copy, Clone, Debug, Deserialize, Serialize)]
 #[serde(tag = "type")]
@@ -10,7 +11,7 @@ pub struct Strip {
 impl Strip {
    pub fn new(strip_left: bool, strip_right: bool) -> Self {
-        Strip {
+        Self {
            strip_left,
            strip_right,
        }
@@ -36,3 +37,81 @@ impl Normalizer for Strip {
        Ok(())
    }
 }
 // This normalizer removes combining marks from a normalized string.
 // It is different from unidecode in that it does not attempt to modify
 // (e.g. transliterate) non-ASCII scripts: characters that are not
 // combining marks are left untouched.
 // NOTE(review): precomposed accented characters are presumably not
 // combining marks themselves, so the input likely needs to be decomposed
 // (NFD/NFKD) beforehand for their accents to be removed -- confirm.
 #[derive(Copy, Clone, Debug)]
 pub struct StripAccents;
 impl_serde_unit_struct!(StripAccentsVisitor, StripAccents);
 impl Normalizer for StripAccents {
    /// Remove every combining mark from the normalized string, in place
    fn normalize(&self, normalized: &mut NormalizedString) -> Result<()> {
        // A character is kept only when it is not a combining mark.
        let keep = |ch| !is_combining_mark(ch);
        normalized.filter(keep);
        Ok(())
    }
 }
 #[cfg(test)]
 mod tests {
    use super::*;
    use crate::normalizer::NormalizedString;
    use unicode_normalization_alignments::UnicodeNormalization;
    /// Accents are dropped from NFKD-decomposed text, while plain ASCII and
    /// Chinese text come out unchanged.
    #[test]
    fn test_strip_accents() {
        // Unicode combining char
        {
            let decomposed: String = "Me llamó".nfkd().map(|(ch, _)| ch).collect();
            let expected = "Me llamo";
            assert_ne!(decomposed, expected);
            let mut subject = NormalizedString::from(decomposed);
            StripAccents.normalize(&mut subject).unwrap();
            assert_eq!(subject.get(), expected);
        }
        // Ignores regular ascii
        {
            let plain = "Me llamo";
            let expected = "Me llamo";
            assert_eq!(plain, expected);
            let mut subject = NormalizedString::from(plain);
            StripAccents.normalize(&mut subject).unwrap();
            assert_eq!(subject.get(), expected);
        }
        // Does not change chinese
        {
            let decomposed: String = "这很简单".nfkd().map(|(ch, _)| ch).collect();
            let expected = "这很简单";
            assert_eq!(decomposed, expected);
            let mut subject = NormalizedString::from(decomposed);
            StripAccents.normalize(&mut subject).unwrap();
            assert_eq!(subject.get(), expected);
        }
    }
    /// Several consecutive combining marks are all removed, and the
    /// resulting value equals a hand-built `NormalizedString`.
    #[test]
    fn test_strip_accents_multiple() {
        // 'e' followed by three U+0304 combining marks, then 'o'.
        let input = "e\u{304}\u{304}\u{304}o";
        let expected = "eo";
        assert_ne!(input, expected);
        let mut subject = NormalizedString::from(input);
        StripAccents.normalize(&mut subject).unwrap();
        assert_eq!(subject.get(), expected);
        // NOTE(review): presumably one entry per byte of the normalized text,
        // giving the matching byte range in the input -- confirm against
        // `NormalizedString::new`.
        let alignments = vec![(0, 1), (7, 8)];
        // NOTE(review): presumably one entry per byte of the 8-byte input,
        // giving the matching range in the normalized text; the six bytes of
        // the removed marks map to the empty range (1, 1) -- confirm.
        let alignments_original = vec![
            (0, 1),
            (1, 1),
            (1, 1),
            (1, 1),
            (1, 1),
            (1, 1),
            (1, 1),
            (1, 2),
        ];
        let reference = NormalizedString::new(
            input.to_string(),
            expected.to_string(),
            alignments,
            alignments_original,
            0,
        );
        assert_eq!(subject, reference);
    }
 }