Adding a new normalizer that strips accents by removing combining (#416)

* Adding a new normalizer that strips accents by removing combining characters in unicode strings. * Adding Node bindings + better normalizer impl. * Doc comment -> Regular comment.
2025-08-23 16:49:27 +00:00 · 2020-09-17 09:49:41 +02:00
parent 330876ae02
commit 75464734df
10 changed files with 130 additions and 4 deletions
--- a/bindings/python/src/normalizers.rs
+++ b/bindings/python/src/normalizers.rs
@ -8,7 +8,8 @@ use crate::error::ToPyResult;
 use serde::ser::SerializeStruct;
 use serde::{Deserialize, Serialize, Serializer};
 use tk::normalizers::{
-    BertNormalizer, Lowercase, Nmt, NormalizerWrapper, Precompiled, Strip, NFC, NFD, NFKC, NFKD,
+    BertNormalizer, Lowercase, Nmt, NormalizerWrapper, Precompiled, Strip, StripAccents, NFC, NFD,
+    NFKC, NFKD,
 };
 use tk::{NormalizedString, Normalizer};
 use tokenizers as tk;
@ -40,6 +41,9 @@ impl PyNormalizer {
                NormalizerWrapper::StripNormalizer(_) => {
                    Py::new(py, (PyBertNormalizer {}, base)).map(Into::into)
                }
+                NormalizerWrapper::StripAccents(_) => {
+                    Py::new(py, (PyStripAccents {}, base)).map(Into::into)
+                }
                NormalizerWrapper::NFC(_) => Py::new(py, (PyNFC {}, base)).map(Into::into),
                NormalizerWrapper::NFD(_) => Py::new(py, (PyNFD {}, base)).map(Into::into),
                NormalizerWrapper::NFKC(_) => Py::new(py, (PyNFKC {}, base)).map(Into::into),
@ -224,6 +228,16 @@ impl PyStrip {
    }
 }

+#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name=StripAccents)]
+pub struct PyStripAccents {}
+#[pymethods]
+impl PyStripAccents {
+    #[new]
+    fn new() -> PyResult<(Self, PyNormalizer)> {
+        Ok((PyStripAccents {}, StripAccents.into()))
+    }
+}
+
 #[derive(Clone, Deserialize)]
 #[serde(untagged)]
 pub(crate) enum PyNormalizerWrapper {