Adding a new normalizer that strips accents by removing combining characters (#416)

* Adding a new normalizer that strips accents by removing combining characters in unicode strings.
* Adding Node bindings + better normalizer impl.
* Doc comment -> Regular comment.
2025-12-05 12:18:20 +00:00 · 2020-09-17 09:49:41 +02:00
parent 330876ae02
commit 75464734df
10 changed files with 130 additions and 4 deletions
--- a/bindings/python/py_src/tokenizers/normalizers/init.py
+++ b/bindings/python/py_src/tokenizers/normalizers/init.py
@@ -9,6 +9,7 @@ NFKC = normalizers.NFKC
 Sequence = normalizers.Sequence
 Lowercase = normalizers.Lowercase
 Strip = normalizers.Strip
+StripAccents = normalizers.StripAccents
 Nmt = normalizers.Nmt
 Precompiled = normalizers.Precompiled

--- a/bindings/python/py_src/tokenizers/normalizers/init.pyi
+++ b/bindings/python/py_src/tokenizers/normalizers/init.pyi
@@ -99,6 +99,12 @@ class Strip(Normalizer):
    def __init__(self, left: bool = True, right: bool = True) -> Normalizer:
        pass

+class StripAccents(Normalizer):
+    """ StripAccents normalizer """
+
+    def __init__(self) -> Normalizer:
+        pass
+
 class Nmt(Normalizer):
    """ Nmt normalizer """

--- a/bindings/python/src/lib.rs
+++ b/bindings/python/src/lib.rs
@@ -108,6 +108,7 @@ fn normalizers(_py: Python, m: &PyModule) -> PyResult<()> {
    m.add_class::<normalizers::PySequence>()?;
    m.add_class::<normalizers::PyLowercase>()?;
    m.add_class::<normalizers::PyStrip>()?;
+    m.add_class::<normalizers::PyStripAccents>()?;
    m.add_class::<normalizers::PyNmt>()?;
    m.add_class::<normalizers::PyPrecompiled>()?;
    Ok(())
--- a/bindings/python/src/normalizers.rs
+++ b/bindings/python/src/normalizers.rs
@@ -8,7 +8,8 @@ use crate::error::ToPyResult;
 use serde::ser::SerializeStruct;
 use serde::{Deserialize, Serialize, Serializer};
 use tk::normalizers::{
-    BertNormalizer, Lowercase, Nmt, NormalizerWrapper, Precompiled, Strip, NFC, NFD, NFKC, NFKD,
+    BertNormalizer, Lowercase, Nmt, NormalizerWrapper, Precompiled, Strip, StripAccents, NFC, NFD,
+    NFKC, NFKD,
 };
 use tk::{NormalizedString, Normalizer};
 use tokenizers as tk;
@@ -40,6 +41,9 @@ impl PyNormalizer {
                NormalizerWrapper::StripNormalizer(_) => {
                    Py::new(py, (PyBertNormalizer {}, base)).map(Into::into)
                }
+                NormalizerWrapper::StripAccents(_) => {
+                    Py::new(py, (PyStripAccents {}, base)).map(Into::into)
+                }
                NormalizerWrapper::NFC(_) => Py::new(py, (PyNFC {}, base)).map(Into::into),
                NormalizerWrapper::NFD(_) => Py::new(py, (PyNFD {}, base)).map(Into::into),
                NormalizerWrapper::NFKC(_) => Py::new(py, (PyNFKC {}, base)).map(Into::into),
@@ -224,6 +228,16 @@ impl PyStrip {
    }
 }

+#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name=StripAccents)]
+pub struct PyStripAccents {}
+#[pymethods]
+impl PyStripAccents {
+    #[new]
+    fn new() -> PyResult<(Self, PyNormalizer)> {
+        Ok((PyStripAccents {}, StripAccents.into()))
+    }
+}
+
 #[derive(Clone, Deserialize)]
 #[serde(untagged)]
 pub(crate) enum PyNormalizerWrapper {