Adding a new normalizer that strips accents by removing combining (#416)

* Adding a new normalizer that strips accents by removing combining characters in unicode strings.

* Adding Node bindings

+ better normalizer impl.

* Doc comment -> Regular comment.
This commit is contained in:
Nicolas Patry
2020-09-17 09:49:41 +02:00
committed by GitHub
parent 330876ae02
commit 75464734df
10 changed files with 130 additions and 4 deletions

View File

@@ -9,6 +9,7 @@ NFKC = normalizers.NFKC
Sequence = normalizers.Sequence
Lowercase = normalizers.Lowercase
Strip = normalizers.Strip
StripAccents = normalizers.StripAccents
Nmt = normalizers.Nmt
Precompiled = normalizers.Precompiled

View File

@@ -99,6 +99,12 @@ class Strip(Normalizer):
def __init__(self, left: bool = True, right: bool = True) -> Normalizer:
pass
class StripAccents(Normalizer):
""" StripAccents normalizer

Strips accents by removing combining characters in unicode strings.
Takes no arguments.
"""
def __init__(self) -> Normalizer:
pass
class Nmt(Normalizer):
""" Nmt normalizer """

View File

@@ -108,6 +108,7 @@ fn normalizers(_py: Python, m: &PyModule) -> PyResult<()> {
m.add_class::<normalizers::PySequence>()?;
m.add_class::<normalizers::PyLowercase>()?;
m.add_class::<normalizers::PyStrip>()?;
m.add_class::<normalizers::PyStripAccents>()?;
m.add_class::<normalizers::PyNmt>()?;
m.add_class::<normalizers::PyPrecompiled>()?;
Ok(())

View File

@@ -8,7 +8,8 @@ use crate::error::ToPyResult;
use serde::ser::SerializeStruct;
use serde::{Deserialize, Serialize, Serializer};
use tk::normalizers::{
BertNormalizer, Lowercase, Nmt, NormalizerWrapper, Precompiled, Strip, NFC, NFD, NFKC, NFKD,
BertNormalizer, Lowercase, Nmt, NormalizerWrapper, Precompiled, Strip, StripAccents, NFC, NFD,
NFKC, NFKD,
};
use tk::{NormalizedString, Normalizer};
use tokenizers as tk;
@@ -40,6 +41,9 @@ impl PyNormalizer {
NormalizerWrapper::StripNormalizer(_) => {
Py::new(py, (PyBertNormalizer {}, base)).map(Into::into)
}
NormalizerWrapper::StripAccents(_) => {
Py::new(py, (PyStripAccents {}, base)).map(Into::into)
}
NormalizerWrapper::NFC(_) => Py::new(py, (PyNFC {}, base)).map(Into::into),
NormalizerWrapper::NFD(_) => Py::new(py, (PyNFD {}, base)).map(Into::into),
NormalizerWrapper::NFKC(_) => Py::new(py, (PyNFKC {}, base)).map(Into::into),
@@ -224,6 +228,16 @@ impl PyStrip {
}
}
// Python-facing handle for the `StripAccents` normalizer. It is exported as
// `tokenizers.normalizers.StripAccents`, extends `PyNormalizer`, and holds no
// state of its own: the normalizer itself is stored in the `PyNormalizer` base.
#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name=StripAccents)]
pub struct PyStripAccents {}

#[pymethods]
impl PyStripAccents {
    // Constructor exposed to Python; takes no arguments. Returns the
    // (subclass, base) pair, with the base built from a `StripAccents` value.
    #[new]
    fn new() -> PyResult<(Self, PyNormalizer)> {
        let base: PyNormalizer = StripAccents.into();
        Ok((Self {}, base))
    }
}
#[derive(Clone, Deserialize)]
#[serde(untagged)]
pub(crate) enum PyNormalizerWrapper {