Adding a new normalizer that strips accents by removing combining (#416)

* Adding a new normalizer that strips accents by removing combining

characters in Unicode strings.

* Adding Node bindings

+ better normalizer impl.

* Doc comment -> Regular comment.
This commit is contained in:
Nicolas Patry
2020-09-17 09:49:41 +02:00
committed by GitHub
parent 330876ae02
commit 75464734df
10 changed files with 130 additions and 4 deletions

View File

@ -8,7 +8,8 @@ use crate::error::ToPyResult;
use serde::ser::SerializeStruct;
use serde::{Deserialize, Serialize, Serializer};
use tk::normalizers::{
BertNormalizer, Lowercase, Nmt, NormalizerWrapper, Precompiled, Strip, NFC, NFD, NFKC, NFKD,
BertNormalizer, Lowercase, Nmt, NormalizerWrapper, Precompiled, Strip, StripAccents, NFC, NFD,
NFKC, NFKD,
};
use tk::{NormalizedString, Normalizer};
use tokenizers as tk;
@ -40,6 +41,9 @@ impl PyNormalizer {
NormalizerWrapper::StripNormalizer(_) => {
Py::new(py, (PyBertNormalizer {}, base)).map(Into::into)
}
NormalizerWrapper::StripAccents(_) => {
Py::new(py, (PyStripAccents {}, base)).map(Into::into)
}
NormalizerWrapper::NFC(_) => Py::new(py, (PyNFC {}, base)).map(Into::into),
NormalizerWrapper::NFD(_) => Py::new(py, (PyNFD {}, base)).map(Into::into),
NormalizerWrapper::NFKC(_) => Py::new(py, (PyNFKC {}, base)).map(Into::into),
@ -224,6 +228,16 @@ impl PyStrip {
}
}
// Python binding for the `StripAccents` normalizer from `tk::normalizers`.
// Registered as `tokenizers.normalizers.StripAccents` and declared as a
// subclass of `PyNormalizer`. The struct has no fields of its own; the
// wrapped normalizer is supplied through the `PyNormalizer` base value.
//
// NOTE: regular comments are used on purpose here. With PyO3, `///` doc
// comments on these items would be exported as the Python docstring.
#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name=StripAccents)]
pub struct PyStripAccents {}

#[pymethods]
impl PyStripAccents {
    // Constructor exposed to Python; takes no arguments.
    // Returns the (subclass, base) pair, where the base `PyNormalizer` is
    // obtained by converting a `StripAccents` value with `Into`.
    #[new]
    fn new() -> PyResult<(Self, PyNormalizer)> {
        let base: PyNormalizer = StripAccents.into();
        Ok((Self {}, base))
    }
}
#[derive(Clone, Deserialize)]
#[serde(untagged)]
pub(crate) enum PyNormalizerWrapper {