Adding a new normalizer that strips accents by removing combining (#416)

* Adding a new normalizer that strips accents by removing combining

characters in Unicode strings.

* Adding Node bindings

+ better normalizer impl.

* Doc comment -> Regular comment.
This commit is contained in:
Nicolas Patry
2020-09-17 09:49:41 +02:00
committed by GitHub
parent 330876ae02
commit 75464734df
10 changed files with 130 additions and 4 deletions

View File

@ -75,3 +75,8 @@ export function sequenceNormalizer(normalizers: Normalizer[]): Normalizer;
* @param [right=true] Whether or not to strip on the right (defaults to `true`)
*/
export function stripNormalizer(left?: boolean, right?: boolean): Normalizer;
/**
 * Returns a new StripAccents Normalizer.
 *
 * This normalizer removes Unicode combining marks (accents) from the
 * normalized string; characters without combining marks pass through
 * unchanged.
 */
export function stripAccentsNormalizer(): Normalizer;

View File

@ -9,4 +9,5 @@ module.exports = {
nfkdNormalizer: native.normalizers_NFKD,
sequenceNormalizer: native.normalizers_Sequence,
stripNormalizer: native.normalizers_Strip,
stripAccentsNormalizer: native.normalizers_StripAccents,
};

View File

@ -1,4 +1,4 @@
import { stripNormalizer } from "./normalizers";
import { stripAccentsNormalizer, stripNormalizer } from "./normalizers";
describe("stripNormalizer", () => {
it("instantiates with no parameters", () => {
@ -24,3 +24,10 @@ describe("stripNormalizer", () => {
expect(normalizer.constructor.name).toEqual("Normalizer");
});
});
describe("stripAccentsNormalizer", () => {
  it("initialize", () => {
    // The factory takes no arguments and yields a native Normalizer handle.
    expect(stripAccentsNormalizer().constructor.name).toEqual("Normalizer");
  });
});

View File

@ -160,6 +160,14 @@ fn strip(mut cx: FunctionContext) -> JsResult<JsNormalizer> {
Ok(normalizer)
}
/// strip_accents()
///
/// Creates a JS `Normalizer` backed by the Rust `StripAccents` normalizer,
/// which removes Unicode combining marks from the input string.
fn strip_accents(mut cx: FunctionContext) -> JsResult<JsNormalizer> {
    // Build an empty JsNormalizer wrapper first; the concrete normalizer is
    // attached below while holding the VM lock.
    let mut normalizer = JsNormalizer::new::<_, JsNormalizer, _>(&mut cx, vec![])?;
    // Neon requires the VM lock to mutably borrow data behind a JS handle.
    let guard = cx.lock();
    normalizer.borrow_mut(&guard).normalizer = Some(tk::normalizers::strip::StripAccents.into());
    Ok(normalizer)
}
/// sequence(normalizers: Normalizer[])
fn sequence(mut cx: FunctionContext) -> JsResult<JsNormalizer> {
@ -212,6 +220,7 @@ pub fn register(m: &mut ModuleContext, prefix: &str) -> NeonResult<()> {
m.export_function(&format!("{}_Sequence", prefix), sequence)?;
m.export_function(&format!("{}_Lowercase", prefix), lowercase)?;
m.export_function(&format!("{}_Strip", prefix), strip)?;
m.export_function(&format!("{}_StripAccents", prefix), strip_accents)?;
Ok(())
}

View File

@ -9,6 +9,7 @@ NFKC = normalizers.NFKC
Sequence = normalizers.Sequence
Lowercase = normalizers.Lowercase
Strip = normalizers.Strip
StripAccents = normalizers.StripAccents
Nmt = normalizers.Nmt
Precompiled = normalizers.Precompiled

View File

@ -99,6 +99,12 @@ class Strip(Normalizer):
def __init__(self, left: bool = True, right: bool = True) -> Normalizer:
pass
class StripAccents(Normalizer):
    """ StripAccents normalizer

    Removes Unicode combining marks (accents) from the normalized string.
    Takes no parameters.
    """

    # NOTE: the `-> Normalizer` return annotation follows this stub file's
    # existing convention (see Strip above), even though __init__ returns None.
    def __init__(self) -> Normalizer:
        pass
class Nmt(Normalizer):
""" Nmt normalizer """

View File

@ -108,6 +108,7 @@ fn normalizers(_py: Python, m: &PyModule) -> PyResult<()> {
m.add_class::<normalizers::PySequence>()?;
m.add_class::<normalizers::PyLowercase>()?;
m.add_class::<normalizers::PyStrip>()?;
m.add_class::<normalizers::PyStripAccents>()?;
m.add_class::<normalizers::PyNmt>()?;
m.add_class::<normalizers::PyPrecompiled>()?;
Ok(())

View File

@ -8,7 +8,8 @@ use crate::error::ToPyResult;
use serde::ser::SerializeStruct;
use serde::{Deserialize, Serialize, Serializer};
use tk::normalizers::{
BertNormalizer, Lowercase, Nmt, NormalizerWrapper, Precompiled, Strip, NFC, NFD, NFKC, NFKD,
BertNormalizer, Lowercase, Nmt, NormalizerWrapper, Precompiled, Strip, StripAccents, NFC, NFD,
NFKC, NFKD,
};
use tk::{NormalizedString, Normalizer};
use tokenizers as tk;
@ -40,6 +41,9 @@ impl PyNormalizer {
NormalizerWrapper::StripNormalizer(_) => {
Py::new(py, (PyBertNormalizer {}, base)).map(Into::into)
}
NormalizerWrapper::StripAccents(_) => {
Py::new(py, (PyStripAccents {}, base)).map(Into::into)
}
NormalizerWrapper::NFC(_) => Py::new(py, (PyNFC {}, base)).map(Into::into),
NormalizerWrapper::NFD(_) => Py::new(py, (PyNFD {}, base)).map(Into::into),
NormalizerWrapper::NFKC(_) => Py::new(py, (PyNFKC {}, base)).map(Into::into),
@ -224,6 +228,16 @@ impl PyStrip {
}
}
// Python-facing wrapper exposing the Rust `StripAccents` normalizer as
// `tokenizers.normalizers.StripAccents`. It carries no state of its own;
// the actual normalizer lives in the `PyNormalizer` base it extends.
#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name=StripAccents)]
pub struct PyStripAccents {}

#[pymethods]
impl PyStripAccents {
    /// StripAccents() — no parameters; wraps the unit struct `StripAccents`.
    #[new]
    fn new() -> PyResult<(Self, PyNormalizer)> {
        Ok((PyStripAccents {}, StripAccents.into()))
    }
}
#[derive(Clone, Deserialize)]
#[serde(untagged)]
pub(crate) enum PyNormalizerWrapper {

View File

@ -6,7 +6,7 @@ pub mod utils;
pub use crate::normalizers::bert::BertNormalizer;
pub use crate::normalizers::precompiled::Precompiled;
pub use crate::normalizers::strip::Strip;
pub use crate::normalizers::strip::{Strip, StripAccents};
pub use crate::normalizers::unicode::{Nmt, NFC, NFD, NFKC, NFKD};
pub use crate::normalizers::utils::{Lowercase, Sequence};
@ -20,6 +20,7 @@ use crate::{NormalizedString, Normalizer};
pub enum NormalizerWrapper {
BertNormalizer(BertNormalizer),
StripNormalizer(Strip),
StripAccents(StripAccents),
NFC(NFC),
NFD(NFD),
NFKC(NFKC),
@ -35,6 +36,7 @@ impl Normalizer for NormalizerWrapper {
match self {
NormalizerWrapper::BertNormalizer(bn) => bn.normalize(normalized),
NormalizerWrapper::StripNormalizer(sn) => sn.normalize(normalized),
NormalizerWrapper::StripAccents(sn) => sn.normalize(normalized),
NormalizerWrapper::NFC(nfc) => nfc.normalize(normalized),
NormalizerWrapper::NFD(nfd) => nfd.normalize(normalized),
NormalizerWrapper::NFKC(nfkc) => nfkc.normalize(normalized),
@ -53,6 +55,7 @@ impl_enum_from!(NFKC, NormalizerWrapper, NFKC);
impl_enum_from!(NFC, NormalizerWrapper, NFC);
impl_enum_from!(NFD, NormalizerWrapper, NFD);
impl_enum_from!(Strip, NormalizerWrapper, StripNormalizer);
impl_enum_from!(StripAccents, NormalizerWrapper, StripAccents);
impl_enum_from!(Sequence, NormalizerWrapper, Sequence);
impl_enum_from!(Lowercase, NormalizerWrapper, Lowercase);
impl_enum_from!(Nmt, NormalizerWrapper, Nmt);

View File

@ -1,5 +1,6 @@
use crate::tokenizer::{NormalizedString, Normalizer, Result};
use serde::{Deserialize, Serialize};
use unicode_normalization_alignments::char::is_combining_mark;
#[derive(Copy, Clone, Debug, Deserialize, Serialize)]
#[serde(tag = "type")]
@ -10,7 +11,7 @@ pub struct Strip {
impl Strip {
pub fn new(strip_left: bool, strip_right: bool) -> Self {
Strip {
Self {
strip_left,
strip_right,
}
@ -36,3 +37,81 @@ impl Normalizer for Strip {
Ok(())
}
}
// This normalizer removes combining marks from a normalized string.
// It's different from unidecode as it does not attempt to modify
// non-ASCII languages: e.g. Chinese text contains no combining marks
// and passes through unchanged.
#[derive(Copy, Clone, Debug)]
pub struct StripAccents;
impl_serde_unit_struct!(StripAccentsVisitor, StripAccents);

impl Normalizer for StripAccents {
    /// Remove every Unicode combining mark from the normalized string,
    /// in place. (The previous doc comment, "Strip the normalized string
    /// inplace", was copy-pasted from `Strip` and misdescribed this method.)
    fn normalize(&self, normalized: &mut NormalizedString) -> Result<()> {
        normalized.filter(|c| !is_combining_mark(c));
        Ok(())
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::normalizer::NormalizedString;
    use unicode_normalization_alignments::UnicodeNormalization;

    #[test]
    fn test_strip_accents() {
        // Unicode combining char: NFKD decomposes "ó" into "o" + a combining
        // acute accent, which StripAccents then removes.
        let original: String = "Me llamó".nfkd().map(|(c, _)| c).collect();
        let normalized = "Me llamo";
        assert_ne!(original, normalized);
        let mut n = NormalizedString::from(original);
        StripAccents.normalize(&mut n).unwrap();
        assert_eq!(&n.get(), &normalized);

        // Ignores regular ascii: no combining marks, string is untouched.
        let original = "Me llamo";
        let normalized = "Me llamo";
        assert_eq!(original, normalized);
        let mut n = NormalizedString::from(original);
        StripAccents.normalize(&mut n).unwrap();
        assert_eq!(&n.get(), &normalized);

        // Does not change chinese: NFKD leaves these characters intact
        // (no combining marks), so the normalizer is a no-op here.
        let original: String = "这很简单".nfkd().map(|(c, _)| c).collect();
        let normalized = "这很简单";
        assert_eq!(original, normalized);
        let mut n = NormalizedString::from(original);
        StripAccents.normalize(&mut n).unwrap();
        assert_eq!(&n.get(), &normalized);
    }

    #[test]
    fn test_strip_accents_multiple() {
        // Three stacked combining macrons (U+0304) after the 'e' must all be
        // removed, leaving only "eo".
        let original = "e\u{304}\u{304}\u{304}o";
        let normalized = "eo";
        assert_ne!(original, normalized);
        let mut n = NormalizedString::from(original);
        StripAccents.normalize(&mut n).unwrap();
        assert_eq!(&n.get(), &normalized);
        // NOTE(review): the vectors below encode NormalizedString's internal
        // alignment state (per-byte offset mappings between the original and
        // normalized strings) — assumes the NormalizedString::new(original,
        // normalized, alignments, ..., original_shift) layout; verify against
        // the normalizer module if that constructor changes.
        assert_eq!(
            n,
            NormalizedString::new(
                original.to_string(),
                normalized.to_string(),
                vec![(0, 1), (7, 8)],
                vec![
                    (0, 1),
                    (1, 1),
                    (1, 1),
                    (1, 1),
                    (1, 1),
                    (1, 1),
                    (1, 1),
                    (1, 2)
                ],
                0
            )
        );
    }
}