mirror of
https://github.com/mii443/tokenizers.git
synced 2025-08-23 00:35:35 +00:00
Adding a new normalizer that strips accents by removing combining (#416)
* Adding a new normalizer that strips accents by removing combining characters in unicode strings. * Adding Node bindings + better normalizer impl. * Doc comment -> Regular comment.
This commit is contained in:
5
bindings/node/lib/bindings/normalizers.d.ts
vendored
5
bindings/node/lib/bindings/normalizers.d.ts
vendored
@ -75,3 +75,8 @@ export function sequenceNormalizer(normalizers: Normalizer[]): Normalizer;
|
|||||||
* @param [right=true] Whether or not to strip on the right (defaults to `true`)
|
* @param [right=true] Whether or not to strip on the right (defaults to `true`)
|
||||||
*/
|
*/
|
||||||
export function stripNormalizer(left?: boolean, right?: boolean): Normalizer;
|
export function stripNormalizer(left?: boolean, right?: boolean): Normalizer;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns a new StripAccents Normalizer
|
||||||
|
*/
|
||||||
|
export function stripAccentsNormalizer(): Normalizer;
|
||||||
|
@ -9,4 +9,5 @@ module.exports = {
|
|||||||
nfkdNormalizer: native.normalizers_NFKD,
|
nfkdNormalizer: native.normalizers_NFKD,
|
||||||
sequenceNormalizer: native.normalizers_Sequence,
|
sequenceNormalizer: native.normalizers_Sequence,
|
||||||
stripNormalizer: native.normalizers_Strip,
|
stripNormalizer: native.normalizers_Strip,
|
||||||
|
stripAccentsNormalizer: native.normalizers_StripAccents,
|
||||||
};
|
};
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
import { stripNormalizer } from "./normalizers";
|
import { stripAccentsNormalizer, stripNormalizer } from "./normalizers";
|
||||||
|
|
||||||
describe("stripNormalizer", () => {
|
describe("stripNormalizer", () => {
|
||||||
it("instantiates with no parameters", () => {
|
it("instantiates with no parameters", () => {
|
||||||
@ -24,3 +24,10 @@ describe("stripNormalizer", () => {
|
|||||||
expect(normalizer.constructor.name).toEqual("Normalizer");
|
expect(normalizer.constructor.name).toEqual("Normalizer");
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
|
describe("stripAccentsNormalizer", () => {
|
||||||
|
it("initialize", () => {
|
||||||
|
const normalizer = stripAccentsNormalizer();
|
||||||
|
expect(normalizer.constructor.name).toEqual("Normalizer");
|
||||||
|
});
|
||||||
|
});
|
||||||
|
@ -160,6 +160,14 @@ fn strip(mut cx: FunctionContext) -> JsResult<JsNormalizer> {
|
|||||||
|
|
||||||
Ok(normalizer)
|
Ok(normalizer)
|
||||||
}
|
}
|
||||||
|
/// strip_accents()
|
||||||
|
fn strip_accents(mut cx: FunctionContext) -> JsResult<JsNormalizer> {
|
||||||
|
let mut normalizer = JsNormalizer::new::<_, JsNormalizer, _>(&mut cx, vec![])?;
|
||||||
|
let guard = cx.lock();
|
||||||
|
normalizer.borrow_mut(&guard).normalizer = Some(tk::normalizers::strip::StripAccents.into());
|
||||||
|
|
||||||
|
Ok(normalizer)
|
||||||
|
}
|
||||||
|
|
||||||
/// sequence(normalizers: Normalizer[])
|
/// sequence(normalizers: Normalizer[])
|
||||||
fn sequence(mut cx: FunctionContext) -> JsResult<JsNormalizer> {
|
fn sequence(mut cx: FunctionContext) -> JsResult<JsNormalizer> {
|
||||||
@ -212,6 +220,7 @@ pub fn register(m: &mut ModuleContext, prefix: &str) -> NeonResult<()> {
|
|||||||
m.export_function(&format!("{}_Sequence", prefix), sequence)?;
|
m.export_function(&format!("{}_Sequence", prefix), sequence)?;
|
||||||
m.export_function(&format!("{}_Lowercase", prefix), lowercase)?;
|
m.export_function(&format!("{}_Lowercase", prefix), lowercase)?;
|
||||||
m.export_function(&format!("{}_Strip", prefix), strip)?;
|
m.export_function(&format!("{}_Strip", prefix), strip)?;
|
||||||
|
m.export_function(&format!("{}_StripAccents", prefix), strip_accents)?;
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -9,6 +9,7 @@ NFKC = normalizers.NFKC
|
|||||||
Sequence = normalizers.Sequence
|
Sequence = normalizers.Sequence
|
||||||
Lowercase = normalizers.Lowercase
|
Lowercase = normalizers.Lowercase
|
||||||
Strip = normalizers.Strip
|
Strip = normalizers.Strip
|
||||||
|
StripAccents = normalizers.StripAccents
|
||||||
Nmt = normalizers.Nmt
|
Nmt = normalizers.Nmt
|
||||||
Precompiled = normalizers.Precompiled
|
Precompiled = normalizers.Precompiled
|
||||||
|
|
||||||
|
@ -99,6 +99,12 @@ class Strip(Normalizer):
|
|||||||
def __init__(self, left: bool = True, right: bool = True) -> Normalizer:
|
def __init__(self, left: bool = True, right: bool = True) -> Normalizer:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
class StripAccents(Normalizer):
|
||||||
|
""" StripAccents normalizer """
|
||||||
|
|
||||||
|
def __init__(self) -> Normalizer:
|
||||||
|
pass
|
||||||
|
|
||||||
class Nmt(Normalizer):
|
class Nmt(Normalizer):
|
||||||
""" Nmt normalizer """
|
""" Nmt normalizer """
|
||||||
|
|
||||||
|
@ -108,6 +108,7 @@ fn normalizers(_py: Python, m: &PyModule) -> PyResult<()> {
|
|||||||
m.add_class::<normalizers::PySequence>()?;
|
m.add_class::<normalizers::PySequence>()?;
|
||||||
m.add_class::<normalizers::PyLowercase>()?;
|
m.add_class::<normalizers::PyLowercase>()?;
|
||||||
m.add_class::<normalizers::PyStrip>()?;
|
m.add_class::<normalizers::PyStrip>()?;
|
||||||
|
m.add_class::<normalizers::PyStripAccents>()?;
|
||||||
m.add_class::<normalizers::PyNmt>()?;
|
m.add_class::<normalizers::PyNmt>()?;
|
||||||
m.add_class::<normalizers::PyPrecompiled>()?;
|
m.add_class::<normalizers::PyPrecompiled>()?;
|
||||||
Ok(())
|
Ok(())
|
||||||
|
@ -8,7 +8,8 @@ use crate::error::ToPyResult;
|
|||||||
use serde::ser::SerializeStruct;
|
use serde::ser::SerializeStruct;
|
||||||
use serde::{Deserialize, Serialize, Serializer};
|
use serde::{Deserialize, Serialize, Serializer};
|
||||||
use tk::normalizers::{
|
use tk::normalizers::{
|
||||||
BertNormalizer, Lowercase, Nmt, NormalizerWrapper, Precompiled, Strip, NFC, NFD, NFKC, NFKD,
|
BertNormalizer, Lowercase, Nmt, NormalizerWrapper, Precompiled, Strip, StripAccents, NFC, NFD,
|
||||||
|
NFKC, NFKD,
|
||||||
};
|
};
|
||||||
use tk::{NormalizedString, Normalizer};
|
use tk::{NormalizedString, Normalizer};
|
||||||
use tokenizers as tk;
|
use tokenizers as tk;
|
||||||
@ -40,6 +41,9 @@ impl PyNormalizer {
|
|||||||
NormalizerWrapper::StripNormalizer(_) => {
|
NormalizerWrapper::StripNormalizer(_) => {
|
||||||
Py::new(py, (PyBertNormalizer {}, base)).map(Into::into)
|
Py::new(py, (PyBertNormalizer {}, base)).map(Into::into)
|
||||||
}
|
}
|
||||||
|
NormalizerWrapper::StripAccents(_) => {
|
||||||
|
Py::new(py, (PyStripAccents {}, base)).map(Into::into)
|
||||||
|
}
|
||||||
NormalizerWrapper::NFC(_) => Py::new(py, (PyNFC {}, base)).map(Into::into),
|
NormalizerWrapper::NFC(_) => Py::new(py, (PyNFC {}, base)).map(Into::into),
|
||||||
NormalizerWrapper::NFD(_) => Py::new(py, (PyNFD {}, base)).map(Into::into),
|
NormalizerWrapper::NFD(_) => Py::new(py, (PyNFD {}, base)).map(Into::into),
|
||||||
NormalizerWrapper::NFKC(_) => Py::new(py, (PyNFKC {}, base)).map(Into::into),
|
NormalizerWrapper::NFKC(_) => Py::new(py, (PyNFKC {}, base)).map(Into::into),
|
||||||
@ -224,6 +228,16 @@ impl PyStrip {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name=StripAccents)]
|
||||||
|
pub struct PyStripAccents {}
|
||||||
|
#[pymethods]
|
||||||
|
impl PyStripAccents {
|
||||||
|
#[new]
|
||||||
|
fn new() -> PyResult<(Self, PyNormalizer)> {
|
||||||
|
Ok((PyStripAccents {}, StripAccents.into()))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#[derive(Clone, Deserialize)]
|
#[derive(Clone, Deserialize)]
|
||||||
#[serde(untagged)]
|
#[serde(untagged)]
|
||||||
pub(crate) enum PyNormalizerWrapper {
|
pub(crate) enum PyNormalizerWrapper {
|
||||||
|
@ -6,7 +6,7 @@ pub mod utils;
|
|||||||
|
|
||||||
pub use crate::normalizers::bert::BertNormalizer;
|
pub use crate::normalizers::bert::BertNormalizer;
|
||||||
pub use crate::normalizers::precompiled::Precompiled;
|
pub use crate::normalizers::precompiled::Precompiled;
|
||||||
pub use crate::normalizers::strip::Strip;
|
pub use crate::normalizers::strip::{Strip, StripAccents};
|
||||||
pub use crate::normalizers::unicode::{Nmt, NFC, NFD, NFKC, NFKD};
|
pub use crate::normalizers::unicode::{Nmt, NFC, NFD, NFKC, NFKD};
|
||||||
pub use crate::normalizers::utils::{Lowercase, Sequence};
|
pub use crate::normalizers::utils::{Lowercase, Sequence};
|
||||||
|
|
||||||
@ -20,6 +20,7 @@ use crate::{NormalizedString, Normalizer};
|
|||||||
pub enum NormalizerWrapper {
|
pub enum NormalizerWrapper {
|
||||||
BertNormalizer(BertNormalizer),
|
BertNormalizer(BertNormalizer),
|
||||||
StripNormalizer(Strip),
|
StripNormalizer(Strip),
|
||||||
|
StripAccents(StripAccents),
|
||||||
NFC(NFC),
|
NFC(NFC),
|
||||||
NFD(NFD),
|
NFD(NFD),
|
||||||
NFKC(NFKC),
|
NFKC(NFKC),
|
||||||
@ -35,6 +36,7 @@ impl Normalizer for NormalizerWrapper {
|
|||||||
match self {
|
match self {
|
||||||
NormalizerWrapper::BertNormalizer(bn) => bn.normalize(normalized),
|
NormalizerWrapper::BertNormalizer(bn) => bn.normalize(normalized),
|
||||||
NormalizerWrapper::StripNormalizer(sn) => sn.normalize(normalized),
|
NormalizerWrapper::StripNormalizer(sn) => sn.normalize(normalized),
|
||||||
|
NormalizerWrapper::StripAccents(sn) => sn.normalize(normalized),
|
||||||
NormalizerWrapper::NFC(nfc) => nfc.normalize(normalized),
|
NormalizerWrapper::NFC(nfc) => nfc.normalize(normalized),
|
||||||
NormalizerWrapper::NFD(nfd) => nfd.normalize(normalized),
|
NormalizerWrapper::NFD(nfd) => nfd.normalize(normalized),
|
||||||
NormalizerWrapper::NFKC(nfkc) => nfkc.normalize(normalized),
|
NormalizerWrapper::NFKC(nfkc) => nfkc.normalize(normalized),
|
||||||
@ -53,6 +55,7 @@ impl_enum_from!(NFKC, NormalizerWrapper, NFKC);
|
|||||||
impl_enum_from!(NFC, NormalizerWrapper, NFC);
|
impl_enum_from!(NFC, NormalizerWrapper, NFC);
|
||||||
impl_enum_from!(NFD, NormalizerWrapper, NFD);
|
impl_enum_from!(NFD, NormalizerWrapper, NFD);
|
||||||
impl_enum_from!(Strip, NormalizerWrapper, StripNormalizer);
|
impl_enum_from!(Strip, NormalizerWrapper, StripNormalizer);
|
||||||
|
impl_enum_from!(StripAccents, NormalizerWrapper, StripAccents);
|
||||||
impl_enum_from!(Sequence, NormalizerWrapper, Sequence);
|
impl_enum_from!(Sequence, NormalizerWrapper, Sequence);
|
||||||
impl_enum_from!(Lowercase, NormalizerWrapper, Lowercase);
|
impl_enum_from!(Lowercase, NormalizerWrapper, Lowercase);
|
||||||
impl_enum_from!(Nmt, NormalizerWrapper, Nmt);
|
impl_enum_from!(Nmt, NormalizerWrapper, Nmt);
|
||||||
|
@ -1,5 +1,6 @@
|
|||||||
use crate::tokenizer::{NormalizedString, Normalizer, Result};
|
use crate::tokenizer::{NormalizedString, Normalizer, Result};
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
|
use unicode_normalization_alignments::char::is_combining_mark;
|
||||||
|
|
||||||
#[derive(Copy, Clone, Debug, Deserialize, Serialize)]
|
#[derive(Copy, Clone, Debug, Deserialize, Serialize)]
|
||||||
#[serde(tag = "type")]
|
#[serde(tag = "type")]
|
||||||
@ -10,7 +11,7 @@ pub struct Strip {
|
|||||||
|
|
||||||
impl Strip {
|
impl Strip {
|
||||||
pub fn new(strip_left: bool, strip_right: bool) -> Self {
|
pub fn new(strip_left: bool, strip_right: bool) -> Self {
|
||||||
Strip {
|
Self {
|
||||||
strip_left,
|
strip_left,
|
||||||
strip_right,
|
strip_right,
|
||||||
}
|
}
|
||||||
@ -36,3 +37,81 @@ impl Normalizer for Strip {
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// This normalizer removes combining marks from a normalized string
|
||||||
|
// It's different from unidecode as it does not attempt to modify
|
||||||
|
// non ascii languages.
|
||||||
|
#[derive(Copy, Clone, Debug)]
|
||||||
|
pub struct StripAccents;
|
||||||
|
impl_serde_unit_struct!(StripAccentsVisitor, StripAccents);
|
||||||
|
|
||||||
|
impl Normalizer for StripAccents {
|
||||||
|
/// Strip the normalized string inplace
|
||||||
|
fn normalize(&self, normalized: &mut NormalizedString) -> Result<()> {
|
||||||
|
normalized.filter(|c| !is_combining_mark(c));
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
|
||||||
|
mod tests {
|
||||||
|
use super::*;
|
||||||
|
use crate::normalizer::NormalizedString;
|
||||||
|
use unicode_normalization_alignments::UnicodeNormalization;
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_strip_accents() {
|
||||||
|
// Unicode combining char
|
||||||
|
let original: String = "Me llamó".nfkd().map(|(c, _)| c).collect();
|
||||||
|
let normalized = "Me llamo";
|
||||||
|
assert_ne!(original, normalized);
|
||||||
|
let mut n = NormalizedString::from(original);
|
||||||
|
StripAccents.normalize(&mut n).unwrap();
|
||||||
|
assert_eq!(&n.get(), &normalized);
|
||||||
|
|
||||||
|
// Ignores regular ascii
|
||||||
|
let original = "Me llamo";
|
||||||
|
let normalized = "Me llamo";
|
||||||
|
assert_eq!(original, normalized);
|
||||||
|
let mut n = NormalizedString::from(original);
|
||||||
|
StripAccents.normalize(&mut n).unwrap();
|
||||||
|
assert_eq!(&n.get(), &normalized);
|
||||||
|
|
||||||
|
// Does not change chinese
|
||||||
|
let original: String = "这很简单".nfkd().map(|(c, _)| c).collect();
|
||||||
|
let normalized = "这很简单";
|
||||||
|
assert_eq!(original, normalized);
|
||||||
|
let mut n = NormalizedString::from(original);
|
||||||
|
StripAccents.normalize(&mut n).unwrap();
|
||||||
|
assert_eq!(&n.get(), &normalized);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_strip_accents_multiple() {
|
||||||
|
let original = "e\u{304}\u{304}\u{304}o";
|
||||||
|
let normalized = "eo";
|
||||||
|
assert_ne!(original, normalized);
|
||||||
|
let mut n = NormalizedString::from(original);
|
||||||
|
StripAccents.normalize(&mut n).unwrap();
|
||||||
|
assert_eq!(&n.get(), &normalized);
|
||||||
|
assert_eq!(
|
||||||
|
n,
|
||||||
|
NormalizedString::new(
|
||||||
|
original.to_string(),
|
||||||
|
normalized.to_string(),
|
||||||
|
vec![(0, 1), (7, 8)],
|
||||||
|
vec![
|
||||||
|
(0, 1),
|
||||||
|
(1, 1),
|
||||||
|
(1, 1),
|
||||||
|
(1, 1),
|
||||||
|
(1, 1),
|
||||||
|
(1, 1),
|
||||||
|
(1, 1),
|
||||||
|
(1, 2)
|
||||||
|
],
|
||||||
|
0
|
||||||
|
)
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Reference in New Issue
Block a user