mirror of
https://github.com/mii443/tokenizers.git
synced 2025-08-22 16:25:30 +00:00
Adding a new normalizer that strips accents by removing combining (#416)
* Adding a new normalizer that strips accents by removing combining characters in unicode strings. * Adding Node bindings + better normalizer impl. * Doc comment -> Regular comment.
This commit is contained in:
5
bindings/node/lib/bindings/normalizers.d.ts
vendored
5
bindings/node/lib/bindings/normalizers.d.ts
vendored
@ -75,3 +75,8 @@ export function sequenceNormalizer(normalizers: Normalizer[]): Normalizer;
|
||||
* @param [right=true] Whether or not to strip on the right (defaults to `true`)
|
||||
*/
|
||||
export function stripNormalizer(left?: boolean, right?: boolean): Normalizer;
|
||||
|
||||
/**
|
||||
* Returns a new StripAccents Normalizer, which removes combining marks (accents) from the text
|
||||
*/
|
||||
export function stripAccentsNormalizer(): Normalizer;
|
||||
|
@ -9,4 +9,5 @@ module.exports = {
|
||||
nfkdNormalizer: native.normalizers_NFKD,
|
||||
sequenceNormalizer: native.normalizers_Sequence,
|
||||
stripNormalizer: native.normalizers_Strip,
|
||||
stripAccentsNormalizer: native.normalizers_StripAccents,
|
||||
};
|
||||
|
@ -1,4 +1,4 @@
|
||||
import { stripNormalizer } from "./normalizers";
|
||||
import { stripAccentsNormalizer, stripNormalizer } from "./normalizers";
|
||||
|
||||
describe("stripNormalizer", () => {
|
||||
it("instantiates with no parameters", () => {
|
||||
@ -24,3 +24,10 @@ describe("stripNormalizer", () => {
|
||||
expect(normalizer.constructor.name).toEqual("Normalizer");
|
||||
});
|
||||
});
|
||||
|
||||
// Smoke test: the factory must hand back a native `Normalizer` instance.
describe("stripAccentsNormalizer", () => {
  it("initialize", () => {
    const instance = stripAccentsNormalizer();
    const className = instance.constructor.name;

    expect(className).toEqual("Normalizer");
  });
});
|
||||
|
@ -160,6 +160,14 @@ fn strip(mut cx: FunctionContext) -> JsResult<JsNormalizer> {
|
||||
|
||||
Ok(normalizer)
|
||||
}
|
||||
/// strip_accents()
|
||||
fn strip_accents(mut cx: FunctionContext) -> JsResult<JsNormalizer> {
|
||||
let mut normalizer = JsNormalizer::new::<_, JsNormalizer, _>(&mut cx, vec![])?;
|
||||
let guard = cx.lock();
|
||||
normalizer.borrow_mut(&guard).normalizer = Some(tk::normalizers::strip::StripAccents.into());
|
||||
|
||||
Ok(normalizer)
|
||||
}
|
||||
|
||||
/// sequence(normalizers: Normalizer[])
|
||||
fn sequence(mut cx: FunctionContext) -> JsResult<JsNormalizer> {
|
||||
@ -212,6 +220,7 @@ pub fn register(m: &mut ModuleContext, prefix: &str) -> NeonResult<()> {
|
||||
m.export_function(&format!("{}_Sequence", prefix), sequence)?;
|
||||
m.export_function(&format!("{}_Lowercase", prefix), lowercase)?;
|
||||
m.export_function(&format!("{}_Strip", prefix), strip)?;
|
||||
m.export_function(&format!("{}_StripAccents", prefix), strip_accents)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
@ -9,6 +9,7 @@ NFKC = normalizers.NFKC
|
||||
Sequence = normalizers.Sequence
|
||||
Lowercase = normalizers.Lowercase
|
||||
Strip = normalizers.Strip
|
||||
StripAccents = normalizers.StripAccents
|
||||
Nmt = normalizers.Nmt
|
||||
Precompiled = normalizers.Precompiled
|
||||
|
||||
|
@ -99,6 +99,12 @@ class Strip(Normalizer):
|
||||
def __init__(self, left: bool = True, right: bool = True) -> Normalizer:
|
||||
pass
|
||||
|
||||
class StripAccents(Normalizer):
|
||||
""" StripAccents normalizer: removes combining marks (accents) from the text """
|
||||
|
||||
def __init__(self) -> Normalizer:
|
||||
pass
|
||||
|
||||
class Nmt(Normalizer):
|
||||
""" Nmt normalizer """
|
||||
|
||||
|
@ -108,6 +108,7 @@ fn normalizers(_py: Python, m: &PyModule) -> PyResult<()> {
|
||||
m.add_class::<normalizers::PySequence>()?;
|
||||
m.add_class::<normalizers::PyLowercase>()?;
|
||||
m.add_class::<normalizers::PyStrip>()?;
|
||||
m.add_class::<normalizers::PyStripAccents>()?;
|
||||
m.add_class::<normalizers::PyNmt>()?;
|
||||
m.add_class::<normalizers::PyPrecompiled>()?;
|
||||
Ok(())
|
||||
|
@ -8,7 +8,8 @@ use crate::error::ToPyResult;
|
||||
use serde::ser::SerializeStruct;
|
||||
use serde::{Deserialize, Serialize, Serializer};
|
||||
use tk::normalizers::{
|
||||
BertNormalizer, Lowercase, Nmt, NormalizerWrapper, Precompiled, Strip, NFC, NFD, NFKC, NFKD,
|
||||
BertNormalizer, Lowercase, Nmt, NormalizerWrapper, Precompiled, Strip, StripAccents, NFC, NFD,
|
||||
NFKC, NFKD,
|
||||
};
|
||||
use tk::{NormalizedString, Normalizer};
|
||||
use tokenizers as tk;
|
||||
@ -40,6 +41,9 @@ impl PyNormalizer {
|
||||
NormalizerWrapper::StripNormalizer(_) => {
|
||||
Py::new(py, (PyBertNormalizer {}, base)).map(Into::into)
|
||||
}
|
||||
NormalizerWrapper::StripAccents(_) => {
|
||||
Py::new(py, (PyStripAccents {}, base)).map(Into::into)
|
||||
}
|
||||
NormalizerWrapper::NFC(_) => Py::new(py, (PyNFC {}, base)).map(Into::into),
|
||||
NormalizerWrapper::NFD(_) => Py::new(py, (PyNFD {}, base)).map(Into::into),
|
||||
NormalizerWrapper::NFKC(_) => Py::new(py, (PyNFKC {}, base)).map(Into::into),
|
||||
@ -224,6 +228,16 @@ impl PyStrip {
|
||||
}
|
||||
}
|
||||
|
||||
#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name=StripAccents)]
|
||||
pub struct PyStripAccents {}
|
||||
#[pymethods]
|
||||
impl PyStripAccents {
|
||||
#[new]
|
||||
fn new() -> PyResult<(Self, PyNormalizer)> {
|
||||
Ok((PyStripAccents {}, StripAccents.into()))
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Deserialize)]
|
||||
#[serde(untagged)]
|
||||
pub(crate) enum PyNormalizerWrapper {
|
||||
|
@ -6,7 +6,7 @@ pub mod utils;
|
||||
|
||||
pub use crate::normalizers::bert::BertNormalizer;
|
||||
pub use crate::normalizers::precompiled::Precompiled;
|
||||
pub use crate::normalizers::strip::Strip;
|
||||
pub use crate::normalizers::strip::{Strip, StripAccents};
|
||||
pub use crate::normalizers::unicode::{Nmt, NFC, NFD, NFKC, NFKD};
|
||||
pub use crate::normalizers::utils::{Lowercase, Sequence};
|
||||
|
||||
@ -20,6 +20,7 @@ use crate::{NormalizedString, Normalizer};
|
||||
pub enum NormalizerWrapper {
|
||||
BertNormalizer(BertNormalizer),
|
||||
StripNormalizer(Strip),
|
||||
StripAccents(StripAccents),
|
||||
NFC(NFC),
|
||||
NFD(NFD),
|
||||
NFKC(NFKC),
|
||||
@ -35,6 +36,7 @@ impl Normalizer for NormalizerWrapper {
|
||||
match self {
|
||||
NormalizerWrapper::BertNormalizer(bn) => bn.normalize(normalized),
|
||||
NormalizerWrapper::StripNormalizer(sn) => sn.normalize(normalized),
|
||||
NormalizerWrapper::StripAccents(sn) => sn.normalize(normalized),
|
||||
NormalizerWrapper::NFC(nfc) => nfc.normalize(normalized),
|
||||
NormalizerWrapper::NFD(nfd) => nfd.normalize(normalized),
|
||||
NormalizerWrapper::NFKC(nfkc) => nfkc.normalize(normalized),
|
||||
@ -53,6 +55,7 @@ impl_enum_from!(NFKC, NormalizerWrapper, NFKC);
|
||||
impl_enum_from!(NFC, NormalizerWrapper, NFC);
|
||||
impl_enum_from!(NFD, NormalizerWrapper, NFD);
|
||||
impl_enum_from!(Strip, NormalizerWrapper, StripNormalizer);
|
||||
impl_enum_from!(StripAccents, NormalizerWrapper, StripAccents);
|
||||
impl_enum_from!(Sequence, NormalizerWrapper, Sequence);
|
||||
impl_enum_from!(Lowercase, NormalizerWrapper, Lowercase);
|
||||
impl_enum_from!(Nmt, NormalizerWrapper, Nmt);
|
||||
|
@ -1,5 +1,6 @@
|
||||
use crate::tokenizer::{NormalizedString, Normalizer, Result};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use unicode_normalization_alignments::char::is_combining_mark;
|
||||
|
||||
#[derive(Copy, Clone, Debug, Deserialize, Serialize)]
|
||||
#[serde(tag = "type")]
|
||||
@ -10,7 +11,7 @@ pub struct Strip {
|
||||
|
||||
impl Strip {
|
||||
pub fn new(strip_left: bool, strip_right: bool) -> Self {
|
||||
Strip {
|
||||
Self {
|
||||
strip_left,
|
||||
strip_right,
|
||||
}
|
||||
@ -36,3 +37,81 @@ impl Normalizer for Strip {
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
// This normalizer removes combining marks from a normalized string
|
||||
// It's different from unidecode as it does not attempt to modify
|
||||
// non ascii languages.
|
||||
#[derive(Copy, Clone, Debug)]
|
||||
pub struct StripAccents;
|
||||
impl_serde_unit_struct!(StripAccentsVisitor, StripAccents);
|
||||
|
||||
impl Normalizer for StripAccents {
|
||||
/// Strip the normalized string inplace
|
||||
fn normalize(&self, normalized: &mut NormalizedString) -> Result<()> {
|
||||
normalized.filter(|c| !is_combining_mark(c));
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use crate::normalizer::NormalizedString;
    use unicode_normalization_alignments::UnicodeNormalization;

    // Builds a `NormalizedString` from `input`, runs `StripAccents` over it
    // and returns the result so each case can inspect it.
    fn stripped(input: impl Into<NormalizedString>) -> NormalizedString {
        let mut n: NormalizedString = input.into();
        StripAccents.normalize(&mut n).unwrap();
        n
    }

    #[test]
    fn test_strip_accents() {
        // Unicode combining char
        let decomposed: String = "Me llamó".nfkd().map(|(c, _)| c).collect();
        let expected = "Me llamo";
        assert_ne!(decomposed, expected);
        let n = stripped(decomposed);
        assert_eq!(n.get(), expected);

        // Ignores regular ascii
        let plain = "Me llamo";
        let expected = "Me llamo";
        assert_eq!(plain, expected);
        let n = stripped(plain);
        assert_eq!(n.get(), expected);

        // Does not change chinese
        let chinese: String = "这很简单".nfkd().map(|(c, _)| c).collect();
        let expected = "这很简单";
        assert_eq!(chinese, expected);
        let n = stripped(chinese);
        assert_eq!(n.get(), expected);
    }

    #[test]
    fn test_strip_accents_multiple() {
        // One base letter followed by three stacked combining marks.
        let original = "e\u{304}\u{304}\u{304}o";
        let normalized = "eo";
        assert_ne!(original, normalized);

        let n = stripped(original);
        assert_eq!(n.get(), normalized);

        // The full state (both strings plus both alignment tables) must match.
        let expected = NormalizedString::new(
            original.to_string(),
            normalized.to_string(),
            vec![(0, 1), (7, 8)],
            vec![
                (0, 1),
                (1, 1),
                (1, 1),
                (1, 1),
                (1, 1),
                (1, 1),
                (1, 1),
                (1, 2),
            ],
            0,
        );
        assert_eq!(n, expected);
    }
}
|
||||
|
Reference in New Issue
Block a user