mirror of
https://github.com/mii443/tokenizers.git
synced 2025-12-05 12:18:20 +00:00
Adding a new normalizer that strips accents by removing combining characters (#416)
* Adding a new normalizer that strips accents by removing combining characters in Unicode strings.
* Adding Node bindings + better normalizer impl.
* Doc comment -> Regular comment.
This commit is contained in:
@@ -9,6 +9,7 @@ NFKC = normalizers.NFKC
|
||||
Sequence = normalizers.Sequence
|
||||
Lowercase = normalizers.Lowercase
|
||||
Strip = normalizers.Strip
|
||||
StripAccents = normalizers.StripAccents
|
||||
Nmt = normalizers.Nmt
|
||||
Precompiled = normalizers.Precompiled
|
||||
|
||||
|
||||
@@ -99,6 +99,12 @@ class Strip(Normalizer):
|
||||
def __init__(self, left: bool = True, right: bool = True) -> Normalizer:
|
||||
pass
|
||||
|
||||
# Type stub for the StripAccents normalizer. Per the commit description, this
# normalizer strips accents by removing combining characters in unicode strings.
# The constructor takes no arguments.
class StripAccents(Normalizer):
|
||||
""" StripAccents normalizer """
|
||||
|
||||
def __init__(self) -> Normalizer:
|
||||
pass
|
||||
|
||||
class Nmt(Normalizer):
|
||||
""" Nmt normalizer """
|
||||
|
||||
|
||||
@@ -108,6 +108,7 @@ fn normalizers(_py: Python, m: &PyModule) -> PyResult<()> {
|
||||
m.add_class::<normalizers::PySequence>()?;
|
||||
m.add_class::<normalizers::PyLowercase>()?;
|
||||
m.add_class::<normalizers::PyStrip>()?;
|
||||
m.add_class::<normalizers::PyStripAccents>()?;
|
||||
m.add_class::<normalizers::PyNmt>()?;
|
||||
m.add_class::<normalizers::PyPrecompiled>()?;
|
||||
Ok(())
|
||||
|
||||
@@ -8,7 +8,8 @@ use crate::error::ToPyResult;
|
||||
use serde::ser::SerializeStruct;
|
||||
use serde::{Deserialize, Serialize, Serializer};
|
||||
use tk::normalizers::{
|
||||
BertNormalizer, Lowercase, Nmt, NormalizerWrapper, Precompiled, Strip, NFC, NFD, NFKC, NFKD,
|
||||
BertNormalizer, Lowercase, Nmt, NormalizerWrapper, Precompiled, Strip, StripAccents, NFC, NFD,
|
||||
NFKC, NFKD,
|
||||
};
|
||||
use tk::{NormalizedString, Normalizer};
|
||||
use tokenizers as tk;
|
||||
@@ -40,6 +41,9 @@ impl PyNormalizer {
|
||||
// NOTE(review): this arm matches `StripNormalizer` but constructs a
// `PyBertNormalizer`. A `PyStrip` class is registered on the Python module in
// this same change set, so this looks like a copy-paste slip — confirm whether
// `PyStrip` was intended here.
NormalizerWrapper::StripNormalizer(_) => {
|
||||
Py::new(py, (PyBertNormalizer {}, base)).map(Into::into)
|
||||
}
|
||||
NormalizerWrapper::StripAccents(_) => {
|
||||
Py::new(py, (PyStripAccents {}, base)).map(Into::into)
|
||||
}
|
||||
NormalizerWrapper::NFC(_) => Py::new(py, (PyNFC {}, base)).map(Into::into),
|
||||
NormalizerWrapper::NFD(_) => Py::new(py, (PyNFD {}, base)).map(Into::into),
|
||||
NormalizerWrapper::NFKC(_) => Py::new(py, (PyNFKC {}, base)).map(Into::into),
|
||||
@@ -224,6 +228,16 @@ impl PyStrip {
|
||||
}
|
||||
}
|
||||
|
||||
// Python class `StripAccents` in module `tokenizers.normalizers`.
// Field-less type that extends `PyNormalizer`.
#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name=StripAccents)]
|
||||
pub struct PyStripAccents {}
|
||||
#[pymethods]
|
||||
impl PyStripAccents {
|
||||
// Constructor (marked `#[new]`); takes no arguments.
// Returns the subclass value paired with its `PyNormalizer` base, which is
// obtained by converting the `StripAccents` normalizer via `.into()`.
// This function always returns `Ok`.
#[new]
|
||||
fn new() -> PyResult<(Self, PyNormalizer)> {
|
||||
Ok((PyStripAccents {}, StripAccents.into()))
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Deserialize)]
|
||||
#[serde(untagged)]
|
||||
pub(crate) enum PyNormalizerWrapper {
|
||||
|
||||
Reference in New Issue
Block a user