mirror of
https://github.com/mii443/tokenizers.git
synced 2025-09-02 07:19:24 +00:00
Adding a new "Replace" normalizer that takes a string and replaces it
with another String (for now).
This commit is contained in:
@ -12,6 +12,7 @@ Strip = normalizers.Strip
|
|||||||
StripAccents = normalizers.StripAccents
|
StripAccents = normalizers.StripAccents
|
||||||
Nmt = normalizers.Nmt
|
Nmt = normalizers.Nmt
|
||||||
Precompiled = normalizers.Precompiled
|
Precompiled = normalizers.Precompiled
|
||||||
|
Replace = normalizers.Replace
|
||||||
|
|
||||||
|
|
||||||
NORMALIZERS = {"nfc": NFC, "nfd": NFD, "nfkc": NFKC, "nfkd": NFKD}
|
NORMALIZERS = {"nfc": NFC, "nfd": NFD, "nfkc": NFKC, "nfkd": NFKD}
|
||||||
|
@ -112,11 +112,17 @@ class Nmt(Normalizer):
|
|||||||
pass
|
pass
|
||||||
|
|
||||||
class Precompiled(Normalizer):
|
class Precompiled(Normalizer):
|
||||||
""" SpmNmtNfkc normalizer """
|
""" Precompiled normalizer """
|
||||||
|
|
||||||
def __init__(self, precompiled_charsmap: bytes) -> Normalizer:
|
def __init__(self, precompiled_charsmap: bytes) -> Normalizer:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
class Replace(Normalizer):
    """ Replace normalizer

    Replaces every occurrence of `pattern` with `content`. For now the
    pattern can only be a plain string.
    """

    def __init__(self, pattern: str, content: str) -> Normalizer:
        """ Instantiate a new Replace normalizer

        Args:
            pattern: str:
                The string to look for in the text being normalized

            content: str:
                The string that takes the place of each occurrence of `pattern`
        """
        pass
|
||||||
|
|
||||||
def unicode_normalizer_from_str(normalizer: str) -> Normalizer:
|
def unicode_normalizer_from_str(normalizer: str) -> Normalizer:
|
||||||
"""
|
"""
|
||||||
Instanciate unicode normalizer from the normalizer name
|
Instanciate unicode normalizer from the normalizer name
|
||||||
|
@ -111,6 +111,7 @@ fn normalizers(_py: Python, m: &PyModule) -> PyResult<()> {
|
|||||||
m.add_class::<normalizers::PyStripAccents>()?;
|
m.add_class::<normalizers::PyStripAccents>()?;
|
||||||
m.add_class::<normalizers::PyNmt>()?;
|
m.add_class::<normalizers::PyNmt>()?;
|
||||||
m.add_class::<normalizers::PyPrecompiled>()?;
|
m.add_class::<normalizers::PyPrecompiled>()?;
|
||||||
|
m.add_class::<normalizers::PyReplace>()?;
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -8,8 +8,13 @@ use crate::error::ToPyResult;
|
|||||||
use serde::ser::SerializeStruct;
|
use serde::ser::SerializeStruct;
|
||||||
use serde::{Deserialize, Serialize, Serializer};
|
use serde::{Deserialize, Serialize, Serializer};
|
||||||
use tk::normalizers::{
|
use tk::normalizers::{
|
||||||
|
<<<<<<< HEAD
|
||||||
BertNormalizer, Lowercase, Nmt, NormalizerWrapper, Precompiled, Strip, StripAccents, NFC, NFD,
|
BertNormalizer, Lowercase, Nmt, NormalizerWrapper, Precompiled, Strip, StripAccents, NFC, NFD,
|
||||||
NFKC, NFKD,
|
NFKC, NFKD,
|
||||||
|
=======
|
||||||
|
BertNormalizer, Lowercase, Nmt, NormalizerWrapper, Precompiled, Replace, Strip, NFC, NFD, NFKC,
|
||||||
|
NFKD,
|
||||||
|
>>>>>>> Adding a new "Replace" normalizer that takes a string and replaces it
|
||||||
};
|
};
|
||||||
use tk::{NormalizedString, Normalizer};
|
use tk::{NormalizedString, Normalizer};
|
||||||
use tokenizers as tk;
|
use tokenizers as tk;
|
||||||
@ -54,6 +59,7 @@ impl PyNormalizer {
|
|||||||
NormalizerWrapper::Precompiled(_) => {
|
NormalizerWrapper::Precompiled(_) => {
|
||||||
Py::new(py, (PyPrecompiled {}, base)).map(Into::into)
|
Py::new(py, (PyPrecompiled {}, base)).map(Into::into)
|
||||||
}
|
}
|
||||||
|
NormalizerWrapper::Replace(_) => Py::new(py, (PyReplace {}, base)).map(Into::into),
|
||||||
NormalizerWrapper::Nmt(_) => Py::new(py, (PyNmt {}, base)).map(Into::into),
|
NormalizerWrapper::Nmt(_) => Py::new(py, (PyNmt {}, base)).map(Into::into),
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
@ -324,6 +330,18 @@ impl PyPrecompiled {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name=Replace)]
|
||||||
|
pub struct PyReplace {}
|
||||||
|
#[pymethods]
|
||||||
|
impl PyReplace {
|
||||||
|
#[new]
|
||||||
|
fn new(pattern: &PyString, content: &PyString) -> PyResult<(Self, PyNormalizer)> {
|
||||||
|
let pattern: String = (*pattern).to_string()?.to_string();
|
||||||
|
let content: String = (*content).to_string()?.to_string();
|
||||||
|
Ok((PyReplace {}, Replace::new(pattern, content).into()))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod test {
|
mod test {
|
||||||
use pyo3::{AsPyRef, Python};
|
use pyo3::{AsPyRef, Python};
|
||||||
|
@ -1,11 +1,13 @@
|
|||||||
pub mod bert;
|
pub mod bert;
|
||||||
pub mod precompiled;
|
pub mod precompiled;
|
||||||
|
pub mod replace;
|
||||||
pub mod strip;
|
pub mod strip;
|
||||||
pub mod unicode;
|
pub mod unicode;
|
||||||
pub mod utils;
|
pub mod utils;
|
||||||
|
|
||||||
pub use crate::normalizers::bert::BertNormalizer;
|
pub use crate::normalizers::bert::BertNormalizer;
|
||||||
pub use crate::normalizers::precompiled::Precompiled;
|
pub use crate::normalizers::precompiled::Precompiled;
|
||||||
|
pub use crate::normalizers::replace::Replace;
|
||||||
pub use crate::normalizers::strip::{Strip, StripAccents};
|
pub use crate::normalizers::strip::{Strip, StripAccents};
|
||||||
pub use crate::normalizers::unicode::{Nmt, NFC, NFD, NFKC, NFKD};
|
pub use crate::normalizers::unicode::{Nmt, NFC, NFD, NFKC, NFKD};
|
||||||
pub use crate::normalizers::utils::{Lowercase, Sequence};
|
pub use crate::normalizers::utils::{Lowercase, Sequence};
|
||||||
@ -29,6 +31,7 @@ pub enum NormalizerWrapper {
|
|||||||
Lowercase(Lowercase),
|
Lowercase(Lowercase),
|
||||||
Nmt(Nmt),
|
Nmt(Nmt),
|
||||||
Precompiled(Precompiled),
|
Precompiled(Precompiled),
|
||||||
|
Replace(Replace),
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Normalizer for NormalizerWrapper {
|
impl Normalizer for NormalizerWrapper {
|
||||||
@ -45,6 +48,7 @@ impl Normalizer for NormalizerWrapper {
|
|||||||
NormalizerWrapper::Lowercase(lc) => lc.normalize(normalized),
|
NormalizerWrapper::Lowercase(lc) => lc.normalize(normalized),
|
||||||
NormalizerWrapper::Nmt(lc) => lc.normalize(normalized),
|
NormalizerWrapper::Nmt(lc) => lc.normalize(normalized),
|
||||||
NormalizerWrapper::Precompiled(lc) => lc.normalize(normalized),
|
NormalizerWrapper::Precompiled(lc) => lc.normalize(normalized),
|
||||||
|
NormalizerWrapper::Replace(lc) => lc.normalize(normalized),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -60,3 +64,4 @@ impl_enum_from!(Sequence, NormalizerWrapper, Sequence);
|
|||||||
impl_enum_from!(Lowercase, NormalizerWrapper, Lowercase);
|
impl_enum_from!(Lowercase, NormalizerWrapper, Lowercase);
|
||||||
impl_enum_from!(Nmt, NormalizerWrapper, Nmt);
|
impl_enum_from!(Nmt, NormalizerWrapper, Nmt);
|
||||||
impl_enum_from!(Precompiled, NormalizerWrapper, Precompiled);
|
impl_enum_from!(Precompiled, NormalizerWrapper, Precompiled);
|
||||||
|
impl_enum_from!(Replace, NormalizerWrapper, Replace);
|
||||||
|
43
tokenizers/src/normalizers/replace.rs
Normal file
43
tokenizers/src/normalizers/replace.rs
Normal file
@ -0,0 +1,43 @@
|
|||||||
|
use crate::tokenizer::{NormalizedString, Normalizer, Result};
|
||||||
|
use serde::{Deserialize, Serialize};
|
||||||
|
|
||||||
|
/// This normalizer will take a `pattern` (for now only a String)
|
||||||
|
/// and replace every occurrence with `content`.
|
||||||
|
#[derive(Deserialize, Serialize, Clone, Debug)]
|
||||||
|
#[serde(tag = "type")]
|
||||||
|
pub struct Replace {
|
||||||
|
pattern: String,
|
||||||
|
content: String,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Replace {
|
||||||
|
pub fn new(pattern: String, content: String) -> Self {
|
||||||
|
Self { pattern, content }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Normalizer for Replace {
|
||||||
|
fn normalize(&self, normalized: &mut NormalizedString) -> Result<()> {
|
||||||
|
let pattern: &str = &self.pattern;
|
||||||
|
normalized.replace(pattern, &self.content)?;
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;

    /// Doubled single quotes must be turned into one double quote each.
    #[test]
    fn test_replace() {
        let input = "This is a ''test''";
        let expected = "This is a \"test\"";

        let replacer = Replace::new("''".to_string(), "\"".to_string());
        let mut n = NormalizedString::from(input);
        replacer.normalize(&mut n).unwrap();

        assert_eq!(n.get(), expected);
    }
}
|
Reference in New Issue
Block a user