Add Sequence Normalizer

This commit is contained in:
Anthony MOI
2020-01-06 21:03:05 -05:00
parent 5c02bbbc4c
commit 185b6f0b8b
6 changed files with 70 additions and 0 deletions

View File

@ -68,6 +68,7 @@ fn normalizers(_py: Python, m: &PyModule) -> PyResult<()> {
m.add_class::<normalizers::NFKD>()?;
m.add_class::<normalizers::NFC>()?;
m.add_class::<normalizers::NFKC>()?;
m.add_class::<normalizers::Sequence>()?;
Ok(())
}

View File

@ -1,6 +1,7 @@
extern crate tokenizers as tk;
use super::utils::Container;
use pyo3::exceptions;
use pyo3::prelude::*;
use pyo3::types::*;
@ -92,3 +93,31 @@ impl NFKC {
})
}
}
/// Python-facing factory for a normalizer that chains several other normalizers.
///
/// The struct carries no state of its own; `new` hands back a `Normalizer`.
#[pyclass]
pub struct Sequence {}

#[pymethods]
impl Sequence {
    /// Builds a `Normalizer` that owns a `tk::normalizers::utils::Sequence`
    /// assembled from the elements of `normalizers`.
    ///
    /// Every list element is extracted as a `Normalizer`, and its inner value is
    /// obtained through `to_pointer()` before being added to the sequence.
    ///
    /// # Errors
    ///
    /// Propagates the extraction error when an element cannot be extracted as a
    /// `Normalizer`, and returns an exception as soon as `to_pointer()` yields
    /// `None` for an element.
    #[staticmethod]
    fn new(normalizers: &PyList) -> PyResult<Normalizer> {
        let mut chain = Vec::new();
        for item in normalizers.iter() {
            let wrapper: &mut Normalizer = item.extract()?;
            match wrapper.normalizer.to_pointer() {
                Some(inner) => chain.push(inner),
                None => {
                    // NOTE(review): elements handled before this one have already
                    // gone through `to_pointer()`; confirm they remain usable on
                    // the Python side after this early return.
                    return Err(exceptions::Exception::py_err(
                        "At least one normalizer is already being used in another Tokenizer",
                    ));
                }
            }
        }

        let sequence = tk::normalizers::utils::Sequence::new(chain);
        Ok(Normalizer {
            normalizer: Container::Owned(Box::new(sequence)),
        })
    }
}

View File

@ -6,3 +6,4 @@ NFD = normalizers.NFD
# Each name below is bound to the attribute of the same name on `normalizers`.
NFKD = normalizers.NFKD
NFC = normalizers.NFC
NFKC = normalizers.NFKC
Sequence = normalizers.Sequence

View File

@ -71,3 +71,19 @@ class NFKC:
def new() -> Normalizer:
""" Instantiate a new NFKC Normalizer """
pass
class Sequence:
    """ Chain several Normalizer instances into a single one.

    The given normalizers are run one after another, in the order provided.
    """

    @staticmethod
    def new(normalizers: List[Normalizer]) -> Normalizer:
        """ Instantiate a new normalization Sequence from the given normalizers

        Args:
            normalizers: List[Normalizer]:
                The Normalizer instances to run, in order, as a sequence
        """
        pass

View File

@ -1,2 +1,3 @@
pub mod bert;
pub mod unicode;
pub mod utils;

View File

@ -0,0 +1,22 @@
use crate::tokenizer::{NormalizedString, Normalizer, Result};
/// Chains several `Normalizer`s into a single one.
///
/// The wrapped normalizers are applied one after another, in the order they
/// were supplied, all against the same `NormalizedString`.
pub struct Sequence {
    /// Normalizers to apply, in order.
    normalizers: Vec<Box<dyn Normalizer + Sync>>,
}

impl Sequence {
    /// Creates a sequence that runs `normalizers` in the given order.
    pub fn new(normalizers: Vec<Box<dyn Normalizer + Sync>>) -> Self {
        Sequence { normalizers }
    }
}

impl Normalizer for Sequence {
    /// Applies every wrapped normalizer to `normalized` in order.
    ///
    /// Stops at the first normalizer that fails and returns its error;
    /// returns `Ok(())` once all of them have run.
    fn normalize(&self, normalized: &mut NormalizedString) -> Result<()> {
        self.normalizers
            .iter()
            .try_for_each(|step| step.normalize(normalized))
    }
}