Mirror of https://github.com/mii443/tokenizers.git
Add Sequence Normalizer
@@ -68,6 +68,7 @@ fn normalizers(_py: Python, m: &PyModule) -> PyResult<()> {
    m.add_class::<normalizers::NFKD>()?;
    m.add_class::<normalizers::NFC>()?;
    m.add_class::<normalizers::NFKC>()?;
    m.add_class::<normalizers::Sequence>()?;
    Ok(())
}

@@ -1,6 +1,7 @@
extern crate tokenizers as tk;

use super::utils::Container;
use pyo3::exceptions;
use pyo3::prelude::*;
use pyo3::types::*;

@@ -92,3 +93,31 @@ impl NFKC {
        })
    }
}

#[pyclass]
pub struct Sequence {}
#[pymethods]
impl Sequence {
    #[staticmethod]
    fn new(normalizers: &PyList) -> PyResult<Normalizer> {
        let normalizers = normalizers
            .iter()
            .map(|n| {
                let normalizer: &mut Normalizer = n.extract()?;
                if let Some(normalizer) = normalizer.normalizer.to_pointer() {
                    Ok(normalizer)
                } else {
                    Err(exceptions::Exception::py_err(
                        "At least one normalizer is already being used in another Tokenizer",
                    ))
                }
            })
            .collect::<PyResult<_>>()?;

        Ok(Normalizer {
            normalizer: Container::Owned(Box::new(tk::normalizers::utils::Sequence::new(
                normalizers,
            ))),
        })
    }
}

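Note: `Container` and `to_pointer()` come from the bindings' `utils` module, which this diff does not show. Judging by the error message, the Python-side Normalizer wrapper owns its Rust normalizer until something else (a Tokenizer, or here a Sequence) takes it over, after which a second transfer attempt is rejected. Below is a minimal sketch of that ownership pattern, using a hypothetical `Wrapper` type rather than the bindings' actual `Container`:

    // Minimal sketch of the ownership-transfer pattern; `Wrapper` and
    // `take_ownership` are hypothetical stand-ins, not the bindings' Container.
    struct Wrapper<T> {
        inner: Option<Box<T>>,
    }

    impl<T> Wrapper<T> {
        /// Hand the boxed value to a new owner; None if it was already taken.
        fn take_ownership(&mut self) -> Option<Box<T>> {
            self.inner.take()
        }
    }

    fn main() {
        let mut wrapper = Wrapper { inner: Some(Box::new(42u32)) };
        assert!(wrapper.take_ownership().is_some()); // first transfer succeeds
        assert!(wrapper.take_ownership().is_none()); // now "used in another Tokenizer"
    }
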
@@ -6,3 +6,4 @@ NFD = normalizers.NFD
NFKD = normalizers.NFKD
NFC = normalizers.NFC
NFKC = normalizers.NFKC
Sequence = normalizers.Sequence

@@ -71,3 +71,19 @@ class NFKC:
    def new() -> Normalizer:
        """ Instantiate a new NFKC Normalizer """
        pass

class Sequence:
    """ Allows concatenating multiple other Normalizer as a Sequence.

    All the normalizers run in sequence in the given order
    """

    @staticmethod
    def new(normalizers: List[Normalizer]) -> Normalizer:
        """ Instantiate a new normalization Sequence using the given normalizers

        Args:
            normalizers: List[Normalizer]:
                A list of Normalizer to be run as a sequence
        """
        pass

@@ -1,2 +1,3 @@
pub mod bert;
pub mod unicode;
pub mod utils;

tokenizers/src/normalizers/utils.rs (new file, 22 lines)
@@ -0,0 +1,22 @@
use crate::tokenizer::{NormalizedString, Normalizer, Result};

/// Allows concatenating multiple other Normalizer as a Sequence.
/// All the normalizers run in sequence in the given order against the same NormalizedString.
pub struct Sequence {
    normalizers: Vec<Box<dyn Normalizer + Sync>>,
}

impl Sequence {
    pub fn new(normalizers: Vec<Box<dyn Normalizer + Sync>>) -> Self {
        Self { normalizers }
    }
}

impl Normalizer for Sequence {
    fn normalize(&self, mut normalized: &mut NormalizedString) -> Result<()> {
        for normalizer in &self.normalizers {
            normalizer.normalize(&mut normalized)?;
        }
        Ok(())
    }
}
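
For reference, a minimal usage sketch of the new core Sequence type. It assumes the crate's existing unit-struct unicode normalizers (NFD, NFKC) and that NormalizedString can be built from a &str; neither is shown in this diff.

    // Usage sketch: chain NFD and NFKC over the same NormalizedString.
    // Assumes `NormalizedString: From<&str>` and the unit structs NFD/NFKC
    // from tokenizers::normalizers::unicode (not part of this diff).
    use tokenizers::normalizers::unicode::{NFD, NFKC};
    use tokenizers::normalizers::utils::Sequence;
    use tokenizers::tokenizer::{NormalizedString, Normalizer, Result};

    fn main() -> Result<()> {
        // Each step is boxed as a trait object; they run in the given order.
        let steps: Vec<Box<dyn Normalizer + Sync>> = vec![Box::new(NFD), Box::new(NFKC)];
        let sequence = Sequence::new(steps);

        let mut normalized = NormalizedString::from("Héllo");
        sequence.normalize(&mut normalized)?;
        println!("{}", normalized.get());
        Ok(())
    }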