Python - Add normalizers bindings & BertNormalizer

Anthony MOI
2019-12-29 00:36:09 -05:00
parent 81be029881
commit 3f79d9d5e0
4 changed files with 71 additions and 1 deletion

bindings/python/src/lib.rs

@@ -2,6 +2,7 @@ mod decoders;
 mod encoding;
 mod error;
 mod models;
+mod normalizers;
 mod pre_tokenizers;
 mod processors;
 mod token;
@@ -55,6 +56,14 @@ fn processors(_py: Python, m: &PyModule) -> PyResult<()> {
     Ok(())
 }
 
+/// Normalizers Module
+#[pymodule]
+fn normalizers(_py: Python, m: &PyModule) -> PyResult<()> {
+    m.add_class::<normalizers::Normalizer>()?;
+    m.add_class::<normalizers::BertNormalizer>()?;
+    Ok(())
+}
+
 /// Tokenizers Module
 #[pymodule]
 fn tokenizers(_py: Python, m: &PyModule) -> PyResult<()> {
@@ -63,6 +72,7 @@ fn tokenizers(_py: Python, m: &PyModule) -> PyResult<()> {
     m.add_wrapped(wrap_pymodule!(pre_tokenizers))?;
     m.add_wrapped(wrap_pymodule!(decoders))?;
     m.add_wrapped(wrap_pymodule!(processors))?;
+    m.add_wrapped(wrap_pymodule!(normalizers))?;
     m.add_wrapped(wrap_pymodule!(trainers))?;
     Ok(())
 }
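
What this buys on the Python side: `wrap_pymodule!` attaches each wrapped module as an attribute of the native `tokenizers` extension, so the new classes become importable once the extension is rebuilt. A minimal smoke test (hypothetical, not part of the commit):

    # assumes the extension module was rebuilt with these bindings
    from tokenizers.tokenizers import normalizers  # native submodule

    assert hasattr(normalizers, "Normalizer")
    assert hasattr(normalizers, "BertNormalizer")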

bindings/python/src/normalizers.rs

@@ -0,0 +1,48 @@
+extern crate tokenizers as tk;
+
+use super::error::{PyError, ToPyResult};
+use super::utils::Container;
+use pyo3::prelude::*;
+use pyo3::types::*;
+use tk::tokenizer::Result;
+
+#[pyclass(dict)]
+pub struct Normalizer {
+    pub normalizer: Container<dyn tk::tokenizer::Normalizer + Sync>,
+}
+
+#[pyclass]
+pub struct BertNormalizer {}
+#[pymethods]
+impl BertNormalizer {
+    #[staticmethod]
+    #[args(kwargs = "**")]
+    fn new(kwargs: Option<&PyDict>) -> PyResult<Normalizer> {
+        let mut clean_text = true;
+        let mut handle_chinese_chars = true;
+        let mut strip_accents = true;
+        let mut lowercase = true;
+
+        if let Some(kwargs) = kwargs {
+            for (key, value) in kwargs {
+                let key: &str = key.extract()?;
+                match key {
+                    "clean_text" => clean_text = value.extract()?,
+                    "handle_chinese_chars" => handle_chinese_chars = value.extract()?,
+                    "strip_accents" => strip_accents = value.extract()?,
+                    "lowercase" => lowercase = value.extract()?,
+                    _ => println!("Ignored unknown kwargs option {}", key),
+                }
+            }
+        }
+
+        Ok(Normalizer {
+            normalizer: Container::Owned(Box::new(tk::normalizers::bert::BertNormalizer::new(
+                clean_text,
+                handle_chinese_chars,
+                strip_accents,
+                lowercase,
+            ))),
+        })
+    }
+}
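
All four flags default to true; any subset can be overridden through kwargs, and unknown keys are merely skipped with a printed message. Note that `new` is a `#[staticmethod]` returning the generic `Normalizer` wrapper rather than a `BertNormalizer` instance. A usage sketch (the top-level re-export used here lands in the last file of this commit):

    from tokenizers import normalizers

    # all defaults: clean_text=True, handle_chinese_chars=True,
    # strip_accents=True, lowercase=True
    default = normalizers.BertNormalizer.new()

    # cased variant: keep accents and casing; a mistyped key would only
    # print "Ignored unknown kwargs option ..." instead of raising
    cased = normalizers.BertNormalizer.new(strip_accents=False, lowercase=False)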

bindings/python/src/tokenizer.rs

@@ -8,6 +8,7 @@ use super::decoders::Decoder;
 use super::encoding::Encoding;
 use super::error::{PyError, ToPyResult};
 use super::models::Model;
+use super::normalizers::Normalizer;
 use super::pre_tokenizers::PreTokenizer;
 use super::processors::PostProcessor;
 use super::trainers::Trainer;
@@ -97,6 +98,17 @@ impl Tokenizer {
         }
     }
 
+    fn with_normalizer(&mut self, normalizer: &mut Normalizer) -> PyResult<()> {
+        if let Some(normalizer) = normalizer.normalizer.to_pointer() {
+            self.tokenizer.with_normalizer(normalizer);
+            Ok(())
+        } else {
+            Err(exceptions::Exception::py_err(
+                "The Normalizer is already being used in another Tokenizer",
+            ))
+        }
+    }
+
     #[args(kwargs = "**")]
     fn with_truncation(&mut self, max_length: usize, kwargs: Option<&PyDict>) -> PyResult<()> {
         let mut stride = 0;
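
Since `Container::to_pointer` hands the boxed normalizer over to the `Tokenizer`, each `Normalizer` object can be attached at most once; a second attempt takes the `else` branch above. A sketch of both outcomes (`tok_a` and `tok_b` are assumed, pre-built `Tokenizer` instances):

    from tokenizers import normalizers

    norm = normalizers.BertNormalizer.new()
    tok_a.with_normalizer(norm)  # ok: the container gives up its pointer
    tok_b.with_normalizer(norm)  # raises an Exception: "The Normalizer is
                                 # already being used in another Tokenizer"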

bindings/python/tokenizers/__init__.py

@@ -1,3 +1,3 @@
 __version__ = "0.0.11"
 
-from .tokenizers import Tokenizer, models, decoders, pre_tokenizers, trainers, processors
+from .tokenizers import Tokenizer, models, decoders, pre_tokenizers, trainers, processors, normalizers
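
With the re-export in place, the whole commit wires together from the public namespace; an end-to-end sketch, assuming `models.BPE.empty()` from the existing bindings:

    from tokenizers import Tokenizer, models, normalizers

    tokenizer = Tokenizer(models.BPE.empty())  # assumed existing constructor/model
    tokenizer.with_normalizer(normalizers.BertNormalizer.new(lowercase=True))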