Add basic unicode normalizers

This commit is contained in:
Anthony MOI
2020-01-06 20:38:42 -05:00
parent 4b9ae66419
commit 5c02bbbc4c
7 changed files with 123 additions and 2 deletions

View File

@@ -64,6 +64,10 @@ fn processors(_py: Python, m: &PyModule) -> PyResult<()> {
// Registers every normalizer class on the Python module `m`: the generic
// `Normalizer` wrapper, `BertNormalizer`, and the four Unicode normalization
// forms (`NFD`, `NFKD`, `NFC`, `NFKC`).
//
// Each `add_class` call is followed by `?`, so the first registration error
// is returned to the caller; `Ok(())` is returned once all classes are added.
fn normalizers(_py: Python, m: &PyModule) -> PyResult<()> {
    m.add_class::<normalizers::Normalizer>()?;
    m.add_class::<normalizers::BertNormalizer>()?;
    m.add_class::<normalizers::NFD>()?;
    m.add_class::<normalizers::NFKD>()?;
    m.add_class::<normalizers::NFC>()?;
    m.add_class::<normalizers::NFKC>()?;
    Ok(())
}

View File

@@ -44,3 +44,51 @@ impl BertNormalizer {
}) })
} }
} }
// Python-visible class for the NFD Unicode normalizer. The struct has no
// fields; its only member is the static `new` factory below.
#[pyclass]
pub struct NFD {}

#[pymethods]
impl NFD {
    // Returns a `Normalizer` whose container owns a boxed
    // `tk::normalizers::unicode::NFD`.
    #[staticmethod]
    fn new() -> PyResult<Normalizer> {
        // The box is built directly in field position so it coerces to the
        // field's declared container type.
        let wrapper = Normalizer {
            normalizer: Container::Owned(Box::new(tk::normalizers::unicode::NFD)),
        };
        Ok(wrapper)
    }
}
// Python-visible class for the NFKD Unicode normalizer. The struct has no
// fields; its only member is the static `new` factory below.
#[pyclass]
pub struct NFKD {}

#[pymethods]
impl NFKD {
    // Returns a `Normalizer` whose container owns a boxed
    // `tk::normalizers::unicode::NFKD`.
    #[staticmethod]
    fn new() -> PyResult<Normalizer> {
        // The box is built directly in field position so it coerces to the
        // field's declared container type.
        let wrapper = Normalizer {
            normalizer: Container::Owned(Box::new(tk::normalizers::unicode::NFKD)),
        };
        Ok(wrapper)
    }
}
// Python-visible class for the NFC Unicode normalizer. The struct has no
// fields; its only member is the static `new` factory below.
#[pyclass]
pub struct NFC {}

#[pymethods]
impl NFC {
    // Returns a `Normalizer` whose container owns a boxed
    // `tk::normalizers::unicode::NFC`.
    #[staticmethod]
    fn new() -> PyResult<Normalizer> {
        // The box is built directly in field position so it coerces to the
        // field's declared container type.
        let wrapper = Normalizer {
            normalizer: Container::Owned(Box::new(tk::normalizers::unicode::NFC)),
        };
        Ok(wrapper)
    }
}
// Python-visible class for the NFKC Unicode normalizer. The struct has no
// fields; its only member is the static `new` factory below.
#[pyclass]
pub struct NFKC {}

#[pymethods]
impl NFKC {
    // Returns a `Normalizer` whose container owns a boxed
    // `tk::normalizers::unicode::NFKC`.
    #[staticmethod]
    fn new() -> PyResult<Normalizer> {
        // The box is built directly in field position so it coerces to the
        // field's declared container type.
        let wrapper = Normalizer {
            normalizer: Container::Owned(Box::new(tk::normalizers::unicode::NFKC)),
        };
        Ok(wrapper)
    }
}

View File

@@ -20,8 +20,7 @@ class ByteLevel:
pass pass
class WordPiece: class WordPiece:
"""WordPiece """ WordPiece Decoder """
"""
@staticmethod @staticmethod
def new(prefix: str="##") -> Decoder: def new(prefix: str="##") -> Decoder:

View File

@@ -2,3 +2,7 @@ from .. import normalizers
# Re-export the classes of the imported `normalizers` module under this
# package's namespace, one alias per class.
Normalizer = normalizers.Normalizer
BertNormalizer = normalizers.BertNormalizer
NFD = normalizers.NFD
NFKD = normalizers.NFKD
NFC = normalizers.NFC
NFKC = normalizers.NFKC

View File

@@ -39,3 +39,35 @@ class BertNormalizer:
Normalizer Normalizer
""" """
pass pass
class NFD:
    """NFD Unicode Normalizer.

    Stub declaration: the class only exposes the static ``new`` factory.
    """

    @staticmethod
    def new() -> Normalizer:
        """Instantiate a new NFD Normalizer.

        Returns:
            Normalizer: the newly created NFD normalizer.
        """
class NFKD:
    """NFKD Unicode Normalizer.

    Stub declaration: the class only exposes the static ``new`` factory.
    """

    @staticmethod
    def new() -> Normalizer:
        """Instantiate a new NFKD Normalizer.

        Returns:
            Normalizer: the newly created NFKD normalizer.
        """
class NFC:
    """NFC Unicode Normalizer.

    Stub declaration: the class only exposes the static ``new`` factory.
    """

    @staticmethod
    def new() -> Normalizer:
        """Instantiate a new NFC Normalizer.

        Returns:
            Normalizer: the newly created NFC normalizer.
        """
class NFKC:
    """NFKC Unicode Normalizer.

    Stub declaration: the class only exposes the static ``new`` factory.
    """

    @staticmethod
    def new() -> Normalizer:
        """Instantiate a new NFKC Normalizer.

        Returns:
            Normalizer: the newly created NFKC normalizer.
        """

View File

@@ -1 +1,2 @@
pub mod bert; pub mod bert;
pub mod unicode;

View File

@@ -0,0 +1,33 @@
use crate::tokenizer::{NormalizedString, Normalizer, Result};
/// Normalizer for Unicode Normalization Form D (NFD, canonical decomposition
/// as defined by UAX #15).
///
/// The type is a stateless unit struct, so it is cheap to copy and can be
/// created with `NFD` or `NFD::default()`.
#[derive(Debug, Clone, Copy, Default)]
pub struct NFD;

impl Normalizer for NFD {
    /// Normalizes `normalized` by delegating to [`NormalizedString::nfd`].
    ///
    /// The return value of `nfd()` is discarded; this method always returns
    /// `Ok(())`.
    fn normalize(&self, normalized: &mut NormalizedString) -> Result<()> {
        normalized.nfd();
        Ok(())
    }
}
/// Normalizer for Unicode Normalization Form KD (NFKD, compatibility
/// decomposition as defined by UAX #15).
///
/// The type is a stateless unit struct, so it is cheap to copy and can be
/// created with `NFKD` or `NFKD::default()`.
#[derive(Debug, Clone, Copy, Default)]
pub struct NFKD;

impl Normalizer for NFKD {
    /// Normalizes `normalized` by delegating to [`NormalizedString::nfkd`].
    ///
    /// The return value of `nfkd()` is discarded; this method always returns
    /// `Ok(())`.
    fn normalize(&self, normalized: &mut NormalizedString) -> Result<()> {
        normalized.nfkd();
        Ok(())
    }
}
/// Normalizer for Unicode Normalization Form C (NFC, canonical decomposition
/// followed by canonical composition as defined by UAX #15).
///
/// The type is a stateless unit struct, so it is cheap to copy and can be
/// created with `NFC` or `NFC::default()`.
#[derive(Debug, Clone, Copy, Default)]
pub struct NFC;

impl Normalizer for NFC {
    /// Normalizes `normalized` by delegating to [`NormalizedString::nfc`].
    ///
    /// The return value of `nfc()` is discarded; this method always returns
    /// `Ok(())`.
    fn normalize(&self, normalized: &mut NormalizedString) -> Result<()> {
        normalized.nfc();
        Ok(())
    }
}
/// Normalizer for Unicode Normalization Form KC (NFKC, compatibility
/// decomposition followed by canonical composition as defined by UAX #15).
///
/// The type is a stateless unit struct, so it is cheap to copy and can be
/// created with `NFKC` or `NFKC::default()`.
#[derive(Debug, Clone, Copy, Default)]
pub struct NFKC;

impl Normalizer for NFKC {
    /// Normalizes `normalized` by delegating to [`NormalizedString::nfkc`].
    ///
    /// The return value of `nfkc()` is discarded; this method always returns
    /// `Ok(())`.
    fn normalize(&self, normalized: &mut NormalizedString) -> Result<()> {
        normalized.nfkc();
        Ok(())
    }
}