mirror of
https://github.com/mii443/tokenizers.git
synced 2025-12-06 12:48:18 +00:00
Add basic unicode normalizers
This commit is contained in:
@@ -64,6 +64,10 @@ fn processors(_py: Python, m: &PyModule) -> PyResult<()> {
|
||||
fn normalizers(_py: Python, m: &PyModule) -> PyResult<()> {
|
||||
m.add_class::<normalizers::Normalizer>()?;
|
||||
m.add_class::<normalizers::BertNormalizer>()?;
|
||||
m.add_class::<normalizers::NFD>()?;
|
||||
m.add_class::<normalizers::NFKD>()?;
|
||||
m.add_class::<normalizers::NFC>()?;
|
||||
m.add_class::<normalizers::NFKC>()?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
||||
@@ -44,3 +44,51 @@ impl BertNormalizer {
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
#[pyclass]
|
||||
pub struct NFD {}
|
||||
#[pymethods]
|
||||
impl NFD {
|
||||
#[staticmethod]
|
||||
fn new() -> PyResult<Normalizer> {
|
||||
Ok(Normalizer {
|
||||
normalizer: Container::Owned(Box::new(tk::normalizers::unicode::NFD)),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
#[pyclass]
|
||||
pub struct NFKD {}
|
||||
#[pymethods]
|
||||
impl NFKD {
|
||||
#[staticmethod]
|
||||
fn new() -> PyResult<Normalizer> {
|
||||
Ok(Normalizer {
|
||||
normalizer: Container::Owned(Box::new(tk::normalizers::unicode::NFKD)),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
#[pyclass]
|
||||
pub struct NFC {}
|
||||
#[pymethods]
|
||||
impl NFC {
|
||||
#[staticmethod]
|
||||
fn new() -> PyResult<Normalizer> {
|
||||
Ok(Normalizer {
|
||||
normalizer: Container::Owned(Box::new(tk::normalizers::unicode::NFC)),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
#[pyclass]
|
||||
pub struct NFKC {}
|
||||
#[pymethods]
|
||||
impl NFKC {
|
||||
#[staticmethod]
|
||||
fn new() -> PyResult<Normalizer> {
|
||||
Ok(Normalizer {
|
||||
normalizer: Container::Owned(Box::new(tk::normalizers::unicode::NFKC)),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
@@ -20,8 +20,7 @@ class ByteLevel:
|
||||
pass
|
||||
|
||||
class WordPiece:
|
||||
"""WordPiece
|
||||
"""
|
||||
""" WordPiece Decoder """
|
||||
|
||||
@staticmethod
|
||||
def new(prefix: str="##") -> Decoder:
|
||||
|
||||
@@ -2,3 +2,7 @@ from .. import normalizers
|
||||
|
||||
# Re-export the classes of the parent package's `normalizers` module under
# this module's own namespace.
Normalizer, BertNormalizer = normalizers.Normalizer, normalizers.BertNormalizer
NFD, NFKD = normalizers.NFD, normalizers.NFKD
NFC, NFKC = normalizers.NFC, normalizers.NFKC
|
||||
|
||||
@@ -39,3 +39,35 @@ class BertNormalizer:
|
||||
Normalizer
|
||||
"""
|
||||
pass
|
||||
|
||||
class NFD:
    """Unicode normalizer for the NFD form."""

    @staticmethod
    def new() -> Normalizer:
        """Create a new NFD Normalizer."""
        ...
|
||||
|
||||
class NFKD:
    """Unicode normalizer for the NFKD form."""

    @staticmethod
    def new() -> Normalizer:
        """Create a new NFKD Normalizer."""
        ...
|
||||
|
||||
class NFC:
    """Unicode normalizer for the NFC form."""

    @staticmethod
    def new() -> Normalizer:
        """Create a new NFC Normalizer."""
        ...
|
||||
|
||||
class NFKC:
    """Unicode normalizer for the NFKC form."""

    @staticmethod
    def new() -> Normalizer:
        """Create a new NFKC Normalizer."""
        ...
|
||||
|
||||
@@ -1 +1,2 @@
|
||||
pub mod bert;
|
||||
pub mod unicode;
|
||||
|
||||
33
tokenizers/src/normalizers/unicode.rs
Normal file
33
tokenizers/src/normalizers/unicode.rs
Normal file
@@ -0,0 +1,33 @@
|
||||
use crate::tokenizer::{NormalizedString, Normalizer, Result};
|
||||
|
||||
pub struct NFD;
|
||||
impl Normalizer for NFD {
|
||||
fn normalize(&self, normalized: &mut NormalizedString) -> Result<()> {
|
||||
normalized.nfd();
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
pub struct NFKD;
|
||||
impl Normalizer for NFKD {
|
||||
fn normalize(&self, normalized: &mut NormalizedString) -> Result<()> {
|
||||
normalized.nfkd();
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
pub struct NFC;
|
||||
impl Normalizer for NFC {
|
||||
fn normalize(&self, normalized: &mut NormalizedString) -> Result<()> {
|
||||
normalized.nfc();
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
pub struct NFKC;
|
||||
impl Normalizer for NFKC {
|
||||
fn normalize(&self, normalized: &mut NormalizedString) -> Result<()> {
|
||||
normalized.nfkc();
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user