mirror of
https://github.com/mii443/tokenizers.git
synced 2025-12-06 12:48:18 +00:00
Add basic unicode normalizers
This commit is contained in:
@@ -64,6 +64,10 @@ fn processors(_py: Python, m: &PyModule) -> PyResult<()> {
|
|||||||
fn normalizers(_py: Python, m: &PyModule) -> PyResult<()> {
|
fn normalizers(_py: Python, m: &PyModule) -> PyResult<()> {
|
||||||
m.add_class::<normalizers::Normalizer>()?;
|
m.add_class::<normalizers::Normalizer>()?;
|
||||||
m.add_class::<normalizers::BertNormalizer>()?;
|
m.add_class::<normalizers::BertNormalizer>()?;
|
||||||
|
m.add_class::<normalizers::NFD>()?;
|
||||||
|
m.add_class::<normalizers::NFKD>()?;
|
||||||
|
m.add_class::<normalizers::NFC>()?;
|
||||||
|
m.add_class::<normalizers::NFKC>()?;
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -44,3 +44,51 @@ impl BertNormalizer {
|
|||||||
})
|
})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[pyclass]
|
||||||
|
pub struct NFD {}
|
||||||
|
#[pymethods]
|
||||||
|
impl NFD {
|
||||||
|
#[staticmethod]
|
||||||
|
fn new() -> PyResult<Normalizer> {
|
||||||
|
Ok(Normalizer {
|
||||||
|
normalizer: Container::Owned(Box::new(tk::normalizers::unicode::NFD)),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[pyclass]
|
||||||
|
pub struct NFKD {}
|
||||||
|
#[pymethods]
|
||||||
|
impl NFKD {
|
||||||
|
#[staticmethod]
|
||||||
|
fn new() -> PyResult<Normalizer> {
|
||||||
|
Ok(Normalizer {
|
||||||
|
normalizer: Container::Owned(Box::new(tk::normalizers::unicode::NFKD)),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[pyclass]
|
||||||
|
pub struct NFC {}
|
||||||
|
#[pymethods]
|
||||||
|
impl NFC {
|
||||||
|
#[staticmethod]
|
||||||
|
fn new() -> PyResult<Normalizer> {
|
||||||
|
Ok(Normalizer {
|
||||||
|
normalizer: Container::Owned(Box::new(tk::normalizers::unicode::NFC)),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[pyclass]
|
||||||
|
pub struct NFKC {}
|
||||||
|
#[pymethods]
|
||||||
|
impl NFKC {
|
||||||
|
#[staticmethod]
|
||||||
|
fn new() -> PyResult<Normalizer> {
|
||||||
|
Ok(Normalizer {
|
||||||
|
normalizer: Container::Owned(Box::new(tk::normalizers::unicode::NFKC)),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
@@ -20,8 +20,7 @@ class ByteLevel:
|
|||||||
pass
|
pass
|
||||||
|
|
||||||
class WordPiece:
|
class WordPiece:
|
||||||
"""WordPiece
|
""" WordPiece Decoder """
|
||||||
"""
|
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def new(prefix: str="##") -> Decoder:
|
def new(prefix: str="##") -> Decoder:
|
||||||
|
|||||||
@@ -2,3 +2,7 @@ from .. import normalizers
|
|||||||
|
|
||||||
Normalizer = normalizers.Normalizer
|
Normalizer = normalizers.Normalizer
|
||||||
BertNormalizer = normalizers.BertNormalizer
|
BertNormalizer = normalizers.BertNormalizer
|
||||||
|
NFD = normalizers.NFD
|
||||||
|
NFKD = normalizers.NFKD
|
||||||
|
NFC = normalizers.NFC
|
||||||
|
NFKC = normalizers.NFKC
|
||||||
|
|||||||
@@ -39,3 +39,35 @@ class BertNormalizer:
|
|||||||
Normalizer
|
Normalizer
|
||||||
"""
|
"""
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
class NFD:
    """NFD Unicode Normalizer

    Not meant to be instantiated directly; use `NFD.new()` to obtain a
    `Normalizer`.
    """

    @staticmethod
    def new() -> Normalizer:
        """Instantiate a new NFD Normalizer

        Returns:
            Normalizer
        """
        pass
|
||||||
|
|
||||||
|
class NFKD:
    """NFKD Unicode Normalizer

    Not meant to be instantiated directly; use `NFKD.new()` to obtain a
    `Normalizer`.
    """

    @staticmethod
    def new() -> Normalizer:
        """Instantiate a new NFKD Normalizer

        Returns:
            Normalizer
        """
        pass
|
||||||
|
|
||||||
|
class NFC:
    """NFC Unicode Normalizer

    Not meant to be instantiated directly; use `NFC.new()` to obtain a
    `Normalizer`.
    """

    @staticmethod
    def new() -> Normalizer:
        """Instantiate a new NFC Normalizer

        Returns:
            Normalizer
        """
        pass
|
||||||
|
|
||||||
|
class NFKC:
    """NFKC Unicode Normalizer

    Not meant to be instantiated directly; use `NFKC.new()` to obtain a
    `Normalizer`.
    """

    @staticmethod
    def new() -> Normalizer:
        """Instantiate a new NFKC Normalizer

        Returns:
            Normalizer
        """
        pass
|
||||||
|
|||||||
@@ -1 +1,2 @@
|
|||||||
pub mod bert;
|
pub mod bert;
|
||||||
|
pub mod unicode;
|
||||||
|
|||||||
33
tokenizers/src/normalizers/unicode.rs
Normal file
33
tokenizers/src/normalizers/unicode.rs
Normal file
@@ -0,0 +1,33 @@
|
|||||||
|
use crate::tokenizer::{NormalizedString, Normalizer, Result};
|
||||||
|
|
||||||
|
pub struct NFD;
|
||||||
|
impl Normalizer for NFD {
|
||||||
|
fn normalize(&self, normalized: &mut NormalizedString) -> Result<()> {
|
||||||
|
normalized.nfd();
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub struct NFKD;
|
||||||
|
impl Normalizer for NFKD {
|
||||||
|
fn normalize(&self, normalized: &mut NormalizedString) -> Result<()> {
|
||||||
|
normalized.nfkd();
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub struct NFC;
|
||||||
|
impl Normalizer for NFC {
|
||||||
|
fn normalize(&self, normalized: &mut NormalizedString) -> Result<()> {
|
||||||
|
normalized.nfc();
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub struct NFKC;
|
||||||
|
impl Normalizer for NFKC {
|
||||||
|
fn normalize(&self, normalized: &mut NormalizedString) -> Result<()> {
|
||||||
|
normalized.nfkc();
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user