Creating normalizers.Prepend (To be used instead of Metaspace). (#1194)

* Creating `normalizers.Prepend` (To be used instead of `Metaspace`).

* Linting + stub.

* Fixing pickling/unpickling by setting a default.

* Black.
This commit is contained in:
Nicolas Patry
2023-03-24 00:33:31 +01:00
committed by GitHub
parent 250d46c676
commit d2c8190a0f
10 changed files with 191 additions and 5 deletions

View File

@ -78,6 +78,12 @@ export function lowercaseNormalizer(): Normalizer;
*/ */
export function stripNormalizer(left?: boolean, right?: boolean): Normalizer; export function stripNormalizer(left?: boolean, right?: boolean): Normalizer;
/**
* Returns a new Prepend Normalizer
* @param [prepend] The string to prepend
*/
export function prependNormalizer(prepend: string): Normalizer;
/** /**
* Returns a new StripAccents Normalizer * Returns a new StripAccents Normalizer
*/ */

View File

@ -9,6 +9,7 @@ module.exports = {
sequenceNormalizer: native.normalizers_Sequence, sequenceNormalizer: native.normalizers_Sequence,
lowercaseNormalizer: native.normalizers_Lowercase, lowercaseNormalizer: native.normalizers_Lowercase,
stripNormalizer: native.normalizers_Strip, stripNormalizer: native.normalizers_Strip,
prependNormalizer: native.normalizers_Prepend,
stripAccentsNormalizer: native.normalizers_StripAccents, stripAccentsNormalizer: native.normalizers_StripAccents,
nmtNormalizer: native.normalizers_Nmt, nmtNormalizer: native.normalizers_Nmt,
precompiledNormalizer: native.normalizers_Precompiled, precompiledNormalizer: native.normalizers_Precompiled,

View File

@ -1,4 +1,8 @@
import { stripAccentsNormalizer, stripNormalizer } from "./normalizers"; import {
prependNormalizer,
stripAccentsNormalizer,
stripNormalizer,
} from "./normalizers";
describe("stripNormalizer", () => { describe("stripNormalizer", () => {
it("instantiates with no parameters", () => { it("instantiates with no parameters", () => {
@ -24,6 +28,12 @@ describe("stripNormalizer", () => {
expect(normalizer.constructor.name).toEqual("Normalizer"); expect(normalizer.constructor.name).toEqual("Normalizer");
}); });
it("prepend instantiates with one parameter", () => {
const normalizer = prependNormalizer("_");
expect(normalizer.constructor.name).toEqual("Normalizer");
expect(normalizer.normalizeString("Hello")).toEqual("_Hello");
});
it("can normalize strings", () => { it("can normalize strings", () => {
const normalizer = stripNormalizer(); const normalizer = stripNormalizer();
expect(normalizer.normalizeString(" Hello there ")).toEqual("Hello there"); expect(normalizer.normalizeString(" Hello there ")).toEqual("Hello there");

View File

@ -175,6 +175,18 @@ fn strip(mut cx: FunctionContext) -> JsResult<JsNormalizer> {
Ok(normalizer) Ok(normalizer)
} }
/// prepend(prepend: string)
///
/// JS binding: builds a `Normalizer` that prepends the given string to the
/// input, wrapping the core `tk::normalizers::prepend::Prepend`.
fn prepend(mut cx: FunctionContext) -> JsResult<JsNormalizer> {
    // First JS argument: the string to prepend (the JS test uses "_").
    let prepend: String = cx.extract::<String>(0)?;
    // Create the empty wrapper first, then fill in the concrete normalizer
    // while holding the VM lock (Neon requires the guard for borrow_mut).
    let mut normalizer = JsNormalizer::new::<_, JsNormalizer, _>(&mut cx, vec![])?;
    let guard = cx.lock();
    normalizer.borrow_mut(&guard).normalizer =
        Some(tk::normalizers::prepend::Prepend::new(prepend).into());
    Ok(normalizer)
}
/// strip_accents() /// strip_accents()
fn strip_accents(mut cx: FunctionContext) -> JsResult<JsNormalizer> { fn strip_accents(mut cx: FunctionContext) -> JsResult<JsNormalizer> {
let mut normalizer = JsNormalizer::new::<_, JsNormalizer, _>(&mut cx, vec![])?; let mut normalizer = JsNormalizer::new::<_, JsNormalizer, _>(&mut cx, vec![])?;
@ -267,6 +279,7 @@ pub fn register(m: &mut ModuleContext, prefix: &str) -> NeonResult<()> {
m.export_function(&format!("{}_Sequence", prefix), sequence)?; m.export_function(&format!("{}_Sequence", prefix), sequence)?;
m.export_function(&format!("{}_Lowercase", prefix), lowercase)?; m.export_function(&format!("{}_Lowercase", prefix), lowercase)?;
m.export_function(&format!("{}_Strip", prefix), strip)?; m.export_function(&format!("{}_Strip", prefix), strip)?;
m.export_function(&format!("{}_Prepend", prefix), prepend)?;
m.export_function(&format!("{}_StripAccents", prefix), strip_accents)?; m.export_function(&format!("{}_StripAccents", prefix), strip_accents)?;
m.export_function(&format!("{}_Nmt", prefix), nmt)?; m.export_function(&format!("{}_Nmt", prefix), nmt)?;
m.export_function(&format!("{}_Precompiled", prefix), precompiled)?; m.export_function(&format!("{}_Precompiled", prefix), precompiled)?;

View File

@ -9,6 +9,7 @@ NFC = normalizers.NFC
NFKC = normalizers.NFKC NFKC = normalizers.NFKC
Sequence = normalizers.Sequence Sequence = normalizers.Sequence
Lowercase = normalizers.Lowercase Lowercase = normalizers.Lowercase
Prepend = normalizers.Prepend
Strip = normalizers.Strip Strip = normalizers.Strip
StripAccents = normalizers.StripAccents StripAccents = normalizers.StripAccents
Nmt = normalizers.Nmt Nmt = normalizers.Nmt

View File

@ -379,6 +379,46 @@ class Precompiled(Normalizer):
""" """
pass pass
class Prepend(Normalizer):
    """
    Prepend normalizer

    Prepends a fixed string to the input during normalization.

    Args:
        prepend (:obj:`str`):
            The string to prepend to every normalized sequence.
    """

    def __init__(self, prepend):
        pass

    def normalize(self, normalized):
        """
        Normalize a :class:`~tokenizers.NormalizedString` in-place

        This method allows to modify a :class:`~tokenizers.NormalizedString` to
        keep track of the alignment information. If you just want to see the result
        of the normalization on a raw string, you can use
        :meth:`~tokenizers.normalizers.Normalizer.normalize_str`

        Args:
            normalized (:class:`~tokenizers.NormalizedString`):
                The normalized string on which to apply this
                :class:`~tokenizers.normalizers.Normalizer`
        """
        pass

    def normalize_str(self, sequence):
        """
        Normalize the given string

        This method provides a way to visualize the effect of a
        :class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment
        information. If you need to get/convert offsets, you can use
        :meth:`~tokenizers.normalizers.Normalizer.normalize`

        Args:
            sequence (:obj:`str`):
                A string to normalize

        Returns:
            :obj:`str`: A string after normalization
        """
        pass
class Replace(Normalizer): class Replace(Normalizer):
""" """
Replace normalizer Replace normalizer

View File

@ -9,8 +9,8 @@ use crate::utils::{PyNormalizedString, PyNormalizedStringRefMut, PyPattern};
use serde::ser::SerializeStruct; use serde::ser::SerializeStruct;
use serde::{Deserialize, Deserializer, Serialize, Serializer}; use serde::{Deserialize, Deserializer, Serialize, Serializer};
use tk::normalizers::{ use tk::normalizers::{
BertNormalizer, Lowercase, Nmt, NormalizerWrapper, Precompiled, Replace, Strip, StripAccents, BertNormalizer, Lowercase, Nmt, NormalizerWrapper, Precompiled, Prepend, Replace, Strip,
NFC, NFD, NFKC, NFKD, StripAccents, NFC, NFD, NFKC, NFKD,
}; };
use tk::{NormalizedString, Normalizer}; use tk::{NormalizedString, Normalizer};
use tokenizers as tk; use tokenizers as tk;
@ -69,6 +69,7 @@ impl PyNormalizer {
NormalizerWrapper::StripNormalizer(_) => { NormalizerWrapper::StripNormalizer(_) => {
Py::new(py, (PyBertNormalizer {}, base))?.into_py(py) Py::new(py, (PyBertNormalizer {}, base))?.into_py(py)
} }
NormalizerWrapper::Prepend(_) => Py::new(py, (PyPrepend {}, base))?.into_py(py),
NormalizerWrapper::StripAccents(_) => { NormalizerWrapper::StripAccents(_) => {
Py::new(py, (PyStripAccents {}, base))?.into_py(py) Py::new(py, (PyStripAccents {}, base))?.into_py(py)
} }
@ -172,7 +173,8 @@ macro_rules! getter {
let super_ = $self.as_ref(); let super_ = $self.as_ref();
if let PyNormalizerTypeWrapper::Single(ref norm) = super_.normalizer { if let PyNormalizerTypeWrapper::Single(ref norm) = super_.normalizer {
let wrapper = norm.read().unwrap(); let wrapper = norm.read().unwrap();
if let PyNormalizerWrapper::Wrapped(NormalizerWrapper::$variant(o)) = *wrapper { if let PyNormalizerWrapper::Wrapped(NormalizerWrapper::$variant(o)) = (*wrapper).clone()
{
o.$name o.$name
} else { } else {
unreachable!() unreachable!()
@ -413,6 +415,29 @@ impl PyStrip {
} }
} }
/// Prepend normalizer
///
/// Python-facing wrapper (`tokenizers.normalizers.Prepend`) around the core
/// `Prepend` normalizer; the wrapped state lives in the base `PyNormalizer`,
/// which the `getter!`/`setter!` macros reach into.
#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name = "Prepend")]
#[pyo3(text_signature = "(self, prepend)")]
pub struct PyPrepend {}

#[pymethods]
impl PyPrepend {
    /// `prepend` attribute: read the wrapped normalizer's prepend string.
    #[getter]
    fn get_prepend(self_: PyRef<Self>) -> String {
        getter!(self_, Prepend, prepend)
    }

    /// `prepend` attribute: replace the wrapped normalizer's prepend string.
    #[setter]
    fn set_prepend(self_: PyRef<Self>, prepend: String) {
        setter!(self_, Prepend, prepend, prepend)
    }

    /// Constructor. The default ("") is what keeps pickling/unpickling
    /// working (per the commit message), since reconstruction may call
    /// `__new__` without arguments.
    #[new]
    #[pyo3(signature = (prepend="".to_string()))]
    fn new(prepend: String) -> (Self, PyNormalizer) {
        (PyPrepend {}, Prepend::new(prepend).into())
    }
}
/// StripAccents normalizer /// StripAccents normalizer
#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name = "StripAccents")] #[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name = "StripAccents")]
#[pyo3(text_signature = "(self)")] #[pyo3(text_signature = "(self)")]
@ -624,6 +649,7 @@ pub fn normalizers(_py: Python, m: &PyModule) -> PyResult<()> {
m.add_class::<PyLowercase>()?; m.add_class::<PyLowercase>()?;
m.add_class::<PyStrip>()?; m.add_class::<PyStrip>()?;
m.add_class::<PyStripAccents>()?; m.add_class::<PyStripAccents>()?;
m.add_class::<PyPrepend>()?;
m.add_class::<PyNmt>()?; m.add_class::<PyNmt>()?;
m.add_class::<PyPrecompiled>()?; m.add_class::<PyPrecompiled>()?;
m.add_class::<PyReplace>()?; m.add_class::<PyReplace>()?;

View File

@ -4,7 +4,7 @@ import pytest
from tokenizers import NormalizedString, Tokenizer from tokenizers import NormalizedString, Tokenizer
from tokenizers.models import BPE from tokenizers.models import BPE
from tokenizers.normalizers import BertNormalizer, Lowercase, Normalizer, Sequence, Strip from tokenizers.normalizers import BertNormalizer, Lowercase, Normalizer, Sequence, Strip, Prepend
class TestBertNormalizer: class TestBertNormalizer:
@ -119,6 +119,28 @@ class TestStrip:
assert normalizer.right == False assert normalizer.right == False
class TestPrepend:
    """Tests for the Prepend normalizer bindings."""

    def test_instantiate(self):
        # Prepend must behave like any other Normalizer and survive pickling.
        # NOTE: the prepend payload is "▁" (U+2581); with an empty string the
        # expected output "▁hello" below would be unreachable.
        assert isinstance(Prepend("▁"), Normalizer)
        assert isinstance(Prepend("▁"), Prepend)
        assert isinstance(pickle.loads(pickle.dumps(Prepend("▁"))), Prepend)

    def test_prepend(self):
        # The configured string must be prepended to the normalized output.
        normalizer = Prepend(prepend="▁")
        output = normalizer.normalize_str("hello")
        assert output == "▁hello"

    def test_can_modify(self):
        normalizer = Prepend("▁")
        assert normalizer.prepend == "▁"

        # Modify these
        normalizer.prepend = "-"
        assert normalizer.prepend == "-"
class TestCustomNormalizer: class TestCustomNormalizer:
class BadCustomNormalizer: class BadCustomNormalizer:
def normalize(self, normalized, wrong): def normalize(self, normalized, wrong):

View File

@ -1,5 +1,6 @@
pub mod bert; pub mod bert;
pub mod precompiled; pub mod precompiled;
pub mod prepend;
pub mod replace; pub mod replace;
pub mod strip; pub mod strip;
pub mod unicode; pub mod unicode;
@ -7,6 +8,7 @@ pub mod utils;
pub use crate::normalizers::bert::BertNormalizer; pub use crate::normalizers::bert::BertNormalizer;
pub use crate::normalizers::precompiled::Precompiled; pub use crate::normalizers::precompiled::Precompiled;
pub use crate::normalizers::prepend::Prepend;
pub use crate::normalizers::replace::Replace; pub use crate::normalizers::replace::Replace;
pub use crate::normalizers::strip::{Strip, StripAccents}; pub use crate::normalizers::strip::{Strip, StripAccents};
pub use crate::normalizers::unicode::{Nmt, NFC, NFD, NFKC, NFKD}; pub use crate::normalizers::unicode::{Nmt, NFC, NFD, NFKC, NFKD};
@ -32,6 +34,7 @@ pub enum NormalizerWrapper {
Nmt(Nmt), Nmt(Nmt),
Precompiled(Precompiled), Precompiled(Precompiled),
Replace(Replace), Replace(Replace),
Prepend(Prepend),
} }
impl Normalizer for NormalizerWrapper { impl Normalizer for NormalizerWrapper {
@ -49,6 +52,7 @@ impl Normalizer for NormalizerWrapper {
Self::Nmt(lc) => lc.normalize(normalized), Self::Nmt(lc) => lc.normalize(normalized),
Self::Precompiled(lc) => lc.normalize(normalized), Self::Precompiled(lc) => lc.normalize(normalized),
Self::Replace(lc) => lc.normalize(normalized), Self::Replace(lc) => lc.normalize(normalized),
Self::Prepend(lc) => lc.normalize(normalized),
} }
} }
} }
@ -65,3 +69,4 @@ impl_enum_from!(Lowercase, NormalizerWrapper, Lowercase);
impl_enum_from!(Nmt, NormalizerWrapper, Nmt); impl_enum_from!(Nmt, NormalizerWrapper, Nmt);
impl_enum_from!(Precompiled, NormalizerWrapper, Precompiled); impl_enum_from!(Precompiled, NormalizerWrapper, Precompiled);
impl_enum_from!(Replace, NormalizerWrapper, Replace); impl_enum_from!(Replace, NormalizerWrapper, Replace);
impl_enum_from!(Prepend, NormalizerWrapper, Prepend);

View File

@ -0,0 +1,62 @@
use crate::tokenizer::{NormalizedString, Normalizer, Result};
use serde::{Deserialize, Serialize};
/// Normalizer that inserts a fixed string at the front of the input
/// (e.g. the SentencePiece-style "▁" prefix, instead of using `Metaspace`).
#[derive(Clone, Debug, Deserialize, Serialize)]
#[serde(tag = "type")] // serialized with a "type" discriminant field, like the other normalizers
pub struct Prepend {
    pub prepend: String,
}

impl Prepend {
    /// Build a `Prepend` normalizer that will insert `prepend` before the input.
    pub fn new(prepend: String) -> Self {
        Self { prepend }
    }
}
impl Normalizer for Prepend {
    /// Prepend the configured string to the normalized string in place.
    /// (The original comment "Strip the normalized string inplace" was a
    /// copy-paste from the Strip normalizer.)
    /// Empty inputs are left untouched, so `""` does not normalize to the
    /// bare prepend string.
    fn normalize(&self, normalized: &mut NormalizedString) -> Result<()> {
        if !normalized.is_empty() {
            normalized.prepend(&self.prepend);
        }
        Ok(())
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_prepend() {
        let original = "Hello";
        let normalized = "▁Hello";
        assert_ne!(original, normalized);
        let mut n = NormalizedString::from(original);
        // The prepend payload is "▁" (U+2581, 3 bytes in UTF-8). An empty
        // string here could never produce "▁Hello", and the 8-entry byte
        // alignment vector below (3 bytes for "▁" + "H" sharing (0, 1),
        // then one entry per remaining char) requires the 3-byte prefix.
        let prepend = Prepend::new("▁".to_string());
        prepend.normalize(&mut n).unwrap();
        assert_eq!(&n.get(), &normalized);
        assert_eq!(
            n,
            NormalizedString::new(
                original.to_string(),
                normalized.to_string(),
                vec![
                    (0, 1),
                    (0, 1),
                    (0, 1),
                    (0, 1),
                    (1, 2),
                    (2, 3),
                    (3, 4),
                    (4, 5)
                ],
                0
            )
        );
        // Each original char maps back into the normalized string, with "H"
        // covering the injected prefix bytes as well.
        assert_eq!(
            n.alignments_original(),
            vec![(0, 4), (4, 5), (5, 6), (6, 7), (7, 8)]
        );
    }
}