mirror of
https://github.com/mii443/tokenizers.git
synced 2025-12-03 11:18:29 +00:00
Creating normalizers.Prepend (To be used instead of Metaspace). (#1194)
* Creating `normalizers.Prepend` (To be used instead of `Metaspace`). * Linting + stub. * Fixing pickling/unpickling by setting a default. * Black.
This commit is contained in:
@@ -9,6 +9,7 @@ NFC = normalizers.NFC
|
||||
NFKC = normalizers.NFKC
|
||||
Sequence = normalizers.Sequence
|
||||
Lowercase = normalizers.Lowercase
|
||||
Prepend = normalizers.Prepend
|
||||
Strip = normalizers.Strip
|
||||
StripAccents = normalizers.StripAccents
|
||||
Nmt = normalizers.Nmt
|
||||
|
||||
@@ -379,6 +379,46 @@ class Precompiled(Normalizer):
|
||||
"""
|
||||
pass
|
||||
|
||||
class Prepend(Normalizer):
    """
    Prepend normalizer

    Adds the configured ``prepend`` string in front of the input, e.g. with
    ``prepend="▁"`` the string ``"hello"`` is normalized to ``"▁hello"``.

    Args:
        prepend (:obj:`str`):
            The string to put in front of the normalized input
    """

    def __init__(self, prepend):
        pass

    def normalize(self, normalized):
        """
        Apply this normalizer to a :class:`~tokenizers.NormalizedString`, in-place

        Working on a :class:`~tokenizers.NormalizedString` lets the alignment
        information be tracked while the content is modified. To simply look at
        what the normalization does to a raw string, use
        :meth:`~tokenizers.normalizers.Normalizer.normalize_str` instead.

        Args:
            normalized (:class:`~tokenizers.NormalizedString`):
                The normalized string that this
                :class:`~tokenizers.normalizers.Normalizer` gets applied to
        """
        pass

    def normalize_str(self, sequence):
        """
        Normalize a raw string and return the result

        This is a convenient way to visualize what a
        :class:`~tokenizers.normalizers.Normalizer` does, but no alignment
        information is tracked along the way. When offsets must be
        retrieved or converted, use
        :meth:`~tokenizers.normalizers.Normalizer.normalize` instead.

        Args:
            sequence (:obj:`str`):
                The string to normalize

        Returns:
            :obj:`str`: The string obtained after normalization
        """
        pass
|
||||
|
||||
class Replace(Normalizer):
|
||||
"""
|
||||
Replace normalizer
|
||||
|
||||
@@ -9,8 +9,8 @@ use crate::utils::{PyNormalizedString, PyNormalizedStringRefMut, PyPattern};
|
||||
use serde::ser::SerializeStruct;
|
||||
use serde::{Deserialize, Deserializer, Serialize, Serializer};
|
||||
use tk::normalizers::{
|
||||
BertNormalizer, Lowercase, Nmt, NormalizerWrapper, Precompiled, Replace, Strip, StripAccents,
|
||||
NFC, NFD, NFKC, NFKD,
|
||||
BertNormalizer, Lowercase, Nmt, NormalizerWrapper, Precompiled, Prepend, Replace, Strip,
|
||||
StripAccents, NFC, NFD, NFKC, NFKD,
|
||||
};
|
||||
use tk::{NormalizedString, Normalizer};
|
||||
use tokenizers as tk;
|
||||
@@ -69,6 +69,7 @@ impl PyNormalizer {
|
||||
NormalizerWrapper::StripNormalizer(_) => {
|
||||
Py::new(py, (PyBertNormalizer {}, base))?.into_py(py)
|
||||
}
|
||||
NormalizerWrapper::Prepend(_) => Py::new(py, (PyPrepend {}, base))?.into_py(py),
|
||||
NormalizerWrapper::StripAccents(_) => {
|
||||
Py::new(py, (PyStripAccents {}, base))?.into_py(py)
|
||||
}
|
||||
@@ -172,7 +173,8 @@ macro_rules! getter {
|
||||
let super_ = $self.as_ref();
|
||||
if let PyNormalizerTypeWrapper::Single(ref norm) = super_.normalizer {
|
||||
let wrapper = norm.read().unwrap();
|
||||
if let PyNormalizerWrapper::Wrapped(NormalizerWrapper::$variant(o)) = *wrapper {
|
||||
if let PyNormalizerWrapper::Wrapped(NormalizerWrapper::$variant(o)) = (*wrapper).clone()
|
||||
{
|
||||
o.$name
|
||||
} else {
|
||||
unreachable!()
|
||||
@@ -413,6 +415,29 @@ impl PyStrip {
|
||||
}
|
||||
}
|
||||
|
||||
/// Prepend normalizer
|
||||
#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name = "Prepend")]
|
||||
#[pyo3(text_signature = "(self, prepend)")]
|
||||
pub struct PyPrepend {}
|
||||
#[pymethods]
|
||||
impl PyPrepend {
|
||||
#[getter]
|
||||
fn get_prepend(self_: PyRef<Self>) -> String {
|
||||
getter!(self_, Prepend, prepend)
|
||||
}
|
||||
|
||||
#[setter]
|
||||
fn set_prepend(self_: PyRef<Self>, prepend: String) {
|
||||
setter!(self_, Prepend, prepend, prepend)
|
||||
}
|
||||
|
||||
#[new]
|
||||
#[pyo3(signature = (prepend="▁".to_string()))]
|
||||
fn new(prepend: String) -> (Self, PyNormalizer) {
|
||||
(PyPrepend {}, Prepend::new(prepend).into())
|
||||
}
|
||||
}
|
||||
|
||||
/// StripAccents normalizer
|
||||
#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name = "StripAccents")]
|
||||
#[pyo3(text_signature = "(self)")]
|
||||
@@ -624,6 +649,7 @@ pub fn normalizers(_py: Python, m: &PyModule) -> PyResult<()> {
|
||||
m.add_class::<PyLowercase>()?;
|
||||
m.add_class::<PyStrip>()?;
|
||||
m.add_class::<PyStripAccents>()?;
|
||||
m.add_class::<PyPrepend>()?;
|
||||
m.add_class::<PyNmt>()?;
|
||||
m.add_class::<PyPrecompiled>()?;
|
||||
m.add_class::<PyReplace>()?;
|
||||
|
||||
@@ -4,7 +4,7 @@ import pytest
|
||||
|
||||
from tokenizers import NormalizedString, Tokenizer
|
||||
from tokenizers.models import BPE
|
||||
from tokenizers.normalizers import BertNormalizer, Lowercase, Normalizer, Sequence, Strip
|
||||
from tokenizers.normalizers import BertNormalizer, Lowercase, Normalizer, Sequence, Strip, Prepend
|
||||
|
||||
|
||||
class TestBertNormalizer:
|
||||
@@ -119,6 +119,28 @@ class TestStrip:
|
||||
assert normalizer.right == False
|
||||
|
||||
|
||||
class TestPrepend:
    """Tests for the ``Prepend`` normalizer binding."""

    def test_instantiate(self):
        # A freshly built instance is both a generic Normalizer and a Prepend
        for expected_type in (Normalizer, Prepend):
            assert isinstance(Prepend("▁"), expected_type)

        # The type must survive a pickle round-trip
        restored = pickle.loads(pickle.dumps(Prepend("▁")))
        assert isinstance(restored, Prepend)

    def test_prepend(self):
        # The configured string ends up in front of the input
        assert Prepend(prepend="▁").normalize_str("hello") == "▁hello"

    def test_can_modify(self):
        normalizer = Prepend("▁")
        initial = normalizer.prepend
        assert initial == "▁"

        # The attribute is writable and reads back the new value
        normalizer.prepend = "-"
        updated = normalizer.prepend
        assert updated == "-"
|
||||
|
||||
|
||||
class TestCustomNormalizer:
|
||||
class BadCustomNormalizer:
|
||||
def normalize(self, normalized, wrong):
|
||||
|
||||
Reference in New Issue
Block a user