Mirror of https://github.com/mii443/tokenizers.git
Synced 2025-12-05 20:28:22 +00:00
Remove Tokenizer::normalize
This is a legacy function that no longer makes much sense and has become difficult to maintain, so we remove it.
This commit is contained in:
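For callers, the replacement is Normalizer.normalize_str, which the updated tests below exercise. A minimal before/after sketch, assuming the tokenizers Python package at this revision:

from tokenizers.normalizers import Lowercase

# Before this commit, normalization went through the Tokenizer itself:
#   tokenizer = Tokenizer(BPE())
#   tokenizer.normalizer = Lowercase()
#   output = tokenizer.normalize("HELLO")   # method removed by this commit

# After this commit, call the normalizer directly:
normalizer = Lowercase()
output = normalizer.normalize_str("HELLO")
assert output == "hello"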
@@ -459,17 +459,6 @@ class Tokenizer:
             if the padding is enabled.
         """
         pass
-    def normalize(self, sequence: str) -> str:
-        """ Normalize the given sequence
-
-        Args:
-            sequence: str:
-                The sequence to normalize
-
-        Returns:
-            The normalized string
-        """
-        pass
     def encode(
         self,
         sequence: InputSequence,
@@ -4,6 +4,7 @@ use pyo3::exceptions;
 use pyo3::prelude::*;
 use pyo3::types::*;

+use crate::error::ToPyResult;
 use serde::ser::SerializeStruct;
 use serde::{Deserialize, Serialize, Serializer};
 use tk::normalizers::{BertNormalizer, Lowercase, NormalizerWrapper, Strip, NFC, NFD, NFKC, NFKD};
@@ -81,6 +82,12 @@ impl PyNormalizer {
             Err(e) => Err(e),
         }
     }
+
+    fn normalize_str(&self, sequence: &str) -> PyResult<String> {
+        let mut normalized = NormalizedString::from(sequence);
+        ToPyResult(self.normalizer.normalize(&mut normalized)).into_py()?;
+        Ok(normalized.get().to_owned())
+    }
 }

 #[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name=BertNormalizer)]
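Since normalize_str is defined on PyNormalizer itself, every normalizer exposed by the Python bindings gains it. A short usage sketch mirroring the updated tests further down:

from tokenizers.normalizers import BertNormalizer, Lowercase, Sequence, Strip

# normalize_str takes a plain string and returns the normalized string,
# without needing a Tokenizer instance.
normalizer = BertNormalizer(
    strip_accents=True, lowercase=False, handle_chinese_chars=False, clean_text=False
)
assert normalizer.normalize_str("Héllò") == "Hello"

# Composed normalizers work the same way; only the entry point changed.
assert Sequence([Lowercase(), Strip()]).normalize_str(" HELLO ") == "hello"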
@@ -509,15 +509,6 @@ impl PyTokenizer {
         })
     }

-    fn normalize(&self, sentence: &str) -> PyResult<String> {
-        ToPyResult(
-            self.tokenizer
-                .normalize(sentence)
-                .map(|s| s.get().to_owned()),
-        )
-        .into()
-    }
-
     /// Input can be:
     /// encode("A single sequence")
     /// encode("A sequence", "And its pair")
@@ -12,39 +12,35 @@ class TestBertNormalizer:
         assert isinstance(pickle.loads(pickle.dumps(BertNormalizer())), BertNormalizer)

     def test_strip_accents(self):
-        tokenizer = Tokenizer(BPE())
-        tokenizer.normalizer = BertNormalizer(
+        normalizer = BertNormalizer(
             strip_accents=True, lowercase=False, handle_chinese_chars=False, clean_text=False
         )

-        output = tokenizer.normalize("Héllò")
+        output = normalizer.normalize_str("Héllò")
         assert output == "Hello"

     def test_handle_chinese_chars(self):
-        tokenizer = Tokenizer(BPE())
-        tokenizer.normalizer = BertNormalizer(
+        normalizer = BertNormalizer(
             strip_accents=False, lowercase=False, handle_chinese_chars=True, clean_text=False
         )

-        output = tokenizer.normalize("你好")
+        output = normalizer.normalize_str("你好")
         assert output == " 你 好 "

     def test_clean_text(self):
-        tokenizer = Tokenizer(BPE())
-        tokenizer.normalizer = BertNormalizer(
+        normalizer = BertNormalizer(
             strip_accents=False, lowercase=False, handle_chinese_chars=False, clean_text=True
         )

-        output = tokenizer.normalize("\ufeffHello")
+        output = normalizer.normalize_str("\ufeffHello")
         assert output == "Hello"

     def test_lowercase(self):
-        tokenizer = Tokenizer(BPE())
-        tokenizer.normalizer = BertNormalizer(
+        normalizer = BertNormalizer(
             strip_accents=False, lowercase=True, handle_chinese_chars=False, clean_text=False
         )

-        output = tokenizer.normalize("Héllò")
+        output = normalizer.normalize_str("Héllò")
         assert output == "héllò"


@@ -55,10 +51,9 @@ class TestSequence:
         assert isinstance(pickle.loads(pickle.dumps(Sequence([]))), Sequence)

     def test_can_make_sequences(self):
-        tokenizer = Tokenizer(BPE())
-        tokenizer.normalizer = Sequence([Lowercase(), Strip()])
+        normalizer = Sequence([Lowercase(), Strip()])

-        output = tokenizer.normalize(" HELLO ")
+        output = normalizer.normalize_str(" HELLO ")
         assert output == "hello"


@@ -69,10 +64,9 @@ class TestLowercase:
         assert isinstance(pickle.loads(pickle.dumps(Lowercase())), Lowercase)

     def test_lowercase(self):
-        tokenizer = Tokenizer(BPE())
-        tokenizer.normalizer = Lowercase()
+        normalizer = Lowercase()

-        output = tokenizer.normalize("HELLO")
+        output = normalizer.normalize_str("HELLO")
         assert output == "hello"


@@ -83,22 +77,19 @@ class TestStrip:
         assert isinstance(pickle.loads(pickle.dumps(Strip())), Strip)

     def test_left_strip(self):
-        tokenizer = Tokenizer(BPE())
-        tokenizer.normalizer = Strip(left=True, right=False)
+        normalizer = Strip(left=True, right=False)

-        output = tokenizer.normalize(" hello ")
+        output = normalizer.normalize_str(" hello ")
         assert output == "hello "

     def test_right_strip(self):
-        tokenizer = Tokenizer(BPE())
-        tokenizer.normalizer = Strip(left=False, right=True)
+        normalizer = Strip(left=False, right=True)

-        output = tokenizer.normalize(" hello ")
+        output = normalizer.normalize_str(" hello ")
         assert output == " hello"

     def test_full_strip(self):
-        tokenizer = Tokenizer(BPE())
-        tokenizer.normalizer = Strip(left=True, right=True)
+        normalizer = Strip(left=True, right=True)

-        output = tokenizer.normalize(" hello ")
+        output = normalizer.normalize_str(" hello ")
         assert output == "hello"
@@ -65,7 +65,6 @@ class TestTokenizer:
         assert callable(tokenizer.no_truncation)
         assert callable(tokenizer.enable_padding)
         assert callable(tokenizer.no_padding)
-        assert callable(tokenizer.normalize)
         assert callable(tokenizer.encode)
         assert callable(tokenizer.encode_batch)
         assert callable(tokenizer.decode)
@@ -265,14 +264,6 @@ class TestTokenizer:
         size = tokenizer.get_vocab_size(with_added_tokens=False)
         assert size == 0

-    def test_normalize(self):
-        tokenizer = Tokenizer(BPE())
-        tokenizer.add_tokens(["my", "name", "is", "john", "pair"])
-        tokenizer.normalizer = Lowercase()
-
-        output = tokenizer.normalize("My Name Is John")
-        assert output == "my name is john"
-
     def test_post_process(self):
         tokenizer = Tokenizer(BPE())
         tokenizer.add_tokens(["my", "name", "is", "john", "pair"])