Mirror of https://github.com/mii443/tokenizers.git
Remove Tokenizer::normalize
This is a legacy function that no longer really makes sense and is getting difficult to maintain, so we remove it.
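For users of the Python bindings, the replacement visible in the updated tests below is to call normalize_str directly on a normalizer (this commit exposes it on the normalizer bindings) instead of going through Tokenizer.normalize. A minimal migration sketch, assuming the package layout used by the tests in this diff:

from tokenizers.normalizers import Lowercase

# Before this commit (method now removed):
#   tokenizer = Tokenizer(BPE())
#   tokenizer.normalizer = Lowercase()
#   output = tokenizer.normalize("My Name Is John")

# After this commit: call normalize_str directly on the normalizer.
normalizer = Lowercase()
output = normalizer.normalize_str("My Name Is John")
assert output == "my name is john"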
@@ -459,17 +459,6 @@ class Tokenizer:
             if the padding is enabled.
         """
         pass
-    def normalize(self, sequence: str) -> str:
-        """ Normalize the given sequence
-
-        Args:
-            sequence: str:
-                The sequence to normalize
-
-        Returns:
-            The normalized string
-        """
-        pass
     def encode(
         self,
         sequence: InputSequence,
@@ -4,6 +4,7 @@ use pyo3::exceptions;
 use pyo3::prelude::*;
 use pyo3::types::*;
 
+use crate::error::ToPyResult;
 use serde::ser::SerializeStruct;
 use serde::{Deserialize, Serialize, Serializer};
 use tk::normalizers::{BertNormalizer, Lowercase, NormalizerWrapper, Strip, NFC, NFD, NFKC, NFKD};
@@ -81,6 +82,12 @@ impl PyNormalizer {
             Err(e) => Err(e),
         }
     }
+
+    fn normalize_str(&self, sequence: &str) -> PyResult<String> {
+        let mut normalized = NormalizedString::from(sequence);
+        ToPyResult(self.normalizer.normalize(&mut normalized)).into_py()?;
+        Ok(normalized.get().to_owned())
+    }
 }
 
 #[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name=BertNormalizer)]
@@ -509,15 +509,6 @@ impl PyTokenizer {
         })
     }
 
-    fn normalize(&self, sentence: &str) -> PyResult<String> {
-        ToPyResult(
-            self.tokenizer
-                .normalize(sentence)
-                .map(|s| s.get().to_owned()),
-        )
-        .into()
-    }
-
     /// Input can be:
     ///     encode("A single sequence")
     ///     encode("A sequence", "And its pair")
@@ -12,39 +12,35 @@ class TestBertNormalizer:
         assert isinstance(pickle.loads(pickle.dumps(BertNormalizer())), BertNormalizer)
 
     def test_strip_accents(self):
-        tokenizer = Tokenizer(BPE())
-        tokenizer.normalizer = BertNormalizer(
+        normalizer = BertNormalizer(
             strip_accents=True, lowercase=False, handle_chinese_chars=False, clean_text=False
         )
 
-        output = tokenizer.normalize("Héllò")
+        output = normalizer.normalize_str("Héllò")
         assert output == "Hello"
 
     def test_handle_chinese_chars(self):
-        tokenizer = Tokenizer(BPE())
-        tokenizer.normalizer = BertNormalizer(
+        normalizer = BertNormalizer(
             strip_accents=False, lowercase=False, handle_chinese_chars=True, clean_text=False
         )
 
-        output = tokenizer.normalize("你好")
+        output = normalizer.normalize_str("你好")
         assert output == " 你 好 "
 
     def test_clean_text(self):
-        tokenizer = Tokenizer(BPE())
-        tokenizer.normalizer = BertNormalizer(
+        normalizer = BertNormalizer(
             strip_accents=False, lowercase=False, handle_chinese_chars=False, clean_text=True
         )
 
-        output = tokenizer.normalize("\ufeffHello")
+        output = normalizer.normalize_str("\ufeffHello")
         assert output == "Hello"
 
     def test_lowercase(self):
-        tokenizer = Tokenizer(BPE())
-        tokenizer.normalizer = BertNormalizer(
+        normalizer = BertNormalizer(
             strip_accents=False, lowercase=True, handle_chinese_chars=False, clean_text=False
         )
 
-        output = tokenizer.normalize("Héllò")
+        output = normalizer.normalize_str("Héllò")
         assert output == "héllò"
 
 
@@ -55,10 +51,9 @@ class TestSequence:
         assert isinstance(pickle.loads(pickle.dumps(Sequence([]))), Sequence)
 
     def test_can_make_sequences(self):
-        tokenizer = Tokenizer(BPE())
-        tokenizer.normalizer = Sequence([Lowercase(), Strip()])
+        normalizer = Sequence([Lowercase(), Strip()])
 
-        output = tokenizer.normalize(" HELLO ")
+        output = normalizer.normalize_str(" HELLO ")
         assert output == "hello"
 
 
@@ -69,10 +64,9 @@ class TestLowercase:
         assert isinstance(pickle.loads(pickle.dumps(Lowercase())), Lowercase)
 
     def test_lowercase(self):
-        tokenizer = Tokenizer(BPE())
-        tokenizer.normalizer = Lowercase()
+        normalizer = Lowercase()
 
-        output = tokenizer.normalize("HELLO")
+        output = normalizer.normalize_str("HELLO")
         assert output == "hello"
 
 
@@ -83,22 +77,19 @@ class TestStrip:
         assert isinstance(pickle.loads(pickle.dumps(Strip())), Strip)
 
     def test_left_strip(self):
-        tokenizer = Tokenizer(BPE())
-        tokenizer.normalizer = Strip(left=True, right=False)
+        normalizer = Strip(left=True, right=False)
 
-        output = tokenizer.normalize(" hello ")
+        output = normalizer.normalize_str(" hello ")
         assert output == "hello "
 
     def test_right_strip(self):
-        tokenizer = Tokenizer(BPE())
-        tokenizer.normalizer = Strip(left=False, right=True)
+        normalizer = Strip(left=False, right=True)
 
-        output = tokenizer.normalize(" hello ")
+        output = normalizer.normalize_str(" hello ")
         assert output == " hello"
 
     def test_full_strip(self):
-        tokenizer = Tokenizer(BPE())
-        tokenizer.normalizer = Strip(left=True, right=True)
+        normalizer = Strip(left=True, right=True)
 
-        output = tokenizer.normalize(" hello ")
+        output = normalizer.normalize_str(" hello ")
         assert output == "hello"
@@ -65,7 +65,6 @@ class TestTokenizer:
         assert callable(tokenizer.no_truncation)
         assert callable(tokenizer.enable_padding)
         assert callable(tokenizer.no_padding)
-        assert callable(tokenizer.normalize)
         assert callable(tokenizer.encode)
         assert callable(tokenizer.encode_batch)
         assert callable(tokenizer.decode)
@@ -265,14 +264,6 @@ class TestTokenizer:
         size = tokenizer.get_vocab_size(with_added_tokens=False)
         assert size == 0
 
-    def test_normalize(self):
-        tokenizer = Tokenizer(BPE())
-        tokenizer.add_tokens(["my", "name", "is", "john", "pair"])
-        tokenizer.normalizer = Lowercase()
-
-        output = tokenizer.normalize("My Name Is John")
-        assert output == "my name is john"
-
     def test_post_process(self):
         tokenizer = Tokenizer(BPE())
         tokenizer.add_tokens(["my", "name", "is", "john", "pair"])
@@ -562,15 +562,6 @@ where
         self.added_vocabulary.id_to_token(id, &self.model)
     }
 
-    /// Normalize the given sentence and return the corresponding normalized string
-    pub fn normalize(&self, sentence: &str) -> Result<NormalizedString> {
-        let normalized = self
-            .added_vocabulary
-            .extract_and_normalize(self.normalizer.as_ref(), sentence);
-        let pre_tokenized = self.do_pre_tokenize(normalized)?;
-        Ok(pre_tokenized.into())
-    }
-
     /// Encode a single sequence
     fn encode_single_sequence(
         &self,
@@ -842,27 +842,6 @@ impl NormalizedString {
             .collect())
     }
 
-    /// Merge with the given NormalizedString by appending it to self
-    pub fn merge_with(&mut self, other: &NormalizedString) {
-        let n_shift = self.len_original();
-        let o_shift = self.len();
-        self.original.push_str(&other.original);
-        self.normalized.push_str(&other.normalized);
-        self.alignments.extend(
-            other
-                .alignments
-                .iter()
-                .map(|(start, end)| (start + n_shift, end + n_shift)),
-        );
-        self.alignments_original.extend(
-            other
-                .alignments
-                .iter()
-                .map(|(start, end)| (start + o_shift, end + o_shift)),
-        );
-        todo!("This must take into account the `original_shift`");
-    }
-
     /// Remove any leading space(s) of the normalized string
     pub fn lstrip(&mut self) -> &mut Self {
         self.lrstrip(true, false)
@@ -1057,16 +1036,6 @@ impl From<&str> for NormalizedString {
     }
 }
 
-impl std::iter::FromIterator<NormalizedString> for NormalizedString {
-    fn from_iter<I: IntoIterator<Item = NormalizedString>>(iter: I) -> NormalizedString {
-        let mut normalized: NormalizedString = "".into();
-        for sub in iter {
-            normalized.merge_with(&sub)
-        }
-        normalized
-    }
-}
-
 #[cfg(test)]
 mod tests {
     #![allow(clippy::reversed_empty_ranges)]
@@ -1196,7 +1165,6 @@ mod tests {
     fn range_conversion() {
         let mut n = NormalizedString::from(" __Hello__ ");
         n.filter(|c| !c.is_whitespace()).lowercase();
-        println!("{:?}", n);
         let hello_n = n.convert_offsets(Range::Original(6..11));
         assert_eq!(hello_n, Some(2..7));
         assert_eq!(
@@ -240,7 +240,6 @@ impl PreTokenizedString {
 
 impl From<NormalizedString> for PreTokenizedString {
     fn from(s: NormalizedString) -> Self {
-        let original_offsets = (0, s.len_original());
         Self {
             original: s.get_original().to_owned(),
             splits: vec![Split {
@@ -264,9 +263,3 @@ impl From<String> for PreTokenizedString {
         normalized.into()
     }
 }
-
-impl From<PreTokenizedString> for NormalizedString {
-    fn from(p: PreTokenizedString) -> Self {
-        p.splits.into_iter().map(|split| split.normalized).collect()
-    }
-}