mirror of
https://github.com/mii443/tokenizers.git
synced 2025-08-22 16:25:30 +00:00
* current updates * simplify * set_item works, but `tokenizer._tokenizer.post_processor[1].single = ["$0", "</s>"]` does not ! * fix: `normalizers` deserialization and other refactoring * fix: `pre_tokenizer` deserialization * feat: add `__len__` implementation for `normalizer::PySequence` * feat: add `__setitem__` impl for `normalizers::PySequence` * feat: add `__setitem__` impl to `pre_tokenizer::PySequence` * feat: add `__setitem__` impl to `post_processor::PySequence` * test: add normalizer sequence setter check * refactor: allow unused `processors::setter` macro * test: add `__setitem__` test for processors & pretok * refactor: `unwrap` -> `PyException::new_err()?` * refactor: fmt * refactor: remove unnecessary `pub` * feat(bindings): add missing getters & setters for pretoks * feat(bindings): add missing getters & setters for processors * refactor(bindings): rewrite RwLock poison error msg * refactor: remove debug print * feat(bindings): add description as to why custom deser is needed * feat: make post proc sequence elements mutable * fix(binding): serialization --------- Co-authored-by: Luc Georges <luc.sydney.georges@gmail.com>
238 lines
7.8 KiB
Python
238 lines
7.8 KiB
Python
import pickle
|
|
|
|
import pytest
|
|
|
|
from tokenizers import NormalizedString
|
|
from tokenizers.normalizers import (
|
|
BertNormalizer,
|
|
Lowercase,
|
|
Normalizer,
|
|
Precompiled,
|
|
Sequence,
|
|
Strip,
|
|
Prepend,
|
|
Replace,
|
|
)
|
|
|
|
|
|
class TestBertNormalizer:
|
|
def test_instantiate(self):
|
|
assert isinstance(BertNormalizer(), Normalizer)
|
|
assert isinstance(BertNormalizer(), BertNormalizer)
|
|
assert isinstance(pickle.loads(pickle.dumps(BertNormalizer())), BertNormalizer)
|
|
|
|
def test_strip_accents(self):
|
|
normalizer = BertNormalizer(strip_accents=True, lowercase=False, handle_chinese_chars=False, clean_text=False)
|
|
|
|
output = normalizer.normalize_str("Héllò")
|
|
assert output == "Hello"
|
|
|
|
def test_handle_chinese_chars(self):
|
|
normalizer = BertNormalizer(strip_accents=False, lowercase=False, handle_chinese_chars=True, clean_text=False)
|
|
|
|
output = normalizer.normalize_str("你好")
|
|
assert output == " 你 好 "
|
|
|
|
def test_clean_text(self):
|
|
normalizer = BertNormalizer(strip_accents=False, lowercase=False, handle_chinese_chars=False, clean_text=True)
|
|
|
|
output = normalizer.normalize_str("\ufeffHello")
|
|
assert output == "Hello"
|
|
|
|
def test_lowercase(self):
|
|
normalizer = BertNormalizer(strip_accents=False, lowercase=True, handle_chinese_chars=False, clean_text=False)
|
|
|
|
output = normalizer.normalize_str("Héllò")
|
|
assert output == "héllò"
|
|
|
|
def test_can_modify(self):
|
|
normalizer = BertNormalizer(clean_text=True, handle_chinese_chars=True, strip_accents=True, lowercase=True)
|
|
|
|
assert normalizer.clean_text == True
|
|
assert normalizer.handle_chinese_chars == True
|
|
assert normalizer.strip_accents == True
|
|
assert normalizer.lowercase == True
|
|
|
|
# Modify these
|
|
normalizer.clean_text = False
|
|
assert normalizer.clean_text == False
|
|
normalizer.handle_chinese_chars = False
|
|
assert normalizer.handle_chinese_chars == False
|
|
normalizer.strip_accents = None
|
|
assert normalizer.strip_accents == None
|
|
normalizer.lowercase = False
|
|
assert normalizer.lowercase == False
|
|
|
|
|
|
class TestSequence:
|
|
def test_instantiate(self):
|
|
assert isinstance(Sequence([]), Normalizer)
|
|
assert isinstance(Sequence([]), Sequence)
|
|
assert isinstance(pickle.loads(pickle.dumps(Sequence([]))), Sequence)
|
|
|
|
def test_can_make_sequences(self):
|
|
normalizer = Sequence([Lowercase(), Strip()])
|
|
|
|
output = normalizer.normalize_str(" HELLO ")
|
|
assert output == "hello"
|
|
|
|
def test_set_item(self):
|
|
normalizers = Sequence(
|
|
[
|
|
BertNormalizer(True, True),
|
|
Prepend(prepend="test"),
|
|
]
|
|
)
|
|
assert normalizers[0].__class__ == BertNormalizer
|
|
assert normalizers[1].__class__ == Prepend
|
|
normalizers[1] = Strip()
|
|
assert normalizers[1].__class__ == Strip
|
|
with pytest.raises(IndexError):
|
|
print(normalizers[2])
|
|
|
|
def test_item_getters_and_setters(self):
|
|
normalizers = Sequence(
|
|
[
|
|
BertNormalizer(clean_text=True, handle_chinese_chars=True, strip_accents=True, lowercase=True),
|
|
Strip(left=True, right=True),
|
|
Prepend(prepend="_"),
|
|
Replace(pattern="something", content="else"),
|
|
]
|
|
)
|
|
|
|
assert normalizers[0].__class__ == BertNormalizer
|
|
normalizers[0].clean_text = False
|
|
normalizers[0].handle_chinese_chars = False
|
|
normalizers[0].strip_accents = False
|
|
normalizers[0].lowercase = False
|
|
assert not normalizers[0].clean_text
|
|
assert not normalizers[0].handle_chinese_chars
|
|
assert not normalizers[0].strip_accents
|
|
assert not normalizers[0].lowercase
|
|
|
|
assert normalizers[1].__class__ == Strip
|
|
normalizers[1].left = False
|
|
normalizers[1].right = False
|
|
assert not normalizers[1].left
|
|
assert not normalizers[1].right
|
|
|
|
assert normalizers[2].__class__ == Prepend
|
|
normalizers[2].prepend = " "
|
|
assert normalizers[2].prepend == " "
|
|
|
|
assert normalizers[3].__class__ == Replace
|
|
with pytest.raises(Exception):
|
|
normalizers[3].pattern = "test"
|
|
with pytest.raises(Exception):
|
|
print(normalizers[3].pattern)
|
|
normalizers[3].content = "test"
|
|
assert normalizers[3].content == "test"
|
|
|
|
|
|
class TestLowercase:
|
|
def test_instantiate(self):
|
|
assert isinstance(Lowercase(), Normalizer)
|
|
assert isinstance(Lowercase(), Lowercase)
|
|
assert isinstance(pickle.loads(pickle.dumps(Lowercase())), Lowercase)
|
|
|
|
def test_lowercase(self):
|
|
normalizer = Lowercase()
|
|
|
|
output = normalizer.normalize_str("HELLO")
|
|
assert output == "hello"
|
|
|
|
|
|
class TestStrip:
|
|
def test_instantiate(self):
|
|
assert isinstance(Strip(), Normalizer)
|
|
assert isinstance(Strip(), Strip)
|
|
assert isinstance(pickle.loads(pickle.dumps(Strip())), Strip)
|
|
|
|
def test_left_strip(self):
|
|
normalizer = Strip(left=True, right=False)
|
|
|
|
output = normalizer.normalize_str(" hello ")
|
|
assert output == "hello "
|
|
|
|
def test_right_strip(self):
|
|
normalizer = Strip(left=False, right=True)
|
|
|
|
output = normalizer.normalize_str(" hello ")
|
|
assert output == " hello"
|
|
|
|
def test_full_strip(self):
|
|
normalizer = Strip(left=True, right=True)
|
|
|
|
output = normalizer.normalize_str(" hello ")
|
|
assert output == "hello"
|
|
|
|
def test_can_modify(self):
|
|
normalizer = Strip(left=True, right=True)
|
|
|
|
assert normalizer.left == True
|
|
assert normalizer.right == True
|
|
|
|
# Modify these
|
|
normalizer.left = False
|
|
assert normalizer.left == False
|
|
normalizer.right = False
|
|
assert normalizer.right == False
|
|
|
|
|
|
class TestPrepend:
|
|
def test_instantiate(self):
|
|
assert isinstance(Prepend("▁"), Normalizer)
|
|
assert isinstance(Prepend("▁"), Prepend)
|
|
assert isinstance(pickle.loads(pickle.dumps(Prepend("▁"))), Prepend)
|
|
|
|
def test_prepend(self):
|
|
normalizer = Prepend(prepend="▁")
|
|
|
|
output = normalizer.normalize_str("hello")
|
|
assert output == "▁hello"
|
|
|
|
def test_can_modify(self):
|
|
normalizer = Prepend("▁")
|
|
|
|
assert normalizer.prepend == "▁"
|
|
|
|
# Modify these
|
|
normalizer.prepend = "-"
|
|
assert normalizer.prepend == "-"
|
|
|
|
|
|
class TestCustomNormalizer:
|
|
class BadCustomNormalizer:
|
|
def normalize(self, normalized, wrong):
|
|
pass
|
|
|
|
class GoodCustomNormalizer:
|
|
def normalize(self, normalized):
|
|
self.kept_normalized = normalized
|
|
normalized.replace("there", "you")
|
|
|
|
def use_after_normalize(self):
|
|
self.kept_normalized.replace("something", "else")
|
|
|
|
def test_instantiate(self):
|
|
bad = Normalizer.custom(TestCustomNormalizer.BadCustomNormalizer())
|
|
good_custom = TestCustomNormalizer.GoodCustomNormalizer()
|
|
good = Normalizer.custom(good_custom)
|
|
|
|
assert isinstance(bad, Normalizer)
|
|
assert isinstance(good, Normalizer)
|
|
with pytest.raises(Exception, match="TypeError:.*normalize()"):
|
|
bad.normalize_str("Hey there!")
|
|
assert good.normalize_str("Hey there!") == "Hey you!"
|
|
with pytest.raises(Exception, match="Cannot use a NormalizedStringRefMut outside `normalize`"):
|
|
good_custom.use_after_normalize()
|
|
|
|
def test_normalizer_interface(self):
|
|
normalizer = Normalizer.custom(TestCustomNormalizer.GoodCustomNormalizer())
|
|
|
|
normalized = NormalizedString("Hey there!")
|
|
normalizer.normalize(normalized)
|
|
|
|
assert repr(normalized) == 'NormalizedString(original="Hey there!", normalized="Hey you!")'
|
|
assert str(normalized) == "Hey you!"
|