diff --git a/bindings/python/tests/bindings/test_normalizers.py b/bindings/python/tests/bindings/test_normalizers.py
new file mode 100644
index 00000000..43adf702
--- /dev/null
+++ b/bindings/python/tests/bindings/test_normalizers.py
@@ -0,0 +1,82 @@
+from tokenizers import Tokenizer
+from tokenizers.models import BPE
+from tokenizers.normalizers import BertNormalizer, Sequence, Lowercase, Strip
+
+
+class TestBertNormalizer:
+    def test_strip_accents(self):
+        tokenizer = Tokenizer(BPE.empty())
+        tokenizer.normalizer = BertNormalizer(
+            strip_accents=True, lowercase=False, handle_chinese_chars=False, clean_text=False
+        )
+
+        output = tokenizer.normalize("Héllò")
+        assert output == "Hello"
+
+    def test_handle_chinese_chars(self):
+        tokenizer = Tokenizer(BPE.empty())
+        tokenizer.normalizer = BertNormalizer(
+            strip_accents=False, lowercase=False, handle_chinese_chars=True, clean_text=False
+        )
+
+        output = tokenizer.normalize("你好")
+        assert output == " 你  好 "
+
+    def test_clean_text(self):
+        tokenizer = Tokenizer(BPE.empty())
+        tokenizer.normalizer = BertNormalizer(
+            strip_accents=False, lowercase=False, handle_chinese_chars=False, clean_text=True
+        )
+
+        output = tokenizer.normalize("\ufeffHello")
+        assert output == "Hello"
+
+    def test_lowercase(self):
+        tokenizer = Tokenizer(BPE.empty())
+        tokenizer.normalizer = BertNormalizer(
+            strip_accents=False, lowercase=True, handle_chinese_chars=False, clean_text=False
+        )
+
+        output = tokenizer.normalize("Héllò")
+        assert output == "héllò"
+
+
+class TestSequence:
+    def test_can_make_sequences(self):
+        tokenizer = Tokenizer(BPE.empty())
+        tokenizer.normalizer = Sequence([Lowercase(), Strip()])
+
+        output = tokenizer.normalize(" HELLO ")
+        assert output == "hello"
+
+
+class TestLowercase:
+    def test_lowercase(self):
+        tokenizer = Tokenizer(BPE.empty())
+        tokenizer.normalizer = Lowercase()
+
+        output = tokenizer.normalize("HELLO")
+        assert output == "hello"
+
+
+class TestStrip:
+    def test_left_strip(self):
+        tokenizer = Tokenizer(BPE.empty())
+        tokenizer.normalizer = Strip(left=True, right=False)
+
+        output = tokenizer.normalize(" hello ")
+        assert output == "hello "
+
+    def test_right_strip(self):
+        tokenizer = Tokenizer(BPE.empty())
+        tokenizer.normalizer = Strip(left=False, right=True)
+
+        output = tokenizer.normalize(" hello ")
+        assert output == " hello"
+
+    def test_full_strip(self):
+        tokenizer = Tokenizer(BPE.empty())
+        tokenizer.normalizer = Strip(left=True, right=True)
+
+        output = tokenizer.normalize(" hello ")
+        assert output == "hello"