# Reconstructed from a `git format-patch` export whose line breaks were lost.
#   Commit:  0de9885da8afb86dcc78c3b5c64520a70a73aa25
#   Author:  Anthony MOI
#   Date:    Tue, 31 Mar 2020 17:40:06 -0400
#   Subject: [PATCH] Python - Test PreTokenizers
#   Path:    bindings/python/tests/bindings/test_pre_tokenizers.py (new file)
"""Instantiation tests for the classes imported from ``tokenizers.pre_tokenizers``.

Each test class covers one pre-tokenizer: it checks that the constructor
accepts the arguments used here and that the resulting object is an
instance of ``PreTokenizer``.
"""
import pytest

from tokenizers.pre_tokenizers import (
    PreTokenizer,
    ByteLevel,
    Whitespace,
    WhitespaceSplit,
    BertPreTokenizer,
    Metaspace,
    CharDelimiterSplit,
)


class TestByteLevel:
    """Checks for ``ByteLevel``."""

    def test_instantiate(self):
        """Build with default arguments and with ``add_prefix_space`` set either way."""
        assert ByteLevel() is not None
        assert ByteLevel(add_prefix_space=True) is not None
        assert ByteLevel(add_prefix_space=False) is not None
        assert isinstance(ByteLevel(), PreTokenizer)

    def test_has_alphabet(self):
        """``ByteLevel.alphabet()`` is callable on the class and yields a 256-item list."""
        assert isinstance(ByteLevel.alphabet(), list)
        assert len(ByteLevel.alphabet()) == 256


class TestWhitespace:
    """Checks for ``Whitespace``."""

    def test_instantiate(self):
        """Build with no arguments and verify the base type."""
        assert Whitespace() is not None
        assert isinstance(Whitespace(), PreTokenizer)


class TestWhitespaceSplit:
    """Checks for ``WhitespaceSplit``."""

    def test_instantiate(self):
        """Build with no arguments and verify the base type."""
        assert WhitespaceSplit() is not None
        assert isinstance(WhitespaceSplit(), PreTokenizer)


class TestBertPreTokenizer:
    """Checks for ``BertPreTokenizer``."""

    def test_instantiate(self):
        """Build with no arguments and verify the base type."""
        assert BertPreTokenizer() is not None
        assert isinstance(BertPreTokenizer(), PreTokenizer)


class TestMetaspace:
    """Checks for ``Metaspace``."""

    def test_instantiate(self):
        """Build with the accepted arguments and reject an empty ``replacement``."""
        assert Metaspace() is not None
        assert Metaspace(replacement="-") is not None
        # Only the error message is pinned; the concrete exception class is not
        # visible from this test.
        # NOTE(review): narrow `Exception` if the bindings expose a specific type.
        with pytest.raises(Exception, match="replacement must be a character"):
            Metaspace(replacement="")
        assert Metaspace(add_prefix_space=True) is not None
        assert isinstance(Metaspace(), PreTokenizer)


class TestCharDelimiterSplit:
    """Checks for ``CharDelimiterSplit``."""

    def test_instantiate(self):
        """Build with a one-character delimiter and reject an empty one."""
        assert CharDelimiterSplit("-") is not None
        # Only the error message is pinned; the concrete exception class is not
        # visible from this test.
        # NOTE(review): narrow `Exception` if the bindings expose a specific type.
        with pytest.raises(Exception, match="delimiter must be a single character"):
            CharDelimiterSplit("")
        assert isinstance(CharDelimiterSplit(" "), PreTokenizer)