Python - Test PreTokenizers

This commit is contained in:
Anthony MOI
2020-03-31 17:40:06 -04:00
parent d6692d4072
commit 0de9885da8

View File

@@ -0,0 +1,59 @@
import pytest
from tokenizers.pre_tokenizers import (
PreTokenizer,
ByteLevel,
Whitespace,
WhitespaceSplit,
BertPreTokenizer,
Metaspace,
CharDelimiterSplit,
)
class TestByteLevel:
def test_instantiate(self):
assert ByteLevel() is not None
assert ByteLevel(add_prefix_space=True) is not None
assert ByteLevel(add_prefix_space=False) is not None
assert isinstance(ByteLevel(), PreTokenizer)
def test_has_alphabet(self):
assert isinstance(ByteLevel.alphabet(), list)
assert len(ByteLevel.alphabet()) == 256
class TestWhitespace:
def test_instantiate(self):
assert Whitespace() is not None
assert isinstance(Whitespace(), PreTokenizer)
class TestWhitespaceSplit:
def test_instantiate(self):
assert WhitespaceSplit() is not None
assert isinstance(WhitespaceSplit(), PreTokenizer)
class TestBertPreTokenizer:
def test_instantiate(self):
assert BertPreTokenizer() is not None
assert isinstance(BertPreTokenizer(), PreTokenizer)
class TestMetaspace:
def test_instantiate(self):
assert Metaspace() is not None
assert Metaspace(replacement="-") is not None
with pytest.raises(Exception, match="replacement must be a character"):
Metaspace(replacement="")
assert Metaspace(add_prefix_space=True) is not None
assert isinstance(Metaspace(), PreTokenizer)
class TestCharDelimiterSplit:
def test_instantiate(self):
assert CharDelimiterSplit("-") is not None
with pytest.raises(Exception, match="delimiter must be a single character"):
CharDelimiterSplit("")
assert isinstance(CharDelimiterSplit(" "), PreTokenizer)