Python - Add first implementations tests

This commit is contained in:
Anthony MOI
2020-03-30 16:57:08 -04:00
parent 023566fbbb
commit 5ebe687753
2 changed files with 81 additions and 0 deletions

View File

@ -0,0 +1,81 @@
from ..utils import data_dir, roberta_files
from tokenizers import ByteLevelBPETokenizer
class TestByteLevelBPE:
def test_basic_encode(self, roberta_files):
tokenizer = ByteLevelBPETokenizer(roberta_files["vocab"], roberta_files["merges"])
output = tokenizer.encode("The quick brown fox jumps over the lazy dog")
assert output.ids == [133, 2119, 6219, 23602, 13855, 81, 5, 22414, 2335]
assert output.tokens == [
"The",
"Ġquick",
"Ġbrown",
"Ġfox",
"Ġjumps",
"Ġover",
"Ġthe",
"Ġlazy",
"Ġdog",
]
assert output.offsets == [
(0, 3),
(3, 9),
(9, 15),
(15, 19),
(19, 25),
(25, 30),
(30, 34),
(34, 39),
(39, 43),
]
def test_add_prefix_space(self, roberta_files):
tokenizer = ByteLevelBPETokenizer(
roberta_files["vocab"], roberta_files["merges"], add_prefix_space=True
)
output = tokenizer.encode("The quick brown fox jumps over the lazy dog")
assert output.ids == [20, 2119, 6219, 23602, 13855, 81, 5, 22414, 2335]
assert output.tokens == [
"ĠThe",
"Ġquick",
"Ġbrown",
"Ġfox",
"Ġjumps",
"Ġover",
"Ġthe",
"Ġlazy",
"Ġdog",
]
assert output.offsets == [
(0, 3),
(3, 9),
(9, 15),
(15, 19),
(19, 25),
(25, 30),
(30, 34),
(34, 39),
(39, 43),
]
def test_lowerspace(self, roberta_files):
tokenizer = ByteLevelBPETokenizer(
roberta_files["vocab"], roberta_files["merges"], add_prefix_space=True, lowercase=True
)
output = tokenizer.encode("The Quick Brown Fox Jumps Over The Lazy Dog")
assert output.ids == [5, 2119, 6219, 23602, 13855, 81, 5, 22414, 2335]
assert output.tokens == [
"Ġthe",
"Ġquick",
"Ġbrown",
"Ġfox",
"Ġjumps",
"Ġover",
"Ġthe",
"Ġlazy",
"Ġdog",
]