tokenizers/bindings/python/tests/implementations/test_byte_level_bpe.py

import pytest

from ..utils import data_dir, roberta_files, multiprocessing_with_parallelism
from tokenizers import ByteLevelBPETokenizer


class TestByteLevelBPE:
    def test_basic_encode(self, roberta_files):
        tokenizer = ByteLevelBPETokenizer.from_file(roberta_files["vocab"], roberta_files["merges"])
        output = tokenizer.encode("The quick brown fox jumps over the lazy dog")
        assert output.ids == [133, 2119, 6219, 23602, 13855, 81, 5, 22414, 2335]
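        # Byte-level BPE remaps every byte to a printable character; the space
        # byte (0x20) becomes "Ġ" (U+0120), so each token that follows a space
        # carries a leading "Ġ" in its string form.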
        assert output.tokens == [
            "The",
            "Ġquick",
            "Ġbrown",
            "Ġfox",
            "Ġjumps",
            "Ġover",
            "Ġthe",
            "Ġlazy",
            "Ġdog",
        ]
        assert output.offsets == [
            (0, 3),
            (3, 9),
            (9, 15),
            (15, 19),
            (19, 25),
            (25, 30),
            (30, 34),
            (34, 39),
            (39, 43),
        ]
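
    # add_prefix_space=True inserts a virtual leading space, so the first word
    # is tokenized like any mid-sentence word ("ĠThe", id 20, instead of "The",
    # id 133), while the offsets still refer to the original, unpadded text.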
    def test_add_prefix_space(self, roberta_files):
        tokenizer = ByteLevelBPETokenizer.from_file(
            roberta_files["vocab"], roberta_files["merges"], add_prefix_space=True
        )
        output = tokenizer.encode("The quick brown fox jumps over the lazy dog")
        assert output.ids == [20, 2119, 6219, 23602, 13855, 81, 5, 22414, 2335]
        assert output.tokens == [
            "ĠThe",
            "Ġquick",
            "Ġbrown",
            "Ġfox",
            "Ġjumps",
            "Ġover",
            "Ġthe",
            "Ġlazy",
            "Ġdog",
        ]
        assert output.offsets == [
            (0, 3),
            (3, 9),
            (9, 15),
            (15, 19),
            (19, 25),
            (25, 30),
            (30, 34),
            (34, 39),
            (39, 43),
        ]
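
    # lowercase=True normalizes the text before BPE runs, so the mixed-case
    # input below produces the same ids as its lowercased form (note "Ġthe",
    # id 5, where the previous test produced "ĠThe", id 20).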
    def test_lowerspace(self, roberta_files):
        tokenizer = ByteLevelBPETokenizer.from_file(
            roberta_files["vocab"],
            roberta_files["merges"],
            add_prefix_space=True,
            lowercase=True,
        )
        output = tokenizer.encode("The Quick Brown Fox Jumps Over The Lazy Dog")
        assert output.ids == [5, 2119, 6219, 23602, 13855, 81, 5, 22414, 2335]
        assert output.tokens == [
            "Ġthe",
            "Ġquick",
            "Ġbrown",
            "Ġfox",
            "Ġjumps",
            "Ġover",
            "Ġthe",
            "Ġlazy",
            "Ġdog",
        ]
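
    # The multiprocessing_with_parallelism helper from ..utils encodes with the
    # tokenizer inside forked worker processes, first with Rust parallelism
    # disabled and then enabled, to check that encoding after a fork does not
    # deadlock.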
    def test_multiprocessing_with_parallelism(self, roberta_files):
        tokenizer = ByteLevelBPETokenizer.from_file(roberta_files["vocab"], roberta_files["merges"])
        multiprocessing_with_parallelism(tokenizer, False)
        multiprocessing_with_parallelism(tokenizer, True)
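
    # train_from_iterator builds the vocabulary and merges directly from any
    # iterator of strings, so a small tokenizer can be trained without first
    # writing vocab/merges files to disk.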
    def test_train_from_iterator(self):
        text = ["A first sentence", "Another sentence", "And a last one"]
        tokenizer = ByteLevelBPETokenizer()
        tokenizer.train_from_iterator(text, show_progress=False)
        output = tokenizer.encode("A sentence")
        assert output.tokens == ["A", "Ġsentence"]