Files
tokenizers/bindings/python/tests/implementations/test_byte_level_bpe.py
2020-09-24 09:27:30 -04:00

92 lines
2.7 KiB
Python

import pytest
from ..utils import data_dir, roberta_files, multiprocessing_with_parallelism
from tokenizers import ByteLevelBPETokenizer
class TestByteLevelBPE:
def test_basic_encode(self, roberta_files):
tokenizer = ByteLevelBPETokenizer.from_file(roberta_files["vocab"], roberta_files["merges"])
output = tokenizer.encode("The quick brown fox jumps over the lazy dog")
assert output.ids == [133, 2119, 6219, 23602, 13855, 81, 5, 22414, 2335]
assert output.tokens == [
"The",
"Ġquick",
"Ġbrown",
"Ġfox",
"Ġjumps",
"Ġover",
"Ġthe",
"Ġlazy",
"Ġdog",
]
assert output.offsets == [
(0, 3),
(3, 9),
(9, 15),
(15, 19),
(19, 25),
(25, 30),
(30, 34),
(34, 39),
(39, 43),
]
def test_add_prefix_space(self, roberta_files):
tokenizer = ByteLevelBPETokenizer.from_file(
roberta_files["vocab"], roberta_files["merges"], add_prefix_space=True
)
output = tokenizer.encode("The quick brown fox jumps over the lazy dog")
assert output.ids == [20, 2119, 6219, 23602, 13855, 81, 5, 22414, 2335]
assert output.tokens == [
"ĠThe",
"Ġquick",
"Ġbrown",
"Ġfox",
"Ġjumps",
"Ġover",
"Ġthe",
"Ġlazy",
"Ġdog",
]
assert output.offsets == [
(0, 3),
(3, 9),
(9, 15),
(15, 19),
(19, 25),
(25, 30),
(30, 34),
(34, 39),
(39, 43),
]
def test_lowerspace(self, roberta_files):
tokenizer = ByteLevelBPETokenizer.from_file(
roberta_files["vocab"],
roberta_files["merges"],
add_prefix_space=True,
lowercase=True,
)
output = tokenizer.encode("The Quick Brown Fox Jumps Over The Lazy Dog")
assert output.ids == [5, 2119, 6219, 23602, 13855, 81, 5, 22414, 2335]
assert output.tokens == [
"Ġthe",
"Ġquick",
"Ġbrown",
"Ġfox",
"Ġjumps",
"Ġover",
"Ġthe",
"Ġlazy",
"Ġdog",
]
def test_multiprocessing_with_parallelism(self, roberta_files):
tokenizer = ByteLevelBPETokenizer.from_file(roberta_files["vocab"], roberta_files["merges"])
multiprocessing_with_parallelism(tokenizer, False)
multiprocessing_with_parallelism(tokenizer, True)