from tokenizers import ByteLevelBPETokenizer from ..utils import data_dir, multiprocessing_with_parallelism, roberta_files class TestByteLevelBPE: def test_basic_encode(self, roberta_files): tokenizer = ByteLevelBPETokenizer.from_file(roberta_files["vocab"], roberta_files["merges"]) output = tokenizer.encode("The quick brown fox jumps over the lazy dog") assert output.ids == [133, 2119, 6219, 23602, 13855, 81, 5, 22414, 2335] assert output.tokens == [ "The", "Ġquick", "Ġbrown", "Ġfox", "Ġjumps", "Ġover", "Ġthe", "Ġlazy", "Ġdog", ] assert output.offsets == [ (0, 3), (3, 9), (9, 15), (15, 19), (19, 25), (25, 30), (30, 34), (34, 39), (39, 43), ] def test_add_prefix_space(self, roberta_files): tokenizer = ByteLevelBPETokenizer.from_file( roberta_files["vocab"], roberta_files["merges"], add_prefix_space=True ) output = tokenizer.encode("The quick brown fox jumps over the lazy dog") assert output.ids == [20, 2119, 6219, 23602, 13855, 81, 5, 22414, 2335] assert output.tokens == [ "ĠThe", "Ġquick", "Ġbrown", "Ġfox", "Ġjumps", "Ġover", "Ġthe", "Ġlazy", "Ġdog", ] assert output.offsets == [ (0, 3), (3, 9), (9, 15), (15, 19), (19, 25), (25, 30), (30, 34), (34, 39), (39, 43), ] def test_lowerspace(self, roberta_files): tokenizer = ByteLevelBPETokenizer.from_file( roberta_files["vocab"], roberta_files["merges"], add_prefix_space=True, lowercase=True, ) output = tokenizer.encode("The Quick Brown Fox Jumps Over The Lazy Dog") assert output.ids == [5, 2119, 6219, 23602, 13855, 81, 5, 22414, 2335] assert output.tokens == [ "Ġthe", "Ġquick", "Ġbrown", "Ġfox", "Ġjumps", "Ġover", "Ġthe", "Ġlazy", "Ġdog", ] def test_multiprocessing_with_parallelism(self, roberta_files): tokenizer = ByteLevelBPETokenizer.from_file(roberta_files["vocab"], roberta_files["merges"]) multiprocessing_with_parallelism(tokenizer, False) multiprocessing_with_parallelism(tokenizer, True) def test_train_from_iterator(self): text = ["A first sentence", "Another sentence", "And a last one"] tokenizer = ByteLevelBPETokenizer() tokenizer.train_from_iterator(text, show_progress=False) output = tokenizer.encode("A sentence") assert output.tokens == ["A", "Ġsentence"]