Addressing first pass of comments.

Nicolas Patry
2020-09-23 11:29:17 +02:00
parent 1cd4824273
commit 8f8156fd2c
10 changed files with 196 additions and 125 deletions


@@ -67,7 +67,10 @@ class TestByteLevelBPE:
     def test_lowerspace(self, roberta_files):
         tokenizer = ByteLevelBPETokenizer.from_files(
-            roberta_files["vocab"], roberta_files["merges"], add_prefix_space=True, lowercase=True,
+            roberta_files["vocab"],
+            roberta_files["merges"],
+            add_prefix_space=True,
+            lowercase=True,
         )
         output = tokenizer.encode("The Quick Brown Fox Jumps Over The Lazy Dog")


@@ -6,7 +6,9 @@ from tokenizers import CharBPETokenizer
 class TestBertWordPieceBPE:
     def test_basic_encode(self, openai_files):
-        tokenizer = CharBPETokenizer.from_files(openai_files["vocab"], openai_files["merges"])
+        tokenizer = CharBPETokenizer.from_files(
+            openai_files["vocab"], openai_files["merges"]
+        )
         output = tokenizer.encode("My name is John", "pair")
         assert output.ids == [0, 253, 1362, 544, 0, 7, 12662, 2688]
@@ -50,6 +52,8 @@ class TestBertWordPieceBPE:
         assert decoded == "my name is john"

     def test_multiprocessing_with_parallelism(self, openai_files):
-        tokenizer = CharBPETokenizer.from_files(openai_files["vocab"], openai_files["merges"])
+        tokenizer = CharBPETokenizer.from_files(
+            openai_files["vocab"], openai_files["merges"]
+        )
         multiprocessing_with_parallelism(tokenizer, False)
         multiprocessing_with_parallelism(tokenizer, True)