Nicolas Patry
2020-09-23 11:33:14 +02:00
parent 8f8156fd2c
commit acd4a7599f


@@ -144,9 +144,7 @@ class TestTokenizer:
         assert output.tokens == ["my", "name", "is", "john"]
 
         # Can encode a batch with both a single sequence and a pair of sequences
-        output = tokenizer.encode_batch(
-            ["my name is john", ("my name is john", "pair")]
-        )
+        output = tokenizer.encode_batch(["my name is john", ("my name is john", "pair")])
         assert len(output) == 2
 
     def test_encode_formats(self, bert_files):
@@ -169,9 +167,7 @@ class TestTokenizer:
         ]
         output = tokenizer.encode(["my", "name", "is", "john"], is_pretokenized=True)
         assert output.tokens == ["[CLS]", "my", "name", "is", "john", "[SEP]"]
-        output = tokenizer.encode(
-            ["my", "name", "is", "john"], ["pair"], is_pretokenized=True
-        )
+        output = tokenizer.encode(["my", "name", "is", "john"], ["pair"], is_pretokenized=True)
         assert output.tokens == [
             "[CLS]",
             "my",
@@ -217,19 +213,13 @@ class TestTokenizer:
 
         # Numpy
         test_single(np.array(["My name is John", "My name is Georges"]))
-        test_pair(
-            np.array([("My name is John", "pair"), ("My name is Georges", "pair")])
-        )
-        test_pair(
-            np.array([["My name is John", "pair"], ["My name is Georges", "pair"]])
-        )
+        test_pair(np.array([("My name is John", "pair"), ("My name is Georges", "pair")]))
+        test_pair(np.array([["My name is John", "pair"], ["My name is Georges", "pair"]]))
 
         # PreTokenized inputs
 
         # Lists
-        test_single(
-            [["My", "name", "is", "John"], ["My", "name", "is", "Georges"]], True
-        )
+        test_single([["My", "name", "is", "John"], ["My", "name", "is", "Georges"]], True)
         test_pair(
             [
                 (["My", "name", "is", "John"], ["pair"]),
@@ -246,9 +236,7 @@ class TestTokenizer:
         )
 
         # Tuples
-        test_single(
-            (("My", "name", "is", "John"), ("My", "name", "is", "Georges")), True
-        )
+        test_single((("My", "name", "is", "John"), ("My", "name", "is", "Georges")), True)
         test_pair(
             (
                 (("My", "name", "is", "John"), ("pair",)),
@@ -317,9 +305,7 @@ class TestTokenizer:
         )
 
         # Can encode with special tokens
-        output_with_specials = tokenizer.encode(
-            "My name is John", add_special_tokens=True
-        )
+        output_with_specials = tokenizer.encode("My name is John", add_special_tokens=True)
         assert output_with_specials.tokens == [
             "<s>",
             "ĠMy",
@@ -330,9 +316,7 @@ class TestTokenizer:
         ]
 
         # Can encode without special tokens
-        output_without_specials = tokenizer.encode(
-            "My name is John", add_special_tokens=False
-        )
+        output_without_specials = tokenizer.encode("My name is John", add_special_tokens=False)
         assert output_without_specials.tokens == ["ĠMy", "Ġname", "Ġis", "ĠJohn"]
 
     def test_truncation(self):