mirror of https://github.com/mii443/tokenizers.git
Black.
@@ -144,9 +144,7 @@ class TestTokenizer:
         assert output.tokens == ["my", "name", "is", "john"]
 
         # Can encode a batch with both a single sequence and a pair of sequences
-        output = tokenizer.encode_batch(
-            ["my name is john", ("my name is john", "pair")]
-        )
+        output = tokenizer.encode_batch(["my name is john", ("my name is john", "pair")])
         assert len(output) == 2
 
     def test_encode_formats(self, bert_files):
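
For context, the call collapsed onto one line here exercises encode_batch's ability to mix a plain string and a (sequence, pair) tuple in the same batch. A minimal runnable sketch of that behavior, assuming a hypothetical toy WordLevel vocab in place of the test's trained tokenizer:

    from tokenizers import Tokenizer
    from tokenizers.models import WordLevel
    from tokenizers.pre_tokenizers import Whitespace

    # Hypothetical toy vocab, for illustration only
    vocab = {"my": 0, "name": 1, "is": 2, "john": 3, "pair": 4, "[UNK]": 5}
    tokenizer = Tokenizer(WordLevel(vocab, unk_token="[UNK]"))
    tokenizer.pre_tokenizer = Whitespace()

    # One batch may freely mix single sequences and (sequence, pair) tuples
    output = tokenizer.encode_batch(["my name is john", ("my name is john", "pair")])
    assert len(output) == 2
    assert output[0].tokens == ["my", "name", "is", "john"]
    assert output[1].tokens == ["my", "name", "is", "john", "pair"]
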
@@ -169,9 +167,7 @@ class TestTokenizer:
         ]
         output = tokenizer.encode(["my", "name", "is", "john"], is_pretokenized=True)
         assert output.tokens == ["[CLS]", "my", "name", "is", "john", "[SEP]"]
-        output = tokenizer.encode(
-            ["my", "name", "is", "john"], ["pair"], is_pretokenized=True
-        )
+        output = tokenizer.encode(["my", "name", "is", "john"], ["pair"], is_pretokenized=True)
         assert output.tokens == [
             "[CLS]",
             "my",
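
The reformatted call above passes the pair as the second positional argument, while is_pretokenized=True marks both inputs as already split into words. A sketch of the same behavior with the same hypothetical toy vocab; note the [CLS]/[SEP] in the test's assertions come from the BERT tokenizer's post-processor, which this sketch omits:

    from tokenizers import Tokenizer
    from tokenizers.models import WordLevel
    from tokenizers.pre_tokenizers import Whitespace

    vocab = {"my": 0, "name": 1, "is": 2, "john": 3, "pair": 4, "[UNK]": 5}
    tokenizer = Tokenizer(WordLevel(vocab, unk_token="[UNK]"))
    tokenizer.pre_tokenizer = Whitespace()

    # is_pretokenized=True tells encode the input is already a list of words
    output = tokenizer.encode(["my", "name", "is", "john"], is_pretokenized=True)
    assert output.tokens == ["my", "name", "is", "john"]

    # The second positional argument is the pair sequence
    output = tokenizer.encode(["my", "name", "is", "john"], ["pair"], is_pretokenized=True)
    assert output.tokens == ["my", "name", "is", "john", "pair"]
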
@@ -217,19 +213,13 @@ class TestTokenizer:
 
         # Numpy
         test_single(np.array(["My name is John", "My name is Georges"]))
-        test_pair(
-            np.array([("My name is John", "pair"), ("My name is Georges", "pair")])
-        )
-        test_pair(
-            np.array([["My name is John", "pair"], ["My name is Georges", "pair"]])
-        )
+        test_pair(np.array([("My name is John", "pair"), ("My name is Georges", "pair")]))
+        test_pair(np.array([["My name is John", "pair"], ["My name is Georges", "pair"]]))
 
         # PreTokenized inputs
 
         # Lists
-        test_single(
-            [["My", "name", "is", "John"], ["My", "name", "is", "Georges"]], True
-        )
+        test_single([["My", "name", "is", "John"], ["My", "name", "is", "Georges"]], True)
         test_pair(
             [
                 (["My", "name", "is", "John"], ["pair"]),
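
test_single and test_pair here are local helpers of test_encode_formats that presumably forward their arguments to encode_batch; the hunk's point is that raw inputs may arrive as numpy arrays and that pre-tokenized inputs are nested lists of words. A sketch of those input formats, using lowercase strings so the hypothetical toy vocab from the sketches above covers them:

    import numpy as np
    from tokenizers import Tokenizer
    from tokenizers.models import WordLevel
    from tokenizers.pre_tokenizers import Whitespace

    vocab = {"my": 0, "name": 1, "is": 2, "john": 3, "georges": 4, "pair": 5, "[UNK]": 6}
    tokenizer = Tokenizer(WordLevel(vocab, unk_token="[UNK]"))
    tokenizer.pre_tokenizer = Whitespace()

    # A numpy array of strings is accepted as a batch of single sequences
    out = tokenizer.encode_batch(np.array(["my name is john", "my name is georges"]))
    assert out[1].tokens == ["my", "name", "is", "georges"]

    # Rows of a 2-column string array (or an array of tuples) become (sequence, pair)
    out = tokenizer.encode_batch(np.array([["my name is john", "pair"],
                                           ["my name is georges", "pair"]]))
    assert out[0].tokens == ["my", "name", "is", "john", "pair"]

    # Pre-tokenized batches are lists of word lists
    out = tokenizer.encode_batch([["my", "name", "is", "john"]], is_pretokenized=True)
    assert out[0].tokens == ["my", "name", "is", "john"]
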
@@ -246,9 +236,7 @@ class TestTokenizer:
         )
 
         # Tuples
-        test_single(
-            (("My", "name", "is", "John"), ("My", "name", "is", "Georges")), True
-        )
+        test_single((("My", "name", "is", "John"), ("My", "name", "is", "Georges")), True)
         test_pair(
             (
                 (("My", "name", "is", "John"), ("pair",)),
@@ -317,9 +305,7 @@ class TestTokenizer:
         )
 
         # Can encode with special tokens
-        output_with_specials = tokenizer.encode(
-            "My name is John", add_special_tokens=True
-        )
+        output_with_specials = tokenizer.encode("My name is John", add_special_tokens=True)
         assert output_with_specials.tokens == [
             "<s>",
             "ĠMy",
@@ -330,9 +316,7 @@ class TestTokenizer:
         ]
 
         # Can encode without special tokens
-        output_without_specials = tokenizer.encode(
-            "My name is John", add_special_tokens=False
-        )
+        output_without_specials = tokenizer.encode("My name is John", add_special_tokens=False)
         assert output_without_specials.tokens == ["ĠMy", "Ġname", "Ġis", "ĠJohn"]
 
     def test_truncation(self):
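
The last two hunks both touch the add_special_tokens flag: with it on, the tokenizer's post-processor wraps the output in its special tokens; with it off, only the model's own tokens come back (the "<s>" and the Ġ-prefixed tokens in the assertions come from the test's byte-level BPE tokenizer). A sketch of the on/off difference, attaching a hypothetical TemplateProcessing post-processor to the same toy vocab:

    from tokenizers import Tokenizer
    from tokenizers.models import WordLevel
    from tokenizers.pre_tokenizers import Whitespace
    from tokenizers.processors import TemplateProcessing

    vocab = {"my": 0, "name": 1, "is": 2, "john": 3, "[UNK]": 4, "<s>": 5, "</s>": 6}
    tokenizer = Tokenizer(WordLevel(vocab, unk_token="[UNK]"))
    tokenizer.pre_tokenizer = Whitespace()
    tokenizer.post_processor = TemplateProcessing(
        single="<s> $A </s>",
        special_tokens=[("<s>", 5), ("</s>", 6)],
    )

    # add_special_tokens=True runs the post-processor's template
    with_specials = tokenizer.encode("my name is john", add_special_tokens=True)
    assert with_specials.tokens == ["<s>", "my", "name", "is", "john", "</s>"]

    # add_special_tokens=False skips the special tokens entirely
    without_specials = tokenizer.encode("my name is john", add_special_tokens=False)
    assert without_specials.tokens == ["my", "name", "is", "john"]
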