Nicolas Patry
2020-09-23 11:33:14 +02:00
parent 8f8156fd2c
commit acd4a7599f


@@ -144,9 +144,7 @@ class TestTokenizer:
         assert output.tokens == ["my", "name", "is", "john"]
 
         # Can encode a batch with both a single sequence and a pair of sequences
-        output = tokenizer.encode_batch(
-            ["my name is john", ("my name is john", "pair")]
-        )
+        output = tokenizer.encode_batch(["my name is john", ("my name is john", "pair")])
         assert len(output) == 2
 
     def test_encode_formats(self, bert_files):
@@ -169,9 +167,7 @@ class TestTokenizer:
         ]
         output = tokenizer.encode(["my", "name", "is", "john"], is_pretokenized=True)
         assert output.tokens == ["[CLS]", "my", "name", "is", "john", "[SEP]"]
-        output = tokenizer.encode(
-            ["my", "name", "is", "john"], ["pair"], is_pretokenized=True
-        )
+        output = tokenizer.encode(["my", "name", "is", "john"], ["pair"], is_pretokenized=True)
         assert output.tokens == [
             "[CLS]",
             "my",
@@ -217,19 +213,13 @@ class TestTokenizer:
 
         # Numpy
         test_single(np.array(["My name is John", "My name is Georges"]))
-        test_pair(
-            np.array([("My name is John", "pair"), ("My name is Georges", "pair")])
-        )
-        test_pair(
-            np.array([["My name is John", "pair"], ["My name is Georges", "pair"]])
-        )
+        test_pair(np.array([("My name is John", "pair"), ("My name is Georges", "pair")]))
+        test_pair(np.array([["My name is John", "pair"], ["My name is Georges", "pair"]]))
 
         # PreTokenized inputs
 
         # Lists
-        test_single(
-            [["My", "name", "is", "John"], ["My", "name", "is", "Georges"]], True
-        )
+        test_single([["My", "name", "is", "John"], ["My", "name", "is", "Georges"]], True)
         test_pair(
             [
                 (["My", "name", "is", "John"], ["pair"]),
@@ -246,9 +236,7 @@ class TestTokenizer:
         )
 
         # Tuples
-        test_single(
-            (("My", "name", "is", "John"), ("My", "name", "is", "Georges")), True
-        )
+        test_single((("My", "name", "is", "John"), ("My", "name", "is", "Georges")), True)
         test_pair(
             (
                 (("My", "name", "is", "John"), ("pair",)),
@@ -317,9 +305,7 @@ class TestTokenizer:
         )
 
         # Can encode with special tokens
-        output_with_specials = tokenizer.encode(
-            "My name is John", add_special_tokens=True
-        )
+        output_with_specials = tokenizer.encode("My name is John", add_special_tokens=True)
         assert output_with_specials.tokens == [
             "<s>",
             "ĠMy",
@@ -330,9 +316,7 @@ class TestTokenizer:
         ]
 
         # Can encode without special tokens
-        output_without_specials = tokenizer.encode(
-            "My name is John", add_special_tokens=False
-        )
+        output_without_specials = tokenizer.encode("My name is John", add_special_tokens=False)
         assert output_without_specials.tokens == ["ĠMy", "Ġname", "Ġis", "ĠJohn"]
 
     def test_truncation(self):