From a41090305147d3cd70d8fc08799f61d586bc6ec2 Mon Sep 17 00:00:00 2001
From: Nicolas Patry
Date: Thu, 24 Sep 2020 15:00:20 +0200
Subject: [PATCH] Upgrading to black 20.8b1

---
 .github/workflows/python.yml                  |  2 +-
 bindings/python/examples/example.py           |  8 +++--
 .../python/examples/train_bert_wordpiece.py   |  5 ++-
 bindings/python/py_src/tokenizers/__init__.py |  3 +-
 .../python/py_src/tokenizers/__init__.pyi     | 32 +++++++++++--------
 .../implementations/sentencepiece_unigram.py  | 16 ++++++++--
 .../py_src/tokenizers/models/__init__.pyi     |  6 ++--
 bindings/python/tests/bindings/test_models.py |  5 ++-
 .../python/tests/bindings/test_processors.py  |  7 ++--
 .../python/tests/bindings/test_tokenizer.py   |  9 ++++--
 .../implementations/test_byte_level_bpe.py    |  5 ++-
 11 files changed, 66 insertions(+), 32 deletions(-)

diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml
index 43aa357b..b7baee2f 100644
--- a/.github/workflows/python.yml
+++ b/.github/workflows/python.yml
@@ -25,7 +25,7 @@ jobs:
         architecture: "x64"
 
       - name: Install dependencies
-        run: pip install black==19.10b0
+        run: pip install black==20.8b1
 
       - name: Check style
         working-directory: ./bindings/python
diff --git a/bindings/python/examples/example.py b/bindings/python/examples/example.py
index 7382c3ac..e9fbe8f8 100644
--- a/bindings/python/examples/example.py
+++ b/bindings/python/examples/example.py
@@ -70,13 +70,17 @@ elif args.type == "bert":
     tok_r = Tokenizer(WordPiece(args.vocab, unk_token="[UNK]", max_input_chars_per_word=100))
 
     tok_r.normalizer = BertNormalizer(
-        clean_text=True, handle_chinese_chars=True, strip_accents=True, lowercase=True,
+        clean_text=True,
+        handle_chinese_chars=True,
+        strip_accents=True,
+        lowercase=True,
     )
     # tok_r.pre_tokenizer = pre_tokenizers.Whitespace()
     tok_r.pre_tokenizer = pre_tokenizers.BertPreTokenizer()
     tok_r.decoder = decoders.WordPiece()
     tok_r.post_processor = BertProcessing(
-        ("[SEP]", tok_r.token_to_id("[SEP]")), ("[CLS]", tok_r.token_to_id("[CLS]")),
+        ("[SEP]", tok_r.token_to_id("[SEP]")),
+        ("[CLS]", tok_r.token_to_id("[CLS]")),
     )
 else:
     raise Exception(f"Unknown type {args.type}")
diff --git a/bindings/python/examples/train_bert_wordpiece.py b/bindings/python/examples/train_bert_wordpiece.py
index c31146b8..37ab9581 100644
--- a/bindings/python/examples/train_bert_wordpiece.py
+++ b/bindings/python/examples/train_bert_wordpiece.py
@@ -32,7 +32,10 @@ if not files:
 
 # Initialize an empty tokenizer
 tokenizer = BertWordPieceTokenizer(
-    clean_text=True, handle_chinese_chars=True, strip_accents=True, lowercase=True,
+    clean_text=True,
+    handle_chinese_chars=True,
+    strip_accents=True,
+    lowercase=True,
 )
 
 # And then train
diff --git a/bindings/python/py_src/tokenizers/__init__.py b/bindings/python/py_src/tokenizers/__init__.py
index c3a37475..d43e3a35 100644
--- a/bindings/python/py_src/tokenizers/__init__.py
+++ b/bindings/python/py_src/tokenizers/__init__.py
@@ -9,7 +9,8 @@ TextInputSequence = str
 PreTokenizedInputSequence = Union[List[str], Tuple[str]]
 TextEncodeInput = Union[TextInputSequence, Tuple[TextInputSequence, TextInputSequence]]
 PreTokenizedEncodeInput = Union[
-    PreTokenizedInputSequence, Tuple[PreTokenizedInputSequence, PreTokenizedInputSequence],
+    PreTokenizedInputSequence,
+    Tuple[PreTokenizedInputSequence, PreTokenizedInputSequence],
 ]
 
 InputSequence = Union[TextInputSequence, PreTokenizedInputSequence]
diff --git a/bindings/python/py_src/tokenizers/__init__.pyi b/bindings/python/py_src/tokenizers/__init__.pyi
index 67173b5e..7bb38821 100644
--- a/bindings/python/py_src/tokenizers/__init__.pyi
+++ b/bindings/python/py_src/tokenizers/__init__.pyi
@@ -21,7 +21,8 @@ TextInputSequence = str
 PreTokenizedInputSequence = Union[List[str], Tuple[str]]
 TextEncodeInput = Union[TextInputSequence, Tuple[TextInputSequence, TextInputSequence]]
 PreTokenizedEncodeInput = Union[
-    PreTokenizedInputSequence, Tuple[PreTokenizedInputSequence, PreTokenizedInputSequence],
+    PreTokenizedInputSequence,
+    Tuple[PreTokenizedInputSequence, PreTokenizedInputSequence],
 ]
 
 InputSequence = Union[TextInputSequence, PreTokenizedInputSequence]
@@ -52,7 +53,7 @@ Range = Union[int, Tuple[int, int], slice]
 Pattern = Union[str, Regex]
 
 class PreTokenizedString:
-    """ PreTokenizedString
+    """PreTokenizedString
 
     Wrapper over a string, that provides a way to normalize, pre-tokenize, tokenize the
     underlying string, while keeping track of the alignment information (offsets).
@@ -66,7 +67,7 @@ class PreTokenizedString:
     """
 
     def __new__(sequence: str) -> PreTokenizedString:
-        """ Instantiate a new PreTokenizedString using the given str
+        """Instantiate a new PreTokenizedString using the given str
 
         Args:
             sequence: str:
@@ -74,7 +75,7 @@ class PreTokenizedString:
         """
        pass
     def split(self, func: Callable[[index, NormalizedString], List[NormalizedString]]):
-        """ Split the PreTokenizedString using the given `func`
+        """Split the PreTokenizedString using the given `func`
 
         Args:
             func: Callable[[index, NormalizedString], List[NormalizedString]]:
@@ -87,7 +88,7 @@ class PreTokenizedString:
         """
         pass
     def normalize(self, func: Callable[[NormalizedString], None]):
-        """ Normalize each split of the `PreTokenizedString` using the given `func`
+        """Normalize each split of the `PreTokenizedString` using the given `func`
 
         Args:
             func: Callable[[NormalizedString], None]:
@@ -97,7 +98,7 @@ class PreTokenizedString:
         """
         pass
     def tokenize(self, func: Callable[[str], List[Token]]):
-        """ Tokenize each split of the `PreTokenizedString` using the given `func`
+        """Tokenize each split of the `PreTokenizedString` using the given `func`
 
         Args:
             func: Callable[[str], List[Token]]:
@@ -106,7 +107,7 @@ class PreTokenizedString:
         """
         pass
     def to_encoding(self, type_id: int = 0, word_idx: Optional[int] = None) -> Encoding:
-        """ Return an Encoding generated from this PreTokenizedString
+        """Return an Encoding generated from this PreTokenizedString
 
         Args:
             type_id: int = 0:
@@ -126,7 +127,7 @@ class PreTokenizedString:
         offset_referential: OffsetReferential = OffsetReferential.ORIGINAL,
         offset_type: OffsetType = OffsetType.CHAR,
     ) -> List[Split]:
-        """ Get the splits currently managed by the PreTokenizedString
+        """Get the splits currently managed by the PreTokenizedString
 
         Args:
             offset_referential: OffsetReferential:
@@ -145,7 +146,7 @@ class PreTokenizedString:
         pass
 
 class NormalizedString:
-    """ NormalizedString
+    """NormalizedString
 
     A NormalizedString takes care of modifying an "original" string, to obtain a "normalized" one.
     While making all the requested modifications, it keeps track of the alignment information
@@ -153,7 +154,7 @@ class NormalizedString:
     """
 
     def __new__(sequence: str) -> NormalizedString:
-        """ Instantiate a new NormalizedString using the given str
+        """Instantiate a new NormalizedString using the given str
 
         Args:
             sequence: str:
@@ -214,14 +215,14 @@ class NormalizedString:
         """ Calls the given function for each character of the string """
         pass
     def map(self, func: Callable[[str], str]):
-        """ Calls the given function for each character of the string
+        """Calls the given function for each character of the string
 
         Replaces each character of the string using the returned value. Each returned value
        **must** be a str of length 1 (ie a character).
         """
         pass
     def split(self, pattern: Pattern, behavior: SplitDelimiterBehavior) -> List[NormalizedString]:
-        """ Split the NormalizedString using the given pattern and the specified behavior
+        """Split the NormalizedString using the given pattern and the specified behavior
 
         Args:
             pattern: Pattern:
@@ -235,7 +236,7 @@ class NormalizedString:
         """
         pass
     def replace(self, pattern: Pattern, content: str):
-        """ Replace the content of the given pattern with the provided content
+        """Replace the content of the given pattern with the provided content
 
         Args:
             pattern: Pattern:
@@ -827,7 +828,10 @@ class Tokenizer:
         """
         pass
     def post_process(
-        self, encoding: Encoding, pair: Optional[Encoding] = None, add_special_tokens: bool = True,
+        self,
+        encoding: Encoding,
+        pair: Optional[Encoding] = None,
+        add_special_tokens: bool = True,
     ) -> Encoding:
         """Apply all the post-processing steps to the given encodings.
 
diff --git a/bindings/python/py_src/tokenizers/implementations/sentencepiece_unigram.py b/bindings/python/py_src/tokenizers/implementations/sentencepiece_unigram.py
index 1cbc9c3a..81f35e22 100644
--- a/bindings/python/py_src/tokenizers/implementations/sentencepiece_unigram.py
+++ b/bindings/python/py_src/tokenizers/implementations/sentencepiece_unigram.py
@@ -21,7 +21,10 @@ class SentencePieceUnigramTokenizer(BaseTokenizer):
     """
 
     def __init__(
-        self, vocab: Optional[str] = None, replacement: str = "▁", add_prefix_space: bool = True,
+        self,
+        vocab: Optional[str] = None,
+        replacement: str = "▁",
+        add_prefix_space: bool = True,
     ):
         if vocab is not None:
             # Let Unigram(..) fail if only one of them is None
@@ -29,7 +32,12 @@ class SentencePieceUnigramTokenizer(BaseTokenizer):
         else:
             tokenizer = Tokenizer(Unigram())
 
-        tokenizer.normalizer = normalizers.Sequence([normalizers.Nmt(), normalizers.NFKC(),])
+        tokenizer.normalizer = normalizers.Sequence(
+            [
+                normalizers.Nmt(),
+                normalizers.NFKC(),
+            ]
+        )
         tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
             [
                 pre_tokenizers.WhitespaceSplit(),
@@ -60,7 +68,9 @@ class SentencePieceUnigramTokenizer(BaseTokenizer):
         """ Train the model using the given files """
 
         trainer = trainers.UnigramTrainer(
-            vocab_size=vocab_size, special_tokens=special_tokens, show_progress=show_progress,
+            vocab_size=vocab_size,
+            special_tokens=special_tokens,
+            show_progress=show_progress,
         )
 
         if isinstance(files, str):
diff --git a/bindings/python/py_src/tokenizers/models/__init__.pyi b/bindings/python/py_src/tokenizers/models/__init__.pyi
index 4b61d133..2b49496c 100644
--- a/bindings/python/py_src/tokenizers/models/__init__.pyi
+++ b/bindings/python/py_src/tokenizers/models/__init__.pyi
@@ -78,7 +78,7 @@ class BPE(Model):
     def from_file(vocab_filename: str, merges_filename: str, **kwargs) -> BPE:
         """ Convenient method to intialize a BPE from files
 
-        Roughly equivalent to 
+        Roughly equivalent to
 
         def from_file(vocab_filename, merges_filenames, **kwargs):
             vocab, merges = BPE.read_file(vocab_filename, merges_filename)
@@ -116,7 +116,7 @@ class WordPiece(Model):
     def from_file(vocab_filename: str, **kwargs) -> WordPiece:
         """ Convenient method to intialize a WordPiece from file
 
-        Roughly equivalent to 
+        Roughly equivalent to
 
         def from_file(vocab_filename, **kwargs):
             vocab, merges = WordPiece.read_file(vocab_filename)
@@ -147,7 +147,7 @@ class WordLevel(Model):
     def from_file(vocab_filename: str, **kwargs) -> WordLevelg:
         """ Convenient method to intialize a WordLevelg from file
 
-        Roughly equivalent to 
+        Roughly equivalent to
 
         def from_file(vocab_filename, **kwargs):
             vocab, merges = WordLevelg.read_file(vocab_filename)
diff --git a/bindings/python/tests/bindings/test_models.py b/bindings/python/tests/bindings/test_models.py
index 952c3134..0a0403f0 100644
--- a/bindings/python/tests/bindings/test_models.py
+++ b/bindings/python/tests/bindings/test_models.py
@@ -19,7 +19,10 @@ class TestBPE:
             BPE(vocab=vocab)
             BPE(merges=merges)
 
-        assert isinstance(pickle.loads(pickle.dumps(BPE(vocab, merges))), BPE,)
+        assert isinstance(
+            pickle.loads(pickle.dumps(BPE(vocab, merges))),
+            BPE,
+        )
 
         # Deprecated calls in 0.9
         with pytest.deprecated_call():
diff --git a/bindings/python/tests/bindings/test_processors.py b/bindings/python/tests/bindings/test_processors.py
index 30c9ec5b..12abc156 100644
--- a/bindings/python/tests/bindings/test_processors.py
+++ b/bindings/python/tests/bindings/test_processors.py
@@ -22,7 +22,8 @@ class TestBertProcessing:
         assert isinstance(processor, PostProcessor)
         assert isinstance(processor, BertProcessing)
         assert isinstance(
-            pickle.loads(pickle.dumps(BertProcessing(("[SEP]", 0), ("[CLS]", 1)))), BertProcessing,
+            pickle.loads(pickle.dumps(BertProcessing(("[SEP]", 0), ("[CLS]", 1)))),
+            BertProcessing,
         )
 
     def test_processing(self):
@@ -94,7 +95,9 @@ class TestTemplateProcessing:
 
     def get_roberta(self):
         return TemplateProcessing(
-            seq_a="<s> $0 </s>", seq_b="</s> $0 </s>", special_tokens=[("<s>", 0), ("</s>", 1)],
+            seq_a="<s> $0 </s>",
+            seq_b="</s> $0 </s>",
+            special_tokens=[("<s>", 0), ("</s>", 1)],
         )
 
     def get_t5_squad(self):
diff --git a/bindings/python/tests/bindings/test_tokenizer.py b/bindings/python/tests/bindings/test_tokenizer.py
index 54b498eb..8ad5401d 100644
--- a/bindings/python/tests/bindings/test_tokenizer.py
+++ b/bindings/python/tests/bindings/test_tokenizer.py
@@ -232,10 +232,12 @@ class TestTokenizer:
 
         # Numpy
         test_single(
-            np.array([["My", "name", "is", "John"], ["My", "name", "is", "Georges"]]), True,
+            np.array([["My", "name", "is", "John"], ["My", "name", "is", "Georges"]]),
+            True,
         )
         test_single(
-            np.array((("My", "name", "is", "John"), ("My", "name", "is", "Georges"))), True,
+            np.array((("My", "name", "is", "John"), ("My", "name", "is", "Georges"))),
+            True,
         )
         test_pair(
             np.array(
@@ -276,7 +278,8 @@ class TestTokenizer:
 
         tokenizer.pre_tokenizer = ByteLevel(add_prefix_space=True)
         tokenizer.post_processor = RobertaProcessing(
-            ("</s>", tokenizer.token_to_id("</s>")), ("<s>", tokenizer.token_to_id("<s>")),
+            ("</s>", tokenizer.token_to_id("</s>")),
+            ("<s>", tokenizer.token_to_id("<s>")),
         )
 
         # Can encode with special tokens
diff --git a/bindings/python/tests/implementations/test_byte_level_bpe.py b/bindings/python/tests/implementations/test_byte_level_bpe.py
index 68a01209..afa28fdc 100644
--- a/bindings/python/tests/implementations/test_byte_level_bpe.py
+++ b/bindings/python/tests/implementations/test_byte_level_bpe.py
@@ -65,7 +65,10 @@ class TestByteLevelBPE:
 
     def test_lowerspace(self, roberta_files):
         tokenizer = ByteLevelBPETokenizer.from_file(
-            roberta_files["vocab"], roberta_files["merges"], add_prefix_space=True, lowercase=True,
+            roberta_files["vocab"],
+            roberta_files["merges"],
+            add_prefix_space=True,
+            lowercase=True,
         )
         output = tokenizer.encode("The Quick Brown Fox Jumps Over The Lazy Dog")
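
Context for the formatting churn above (not part of the patch itself): beyond bumping the pin in the CI workflow, nearly every hunk follows from two rules black 20.8b1 applies and 19.10b0 did not. A trailing comma inside brackets is now treated as "magic", so the enclosing call is exploded to one element per line even when it would fit within the line length, and docstrings are normalized by stripping the space after the opening quotes and any trailing whitespace. The sketch below illustrates the trailing-comma rule with a made-up `configure` helper (it is not part of the tokenizers API); both calls build the same value, since only the layout differs.

    def configure(clean_text, handle_chinese_chars, strip_accents, lowercase):
        """Stand-in for calls such as BertNormalizer(...) in the hunks above."""
        return dict(
            clean_text=clean_text,
            handle_chinese_chars=handle_chinese_chars,
            strip_accents=strip_accents,
            lowercase=lowercase,
        )

    # black 19.10b0 packed the arguments onto one continuation line because they
    # fit within the 88-column limit, trailing comma or not:
    old_style = configure(
        clean_text=True, handle_chinese_chars=True, strip_accents=True, lowercase=True,
    )

    # black 20.8b1 sees the trailing comma and forces one argument per line,
    # which is the shape of the `+` lines throughout this patch:
    new_style = configure(
        clean_text=True,
        handle_chinese_chars=True,
        strip_accents=True,
        lowercase=True,
    )

    assert old_style == new_style  # the upgrade changes layout, not behaviour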