From a41090305147d3cd70d8fc08799f61d586bc6ec2 Mon Sep 17 00:00:00 2001
From: Nicolas Patry
Date: Thu, 24 Sep 2020 15:00:20 +0200
Subject: [PATCH] Upgrading to black 20.8b1

---
 .github/workflows/python.yml                  |  2 +-
 bindings/python/examples/example.py           |  8 +++--
 .../python/examples/train_bert_wordpiece.py   |  5 ++-
 bindings/python/py_src/tokenizers/__init__.py |  3 +-
 .../python/py_src/tokenizers/__init__.pyi     | 32 +++++++++++--------
 .../implementations/sentencepiece_unigram.py  | 16 ++++++++--
 .../py_src/tokenizers/models/__init__.pyi     |  6 ++--
 bindings/python/tests/bindings/test_models.py |  5 ++-
 .../python/tests/bindings/test_processors.py  |  7 ++--
 .../python/tests/bindings/test_tokenizer.py   |  9 ++++--
 .../implementations/test_byte_level_bpe.py    |  5 ++-
 11 files changed, 66 insertions(+), 32 deletions(-)

diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml
index 43aa357b..b7baee2f 100644
--- a/.github/workflows/python.yml
+++ b/.github/workflows/python.yml
@@ -25,7 +25,7 @@ jobs:
         architecture: "x64"
 
       - name: Install dependencies
-        run: pip install black==19.10b0
+        run: pip install black==20.8b1
 
       - name: Check style
         working-directory: ./bindings/python
diff --git a/bindings/python/examples/example.py b/bindings/python/examples/example.py
index 7382c3ac..e9fbe8f8 100644
--- a/bindings/python/examples/example.py
+++ b/bindings/python/examples/example.py
@@ -70,13 +70,17 @@ elif args.type == "bert":
     tok_r = Tokenizer(WordPiece(args.vocab, unk_token="[UNK]", max_input_chars_per_word=100))
 
     tok_r.normalizer = BertNormalizer(
-        clean_text=True, handle_chinese_chars=True, strip_accents=True, lowercase=True,
+        clean_text=True,
+        handle_chinese_chars=True,
+        strip_accents=True,
+        lowercase=True,
     )
     # tok_r.pre_tokenizer = pre_tokenizers.Whitespace()
     tok_r.pre_tokenizer = pre_tokenizers.BertPreTokenizer()
     tok_r.decoder = decoders.WordPiece()
     tok_r.post_processor = BertProcessing(
-        ("[SEP]", tok_r.token_to_id("[SEP]")), ("[CLS]", tok_r.token_to_id("[CLS]")),
+        ("[SEP]", tok_r.token_to_id("[SEP]")),
+        ("[CLS]", tok_r.token_to_id("[CLS]")),
     )
 else:
     raise Exception(f"Unknown type {args.type}")
diff --git a/bindings/python/examples/train_bert_wordpiece.py b/bindings/python/examples/train_bert_wordpiece.py
index c31146b8..37ab9581 100644
--- a/bindings/python/examples/train_bert_wordpiece.py
+++ b/bindings/python/examples/train_bert_wordpiece.py
@@ -32,7 +32,10 @@ if not files:
 
 # Initialize an empty tokenizer
 tokenizer = BertWordPieceTokenizer(
-    clean_text=True, handle_chinese_chars=True, strip_accents=True, lowercase=True,
+    clean_text=True,
+    handle_chinese_chars=True,
+    strip_accents=True,
+    lowercase=True,
 )
 
 # And then train
diff --git a/bindings/python/py_src/tokenizers/__init__.py b/bindings/python/py_src/tokenizers/__init__.py
index c3a37475..d43e3a35 100644
--- a/bindings/python/py_src/tokenizers/__init__.py
+++ b/bindings/python/py_src/tokenizers/__init__.py
@@ -9,7 +9,8 @@ TextInputSequence = str
 PreTokenizedInputSequence = Union[List[str], Tuple[str]]
 TextEncodeInput = Union[TextInputSequence, Tuple[TextInputSequence, TextInputSequence]]
 PreTokenizedEncodeInput = Union[
-    PreTokenizedInputSequence, Tuple[PreTokenizedInputSequence, PreTokenizedInputSequence],
+    PreTokenizedInputSequence,
+    Tuple[PreTokenizedInputSequence, PreTokenizedInputSequence],
 ]
 
 InputSequence = Union[TextInputSequence, PreTokenizedInputSequence]
diff --git a/bindings/python/py_src/tokenizers/__init__.pyi b/bindings/python/py_src/tokenizers/__init__.pyi
index 67173b5e..7bb38821 100644
--- a/bindings/python/py_src/tokenizers/__init__.pyi
+++ b/bindings/python/py_src/tokenizers/__init__.pyi
@@ -21,7 +21,8 @@ TextInputSequence = str
 PreTokenizedInputSequence = Union[List[str], Tuple[str]]
 TextEncodeInput = Union[TextInputSequence, Tuple[TextInputSequence, TextInputSequence]]
 PreTokenizedEncodeInput = Union[
-    PreTokenizedInputSequence, Tuple[PreTokenizedInputSequence, PreTokenizedInputSequence],
+    PreTokenizedInputSequence,
+    Tuple[PreTokenizedInputSequence, PreTokenizedInputSequence],
 ]
 
 InputSequence = Union[TextInputSequence, PreTokenizedInputSequence]
@@ -52,7 +53,7 @@ Range = Union[int, Tuple[int, int], slice]
 Pattern = Union[str, Regex]
 
 class PreTokenizedString:
-    """ PreTokenizedString
+    """PreTokenizedString
 
     Wrapper over a string, that provides a way to normalize, pre-tokenize, tokenize the
     underlying string, while keeping track of the alignment information (offsets).
@@ -66,7 +67,7 @@ class PreTokenizedString:
     """
 
     def __new__(sequence: str) -> PreTokenizedString:
-        """ Instantiate a new PreTokenizedString using the given str
+        """Instantiate a new PreTokenizedString using the given str
 
         Args:
             sequence: str:
@@ -74,7 +75,7 @@ class PreTokenizedString:
         """
        pass
     def split(self, func: Callable[[index, NormalizedString], List[NormalizedString]]):
-        """ Split the PreTokenizedString using the given `func`
+        """Split the PreTokenizedString using the given `func`
 
         Args:
             func: Callable[[index, NormalizedString], List[NormalizedString]]:
@@ -87,7 +88,7 @@ class PreTokenizedString:
         """
         pass
     def normalize(self, func: Callable[[NormalizedString], None]):
-        """ Normalize each split of the `PreTokenizedString` using the given `func`
+        """Normalize each split of the `PreTokenizedString` using the given `func`
 
         Args:
             func: Callable[[NormalizedString], None]:
@@ -97,7 +98,7 @@ class PreTokenizedString:
         """
         pass
     def tokenize(self, func: Callable[[str], List[Token]]):
-        """ Tokenize each split of the `PreTokenizedString` using the given `func`
+        """Tokenize each split of the `PreTokenizedString` using the given `func`
 
         Args:
             func: Callable[[str], List[Token]]:
@@ -106,7 +107,7 @@ class PreTokenizedString:
         """
         pass
     def to_encoding(self, type_id: int = 0, word_idx: Optional[int] = None) -> Encoding:
-        """ Return an Encoding generated from this PreTokenizedString
+        """Return an Encoding generated from this PreTokenizedString
 
         Args:
             type_id: int = 0:
@@ -126,7 +127,7 @@ class PreTokenizedString:
         offset_referential: OffsetReferential = OffsetReferential.ORIGINAL,
         offset_type: OffsetType = OffsetType.CHAR,
     ) -> List[Split]:
-        """ Get the splits currently managed by the PreTokenizedString
+        """Get the splits currently managed by the PreTokenizedString
 
         Args:
             offset_referential: OffsetReferential:
@@ -145,7 +146,7 @@ class PreTokenizedString:
         pass
 
 class NormalizedString:
-    """ NormalizedString
+    """NormalizedString
 
     A NormalizedString takes care of modifying an "original" string, to obtain a "normalized" one.
     While making all the requested modifications, it keeps track of the alignment information
@@ -153,7 +154,7 @@ class NormalizedString:
     """
 
     def __new__(sequence: str) -> NormalizedString:
-        """ Instantiate a new NormalizedString using the given str
+        """Instantiate a new NormalizedString using the given str
 
         Args:
             sequence: str:
@@ -214,14 +215,14 @@ class NormalizedString:
         """ Calls the given function for each character of the string """
         pass
     def map(self, func: Callable[[str], str]):
-        """ Calls the given function for each character of the string
+        """Calls the given function for each character of the string
 
         Replaces each character of the string using the returned value. Each returned value
        **must** be a str of length 1 (ie a character).
         """
         pass
     def split(self, pattern: Pattern, behavior: SplitDelimiterBehavior) -> List[NormalizedString]:
-        """ Split the NormalizedString using the given pattern and the specified behavior
+        """Split the NormalizedString using the given pattern and the specified behavior
 
         Args:
             pattern: Pattern:
@@ -235,7 +236,7 @@ class NormalizedString:
         """
         pass
     def replace(self, pattern: Pattern, content: str):
-        """ Replace the content of the given pattern with the provided content
+        """Replace the content of the given pattern with the provided content
 
         Args:
             pattern: Pattern:
@@ -827,7 +828,10 @@ class Tokenizer:
         """
         pass
     def post_process(
-        self, encoding: Encoding, pair: Optional[Encoding] = None, add_special_tokens: bool = True,
+        self,
+        encoding: Encoding,
+        pair: Optional[Encoding] = None,
+        add_special_tokens: bool = True,
     ) -> Encoding:
         """Apply all the post-processing steps to the given encodings.
 
diff --git a/bindings/python/py_src/tokenizers/implementations/sentencepiece_unigram.py b/bindings/python/py_src/tokenizers/implementations/sentencepiece_unigram.py
index 1cbc9c3a..81f35e22 100644
--- a/bindings/python/py_src/tokenizers/implementations/sentencepiece_unigram.py
+++ b/bindings/python/py_src/tokenizers/implementations/sentencepiece_unigram.py
@@ -21,7 +21,10 @@ class SentencePieceUnigramTokenizer(BaseTokenizer):
     """
 
     def __init__(
-        self, vocab: Optional[str] = None, replacement: str = "▁", add_prefix_space: bool = True,
+        self,
+        vocab: Optional[str] = None,
+        replacement: str = "▁",
+        add_prefix_space: bool = True,
     ):
         if vocab is not None:
             # Let Unigram(..) fail if only one of them is None
@@ -29,7 +32,12 @@ class SentencePieceUnigramTokenizer(BaseTokenizer):
         else:
             tokenizer = Tokenizer(Unigram())
 
-        tokenizer.normalizer = normalizers.Sequence([normalizers.Nmt(), normalizers.NFKC(),])
+        tokenizer.normalizer = normalizers.Sequence(
+            [
+                normalizers.Nmt(),
+                normalizers.NFKC(),
+            ]
+        )
         tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
             [
                 pre_tokenizers.WhitespaceSplit(),
@@ -60,7 +68,9 @@ class SentencePieceUnigramTokenizer(BaseTokenizer):
         """ Train the model using the given files """
 
         trainer = trainers.UnigramTrainer(
-            vocab_size=vocab_size, special_tokens=special_tokens, show_progress=show_progress,
+            vocab_size=vocab_size,
+            special_tokens=special_tokens,
+            show_progress=show_progress,
         )
 
         if isinstance(files, str):
diff --git a/bindings/python/py_src/tokenizers/models/__init__.pyi b/bindings/python/py_src/tokenizers/models/__init__.pyi
index 4b61d133..2b49496c 100644
--- a/bindings/python/py_src/tokenizers/models/__init__.pyi
+++ b/bindings/python/py_src/tokenizers/models/__init__.pyi
@@ -78,7 +78,7 @@ class BPE(Model):
     def from_file(vocab_filename: str, merges_filename: str, **kwargs) -> BPE:
         """ Convenient method to intialize a BPE from files
 
-        Roughly equivalent to 
+        Roughly equivalent to
 
         def from_file(vocab_filename, merges_filenames, **kwargs):
             vocab, merges = BPE.read_file(vocab_filename, merges_filename)
@@ -116,7 +116,7 @@ class WordPiece(Model):
     def from_file(vocab_filename: str, **kwargs) -> WordPiece:
         """ Convenient method to intialize a WordPiece from file
 
-        Roughly equivalent to 
+        Roughly equivalent to
 
         def from_file(vocab_filename, **kwargs):
             vocab, merges = WordPiece.read_file(vocab_filename)
@@ -147,7 +147,7 @@ class WordLevel(Model):
     def from_file(vocab_filename: str, **kwargs) -> WordLevelg:
         """ Convenient method to intialize a WordLevelg from file
 
-        Roughly equivalent to 
+        Roughly equivalent to
 
         def from_file(vocab_filename, **kwargs):
             vocab, merges = WordLevelg.read_file(vocab_filename)
diff --git a/bindings/python/tests/bindings/test_models.py b/bindings/python/tests/bindings/test_models.py
index 952c3134..0a0403f0 100644
--- a/bindings/python/tests/bindings/test_models.py
+++ b/bindings/python/tests/bindings/test_models.py
@@ -19,7 +19,10 @@ class TestBPE:
             BPE(vocab=vocab)
             BPE(merges=merges)
 
-        assert isinstance(pickle.loads(pickle.dumps(BPE(vocab, merges))), BPE,)
+        assert isinstance(
+            pickle.loads(pickle.dumps(BPE(vocab, merges))),
+            BPE,
+        )
 
         # Deprecated calls in 0.9
         with pytest.deprecated_call():
diff --git a/bindings/python/tests/bindings/test_processors.py b/bindings/python/tests/bindings/test_processors.py
index 30c9ec5b..12abc156 100644
--- a/bindings/python/tests/bindings/test_processors.py
+++ b/bindings/python/tests/bindings/test_processors.py
@@ -22,7 +22,8 @@ class TestBertProcessing:
         assert isinstance(processor, PostProcessor)
         assert isinstance(processor, BertProcessing)
         assert isinstance(
-            pickle.loads(pickle.dumps(BertProcessing(("[SEP]", 0), ("[CLS]", 1)))), BertProcessing,
+            pickle.loads(pickle.dumps(BertProcessing(("[SEP]", 0), ("[CLS]", 1)))),
+            BertProcessing,
         )
 
     def test_processing(self):
@@ -94,7 +95,9 @@ class TestTemplateProcessing:
 
     def get_roberta(self):
         return TemplateProcessing(
-            seq_a="<s> $0 </s>", seq_b="</s> $0 </s>", special_tokens=[("<s>", 0), ("</s>", 1)],
+            seq_a="<s> $0 </s>",
+            seq_b="</s> $0 </s>",
+            special_tokens=[("<s>", 0), ("</s>", 1)],
         )
 
     def get_t5_squad(self):
diff --git a/bindings/python/tests/bindings/test_tokenizer.py b/bindings/python/tests/bindings/test_tokenizer.py
index 54b498eb..8ad5401d 100644
--- a/bindings/python/tests/bindings/test_tokenizer.py
+++ b/bindings/python/tests/bindings/test_tokenizer.py
@@ -232,10 +232,12 @@ class TestTokenizer:
 
         # Numpy
         test_single(
-            np.array([["My", "name", "is", "John"], ["My", "name", "is", "Georges"]]), True,
+            np.array([["My", "name", "is", "John"], ["My", "name", "is", "Georges"]]),
+            True,
         )
         test_single(
-            np.array((("My", "name", "is", "John"), ("My", "name", "is", "Georges"))), True,
+            np.array((("My", "name", "is", "John"), ("My", "name", "is", "Georges"))),
+            True,
         )
         test_pair(
             np.array(
@@ -276,7 +278,8 @@ class TestTokenizer:
 
         tokenizer.pre_tokenizer = ByteLevel(add_prefix_space=True)
         tokenizer.post_processor = RobertaProcessing(
-            ("</s>", tokenizer.token_to_id("</s>")), ("<s>", tokenizer.token_to_id("<s>")),
+            ("</s>", tokenizer.token_to_id("</s>")),
+            ("<s>", tokenizer.token_to_id("<s>")),
         )
 
         # Can encode with special tokens
diff --git a/bindings/python/tests/implementations/test_byte_level_bpe.py b/bindings/python/tests/implementations/test_byte_level_bpe.py
index 68a01209..afa28fdc 100644
--- a/bindings/python/tests/implementations/test_byte_level_bpe.py
+++ b/bindings/python/tests/implementations/test_byte_level_bpe.py
@@ -65,7 +65,10 @@ class TestByteLevelBPE:
 
     def test_lowerspace(self, roberta_files):
         tokenizer = ByteLevelBPETokenizer.from_file(
-            roberta_files["vocab"], roberta_files["merges"], add_prefix_space=True, lowercase=True,
+            roberta_files["vocab"],
+            roberta_files["merges"],
+            add_prefix_space=True,
+            lowercase=True,
         )
         output = tokenizer.encode("The Quick Brown Fox Jumps Over The Lazy Dog")
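
Context for the formatting churn above (not part of the patch itself): beyond bumping the pin in the CI workflow, nearly every hunk follows from two rules black 20.8b1 applies and 19.10b0 did not. A trailing comma inside brackets is now treated as "magic", so the enclosing call is exploded to one element per line even when it would fit within the line length, and docstrings are normalized by stripping the space after the opening quotes and any trailing whitespace. The sketch below illustrates the trailing-comma rule with a made-up `configure` helper (it is not part of the tokenizers API); both calls build the same value, since only the layout differs.

    def configure(clean_text, handle_chinese_chars, strip_accents, lowercase):
        """Stand-in for calls such as BertNormalizer(...) in the hunks above."""
        return dict(
            clean_text=clean_text,
            handle_chinese_chars=handle_chinese_chars,
            strip_accents=strip_accents,
            lowercase=lowercase,
        )

    # black 19.10b0 packed the arguments onto one continuation line because they
    # fit within the 88-column limit, trailing comma or not:
    old_style = configure(
        clean_text=True, handle_chinese_chars=True, strip_accents=True, lowercase=True,
    )

    # black 20.8b1 sees the trailing comma and forces one argument per line,
    # which is the shape of the `+` lines throughout this patch:
    new_style = configure(
        clean_text=True,
        handle_chinese_chars=True,
        strip_accents=True,
        lowercase=True,
    )

    assert old_style == new_style  # the upgrade changes layout, not behaviour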