diff --git a/bindings/python/py_src/tokenizers/__init__.py b/bindings/python/py_src/tokenizers/__init__.py
index c3a37475..d43e3a35 100644
--- a/bindings/python/py_src/tokenizers/__init__.py
+++ b/bindings/python/py_src/tokenizers/__init__.py
@@ -9,7 +9,8 @@ TextInputSequence = str
 PreTokenizedInputSequence = Union[List[str], Tuple[str]]
 TextEncodeInput = Union[TextInputSequence, Tuple[TextInputSequence, TextInputSequence]]
 PreTokenizedEncodeInput = Union[
-    PreTokenizedInputSequence, Tuple[PreTokenizedInputSequence, PreTokenizedInputSequence],
+    PreTokenizedInputSequence,
+    Tuple[PreTokenizedInputSequence, PreTokenizedInputSequence],
 ]
 
 InputSequence = Union[TextInputSequence, PreTokenizedInputSequence]
diff --git a/bindings/python/py_src/tokenizers/__init__.pyi b/bindings/python/py_src/tokenizers/__init__.pyi
index 67173b5e..7a086bda 100644
--- a/bindings/python/py_src/tokenizers/__init__.pyi
+++ b/bindings/python/py_src/tokenizers/__init__.pyi
@@ -21,7 +21,8 @@ TextInputSequence = str
 PreTokenizedInputSequence = Union[List[str], Tuple[str]]
 TextEncodeInput = Union[TextInputSequence, Tuple[TextInputSequence, TextInputSequence]]
 PreTokenizedEncodeInput = Union[
-    PreTokenizedInputSequence, Tuple[PreTokenizedInputSequence, PreTokenizedInputSequence],
+    PreTokenizedInputSequence,
+    Tuple[PreTokenizedInputSequence, PreTokenizedInputSequence],
 ]
 
 InputSequence = Union[TextInputSequence, PreTokenizedInputSequence]
@@ -827,7 +828,10 @@ class Tokenizer:
         """
         pass
     def post_process(
-        self, encoding: Encoding, pair: Optional[Encoding] = None, add_special_tokens: bool = True,
+        self,
+        encoding: Encoding,
+        pair: Optional[Encoding] = None,
+        add_special_tokens: bool = True,
     ) -> Encoding:
         """Apply all the post-processing steps to the given encodings.
 
diff --git a/bindings/python/py_src/tokenizers/implementations/sentencepiece_unigram.py b/bindings/python/py_src/tokenizers/implementations/sentencepiece_unigram.py
index 1cbc9c3a..81f35e22 100644
--- a/bindings/python/py_src/tokenizers/implementations/sentencepiece_unigram.py
+++ b/bindings/python/py_src/tokenizers/implementations/sentencepiece_unigram.py
@@ -21,7 +21,10 @@ class SentencePieceUnigramTokenizer(BaseTokenizer):
     """
 
     def __init__(
-        self, vocab: Optional[str] = None, replacement: str = "▁", add_prefix_space: bool = True,
+        self,
+        vocab: Optional[str] = None,
+        replacement: str = "▁",
+        add_prefix_space: bool = True,
     ):
         if vocab is not None:
             # Let Unigram(..) fail if only one of them is None
@@ -29,7 +32,12 @@ class SentencePieceUnigramTokenizer(BaseTokenizer):
         else:
             tokenizer = Tokenizer(Unigram())
 
-        tokenizer.normalizer = normalizers.Sequence([normalizers.Nmt(), normalizers.NFKC(),])
+        tokenizer.normalizer = normalizers.Sequence(
+            [
+                normalizers.Nmt(),
+                normalizers.NFKC(),
+            ]
+        )
         tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
             [
                 pre_tokenizers.WhitespaceSplit(),
@@ -60,7 +68,9 @@ class SentencePieceUnigramTokenizer(BaseTokenizer):
         """ Train the model using the given files """
 
         trainer = trainers.UnigramTrainer(
-            vocab_size=vocab_size, special_tokens=special_tokens, show_progress=show_progress,
+            vocab_size=vocab_size,
+            special_tokens=special_tokens,
+            show_progress=show_progress,
         )
 
         if isinstance(files, str):
diff --git a/bindings/python/tests/bindings/test_models.py b/bindings/python/tests/bindings/test_models.py
index 7dfef0b6..45f68bcd 100644
--- a/bindings/python/tests/bindings/test_models.py
+++ b/bindings/python/tests/bindings/test_models.py
@@ -18,7 +18,10 @@ class TestBPE:
         BPE(vocab=vocab)
         BPE(merges=merges)
 
-        assert isinstance(pickle.loads(pickle.dumps(BPE(vocab, merges))), BPE,)
+        assert isinstance(
+            pickle.loads(pickle.dumps(BPE(vocab, merges))),
+            BPE,
+        )
 
         # Deprecated calls in 0.9
         with pytest.deprecated_call():
diff --git a/bindings/python/tests/bindings/test_processors.py b/bindings/python/tests/bindings/test_processors.py
index 30c9ec5b..12abc156 100644
--- a/bindings/python/tests/bindings/test_processors.py
+++ b/bindings/python/tests/bindings/test_processors.py
@@ -22,7 +22,8 @@ class TestBertProcessing:
         assert isinstance(processor, PostProcessor)
         assert isinstance(processor, BertProcessing)
         assert isinstance(
-            pickle.loads(pickle.dumps(BertProcessing(("[SEP]", 0), ("[CLS]", 1)))), BertProcessing,
+            pickle.loads(pickle.dumps(BertProcessing(("[SEP]", 0), ("[CLS]", 1)))),
+            BertProcessing,
         )
 
     def test_processing(self):
@@ -94,7 +95,9 @@ class TestTemplateProcessing:
 
     def get_roberta(self):
         return TemplateProcessing(
-            seq_a="<s> $0 </s>", seq_b="</s> $0 </s>", special_tokens=[("<s>", 0), ("</s>", 1)],
+            seq_a="<s> $0 </s>",
+            seq_b="</s> $0 </s>",
+            special_tokens=[("<s>", 0), ("</s>", 1)],
         )
 
     def get_t5_squad(self):