mirror of https://github.com/mii443/tokenizers.git
synced 2025-12-08 13:48:19 +00:00
Upgrading to black 20.8b1
committed by Anthony MOI
parent dc1d0711cf
commit a410903051
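
Most of this diff is mechanical. The docstring edits drop the space after the opening triple quotes, and the signature edits come from black 20.8b1's new "magic trailing comma" rule: any call, signature, or literal that ends in a trailing comma is exploded onto one line per element. A minimal sketch of the trailing-comma rule, with illustrative names:

    # Before: the argument list ends in a trailing comma, which black
    # 20.8b1 treats as a request to keep the call exploded.
    result = process(alpha, beta, gamma,)

    # After black 20.8b1: one element per line, trailing comma kept.
    result = process(
        alpha,
        beta,
        gamma,
    )
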
@@ -9,7 +9,8 @@ TextInputSequence = str
 PreTokenizedInputSequence = Union[List[str], Tuple[str]]
 TextEncodeInput = Union[TextInputSequence, Tuple[TextInputSequence, TextInputSequence]]
 PreTokenizedEncodeInput = Union[
-    PreTokenizedInputSequence, Tuple[PreTokenizedInputSequence, PreTokenizedInputSequence],
+    PreTokenizedInputSequence,
+    Tuple[PreTokenizedInputSequence, PreTokenizedInputSequence],
 ]
 
 InputSequence = Union[TextInputSequence, PreTokenizedInputSequence]
@@ -21,7 +21,8 @@ TextInputSequence = str
 PreTokenizedInputSequence = Union[List[str], Tuple[str]]
 TextEncodeInput = Union[TextInputSequence, Tuple[TextInputSequence, TextInputSequence]]
 PreTokenizedEncodeInput = Union[
-    PreTokenizedInputSequence, Tuple[PreTokenizedInputSequence, PreTokenizedInputSequence],
+    PreTokenizedInputSequence,
+    Tuple[PreTokenizedInputSequence, PreTokenizedInputSequence],
 ]
 
 InputSequence = Union[TextInputSequence, PreTokenizedInputSequence]
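
These aliases describe the input shapes Tokenizer.encode accepts. A quick illustration, assuming `tokenizer` is an already-trained tokenizers.Tokenizer:

    # TextInputSequence: a plain string.
    tokenizer.encode("Hello world")

    # PreTokenizedInputSequence: text already split into words.
    tokenizer.encode(["Hello", "world"], is_pretokenized=True)

    # TextEncodeInput: a pair of sequences, e.g. for question/context tasks.
    tokenizer.encode("A question?", "A context sentence.")
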
@@ -52,7 +53,7 @@ Range = Union[int, Tuple[int, int], slice]
 Pattern = Union[str, Regex]
 
 class PreTokenizedString:
-    """ PreTokenizedString
+    """PreTokenizedString
 
     Wrapper over a string, that provides a way to normalize, pre-tokenize, tokenize the
     underlying string, while keeping track of the alignment information (offsets).
@@ -66,7 +67,7 @@ class PreTokenizedString:
     """
 
     def __new__(sequence: str) -> PreTokenizedString:
-        """ Instantiate a new PreTokenizedString using the given str
+        """Instantiate a new PreTokenizedString using the given str
 
         Args:
             sequence: str:
@@ -74,7 +75,7 @@ class PreTokenizedString:
         """
         pass
     def split(self, func: Callable[[index, NormalizedString], List[NormalizedString]]):
-        """ Split the PreTokenizedString using the given `func`
+        """Split the PreTokenizedString using the given `func`
 
         Args:
             func: Callable[[index, NormalizedString], List[NormalizedString]]:
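
For orientation, `split` is the hook used by custom pre-tokenizers: the callback receives each split's index and its NormalizedString and returns the new sub-splits. A sketch under the assumption that slicing a NormalizedString is supported (as in the library's custom-component examples); the whitespace logic is purely illustrative:

    from typing import List
    from tokenizers import NormalizedString, PreTokenizedString

    def whitespace_split(i: int, normalized: NormalizedString) -> List[NormalizedString]:
        # Slicing a NormalizedString keeps offsets aligned with the original text.
        text = str(normalized)
        splits, start = [], 0
        for end, ch in enumerate(text):
            if ch == " ":
                if end > start:
                    splits.append(normalized[start:end])
                start = end + 1
        if start < len(text):
            splits.append(normalized[start:len(text)])
        return splits

    pretok = PreTokenizedString("a b c")
    pretok.split(whitespace_split)
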
@@ -87,7 +88,7 @@ class PreTokenizedString:
         """
         pass
     def normalize(self, func: Callable[[NormalizedString], None]):
-        """ Normalize each split of the `PreTokenizedString` using the given `func`
+        """Normalize each split of the `PreTokenizedString` using the given `func`
 
         Args:
             func: Callable[[NormalizedString], None]:
@@ -97,7 +98,7 @@ class PreTokenizedString:
         """
         pass
     def tokenize(self, func: Callable[[str], List[Token]]):
-        """ Tokenize each split of the `PreTokenizedString` using the given `func`
+        """Tokenize each split of the `PreTokenizedString` using the given `func`
 
         Args:
             func: Callable[[str], List[Token]]:
@@ -106,7 +107,7 @@ class PreTokenizedString:
         """
         pass
     def to_encoding(self, type_id: int = 0, word_idx: Optional[int] = None) -> Encoding:
-        """ Return an Encoding generated from this PreTokenizedString
+        """Return an Encoding generated from this PreTokenizedString
 
         Args:
             type_id: int = 0:
@@ -126,7 +127,7 @@ class PreTokenizedString:
         offset_referential: OffsetReferential = OffsetReferential.ORIGINAL,
         offset_type: OffsetType = OffsetType.CHAR,
     ) -> List[Split]:
-        """ Get the splits currently managed by the PreTokenizedString
+        """Get the splits currently managed by the PreTokenizedString
 
         Args:
             offset_referential: OffsetReferential:
@@ -145,7 +146,7 @@ class PreTokenizedString:
         pass
 
 class NormalizedString:
-    """ NormalizedString
+    """NormalizedString
 
     A NormalizedString takes care of modifying an "original" string, to obtain a "normalized" one.
     While making all the requested modifications, it keeps track of the alignment information
@@ -153,7 +154,7 @@ class NormalizedString:
     """
 
     def __new__(sequence: str) -> NormalizedString:
-        """ Instantiate a new NormalizedString using the given str
+        """Instantiate a new NormalizedString using the given str
 
         Args:
             sequence: str:
@@ -214,14 +215,14 @@ class NormalizedString:
         """ Calls the given function for each character of the string """
         pass
     def map(self, func: Callable[[str], str]):
-        """ Calls the given function for each character of the string
+        """Calls the given function for each character of the string
 
         Replaces each character of the string using the returned value. Each
         returned value **must** be a str of length 1 (ie a character).
         """
         pass
     def split(self, pattern: Pattern, behavior: SplitDelimiterBehavior) -> List[NormalizedString]:
-        """ Split the NormalizedString using the given pattern and the specified behavior
+        """Split the NormalizedString using the given pattern and the specified behavior
 
         Args:
             pattern: Pattern:
@@ -235,7 +236,7 @@ class NormalizedString:
         """
         pass
     def replace(self, pattern: Pattern, content: str):
-        """ Replace the content of the given pattern with the provided content
+        """Replace the content of the given pattern with the provided content
 
         Args:
             pattern: Pattern:
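
Taken together, `map` and `replace` allow in-place normalization while the offset tracking described above is preserved. A small sketch, assuming the usual top-level exports:

    from tokenizers import NormalizedString

    n = NormalizedString("Hello-World")
    n.replace("-", " ")           # pattern may be a plain str or a tokenizers.Regex
    n.map(lambda c: c.lower())    # each returned value must be a single character
    assert str(n) == "hello world"
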
@@ -827,7 +828,10 @@ class Tokenizer:
         """
         pass
     def post_process(
-        self, encoding: Encoding, pair: Optional[Encoding] = None, add_special_tokens: bool = True,
+        self,
+        encoding: Encoding,
+        pair: Optional[Encoding] = None,
+        add_special_tokens: bool = True,
     ) -> Encoding:
         """Apply all the post-processing steps to the given encodings.
 
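
`post_process` applies truncation and the configured post-processor (special tokens and the like) to encodings produced without them. A hedged usage sketch; the file name is hypothetical:

    from tokenizers import Tokenizer

    tokenizer = Tokenizer.from_file("tokenizer.json")  # hypothetical file
    encoding = tokenizer.encode("Hello world", add_special_tokens=False)
    # Re-apply the post-processing that encode() skipped above.
    final = tokenizer.post_process(encoding, add_special_tokens=True)
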
@@ -21,7 +21,10 @@ class SentencePieceUnigramTokenizer(BaseTokenizer):
     """
 
     def __init__(
-        self, vocab: Optional[str] = None, replacement: str = "▁", add_prefix_space: bool = True,
+        self,
+        vocab: Optional[str] = None,
+        replacement: str = "▁",
+        add_prefix_space: bool = True,
     ):
         if vocab is not None:
             # Let Unigram(..) fail if only one of them is None
@@ -29,7 +32,12 @@ class SentencePieceUnigramTokenizer(BaseTokenizer):
         else:
             tokenizer = Tokenizer(Unigram())
 
-        tokenizer.normalizer = normalizers.Sequence([normalizers.Nmt(), normalizers.NFKC(),])
+        tokenizer.normalizer = normalizers.Sequence(
+            [
+                normalizers.Nmt(),
+                normalizers.NFKC(),
+            ]
+        )
         tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
             [
                 pre_tokenizers.WhitespaceSplit(),
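
The normalizer assembled here chains Nmt and NFKC, matching SentencePiece's default normalization. The same sequence can be exercised on its own:

    from tokenizers import normalizers

    normalizer = normalizers.Sequence([normalizers.Nmt(), normalizers.NFKC()])
    # NFKC folds full-width characters, among other compatibility mappings.
    print(normalizer.normalize_str("Ｈｅｌｌｏ"))  # -> "Hello"
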
@@ -60,7 +68,9 @@ class SentencePieceUnigramTokenizer(BaseTokenizer):
         """ Train the model using the given files """
 
         trainer = trainers.UnigramTrainer(
-            vocab_size=vocab_size, special_tokens=special_tokens, show_progress=show_progress,
+            vocab_size=vocab_size,
+            special_tokens=special_tokens,
+            show_progress=show_progress,
         )
 
         if isinstance(files, str):
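
End to end, the class above wraps training a Unigram model from text files. A usage sketch; corpus.txt is a hypothetical file:

    from tokenizers.implementations import SentencePieceUnigramTokenizer

    tokenizer = SentencePieceUnigramTokenizer()
    tokenizer.train(files=["corpus.txt"], vocab_size=8000)  # hypothetical corpus
    print(tokenizer.encode("A sentence to tokenize.").tokens)
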
@@ -78,7 +78,7 @@ class BPE(Model):
     def from_file(vocab_filename: str, merges_filename: str, **kwargs) -> BPE:
         """
         Convenient method to intialize a BPE from files
-        Roughly equivalent to 
+        Roughly equivalent to
 
         def from_file(vocab_filename, merges_filenames, **kwargs):
             vocab, merges = BPE.read_file(vocab_filename, merges_filename)
@@ -116,7 +116,7 @@ class WordPiece(Model):
     def from_file(vocab_filename: str, **kwargs) -> WordPiece:
         """
         Convenient method to intialize a WordPiece from file
-        Roughly equivalent to 
+        Roughly equivalent to
 
         def from_file(vocab_filename, **kwargs):
             vocab, merges = WordPiece.read_file(vocab_filename)
@@ -147,7 +147,7 @@ class WordLevel(Model):
     def from_file(vocab_filename: str, **kwargs) -> WordLevelg:
         """
         Convenient method to intialize a WordLevelg from file
-        Roughly equivalent to 
+        Roughly equivalent to
 
         def from_file(vocab_filename, **kwargs):
             vocab, merges = WordLevelg.read_file(vocab_filename)
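
`from_file` loads a serialized model directly into a Tokenizer. A hedged sketch for the BPE case; vocab.json and merges.txt are hypothetical paths:

    from tokenizers import Tokenizer
    from tokenizers.models import BPE

    # Roughly equivalent to reading the files manually and building BPE(vocab, merges).
    tokenizer = Tokenizer(BPE.from_file("vocab.json", "merges.txt", unk_token="<unk>"))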