Black pre-commit after rebase.

This commit is contained in:
Nicolas Patry
2020-09-23 11:37:09 +02:00
parent acd4a7599f
commit 9b1ef9d895
8 changed files with 80 additions and 97 deletions

View File

@@ -258,7 +258,7 @@ class Encoding:
@staticmethod
def merge(encodings: List[Encoding], growing_offsets: bool = True) -> Encoding:
-""" Merge the list of Encoding into one final Encoding
+"""Merge the list of Encoding into one final Encoding
Args:
encodings: List[Encoding]:
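A minimal sketch of merging two encodings into one, assuming a trained tokenizer was previously saved to `tokenizer.json` (hypothetical path):

```python
from tokenizers import Tokenizer, Encoding

tokenizer = Tokenizer.from_file("tokenizer.json")  # hypothetical path

first = tokenizer.encode("Hello")
second = tokenizer.encode("world")

# With growing_offsets=True (the default), the offsets of the second
# Encoding are shifted as if both texts formed one concatenated string.
merged = Encoding.merge([first, second], growing_offsets=True)
print(merged.tokens, merged.offsets)
```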
@@ -289,7 +289,7 @@ class Encoding:
pass
@property
def offsets(self) -> List[Offsets]:
-""" The offsets.
+"""The offsets.
These offsets can be used to index any `IndexableString` directly. If you want to
index the original `str`, make sure to retrieve the converted offsets using the `.offsets`
method on the `original_str`.
@@ -388,7 +388,7 @@ class Encoding:
pad_token: Optional[str] = "[PAD]",
direction: Optional[str] = "right",
):
-""" Pad the current Encoding at the given length
+"""Pad the current Encoding at the given length
Args:
length: int:
@@ -408,7 +408,7 @@ class Encoding:
"""
pass
def truncate(self, max_length: int, stride: Optional[int] = 0):
-""" Truncate the current Encoding at the given max_length
+"""Truncate the current Encoding at the given max_length
Args:
max_length: int:
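A short sketch of truncating and padding a single Encoding in place, under the same hypothetical `tokenizer.json` assumption as above:

```python
from tokenizers import Tokenizer

tokenizer = Tokenizer.from_file("tokenizer.json")  # hypothetical path
encoding = tokenizer.encode("A fairly long sentence to play with")

# Keep at most 5 tokens; stride controls the overlap kept in overflowing parts.
encoding.truncate(5, stride=0)

# Then pad back up to 8 tokens on the right with "[PAD]" (id 0 assumed here).
encoding.pad(8, pad_id=0, pad_token="[PAD]", direction="right")
print(encoding.tokens)
```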
@@ -421,7 +421,7 @@ class Encoding:
pass
class AddedToken:
-""" AddedToken represents a token to be added to a Tokenizer
+"""AddedToken represents a token to be added to a Tokenizer
An AddedToken can have special options defining the way it should behave.
"""
@@ -434,7 +434,7 @@ class AddedToken:
rstrip: bool = False,
normalized: bool = True,
) -> AddedToken:
-""" Instantiate a new AddedToken
+"""Instantiate a new AddedToken
Args:
content: str:
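For illustration, a hedged sketch of how these options are typically passed (the token strings below are made up):

```python
from tokenizers import Tokenizer, AddedToken
from tokenizers.models import BPE

tokenizer = Tokenizer(BPE())

# Plain strings and AddedToken instances can be mixed; the options control
# how the token is matched in raw text.
tokenizer.add_tokens([
    "plain_token",
    AddedToken("<ent>", single_word=True, lstrip=True, rstrip=False),
])
```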
@@ -464,7 +464,7 @@ class AddedToken:
pass
class Tokenizer:
-""" Tokenizer
+"""Tokenizer
A Tokenizer works as a pipeline, it processes some raw text as input and outputs
an `Encoding`.
@@ -481,7 +481,7 @@ class Tokenizer:
"""
def __new__(cls, model: models.Model) -> Tokenizer:
-""" Instantiate a new Tokenizer using the given Model
+"""Instantiate a new Tokenizer using the given Model
Args:
model: models.Model:
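For instance, wrapping an empty BPE model gives a blank, trainable tokenizer:

```python
from tokenizers import Tokenizer
from tokenizers.models import BPE

# An empty model is enough to instantiate; the vocabulary comes later,
# from training or from loading a serialized tokenizer.
tokenizer = Tokenizer(BPE())
```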
@@ -493,7 +493,7 @@ class Tokenizer:
pass
@staticmethod
def from_str(s: str) -> Tokenizer:
-""" Instantiate a new Tokenizer from the given JSON string
+"""Instantiate a new Tokenizer from the given JSON string
Args:
s: str:
@@ -505,7 +505,7 @@ class Tokenizer:
pass
@staticmethod
def from_file(path: str) -> Tokenizer:
-""" Instantiate a new Tokenizer from the given file
+"""Instantiate a new Tokenizer from the given file
Args:
path: str:
@@ -517,7 +517,7 @@ class Tokenizer:
pass
@staticmethod
def from_buffer(buffer: bytes) -> Tokenizer:
-""" Instantiate a new Tokenizer from the given buffer
+"""Instantiate a new Tokenizer from the given buffer
Args:
buffer: bytes:
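These entry points all round-trip through the same JSON representation; a sketch:

```python
from tokenizers import Tokenizer
from tokenizers.models import BPE

tokenizer = Tokenizer(BPE())

as_json = tokenizer.to_str(True)                           # pretty-printed JSON string
same = Tokenizer.from_str(as_json)                         # back from the string
buffered = Tokenizer.from_buffer(as_json.encode("utf-8"))  # from raw bytes

tokenizer.save("tokenizer.json")                  # write to disk...
reloaded = Tokenizer.from_file("tokenizer.json")  # ...and read back
```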
@@ -528,7 +528,7 @@ class Tokenizer:
"""
pass
def to_str(self, pretty: bool = False) -> str:
-""" Get a serialized JSON version of the Tokenizer as a str
+"""Get a serialized JSON version of the Tokenizer as a str
Args:
pretty: bool:
@@ -539,7 +539,7 @@ class Tokenizer:
"""
pass
def save(self, path: str, pretty: bool = False):
-""" Save the Tokenizer as JSON to the given path
+"""Save the Tokenizer as JSON to the given path
Args:
pretty: bool:
@@ -592,7 +592,7 @@ class Tokenizer:
"""
pass
def get_vocab(self, with_added_tokens: bool = True) -> Dict[str, int]:
-""" Returns the vocabulary
+"""Returns the vocabulary
Args:
with_added_tokens: boolean:
@@ -603,7 +603,7 @@ class Tokenizer:
"""
pass
def get_vocab_size(self, with_added_tokens: bool = True) -> int:
-""" Returns the size of the vocabulary
+"""Returns the size of the vocabulary
Args:
with_added_tokens: boolean:
@@ -614,7 +614,7 @@ class Tokenizer:
"""
pass
def enable_truncation(self, max_length: int, stride: Optional[int], strategy: Optional[str]):
-""" Enable the truncation
+"""Enable the truncation
Args:
max_length: unsigned int:
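A sketch of enabling truncation and reading the parameters back through the property documented below:

```python
from tokenizers import Tokenizer
from tokenizers.models import BPE

tokenizer = Tokenizer(BPE())

# Cap sequences at 512 tokens, keep a 128-token overlap in overflowing
# parts, and truncate the longest member of a pair first.
tokenizer.enable_truncation(512, stride=128, strategy="longest_first")

print(tokenizer.truncation)  # dict of current parameters, or None if disabled
```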
@@ -633,7 +633,7 @@ class Tokenizer:
pass
@property
def truncation(self) -> Optional[dict]:
-""" Get the current truncation parameters
+"""Get the current truncation parameters
Returns:
None if truncation is disabled, a dict with the current truncation parameters if
@@ -649,7 +649,7 @@ class Tokenizer:
pad_token: Optional[str] = "[PAD]",
length: Optional[int] = None,
):
-""" Enable the padding
+"""Enable the padding
Args:
direction: (`optional`) str:
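A sketch of the padding options; when `length` is omitted, batches are padded to their longest member instead of a fixed size:

```python
from tokenizers import Tokenizer
from tokenizers.models import BPE

tokenizer = Tokenizer(BPE())

tokenizer.enable_padding(
    direction="right",
    pad_id=0,
    pad_token="[PAD]",
    length=128,  # fixed size; omit to pad to the longest sequence in a batch
)
print(tokenizer.padding)
```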
@@ -679,7 +679,7 @@ class Tokenizer:
pass
@property
def padding(self) -> Optional[dict]:
-""" Get the current padding parameters
+"""Get the current padding parameters
Returns:
None if padding is disabled, a dict with the currently set parameters
@@ -693,7 +693,7 @@ class Tokenizer:
is_pretokenized: bool = False,
add_special_tokens: bool = True,
) -> Encoding:
-""" Encode the given sequence and pair. This method can process raw text sequences as well
+"""Encode the given sequence and pair. This method can process raw text sequences as well
as already pre-tokenized sequences.
Args:
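Both input styles in one hedged sketch (the `tokenizer.json` path is a placeholder):

```python
from tokenizers import Tokenizer

tokenizer = Tokenizer.from_file("tokenizer.json")  # hypothetical path

# Raw text, optionally with a pair:
encoding = tokenizer.encode("Hello there!", "And its pair")

# Already pre-tokenized input must be flagged as such:
encoding = tokenizer.encode(["Hello", "there", "!"], is_pretokenized=True)
print(encoding.tokens, encoding.ids)
```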
@@ -721,7 +721,7 @@ class Tokenizer:
is_pretokenized: bool = False,
add_special_tokens: bool = True,
) -> List[Encoding]:
-""" Encode the given inputs. This method accept both raw text sequences as well as already
+"""Encode the given inputs. This method accept both raw text sequences as well as already
pre-tokenized sequences.
Args:
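Single sequences and pairs can be mixed in one batch; pairs are passed as tuples, as in this sketch:

```python
from tokenizers import Tokenizer

tokenizer = Tokenizer.from_file("tokenizer.json")  # hypothetical path

encodings = tokenizer.encode_batch([
    "A single sequence",
    ("A pair, first part", "and its second part"),
])
print([e.tokens for e in encodings])
```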
@@ -748,7 +748,7 @@ class Tokenizer:
"""
pass
def decode(self, ids: List[int], skip_special_tokens: Optional[bool] = True) -> str:
-""" Decode the given list of ids to a string sequence
+"""Decode the given list of ids to a string sequence
Args:
ids: List[unsigned int]:
@@ -764,7 +764,7 @@ class Tokenizer:
def decode_batch(
self, sequences: List[List[int]], skip_special_tokens: Optional[bool] = True
) -> str:
-""" Decode the list of sequences to a list of string sequences
+"""Decode the list of sequences to a list of string sequences
Args:
sequences: List[List[unsigned int]]:
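A round-trip sketch covering both decoding entry points:

```python
from tokenizers import Tokenizer

tokenizer = Tokenizer.from_file("tokenizer.json")  # hypothetical path

encoding = tokenizer.encode("Hello world")
print(tokenizer.decode(encoding.ids))  # special tokens skipped by default
print(tokenizer.decode(encoding.ids, skip_special_tokens=False))

batch = tokenizer.encode_batch(["Hello world", "Goodbye"])
print(tokenizer.decode_batch([e.ids for e in batch]))
```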
@@ -778,7 +778,7 @@ class Tokenizer:
"""
pass
def token_to_id(self, token: str) -> Optional[int]:
-""" Convert the given token to its corresponding id
+"""Convert the given token to its corresponding id
Args:
token: str:
@@ -789,7 +789,7 @@ class Tokenizer:
"""
pass
def id_to_token(self, id: int) -> Optional[str]:
-""" Convert the given token id to its corresponding string
+"""Convert the given token id to its corresponding string
Args:
token: id:
@@ -800,7 +800,7 @@ class Tokenizer:
"""
pass
def add_tokens(self, tokens: List[Union[str, AddedToken]]) -> int:
-""" Add the given tokens to the vocabulary
+"""Add the given tokens to the vocabulary
Args:
tokens: List[Union[str, AddedToken]]:
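A sketch tying add_tokens to the lookups documented above (get_vocab_size, token_to_id, id_to_token):

```python
from tokenizers import Tokenizer, AddedToken
from tokenizers.models import BPE

tokenizer = Tokenizer(BPE())
before = tokenizer.get_vocab_size()

added = tokenizer.add_tokens(["new_token", AddedToken("<ent>", single_word=True)])
assert tokenizer.get_vocab_size() == before + added

# Round-trip between a token and its id.
token_id = tokenizer.token_to_id("new_token")
assert tokenizer.id_to_token(token_id) == "new_token"
```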
@@ -812,7 +812,7 @@ class Tokenizer:
"""
pass
def add_special_tokens(self, tokens: List[Union[str, AddedToken]]) -> int:
-""" Add the given special tokens to the vocabulary, and treat them as special tokens.
+"""Add the given special tokens to the vocabulary, and treat them as special tokens.
The special tokens will never be processed by the model, and will be
removed while decoding.
@@ -829,7 +829,7 @@ class Tokenizer:
def post_process(
self, encoding: Encoding, pair: Optional[Encoding] = None, add_special_tokens: bool = True,
) -> Encoding:
-""" Apply all the post-processing steps to the given encodings.
+"""Apply all the post-processing steps to the given encodings.
The various steps are:
1. Truncate according to global params (provided to `enable_truncation`)
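A hedged sketch of calling post_process directly on encodings produced without special tokens (encode normally applies these steps itself):

```python
from tokenizers import Tokenizer

tokenizer = Tokenizer.from_file("tokenizer.json")  # hypothetical path
tokenizer.enable_truncation(10)

first = tokenizer.encode("A first sequence", add_special_tokens=False)
second = tokenizer.encode("A second one", add_special_tokens=False)

# Applies the post-processing pipeline described above to the pair.
final = tokenizer.post_process(first, second, add_special_tokens=True)
```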

View File

@@ -1,7 +1,7 @@
from typing import List
class Decoder:
-""" Base class for all decoders
+"""Base class for all decoders
This class is not supposed to be instantiated directly. Instead, any implementation of
a Decoder will return an instance of this class when instantiated.
@@ -23,7 +23,7 @@ class WordPiece(Decoder):
@staticmethod
def __init__(self, prefix: str = "##", cleanup: bool = True) -> Decoder:
-""" Instantiate a new WordPiece Decoder
+"""Instantiate a new WordPiece Decoder
Args:
prefix: str:
@@ -38,7 +38,7 @@ class Metaspace(Decoder):
""" Metaspace decoder """
def __init__(self, replacement: str = "▁", add_prefix_space: bool = True) -> None:
-""" Instantiate a new Metaspace
+"""Instantiate a new Metaspace
Args:
replacement: str:
@@ -55,7 +55,7 @@ class BPEDecoder(Decoder):
""" BPEDecoder """
def __init__(self, suffix: str = "</w>") -> None:
-""" Instantiate a new BPEDecoder
+"""Instantiate a new BPEDecoder
Args:
suffix: str:
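A sketch of attaching one of these decoders to a tokenizer (the path and ids below are placeholders):

```python
from tokenizers import Tokenizer
from tokenizers.decoders import WordPiece as WordPieceDecoder

tokenizer = Tokenizer.from_file("tokenizer.json")  # hypothetical path

# Strip the "##" continuation prefix and clean up spacing while decoding.
tokenizer.decoder = WordPieceDecoder(prefix="##", cleanup=True)
print(tokenizer.decode([1, 2, 3]))  # placeholder ids
```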

View File

@@ -25,7 +25,7 @@ class BaseTokenizer:
return self._tokenizer.num_special_tokens_to_add(is_pair)
def get_vocab(self, with_added_tokens: bool = True) -> Dict[str, int]:
-""" Returns the vocabulary
+"""Returns the vocabulary
Args:
with_added_tokens: boolean:
@@ -37,7 +37,7 @@ class BaseTokenizer:
return self._tokenizer.get_vocab(with_added_tokens=with_added_tokens)
def get_vocab_size(self, with_added_tokens: bool = True) -> int:
-""" Return the size of vocabulary, with or without added tokens.
+"""Return the size of vocabulary, with or without added tokens.
Args:
with_added_tokens: (`optional`) bool:
@@ -57,7 +57,7 @@ class BaseTokenizer:
pad_token: Optional[str] = "[PAD]",
length: Optional[int] = None,
):
-""" Change the padding strategy
+"""Change the padding strategy
Args:
direction: (`optional`) str:
@@ -96,7 +96,7 @@ class BaseTokenizer:
@property
def padding(self) -> Optional[dict]:
-""" Get the current padding parameters
+"""Get the current padding parameters
Returns:
None if padding is disabled, a dict with the currently set parameters
@@ -107,7 +107,7 @@ class BaseTokenizer:
def enable_truncation(
self, max_length: int, stride: Optional[int] = 0, strategy: Optional[str] = "longest_first"
):
-""" Change the truncation options
+"""Change the truncation options
Args:
max_length: unsigned int:
@@ -128,7 +128,7 @@ class BaseTokenizer:
@property
def truncation(self) -> Optional[dict]:
-""" Get the current truncation parameters
+"""Get the current truncation parameters
Returns:
None if truncation is disabled, a dict with the current truncation parameters if
@@ -137,7 +137,7 @@ class BaseTokenizer:
return self._tokenizer.truncation
def add_tokens(self, tokens: List[Union[str, AddedToken]]) -> int:
-""" Add the given tokens to the vocabulary
+"""Add the given tokens to the vocabulary
Args:
tokens: List[Union[str, AddedToken]]:
@@ -150,7 +150,7 @@ class BaseTokenizer:
return self._tokenizer.add_tokens(tokens)
def add_special_tokens(self, special_tokens: List[Union[str, AddedToken]]) -> int:
-""" Add the given special tokens to the vocabulary, and treat them as special tokens.
+"""Add the given special tokens to the vocabulary, and treat them as special tokens.
The special tokens will never be processed by the model, and will be
removed while decoding.
@@ -166,7 +166,7 @@ class BaseTokenizer:
return self._tokenizer.add_special_tokens(special_tokens)
def normalize(self, sequence: str) -> str:
-""" Normalize the given sequence
+"""Normalize the given sequence
Args:
sequence: str:
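For example, with one of the bundled implementations built on BaseTokenizer (the vocab path is a placeholder):

```python
from tokenizers.implementations import BertWordPieceTokenizer

tokenizer = BertWordPieceTokenizer("vocab.txt")  # hypothetical vocab file

# Runs only the normalizer and returns the cleaned string,
# e.g. lowercased with accents stripped under the default options.
print(tokenizer.normalize("Héllo   World"))
```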
@@ -184,7 +184,7 @@ class BaseTokenizer:
is_pretokenized: bool = False,
add_special_tokens: bool = True,
) -> Encoding:
-""" Encode the given sequence and pair. This method can process raw text sequences as well
+"""Encode the given sequence and pair. This method can process raw text sequences as well
as already pre-tokenized sequences.
Args:
@@ -216,7 +216,7 @@ class BaseTokenizer:
is_pretokenized: bool = False,
add_special_tokens: bool = True,
) -> List[Encoding]:
-""" Encode the given inputs. This method accept both raw text sequences as well as already
+"""Encode the given inputs. This method accept both raw text sequences as well as already
pre-tokenized sequences.
Args:
@@ -248,7 +248,7 @@ class BaseTokenizer:
return self._tokenizer.encode_batch(inputs, is_pretokenized, add_special_tokens)
def decode(self, ids: List[int], skip_special_tokens: Optional[bool] = True) -> str:
-""" Decode the given list of ids to a string sequence
+"""Decode the given list of ids to a string sequence
Args:
ids: List[unsigned int]:
@@ -268,7 +268,7 @@ class BaseTokenizer:
def decode_batch(
self, sequences: List[List[int]], skip_special_tokens: Optional[bool] = True
) -> str:
-""" Decode the list of sequences to a list of string sequences
+"""Decode the list of sequences to a list of string sequences
Args:
sequences: List[List[unsigned int]]:
@@ -286,7 +286,7 @@ class BaseTokenizer:
return self._tokenizer.decode_batch(sequences, skip_special_tokens=skip_special_tokens)
def token_to_id(self, token: str) -> Optional[int]:
-""" Convert the given token to its corresponding id
+"""Convert the given token to its corresponding id
Args:
token: str:
@@ -298,7 +298,7 @@ class BaseTokenizer:
return self._tokenizer.token_to_id(token)
def id_to_token(self, id: int) -> Optional[str]:
-""" Convert the given token id to its corresponding string
+"""Convert the given token id to its corresponding string
Args:
token: id:
@@ -310,7 +310,7 @@ class BaseTokenizer:
return self._tokenizer.id_to_token(id)
def save_model(self, directory: str, name: Optional[str] = None):
-""" Save the current model to the given directory
+"""Save the current model to the given directory
Args:
directory: str:
@@ -322,7 +322,7 @@ class BaseTokenizer:
return self._tokenizer.model.save(directory, name=name)
def save(self, path: str, pretty: bool = False):
-""" Save the current Tokenizer at the given path
+"""Save the current Tokenizer at the given path
Args:
path: str:
@@ -331,7 +331,7 @@ class BaseTokenizer:
return self._tokenizer.save(path, pretty)
def to_str(self, pretty: bool = False):
-""" Get a serialized JSON version of the Tokenizer as a str
+"""Get a serialized JSON version of the Tokenizer as a str
Args:
pretty: bool:
@@ -345,7 +345,7 @@ class BaseTokenizer:
def post_process(
self, encoding: Encoding, pair: Optional[Encoding] = None, add_special_tokens: bool = True
) -> Encoding:
-""" Apply all the post-processing steps to the given encodings.
+"""Apply all the post-processing steps to the given encodings.
The various steps are:
1. Truncate according to global params (provided to `enable_truncation`)

View File

@@ -2,7 +2,7 @@ from .. import NormalizedString
from typing import Optional, List
class Normalizer:
-""" Base class for all normalizers
+"""Base class for all normalizers
This class is not supposed to be instantiated directly. Instead, any implementation of a
Normalizer will return an instance of this class when instantiated.
@@ -16,7 +16,7 @@ class Normalizer:
pass
class BertNormalizer(Normalizer):
-""" BertNormalizer
+"""BertNormalizer
Takes care of normalizing raw text before giving it to a Bert model.
This includes cleaning the text, handling accents, chinese chars and lowercasing
@@ -29,7 +29,7 @@ class BertNormalizer(Normalizer):
strip_accents: Optional[bool] = None,
lowercase: Optional[bool] = True,
) -> None:
-""" Instantiate a BertNormalizer with the given options.
+"""Instantiate a BertNormalizer with the given options.
Args:
clean_text: (`optional`) boolean:
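All four options with their defaults, as a sketch:

```python
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.normalizers import BertNormalizer

tokenizer = Tokenizer(BPE())
tokenizer.normalizer = BertNormalizer(
    clean_text=True,
    handle_chinese_chars=True,
    strip_accents=None,  # None lets the lowercase option decide
    lowercase=True,
)
```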
@@ -80,13 +80,13 @@ class NFKC(Normalizer):
pass
class Sequence(Normalizer):
-""" Allows concatenating multiple other Normalizer as a Sequence.
+"""Allows concatenating multiple other Normalizer as a Sequence.
All the normalizers run in sequence in the given order
"""
def __init__(self, normalizers: List[Normalizer]) -> None:
-""" Instantiate a new normalization Sequence using the given normalizers
+"""Instantiate a new normalization Sequence using the given normalizers
Args:
normalizers: List[Normalizer]:
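A sketch chaining two normalizers; they run in the given order:

```python
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.normalizers import NFKC, Lowercase, Sequence

tokenizer = Tokenizer(BPE())

# First NFKC unicode normalization, then lowercasing.
tokenizer.normalizer = Sequence([NFKC(), Lowercase()])
```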

View File

@@ -2,7 +2,7 @@ from .. import Encoding
from typing import Tuple, Union, List
class PostProcessor:
-""" Base class for all post-processors
+"""Base class for all post-processors
This class is not supposed to be instantiated directly. Instead, any implementation of
a PostProcessor will return an instance of this class when instantiated.
@@ -22,7 +22,7 @@ class PostProcessor:
pass
class BertProcessing(PostProcessor):
-""" BertProcessing
+"""BertProcessing
This post-processor takes care of adding the special tokens needed by
a Bert model:
@@ -31,7 +31,7 @@ class BertProcessing(PostProcessor):
"""
def __init__(self, sep: Tuple[str, int], cls: Tuple[str, int]) -> None:
-""" Instantiate a new BertProcessing with the given tokens
+"""Instantiate a new BertProcessing with the given tokens
Args:
sep: Tuple[str, int]:
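A sketch with placeholder ids (the real ids come from the vocabulary):

```python
from tokenizers import Tokenizer
from tokenizers.processors import BertProcessing

tokenizer = Tokenizer.from_file("tokenizer.json")  # hypothetical path

tokenizer.post_processor = BertProcessing(
    sep=("[SEP]", 102),  # placeholder id
    cls=("[CLS]", 101),  # placeholder id
)
encoding = tokenizer.encode("Hello", "World")
# -> "[CLS] Hello [SEP] World [SEP]" with the matching type ids
```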
@@ -46,7 +46,7 @@ class BertProcessing(PostProcessor):
pass
class RobertaProcessing(PostProcessor):
-""" RobertaProcessing
+"""RobertaProcessing
This post-processor takes care of adding the special tokens needed by
a Roberta model:
@@ -66,7 +66,7 @@ class RobertaProcessing(PostProcessor):
trim_offsets: bool = True,
add_prefix_space: bool = True,
) -> None:
-""" Instantiate a new RobertaProcessing with the given tokens
+"""Instantiate a new RobertaProcessing with the given tokens
Args:
sep: Tuple[str, int]:
@@ -88,7 +88,7 @@ class RobertaProcessing(PostProcessor):
pass
class ByteLevel(PostProcessor):
-""" ByteLevel Post processing
+"""ByteLevel Post processing
This post-processor takes care of trimming the offsets.
By default, the ByteLevel BPE might include whitespaces in the produced tokens. If you don't
@@ -96,7 +96,7 @@ class ByteLevel(PostProcessor):
"""
def __init__(self, trim_offsets: bool = True) -> None:
-""" Instantiate a new ByteLevel
+"""Instantiate a new ByteLevel
Args:
trim_offsets: bool:
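A one-line sketch; trimming makes offsets point at the visible characters instead of the leading whitespace a byte-level BPE may include:

```python
from tokenizers import Tokenizer
from tokenizers.processors import ByteLevel

tokenizer = Tokenizer.from_file("byte-level.json")  # hypothetical byte-level tokenizer
tokenizer.post_processor = ByteLevel(trim_offsets=True)
```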
@@ -108,7 +108,7 @@ Template = Union[str, List[str]]
Tokens = List[Union[Tuple[int, str], Tuple[str, int], dict]]
class TemplateProcessing(PostProcessor):
-""" TemplateProcessing
+"""TemplateProcessing
Provides a way to specify templates in order to add the special tokens to each
input sequence as relevant.
@@ -143,7 +143,7 @@ class TemplateProcessing(PostProcessor):
"""
def __init__(self, seq_a: Template, seq_b: Template, special_tokens: Tokens) -> None:
-""" Instantiate a new TemplateProcessing
+"""Instantiate a new TemplateProcessing
Args:
seq_a: Template
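A heavily hedged sketch: it follows the seq_a/seq_b signature shown here, but the `$A`/`$B` placeholder syntax is assumed from the later stable release and may differ at this exact commit.

```python
from tokenizers.processors import TemplateProcessing

processor = TemplateProcessing(
    seq_a="[CLS] $A [SEP]",  # template for a single sequence (assumed syntax)
    seq_b="$B [SEP]",        # extra part applied for pairs (assumed syntax)
    special_tokens=[("[CLS]", 1), ("[SEP]", 2)],  # (token, id); placeholder ids
)
```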

View File

@@ -2,14 +2,14 @@ from .. import AddedToken
from typing import Optional, List, Union
class Trainer:
-""" Base class for all trainers
+"""Base class for all trainers
This class is not supposed to be instantiated directly. Instead, any implementation of a
Trainer will return an instance of this class when instantiated.
"""
class BpeTrainer(Trainer):
-""" BpeTrainer
+"""BpeTrainer
Capable of training a BPE model
"""
@@ -25,7 +25,7 @@ class BpeTrainer(Trainer):
continuing_subword_prefix: Optional[str] = None,
end_of_word_suffix: Optional[str] = None,
) -> None:
-""" Instantiate a new BpeTrainer with the given options:
+"""Instantiate a new BpeTrainer with the given options:
Args:
vocab_size: unsigned int:
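A training sketch. The argument order of Tokenizer.train changed across releases, so the (trainer, files) order below reflects this era of the library and is an assumption:

```python
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer

tokenizer = Tokenizer(BPE())
trainer = BpeTrainer(
    vocab_size=30000,
    min_frequency=2,
    special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
)

tokenizer.train(trainer, ["data.txt"])  # hypothetical corpus file; order assumed
```

WordPieceTrainer below takes the same kind of options, with defaults suited to WordPiece (e.g. continuing_subword_prefix="##").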
@@ -61,7 +61,7 @@ class BpeTrainer(Trainer):
pass
class WordPieceTrainer(Trainer):
-""" WordPieceTrainer
+"""WordPieceTrainer
Capable of training a WordPiece model
"""
@@ -77,7 +77,7 @@ class WordPieceTrainer(Trainer):
continuing_subword_prefix: Optional[str] = "##",
end_of_word_suffix: Optional[str] = None,
) -> Trainer:
-""" Instantiate a new WordPieceTrainer with the given options:
+"""Instantiate a new WordPieceTrainer with the given options:
Args:
vocab_size: unsigned int:
@@ -113,7 +113,7 @@ class WordPieceTrainer(Trainer):
pass
class UnigramTrainer(Trainer):
-""" UnigramTrainer
+"""UnigramTrainer
Capable of training a Unigram model
"""
@@ -124,7 +124,7 @@ class UnigramTrainer(Trainer):
show_progress: bool = True,
special_tokens: List[Union[str, AddedToken]] = [],
) -> Trainer:
-""" Instantiate a new UnigramTrainer with the given options:
+"""Instantiate a new UnigramTrainer with the given options:
Args:
vocab_size: unsigned int:
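And the Unigram counterpart, assuming models.Unigram() can be built empty like BPE above:

```python
from tokenizers import Tokenizer
from tokenizers.models import Unigram
from tokenizers.trainers import UnigramTrainer

tokenizer = Tokenizer(Unigram())
trainer = UnigramTrainer(
    vocab_size=8000,
    show_progress=True,
    special_tokens=["<unk>"],
)
```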