Mirror of https://github.com/mii443/tokenizers.git (synced 2025-12-16 17:18:43 +00:00)

Commit: Python - Update all typings
@@ -1,22 +1,22 @@
 from .. import decoders
 from typing import List


 class Decoder:
-    """Decoder
+    """ Base class for all decoders
+
+    This class is not supposed to be instantiated directly. Instead, any implementation of
+    a Decoder will return an instance of this class when instantiated.
     """

     @staticmethod
     def custom():
         pass

-    def decode(tokens: List[str]) -> str:
+    def decode(self, tokens: List[str]) -> str:
+        """ Decode the given list of strings to a final string """
         pass


 class ByteLevel:
-    """ByteLevel
-    """
+    """ ByteLevel Decoder """

     @staticmethod
     def new() -> Decoder:
+        """ Instantiate a new ByteLevel Decoder """
         pass

@@ -25,4 +25,5 @@ class WordPiece:

     @staticmethod
     def new() -> Decoder:
+        """ Instantiate a new WordPiece Decoder """
         pass
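For reference, a minimal sketch of how these decoder stubs would be used; it assumes the public tokenizers package re-exports this module (as the `from .. import decoders` line suggests), and the sample tokens are illustrative ByteLevel output:

from tokenizers import decoders

# ByteLevel.new() is the static constructor declared in the stub above.
decoder = decoders.ByteLevel.new()

# decode() joins a list of tokens back into a single string; "Ġ" is the
# byte-level representation of a leading space, so the expected result
# here is "Hello world".
text = decoder.decode(["Hello", "Ġworld"])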
@@ -1,12 +1,41 @@
 from .. import normalizers
+from typing import Optional


 class Normalizer:
-    """Normalizer
+    """ Base class for all normalizers
+
+    This class is not supposed to be instantiated directly. Instead, any implementation of a
+    Normalizer will return an instance of this class when instantiated.
     """


 class BertNormalizer:
-    """BertNormalizer
+    """ BertNormalizer
+
+    Takes care of normalizing raw text before giving it to a Bert model.
+    This includes cleaning the text, handling accents, Chinese chars, and lowercasing.
     """

-    def new() -> Normalizer:
+    @staticmethod
+    def new(clean_text: Optional[bool]=True,
+            handle_chinese_chars: Optional[bool]=True,
+            strip_accents: Optional[bool]=True,
+            lowercase: Optional[bool]=True) -> Normalizer:
+        """ Instantiate a BertNormalizer with the given options.
+
+        Args:
+            clean_text: (`optional`) boolean:
+                Whether to clean the text, by removing any control characters
+                and replacing all whitespaces by the classic one.
+
+            handle_chinese_chars: (`optional`) boolean:
+                Whether to handle Chinese chars by putting spaces around them.
+
+            strip_accents: (`optional`) boolean:
+                Whether to strip all accents.
+
+            lowercase: (`optional`) boolean:
+                Whether to lowercase.
+
+        Returns:
+            Normalizer
+        """
         pass
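A short usage sketch for the new BertNormalizer.new signature (illustrative, not part of the commit); keyword arguments are assumed from the defaults declared in the stub:

from tokenizers import normalizers

# All four options default to True; turning strip_accents off keeps
# accented characters such as "é" intact while still lowercasing.
normalizer = normalizers.BertNormalizer.new(
    clean_text=True,
    handle_chinese_chars=True,
    strip_accents=False,
    lowercase=True,
)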
@@ -1,38 +1,68 @@
 from .. import pre_tokenizers
 from typing import Optional, List, Tuple

 Offsets = Tuple[int, int]


 class PreTokenizer:
-    """PreTokenizer
+    """ Base class for all pre-tokenizers
+
+    This class is not supposed to be instantiated directly. Instead, any implementation of a
+    PreTokenizer will return an instance of this class when instantiated.
     """

     def pre_tokenize(self, sequence: str) -> List[Tuple[str, Offsets]]:
+        """ Pre-tokenize the given sequence """
         pass


 class ByteLevel:
-    """ByteLevel
+    """ ByteLevel PreTokenizer
+
+    This pre-tokenizer takes care of replacing all bytes of the given string
+    with a corresponding representation, as well as splitting into words.
     """

     @staticmethod
-    def new() -> PreTokenizer:
+    def new(add_prefix_space: Optional[bool]=True) -> PreTokenizer:
+        """ Instantiate a new ByteLevel PreTokenizer
+
+        Args:
+            add_prefix_space: (`optional`) boolean:
+                Whether a space should be added at the very beginning of the sequence
+                if there isn't one already.
+
+        Returns:
+            PreTokenizer
+        """
         pass

     @staticmethod
     def alphabet() -> List[str]:
+        """ Returns the alphabet used by this PreTokenizer.
+
+        Since the ByteLevel works as its name suggests, at the byte level, it
+        encodes any byte to one visible character. This means that there is a
+        total of 256 different characters composing this alphabet.
+        """
         pass


 class Whitespace:
-    """Whitespace
+    """ Whitespace PreTokenizer
+
+    This pre-tokenizer simply splits using the following regex: `\w+|[^\w\s]+`
     """

     @staticmethod
     def new() -> PreTokenizer:
+        """ Instantiate a new Whitespace PreTokenizer """
         pass


 class BertPreTokenizer:
-    """BertPreTokenizer
+    """ BertPreTokenizer
+
+    This pre-tokenizer splits tokens on spaces, and also on punctuation.
+    Each occurrence of a punctuation character will be treated separately.
     """

     @staticmethod
     def new() -> PreTokenizer:
+        """ Instantiate a new BertPreTokenizer """
         pass
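To make the return type concrete, a sketch of the pre-tokenizer API above; the offsets shown follow from the documented `\w+|[^\w\s]+` regex and are expected rather than verified output:

from tokenizers import pre_tokenizers

# Whitespace splits words and punctuation separately, returning
# (token, (start, end)) pairs per the Offsets = Tuple[int, int] alias.
pre_tok = pre_tokenizers.Whitespace.new()
pairs = pre_tok.pre_tokenize("Hello, world!")
# expected: [("Hello", (0, 5)), (",", (5, 6)), ("world", (7, 12)), ("!", (12, 13))]

# ByteLevel encodes each possible byte as one visible character,
# so its alphabet contains exactly 256 entries.
assert len(pre_tokenizers.ByteLevel.alphabet()) == 256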
@@ -1,13 +1,33 @@
 from .. import processors
 from typing import Tuple


 class PostProcessor:
-    """PostProcessor
+    """ Base class for all post-processors
+
+    This class is not supposed to be instantiated directly. Instead, any implementation of
+    a PostProcessor will return an instance of this class when instantiated.
     """


 class BertProcessing:
-    """BertProcessing
+    """ BertProcessing
+
+    This post-processor takes care of adding the special tokens needed by
+    a Bert model:
+        - a SEP token
+        - a CLS token
     """

     @staticmethod
     def new(sep: Tuple[str, int], cls: Tuple[str, int]) -> PostProcessor:
+        """ Instantiate a new BertProcessing with the given tokens
+
+        Args:
+            sep: Tuple[str, int]:
+                A tuple with the string representation of the SEP token, and its id
+
+            cls: Tuple[str, int]:
+                A tuple with the string representation of the CLS token, and its id
+
+        Returns:
+            PostProcessor
+        """
         pass
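A usage sketch for BertProcessing.new; the token ids 101 and 102 are the usual bert-base-uncased values and are only illustrative here:

from tokenizers import processors

# Each argument is a (token string, token id) tuple, matching the
# Tuple[str, int] annotations in the stub above.
processor = processors.BertProcessing.new(("[SEP]", 102), ("[CLS]", 101))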
@@ -1,21 +1,108 @@
 from .. import trainers
 from typing import Optional, List


 class Trainer:
-    """Trainer
+    """ Base class for all trainers
+
+    This class is not supposed to be instantiated directly. Instead, any implementation of a
+    Trainer will return an instance of this class when instantiated.
     """


 class BpeTrainer:
-    """BpeTrainer
+    """ BpeTrainer
+
+    Capable of training a BPE model
     """

     @staticmethod
-    def new() -> Trainer:
+    def new(vocab_size: int=30000,
+            min_frequency: int=0,
+            show_progress: bool=True,
+            special_tokens: List[str]=[],
+            limit_alphabet: Optional[int]=None,
+            initial_alphabet: List[str]=[],
+            continuing_subword_prefix: Optional[str]=None,
+            end_of_word_suffix: Optional[str]=None) -> Trainer:
+        """ Instantiate a new BpeTrainer with the given options:
+
+        Args:
+            vocab_size: unsigned int:
+                The size of the final vocabulary, including all tokens and alphabet.
+
+            min_frequency: unsigned int:
+                The minimum frequency a pair should have in order to be merged.
+
+            show_progress: boolean:
+                Whether to show progress bars while training.
+
+            special_tokens: List[str]:
+                A list of special tokens the model should know of.
+
+            limit_alphabet: unsigned int:
+                The maximum different characters to keep in the alphabet.
+
+            initial_alphabet: List[str]:
+                A list of characters to include in the initial alphabet, even
+                if not seen in the training dataset.
+                If the strings contain more than one character, only the first one
+                is kept.
+
+            continuing_subword_prefix: Optional[str]:
+                A prefix to be used for every subword that is not a beginning-of-word.
+
+            end_of_word_suffix: Optional[str]:
+                A suffix to be used for every subword that is an end-of-word.
+
+        Returns:
+            Trainer
+        """
         pass


 class WordPieceTrainer:
-    """WordPieceTrainer
+    """ WordPieceTrainer
+
+    Capable of training a WordPiece model
     """

     @staticmethod
-    def new() -> Trainer:
+    def new(vocab_size: int=30000,
+            min_frequency: int=0,
+            show_progress: bool=True,
+            special_tokens: List[str]=[],
+            limit_alphabet: Optional[int]=None,
+            initial_alphabet: List[str]=[],
+            continuing_subword_prefix: Optional[str]="##",
+            end_of_word_suffix: Optional[str]=None) -> Trainer:
+        """ Instantiate a new WordPieceTrainer with the given options:
+
+        Args:
+            vocab_size: unsigned int:
+                The size of the final vocabulary, including all tokens and alphabet.
+
+            min_frequency: unsigned int:
+                The minimum frequency a pair should have in order to be merged.
+
+            show_progress: boolean:
+                Whether to show progress bars while training.
+
+            special_tokens: List[str]:
+                A list of special tokens the model should know of.
+
+            limit_alphabet: unsigned int:
+                The maximum different characters to keep in the alphabet.
+
+            initial_alphabet: List[str]:
+                A list of characters to include in the initial alphabet, even
+                if not seen in the training dataset.
+                If the strings contain more than one character, only the first one
+                is kept.
+
+            continuing_subword_prefix: Optional[str]:
+                A prefix to be used for every subword that is not a beginning-of-word.
+
+            end_of_word_suffix: Optional[str]:
+                A suffix to be used for every subword that is an end-of-word.
+
+        Returns:
+            Trainer
+        """
         pass
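Finally, a sketch of the two trainer constructors; every value below is illustrative, and keyword arguments are assumed from the stub defaults:

from tokenizers import trainers

# BPE trainer with a smaller vocabulary and a GPT-style end-of-word
# suffix; pairs seen fewer than 2 times are never merged.
bpe_trainer = trainers.BpeTrainer.new(
    vocab_size=10000,
    min_frequency=2,
    special_tokens=["<unk>"],
    end_of_word_suffix="</w>",
)

# WordPiece trainer; continuing_subword_prefix already defaults to "##",
# the convention used by Bert vocabularies.
wp_trainer = trainers.WordPieceTrainer.new(
    vocab_size=30000,
    special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"],
)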