Mirror of https://github.com/mii443/tokenizers.git (synced 2025-12-08 05:38:23 +00:00)
Automatically stubbing the pyi files while keeping inspecting ability (#509)
* First pass on automatic stubbing of our Python files.
* And now modifying all Rust docs to be visible in .pyi files.
* Better assert fail message.
* Fixing GitHub workflow.
* Removing types not exported anymore.
* Fixing `Tokenizer` signature.
* Disabling auto __init__.py.
* Re-enabling some types.
* Don't overwrite non-automated __init__.py.
* Automated most __init__.py.
* Restubbing after rebase.
* Fixing env for tests.
* Install black in the env.
* Use PY35 target in stub.py.

Co-authored-by: Anthony MOI <m.anthony.moi@gmail.com>
File diff suppressed because it is too large
@@ -1,65 +1,87 @@
|
||||
from typing import List
|
||||
|
||||
# Generated content DO NOT EDIT
|
||||
class Decoder:
|
||||
"""Base class for all decoders
|
||||
"""
|
||||
Base class for all decoders
|
||||
|
||||
This class is not supposed to be instantiated directly. Instead, any implementation of
|
||||
a Decoder will return an instance of this class when instantiated.
|
||||
"""
|
||||
|
||||
def decode(self, tokens: List[str]) -> str:
|
||||
""" Decode the given list of string to a final string """
|
||||
pass
|
||||
|
||||
class ByteLevel(Decoder):
|
||||
""" ByteLevel Decoder """
|
||||
|
||||
def __init__(self) -> None:
|
||||
""" Instantiate a new ByteLevel Decoder """
|
||||
pass
|
||||
|
||||
class WordPiece(Decoder):
|
||||
""" WordPiece Decoder """
|
||||
|
||||
@staticmethod
|
||||
def __init__(self, prefix: str = "##", cleanup: bool = True) -> Decoder:
|
||||
"""Instantiate a new WordPiece Decoder
|
||||
|
||||
Args:
|
||||
prefix: str:
|
||||
The prefix to use for subwords that are not a beginning-of-word
|
||||
cleanup: bool:
|
||||
Whether to clean up some tokenization artifacts. Mainly spaces before punctuation,
|
||||
and some abbreviated English forms.
|
||||
def decode(self, tokens):
|
||||
"""
|
||||
pass
|
||||
|
||||
class Metaspace(Decoder):
|
||||
""" Metaspace decoder """
|
||||
|
||||
def __init__(self, replacement: str = "▁", add_prefix_space: bool = True) -> None:
|
||||
"""Instantiate a new Metaspace
|
||||
|
||||
Args:
|
||||
replacement: str:
|
||||
The replacement character. Must be exactly one character. By default we
|
||||
use the `▁` (U+2581) meta symbol (Same as in SentencePiece).
|
||||
|
||||
add_prefix_space: boolean:
|
||||
Whether to add a space to the first word if there isn't already one. This
|
||||
lets us treat `hello` exactly like `say hello`.
|
||||
Decode the given list of string to a final string
|
||||
"""
|
||||
pass
|
||||
|
||||
class BPEDecoder(Decoder):
|
||||
""" BPEDecoder """
|
||||
"""
|
||||
Instantiate a new BPEDecoder
|
||||
|
||||
def __init__(self, suffix: str = "</w>") -> None:
|
||||
"""Instantiate a new BPEDecoder
|
||||
Args:
|
||||
suffix: str:
|
||||
The suffix that was used to characterize an end-of-word. This suffix will
|
||||
be replaced by whitespaces during the decoding
|
||||
"""
|
||||
|
||||
Args:
|
||||
suffix: str:
|
||||
The suffix that was used to characterize an end-of-word. This suffix will
|
||||
be replaced by whitespaces during the decoding
|
||||
def __init__(self, suffix="</w>"):
|
||||
pass
|
||||
def decode(self, tokens):
|
||||
"""
|
||||
Decode the given list of string to a final string
|
||||
"""
|
||||
pass
|
||||
|
||||
class ByteLevel(Decoder):
|
||||
"""
|
||||
ByteLevel Decoder
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
pass
|
||||
def decode(self, tokens):
|
||||
"""
|
||||
Decode the given list of string to a final string
|
||||
"""
|
||||
pass
|
||||
|
||||
class Metaspace(Decoder):
|
||||
"""
|
||||
Instantiate a new Metaspace
|
||||
|
||||
Args:
|
||||
replacement: str:
|
||||
The replacement character. Must be exactly one character. By default we
|
||||
use the `▁` (U+2581) meta symbol (Same as in SentencePiece).
|
||||
|
||||
add_prefix_space: boolean:
|
||||
Whether to add a space to the first word if there isn't already one. This
|
||||
lets us treat `hello` exactly like `say hello`.
|
||||
"""
|
||||
|
||||
def __init__(self, replacement="▁", add_prefix_space=True):
|
||||
pass
|
||||
def decode(self, tokens):
|
||||
"""
|
||||
Decode the given list of string to a final string
|
||||
"""
|
||||
pass
|
||||
|
||||
class WordPiece(Decoder):
|
||||
"""
|
||||
Instantiate a new WordPiece Decoder
|
||||
|
||||
Args:
|
||||
prefix: str:
|
||||
The prefix to use for subwords that are not a beginning-of-word
|
||||
cleanup: bool:
|
||||
Whether to clean up some tokenization artifacts. Mainly spaces before punctuation,
|
||||
and some abbreviated English forms.
|
||||
"""
|
||||
|
||||
def __init__(self, prefix="##", cleanup=True):
|
||||
pass
|
||||
def decode(self, tokens):
|
||||
"""
|
||||
Decode the given list of string to a final string
|
||||
"""
|
||||
pass
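
As a quick, hedged illustration of the decoder interface stubbed above (the token list and expected output are illustrative assumptions, not part of this diff), a WordPiece decoder merges "##"-prefixed sub-tokens back into words:

# Illustrative sketch only: assumes the standard `tokenizers.decoders` import path.
from tokenizers.decoders import WordPiece

decoder = WordPiece(prefix="##", cleanup=True)
# Sub-tokens carrying the "##" continuation prefix are merged back into one word.
print(decoder.decode(["un", "##aff", "##able"]))  # expected: "unaffable"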
|
||||
|
||||
@@ -1,9 +1,8 @@
|
||||
from typing import List, Tuple
|
||||
|
||||
from .. import models, Offsets
|
||||
# Generated content DO NOT EDIT
|
||||
from .. import models
|
||||
|
||||
Model = models.Model
|
||||
BPE = models.BPE
|
||||
WordPiece = models.WordPiece
|
||||
WordLevel = models.WordLevel
|
||||
Unigram = models.Unigram
|
||||
WordLevel = models.WordLevel
|
||||
WordPiece = models.WordPiece
|
||||
|
||||
@@ -1,34 +1,37 @@
|
||||
from .. import Encoding, Offsets, Token
|
||||
from typing import List, Optional, Union, Tuple, Dict
|
||||
|
||||
# Generated content DO NOT EDIT
|
||||
class Model:
|
||||
"""Base class for all models
|
||||
|
||||
This class is not supposed to be instantiated directly. Instead, any implementation of
|
||||
a Model will return an instance of this class when instantiated.
|
||||
"""
|
||||
A Model represents some tokenization algorithm like BPE or Word
|
||||
This class cannot be constructed directly. Please use one of the concrete models.
|
||||
"""
|
||||
|
||||
def tokenize(self, sequence: str) -> List[Token]:
|
||||
""" Tokenize the given sequence """
|
||||
def id_to_token(self, id):
|
||||
"""
|
||||
Returns the token associated with the given id
|
||||
"""
|
||||
pass
|
||||
def token_to_id(self, token: str) -> Optional[int]:
|
||||
""" Returns the id associated with the given token """
|
||||
pass
|
||||
def id_to_token(self, id: int) -> Optional[str]:
|
||||
""" Returns the token associated with the given id """
|
||||
pass
|
||||
def save(self, folder: str, name: Optional[str] = None) -> List[str]:
|
||||
"""Save the current model
|
||||
def save(self, folder, name):
|
||||
"""
|
||||
Save the current model
|
||||
|
||||
Save the current model in the given folder, using the given name for the various
|
||||
files that will get created.
|
||||
Any file with the same name that already exists in this folder will be overwritten.
|
||||
"""
|
||||
pass
|
||||
def token_to_id(self, tokens):
|
||||
"""
|
||||
Returns the id associated with the given token
|
||||
"""
|
||||
pass
|
||||
def tokenize(self, tokens):
|
||||
"""
|
||||
Tokenize the given sequence
|
||||
"""
|
||||
pass
|
||||
|
||||
class BPE(Model):
|
||||
"""BytePairEncoding model class
|
||||
|
||||
"""
|
||||
Instantiate a BPE Model from the given vocab and merges.
|
||||
|
||||
Args:
|
||||
@@ -61,21 +64,18 @@ class BPE(Model):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
vocab: Optional[Union[str, Dict[str, int]]],
|
||||
merges: Optional[Union[str, List[Tuple[str, str]]]],
|
||||
cache_capacity: Optional[int],
|
||||
dropout: Optional[float],
|
||||
unk_token: Optional[str],
|
||||
continuing_subword_prefix: Optional[str],
|
||||
end_of_word_suffix: Optional[str],
|
||||
fuse_unk: Optional[bool],
|
||||
vocab=None,
|
||||
merges=None,
|
||||
cache_capacity=None,
|
||||
dropout=None,
|
||||
unk_token=None,
|
||||
continuing_subword_prefix=None,
|
||||
end_of_word_suffix=None,
|
||||
fuse_unk=None,
|
||||
):
|
||||
pass
|
||||
@staticmethod
|
||||
def read_file(vocab_filename: str, merges_filename: str) -> Tuple[Vocab, Merges]:
|
||||
pass
|
||||
@staticmethod
|
||||
def from_file(vocab_filename: str, merges_filename: str, **kwargs) -> BPE:
|
||||
def from_file(vocab_filename, merge_filename, **kwargs):
|
||||
"""
|
||||
Convenient method to initialize a BPE from files
|
||||
Roughly equivalent to
|
||||
@@ -85,42 +85,73 @@ class BPE(Model):
|
||||
return BPE(vocab, merges, **kwargs)
|
||||
"""
|
||||
pass
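
A hedged sketch of the `from_file` convenience documented above; the file names and `unk_token` value are placeholders, not taken from the diff:

from tokenizers.models import BPE

# Load a BPE model from vocab/merges files (hypothetical paths).
bpe = BPE.from_file("vocab.json", "merges.txt", unk_token="<unk>")
# Per the docstring, this is roughly equivalent to:
#   vocab, merges = BPE.read_file("vocab.json", "merges.txt")
#   bpe = BPE(vocab, merges, unk_token="<unk>")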
|
||||
def id_to_token(self, id):
|
||||
"""
|
||||
Returns the token associated with the given id
|
||||
"""
|
||||
pass
|
||||
@staticmethod
|
||||
def read_file(self, vocab_filename, merges_filename):
|
||||
"""
|
||||
Read a vocab_filename and merge_filename and stores result in memory
|
||||
"""
|
||||
pass
|
||||
def save(self, folder, name):
|
||||
"""
|
||||
Save the current model
|
||||
|
||||
class WordPiece(Model):
|
||||
"""WordPiece model class
|
||||
Save the current model in the given folder, using the given name for the various
|
||||
files that will get created.
|
||||
Any file with the same name that already exists in this folder will be overwritten.
|
||||
"""
|
||||
pass
|
||||
def token_to_id(self, tokens):
|
||||
"""
|
||||
Returns the id associated with the given token
|
||||
"""
|
||||
pass
|
||||
def tokenize(self, tokens):
|
||||
"""
|
||||
Tokenize the given sequence
|
||||
"""
|
||||
pass
|
||||
|
||||
Instantiate a WordPiece Model from the given vocab file.
|
||||
class Unigram(Model):
|
||||
"""
|
||||
UnigramEncoding model class
|
||||
|
||||
Args:
|
||||
vocab: (`optional`) string:
|
||||
A dictionary of string keys and their ids {"am": 0,...}
|
||||
Instantiate a Unigram Model from the given model file.
|
||||
|
||||
unk_token: (`optional`) str:
|
||||
The unknown token to be used by the model.
|
||||
Args:
|
||||
vocab: (`optional`) string:
|
||||
A list of vocabulary items and their relative score [("am", -0.2442),...]
|
||||
|
||||
max_input_chars_per_word: (`optional`) int:
|
||||
The maximum number of characters to authorize in a single word.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
vocab: Optional[Union[str, Dict[str, int]]],
|
||||
unk_token: Optional[str],
|
||||
max_input_chars_per_word: Optional[int],
|
||||
):
|
||||
def __init__(self, vocab):
|
||||
pass
|
||||
@staticmethod
|
||||
def read_file(vocab_filename: str) -> Vocab:
|
||||
pass
|
||||
@staticmethod
|
||||
def from_file(vocab_filename: str, **kwargs) -> WordPiece:
|
||||
def id_to_token(self, id):
|
||||
"""
|
||||
Convenient method to initialize a WordPiece from file
|
||||
Roughly equivalent to
|
||||
Returns the token associated with the given id
|
||||
"""
|
||||
pass
|
||||
def save(self, folder, name):
|
||||
"""
|
||||
Save the current model
|
||||
|
||||
def from_file(vocab_filename, **kwargs):
|
||||
vocab, merges = WordPiece.read_file(vocab_filename)
|
||||
return WordPiece(vocab, **kwargs)
|
||||
Save the current model in the given folder, using the given name for the various
|
||||
files that will get created.
|
||||
Any file with the same name that already exists in this folder will be overwritten.
|
||||
"""
|
||||
pass
|
||||
def token_to_id(self, tokens):
|
||||
"""
|
||||
Returns the id associated with the given token
|
||||
"""
|
||||
pass
|
||||
def tokenize(self, tokens):
|
||||
"""
|
||||
Tokenize the given sequence
|
||||
"""
|
||||
pass
|
||||
|
||||
@@ -138,34 +169,89 @@ class WordLevel(Model):
|
||||
The unknown token to be used by the model.
|
||||
"""
|
||||
|
||||
def __init__(self, vocab: Optional[Union[str, Dict[str, int]]], unk_token: Optional[str]):
|
||||
def __init__(self, vocab, unk_token):
|
||||
pass
|
||||
@staticmethod
|
||||
def read_file(vocab_filename: str) -> Vocab:
|
||||
pass
|
||||
@staticmethod
|
||||
def from_file(vocab_filename: str, **kwargs) -> WordLevel:
|
||||
def id_to_token(self, id):
|
||||
"""
|
||||
Convenient method to initialize a WordLevel from file
|
||||
Returns the token associated with the given id
|
||||
"""
|
||||
pass
|
||||
def save(self, folder, name):
|
||||
"""
|
||||
Save the current model
|
||||
|
||||
Save the current model in the given folder, using the given name for the various
|
||||
files that will get created.
|
||||
Any file with the same name that already exists in this folder will be overwritten.
|
||||
"""
|
||||
pass
|
||||
def token_to_id(self, tokens):
|
||||
"""
|
||||
Returns the id associated with the given token
|
||||
"""
|
||||
pass
|
||||
def tokenize(self, tokens):
|
||||
"""
|
||||
Tokenize the given sequence
|
||||
"""
|
||||
pass
|
||||
|
||||
class WordPiece(Model):
|
||||
"""
|
||||
WordPiece model
|
||||
Instantiate a WordPiece Model from the given vocab file.
|
||||
|
||||
Args:
|
||||
vocab: (`optional`) string:
|
||||
A dictionary of string keys and their ids {"am": 0,...}
|
||||
|
||||
unk_token: (`optional`) str:
|
||||
The unknown token to be used by the model.
|
||||
|
||||
max_input_chars_per_word: (`optional`) int:
|
||||
The maximum number of characters to authorize in a single word.
|
||||
"""
|
||||
|
||||
def __init__(self, vocab, unk_token, max_input_chars_per_word):
|
||||
pass
|
||||
@staticmethod
|
||||
def from_file(vocab_filename, merge_filename, **kwargs):
|
||||
"""
|
||||
Convenient method to initialize a WordPiece from files
|
||||
Roughly equivalent to
|
||||
|
||||
def from_file(vocab_filename, **kwargs):
|
||||
vocab, merges = WordLevelg.read_file(vocab_filename)
|
||||
return WordLevelg(vocab, **kwargs)
|
||||
vocab = WordPiece.read_file(vocab_filename)
|
||||
return WordPiece(vocab, **kwargs)
|
||||
"""
|
||||
pass
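
Similarly, a hedged sketch of `WordPiece.from_file`; the file name and `unk_token` are illustrative placeholders:

from tokenizers.models import WordPiece

wp = WordPiece.from_file("vocab.txt", unk_token="[UNK]")
# Per the docstring, roughly equivalent to:
#   vocab = WordPiece.read_file("vocab.txt")
#   wp = WordPiece(vocab, unk_token="[UNK]")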
|
||||
|
||||
class Unigram(Model):
|
||||
"""UnigramEncoding model class
|
||||
|
||||
Instantiate a Unigram Model from the given model file.
|
||||
|
||||
Args:
|
||||
vocab: (`optional`) string:
|
||||
A list of vocabulary items and their relative score [("am", -0.2442),...]
|
||||
|
||||
"""
|
||||
|
||||
@staticmethod
|
||||
def __init__(self, vocab: Optional[List[Tuple[str, float]]]):
|
||||
def id_to_token(self, id):
|
||||
"""
|
||||
Returns the token associated with the given id
|
||||
"""
|
||||
pass
|
||||
@staticmethod
|
||||
def read_file(vocab_filename):
|
||||
"""
|
||||
Read a vocab_filename and stores result in memory
|
||||
"""
|
||||
pass
|
||||
def save(self, folder, name):
|
||||
"""
|
||||
Save the current model
|
||||
|
||||
Save the current model in the given folder, using the given name for the various
|
||||
files that will get created.
|
||||
Any file with the same name that already exists in this folder will be overwritten.
|
||||
"""
|
||||
pass
|
||||
def token_to_id(self, tokens):
|
||||
"""
|
||||
Returns the id associated with the given token
|
||||
"""
|
||||
pass
|
||||
def tokenize(self, tokens):
|
||||
"""
|
||||
Tokenize the given sequence
|
||||
"""
|
||||
pass
|
||||
|
||||
@@ -1,140 +1,258 @@
|
||||
from .. import NormalizedString
|
||||
from typing import Optional, List
|
||||
|
||||
# Generated content DO NOT EDIT
|
||||
class Normalizer:
|
||||
"""Base class for all normalizers
|
||||
"""
|
||||
Base class for all normalizers
|
||||
|
||||
This class is not supposed to be instantiated directly. Instead, any implementation of a
|
||||
Normalizer will return an instance of this class when instantiated.
|
||||
"""
|
||||
|
||||
def normalize(self, normalized: NormalizedString):
|
||||
""" Normalize the given NormalizedString in-place """
|
||||
def normalize(self, normalized):
|
||||
"""
|
||||
Normalize the given NormalizedString in-place
|
||||
"""
|
||||
pass
|
||||
def normalize_str(self, sequence: str) -> str:
|
||||
""" Normalize the given str """
|
||||
pass
|
||||
|
||||
class BertNormalizer(Normalizer):
|
||||
"""BertNormalizer
|
||||
|
||||
Takes care of normalizing raw text before giving it to a Bert model.
|
||||
This includes cleaning the text, handling accents, Chinese chars and lowercasing
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
clean_text: Optional[bool] = True,
|
||||
handle_chinese_chars: Optional[bool] = True,
|
||||
strip_accents: Optional[bool] = None,
|
||||
lowercase: Optional[bool] = True,
|
||||
) -> None:
|
||||
"""Instantiate a BertNormalizer with the given options.
|
||||
|
||||
Args:
|
||||
clean_text: (`optional`) boolean:
|
||||
Whether to clean the text, by removing any control characters
|
||||
and replacing all whitespaces by the classic one.
|
||||
|
||||
handle_chinese_chars: (`optional`) boolean:
|
||||
Whether to handle Chinese chars by putting spaces around them.
|
||||
|
||||
strip_accents: (`optional`) boolean:
|
||||
Whether to strip all accents. If this option is not specified (ie == None),
|
||||
then it will be determined by the value for `lowercase` (as in the original Bert).
|
||||
|
||||
lowercase: (`optional`) boolean:
|
||||
Whether to lowercase.
|
||||
|
||||
Returns:
|
||||
Normalizer
|
||||
def normalize_str(self, sequence):
|
||||
"""
|
||||
Normalize the given str
|
||||
"""
|
||||
pass
|
||||
|
||||
class NFD(Normalizer):
|
||||
""" NFD Unicode Normalizer """
|
||||
class BertNormalizer(Normalizer):
|
||||
"""
|
||||
BertNormalizer
|
||||
|
||||
def __init__(self) -> None:
|
||||
""" Instantiate a new NFD Normalizer """
|
||||
pass
|
||||
Takes care of normalizing raw text before giving it to a Bert model.
|
||||
This includes cleaning the text, handling accents, Chinese chars and lowercasing
|
||||
|
||||
class NFKD(Normalizer):
|
||||
""" NFKD Unicode Normalizer """
|
||||
Args:
|
||||
clean_text: (`optional`) boolean:
|
||||
Whether to clean the text, by removing any control characters
|
||||
and replacing all whitespaces by the classic one.
|
||||
|
||||
def __init__(self) -> None:
|
||||
""" Instantiate a new NFKD Normalizer """
|
||||
pass
|
||||
handle_chinese_chars: (`optional`) boolean:
|
||||
Whether to handle Chinese chars by putting spaces around them.
|
||||
|
||||
class NFC(Normalizer):
|
||||
""" NFC Unicode Normalizer """
|
||||
strip_accents: (`optional`) boolean:
|
||||
Whether to strip all accents. If this option is not specified (ie == None),
|
||||
then it will be determined by the value for `lowercase` (as in the original Bert).
|
||||
|
||||
def __init__(self) -> None:
|
||||
""" Instantiate a new NFC Normalizer """
|
||||
pass
|
||||
lowercase: (`optional`) boolean:
|
||||
Whether to lowercase.
|
||||
|
||||
class NFKC(Normalizer):
|
||||
""" NFKC Unicode Normalizer """
|
||||
|
||||
def __init__(self) -> None:
|
||||
""" Instantiate a new NFKC Normalizer """
|
||||
pass
|
||||
|
||||
class Sequence(Normalizer):
|
||||
"""Allows concatenating multiple other Normalizer as a Sequence.
|
||||
|
||||
All the normalizers run in sequence in the given order
|
||||
Returns:
|
||||
Normalizer
|
||||
"""
|
||||
|
||||
def __init__(self, normalizers: List[Normalizer]) -> None:
|
||||
"""Instantiate a new normalization Sequence using the given normalizers
|
||||
|
||||
Args:
|
||||
normalizers: List[Normalizer]:
|
||||
A list of Normalizer to be run as a sequence
|
||||
def __init__(
|
||||
self, clean_text=True, handle_chinese_chars=True, strip_accents=None, lowercase=True
|
||||
):
|
||||
pass
|
||||
def normalize(self, normalized):
|
||||
"""
|
||||
Normalize the given NormalizedString in-place
|
||||
"""
|
||||
pass
|
||||
def normalize_str(self, sequence):
|
||||
"""
|
||||
Normalize the given str
|
||||
"""
|
||||
pass
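
A minimal, hedged sketch of the BertNormalizer options listed above (the input string is illustrative):

from tokenizers.normalizers import BertNormalizer

norm = BertNormalizer(clean_text=True, handle_chinese_chars=True,
                      strip_accents=None, lowercase=True)
# With strip_accents left as None, accent stripping follows `lowercase`,
# as in the original Bert.
print(norm.normalize_str("Héllo World"))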
|
||||
|
||||
class Lowercase(Normalizer):
|
||||
""" Lowercase Normalizer """
|
||||
"""
|
||||
Lowercase Normalizer
|
||||
"""
|
||||
|
||||
def __init__(self) -> None:
|
||||
""" Instantiate a new Lowercase Normalizer """
|
||||
def __init__(self):
|
||||
pass
|
||||
def normalize(self, normalized):
|
||||
"""
|
||||
Normalize the given NormalizedString in-place
|
||||
"""
|
||||
pass
|
||||
def normalize_str(self, sequence):
|
||||
"""
|
||||
Normalize the given str
|
||||
"""
|
||||
pass
|
||||
|
||||
class Strip(Normalizer):
|
||||
""" Strip normalizer """
|
||||
class NFC(Normalizer):
|
||||
"""
|
||||
NFC Unicode Normalizer
|
||||
"""
|
||||
|
||||
def __init__(self, left: bool = True, right: bool = True) -> Normalizer:
|
||||
def __init__(self):
|
||||
pass
|
||||
def normalize(self, normalized):
|
||||
"""
|
||||
Normalize the given NormalizedString in-place
|
||||
"""
|
||||
pass
|
||||
def normalize_str(self, sequence):
|
||||
"""
|
||||
Normalize the given str
|
||||
"""
|
||||
pass
|
||||
|
||||
class StripAccents(Normalizer):
|
||||
""" StripAccents normalizer """
|
||||
class NFD(Normalizer):
|
||||
"""
|
||||
NFD Unicode Normalizer
|
||||
"""
|
||||
|
||||
def __init__(self) -> Normalizer:
|
||||
def __init__(self):
|
||||
pass
|
||||
def normalize(self, normalized):
|
||||
"""
|
||||
Normalize the given NormalizedString in-place
|
||||
"""
|
||||
pass
|
||||
def normalize_str(self, sequence):
|
||||
"""
|
||||
Normalize the given str
|
||||
"""
|
||||
pass
|
||||
|
||||
class NFKC(Normalizer):
|
||||
"""
|
||||
NFKC Unicode Normalizer
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
pass
|
||||
def normalize(self, normalized):
|
||||
"""
|
||||
Normalize the given NormalizedString in-place
|
||||
"""
|
||||
pass
|
||||
def normalize_str(self, sequence):
|
||||
"""
|
||||
Normalize the given str
|
||||
"""
|
||||
pass
|
||||
|
||||
class NFKD(Normalizer):
|
||||
"""
|
||||
NFKD Unicode Normalizer
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
pass
|
||||
def normalize(self, normalized):
|
||||
"""
|
||||
Normalize the given NormalizedString in-place
|
||||
"""
|
||||
pass
|
||||
def normalize_str(self, sequence):
|
||||
"""
|
||||
Normalize the given str
|
||||
"""
|
||||
pass
|
||||
|
||||
class Nmt(Normalizer):
|
||||
""" Nmt normalizer """
|
||||
"""
|
||||
Nmt normalizer
|
||||
"""
|
||||
|
||||
def __init__(self) -> Normalizer:
|
||||
def __init__(self):
|
||||
pass
|
||||
def normalize(self, normalized):
|
||||
"""
|
||||
Normalize the given NormalizedString in-place
|
||||
"""
|
||||
pass
|
||||
def normalize_str(self, sequence):
|
||||
"""
|
||||
Normalize the given str
|
||||
"""
|
||||
pass
|
||||
|
||||
class Precompiled(Normalizer):
|
||||
""" Precompiled normalizer """
|
||||
"""
|
||||
Precompiled normalizer
|
||||
Don't use manually; it is used for compatibility with SentencePiece.
|
||||
"""
|
||||
|
||||
def __init__(self, precompiled_charsmap: bytes) -> Normalizer:
|
||||
def __init__(self, precompiled_charsmap):
|
||||
pass
|
||||
def normalize(self, normalized):
|
||||
"""
|
||||
Normalize the given NormalizedString in-place
|
||||
"""
|
||||
pass
|
||||
def normalize_str(self, sequence):
|
||||
"""
|
||||
Normalize the given str
|
||||
"""
|
||||
pass
|
||||
|
||||
class Replace(Normalizer):
|
||||
""" Replace normalizer """
|
||||
"""
|
||||
Replace normalizer
|
||||
"""
|
||||
|
||||
def __init__(self, pattern: str, content: str) -> Normalizer:
|
||||
def __init__(self, pattern, content):
|
||||
pass
|
||||
def normalize(self, normalized):
|
||||
"""
|
||||
Normalize the given NormalizedString in-place
|
||||
"""
|
||||
pass
|
||||
def normalize_str(self, sequence):
|
||||
"""
|
||||
Normalize the given str
|
||||
"""
|
||||
pass
|
||||
|
||||
def unicode_normalizer_from_str(normalizer: str) -> Normalizer:
|
||||
class Sequence(Normalizer):
|
||||
"""
|
||||
Instantiate a unicode normalizer from the normalizer name
|
||||
:param normalizer: Name of the normalizer
|
||||
:return:
|
||||
Allows concatenating multiple other Normalizer as a Sequence.
|
||||
All the normalizers run in sequence in the given order
|
||||
|
||||
Args:
|
||||
normalizers: List[Normalizer]:
|
||||
A list of Normalizer to be run as a sequence
|
||||
"""
|
||||
pass
|
||||
|
||||
def normalize(self, normalized):
|
||||
"""
|
||||
Normalize the given NormalizedString in-place
|
||||
"""
|
||||
pass
|
||||
def normalize_str(self, sequence):
|
||||
"""
|
||||
Normalize the given str
|
||||
"""
|
||||
pass
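
A short, hedged sketch of composing normalizers with `Sequence`, as described above:

from tokenizers.normalizers import NFD, StripAccents, Lowercase, Sequence

norm = Sequence([NFD(), StripAccents(), Lowercase()])
# NFD decomposition, then accent stripping, then lowercasing, run in order.
print(norm.normalize_str("Héllo"))  # expected: "hello"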
|
||||
|
||||
class Strip(Normalizer):
|
||||
"""
|
||||
Strip normalizer
|
||||
"""
|
||||
|
||||
def __init__(self, left=True, right=True):
|
||||
pass
|
||||
def normalize(self, normalized):
|
||||
"""
|
||||
Normalize the given NormalizedString in-place
|
||||
"""
|
||||
pass
|
||||
def normalize_str(self, sequence):
|
||||
"""
|
||||
Normalize the given str
|
||||
"""
|
||||
pass
|
||||
|
||||
class StripAccents(Normalizer):
|
||||
def __init__(self):
|
||||
pass
|
||||
def normalize(self, normalized):
|
||||
"""
|
||||
Normalize the given NormalizedString in-place
|
||||
"""
|
||||
pass
|
||||
def normalize_str(self, sequence):
|
||||
"""
|
||||
Normalize the given str
|
||||
"""
|
||||
pass
|
||||
|
||||
@@ -1,13 +1,14 @@
|
||||
# Generated content DO NOT EDIT
|
||||
from .. import pre_tokenizers
|
||||
|
||||
PreTokenizer = pre_tokenizers.PreTokenizer
|
||||
ByteLevel = pre_tokenizers.ByteLevel
|
||||
Whitespace = pre_tokenizers.Whitespace
|
||||
Punctuation = pre_tokenizers.Punctuation
|
||||
Sequence = pre_tokenizers.Sequence
|
||||
WhitespaceSplit = pre_tokenizers.WhitespaceSplit
|
||||
BertPreTokenizer = pre_tokenizers.BertPreTokenizer
|
||||
Metaspace = pre_tokenizers.Metaspace
|
||||
ByteLevel = pre_tokenizers.ByteLevel
|
||||
CharDelimiterSplit = pre_tokenizers.CharDelimiterSplit
|
||||
Digits = pre_tokenizers.Digits
|
||||
Metaspace = pre_tokenizers.Metaspace
|
||||
Punctuation = pre_tokenizers.Punctuation
|
||||
Sequence = pre_tokenizers.Sequence
|
||||
UnicodeScripts = pre_tokenizers.UnicodeScripts
|
||||
Whitespace = pre_tokenizers.Whitespace
|
||||
WhitespaceSplit = pre_tokenizers.WhitespaceSplit
|
||||
|
||||
@@ -1,163 +1,242 @@
|
||||
from .. import PreTokenizedString
|
||||
from typing import Optional, List, Tuple
|
||||
|
||||
Offsets = Tuple[int, int]
|
||||
|
||||
# Generated content DO NOT EDIT
|
||||
class PreTokenizer:
|
||||
"""Base class for all pre-tokenizers
|
||||
"""
|
||||
Base class for all pre-tokenizers
|
||||
|
||||
This class is not supposed to be instantiated directly. Instead, any implementation of a
|
||||
PreTokenizer will return an instance of this class when instantiated.
|
||||
"""
|
||||
|
||||
def pre_tokenize(self, pretokenized: PreTokenizedString):
|
||||
""" Pre tokenize the given PreTokenizedString in-place """
|
||||
def pre_tokenize(self, pretok):
|
||||
"""
|
||||
Pre tokenize the given PreTokenizedString in-place
|
||||
"""
|
||||
pass
|
||||
def pre_tokenize_str(self, sequence: str) -> List[Tuple[str, Offsets]]:
|
||||
""" Pre tokenize the given sequence """
|
||||
def pre_tokenize_str(self, sequence):
|
||||
"""
|
||||
Pre tokenize the given sequence
|
||||
"""
|
||||
pass
|
||||
|
||||
class BertPreTokenizer(PreTokenizer):
|
||||
"""
|
||||
BertPreTokenizer
|
||||
|
||||
This pre-tokenizer splits tokens on spaces, and also on punctuation.
|
||||
Each occurrence of a punctuation character will be treated separately.
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
pass
|
||||
def pre_tokenize(self, pretok):
|
||||
"""
|
||||
Pre tokenize the given PreTokenizedString in-place
|
||||
"""
|
||||
pass
|
||||
def pre_tokenize_str(self, sequence):
|
||||
"""
|
||||
Pre tokenize the given sequence
|
||||
"""
|
||||
pass
|
||||
|
||||
class ByteLevel(PreTokenizer):
|
||||
"""ByteLevel PreTokenizer
|
||||
"""
|
||||
ByteLevel PreTokenizer
|
||||
|
||||
This pre-tokenizer takes care of replacing all bytes of the given string
|
||||
with a corresponding representation, as well as splitting into words.
|
||||
|
||||
Args:
|
||||
add_prefix_space: (`optional`) boolean:
|
||||
Whether to add a space to the first word if there isn't already one. This
|
||||
lets us treat `hello` exactly like `say hello`.
|
||||
Returns:
|
||||
PreTokenizer
|
||||
"""
|
||||
|
||||
def __init__(self, add_prefix_space: bool = True) -> None:
|
||||
"""Instantiate a new ByteLevel PreTokenizer
|
||||
Args:
|
||||
add_prefix_space: (`optional`) boolean:
|
||||
Whether to add a space to the first word if there isn't already one. This
|
||||
lets us treat `hello` exactly like `say hello`.
|
||||
Returns:
|
||||
PreTokenizer
|
||||
"""
|
||||
def __init__(self, add_prefix_space=True):
|
||||
pass
|
||||
@staticmethod
|
||||
def alphabet() -> List[str]:
|
||||
"""Returns the alphabet used by this PreTokenizer.
|
||||
def alphabet():
|
||||
"""
|
||||
Returns the alphabet used by this PreTokenizer.
|
||||
|
||||
Since the ByteLevel works as its name suggests, at the byte level, it
|
||||
encodes any byte to one visible character. This means that there is a
|
||||
total of 256 different characters composing this alphabet.
|
||||
"""
|
||||
pass
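
The docstring above states that the byte-level alphabet maps every byte to one visible character, so it should contain 256 entries; a quick sketch to check:

from tokenizers.pre_tokenizers import ByteLevel

alphabet = ByteLevel.alphabet()
print(len(alphabet))  # expected: 256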
|
||||
|
||||
class Whitespace(PreTokenizer):
|
||||
"""Whitespace PreTokenizer
|
||||
|
||||
This pre-tokenizer simply splits using the following regex: `\w+|[^\w\s]+`
|
||||
"""
|
||||
|
||||
def __init__(self) -> None:
|
||||
""" Instantiate a new Whitespace PreTokenizer """
|
||||
def pre_tokenize(self, pretok):
|
||||
"""
|
||||
Pre tokenize the given PreTokenizedString in-place
|
||||
"""
|
||||
pass
|
||||
|
||||
class WhitespaceSplit(PreTokenizer):
|
||||
"""Whitespace PreTokenizer
|
||||
|
||||
This pre-tokenizer simply splits on the whitespace. Works like `.split()`
|
||||
"""
|
||||
|
||||
def __init__(self) -> None:
|
||||
""" Instantiate a new WhitespaceSplit PreTokenizer """
|
||||
pass
|
||||
|
||||
class BertPreTokenizer(PreTokenizer):
|
||||
"""BertPreTokenizer
|
||||
|
||||
This pre-tokenizer splits tokens on spaces, and also on punctuation.
|
||||
Each occurrence of a punctuation character will be treated separately.
|
||||
"""
|
||||
|
||||
def __init__(self) -> None:
|
||||
""" Instantiate a new BertPreTokenizer """
|
||||
pass
|
||||
|
||||
class Metaspace(PreTokenizer):
|
||||
"""Metaspace pre-tokenizer
|
||||
|
||||
This pre-tokenizer replaces any whitespace by the provided replacement character.
|
||||
It then tries to split on these spaces.
|
||||
"""
|
||||
|
||||
def __init__(self, replacement: str = "▁", add_prefix_space: bool = True) -> None:
|
||||
"""Instantiate a new Metaspace
|
||||
|
||||
Args:
|
||||
replacement: str:
|
||||
The replacement character. Must be exactly one character. By default we
|
||||
use the `▁` (U+2581) meta symbol (Same as in SentencePiece).
|
||||
|
||||
add_prefix_space: boolean:
|
||||
Whether to add a space to the first word if there isn't already one. This
|
||||
lets us treat `hello` exactly like `say hello`.
|
||||
def pre_tokenize_str(self, sequence):
|
||||
"""
|
||||
Pre tokenize the given sequence
|
||||
"""
|
||||
pass
|
||||
|
||||
class CharDelimiterSplit(PreTokenizer):
|
||||
"""CharDelimiterSplit PreTokenizer
|
||||
|
||||
"""
|
||||
This pre-tokenizer simply splits on the provided char. Works like `.split(delimiter)`
|
||||
|
||||
Args:
|
||||
delimiter: str:
|
||||
The delimiter char that will be used to split input
|
||||
"""
|
||||
|
||||
@staticmethod
|
||||
def __init__(self, delimiter: str) -> None:
|
||||
"""Instantiate a new CharDelimiterSplit PreTokenizer
|
||||
def pre_tokenize(self, pretok):
|
||||
"""
|
||||
Pre tokenize the given PreTokenizedString in-place
|
||||
"""
|
||||
pass
|
||||
def pre_tokenize_str(self, sequence):
|
||||
"""
|
||||
Pre tokenize the given sequence
|
||||
"""
|
||||
pass
|
||||
|
||||
Args:
|
||||
delimiter: str:
|
||||
The delimiter char that will be used to split input
|
||||
class Digits(PreTokenizer):
|
||||
"""
|
||||
This pre-tokenizer simply splits the digits into separate tokens
|
||||
Args:
|
||||
individual_digits: bool:
|
||||
If set to True, digits will each be separated: "Call 123 please" -> "Call ", "1", "2", "3", " please"
|
||||
If set to False, digits will be grouped: "Call 123 please" -> "Call ", "123", " please"
|
||||
"""
|
||||
|
||||
def __init__(self, individual_digits=False):
|
||||
pass
|
||||
def pre_tokenize(self, pretok):
|
||||
"""
|
||||
Pre tokenize the given PreTokenizedString in-place
|
||||
"""
|
||||
pass
|
||||
def pre_tokenize_str(self, sequence):
|
||||
"""
|
||||
Pre tokenize the given sequence
|
||||
"""
|
||||
pass
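
A hedged sketch of the `Digits` behaviour documented above (offsets are returned alongside each piece but omitted from the comments):

from tokenizers.pre_tokenizers import Digits

print(Digits(individual_digits=True).pre_tokenize_str("Call 123 please"))
# expected pieces: "Call ", "1", "2", "3", " please"
print(Digits(individual_digits=False).pre_tokenize_str("Call 123 please"))
# expected pieces: "Call ", "123", " please"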
|
||||
|
||||
class Metaspace(PreTokenizer):
|
||||
"""
|
||||
Metaspace pre-tokenizer
|
||||
|
||||
This pre-tokenizer replaces any whitespace by the provided replacement character.
|
||||
It then tries to split on these spaces.
|
||||
Args:
|
||||
replacement: str:
|
||||
The replacement character. Must be exactly one character. By default we
|
||||
use the `▁` (U+2581) meta symbol (Same as in SentencePiece).
|
||||
|
||||
add_prefix_space: boolean:
|
||||
Whether to add a space to the first word if there isn't already one. This
|
||||
lets us treat `hello` exactly like `say hello`.
|
||||
"""
|
||||
|
||||
def __init__(self, replacement="▁", add_prefix_space=True):
|
||||
pass
|
||||
def pre_tokenize(self, pretok):
|
||||
"""
|
||||
Pre tokenize the given PreTokenizedString in-place
|
||||
"""
|
||||
pass
|
||||
def pre_tokenize_str(self, sequence):
|
||||
"""
|
||||
Pre tokenize the given sequence
|
||||
"""
|
||||
pass
|
||||
|
||||
class Punctuation(PreTokenizer):
|
||||
"""Punctuation PreTokenizer
|
||||
|
||||
"""
|
||||
This pre-tokenizer simply splits on punctuation as individual characters.
|
||||
"""
|
||||
|
||||
def __init__(self) -> None:
|
||||
""" Instantiate a new Punctuation PreTokenizer """
|
||||
def __init__(self):
|
||||
pass
|
||||
def pre_tokenize(self, pretok):
|
||||
"""
|
||||
Pre tokenize the given PreTokenizedString in-place
|
||||
"""
|
||||
pass
|
||||
def pre_tokenize_str(self, sequence):
|
||||
"""
|
||||
Pre tokenize the given sequence
|
||||
"""
|
||||
pass
|
||||
|
||||
class Sequence(PreTokenizer):
|
||||
"""Sequence PreTokenizer
|
||||
|
||||
This pre-tokenizer composes other pre_tokenizers and applies them in sequence
|
||||
"""
|
||||
This pre-tokenizer composes other pre_tokenizers and applies them in sequence
|
||||
"""
|
||||
|
||||
def __init__(self) -> None:
|
||||
""" Instantiate a new Sequence PreTokenizer """
|
||||
def __init__(self, pretokenizers):
|
||||
pass
|
||||
|
||||
class Digits(PreTokenizer):
|
||||
"""Digits PreTokenizer
|
||||
|
||||
This pre-tokenizer simply splits the digits into separate tokens
|
||||
"""
|
||||
|
||||
def __init__(self, individual_digits: bool) -> None:
|
||||
"""Instantiate a new Digits
|
||||
|
||||
Args:
|
||||
individual_digits: bool:
|
||||
If set to True, digits will each be separated: "Call 123 please" -> "Call ", "1", "2", "3", " please"
|
||||
If set to False, digits will be grouped: "Call 123 please" -> "Call ", "123", " please"
|
||||
|
||||
def pre_tokenize(self, pretok):
|
||||
"""
|
||||
Pre tokenize the given PreTokenizedString in-place
|
||||
"""
|
||||
pass
|
||||
def pre_tokenize_str(self, sequence):
|
||||
"""
|
||||
Pre tokenize the given sequence
|
||||
"""
|
||||
pass
|
||||
|
||||
class UnicodeScripts(PreTokenizer):
|
||||
"""UnicodeScripts PreTokenizer
|
||||
|
||||
"""
|
||||
This pre-tokenizer splits on characters that belong to different language families
|
||||
It roughly follows https://github.com/google/sentencepiece/blob/master/data/Scripts.txt
|
||||
Actually Hiragana and Katakana are fused with Han, and 0x30FC is Han too.
|
||||
This mimics the SentencePiece Unigram implementation.
|
||||
"""
|
||||
|
||||
def __init__(self) -> None:
|
||||
""" Instantiate a new UnicodeScripts """
|
||||
def __init__(self):
|
||||
pass
|
||||
def pre_tokenize(self, pretok):
|
||||
"""
|
||||
Pre tokenize the given PreTokenizedString in-place
|
||||
"""
|
||||
pass
|
||||
def pre_tokenize_str(self, sequence):
|
||||
"""
|
||||
Pre tokenize the given sequence
|
||||
"""
|
||||
pass
|
||||
|
||||
class Whitespace(PreTokenizer):
|
||||
"""
|
||||
This pre-tokenizer simply splits using the following regex: `\w+|[^\w\s]+`
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
pass
|
||||
def pre_tokenize(self, pretok):
|
||||
"""
|
||||
Pre tokenize the given PreTokenizedString in-place
|
||||
"""
|
||||
pass
|
||||
def pre_tokenize_str(self, sequence):
|
||||
"""
|
||||
Pre tokenize the given sequence
|
||||
"""
|
||||
pass
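
A hedged sketch of the `\w+|[^\w\s]+` splitting behaviour described above:

from tokenizers.pre_tokenizers import Whitespace

print(Whitespace().pre_tokenize_str("Hello, world!"))
# expected pieces: "Hello", ",", "world", "!" each with its (start, end) offsets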
|
||||
|
||||
class WhitespaceSplit(PreTokenizer):
|
||||
"""
|
||||
This pre-tokenizer simply splits on the whitespace. Works like `.split()`
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
pass
|
||||
def pre_tokenize(self, pretok):
|
||||
"""
|
||||
Pre tokenize the given PreTokenizedString in-place
|
||||
"""
|
||||
pass
|
||||
def pre_tokenize_str(self, sequence):
|
||||
"""
|
||||
Pre tokenize the given sequence
|
||||
"""
|
||||
pass
|
||||
|
||||
@@ -1,7 +1,8 @@
|
||||
# Generated content DO NOT EDIT
|
||||
from .. import processors
|
||||
|
||||
PostProcessor = processors.PostProcessor
|
||||
BertProcessing = processors.BertProcessing
|
||||
RobertaProcessing = processors.RobertaProcessing
|
||||
ByteLevel = processors.ByteLevel
|
||||
RobertaProcessing = processors.RobertaProcessing
|
||||
TemplateProcessing = processors.TemplateProcessing
|
||||
|
||||
@@ -1,53 +1,85 @@
|
||||
from .. import Encoding
|
||||
from typing import Tuple, Union, List
|
||||
|
||||
# Generated content DO NOT EDIT
|
||||
class PostProcessor:
|
||||
"""Base class for all post-processors
|
||||
"""
|
||||
Base class for all post-processors
|
||||
|
||||
This class is not supposed to be instantiated directly. Instead, any implementation of
|
||||
a PostProcessor will return an instance of this class when instantiated.
|
||||
"""
|
||||
|
||||
def num_special_tokens_to_add(self, is_pair: bool) -> int:
|
||||
def num_special_tokens_to_add(self, is_pair):
|
||||
"""
|
||||
Return the number of special tokens that would be added for single/pair sentences.
|
||||
:param is_pair: Boolean indicating if the input would be a single sentence or a pair
|
||||
:return:
|
||||
"""
|
||||
pass
|
||||
def process(
|
||||
self, encoding: Encoding, pair: Optional[Encoding] = None, add_special_tokens: bool = True
|
||||
) -> Encoding:
|
||||
""" Post-process the given encodings, generating the final one """
|
||||
def process(self, encoding, pair=None, add_special_tokens=True):
|
||||
"""
|
||||
Post-process the given encodings, generating the final one
|
||||
"""
|
||||
pass
|
||||
|
||||
class BertProcessing(PostProcessor):
|
||||
"""BertProcessing
|
||||
|
||||
"""
|
||||
This post-processor takes care of adding the special tokens needed by
|
||||
a Bert model:
|
||||
- a SEP token
|
||||
- a CLS token
|
||||
Args:
|
||||
sep: Tuple[str, int]:
|
||||
A tuple with the string representation of the SEP token, and its id
|
||||
|
||||
cls: Tuple[str, int]:
|
||||
A tuple with the string representation of the CLS token, and its id
|
||||
|
||||
Returns:
|
||||
PostProcessor
|
||||
"""
|
||||
|
||||
def __init__(self, sep: Tuple[str, int], cls: Tuple[str, int]) -> None:
|
||||
"""Instantiate a new BertProcessing with the given tokens
|
||||
def __init__(self, sep, cls):
|
||||
pass
|
||||
def num_special_tokens_to_add(self, is_pair):
|
||||
"""
|
||||
Return the number of special tokens that would be added for single/pair sentences.
|
||||
:param is_pair: Boolean indicating if the input would be a single sentence or a pair
|
||||
:return:
|
||||
"""
|
||||
pass
|
||||
def process(self, encoding, pair=None, add_special_tokens=True):
|
||||
"""
|
||||
Post-process the given encodings, generating the final one
|
||||
"""
|
||||
pass
|
||||
|
||||
Args:
|
||||
sep: Tuple[str, int]:
|
||||
A tuple with the string representation of the SEP token, and its id
|
||||
class ByteLevel(PostProcessor):
|
||||
"""
|
||||
This post-processor takes care of trimming the offsets.
|
||||
By default, the ByteLevel BPE might include whitespaces in the produced tokens. If you don't
|
||||
want the offsets to include these whitespaces, then this PostProcessor must be used.
|
||||
|
||||
cls: Tuple[str, int]:
|
||||
A tuple with the string representation of the CLS token, and its id
|
||||
Args:
|
||||
trim_offsets: bool:
|
||||
Whether to trim the whitespaces from the produced offsets.
|
||||
"""
|
||||
|
||||
Returns:
|
||||
PostProcessor
|
||||
def __init__(self, trim_offsets=True):
|
||||
pass
|
||||
def num_special_tokens_to_add(self, is_pair):
|
||||
"""
|
||||
Return the number of special tokens that would be added for single/pair sentences.
|
||||
:param is_pair: Boolean indicating if the input would be a single sentence or a pair
|
||||
:return:
|
||||
"""
|
||||
pass
|
||||
def process(self, encoding, pair=None, add_special_tokens=True):
|
||||
"""
|
||||
Post-process the given encodings, generating the final one
|
||||
"""
|
||||
pass
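
A hedged sketch of the two post-processors above; the [CLS]/[SEP] ids are illustrative, not taken from this diff:

from tokenizers.processors import BertProcessing, ByteLevel

# Adds the Bert special tokens around encoded sequences.
bert_post = BertProcessing(("[SEP]", 102), ("[CLS]", 101))
# Trims whitespace out of the offsets produced by a byte-level BPE.
trim_offsets = ByteLevel(trim_offsets=True)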
|
||||
|
||||
class RobertaProcessing(PostProcessor):
|
||||
"""RobertaProcessing
|
||||
|
||||
"""
|
||||
This post-processor takes care of adding the special tokens needed by
|
||||
a Roberta model:
|
||||
- a SEP token
|
||||
@@ -57,59 +89,41 @@ class RobertaProcessing(PostProcessor):
|
||||
By default, the ByteLevel BPE might include whitespaces in the produced tokens. If you don't
|
||||
want the offsets to include these whitespaces, then this PostProcessor should be initialized
|
||||
with `trim_offsets=True`
|
||||
Args:
|
||||
sep: Tuple[str, int]:
|
||||
A tuple with the string representation of the SEP token, and its id
|
||||
|
||||
cls: Tuple[str, int]:
|
||||
A tuple with the string representation of the CLS token, and its id
|
||||
|
||||
trim_offsets: bool:
|
||||
Whether to trim the whitespaces from the produced offsets.
|
||||
|
||||
add_prefix_space: bool:
|
||||
Whether the add_prefix_space option was enabled during pre-tokenization. This
|
||||
is relevant because it defines the way the offsets are trimmed out.
|
||||
|
||||
Returns:
|
||||
PostProcessor
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
sep: Tuple[str, int],
|
||||
cls: Tuple[str, int],
|
||||
trim_offsets: bool = True,
|
||||
add_prefix_space: bool = True,
|
||||
) -> None:
|
||||
"""Instantiate a new RobertaProcessing with the given tokens
|
||||
|
||||
Args:
|
||||
sep: Tuple[str, int]:
|
||||
A tuple with the string representation of the SEP token, and its id
|
||||
|
||||
cls: Tuple[str, int]:
|
||||
A tuple with the string representation of the CLS token, and its id
|
||||
|
||||
trim_offsets: bool:
|
||||
Whether to trim the whitespaces from the produced offsets.
|
||||
|
||||
add_prefix_space: bool:
|
||||
Whether the add_prefix_space option was enabled during pre-tokenization. This
|
||||
is relevant because it defines the way the offsets are trimmed out.
|
||||
|
||||
Returns:
|
||||
PostProcessor
|
||||
def __init__(self, sep, cls, trim_offsets=True, add_prefix_space=True):
|
||||
pass
|
||||
def num_special_tokens_to_add(self, is_pair):
|
||||
"""
|
||||
Return the number of special tokens that would be added for single/pair sentences.
|
||||
:param is_pair: Boolean indicating if the input would be a single sentence or a pair
|
||||
:return:
|
||||
"""
|
||||
pass
|
||||
|
||||
class ByteLevel(PostProcessor):
|
||||
"""ByteLevel Post processing
|
||||
|
||||
This post-processor takes care of trimming the offsets.
|
||||
By default, the ByteLevel BPE might include whitespaces in the produced tokens. If you don't
|
||||
want the offsets to include these whitespaces, then this PostProcessor must be used.
|
||||
"""
|
||||
|
||||
def __init__(self, trim_offsets: bool = True) -> None:
|
||||
"""Instantiate a new ByteLevel
|
||||
|
||||
Args:
|
||||
trim_offsets: bool:
|
||||
Whether to trim the whitespaces from the produced offsets.
|
||||
def process(self, encoding, pair=None, add_special_tokens=True):
|
||||
"""
|
||||
Post-process the given encodings, generating the final one
|
||||
"""
|
||||
pass
|
||||
|
||||
Template = Union[str, List[str]]
|
||||
Tokens = List[Union[Tuple[int, str], Tuple[str, int], dict]]
|
||||
|
||||
class TemplateProcessing(PostProcessor):
|
||||
"""TemplateProcessing
|
||||
|
||||
"""
|
||||
Provides a way to specify templates in order to add the special tokens to each
|
||||
input sequence as relevant.
|
||||
|
||||
@@ -147,32 +161,42 @@ class TemplateProcessing(PostProcessor):
|
||||
will be added to the Encoding without any further check. If the given ids correspond
|
||||
to something totally different in a `Tokenizer` using this `PostProcessor`, it
|
||||
might lead to unexpected results.
|
||||
|
||||
Args:
|
||||
single: Template
|
||||
The template used for single sequences
|
||||
|
||||
pair: Template:
|
||||
The template used when both sequences are specified
|
||||
|
||||
special_tokens: Tokens:
|
||||
The list of special tokens used in each sequences
|
||||
|
||||
Template: Union[str, List[str]]:
|
||||
- If a `str` is provided, the whitespace is used as delimiter between tokens
|
||||
- If a `List[str]` is provided, a list of tokens
|
||||
|
||||
Tokens: List[Union[Tuple[int, str], Tuple[str, int], dict]]:
|
||||
- A Tuple with both a token and its associated ID, in any order
|
||||
- A dict with the following keys:
|
||||
- "id": str => The special token id, as specified in the Template
|
||||
- "ids": List[int] => The associated IDs
|
||||
- "tokens": List[str] => The associated tokens
|
||||
The given dict expects the provided `ids` and `tokens` lists to have
|
||||
the same length.
|
||||
"""
|
||||
|
||||
def __init__(self, single: Template, pair: Template, special_tokens: Tokens) -> None:
|
||||
"""Instantiate a new TemplateProcessing
|
||||
|
||||
Args:
|
||||
single: Template
|
||||
The template used for single sequences
|
||||
|
||||
pair: Template:
|
||||
The template used when both sequences are specified
|
||||
|
||||
special_tokens: Tokens:
|
||||
The list of special tokens used in each sequences
|
||||
|
||||
Template: Union[str, List[str]]:
|
||||
- If a `str` is provided, the whitespace is used as delimiter between tokens
|
||||
- If a `List[str]` is provided, a list of tokens
|
||||
|
||||
Tokens: List[Union[Tuple[int, str], Tuple[str, int], dict]]:
|
||||
- A Tuple with both a token and its associated ID, in any order
|
||||
- A dict with the following keys:
|
||||
- "id": str => The special token id, as specified in the Template
|
||||
- "ids": List[int] => The associated IDs
|
||||
- "tokens": List[str] => The associated tokens
|
||||
The given dict expects the provided `ids` and `tokens` lists to have
|
||||
the same length.
|
||||
def __init__(self, single, pair, special_tokens):
|
||||
pass
|
||||
def num_special_tokens_to_add(self, is_pair):
|
||||
"""
|
||||
Return the number of special tokens that would be added for single/pair sentences.
|
||||
:param is_pair: Boolean indicating if the input would be a single sentence or a pair
|
||||
:return:
|
||||
"""
|
||||
pass
|
||||
def process(self, encoding, pair=None, add_special_tokens=True):
|
||||
"""
|
||||
Post-process the given encodings, generating the final one
|
||||
"""
|
||||
pass
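
A hedged sketch of `TemplateProcessing`; the `$A`/`$B` placeholder syntax and the token ids are assumptions for illustration, not stated in this diff:

from tokenizers.processors import TemplateProcessing

post = TemplateProcessing(
    single="[CLS] $A [SEP]",                      # template for single sequences
    pair="[CLS] $A [SEP] $B:1 [SEP]",             # template when a pair is given
    special_tokens=[("[CLS]", 1), ("[SEP]", 2)],  # (token, id) pairs, assumed ids
)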
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
# Generated content DO NOT EDIT
|
||||
from .. import trainers
|
||||
|
||||
Trainer = trainers.Trainer
|
||||
BpeTrainer = trainers.BpeTrainer
|
||||
WordPieceTrainer = trainers.WordPieceTrainer
|
||||
UnigramTrainer = trainers.UnigramTrainer
|
||||
WordPieceTrainer = trainers.WordPieceTrainer
|
||||
|
||||
@@ -1,148 +1,132 @@
|
||||
from .. import AddedToken
|
||||
from typing import Optional, List, Union
|
||||
|
||||
# Generated content DO NOT EDIT
|
||||
class Trainer:
|
||||
"""Base class for all trainers
|
||||
"""
|
||||
Base class for all trainers
|
||||
|
||||
This class is not supposed to be instantiated directly. Instead, any implementation of a
|
||||
Trainer will return an instance of this class when instantiated.
|
||||
|
||||
Args:
|
||||
vocab_size: unsigned int:
|
||||
The size of the final vocabulary, including all tokens and alphabet.
|
||||
|
||||
min_frequency: unsigned int:
|
||||
The minimum frequency a pair should have in order to be merged.
|
||||
|
||||
show_progress: boolean:
|
||||
Whether to show progress bars while training.
|
||||
|
||||
special_tokens: List[Union[str, AddedToken]]:
|
||||
A list of special tokens the model should know of.
|
||||
|
||||
limit_alphabet: unsigned int:
|
||||
The maximum different characters to keep in the alphabet.
|
||||
|
||||
initial_alphabet: List[str]:
|
||||
A list of characters to include in the initial alphabet, even
|
||||
if not seen in the training dataset.
|
||||
If the strings contain more than one character, only the first one
|
||||
is kept.
|
||||
|
||||
continuing_subword_prefix: Optional[str]:
|
||||
A prefix to be used for every subword that is not a beginning-of-word.
|
||||
|
||||
end_of_word_suffix: Optional[str]:
|
||||
A suffix to be used for every subword that is an end-of-word.
|
||||
|
||||
Returns:
|
||||
Trainer
|
||||
"""
|
||||
|
||||
class BpeTrainer(Trainer):
|
||||
"""BpeTrainer
|
||||
def __init__(
|
||||
self,
|
||||
vocab_size=30000,
|
||||
min_frequency=0,
|
||||
show_progress=True,
|
||||
special_tokens=[],
|
||||
limit_alphabet=None,
|
||||
initial_alphabet=[],
|
||||
continuing_subword_prefix=None,
|
||||
end_of_word_suffix=None,
|
||||
):
|
||||
pass
|
||||
|
||||
class BpeTrainer(Trainer):
|
||||
"""
|
||||
Capable of training a BPE model
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
vocab_size: int = 30000,
|
||||
min_frequency: int = 0,
|
||||
show_progress: bool = True,
|
||||
special_tokens: List[Union[str, AddedToken]] = [],
|
||||
limit_alphabet: Optional[int] = None,
|
||||
initial_alphabet: List[str] = [],
|
||||
continuing_subword_prefix: Optional[str] = None,
|
||||
end_of_word_suffix: Optional[str] = None,
|
||||
) -> None:
|
||||
"""Instantiate a new BpeTrainer with the given options:
|
||||
class UnigramTrainer(Trainer):
|
||||
"""
|
||||
Capable of training a Unigram model
|
||||
|
||||
Args:
|
||||
vocab_size: unsigned int:
|
||||
The size of the final vocabulary, including all tokens and alphabet.
|
||||
Args:
|
||||
vocab_size: unsigned int:
|
||||
The size of the final vocabulary, including all tokens and alphabet.
|
||||
|
||||
min_frequency: unsigned int:
|
||||
The minimum frequency a pair should have in order to be merged.
|
||||
show_progress: boolean:
|
||||
Whether to show progress bars while training.
|
||||
|
||||
show_progress: boolean:
|
||||
Whether to show progress bars while training.
|
||||
special_tokens: List[Union[str, AddedToken]]:
|
||||
A list of special tokens the model should know of.
|
||||
|
||||
special_tokens: List[Union[str, AddedToken]]:
|
||||
A list of special tokens the model should know of.
|
||||
initial_alphabet: List[str]:
|
||||
A list of characters to include in the initial alphabet, even
|
||||
if not seen in the training dataset.
|
||||
If the strings contain more than one character, only the first one
|
||||
is kept.
|
||||
|
||||
limit_alphabet: unsigned int:
|
||||
The maximum different characters to keep in the alphabet.
|
||||
Returns:
|
||||
Trainer
|
||||
"""
|
||||
|
||||
initial_alphabet: List[str]:
|
||||
A list of characters to include in the initial alphabet, even
|
||||
if not seen in the training dataset.
|
||||
If the strings contain more than one character, only the first one
|
||||
is kept.
|
||||
|
||||
continuing_subword_prefix: Optional[str]:
|
||||
A prefix to be used for every subword that is not a beginning-of-word.
|
||||
|
||||
end_of_word_suffix: Optional[str]:
|
||||
A suffix to be used for every subword that is an end-of-word.
|
||||
|
||||
Returns:
|
||||
Trainer
|
||||
"""
|
||||
def __init__(self, vocab_size=8000, show_progress=True, special_tokens=[]):
|
||||
pass
|
||||
|
||||
class WordPieceTrainer(Trainer):
|
||||
"""WordPieceTrainer
|
||||
|
||||
"""
|
||||
Capable of training a WordPiece model
|
||||
Args:
|
||||
vocab_size: unsigned int:
|
||||
The size of the final vocabulary, including all tokens and alphabet.
|
||||
|
||||
min_frequency: unsigned int:
|
||||
The minimum frequency a pair should have in order to be merged.
|
||||
|
||||
show_progress: boolean:
|
||||
Whether to show progress bars while training.
|
||||
|
||||
special_tokens: List[Union[str, AddedToken]]:
|
||||
A list of special tokens the model should know of.
|
||||
|
||||
limit_alphabet: unsigned int:
|
||||
The maximum different characters to keep in the alphabet.
|
||||
|
||||
initial_alphabet: List[str]:
|
||||
A list of characters to include in the initial alphabet, even
|
||||
if not seen in the training dataset.
|
||||
If the strings contain more than one character, only the first one
|
||||
is kept.
|
||||
|
||||
continuing_subword_prefix: Optional[str]:
|
||||
A prefix to be used for every subword that is not a beginning-of-word.
|
||||
|
||||
end_of_word_suffix: Optional[str]:
|
||||
A suffix to be used for every subword that is an end-of-word.
|
||||
|
||||
Returns:
|
||||
Trainer
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
vocab_size: int = 30000,
|
||||
min_frequency: int = 0,
|
||||
show_progress: bool = True,
|
||||
special_tokens: List[Union[str, AddedToken]] = [],
|
||||
limit_alphabet: Optional[int] = None,
|
||||
initial_alphabet: List[str] = [],
|
||||
continuing_subword_prefix: Optional[str] = "##",
|
||||
end_of_word_suffix: Optional[str] = None,
|
||||
) -> Trainer:
|
||||
"""Instantiate a new WordPieceTrainer with the given options:
|
||||
|
||||
Args:
|
||||
vocab_size: unsigned int:
|
||||
The size of the final vocabulary, including all tokens and alphabet.
|
||||
|
||||
min_frequency: unsigned int:
|
||||
The minimum frequency a pair should have in order to be merged.
|
||||
|
||||
show_progress: boolean:
|
||||
Whether to show progress bars while training.
|
||||
|
||||
special_tokens: List[Union[str, AddedToken]]:
|
||||
A list of special tokens the model should know of.
|
||||
|
||||
limit_alphabet: unsigned int:
|
||||
The maximum different characters to keep in the alphabet.
|
||||
|
||||
initial_alphabet: List[str]:
|
||||
A list of characters to include in the initial alphabet, even
|
||||
if not seen in the training dataset.
|
||||
If the strings contain more than one character, only the first one
|
||||
is kept.
|
||||
|
||||
continuing_subword_prefix: Optional[str]:
|
||||
A prefix to be used for every subword that is not a beginning-of-word.
|
||||
|
||||
end_of_word_suffix: Optional[str]:
|
||||
A suffix to be used for every subword that is an end-of-word.
|
||||
|
||||
Returns:
|
||||
Trainer
|
||||
"""
|
||||
pass
|
||||
|
||||
class UnigramTrainer(Trainer):
|
||||
"""UnigramTrainer
|
||||
|
||||
Capable of training a Unigram model
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
vocab_size: int = 8000,
|
||||
show_progress: bool = True,
|
||||
special_tokens: List[Union[str, AddedToken]] = [],
|
||||
) -> Trainer:
|
||||
"""Instantiate a new UnigramTrainer with the given options:
|
||||
|
||||
Args:
|
||||
vocab_size: unsigned int:
|
||||
The size of the final vocabulary, including all tokens and alphabet.
|
||||
|
||||
show_progress: boolean:
|
||||
Whether to show progress bars while training.
|
||||
|
||||
special_tokens: List[Union[str, AddedToken]]:
|
||||
A list of special tokens the model should know of.
|
||||
|
||||
initial_alphabet: List[str]:
|
||||
A list of characters to include in the initial alphabet, even
|
||||
if not seen in the training dataset.
|
||||
If the strings contain more than one character, only the first one
|
||||
is kept.
|
||||
|
||||
Returns:
|
||||
Trainer
|
||||
"""
|
||||
vocab_size=30000,
|
||||
min_frequency=0,
|
||||
show_progress=True,
|
||||
special_tokens=[],
|
||||
limit_alphabet=None,
|
||||
initial_alphabet=[],
|
||||
continuing_subword_prefix="##",
|
||||
end_of_word_suffix=None,
|
||||
):
|
||||
pass
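
Finally, a hedged sketch of building a `BpeTrainer` with the options documented above; the special tokens and values are illustrative, and the trainer is then passed to a `Tokenizer`'s training call:

from tokenizers.trainers import BpeTrainer

trainer = BpeTrainer(
    vocab_size=30000,
    min_frequency=2,
    show_progress=True,
    special_tokens=["[UNK]", "[PAD]", "[CLS]", "[SEP]"],
)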
|
||||
|
||||