Move Python source to subdirectory.

This allows testing versions that are not built in place. Otherwise,
importing (or testing) from the package root fails unless a develop
build has been made.
Replace maturin with setuptools_rust, since maturin does not work with
the proper project structure.
Sebastian Pütz
2020-07-25 19:41:18 +02:00
parent e9a2e63a67
commit 0d7c232f95
23 changed files with 5 additions and 4 deletions
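
The commit message mentions switching the build from maturin to setuptools_rust. As a rough orientation only — a hedged sketch, not the commit's actual setup.py — a setuptools_rust-based build for this layout typically looks like the following. The "py_src" directory name is an assumption; the extension name "tokenizers.tokenizers" follows from the `from .tokenizers import ...` lines in the package `__init__` below.

from setuptools import setup
from setuptools_rust import Binding, RustExtension

setup(
    name="tokenizers",
    version="0.8.1",
    # Compile the PyO3 crate and install it as the tokenizers.tokenizers module
    rust_extensions=[RustExtension("tokenizers.tokenizers", binding=Binding.PyO3)],
    package_dir={"": "py_src"},  # assumed name of the new Python source subdirectory
    packages=["tokenizers"],
    zip_safe=False,  # native extensions cannot be imported from a zip
)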

View File

@@ -0,0 +1,29 @@
__version__ = "0.8.1"
from typing import List, Tuple, Union
Offsets = Tuple[int, int]
TextInputSequence = str
PreTokenizedInputSequence = Union[List[str], Tuple[str]]
TextEncodeInput = Union[TextInputSequence, Tuple[TextInputSequence, TextInputSequence]]
PreTokenizedEncodeInput = Union[
PreTokenizedInputSequence, Tuple[PreTokenizedInputSequence, PreTokenizedInputSequence]
]
InputSequence = Union[TextInputSequence, PreTokenizedInputSequence]
EncodeInput = Union[TextEncodeInput, PreTokenizedEncodeInput]
from .tokenizers import Tokenizer, Encoding, AddedToken
from .tokenizers import decoders
from .tokenizers import models
from .tokenizers import normalizers
from .tokenizers import pre_tokenizers
from .tokenizers import processors
from .tokenizers import trainers
from .implementations import (
ByteLevelBPETokenizer,
CharBPETokenizer,
SentencePieceBPETokenizer,
BertWordPieceTokenizer,
)

View File

@@ -0,0 +1,636 @@
from .decoders import *
from .models import *
from .normalizers import *
from .pre_tokenizers import *
from .processors import *
from .trainers import *
from .implementations import (
ByteLevelBPETokenizer as ByteLevelBPETokenizer,
CharBPETokenizer as CharBPETokenizer,
SentencePieceBPETokenizer as SentencePieceBPETokenizer,
BertWordPieceTokenizer as BertWordPieceTokenizer,
)
from typing import Dict, List, Optional, Tuple, Union
Offsets = Tuple[int, int]
TextInputSequence = str
PreTokenizedInputSequence = Union[List[str], Tuple[str]]
TextEncodeInput = Union[TextInputSequence, Tuple[TextInputSequence, TextInputSequence]]
PreTokenizedEncodeInput = Union[
PreTokenizedInputSequence, Tuple[PreTokenizedInputSequence, PreTokenizedInputSequence],
]
InputSequence = Union[TextInputSequence, PreTokenizedInputSequence]
EncodeInput = Union[TextEncodeInput, PreTokenizedEncodeInput]
class Encoding:
""" An Encoding as returned by the Tokenizer """
@staticmethod
def merge(encodings: List[Encoding], growing_offsets: bool = True) -> Encoding:
""" Merge the list of Encoding into one final Encoding
Args:
encodings: List[Encoding]:
The list of encodings
growing_offsets: bool:
Whether the offsets should accumulate while merging
Returns:
The resulting Encoding
"""
pass
@property
def ids(self) -> List[int]:
""" The tokenized ids """
pass
@property
def tokens(self) -> List[str]:
""" The tokenized strings """
pass
@property
def words(self) -> List[Optional[int]]:
""" The tokenized words index """
pass
@property
def type_ids(self) -> List[int]:
""" The type ids """
pass
@property
def offsets(self) -> List[Offsets]:
""" The offsets.
These offsets can be used to index any `IndexableString` directly. If you want to
index the original `str`, make sure to retrieve the converted offsets using the `.offsets`
method on the `original_str`.
"""
pass
@property
def special_tokens_mask(self) -> List[int]:
""" The special tokens mask """
pass
@property
def attention_mask(self) -> List[int]:
""" The attention mask """
pass
@property
def overflowing(self) -> Optional[Encoding]:
""" The overflowing encoding, after truncation """
pass
def word_to_tokens(self, word_index: int) -> Optional[Tuple[int, int]]:
"""
Get the encoded tokens corresponding to the word at the given index in the input
sequence, with the form [start_token, end_token + 1]
Args:
word_index: int:
The index of the word in the input sequence.
Returns:
The range of tokens with the form [start_token, end_token + 1]
"""
pass
def word_to_chars(self, word_index: int) -> Optional[Offsets]:
"""
Get the offsets of the word at the given index in the input sequence.
Args:
word_index: int:
The index of the word in the input sequence.
Returns:
The word offsets
"""
pass
def token_to_chars(self, token_index: int) -> Optional[Offsets]:
"""
Get the offsets of the token at the given index
Args:
token_index: int:
The index of the token in the encoded sequence.
Returns:
The token offsets
"""
pass
def token_to_word(self, token_index: int) -> Optional[int]:
"""
Get the word that contains the token at the given index
Args:
token_index: int:
The index of the token in the encoded sequence.
Returns:
The index of the word in the input sequence.
"""
pass
def char_to_token(self, pos: int) -> Optional[int]:
"""
Get the token that contains the char at the given position
Args:
pos: int:
The position of a char in the input string
Returns:
The index of the token that contains this char
"""
pass
def char_to_word(self, pos: int) -> Optional[int]:
"""
Get the word that contains the given char.
Args:
pos: int:
The position of a char in the input string
Returns:
The index of the word that contains this char
"""
pass
def pad(
self,
length: int,
pad_id: Optional[int] = 0,
pad_type_id: Optional[int] = 0,
pad_token: Optional[str] = "[PAD]",
direction: Optional[str] = "right",
):
""" Pad the current Encoding at the given length
Args:
length: int:
The length at which to pad
direction: (`optional`) str:
Can be one of: `right` or `left`
            pad_id: (`optional`) unsigned int:
                The id to be used when padding
            pad_type_id: (`optional`) unsigned int:
                The type id to be used when padding
pad_token: (`optional`) str:
The pad token to be used when padding
"""
pass
def truncate(self, max_length: int, stride: Optional[int] = 0):
""" Truncate the current Encoding at the given max_length
Args:
max_length: int:
The maximum length to be kept
stride: (`optional`) unsigned int:
The length of the previous first sequence to be included
in the overflowing sequence
"""
pass
class AddedToken:
""" AddedToken represents a token to be added to a Tokenizer
An AddedToken can have special options defining the way it should behave.
"""
def __new__(
cls,
content: str = "",
single_word: bool = False,
lstrip: bool = False,
rstrip: bool = False,
normalized: bool = True,
) -> AddedToken:
""" Instantiate a new AddedToken
Args:
content: str:
The content of the token
single_word: bool
Whether this token should only match against single words. If True,
                this token will never match inside of a word. For example, the token `ing` would
                match on `tokenizing` if this option is False, but not if it is True.
lstrip: bool
Whether this token should strip all potential whitespaces on the left side.
If True, this token will greedily match any whitespace on the left. For example,
if we try to match the token `[MASK]` with lstrip=True, in the text `I saw a [MASK]`
we will match on ` [MASK]`.
rstrip: bool
Whether this token should strip all potential whitespaces on the right side.
If True, this token will greedily match any whitespace on the right. It works just
like lstrip, but on the right.
normalized: bool:
                Whether this token should match the normalized version of the input text. For
                example, with the added token `yesterday` and a normalizer in charge of lowercasing
                the text, the token could be extracted from the input `I saw a lion Yesterday`.
"""
pass
class Tokenizer:
""" Tokenizer
A Tokenizer works as a pipeline, it processes some raw text as input and outputs
an `Encoding`.
The various steps of the pipeline are:
1. The `Normalizer`: in charge of normalizing the text. Common examples of
normalization are the unicode normalization standards, such as NFD or NFKC.
2. The `PreTokenizer`: in charge of creating initial words splits in the text.
The most common way of splitting text is simply on whitespace.
3. The `Model`: in charge of doing the actual tokenization. An example of a
`Model` would be `BPE` or `WordPiece`.
4. The `PostProcessor`: in charge of post-processing the `Encoding` to add anything
relevant that, for example, a language model would need, such as special tokens.
"""
def __new__(cls, model: models.Model) -> Tokenizer:
""" Instantiate a new Tokenizer using the given Model
Args:
model: models.Model:
The model to be used with this Tokenizer
Returns:
Tokenizer
"""
pass
@staticmethod
def from_str(s: str) -> Tokenizer:
""" Instantiate a new Tokenizer from the given JSON string
Args:
s: str:
A JSON string representation of the Tokenizer
Returns:
Tokenizer
"""
pass
@staticmethod
def from_file(path: str) -> Tokenizer:
""" Instantiate a new Tokenizer from the given file
Args:
path: str:
Path to a file containing a Tokenizer
Returns:
Tokenizer
"""
pass
@staticmethod
def from_buffer(buffer: bytes) -> Tokenizer:
""" Instantiate a new Tokenizer from the given buffer
Args:
buffer: bytes:
A buffer used to instantiate a new Tokenizer
Returns:
Tokenizer
"""
pass
def to_str(self, pretty: bool = False) -> str:
""" Get a serialized JSON version of the Tokenizer as a str
Args:
pretty: bool:
Whether the JSON string should be prettified
Returns:
str
"""
pass
def save(self, path: str, pretty: bool = False):
""" Save the Tokenizer as JSON to the given path
Args:
pretty: bool:
Whether the JSON string should be prettified
"""
pass
@property
def model(self) -> Model:
""" Get the model in use with this Tokenizer """
pass
@model.setter
def model(self, model: models.Model):
""" Change the model to use with this Tokenizer """
pass
@property
def pre_tokenizer(self) -> Optional[PreTokenizer]:
""" Get the pre-tokenizer in use with this model """
pass
@pre_tokenizer.setter
def pre_tokenizer(self, pre_tokenizer: pre_tokenizers.PreTokenizer):
""" Change the pre tokenizer to use with this Tokenizer """
pass
@property
def decoder(self) -> Optional[Decoder]:
""" Get the decoder in use with this model """
pass
@decoder.setter
def decoder(self, decoder: decoders.Decoder):
""" Change the decoder to use with this Tokenizer """
pass
@property
def post_processor(self) -> Optional[PostProcessor]:
""" Get the post-processor in use with this Tokenizer """
pass
@post_processor.setter
def post_processor(self, processor: processors.PostProcessor):
""" Change the post processor to use with this Tokenizer """
@property
def normalizer(self) -> Optional[Normalizer]:
""" Get the normalizer in use with this Tokenizer """
pass
@normalizer.setter
def normalizer(self, normalizer: normalizers.Normalizer):
""" Change the normalizer to use with this Tokenizer """
def num_special_tokens_to_add(self, is_pair: bool) -> int:
"""
Return the number of special tokens that would be added for single/pair sentences.
:param is_pair: Boolean indicating if the input would be a single sentence or a pair
:return:
"""
pass
def get_vocab(self, with_added_tokens: bool = True) -> Dict[str, int]:
""" Returns the vocabulary
Args:
with_added_tokens: boolean:
Whether to include the added tokens in the vocabulary
Returns:
The vocabulary
"""
pass
def get_vocab_size(self, with_added_tokens: bool = True) -> int:
""" Returns the size of the vocabulary
Args:
with_added_tokens: boolean:
Whether to include the added tokens in the vocabulary's size
Returns:
The size of the vocabulary
"""
pass
    def enable_truncation(self, max_length: int, stride: Optional[int] = 0, strategy: Optional[str] = "longest_first"):
""" Enable the truncation
Args:
max_length: unsigned int:
The maximum length at which to truncate
stride: (`optional`) unsigned int:
The length of the previous first sequence to be included
in the overflowing sequence
            strategy: (`optional`) str:
Can be one of `longest_first`, `only_first` or `only_second`
"""
pass
def no_truncation(self):
""" Disable truncation """
pass
@property
def truncation(self) -> Optional[dict]:
""" Get the current truncation parameters
Returns:
None if truncation is disabled, a dict with the current truncation parameters if
truncation is enabled
"""
pass
def enable_padding(
self,
direction: Optional[str] = "right",
pad_to_multiple_of: Optional[int] = None,
pad_id: Optional[int] = 0,
pad_type_id: Optional[int] = 0,
pad_token: Optional[str] = "[PAD]",
length: Optional[int] = None,
):
""" Enable the padding
Args:
direction: (`optional`) str:
Can be one of: `right` or `left`
pad_to_multiple_of: (`optional`) unsigned int:
If specified, the padding length should always snap to the next multiple of
the given value. For example if we were going to pad with a length of 250 but
`pad_to_multiple_of=8` then we will pad to 256.
            pad_id: (`optional`) unsigned int:
                The id to be used when padding
            pad_type_id: (`optional`) unsigned int:
                The type id to be used when padding
pad_token: (`optional`) str:
The pad token to be used when padding
length: (`optional`) unsigned int:
If specified, the length at which to pad. If not specified
we pad using the size of the longest sequence in a batch
"""
pass
def no_padding(self):
""" Disable padding """
pass
@property
def padding(self) -> Optional[dict]:
""" Get the current padding parameters
Returns:
None if padding is disabled, a dict with the currently set parameters
if the padding is enabled.
"""
pass
def normalize(self, sequence: str) -> str:
""" Normalize the given sequence
Args:
sequence: str:
The sequence to normalize
Returns:
The normalized string
"""
pass
def encode(
self,
sequence: InputSequence,
        pair: Optional[InputSequence] = None,
is_pretokenized: bool = False,
add_special_tokens: bool = True,
) -> Encoding:
""" Encode the given sequence and pair. This method can process raw text sequences as well
as already pre-tokenized sequences.
Args:
sequence: InputSequence:
The sequence we want to encode. This sequence can be either raw text or
pre-tokenized, according to the `is_pretokenized` argument:
- If `is_pretokenized=False`: `InputSequence` is expected to be `str`
- If `is_pretokenized=True`: `InputSequence` is expected to be
`Union[List[str], Tuple[str]]`
is_pretokenized: bool:
Whether the input is already pre-tokenized
add_special_tokens: bool:
Whether to add the special tokens while encoding.
Returns:
An Encoding
"""
pass
def encode_batch(
self,
inputs: List[EncodeInput],
is_pretokenized: bool = False,
add_special_tokens: bool = True,
) -> List[Encoding]:
""" Encode the given inputs. This method accept both raw text sequences as well as already
pre-tokenized sequences.
Args:
inputs: List[EncodeInput]:
A list of single sequences or pair sequences to encode. Each `EncodeInput` is
expected to be of the following form:
`Union[InputSequence, Tuple[InputSequence, InputSequence]]`
Each `InputSequence` can either be raw text or pre-tokenized,
according to the `is_pretokenized` argument:
- If `is_pretokenized=False`: `InputSequence` is expected to be `str`
- If `is_pretokenized=True`: `InputSequence` is expected to be
`Union[List[str], Tuple[str]]`
is_pretokenized: bool:
Whether the input is already pre-tokenized.
add_special_tokens: bool:
Whether to add the special tokens while encoding.
Returns:
A list of Encoding
"""
pass
def decode(self, ids: List[int], skip_special_tokens: Optional[bool] = True) -> str:
""" Decode the given list of ids to a string sequence
Args:
ids: List[unsigned int]:
A list of ids to be decoded
skip_special_tokens: (`optional`) boolean:
Whether to remove all the special tokens from the output string
Returns:
The decoded string
"""
pass
def decode_batch(
self, sequences: List[List[int]], skip_special_tokens: Optional[bool] = True
    ) -> List[str]:
""" Decode the list of sequences to a list of string sequences
Args:
sequences: List[List[unsigned int]]:
A list of sequence of ids to be decoded
skip_special_tokens: (`optional`) boolean:
Whether to remove all the special tokens from the output strings
Returns:
A list of decoded strings
"""
pass
def token_to_id(self, token: str) -> Optional[int]:
""" Convert the given token to its corresponding id
Args:
token: str:
The token to convert
Returns:
The corresponding id if it exists, None otherwise
"""
pass
def id_to_token(self, id: int) -> Optional[str]:
""" Convert the given token id to its corresponding string
Args:
            id: int:
The token id to convert
Returns:
The corresponding string if it exists, None otherwise
"""
pass
def add_tokens(self, tokens: List[Union[str, AddedToken]]) -> int:
""" Add the given tokens to the vocabulary
Args:
tokens: List[Union[str, AddedToken]]:
A list of tokens to add to the vocabulary. Each token can either be
a string, or an instance of AddedToken
Returns:
The number of tokens that were added to the vocabulary
"""
pass
def add_special_tokens(self, tokens: List[Union[str, AddedToken]]) -> int:
""" Add the given special tokens to the vocabulary, and treat them as special tokens.
The special tokens will never be processed by the model, and will be
removed while decoding.
Args:
tokens: List[Union[str, AddedToken]]:
The list of special tokens to add. Each token can either be a string
or an instance of AddedToken
Returns:
The number of tokens that were added to the vocabulary
"""
pass
def post_process(
self, encoding: Encoding, pair: Optional[Encoding] = None, add_special_tokens: bool = True,
) -> Encoding:
""" Apply all the post-processing steps to the given encodings.
The various steps are:
1. Truncate according to global params (provided to `enable_truncation`)
2. Apply the PostProcessor
3. Pad according to global params. (provided to `enable_padding`)
Args:
encoding: Encoding:
The main Encoding to post process
pair: Optional[Encoding]:
An optional pair Encoding
add_special_tokens: bool:
Whether to add special tokens
Returns:
The resulting Encoding
"""
pass
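
To make the Tokenizer API above concrete, here is a minimal usage sketch based purely on the signatures in this stub; the vocab/merges paths are placeholders, not files from this commit.

from tokenizers import Tokenizer
from tokenizers.models import BPE

# Build a tokenizer around a BPE model loaded from placeholder files
tokenizer = Tokenizer(BPE("vocab.json", "merges.txt"))
tokenizer.enable_truncation(max_length=128, stride=0, strategy="longest_first")
tokenizer.enable_padding(length=128, pad_id=0, pad_token="[PAD]")

encoding = tokenizer.encode("Hello, y'all!", "How are you?")
print(encoding.tokens, encoding.ids, encoding.attention_mask)

# encode_batch accepts single sequences as well as (sequence, pair) tuples
batch = tokenizer.encode_batch(["Hello there", ("How are you?", "Fine, thanks")])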

View File

@@ -0,0 +1,7 @@
from .. import decoders
Decoder = decoders.Decoder
ByteLevel = decoders.ByteLevel
WordPiece = decoders.WordPiece
Metaspace = decoders.Metaspace
BPEDecoder = decoders.BPEDecoder

View File

@@ -0,0 +1,65 @@
from typing import List
class Decoder:
""" Base class for all decoders
This class is not supposed to be instantiated directly. Instead, any implementation of
a Decoder will return an instance of this class when instantiated.
"""
def decode(self, tokens: List[str]) -> str:
""" Decode the given list of string to a final string """
pass
class ByteLevel(Decoder):
""" ByteLevel Decoder """
def __init__(self) -> None:
""" Instantiate a new ByteLevel Decoder """
pass
class WordPiece(Decoder):
""" WordPiece Decoder """
    def __init__(self, prefix: str = "##", cleanup: bool = True) -> None:
""" Instantiate a new WordPiece Decoder
Args:
prefix: str:
The prefix to use for subwords that are not a beginning-of-word
cleanup: bool:
                Whether to clean up some tokenization artifacts, mainly spaces before punctuation
                and some abbreviated English forms.
"""
pass
class Metaspace(Decoder):
""" Metaspace decoder """
    def __init__(self, replacement: str = "▁", add_prefix_space: bool = True) -> None:
""" Instantiate a new Metaspace
Args:
replacement: str:
The replacement character. Must be exactly one character. By default we
use the `▁` (U+2581) meta symbol (Same as in SentencePiece).
add_prefix_space: boolean:
Whether to add a space to the first word if there isn't already one. This
lets us treat `hello` exactly like `say hello`.
"""
pass
class BPEDecoder(Decoder):
""" BPEDecoder """
def __init__(self, suffix: str = "</w>") -> None:
""" Instantiate a new BPEDecoder
Args:
suffix: str:
                The suffix that was used to characterize an end-of-word. This suffix will
                be replaced by whitespace during decoding
"""
pass
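
A quick sketch of how these decoders are meant to be used, assuming a list of WordPiece-style tokens; the exact output string is illustrative.

from tokenizers import decoders

decoder = decoders.WordPiece(prefix="##", cleanup=True)
# Joins the tokens and removes the continuing-subword prefix,
# e.g. ["un", "##believ", "##able"] should come back as "unbelievable".
print(decoder.decode(["un", "##believ", "##able"]))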

View File

@@ -0,0 +1,5 @@
from .base_tokenizer import BaseTokenizer
from .byte_level_bpe import ByteLevelBPETokenizer
from .char_level_bpe import CharBPETokenizer
from .sentencepiece_bpe import SentencePieceBPETokenizer
from .bert_wordpiece import BertWordPieceTokenizer

View File

@@ -0,0 +1,369 @@
from tokenizers import Tokenizer, Encoding, AddedToken, InputSequence, EncodeInput
from tokenizers.models import TokenizedSequence, TokenizedSequenceWithOffsets
from typing import List, Union, Tuple, Optional, Dict
Offsets = Tuple[int, int]
class BaseTokenizer:
def __init__(self, tokenizer: Tokenizer, parameters=None):
self._tokenizer = tokenizer
self._parameters = parameters if parameters is not None else {}
def __repr__(self):
return "Tokenizer(vocabulary_size={}, {})".format(
self._tokenizer.get_vocab_size(),
", ".join(k + "=" + str(v) for k, v in self._parameters.items()),
)
def num_special_tokens_to_add(self, is_pair: bool) -> int:
"""
Return the number of special tokens that would be added for single/pair sentences.
:param is_pair: Boolean indicating if the input would be a single sentence or a pair
:return:
"""
return self._tokenizer.num_special_tokens_to_add(is_pair)
def get_vocab(self, with_added_tokens: bool = True) -> Dict[str, int]:
""" Returns the vocabulary
Args:
with_added_tokens: boolean:
Whether to include the added tokens in the vocabulary
Returns:
The vocabulary
"""
return self._tokenizer.get_vocab(with_added_tokens=with_added_tokens)
def get_vocab_size(self, with_added_tokens: bool = True) -> int:
""" Return the size of vocabulary, with or without added tokens.
Args:
with_added_tokens: (`optional`) bool:
Whether to count in added special tokens or not
Returns:
Size of vocabulary
"""
return self._tokenizer.get_vocab_size(with_added_tokens=with_added_tokens)
def enable_padding(
self,
direction: Optional[str] = "right",
pad_to_multiple_of: Optional[int] = None,
pad_id: Optional[int] = 0,
pad_type_id: Optional[int] = 0,
pad_token: Optional[str] = "[PAD]",
length: Optional[int] = None,
):
""" Change the padding strategy
Args:
direction: (`optional`) str:
Can be one of: `right` or `left`
pad_to_multiple_of: (`optional`) unsigned int:
If specified, the padding length should always snap to the next multiple of
the given value. For example if we were going to pad with a length of 250 but
`pad_to_multiple_of=8` then we will pad to 256.
            pad_id: (`optional`) unsigned int:
                The id to be used when padding
            pad_type_id: (`optional`) unsigned int:
                The type id to be used when padding
pad_token: (`optional`) str:
The pad token to be used when padding
length: (`optional`) unsigned int:
If specified, the length at which to pad. If not specified
we pad using the size of the longest sequence in a batch
"""
return self._tokenizer.enable_padding(
direction=direction,
pad_to_multiple_of=pad_to_multiple_of,
pad_id=pad_id,
pad_type_id=pad_type_id,
pad_token=pad_token,
length=length,
)
def no_padding(self):
""" Disable padding """
return self._tokenizer.no_padding()
@property
def padding(self) -> Optional[dict]:
""" Get the current padding parameters
Returns:
None if padding is disabled, a dict with the currently set parameters
if the padding is enabled.
"""
return self._tokenizer.padding
def enable_truncation(
self, max_length: int, stride: Optional[int] = 0, strategy: Optional[str] = "longest_first"
):
""" Change the truncation options
Args:
max_length: unsigned int:
The maximum length at which to truncate
stride: (`optional`) unsigned int:
The length of the previous first sequence to be included
in the overflowing sequence
            strategy: (`optional`) str:
Can be one of `longest_first`, `only_first` or `only_second`
"""
return self._tokenizer.enable_truncation(max_length, stride=stride, strategy=strategy)
def no_truncation(self):
""" Disable truncation """
return self._tokenizer.no_truncation()
@property
def truncation(self) -> Optional[dict]:
""" Get the current truncation parameters
Returns:
None if truncation is disabled, a dict with the current truncation parameters if
truncation is enabled
"""
return self._tokenizer.truncation
def add_tokens(self, tokens: List[Union[str, AddedToken]]) -> int:
""" Add the given tokens to the vocabulary
Args:
tokens: List[Union[str, AddedToken]]:
A list of tokens to add to the vocabulary. Each token can either be
a string, or an instance of AddedToken
Returns:
The number of tokens that were added to the vocabulary
"""
return self._tokenizer.add_tokens(tokens)
def add_special_tokens(self, special_tokens: List[Union[str, AddedToken]]) -> int:
""" Add the given special tokens to the vocabulary, and treat them as special tokens.
The special tokens will never be processed by the model, and will be
removed while decoding.
Args:
            special_tokens: List[Union[str, AddedToken]]:
A list of special tokens to add to the vocabulary. Each token can either be
a string, or an instance of AddedToken
Returns:
The number of tokens that were added to the vocabulary
"""
return self._tokenizer.add_special_tokens(special_tokens)
def normalize(self, sequence: str) -> str:
""" Normalize the given sequence
Args:
sequence: str:
The sequence to normalize
Returns:
The normalized string
"""
return self._tokenizer.normalize(sequence)
def encode(
self,
sequence: InputSequence,
pair: Optional[InputSequence] = None,
is_pretokenized: bool = False,
add_special_tokens: bool = True,
) -> Encoding:
""" Encode the given sequence and pair. This method can process raw text sequences as well
as already pre-tokenized sequences.
Args:
sequence: InputSequence:
The sequence we want to encode. This sequence can be either raw text or
pre-tokenized, according to the `is_pretokenized` argument:
- If `is_pretokenized=False`: `InputSequence` is expected to be `str`
- If `is_pretokenized=True`: `InputSequence` is expected to be
`Union[List[str], Tuple[str]]`
is_pretokenized: bool:
Whether the input is already pre-tokenized.
add_special_tokens: bool:
Whether to add the special tokens while encoding.
Returns:
An Encoding
"""
if sequence is None:
raise ValueError("encode: `sequence` can't be `None`")
return self._tokenizer.encode(sequence, pair, is_pretokenized, add_special_tokens)
def encode_batch(
self,
inputs: List[EncodeInput],
is_pretokenized: bool = False,
add_special_tokens: bool = True,
) -> List[Encoding]:
""" Encode the given inputs. This method accept both raw text sequences as well as already
pre-tokenized sequences.
Args:
inputs: List[EncodeInput]:
A list of single sequences or pair sequences to encode. Each `EncodeInput` is
expected to be of the following form:
`Union[InputSequence, Tuple[InputSequence, InputSequence]]`
Each `InputSequence` can either be raw text or pre-tokenized,
according to the `is_pretokenized` argument:
- If `is_pretokenized=False`: `InputSequence` is expected to be `str`
- If `is_pretokenized=True`: `InputSequence` is expected to be
`Union[List[str], Tuple[str]]`
is_pretokenized: bool:
Whether the input is already pre-tokenized.
add_special_tokens: bool:
Whether to add the special tokens while encoding.
Returns:
A list of Encoding
"""
if inputs is None:
raise ValueError("encode_batch: `inputs` can't be `None`")
return self._tokenizer.encode_batch(inputs, is_pretokenized, add_special_tokens)
def decode(self, ids: List[int], skip_special_tokens: Optional[bool] = True) -> str:
""" Decode the given list of ids to a string sequence
Args:
ids: List[unsigned int]:
A list of ids to be decoded
skip_special_tokens: (`optional`) boolean:
Whether to remove all the special tokens from the output string
Returns:
The decoded string
"""
if ids is None:
raise ValueError("None input is not valid. Should be a list of integers.")
return self._tokenizer.decode(ids, skip_special_tokens=skip_special_tokens)
def decode_batch(
self, sequences: List[List[int]], skip_special_tokens: Optional[bool] = True
    ) -> List[str]:
""" Decode the list of sequences to a list of string sequences
Args:
sequences: List[List[unsigned int]]:
A list of sequence of ids to be decoded
skip_special_tokens: (`optional`) boolean:
Whether to remove all the special tokens from the output strings
Returns:
A list of decoded strings
"""
if sequences is None:
raise ValueError("None input is not valid. Should be list of list of integers.")
return self._tokenizer.decode_batch(sequences, skip_special_tokens=skip_special_tokens)
def token_to_id(self, token: str) -> Optional[int]:
""" Convert the given token to its corresponding id
Args:
token: str:
The token to convert
Returns:
The corresponding id if it exists, None otherwise
"""
return self._tokenizer.token_to_id(token)
def id_to_token(self, id: int) -> Optional[str]:
""" Convert the given token id to its corresponding string
Args:
            id: int:
The token id to convert
Returns:
The corresponding string if it exists, None otherwise
"""
return self._tokenizer.id_to_token(id)
def save_model(self, directory: str, name: Optional[str] = None):
""" Save the current model to the given directory
Args:
directory: str:
A path to the destination directory
name: (Optional) str:
The name of the tokenizer, to be used in the saved files
"""
return self._tokenizer.model.save(directory, name=name)
def save(self, path: str, pretty: bool = False):
""" Save the current Tokenizer at the given path
Args:
path: str:
A path to the destination Tokenizer file
"""
return self._tokenizer.save(path, pretty)
def to_str(self, pretty: bool = False):
""" Get a serialized JSON version of the Tokenizer as a str
Args:
pretty: bool:
Whether the JSON string should be prettified
Returns:
str
"""
return self._tokenizer.to_str(pretty)
def post_process(
self, encoding: Encoding, pair: Optional[Encoding] = None, add_special_tokens: bool = True
) -> Encoding:
""" Apply all the post-processing steps to the given encodings.
The various steps are:
1. Truncate according to global params (provided to `enable_truncation`)
2. Apply the PostProcessor
3. Pad according to global params. (provided to `enable_padding`)
Args:
encoding: Encoding:
The main Encoding to post process
pair: Optional[Encoding]:
An optional pair Encoding
add_special_tokens: bool:
Whether to add special tokens
Returns:
The resulting Encoding
"""
return self._tokenizer.post_process(encoding, pair, add_special_tokens)

View File

@@ -0,0 +1,113 @@
from tokenizers import Tokenizer, AddedToken, decoders, trainers
from tokenizers.models import WordPiece
from tokenizers.normalizers import BertNormalizer
from tokenizers.pre_tokenizers import BertPreTokenizer
from tokenizers.processors import BertProcessing
from .base_tokenizer import BaseTokenizer
from typing import Optional, List, Union
class BertWordPieceTokenizer(BaseTokenizer):
""" Bert WordPiece Tokenizer """
def __init__(
self,
vocab_file: Optional[str] = None,
unk_token: Union[str, AddedToken] = "[UNK]",
sep_token: Union[str, AddedToken] = "[SEP]",
cls_token: Union[str, AddedToken] = "[CLS]",
pad_token: Union[str, AddedToken] = "[PAD]",
mask_token: Union[str, AddedToken] = "[MASK]",
clean_text: bool = True,
handle_chinese_chars: bool = True,
strip_accents: Optional[bool] = None,
lowercase: bool = True,
wordpieces_prefix: str = "##",
):
if vocab_file is not None:
tokenizer = Tokenizer(WordPiece(vocab_file, unk_token=str(unk_token)))
else:
tokenizer = Tokenizer(WordPiece(unk_token=str(unk_token)))
# Let the tokenizer know about special tokens if they are part of the vocab
if tokenizer.token_to_id(str(unk_token)) is not None:
tokenizer.add_special_tokens([str(unk_token)])
if tokenizer.token_to_id(str(sep_token)) is not None:
tokenizer.add_special_tokens([str(sep_token)])
if tokenizer.token_to_id(str(cls_token)) is not None:
tokenizer.add_special_tokens([str(cls_token)])
if tokenizer.token_to_id(str(pad_token)) is not None:
tokenizer.add_special_tokens([str(pad_token)])
if tokenizer.token_to_id(str(mask_token)) is not None:
tokenizer.add_special_tokens([str(mask_token)])
tokenizer.normalizer = BertNormalizer(
clean_text=clean_text,
handle_chinese_chars=handle_chinese_chars,
strip_accents=strip_accents,
lowercase=lowercase,
)
tokenizer.pre_tokenizer = BertPreTokenizer()
if vocab_file is not None:
sep_token_id = tokenizer.token_to_id(str(sep_token))
if sep_token_id is None:
raise TypeError("sep_token not found in the vocabulary")
cls_token_id = tokenizer.token_to_id(str(cls_token))
if cls_token_id is None:
raise TypeError("cls_token not found in the vocabulary")
tokenizer.post_processor = BertProcessing(
(str(sep_token), sep_token_id), (str(cls_token), cls_token_id)
)
tokenizer.decoder = decoders.WordPiece(prefix=wordpieces_prefix)
parameters = {
"model": "BertWordPiece",
"unk_token": unk_token,
"sep_token": sep_token,
"cls_token": cls_token,
"pad_token": pad_token,
"mask_token": mask_token,
"clean_text": clean_text,
"handle_chinese_chars": handle_chinese_chars,
"strip_accents": strip_accents,
"lowercase": lowercase,
"wordpieces_prefix": wordpieces_prefix,
}
super().__init__(tokenizer, parameters)
def train(
self,
files: Union[str, List[str]],
vocab_size: int = 30000,
min_frequency: int = 2,
limit_alphabet: int = 1000,
initial_alphabet: List[str] = [],
special_tokens: List[Union[str, AddedToken]] = [
"[PAD]",
"[UNK]",
"[CLS]",
"[SEP]",
"[MASK]",
],
show_progress: bool = True,
wordpieces_prefix: str = "##",
):
""" Train the model using the given files """
trainer = trainers.WordPieceTrainer(
vocab_size=vocab_size,
min_frequency=min_frequency,
limit_alphabet=limit_alphabet,
initial_alphabet=initial_alphabet,
special_tokens=special_tokens,
show_progress=show_progress,
continuing_subword_prefix=wordpieces_prefix,
)
if isinstance(files, str):
files = [files]
self._tokenizer.train(trainer, files)
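
A hedged usage sketch for the class above; "vocab.txt", "corpus.txt" and "out_dir" are placeholder paths, not files from this repository.

from tokenizers import BertWordPieceTokenizer

# Either load an existing WordPiece vocabulary...
tokenizer = BertWordPieceTokenizer("vocab.txt", lowercase=True)
print(tokenizer.encode("Tokenize me, please.").tokens)

# ...or train one from raw text files with the train() method above.
trained = BertWordPieceTokenizer()
trained.train(files=["corpus.txt"], vocab_size=30000)
trained.save_model("out_dir")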

View File

@@ -0,0 +1,92 @@
from tokenizers import Tokenizer, AddedToken, pre_tokenizers, decoders, trainers, processors
from tokenizers.models import BPE
from tokenizers.normalizers import unicode_normalizer_from_str, Lowercase, Sequence
from .base_tokenizer import BaseTokenizer
from typing import Optional, List, Union
class ByteLevelBPETokenizer(BaseTokenizer):
""" ByteLevelBPETokenizer
Represents a Byte-level BPE as introduced by OpenAI with their GPT-2 model
"""
def __init__(
self,
vocab_file: Optional[str] = None,
merges_file: Optional[str] = None,
add_prefix_space: bool = False,
lowercase: bool = False,
dropout: Optional[float] = None,
unicode_normalizer: Optional[str] = None,
continuing_subword_prefix: Optional[str] = None,
end_of_word_suffix: Optional[str] = None,
trim_offsets: bool = False,
):
if vocab_file is not None and merges_file is not None:
tokenizer = Tokenizer(
BPE(
vocab_file,
merges_file,
dropout=dropout,
continuing_subword_prefix=continuing_subword_prefix or "",
end_of_word_suffix=end_of_word_suffix or "",
)
)
else:
tokenizer = Tokenizer(BPE())
# Check for Unicode normalization first (before everything else)
normalizers = []
if unicode_normalizer:
normalizers += [unicode_normalizer_from_str(unicode_normalizer)]
if lowercase:
normalizers += [Lowercase()]
# Create the normalizer structure
if len(normalizers) > 0:
if len(normalizers) > 1:
tokenizer.normalizer = Sequence(normalizers)
else:
tokenizer.normalizer = normalizers[0]
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=add_prefix_space)
tokenizer.decoder = decoders.ByteLevel()
tokenizer.post_processor = processors.ByteLevel(trim_offsets=trim_offsets)
parameters = {
"model": "ByteLevelBPE",
"add_prefix_space": add_prefix_space,
"lowercase": lowercase,
"dropout": dropout,
"unicode_normalizer": unicode_normalizer,
"continuing_subword_prefix": continuing_subword_prefix,
"end_of_word_suffix": end_of_word_suffix,
"trim_offsets": trim_offsets,
}
super().__init__(tokenizer, parameters)
def train(
self,
files: Union[str, List[str]],
vocab_size: int = 30000,
min_frequency: int = 2,
show_progress: bool = True,
special_tokens: List[Union[str, AddedToken]] = [],
):
""" Train the model using the given files """
trainer = trainers.BpeTrainer(
vocab_size=vocab_size,
min_frequency=min_frequency,
show_progress=show_progress,
special_tokens=special_tokens,
initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
)
if isinstance(files, str):
files = [files]
self._tokenizer.train(trainer, files)
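
Likewise for the byte-level tokenizer, a short hedged sketch with placeholder file names.

from tokenizers import ByteLevelBPETokenizer

tokenizer = ByteLevelBPETokenizer()
tokenizer.train(files=["corpus.txt"], vocab_size=30000, special_tokens=["<s>", "</s>"])
encoding = tokenizer.encode("Byte-level BPE handles any input string.")
print(encoding.tokens)
print(tokenizer.decode(encoding.ids))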

View File

@@ -0,0 +1,116 @@
from .. import Tokenizer, AddedToken, pre_tokenizers, decoders, trainers
from ..models import BPE
from ..normalizers import Sequence, Lowercase, unicode_normalizer_from_str, BertNormalizer
from .base_tokenizer import BaseTokenizer
from typing import Optional, List, Union
class CharBPETokenizer(BaseTokenizer):
""" Original BPE Tokenizer
Represents the BPE algorithm, as introduced by Rico Sennrich
(https://arxiv.org/abs/1508.07909)
    The default settings correspond to the OpenAI GPT BPE tokenizer and differ from the original
    Sennrich subword-nmt implementation by the following options, which you can deactivate:
      - adding a normalizer to clean up the text (deactivate with `bert_normalizer=False`) by:
        * removing any control characters and replacing all whitespaces by the classic one.
        * handling Chinese chars by putting spaces around them.
        * stripping all accents.
      - splitting on punctuation in addition to whitespaces (deactivate it with
`split_on_whitespace_only=True`)
"""
def __init__(
self,
vocab_file: Optional[str] = None,
merges_file: Optional[str] = None,
unk_token: Union[str, AddedToken] = "<unk>",
suffix: str = "</w>",
dropout: Optional[float] = None,
lowercase: bool = False,
unicode_normalizer: Optional[str] = None,
bert_normalizer: bool = True,
split_on_whitespace_only: bool = False,
):
if vocab_file is not None and merges_file is not None:
tokenizer = Tokenizer(
BPE(
vocab_file,
merges_file,
dropout=dropout,
unk_token=str(unk_token),
end_of_word_suffix=suffix,
)
)
else:
tokenizer = Tokenizer(BPE())
if tokenizer.token_to_id(str(unk_token)) is not None:
tokenizer.add_special_tokens([str(unk_token)])
# Check for Unicode normalization first (before everything else)
normalizers = []
if unicode_normalizer:
normalizers += [unicode_normalizer_from_str(unicode_normalizer)]
if bert_normalizer:
normalizers += [BertNormalizer(lowercase=False)]
if lowercase:
normalizers += [Lowercase()]
# Create the normalizer structure
if len(normalizers) > 0:
if len(normalizers) > 1:
tokenizer.normalizer = Sequence(normalizers)
else:
tokenizer.normalizer = normalizers[0]
if split_on_whitespace_only:
tokenizer.pre_tokenizer = pre_tokenizers.WhitespaceSplit()
else:
tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()
tokenizer.decoder = decoders.BPEDecoder(suffix=suffix)
parameters = {
"model": "BPE",
"unk_token": unk_token,
"suffix": suffix,
"dropout": dropout,
"lowercase": lowercase,
"unicode_normalizer": unicode_normalizer,
"bert_normalizer": bert_normalizer,
"split_on_whitespace_only": split_on_whitespace_only,
}
super().__init__(tokenizer, parameters)
def train(
self,
files: Union[str, List[str]],
vocab_size: int = 30000,
min_frequency: int = 2,
special_tokens: List[Union[str, AddedToken]] = ["<unk>"],
limit_alphabet: int = 1000,
initial_alphabet: List[str] = [],
suffix: Optional[str] = "</w>",
show_progress: bool = True,
):
""" Train the model using the given files """
trainer = trainers.BpeTrainer(
vocab_size=vocab_size,
min_frequency=min_frequency,
special_tokens=special_tokens,
limit_alphabet=limit_alphabet,
initial_alphabet=initial_alphabet,
end_of_word_suffix=suffix,
show_progress=show_progress,
)
if isinstance(files, str):
files = [files]
self._tokenizer.train(trainer, files)
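
And the same pattern for the character-level BPE tokenizer; again the paths are placeholders.

from tokenizers import CharBPETokenizer

tokenizer = CharBPETokenizer(lowercase=True)
tokenizer.train(files=["corpus.txt"], vocab_size=30000, special_tokens=["<unk>"])
print(tokenizer.encode("A classic BPE with an end-of-word suffix.").tokens)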

View File

@@ -0,0 +1,74 @@
from tokenizers import Tokenizer, AddedToken, pre_tokenizers, decoders, trainers
from tokenizers.models import BPE
from tokenizers.normalizers import NFKC
from .base_tokenizer import BaseTokenizer
from typing import Optional, List, Union
class SentencePieceBPETokenizer(BaseTokenizer):
""" SentencePiece BPE Tokenizer
Represents the BPE algorithm, with the pretokenization used by SentencePiece
"""
def __init__(
self,
vocab_file: Optional[str] = None,
merges_file: Optional[str] = None,
unk_token: Union[str, AddedToken] = "<unk>",
        replacement: str = "▁",
add_prefix_space: bool = True,
dropout: Optional[float] = None,
):
if vocab_file is not None and merges_file is not None:
tokenizer = Tokenizer(
BPE(vocab_file, merges_file, dropout=dropout, unk_token=unk_token)
)
else:
tokenizer = Tokenizer(BPE())
if tokenizer.token_to_id(str(unk_token)) is not None:
tokenizer.add_special_tokens([str(unk_token)])
tokenizer.normalizer = NFKC()
tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(
replacement=replacement, add_prefix_space=add_prefix_space
)
tokenizer.decoder = decoders.Metaspace(
replacement=replacement, add_prefix_space=add_prefix_space
)
parameters = {
"model": "SentencePieceBPE",
"unk_token": unk_token,
"replacement": replacement,
"add_prefix_space": add_prefix_space,
"dropout": dropout,
}
super().__init__(tokenizer, parameters)
def train(
self,
files: Union[str, List[str]],
vocab_size: int = 30000,
min_frequency: int = 2,
special_tokens: List[Union[str, AddedToken]] = ["<unk>"],
limit_alphabet: int = 1000,
initial_alphabet: List[str] = [],
show_progress: bool = True,
):
""" Train the model using the given files """
trainer = trainers.BpeTrainer(
vocab_size=vocab_size,
min_frequency=min_frequency,
special_tokens=special_tokens,
limit_alphabet=limit_alphabet,
initial_alphabet=initial_alphabet,
show_progress=show_progress,
)
if isinstance(files, str):
files = [files]
self._tokenizer.train(trainer, files)

View File

@@ -0,0 +1,11 @@
from typing import List, Tuple
from .. import models, Offsets
TokenizedSequence = List[str]
TokenizedSequenceWithOffsets = List[Tuple[str, Offsets]]
Model = models.Model
BPE = models.BPE
WordPiece = models.WordPiece
WordLevel = models.WordLevel

View File

@@ -0,0 +1,156 @@
from .. import Encoding, Offsets
from typing import List, Optional, Union, Tuple
TokenizedSequence = List[str]
TokenizedSequenceWithOffsets = List[Tuple[str, Offsets]]
class Model:
""" Base class for all models
This class is not supposed to be instantiated directly. Instead, any implementation of
    a Model will return an instance of this class when instantiated.
"""
def save(self, folder: str, name: Optional[str] = None) -> List[str]:
""" Save the current model
Save the current model in the given folder, using the given name for the various
files that will get created.
Any file with the same name that already exist in this folder will be overwritten.
"""
pass
def encode(
self, sequence: Union[TokenizedSequence, TokenizedSequenceWithOffsets], type_id: int = 0
) -> Encoding:
""" Encode the given sequence.
A sequence can either be:
- `TokenizedSequence`: (`List[str]`)
        - `TokenizedSequenceWithOffsets`: (`List[Tuple[str, Offsets]]`) where Offsets is
          a Tuple[int, int].
        If the Offsets are not provided, they will be automatically generated, assuming that
        all the tokens in the `TokenizedSequence` are contiguous in the original string.
Args:
sequence: Union[TokenizedSequence, TokenizedSequenceWithOffsets]
Either a TokenizedSequence or a TokenizedSequenceWithOffsets
type_id: int:
The type id of the given sequence
Returns:
An Encoding
"""
pass
def encode_batch(
self,
sequences: Union[List[TokenizedSequence], List[TokenizedSequenceWithOffsets]],
type_id: int = 0,
) -> List[Encoding]:
""" Encode the given batch of sequence.
A sequence can either be:
- `TokenizedSequence`: (`List[str]`)
        - `TokenizedSequenceWithOffsets`: (`List[Tuple[str, Offsets]]`) where Offsets is
          a Tuple[int, int].
        If the Offsets are not provided, they will be automatically generated, assuming that
        all the tokens in the `TokenizedSequence` are contiguous in the original string.
Args:
sequences: Union[List[TokenizedSequence], List[TokenizedSequenceWithOffsets]]
                A list of sequences. Each sequence is either a TokenizedSequence or a
TokenizedSequenceWithOffsets
type_id: int:
                The type id of the given sequence
Returns:
A list of Encoding
"""
pass
class BPE(Model):
"""BytePairEncoding model class
Instantiate a BPE Model from the given vocab and merges files.
Args:
        vocab: (`optional`) string:
Path to a vocabulary JSON file.
merges: (`optional`) string:
Path to a merge file.
cache_capacity: (`optional`) int:
            The number of words that the BPE cache can contain. The cache speeds up
            the process by keeping the results of the merge operations for a number
            of words.
        dropout: (`optional`) float [0, 1]:
            The BPE dropout to use. Must be a float between 0 and 1.
unk_token: (`optional`) str:
The unknown token to be used by the model.
continuing_subword_prefix: (`optional`) str:
The prefix to attach to subword units that don't represent a beginning of word.
end_of_word_suffix: (`optional`) str:
The suffix to attach to subword units that represent an end of word.
"""
    def __init__(
self,
vocab: Optional[str],
merges: Optional[str],
cache_capacity: Optional[int],
dropout: Optional[float],
unk_token: Optional[str],
continuing_subword_prefix: Optional[str],
end_of_word_suffix: Optional[str],
):
pass
class WordPiece(Model):
""" WordPiece model class
Instantiate a WordPiece Model from the given vocab file.
Args:
vocab: (`optional`) string:
Path to a vocabulary file.
unk_token: (`optional`) str:
The unknown token to be used by the model.
max_input_chars_per_word: (`optional`) int:
            The maximum number of characters to allow in a single word.
"""
def __init__(
self,
vocab: Optional[str],
unk_token: Optional[str],
max_input_chars_per_word: Optional[int],
):
pass
class WordLevel(Model):
"""
    The simplest tokenizer model, mapping tokens from a vocab file to their corresponding ids.
Instantiate a WordLevel Model from the given vocab file.
Args:
vocab: (`optional`) string:
Path to a vocabulary file.
unk_token: str:
The unknown token to be used by the model.
"""
def __init__(self, vocab: Optional[str], unk_token: Optional[str]):
pass

View File

@@ -0,0 +1,25 @@
from .. import normalizers
Normalizer = normalizers.Normalizer
BertNormalizer = normalizers.BertNormalizer
NFD = normalizers.NFD
NFKD = normalizers.NFKD
NFC = normalizers.NFC
NFKC = normalizers.NFKC
Sequence = normalizers.Sequence
Lowercase = normalizers.Lowercase
Strip = normalizers.Strip
NORMALIZERS = {"nfc": NFC, "nfd": NFD, "nfkc": NFKC, "nfkd": NFKD}
def unicode_normalizer_from_str(normalizer: str) -> Normalizer:
if normalizer not in NORMALIZERS:
raise ValueError(
"{} is not a known unicode normalizer. Available are {}".format(
normalizer, NORMALIZERS.keys()
)
)
return NORMALIZERS[normalizer]()
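
For example, based on the helper above, `unicode_normalizer_from_str("nfkc")` returns an `NFKC()` instance, while an unknown name raises a ValueError:

from tokenizers.normalizers import unicode_normalizer_from_str

normalizer = unicode_normalizer_from_str("nfkc")  # returns an NFKC() instance
# unicode_normalizer_from_str("latin-1") would raise ValueError (unknown normalizer)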

View File

@@ -0,0 +1,108 @@
from typing import Optional, List
class Normalizer:
""" Base class for all normalizers
This class is not supposed to be instantiated directly. Instead, any implementation of a
Normalizer will return an instance of this class when instantiated.
"""
class BertNormalizer(Normalizer):
""" BertNormalizer
Takes care of normalizing raw text before giving it to a Bert model.
    This includes cleaning the text, handling accents and Chinese chars, and lowercasing
"""
def __init__(
self,
clean_text: Optional[bool] = True,
handle_chinese_chars: Optional[bool] = True,
strip_accents: Optional[bool] = None,
lowercase: Optional[bool] = True,
) -> None:
""" Instantiate a BertNormalizer with the given options.
Args:
clean_text: (`optional`) boolean:
Whether to clean the text, by removing any control characters
and replacing all whitespaces by the classic one.
handle_chinese_chars: (`optional`) boolean:
Whether to handle chinese chars by putting spaces around them.
strip_accents: (`optional`) boolean:
                Whether to strip all accents. If this option is not specified (i.e. is None),
then it will be determined by the value for `lowercase` (as in the original Bert).
lowercase: (`optional`) boolean:
Whether to lowercase.
Returns:
Normalizer
"""
pass
class NFD(Normalizer):
""" NFD Unicode Normalizer """
def __init__(self) -> None:
""" Instantiate a new NFD Normalizer """
pass
class NFKD(Normalizer):
""" NFKD Unicode Normalizer """
def __init__(self) -> None:
""" Instantiate a new NFKD Normalizer """
pass
class NFC(Normalizer):
""" NFC Unicode Normalizer """
def __init__(self) -> None:
""" Instantiate a new NFC Normalizer """
pass
class NFKC(Normalizer):
""" NFKC Unicode Normalizer """
def __init__(self) -> None:
""" Instantiate a new NFKC Normalizer """
pass
class Sequence(Normalizer):
""" Allows concatenating multiple other Normalizer as a Sequence.
All the normalizers run in sequence in the given order
"""
def __init__(self, normalizers: List[Normalizer]) -> None:
""" Instantiate a new normalization Sequence using the given normalizers
Args:
normalizers: List[Normalizer]:
A list of Normalizer to be run as a sequence
"""
pass
class Lowercase(Normalizer):
""" Lowercase Normalizer """
def __init__(self) -> None:
""" Instantiate a new Lowercase Normalizer """
pass
class Strip(Normalizer):
""" Strip normalizer """
    def __init__(self, left: bool = True, right: bool = True) -> None:
pass
def unicode_normalizer_from_str(normalizer: str) -> Normalizer:
"""
    Instantiate a unicode normalizer from the normalizer name
:param normalizer: Name of the normalizer
:return:
"""
pass
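
A sketch of composing normalizers with Sequence, per the classes above; the pipeline decomposes, lowercases, and strips surrounding whitespace.

from tokenizers.normalizers import NFD, Lowercase, Strip, Sequence

normalizer = Sequence([NFD(), Lowercase(), Strip()])
# Attach it to a tokenizer with: tokenizer.normalizer = normalizer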

View File

@@ -0,0 +1,9 @@
from .. import pre_tokenizers
PreTokenizer = pre_tokenizers.PreTokenizer
ByteLevel = pre_tokenizers.ByteLevel
Whitespace = pre_tokenizers.Whitespace
WhitespaceSplit = pre_tokenizers.WhitespaceSplit
BertPreTokenizer = pre_tokenizers.BertPreTokenizer
Metaspace = pre_tokenizers.Metaspace
CharDelimiterSplit = pre_tokenizers.CharDelimiterSplit

View File

@@ -0,0 +1,109 @@
from typing import Optional, List, Tuple
Offsets = Tuple[int, int]
class PreTokenizer:
""" Base class for all pre-tokenizers
This class is not supposed to be instantiated directly. Instead, any implementation of a
PreTokenizer will return an instance of this class when instantiated.
"""
def pre_tokenize(self, sequence: str) -> List[Tuple[str, Offsets]]:
""" Pre tokenize the given sequence """
pass
class ByteLevel(PreTokenizer):
""" ByteLevel PreTokenizer
This pre-tokenizer takes care of replacing all bytes of the given string
with a corresponding representation, as well as splitting into words.
"""
def __init__(self, add_prefix_space: bool = True) -> None:
""" Instantiate a new ByteLevel PreTokenizer
Args:
add_prefix_space: (`optional`) boolean:
Whether to add a space to the first word if there isn't already one. This
lets us treat `hello` exactly like `say hello`.
Returns:
PreTokenizer
"""
pass
@staticmethod
def alphabet() -> List[str]:
""" Returns the alphabet used by this PreTokenizer.
Since the ByteLevel works as its name suggests, at the byte level, it
encodes any byte to one visible character. This means that there is a
total of 256 different characters composing this alphabet.
"""
pass
class Whitespace(PreTokenizer):
""" Whitespace PreTokenizer
This pre-tokenizer simply splits using the following regex: `\w+|[^\w\s]+`
"""
def __init__(self) -> None:
""" Instantiate a new Whitespace PreTokenizer """
pass
class WhitespaceSplit(PreTokenizer):
""" Whitespace PreTokenizer
    This pre-tokenizer simply splits on whitespace. Works like `.split()`
"""
def __init__(self) -> None:
""" Instantiate a new WhitespaceSplit PreTokenizer """
pass
class BertPreTokenizer(PreTokenizer):
""" BertPreTokenizer
This pre-tokenizer splits tokens on spaces, and also on punctuation.
    Each occurrence of a punctuation character will be treated separately.
"""
def __init__(self) -> None:
""" Instantiate a new BertPreTokenizer """
pass
class Metaspace(PreTokenizer):
""" Metaspace pre-tokenizer
This pre-tokenizer replaces any whitespace by the provided replacement character.
It then tries to split on these spaces.
"""
    def __init__(self, replacement: str = "▁", add_prefix_space: bool = True) -> None:
""" Instantiate a new Metaspace
Args:
replacement: str:
The replacement character. Must be exactly one character. By default we
use the `▁` (U+2581) meta symbol (Same as in SentencePiece).
add_prefix_space: boolean:
Whether to add a space to the first word if there isn't already one. This
lets us treat `hello` exactly like `say hello`.
"""
pass
class CharDelimiterSplit(PreTokenizer):
""" CharDelimiterSplit PreTokenizer
This pre-tokenizer simply splits on the provided char. Works like `.split(delimiter)`
"""
    def __init__(self, delimiter: str) -> None:
""" Instantiate a new CharDelimiterSplit PreTokenizer
Args:
delimiter: str:
The delimiter char that will be used to split input
"""
pass
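
Per the `pre_tokenize` signature above, pre-tokenizers return (token, offsets) pairs; a small sketch, with the expected output described rather than guaranteed:

from tokenizers.pre_tokenizers import Whitespace

pre_tokenizer = Whitespace()
# Expected to yield pairs such as ("Hello", (0, 5)) and ("world", (6, 11))
print(pre_tokenizer.pre_tokenize("Hello world"))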

View File

@@ -0,0 +1,6 @@
from .. import processors
PostProcessor = processors.PostProcessor
BertProcessing = processors.BertProcessing
RobertaProcessing = processors.RobertaProcessing
ByteLevel = processors.ByteLevel

View File

@@ -0,0 +1,99 @@
from typing import Tuple
class PostProcessor:
""" Base class for all post-processors
This class is not supposed to be instantiated directly. Instead, any implementation of
a PostProcessor will return an instance of this class when instantiated.
"""
def num_special_tokens_to_add(self, is_pair: bool) -> int:
"""
Return the number of special tokens that would be added for single/pair sentences.
:param is_pair: Boolean indicating if the input would be a single sentence or a pair
:return:
"""
pass
class BertProcessing(PostProcessor):
""" BertProcessing
This post-processor takes care of adding the special tokens needed by
a Bert model:
- a SEP token
- a CLS token
"""
def __init__(self, sep: Tuple[str, int], cls: Tuple[str, int]) -> None:
""" Instantiate a new BertProcessing with the given tokens
Args:
sep: Tuple[str, int]:
A tuple with the string representation of the SEP token, and its id
cls: Tuple[str, int]:
A tuple with the string representation of the CLS token, and its id
Returns:
PostProcessor
"""
pass
class RobertaProcessing(PostProcessor):
""" RobertaProcessing
This post-processor takes care of adding the special tokens needed by
a Roberta model:
- a SEP token
- a CLS token
It also takes care of trimming the offsets.
By default, the ByteLevel BPE might include whitespaces in the produced tokens. If you don't
want the offsets to include these whitespaces, then this PostProcessor should be initialized
with `trim_offsets=True`
"""
def __init__(
self,
sep: Tuple[str, int],
cls: Tuple[str, int],
trim_offsets: bool = True,
add_prefix_space: bool = True,
) -> None:
""" Instantiate a new RobertaProcessing with the given tokens
Args:
sep: Tuple[str, int]:
A tuple with the string representation of the SEP token, and its id
cls: Tuple[str, int]:
A tuple with the string representation of the CLS token, and its id
trim_offsets: bool:
Whether to trim the whitespaces from the produced offsets.
add_prefix_space: bool:
Whether the add_prefix_space option was enabled during pre-tokenization. This
is relevant because it defines the way the offsets are trimmed out.
Returns:
PostProcessor
"""
pass
class ByteLevel(PostProcessor):
""" ByteLevel Post processing
This post-processor takes care of trimming the offsets.
By default, the ByteLevel BPE might include whitespaces in the produced tokens. If you don't
want the offsets to include these whitespaces, then this PostProcessor must be used.
"""
    def __init__(self, trim_offsets: bool = True) -> None:
""" Instantiate a new ByteLevel
Args:
trim_offsets: bool:
Whether to trim the whitespaces from the produced offsets.
"""
pass

View File

@@ -0,0 +1,5 @@
from .. import trainers
Trainer = trainers.Trainer
BpeTrainer = trainers.BpeTrainer
WordPieceTrainer = trainers.WordPieceTrainer

View File

@@ -0,0 +1,113 @@
from .. import AddedToken
from typing import Optional, List, Union
class Trainer:
""" Base class for all trainers
This class is not supposed to be instantiated directly. Instead, any implementation of a
Trainer will return an instance of this class when instantiated.
"""
class BpeTrainer(Trainer):
""" BpeTrainer
Capable of training a BPE model
"""
def __init__(
self,
vocab_size: int = 30000,
min_frequency: int = 0,
show_progress: bool = True,
special_tokens: List[Union[str, AddedToken]] = [],
limit_alphabet: Optional[int] = None,
initial_alphabet: List[str] = [],
continuing_subword_prefix: Optional[str] = None,
end_of_word_suffix: Optional[str] = None,
) -> None:
""" Instantiate a new BpeTrainer with the given options:
Args:
vocab_size: unsigned int:
The size of the final vocabulary, including all tokens and alphabet.
min_frequency: unsigned int:
The minimum frequency a pair should have in order to be merged.
show_progress: boolean:
Whether to show progress bars while training.
special_tokens: List[Union[str, AddedToken]]:
A list of special tokens the model should know of.
limit_alphabet: unsigned int:
The maximum different characters to keep in the alphabet.
initial_alphabet: List[str]:
A list of characters to include in the initial alphabet, even
if not seen in the training dataset.
                If a string contains more than one character, only the first one
is kept.
continuing_subword_prefix: Optional[str]:
A prefix to be used for every subword that is not a beginning-of-word.
end_of_word_suffix: Optional[str]:
                A suffix to be used for every subword that is an end-of-word.
Returns:
Trainer
"""
pass
class WordPieceTrainer(Trainer):
""" WordPieceTrainer
Capable of training a WordPiece model
"""
def __init__(
self,
vocab_size: int = 30000,
min_frequency: int = 0,
show_progress: bool = True,
special_tokens: List[Union[str, AddedToken]] = [],
limit_alphabet: Optional[int] = None,
initial_alphabet: List[str] = [],
continuing_subword_prefix: Optional[str] = "##",
end_of_word_suffix: Optional[str] = None,
    ) -> None:
""" Instantiate a new WordPieceTrainer with the given options:
Args:
vocab_size: unsigned int:
The size of the final vocabulary, including all tokens and alphabet.
min_frequency: unsigned int:
The minimum frequency a pair should have in order to be merged.
show_progress: boolean:
Whether to show progress bars while training.
special_tokens: List[Union[str, AddedToken]]:
A list of special tokens the model should know of.
limit_alphabet: unsigned int:
The maximum different characters to keep in the alphabet.
initial_alphabet: List[str]:
A list of characters to include in the initial alphabet, even
if not seen in the training dataset.
                If a string contains more than one character, only the first one
is kept.
continuing_subword_prefix: Optional[str]:
A prefix to be used for every subword that is not a beginning-of-word.
end_of_word_suffix: Optional[str]:
                A suffix to be used for every subword that is an end-of-word.
Returns:
Trainer
"""
pass
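
Tying it together, training a bare Tokenizer with one of these trainers follows the `tokenizer.train(trainer, files)` pattern used by the implementation classes above; the file path is a placeholder.

from tokenizers import Tokenizer, trainers
from tokenizers.models import BPE

tokenizer = Tokenizer(BPE())
trainer = trainers.BpeTrainer(vocab_size=30000, min_frequency=2, special_tokens=["<unk>"])
tokenizer.train(trainer, ["corpus.txt"])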