mirror of
https://github.com/mii443/tokenizers.git
synced 2025-08-22 16:25:30 +00:00
Doc - API Reference for most Tokenizer methods/attributes
This commit is contained in:
@@ -422,9 +422,37 @@ class Encoding:
pass

class AddedToken:
"""AddedToken represents a token to be added to a Tokenizer
"""AddedToken

An AddedToken can have special options defining the way it should behave.
Represents a token that can be added to a :class:`~tokenizers.Tokenizer`.
It can have special options that define the way it should behave.

Args:
content (:obj:`str`): The content of the token

single_word (:obj:`bool`, defaults to :obj:`False`):
Defines whether this token should only match single words. If :obj:`True`, this
token will never match inside of a word. For example the token ``ing`` would match
on ``tokenizing`` if this option is :obj:`False`, but not if it is :obj:`True`.
The notion of "`inside of a word`" is defined by the word boundaries pattern in
regular expressions (i.e. the token should start and end with word boundaries).

lstrip (:obj:`bool`, defaults to :obj:`False`):
Defines whether this token should strip all potential whitespaces on its left side.
If :obj:`True`, this token will greedily match any whitespace on its left. For
example if we try to match the token ``[MASK]`` with ``lstrip=True``, in the text
``"I saw a [MASK]"``, we would match on ``" [MASK]"``. (Note the space on the left).

rstrip (:obj:`bool`, defaults to :obj:`False`):
Defines whether this token should strip all potential whitespaces on its right
side. If :obj:`True`, this token will greedily match any whitespace on its right.
It works just like :obj:`lstrip` but on the right.

normalized (:obj:`bool`, defaults to :obj:`True` with :meth:`~tokenizers.Tokenizer.add_tokens` and :obj:`False` with :meth:`~tokenizers.Tokenizer.add_special_tokens`):
Defines whether this token should match against the normalized version of the input
text. For example, with the added token ``"yesterday"``, and a normalizer in charge of
lowercasing the text, the token could be extracted from the input ``"I saw a lion
Yesterday"``.
"""

def __new__(
@@ -438,55 +466,54 @@ class AddedToken:
"""Instantiate a new AddedToken

Args:
content: str:
The content of the token
content (:obj:`str`): The content of the token

single_word: bool
Whether this token should only match against single words. If True,
this token will never match inside of a word. For example the token `ing` would
match on `tokenizing` if this option is False, but not if this option is True.
single_word (:obj:`bool`, defaults to :obj:`False`):
Defines whether this token should only match single words. If :obj:`True`, this
token will never match inside of a word. For example the token ``ing`` would match
on ``tokenizing`` if this option is :obj:`False`, but not if it is :obj:`True`.
The notion of "`inside of a word`" is defined by the word boundaries pattern in
regular expressions (i.e. the token should start and end with word boundaries).

lstrip: bool
Whether this token should strip all potential whitespaces on the left side.
If True, this token will greedily match any whitespace on the left. For example,
if we try to match the token `[MASK]` with lstrip=True, in the text `I saw a [MASK]`
we will match on ` [MASK]`.
lstrip (:obj:`bool`, defaults to :obj:`False`):
Defines whether this token should strip all potential whitespaces on its left side.
If :obj:`True`, this token will greedily match any whitespace on its left. For
example if we try to match the token ``[MASK]`` with ``lstrip=True``, in the text
``"I saw a [MASK]"``, we would match on ``" [MASK]"``. (Note the space on the left).

rstrip: bool
Whether this token should strip all potential whitespaces on the right side.
If True, this token will greedily match any whitespace on the right. It works just
like lstrip, but on the right.
rstrip (:obj:`bool`, defaults to :obj:`False`):
Defines whether this token should strip all potential whitespaces on its right
side. If :obj:`True`, this token will greedily match any whitespace on its right.
It works just like :obj:`lstrip` but on the right.

normalized: bool:
Whether this token should match the normalized version of the input text. For
example, with the added token `yesterday` and a normalizer in charge of lowercasing
the text, the token could be extracted from the input `I saw a lion Yesterday`.
normalized (:obj:`bool`, defaults to :obj:`True` with :meth:`~tokenizers.Tokenizer.add_tokens` and :obj:`False` with :meth:`~tokenizers.Tokenizer.add_special_tokens`):
Defines whether this token should match against the normalized version of the input
text. For example, with the added token ``"yesterday"``, and a normalizer in charge of
lowercasing the text, the token could be extracted from the input ``"I saw a lion
Yesterday"``.
"""
pass

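Illustrative usage (not part of the diff): a minimal sketch of the `AddedToken` options documented above, assuming only what these docstrings describe.

```python
from tokenizers import AddedToken

# A token that swallows whitespace on its left and only matches whole words;
# with lstrip=True, matching "[MASK]" in "I saw a [MASK]" also captures the
# leading space, as described in the docstring above.
mask = AddedToken("[MASK]", single_word=True, lstrip=True, rstrip=False)

# A regular added token matched against the normalized (e.g. lowercased) text.
yesterday = AddedToken("yesterday", normalized=True)
```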
class Tokenizer:
"""Tokenizer

A Tokenizer works as a pipeline, it processes some raw text as input and outputs
an `Encoding`.
A :obj:`Tokenizer` works as a pipeline. It processes some raw text as input
and outputs an :class:`~tokenizers.Encoding`.

The various steps of the pipeline are:
1. The `Normalizer`: in charge of normalizing the text. Common examples of
normalization are the unicode normalization standards, such as NFD or NFKC.
2. The `PreTokenizer`: in charge of creating initial word splits in the text.
The most common way of splitting text is simply on whitespace.
3. The `Model`: in charge of doing the actual tokenization. An example of a
`Model` would be `BPE` or `WordPiece`.
4. The `PostProcessor`: in charge of post-processing the `Encoding` to add anything
relevant that, for example, a language model would need, such as special tokens.
Args:
model (:class:`~tokenizers.models.Model`):
The core algorithm that this :obj:`Tokenizer` should be using.
"""

def __new__(cls, model: models.Model) -> Tokenizer:
"""Instantiate a new Tokenizer using the given Model

A :obj:`Tokenizer` works as a pipeline. It processes some raw text as input
and outputs an :class:`~tokenizers.Encoding`.

Args:
model: models.Model:
The model to be used with this Tokenizer
model (:class:`~tokenizers.models.Model`):
The core algorithm that this :obj:`Tokenizer` should be using.

Returns:
Tokenizer
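As a hedged sketch of the pipeline described above: only `Tokenizer(model)` is documented in this hunk; the `models.BPE`, `normalizers.NFKC` and `pre_tokenizers.Whitespace` constructors are assumptions.

```python
from tokenizers import Tokenizer, models, normalizers, pre_tokenizers

# The Model is the only mandatory piece; the other pipeline steps are optional.
tokenizer = Tokenizer(models.BPE())                     # assumed empty-BPE constructor
tokenizer.normalizer = normalizers.NFKC()               # 1. normalization
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()   # 2. initial word splits
# 3. the model performs the actual tokenization;
# 4. a PostProcessor could be assigned the same way to add special tokens.
```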
@@ -494,57 +521,62 @@ class Tokenizer:
pass
@staticmethod
def from_str(s: str) -> Tokenizer:
"""Instantiate a new Tokenizer from the given JSON string
"""Instantiate a new :class:`~tokenizers.Tokenizer` from the given JSON string.

Args:
s: str:
A JSON string representation of the Tokenizer
json (:obj:`str`):
A valid JSON string representing a previously serialized
:class:`~tokenizers.Tokenizer`

Returns:
Tokenizer
:class:`~tokenizers.Tokenizer`: The new tokenizer
"""
pass
@staticmethod
def from_file(path: str) -> Tokenizer:
"""Instantiate a new Tokenizer from the given file
"""Instantiate a new :class:`~tokenizers.Tokenizer` from the file at the given path.

Args:
path: str:
Path to a file containing a Tokenizer
path (:obj:`str`):
A path to a local JSON file representing a previously serialized
:class:`~tokenizers.Tokenizer`

Returns:
Tokenizer
:class:`~tokenizers.Tokenizer`: The new tokenizer
"""
pass
@staticmethod
def from_buffer(buffer: bytes) -> Tokenizer:
"""Instantiate a new Tokenizer from the given buffer
"""Instantiate a new :class:`~tokenizers.Tokenizer` from the given buffer.

Args:
buffer: bytes:
A buffer used to instantiate a new Tokenizer
buffer (:obj:`bytes`):
A buffer containing a previously serialized :class:`~tokenizers.Tokenizer`

Returns:
Tokenizer
:class:`~tokenizers.Tokenizer`: The new tokenizer
"""
pass
def to_str(self, pretty: bool = False) -> str:
"""Get a serialized JSON version of the Tokenizer as a str
"""Gets a serialized string representing this :class:`~tokenizers.Tokenizer`.

Args:
pretty: bool:
Whether the JSON string should be prettified
pretty (:obj:`bool`, defaults to :obj:`False`):
Whether the JSON string should be pretty formatted.

Returns:
str
:obj:`str`: A string representing the serialized Tokenizer
"""
pass
def save(self, path: str, pretty: bool = False):
"""Save the Tokenizer as JSON to the given path
"""Save the :class:`~tokenizers.Tokenizer` to the file at the given path.

Args:
pretty: bool:
Whether the JSON string should be prettified
path (:obj:`str`):
A path to a file in which to save the serialized tokenizer.

pretty (:obj:`bool`, defaults to :obj:`False`):
Whether the JSON file should be pretty formatted.
"""
pass
@property
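The constructors and serialization helpers above compose into a simple round trip; a sketch, where the file name `tok.json` and the empty `models.BPE()` constructor are only illustrative assumptions:

```python
from tokenizers import Tokenizer, models

tokenizer = Tokenizer(models.BPE())              # assumed empty-BPE constructor
tokenizer.save("tok.json", pretty=True)          # serialize to a JSON file
reloaded = Tokenizer.from_file("tok.json")       # reload from that file

as_json = tokenizer.to_str()                     # or keep the JSON in memory
from_string = Tokenizer.from_str(as_json)
from_bytes = Tokenizer.from_buffer(as_json.encode("utf-8"))
```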
@@ -593,40 +625,41 @@ class Tokenizer:
"""
pass
def get_vocab(self, with_added_tokens: bool = True) -> Dict[str, int]:
"""Returns the vocabulary
"""Get the underlying vocabulary

Args:
with_added_tokens: boolean:
Whether to include the added tokens in the vocabulary
with_added_tokens (:obj:`bool`, defaults to :obj:`True`):
Whether to include the added tokens

Returns:
The vocabulary
:obj:`Dict[str, int]`: The vocabulary
"""
pass
def get_vocab_size(self, with_added_tokens: bool = True) -> int:
"""Returns the size of the vocabulary
"""Get the size of the underlying vocabulary

Args:
with_added_tokens: boolean:
Whether to include the added tokens in the vocabulary's size
with_added_tokens (:obj:`bool`, defaults to :obj:`True`):
Whether to include the added tokens

Returns:
The size of the vocabulary
:obj:`int`: The size of the vocabulary
"""
pass
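A small sketch of the two vocabulary accessors on a toy tokenizer; the empty `models.BPE()` constructor and the exact ids shown in comments are assumptions, not part of this diff.

```python
from tokenizers import Tokenizer, models

tokenizer = Tokenizer(models.BPE())               # assumed constructor; empty vocab
tokenizer.add_tokens(["hello", "world"])          # added tokens get fresh ids

print(tokenizer.get_vocab_size())                          # includes added tokens -> 2
print(tokenizer.get_vocab_size(with_added_tokens=False))   # model vocabulary only -> 0
print(tokenizer.get_vocab())                               # e.g. {"hello": 0, "world": 1}
```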
def enable_truncation(self, max_length: int, stride: Optional[int], strategy: Optional[str]):
"""Enable the truncation
"""Enable truncation

Args:
max_length: unsigned int:
The maximum length at which to truncate
max_length (:obj:`int`):
The max length at which to truncate

stride: (`optional`) unsigned int:
The length of the previous first sequence to be included
in the overflowing sequence
stride (:obj:`int`, `optional`):
The length of the previous first sequence to be included in the overflowing
sequence

strategy: (`optional`) str:
Can be one of `longest_first`, `only_first` or `only_second`
strategy (:obj:`str`, `optional`, defaults to :obj:`longest_first`):
The strategy used for truncation. Can be one of ``longest_first``, ``only_first`` or
``only_second``.
"""
pass
def no_truncation(self):
@@ -634,11 +667,13 @@ class Tokenizer:
pass
@property
def truncation(self) -> Optional[dict]:
"""Get the current truncation parameters
"""Get the currently set truncation parameters

`Cannot set, use` :meth:`~tokenizers.Tokenizer.enable_truncation` `instead`

Returns:
None if truncation is disabled, a dict with the current truncation parameters if
truncation is enabled
(:obj:`dict`, `optional`):
A dict with the current truncation parameters if truncation is enabled
"""
pass
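The truncation controls documented in the two hunks above work together; a sketch (the toy `models.BPE()` constructor is an assumption):

```python
from tokenizers import Tokenizer, models

tokenizer = Tokenizer(models.BPE())       # assumed constructor

tokenizer.enable_truncation(max_length=512, stride=32, strategy="longest_first")
print(tokenizer.truncation)               # read-only dict of the current parameters

tokenizer.no_truncation()                 # disable truncation again
print(tokenizer.truncation)               # None once truncation is disabled
```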
def enable_padding(
@@ -653,26 +688,26 @@ class Tokenizer:
"""Enable the padding

Args:
direction: (`optional`) str:
Can be one of: `right` or `left`
direction (:obj:`str`, `optional`, defaults to :obj:`right`):
The direction in which to pad. Can be either ``right`` or ``left``

pad_to_multiple_of: (`optional`) unsigned int:
If specified, the padding length should always snap to the next multiple of
the given value. For example if we were going to pad with a length of 250 but
`pad_to_multiple_of=8` then we will pad to 256.
pad_to_multiple_of (:obj:`int`, `optional`):
If specified, the padding length should always snap to the next multiple of the
given value. For example if we were going to pad with a length of 250 but
``pad_to_multiple_of=8`` then we will pad to 256.

pad_id: (`optional`) unsigned int:
The index to be used when padding
pad_id (:obj:`int`, defaults to 0):
The id to be used when padding

pad_type_id: (`optional`) unsigned int:
The type index to be used when padding
pad_type_id (:obj:`int`, defaults to 0):
The type id to be used when padding

pad_token: (`optional`) str:
pad_token (:obj:`str`, defaults to :obj:`[PAD]`):
The pad token to be used when padding

length: (`optional`) unsigned int:
If specified, the length at which to pad. If not specified
we pad using the size of the longest sequence in a batch
length (:obj:`int`, `optional`):
If specified, the length at which to pad. If not specified we pad using the size of
the longest sequence in a batch.
"""
pass
def no_padding(self):
@@ -682,9 +717,11 @@ class Tokenizer:
def padding(self) -> Optional[dict]:
"""Get the current padding parameters

`Cannot be set, use` :meth:`~tokenizers.Tokenizer.enable_padding` `instead`

Returns:
None if padding is disabled, a dict with the currently set parameters
if the padding is enabled.
(:obj:`dict`, `optional`):
A dict with the current padding parameters if padding is enabled
"""
pass
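And the matching padding controls, again sketched on an assumed toy setup:

```python
from tokenizers import Tokenizer, models

tokenizer = Tokenizer(models.BPE())               # assumed constructor
tokenizer.add_special_tokens(["[PAD]"])           # give the pad token an id

tokenizer.enable_padding(
    direction="right",
    pad_id=tokenizer.token_to_id("[PAD]"),
    pad_token="[PAD]",
    pad_to_multiple_of=8,                         # e.g. a length of 250 becomes 256
)
print(tokenizer.padding)                          # dict of the current parameters
tokenizer.no_padding()                            # afterwards, tokenizer.padding is None
```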
def encode(
@@ -694,8 +731,7 @@ class Tokenizer:
is_pretokenized: bool = False,
add_special_tokens: bool = True,
) -> Encoding:
"""
Encode the given sequence and pair. This method can process raw text sequences
"""Encode the given sequence and pair. This method can process raw text sequences
as well as already pre-tokenized sequences.

Example:
@@ -736,8 +772,7 @@ class Tokenizer:
is_pretokenized: bool = False,
add_special_tokens: bool = True,
) -> List[Encoding]:
"""
Encode the given batch of inputs. This method accepts both raw text sequences
"""Encode the given batch of inputs. This method accepts both raw text sequences
as well as already pre-tokenized sequences.

Example:
@@ -771,82 +806,91 @@ class Tokenizer:
"""
pass
def decode(self, ids: List[int], skip_special_tokens: Optional[bool] = True) -> str:
"""Decode the given list of ids to a string sequence
"""Decode the given list of ids back to a string

This is used to decode anything coming back from a Language Model

Args:
ids: List[unsigned int]:
A list of ids to be decoded
ids (A :obj:`List/Tuple` of :obj:`int`):
The list of ids that we want to decode

skip_special_tokens: (`optional`) boolean:
Whether to remove all the special tokens from the output string
skip_special_tokens (:obj:`bool`, defaults to :obj:`True`):
Whether the special tokens should be removed from the decoded string

Returns:
The decoded string
:obj:`str`: The decoded string
"""
pass
def decode_batch(
self, sequences: List[List[int]], skip_special_tokens: Optional[bool] = True
) -> str:
"""Decode the list of sequences to a list of string sequences
"""Decode a batch of ids back to their corresponding string

Args:
sequences: List[List[unsigned int]]:
A list of sequence of ids to be decoded
sequences (:obj:`List` of :obj:`List[int]`):
The batch of sequences we want to decode

skip_special_tokens: (`optional`) boolean:
Whether to remove all the special tokens from the output strings
skip_special_tokens (:obj:`bool`, defaults to :obj:`True`):
Whether the special tokens should be removed from the decoded strings

Returns:
A list of decoded strings
:obj:`List[str]`: A list of decoded strings
"""
pass
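A hedged sketch of an encode/decode round trip. The toy `models.BPE()` / `pre_tokenizers.Whitespace()` setup and the `Encoding.ids` / `Encoding.tokens` attributes are assumptions not shown in this diff, and the outputs in the comments are indicative only.

```python
from tokenizers import Tokenizer, models, pre_tokenizers

tokenizer = Tokenizer(models.BPE())                      # assumed constructors; toy setup
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
tokenizer.add_tokens(["hello", "world"])                 # so encode has something to find

encoding = tokenizer.encode("hello world")               # -> Encoding
print(encoding.tokens, encoding.ids)                     # e.g. ['hello', 'world'] [0, 1]

print(tokenizer.decode(encoding.ids))                    # e.g. 'hello world'
print(tokenizer.decode_batch([encoding.ids, encoding.ids]))
```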
def token_to_id(self, token: str) -> Optional[int]:
"""Convert the given token to its corresponding id
"""Convert the given token to its corresponding id if it exists

Args:
token: str:
token (:obj:`str`):
The token to convert

Returns:
The corresponding id if it exists, None otherwise
:obj:`Optional[int]`: An optional id, :obj:`None` if out of vocabulary
"""
pass
def id_to_token(self, id: int) -> Optional[str]:
"""Convert the given token id to its corresponding string
"""Convert the given id to its corresponding token if it exists

Args:
token: id:
The token id to convert
id (:obj:`int`):
The id to convert

Returns:
The corresponding string if it exists, None otherwise
:obj:`Optional[str]`: An optional token, :obj:`None` if out of vocabulary
"""
pass
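The two lookup helpers above are symmetric; a small sketch on an assumed toy tokenizer:

```python
from tokenizers import Tokenizer, models

tokenizer = Tokenizer(models.BPE())          # assumed constructor
tokenizer.add_tokens(["hello"])

idx = tokenizer.token_to_id("hello")         # e.g. 0
print(tokenizer.id_to_token(idx))            # 'hello'
print(tokenizer.token_to_id("missing"))      # None for an out-of-vocabulary token
print(tokenizer.id_to_token(10_000))         # None for an unknown id
```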
def add_tokens(self, tokens: List[Union[str, AddedToken]]) -> int:
"""Add the given tokens to the vocabulary

The given tokens are added only if they don't already exist in the vocabulary.
Each token is then given a new id.

Args:
tokens: List[Union[str, AddedToken]]:
A list of tokens to add to the vocabulary. Each token can either be
a string, or an instance of AddedToken
tokens (A :obj:`List` of :class:`~tokenizers.AddedToken` or :obj:`str`):
The list of tokens we want to add to the vocabulary. Each token can be either a
string or an instance of :class:`~tokenizers.AddedToken` for more customization.

Returns:
The number of tokens that were added to the vocabulary
:obj:`int`: The number of tokens that were created in the vocabulary
"""
pass
def add_special_tokens(self, tokens: List[Union[str, AddedToken]]) -> int:
"""Add the given special tokens to the vocabulary, and treat them as special tokens.
"""Add the given special tokens to the Tokenizer.

The special tokens will never be processed by the model, and will be
removed while decoding.
If these tokens are already part of the vocabulary, it just lets the Tokenizer know about
them. If they don't exist, the Tokenizer creates them, giving them a new id.

These special tokens will never be processed by the model (i.e. won't be split into
multiple tokens), and they can be removed from the output when decoding.

Args:
tokens: List[Union[str, AddedToken]]:
The list of special tokens to add. Each token can either be a string
or an instance of AddedToken
tokens (A :obj:`List` of :class:`~tokenizers.AddedToken` or :obj:`str`):
The list of special tokens we want to add to the vocabulary. Each token can either
be a string or an instance of :class:`~tokenizers.AddedToken` for more
customization.

Returns:
The number of tokens that were added to the vocabulary
:obj:`int`: The number of tokens that were created in the vocabulary
"""
pass
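A sketch contrasting the two methods, using the `AddedToken` options documented earlier in this diff; the empty `models.BPE()` constructor is an assumption.

```python
from tokenizers import Tokenizer, AddedToken, models

tokenizer = Tokenizer(models.BPE())                      # assumed constructor

# Regular added tokens: matched in the text, kept when decoding.
n_added = tokenizer.add_tokens(["hello", AddedToken("ing", single_word=False)])

# Special tokens: never split by the model, and removable at decode time
# with skip_special_tokens=True.
n_special = tokenizer.add_special_tokens(
    ["[CLS]", "[SEP]", AddedToken("[MASK]", lstrip=True)]
)

print(n_added, n_special)   # number of tokens actually created in the vocabulary
```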
def post_process(
@@ -858,21 +902,24 @@ class Tokenizer:
"""Apply all the post-processing steps to the given encodings.

The various steps are:
1. Truncate according to global params (provided to `enable_truncation`)
2. Apply the PostProcessor
3. Pad according to global params. (provided to `enable_padding`)

1. Truncate according to the set truncation params (provided with
:meth:`~tokenizers.Tokenizer.enable_truncation`)
2. Apply the :class:`~tokenizers.processors.PostProcessor`
3. Pad according to the set padding params (provided with
:meth:`~tokenizers.Tokenizer.enable_padding`)

Args:
encoding: Encoding:
The main Encoding to post process
encoding (:class:`~tokenizers.Encoding`):
The :class:`~tokenizers.Encoding` corresponding to the main sequence.

pair: Optional[Encoding]:
An optional pair Encoding
pair (:class:`~tokenizers.Encoding`, `optional`):
An optional :class:`~tokenizers.Encoding` corresponding to the pair sequence.

add_special_tokens: bool:
Whether to add special tokens
add_special_tokens (:obj:`bool`):
Whether to add the special tokens

Returns:
The resulting Encoding
:class:`~tokenizers.Encoding`: The final post-processed encoding
"""
pass
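A hedged sketch of `post_process`. The `processors.TemplateProcessing` post-processor used here is an assumption not shown in this diff, so treat the whole block as illustrative rather than as the documented API.

```python
from tokenizers import Tokenizer, models, processors

tokenizer = Tokenizer(models.BPE())                       # assumed constructor
tokenizer.add_tokens(["hello"])
tokenizer.add_special_tokens(["[CLS]", "[SEP]"])

# Assumed TemplateProcessing API from tokenizers.processors
tokenizer.post_processor = processors.TemplateProcessing(
    single="[CLS] $A [SEP]",
    special_tokens=[("[CLS]", tokenizer.token_to_id("[CLS]")),
                    ("[SEP]", tokenizer.token_to_id("[SEP]"))],
)

raw = tokenizer.encode("hello", add_special_tokens=False)  # bare Encoding
final = tokenizer.post_process(raw)                        # truncate -> post-process -> pad
print(final.tokens)                                        # e.g. ['[CLS]', 'hello', '[SEP]']
```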
@@ -53,7 +53,7 @@ use crate::processors::PyPostProcessor;
/// Yesterday"``.
///
#[pyclass(dict, module = "tokenizers", name=AddedToken)]
#[text_signature = "(content, **kwargs)"]
#[text_signature = "(content, single_word=False, lstrip=False, rstrip=False, normalized=True)"]
pub struct PyAddedToken {
pub content: String,
pub is_special_token: bool,
@@ -408,6 +408,7 @@ type Tokenizer = TokenizerImpl<PyModel, PyNormalizer, PyPreTokenizer, PyPostProc
/// The core algorithm that this :obj:`Tokenizer` should be using.
///
#[pyclass(dict, module = "tokenizers", name=Tokenizer)]
#[text_signature = "(model)"]
#[derive(Clone)]
pub struct PyTokenizer {
tokenizer: Tokenizer,
@@ -533,7 +534,7 @@ impl PyTokenizer {
/// path (:obj:`str`):
/// A path to a file in which to save the serialized tokenizer.
///
/// pretty (:obj:`bool`, defaults to :obj:`False):
/// pretty (:obj:`bool`, defaults to :obj:`False`):
/// Whether the JSON file should be pretty formatted.
#[args(pretty = false)]
#[text_signature = "($self, pretty=False)"]
@@ -551,7 +552,7 @@ impl PyTokenizer {
/// Get the underlying vocabulary
///
/// Args:
/// with_added_tokens (:obj:`bool, defaults to :obj:`True`):
/// with_added_tokens (:obj:`bool`, defaults to :obj:`True`):
/// Whether to include the added tokens
///
/// Returns:
@@ -565,7 +566,7 @@ impl PyTokenizer {
/// Get the size of the underlying vocabulary
///
/// Args:
/// with_added_tokens (:obj:`bool, defaults to :obj:`True`):
/// with_added_tokens (:obj:`bool`, defaults to :obj:`True`):
/// Whether to include the added tokens
///
/// Returns:
@@ -632,6 +633,8 @@ impl PyTokenizer {

/// Get the currently set truncation parameters
///
/// `Cannot set, use` :meth:`~tokenizers.Tokenizer.enable_truncation` `instead`
///
/// Returns:
/// (:obj:`dict`, `optional`):
/// A dict with the current truncation parameters if truncation is enabled
@@ -737,6 +740,8 @@ impl PyTokenizer {

/// Get the current padding parameters
///
/// `Cannot be set, use` :meth:`~tokenizers.Tokenizer.enable_padding` `instead`
///
/// Returns:
/// (:obj:`dict`, `optional`):
/// A dict with the current padding parameters if padding is enabled
@@ -797,7 +802,7 @@ impl PyTokenizer {
/// :class:`~tokenizers.Encoding`: The encoded result
///
#[args(pair = "None", is_pretokenized = "false", add_special_tokens = "true")]
#[text_signature = "($self, sequence, pair=None, is_pretokenized=False, add_special_tokens=True, /)"]
#[text_signature = "($self, sequence, pair=None, is_pretokenized=False, add_special_tokens=True)"]
fn encode(
&self,
sequence: &PyAny,
@@ -862,7 +867,7 @@ impl PyTokenizer {
/// A :obj:`List` of :class:`~tokenizers.Encoding`: The encoded batch
///
#[args(is_pretokenized = "false", add_special_tokens = "true")]
#[text_signature = "($self, input, is_pretokenized=False, add_special_tokens=True, /)"]
#[text_signature = "($self, input, is_pretokenized=False, add_special_tokens=True)"]
fn encode_batch(
&self,
input: Vec<&PyAny>,
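These bindings expose the batched counterparts of encode/decode; a sketch of the corresponding Python usage, on the same assumed toy setup as the earlier sketches:

```python
from tokenizers import Tokenizer, models, pre_tokenizers

tokenizer = Tokenizer(models.BPE())                      # assumed constructor
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
tokenizer.add_tokens(["hello", "world"])

encodings = tokenizer.encode_batch(["hello world", "world hello"])
ids = [e.ids for e in encodings]                         # e.g. [[0, 1], [1, 0]]
print(tokenizer.decode_batch(ids, skip_special_tokens=True))

# Already pre-tokenized input is also accepted when is_pretokenized=True
pretok = tokenizer.encode_batch([["hello", "world"]], is_pretokenized=True)
```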
@@ -891,37 +896,88 @@ impl PyTokenizer {
})
}

fn decode(&self, ids: Vec<u32>, skip_special_tokens: Option<bool>) -> PyResult<String> {
ToPyResult(
self.tokenizer
.decode(ids, skip_special_tokens.unwrap_or(true)),
)
.into()
/// Decode the given list of ids back to a string
///
/// This is used to decode anything coming back from a Language Model
///
/// Args:
/// ids (A :obj:`List/Tuple` of :obj:`int`):
/// The list of ids that we want to decode
///
/// skip_special_tokens (:obj:`bool`, defaults to :obj:`True`):
/// Whether the special tokens should be removed from the decoded string
///
/// Returns:
/// :obj:`str`: The decoded string
#[args(skip_special_tokens = true)]
#[text_signature = "($self, ids, skip_special_tokens=True)"]
fn decode(&self, ids: Vec<u32>, skip_special_tokens: bool) -> PyResult<String> {
ToPyResult(self.tokenizer.decode(ids, skip_special_tokens)).into()
}

/// Decode a batch of ids back to their corresponding string
///
/// Args:
/// sequences (:obj:`List` of :obj:`List[int]`):
/// The batch of sequences we want to decode
///
/// skip_special_tokens (:obj:`bool`, defaults to :obj:`True`):
/// Whether the special tokens should be removed from the decoded strings
///
/// Returns:
/// :obj:`List[str]`: A list of decoded strings
#[args(skip_special_tokens = true)]
#[text_signature = "($self, sequences, skip_special_tokens=True)"]
fn decode_batch(
&self,
sentences: Vec<Vec<u32>>,
skip_special_tokens: Option<bool>,
sequences: Vec<Vec<u32>>,
skip_special_tokens: bool,
) -> PyResult<Vec<String>> {
let gil = Python::acquire_gil();
gil.python().allow_threads(|| {
ToPyResult(
self.tokenizer
.decode_batch(sentences, skip_special_tokens.unwrap_or(true)),
)
.into()
ToPyResult(self.tokenizer.decode_batch(sequences, skip_special_tokens)).into()
})
}

/// Convert the given token to its corresponding id if it exists
///
/// Args:
/// token (:obj:`str`):
/// The token to convert
///
/// Returns:
/// :obj:`Optional[int]`: An optional id, :obj:`None` if out of vocabulary
#[text_signature = "($self, token)"]
fn token_to_id(&self, token: &str) -> Option<u32> {
self.tokenizer.token_to_id(token)
}

/// Convert the given id to its corresponding token if it exists
///
/// Args:
/// id (:obj:`int`):
/// The id to convert
///
/// Returns:
/// :obj:`Optional[str]`: An optional token, :obj:`None` if out of vocabulary
#[text_signature = "($self, id)"]
fn id_to_token(&self, id: u32) -> Option<&str> {
self.tokenizer.id_to_token(id)
}

/// Add the given tokens to the vocabulary
///
/// The given tokens are added only if they don't already exist in the vocabulary.
/// Each token is then given a new id.
///
/// Args:
/// tokens (A :obj:`List` of :class:`~tokenizers.AddedToken` or :obj:`str`):
/// The list of tokens we want to add to the vocabulary. Each token can be either a
/// string or an instance of :class:`~tokenizers.AddedToken` for more customization.
///
/// Returns:
/// :obj:`int`: The number of tokens that were created in the vocabulary
#[text_signature = "($self, tokens)"]
fn add_tokens(&mut self, tokens: &PyList) -> PyResult<usize> {
let tokens = tokens
.into_iter()
@@ -942,6 +998,23 @@ impl PyTokenizer {
Ok(self.tokenizer.add_tokens(&tokens))
}

/// Add the given special tokens to the Tokenizer.
///
/// If these tokens are already part of the vocabulary, it just lets the Tokenizer know about
/// them. If they don't exist, the Tokenizer creates them, giving them a new id.
///
/// These special tokens will never be processed by the model (i.e. won't be split into
/// multiple tokens), and they can be removed from the output when decoding.
///
/// Args:
/// tokens (A :obj:`List` of :class:`~tokenizers.AddedToken` or :obj:`str`):
/// The list of special tokens we want to add to the vocabulary. Each token can either
/// be a string or an instance of :class:`~tokenizers.AddedToken` for more
/// customization.
///
/// Returns:
/// :obj:`int`: The number of tokens that were created in the vocabulary
#[text_signature = "($self, tokens)"]
fn add_special_tokens(&mut self, tokens: &PyList) -> PyResult<usize> {
let tokens = tokens
.into_iter()
@@ -968,7 +1041,30 @@ impl PyTokenizer {
.allow_threads(|| ToPyResult(self.tokenizer.train_and_replace(trainer, files)).into())
}

/// Apply all the post-processing steps to the given encodings.
///
/// The various steps are:
///
/// 1. Truncate according to the set truncation params (provided with
/// :meth:`~tokenizers.Tokenizer.enable_truncation`)
/// 2. Apply the :class:`~tokenizers.processors.PostProcessor`
/// 3. Pad according to the set padding params (provided with
/// :meth:`~tokenizers.Tokenizer.enable_padding`)
///
/// Args:
/// encoding (:class:`~tokenizers.Encoding`):
/// The :class:`~tokenizers.Encoding` corresponding to the main sequence.
///
/// pair (:class:`~tokenizers.Encoding`, `optional`):
/// An optional :class:`~tokenizers.Encoding` corresponding to the pair sequence.
///
/// add_special_tokens (:obj:`bool`):
/// Whether to add the special tokens
///
/// Returns:
/// :class:`~tokenizers.Encoding`: The final post-processed encoding
#[args(pair = "None", add_special_tokens = true)]
#[text_signature = "($self, encoding, pair=None, add_special_tokens=True)"]
fn post_process(
&self,
encoding: &PyEncoding,
@@ -987,16 +1083,19 @@ impl PyTokenizer {
.into()
}

/// The :class:`~tokenizers.models.Model` in use by the Tokenizer
#[getter]
fn get_model(&self) -> PyResult<PyObject> {
self.tokenizer.get_model().get_as_subtype()
}

/// Set the :class:`~tokenizers.models.Model`
#[setter]
fn set_model(&mut self, model: PyRef<PyModel>) {
self.tokenizer.with_model(model.clone());
}

/// The `optional` :class:`~tokenizers.normalizers.Normalizer` in use by the Tokenizer
#[getter]
fn get_normalizer(&self) -> PyResult<PyObject> {
if let Some(n) = self.tokenizer.get_normalizer() {
@@ -1006,11 +1105,13 @@ impl PyTokenizer {
}
}

/// Set the :class:`~tokenizers.normalizers.Normalizer`
#[setter]
fn set_normalizer(&mut self, normalizer: PyRef<PyNormalizer>) {
self.tokenizer.with_normalizer(normalizer.clone());
}

/// The `optional` :class:`~tokenizers.pre_tokenizers.PreTokenizer` in use by the Tokenizer
#[getter]
fn get_pre_tokenizer(&self) -> PyResult<PyObject> {
if let Some(pt) = self.tokenizer.get_pre_tokenizer() {
@@ -1020,11 +1121,13 @@ impl PyTokenizer {
}
}

/// Set the :class:`~tokenizers.pre_tokenizers.PreTokenizer`
#[setter]
fn set_pre_tokenizer(&mut self, pretok: PyRef<PyPreTokenizer>) {
self.tokenizer.with_pre_tokenizer(pretok.clone());
}

/// The `optional` :class:`~tokenizers.processors.PostProcessor` in use by the Tokenizer
#[getter]
fn get_post_processor(&self) -> PyResult<PyObject> {
if let Some(n) = self.tokenizer.get_post_processor() {
@@ -1034,11 +1137,13 @@ impl PyTokenizer {
}
}

/// Set the :class:`~tokenizers.processors.PostProcessor`
#[setter]
fn set_post_processor(&mut self, processor: PyRef<PyPostProcessor>) {
self.tokenizer.with_post_processor(processor.clone());
}

/// The `optional` :class:`~tokenizers.decoders.Decoder` in use by the Tokenizer
#[getter]
fn get_decoder(&self) -> PyResult<PyObject> {
if let Some(dec) = self.tokenizer.get_decoder() {
@@ -1048,6 +1153,7 @@ impl PyTokenizer {
}
}

/// Set the :class:`~tokenizers.decoders.Decoder`
#[setter]
fn set_decoder(&mut self, decoder: PyRef<PyDecoder>) {
self.tokenizer.with_decoder(decoder.clone());
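The getters and setters above surface each pipeline component as a plain Python attribute. A sketch of what that looks like from the Python side; the concrete `normalizers.Lowercase`, `pre_tokenizers.Whitespace` and `decoders.BPEDecoder` classes are assumptions, not part of this diff.

```python
from tokenizers import Tokenizer, models, normalizers, pre_tokenizers, decoders

tokenizer = Tokenizer(models.BPE())                # assumed constructor

# Each pipeline component is both readable and assignable:
tokenizer.normalizer = normalizers.Lowercase()
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
tokenizer.decoder = decoders.BPEDecoder()

print(tokenizer.model)            # the Model currently in use
print(tokenizer.normalizer)       # the optional Normalizer
print(tokenizer.post_processor)   # None here, since no PostProcessor was assigned
```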