Doc - API Reference for most Tokenizer methods/attributes

Anthony MOI
2020-10-07 13:12:07 -04:00
committed by Anthony MOI
parent 8c0370657e
commit a86d49634c
2 changed files with 304 additions and 151 deletions


@@ -422,9 +422,37 @@ class Encoding:
pass pass
class AddedToken: class AddedToken:
"""AddedToken represents a token to be added to a Tokenizer """AddedToken
An AddedToken can have special options defining the way it should behave. Represents a token that can be added to a :class:`~tokenizers.Tokenizer`.
It can have special options that define the way it should behave.
Args:
content (:obj:`str`): The content of the token
single_word (:obj:`bool`, defaults to :obj:`False`):
Defines whether this token should only match single words. If :obj:`True`, this
token will never match inside of a word. For example the token ``ing`` would match
on ``tokenizing`` if this option is :obj:`False`, but not if it is :obj:`True`.
The notion of "`inside of a word`" is defined by the word boundaries pattern in
regular expressions (i.e. the token should start and end with word boundaries).
lstrip (:obj:`bool`, defaults to :obj:`False`):
Defines whether this token should strip all potential whitespaces on its left side.
If :obj:`True`, this token will greedily match any whitespace on its left. For
example if we try to match the token ``[MASK]`` with ``lstrip=True``, in the text
``"I saw a [MASK]"``, we would match on ``" [MASK]"``. (Note the space on the left).
rstrip (:obj:`bool`, defaults to :obj:`False`):
Defines whether this token should strip all potential whitespaces on its right
side. If :obj:`True`, this token will greedily match any whitespace on its right.
It works just like :obj:`lstrip` but on the right.
normalized (:obj:`bool`, defaults to :obj:`True` with :meth:`~tokenizers.Tokenizer.add_tokens` and :obj:`False` with :meth:`~tokenizers.Tokenizer.add_special_tokens`):
Defines whether this token should match against the normalized version of the input
text. For example, with the added token ``"yesterday"``, and a normalizer in charge of
lowercasing the text, the token could be extracted from the input ``"I saw a lion
Yesterday"``.
""" """
def __new__( def __new__(
@@ -438,55 +466,54 @@ class AddedToken:
"""Instantiate a new AddedToken """Instantiate a new AddedToken
Args: Args:
content: str: content (:obj:`str`): The content of the token
The content of the token
single_word: bool single_word (:obj:`bool`, defaults to :obj:`False`):
Whether this token should only match against single words. If True, Defines whether this token should only match single words. If :obj:`True`, this
this token will never match inside of a word. For example the token `ing` would token will never match inside of a word. For example the token ``ing`` would match
match on `tokenizing` if this option if False, but not if this option is True. on ``tokenizing`` if this option is :obj:`False`, but not if it is :obj:`True`.
The notion of "`inside of a word`" is defined by the word boundaries pattern in
regular expressions (i.e. the token should start and end with word boundaries).
lstrip: bool lstrip (:obj:`bool`, defaults to :obj:`False`):
Whether this token should strip all potential whitespaces on the left side. Defines whether this token should strip all potential whitespaces on its left side.
If True, this token will greedily match any whitespace on the left. For example, If :obj:`True`, this token will greedily match any whitespace on its left. For
if we try to match the token `[MASK]` with lstrip=True, in the text `I saw a [MASK]` example if we try to match the token ``[MASK]`` with ``lstrip=True``, in the text
we will match on ` [MASK]`. ``"I saw a [MASK]"``, we would match on ``" [MASK]"``. (Note the space on the left).
rstrip: bool rstrip (:obj:`bool`, defaults to :obj:`False`):
Whether this token should strip all potential whitespaces on the right side. Defines whether this token should strip all potential whitespaces on its right
If True, this token will greedily match any whitespace on the right. It works just side. If :obj:`True`, this token will greedily match any whitespace on its right.
like lstrip, but on the right. It works just like :obj:`lstrip` but on the right.
normalized: bool: normalized (:obj:`bool`, defaults to :obj:`True` with :meth:`~tokenizers.Tokenizer.add_tokens` and :obj:`False` with :meth:`~tokenizers.Tokenizer.add_special_tokens`):
Whether this token should be match the normalized version of the input text. For Defines whether this token should match against the normalized version of the input
example, with the added token `yesterday` and a normalizer in charge of lowercasing text. For example, with the added token ``"yesterday"``, and a normalizer in charge of
the text, the token could be extract from the input `I saw a lion Yesterday`. lowercasing the text, the token could be extracted from the input ``"I saw a lion
Yesterday"``.
""" """
pass pass
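
As an illustration of the options above, a minimal sketch of adding customized tokens (the tokenizer.json path is only a placeholder for any previously saved tokenizer, not something this commit provides):

from tokenizers import Tokenizer, AddedToken

# Placeholder: load any previously saved tokenizer.
tokenizer = Tokenizer.from_file("tokenizer.json")

# single_word=True: "ing" will never match inside a word such as "tokenizing".
tokenizer.add_tokens([AddedToken("ing", single_word=True)])

# lstrip=True: "[MASK]" greedily matches the whitespace on its left,
# so in "I saw a [MASK]" the matched span is " [MASK]".
tokenizer.add_special_tokens([AddedToken("[MASK]", lstrip=True)])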
class Tokenizer: class Tokenizer:
"""Tokenizer """Tokenizer
A Tokenizer works as a pipeline, it processes some raw text as input and outputs A :obj:`Tokenizer` works as a pipeline. It processes some raw text as input
an `Encoding`. and outputs an :class:`~tokenizers.Encoding`.
The various steps of the pipeline are: Args:
1. The `Normalizer`: in charge of normalizing the text. Common examples of model (:class:`~tokenizers.models.Model`):
normalization are the unicode normalization standards, such as NFD or NFKC. The core algorithm that this :obj:`Tokenizer` should be using.
2. The `PreTokenizer`: in charge of creating initial words splits in the text.
The most common way of splitting text is simply on whitespace.
3. The `Model`: in charge of doing the actual tokenization. An example of a
`Model` would be `BPE` or `WordPiece`.
4. The `PostProcessor`: in charge of post-processing the `Encoding` to add anything
relevant that, for example, a language model would need, such as special tokens.
""" """
def __new__(cls, model: models.Model) -> Tokenizer: def __new__(cls, model: models.Model) -> Tokenizer:
"""Instantiate a new Tokenizer using the given Model """Instantiate a new Tokenizer using the given Model
A :obj:`Tokenizer` works as a pipeline. It processes some raw text as input
and outputs an :class:`~tokenizers.Encoding`.
Args: Args:
model: models.Model: model (:class:`~tokenizers.models.Model`):
The model to be used with this Tokenizer The core algorithm that this :obj:`Tokenizer` should be using.
Returns: Returns:
Tokenizer Tokenizer
@@ -494,57 +521,62 @@ class Tokenizer:
pass pass
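
A minimal sketch of wiring a Tokenizer around a model; the empty BPE model and Whitespace pre-tokenizer are just assumptions for illustration:

from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import Whitespace

# The model is the only required argument; the other pipeline steps are optional.
tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
tokenizer.pre_tokenizer = Whitespace()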
@staticmethod @staticmethod
def from_str(s: str) -> Tokenizer: def from_str(s: str) -> Tokenizer:
"""Instantiate a new Tokenizer from the given JSON string """Instantiate a new :class:`~tokenizers.Tokenizer` from the given JSON string.
Args: Args:
s: str: json (:obj:`str`):
A JSON string representation of the Tokenizer A valid JSON string representing a previously serialized
:class:`~tokenizers.Tokenizer`
Returns: Returns:
Tokenizer :class:`~tokenizers.Tokenizer`: The new tokenizer
""" """
pass pass
@staticmethod @staticmethod
def from_file(path: str) -> Tokenizer: def from_file(path: str) -> Tokenizer:
"""Instantiate a new Tokenizer from the given file """Instantiate a new :class:`~tokenizers.Tokenizer` from the file at the given path.
Args: Args:
path: str: path (:obj:`str`):
Path to a file containing a Tokenizer A path to a local JSON file representing a previously serialized
:class:`~tokenizers.Tokenizer`
Returns: Returns:
Tokenizer :class:`~tokenizers.Tokenizer`: The new tokenizer
""" """
pass pass
@staticmethod @staticmethod
def from_buffer(buffer: bytes) -> Tokenizer: def from_buffer(buffer: bytes) -> Tokenizer:
"""Instantiate a new Tokenizer from the given buffer """Instantiate a new :class:`~tokenizers.Tokenizer` from the given buffer.
Args: Args:
buffer: bytes: buffer (:obj:`bytes`):
A buffer used to instantiate a new Tokenizer A buffer containing a previously serialized :class:`~tokenizers.Tokenizer`
Returns: Returns:
Tokenizer :class:`~tokenizers.Tokenizer`: The new tokenizer
""" """
pass pass
def to_str(self, pretty: bool = False) -> str: def to_str(self, pretty: bool = False) -> str:
"""Get a serialized JSON version of the Tokenizer as a str """Gets a serialized string representing this :class:`~tokenizers.Tokenizer`.
Args: Args:
pretty: bool: pretty (:obj:`bool`, defaults to :obj:`False`):
Whether the JSON string should be prettified Whether the JSON string should be pretty formatted.
Returns: Returns:
str :obj:`str`: A string representing the serialized Tokenizer
""" """
pass pass
def save(self, path: str, pretty: bool = False): def save(self, path: str, pretty: bool = False):
"""Save the Tokenizer as JSON to the given path """Save the :class:`~tokenizers.Tokenizer` to the file at the given path.
Args: Args:
pretty: bool: path (:obj:`str`):
Whether the JSON string should be prettified A path to a file in which to save the serialized tokenizer.
pretty (:obj:`bool`, defaults to :obj:`False`):
Whether the JSON file should be pretty formatted.
""" """
pass pass
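
The constructors above mirror to_str and save; a sketch of a full round trip (the file name is a placeholder):

from tokenizers import Tokenizer
from tokenizers.models import BPE

tokenizer = Tokenizer(BPE())  # any tokenizer works the same way

# String round trip
json_str = tokenizer.to_str(pretty=True)
restored = Tokenizer.from_str(json_str)

# File and buffer round trips
tokenizer.save("tokenizer.json", pretty=True)
from_disk = Tokenizer.from_file("tokenizer.json")
with open("tokenizer.json", "rb") as f:
    from_buffer = Tokenizer.from_buffer(f.read())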
@property @property
@@ -593,40 +625,41 @@ class Tokenizer:
""" """
pass pass
def get_vocab(self, with_added_tokens: bool = True) -> Dict[str, int]: def get_vocab(self, with_added_tokens: bool = True) -> Dict[str, int]:
"""Returns the vocabulary """Get the underlying vocabulary
Args: Args:
with_added_tokens: boolean: with_added_tokens (:obj:`bool`, defaults to :obj:`True`):
Whether to include the added tokens in the vocabulary Whether to include the added tokens
Returns: Returns:
The vocabulary :obj:`Dict[str, int]`: The vocabulary
""" """
pass pass
def get_vocab_size(self, with_added_tokens: bool = True) -> int: def get_vocab_size(self, with_added_tokens: bool = True) -> int:
"""Returns the size of the vocabulary """Get the size of the underlying vocabulary
Args: Args:
with_added_tokens: boolean: with_added_tokens (:obj:`bool`, defaults to :obj:`True`):
Whether to include the added tokens in the vocabulary's size Whether to include the added tokens
Returns: Returns:
The size of the vocabulary :obj:`int`: The size of the vocabulary
""" """
pass pass
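
A short, runnable sketch of the difference made by with_added_tokens (the tiny vocabulary here is purely illustrative):

from tokenizers import Tokenizer
from tokenizers.models import BPE

tokenizer = Tokenizer(BPE())          # empty model vocabulary
tokenizer.add_tokens(["hello", "world"])

print(tokenizer.get_vocab_size(with_added_tokens=False))  # 0: model vocabulary only
print(tokenizer.get_vocab_size(with_added_tokens=True))   # 2: added tokens included
print(tokenizer.get_vocab(with_added_tokens=True))        # e.g. {"hello": 0, "world": 1}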
def enable_truncation(self, max_length: int, stride: Optional[int], strategy: Optional[str]): def enable_truncation(self, max_length: int, stride: Optional[int], strategy: Optional[str]):
"""Enable the truncation """Enable truncation
Args: Args:
max_length: unsigned int: max_length (:obj:`int`):
The maximum length at which to truncate The max length at which to truncate
stride: (`optional`) unsigned int: stride (:obj:`int`, `optional`):
The length of the previous first sequence to be included The length of the previous first sequence to be included in the overflowing
in the overflowing sequence sequence
strategy: (`optional) str: strategy (:obj:`str`, `optional`, defaults to :obj:`longest_first`):
Can be one of `longest_first`, `only_first` or `only_second` The strategy used for truncation. Can be one of ``longest_first``, ``only_first`` or
``only_second``.
""" """
pass pass
def no_truncation(self): def no_truncation(self):
@@ -634,11 +667,13 @@ class Tokenizer:
pass pass
@property @property
def truncation(self) -> Optional[dict]: def truncation(self) -> Optional[dict]:
"""Get the current truncation parameters """Get the currently set truncation parameters
`Cannot set, use` :meth:`~tokenizers.Tokenizer.enable_truncation` `instead`
Returns: Returns:
None if truncation is disabled, a dict with the current truncation parameters if (:obj:`dict`, `optional`):
truncation is enabled A dict with the current truncation parameters if truncation is enabled
""" """
pass pass
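
A sketch of the truncation life cycle described above (parameter values are arbitrary):

from tokenizers import Tokenizer
from tokenizers.models import BPE

tokenizer = Tokenizer(BPE())

tokenizer.enable_truncation(max_length=512, stride=32, strategy="longest_first")
print(tokenizer.truncation)   # a dict such as {"max_length": 512, "stride": 32, "strategy": "longest_first"}

tokenizer.no_truncation()
print(tokenizer.truncation)   # None once truncation is disabled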
def enable_padding( def enable_padding(
@@ -653,26 +688,26 @@ class Tokenizer:
"""Enable the padding """Enable the padding
Args: Args:
direction: (`optional`) str: direction (:obj:`str`, `optional`, defaults to :obj:`right`):
Can be one of: `right` or `left` The direction in which to pad. Can be either ``right`` or ``left``
pad_to_multiple_of: (`optional`) unsigned int: pad_to_multiple_of (:obj:`int`, `optional`):
If specified, the padding length should always snap to the next multiple of If specified, the padding length should always snap to the next multiple of the
the given value. For example if we were going to pad with a length of 250 but given value. For example if we were going to pad with a length of 250 but
`pad_to_multiple_of=8` then we will pad to 256. ``pad_to_multiple_of=8`` then we will pad to 256.
pad_id: (`optional`) unsigned int: pad_id (:obj:`int`, defaults to 0):
The indice to be used when padding The id to be used when padding
pad_type_id: (`optional`) unsigned int: pad_type_id (:obj:`int`, defaults to 0):
The type indice to be used when padding The type id to be used when padding
pad_token: (`optional`) str: pad_token (:obj:`str`, defaults to :obj:`[PAD]`):
The pad token to be used when padding The pad token to be used when padding
length: (`optional`) unsigned int: length (:obj:`int`, `optional`):
If specified, the length at which to pad. If not specified If specified, the length at which to pad. If not specified we pad using the size of
we pad using the size of the longest sequence in a batch the longest sequence in a batch.
""" """
pass pass
def no_padding(self): def no_padding(self):
@@ -682,9 +717,11 @@ class Tokenizer:
def padding(self) -> Optional[dict]: def padding(self) -> Optional[dict]:
"""Get the current padding parameters """Get the current padding parameters
`Cannot be set, use` :meth:`~tokenizers.Tokenizer.enable_padding` `instead`
Returns: Returns:
None if padding is disabled, a dict with the currently set parameters (:obj:`dict`, `optional`):
if the padding is enabled. A dict with the current padding parameters if padding is enabled
""" """
pass pass
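
And the equivalent sketch for padding (again with arbitrary parameter values):

from tokenizers import Tokenizer
from tokenizers.models import BPE

tokenizer = Tokenizer(BPE())

tokenizer.enable_padding(direction="right", pad_id=0, pad_token="[PAD]", pad_to_multiple_of=8)
print(tokenizer.padding)   # the currently set parameters, as a dict

tokenizer.no_padding()
print(tokenizer.padding)   # None once padding is disabled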
def encode( def encode(
@@ -694,8 +731,7 @@ class Tokenizer:
is_pretokenized: bool = False, is_pretokenized: bool = False,
add_special_tokens: bool = True, add_special_tokens: bool = True,
) -> Encoding: ) -> Encoding:
""" """Encode the given sequence and pair. This method can process raw text sequences
Encode the given sequence and pair. This method can process raw text sequences
as well as already pre-tokenized sequences. as well as already pre-tokenized sequences.
Example: Example:
@@ -736,8 +772,7 @@ class Tokenizer:
is_pretokenized: bool = False, is_pretokenized: bool = False,
add_special_tokens: bool = True, add_special_tokens: bool = True,
) -> List[Encoding]: ) -> List[Encoding]:
""" """Encode the given batch of inputs. This method accept both raw text sequences
Encode the given batch of inputs. This method accept both raw text sequences
as well as already pre-tokenized sequences. as well as already pre-tokenized sequences.
Example: Example:
@@ -771,82 +806,91 @@ class Tokenizer:
""" """
pass pass
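
Since the diff elides the Example blocks, here is an independent sketch of both methods (the tokenizer.json path is a placeholder for any trained tokenizer):

from tokenizers import Tokenizer

# Placeholder: load any trained tokenizer.
tokenizer = Tokenizer.from_file("tokenizer.json")

# A single sequence, optionally with a pair.
encoding = tokenizer.encode("A single sequence", "And its pair")
print(encoding.tokens, encoding.ids, encoding.type_ids)

# Already pre-tokenized input: pass a list of words and set is_pretokenized.
pre = tokenizer.encode(["A", "pre", "tokenized", "sequence"], is_pretokenized=True)

# A batch: each item is either a sequence or a (sequence, pair) tuple.
encodings = tokenizer.encode_batch([
    "First sequence",
    ("Second sequence", "with its pair"),
])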
def decode(self, ids: List[int], skip_special_tokens: Optional[bool] = True) -> str: def decode(self, ids: List[int], skip_special_tokens: Optional[bool] = True) -> str:
"""Decode the given list of ids to a string sequence """Decode the given list of ids back to a string
This is used to decode anything coming back from a Language Model
Args: Args:
ids: List[unsigned int]: ids (A :obj:`List/Tuple` of :obj:`int`):
A list of ids to be decoded The list of ids that we want to decode
skip_special_tokens: (`optional`) boolean: skip_special_tokens (:obj:`bool`, defaults to :obj:`True`):
Whether to remove all the special tokens from the output string Whether the special tokens should be removed from the decoded string
Returns: Returns:
The decoded string :obj:`str`: The decoded string
""" """
pass pass
def decode_batch( def decode_batch(
self, sequences: List[List[int]], skip_special_tokens: Optional[bool] = True self, sequences: List[List[int]], skip_special_tokens: Optional[bool] = True
) -> str: ) -> str:
"""Decode the list of sequences to a list of string sequences """Decode a batch of ids back to their corresponding string
Args: Args:
sequences: List[List[unsigned int]]: sequences (:obj:`List` of :obj:`List[int]`):
A list of sequence of ids to be decoded The batch of sequences we want to decode
skip_special_tokens: (`optional`) boolean: skip_special_tokens (:obj:`bool`, defaults to :obj:`True`):
Whether to remove all the special tokens from the output strings Whether the special tokens should be removed from the decoded strings
Returns: Returns:
A list of decoded strings :obj:`List[str]`: A list of decoded strings
""" """
pass pass
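
A sketch of decoding single and batched outputs (same placeholder tokenizer path as above):

from tokenizers import Tokenizer

tokenizer = Tokenizer.from_file("tokenizer.json")  # placeholder path

output = tokenizer.encode("Hello, world!")
print(tokenizer.decode(output.ids))                             # special tokens stripped
print(tokenizer.decode(output.ids, skip_special_tokens=False))  # special tokens kept

batch = tokenizer.encode_batch(["Hello, world!", "How are you?"])
print(tokenizer.decode_batch([e.ids for e in batch]))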
def token_to_id(self, token: str) -> Optional[int]: def token_to_id(self, token: str) -> Optional[int]:
"""Convert the given token to its corresponding id """Convert the given token to its corresponding id if it exists
Args: Args:
token: str: token (:obj:`str`):
The token to convert The token to convert
Returns: Returns:
The corresponding id if it exists, None otherwise :obj:`Optional[int]`: An optional id, :obj:`None` if out of vocabulary
""" """
pass pass
def id_to_token(self, id: int) -> Optional[str]: def id_to_token(self, id: int) -> Optional[str]:
"""Convert the given token id to its corresponding string """Convert the given id to its corresponding token if it exists
Args: Args:
token: id: id (:obj:`int`):
The token id to convert The id to convert
Returns: Returns:
The corresponding string if it exists, None otherwise :obj:`Optional[str]`: An optional token, :obj:`None` if out of vocabulary
""" """
pass pass
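
A runnable sketch of both lookups, including the None cases (the special tokens and ids below are illustrative):

from tokenizers import Tokenizer
from tokenizers.models import BPE

tokenizer = Tokenizer(BPE())
tokenizer.add_special_tokens(["[CLS]", "[SEP]"])

print(tokenizer.token_to_id("[CLS]"))     # e.g. 0
print(tokenizer.token_to_id("missing"))   # None: out of vocabulary
print(tokenizer.id_to_token(0))           # e.g. "[CLS]"
print(tokenizer.id_to_token(9999))        # None: no token with this id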
def add_tokens(self, tokens: List[Union[str, AddedToken]]) -> int: def add_tokens(self, tokens: List[Union[str, AddedToken]]) -> int:
"""Add the given tokens to the vocabulary """Add the given tokens to the vocabulary
The given tokens are added only if they don't already exist in the vocabulary.
Each token is then assigned a new id.
Args: Args:
tokens: List[Union[str, AddedToken]]: tokens (A :obj:`List` of :class:`~tokenizers.AddedToken` or :obj:`str`):
A list of tokens to add to the vocabulary. Each token can either be The list of tokens we want to add to the vocabulary. Each token can be either a
a string, or an instance of AddedToken string or an instance of :class:`~tokenizers.AddedToken` for more customization.
Returns: Returns:
The number of tokens that were added to the vocabulary :obj:`int`: The number of tokens that were created in the vocabulary
""" """
pass pass
def add_special_tokens(self, tokens: List[Union[str, AddedToken]]) -> int: def add_special_tokens(self, tokens: List[Union[str, AddedToken]]) -> int:
"""Add the given special tokens to the vocabulary, and treat them as special tokens. """Add the given special tokens to the Tokenizer.
The special tokens will never be processed by the model, and will be If these tokens are already part of the vocabulary, it just lets the Tokenizer know about
removed while decoding. them. If they don't exist, the Tokenizer creates them, giving them a new id.
These special tokens will never be processed by the model (ie won't be split into
multiple tokens), and they can be removed from the output when decoding.
Args: Args:
tokens: List[Union[str, AddedToken]]: tokens (A :obj:`List` of :class:`~tokenizers.AddedToken` or :obj:`str`):
The list of special tokens to add. Each token can either be a string The list of special tokens we want to add to the vocabulary. Each token can either
or an instance of AddedToken be a string or an instance of :class:`~tokenizers.AddedToken` for more
customization.
Returns: Returns:
The number of tokens that were added to the vocabulary :obj:`int`: The number of tokens that were created in the vocabulary
""" """
pass pass
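
A sketch contrasting the two methods; both accept plain strings or AddedToken instances:

from tokenizers import Tokenizer, AddedToken
from tokenizers.models import BPE

tokenizer = Tokenizer(BPE())

# Regular added tokens: matched in the input, but treated like any other token.
n_added = tokenizer.add_tokens(["my_token", AddedToken("other_token", single_word=True)])

# Special tokens: never split by the model, removable with skip_special_tokens when decoding.
n_special = tokenizer.add_special_tokens(["[CLS]", "[SEP]", AddedToken("[MASK]", lstrip=True)])

print(n_added, n_special)  # numbers of newly created vocabulary entries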
def post_process( def post_process(
@@ -858,21 +902,24 @@ class Tokenizer:
"""Apply all the post-processing steps to the given encodings. """Apply all the post-processing steps to the given encodings.
The various steps are: The various steps are:
1. Truncate according to global params (provided to `enable_truncation`)
2. Apply the PostProcessor 1. Truncate according to the set truncation params (provided with
3. Pad according to global params. (provided to `enable_padding`) :meth:`~tokenizers.Tokenizer.enable_truncation`)
2. Apply the :class:`~tokenizers.processors.PostProcessor`
3. Pad according to the set padding params (provided with
:meth:`~tokenizers.Tokenizer.enable_padding`)
Args: Args:
encoding: Encoding: encoding (:class:`~tokenizers.Encoding`):
The main Encoding to post process The :class:`~tokenizers.Encoding` corresponding to the main sequence.
pair: Optional[Encoding]: pair (:class:`~tokenizers.Encoding`, `optional`):
An optional pair Encoding An optional :class:`~tokenizers.Encoding` corresponding to the pair sequence.
add_special_tokens: bool: add_special_tokens (:obj:`bool`):
Whether to add special tokens Whether to add the special tokens
Returns: Returns:
The resulting Encoding :class:`~tokenizers.Encoding`: The final post-processed encoding
""" """
pass pass
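
A sketch of post-processing two encodings together, assuming truncation and padding were configured beforehand (the tokenizer.json path is a placeholder):

from tokenizers import Tokenizer

tokenizer = Tokenizer.from_file("tokenizer.json")  # placeholder path
tokenizer.enable_truncation(max_length=128)
tokenizer.enable_padding(length=128)

# Encode both sequences without special tokens, then post-process them as a pair.
main = tokenizer.encode("First sequence", add_special_tokens=False)
pair = tokenizer.encode("Second sequence", add_special_tokens=False)
final = tokenizer.post_process(main, pair=pair, add_special_tokens=True)
print(final.tokens)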


@@ -53,7 +53,7 @@ use crate::processors::PyPostProcessor;
/// Yesterday"``. /// Yesterday"``.
/// ///
#[pyclass(dict, module = "tokenizers", name=AddedToken)] #[pyclass(dict, module = "tokenizers", name=AddedToken)]
#[text_signature = "(content, **kwargs)"] #[text_signature = "(content, single_word=False, lstrip=False, rstrip=False, normalized=True)"]
pub struct PyAddedToken { pub struct PyAddedToken {
pub content: String, pub content: String,
pub is_special_token: bool, pub is_special_token: bool,
@@ -408,6 +408,7 @@ type Tokenizer = TokenizerImpl<PyModel, PyNormalizer, PyPreTokenizer, PyPostProc
/// The core algorithm that this :obj:`Tokenizer` should be using. /// The core algorithm that this :obj:`Tokenizer` should be using.
/// ///
#[pyclass(dict, module = "tokenizers", name=Tokenizer)] #[pyclass(dict, module = "tokenizers", name=Tokenizer)]
#[text_signature = "(model)"]
#[derive(Clone)] #[derive(Clone)]
pub struct PyTokenizer { pub struct PyTokenizer {
tokenizer: Tokenizer, tokenizer: Tokenizer,
@@ -533,7 +534,7 @@ impl PyTokenizer {
/// path (:obj:`str`): /// path (:obj:`str`):
/// A path to a file in which to save the serialized tokenizer. /// A path to a file in which to save the serialized tokenizer.
/// ///
/// pretty (:obj:`bool`, defaults to :obj:`False): /// pretty (:obj:`bool`, defaults to :obj:`False`):
/// Whether the JSON file should be pretty formatted. /// Whether the JSON file should be pretty formatted.
#[args(pretty = false)] #[args(pretty = false)]
#[text_signature = "($self, pretty=False)"] #[text_signature = "($self, pretty=False)"]
@@ -551,7 +552,7 @@ impl PyTokenizer {
/// Get the underlying vocabulary /// Get the underlying vocabulary
/// ///
/// Args: /// Args:
/// with_added_tokens (:obj:`bool, defaults to :obj:`True`): /// with_added_tokens (:obj:`bool`, defaults to :obj:`True`):
/// Whether to include the added tokens /// Whether to include the added tokens
/// ///
/// Returns: /// Returns:
@@ -565,7 +566,7 @@ impl PyTokenizer {
/// Get the size of the underlying vocabulary /// Get the size of the underlying vocabulary
/// ///
/// Args: /// Args:
/// with_added_tokens (:obj:`bool, defaults to :obj:`True`): /// with_added_tokens (:obj:`bool`, defaults to :obj:`True`):
/// Whether to include the added tokens /// Whether to include the added tokens
/// ///
/// Returns: /// Returns:
@@ -632,6 +633,8 @@ impl PyTokenizer {
/// Get the currently set truncation parameters /// Get the currently set truncation parameters
/// ///
/// `Cannot set, use` :meth:`~tokenizers.Tokenizer.enable_truncation` `instead`
///
/// Returns: /// Returns:
/// (:obj:`dict`, `optional`): /// (:obj:`dict`, `optional`):
/// A dict with the current truncation parameters if truncation is enabled /// A dict with the current truncation parameters if truncation is enabled
@@ -737,6 +740,8 @@ impl PyTokenizer {
/// Get the current padding parameters /// Get the current padding parameters
/// ///
/// `Cannot be set, use` :meth:`~tokenizers.Tokenizer.enable_padding` `instead`
///
/// Returns: /// Returns:
/// (:obj:`dict`, `optional`): /// (:obj:`dict`, `optional`):
/// A dict with the current padding parameters if padding is enabled /// A dict with the current padding parameters if padding is enabled
@@ -797,7 +802,7 @@ impl PyTokenizer {
/// :class:`~tokenizers.Encoding`: The encoded result /// :class:`~tokenizers.Encoding`: The encoded result
/// ///
#[args(pair = "None", is_pretokenized = "false", add_special_tokens = "true")] #[args(pair = "None", is_pretokenized = "false", add_special_tokens = "true")]
#[text_signature = "($self, sequence, pair=None, is_pretokenized=False, add_special_tokens=True, /)"] #[text_signature = "($self, sequence, pair=None, is_pretokenized=False, add_special_tokens=True)"]
fn encode( fn encode(
&self, &self,
sequence: &PyAny, sequence: &PyAny,
@@ -862,7 +867,7 @@ impl PyTokenizer {
/// A :obj:`List` of :class:`~tokenizers.Encoding`: The encoded batch /// A :obj:`List` of :class:`~tokenizers.Encoding`: The encoded batch
/// ///
#[args(is_pretokenized = "false", add_special_tokens = "true")] #[args(is_pretokenized = "false", add_special_tokens = "true")]
#[text_signature = "($self, input, is_pretokenized=False, add_special_tokens=True, /)"] #[text_signature = "($self, input, is_pretokenized=False, add_special_tokens=True)"]
fn encode_batch( fn encode_batch(
&self, &self,
input: Vec<&PyAny>, input: Vec<&PyAny>,
@@ -891,37 +896,88 @@ impl PyTokenizer {
}) })
} }
fn decode(&self, ids: Vec<u32>, skip_special_tokens: Option<bool>) -> PyResult<String> { /// Decode the given list of ids back to a string
ToPyResult( ///
self.tokenizer /// This is used to decode anything coming back from a Language Model
.decode(ids, skip_special_tokens.unwrap_or(true)), ///
) /// Args:
.into() /// ids (A :obj:`List/Tuple` of :obj:`int`):
/// The list of ids that we want to decode
///
/// skip_special_tokens (:obj:`bool`, defaults to :obj:`True`):
/// Whether the special tokens should be removed from the decoded string
///
/// Returns:
/// :obj:`str`: The decoded string
#[args(skip_special_tokens = true)]
#[text_signature = "($self, ids, skip_special_tokens=True)"]
fn decode(&self, ids: Vec<u32>, skip_special_tokens: bool) -> PyResult<String> {
ToPyResult(self.tokenizer.decode(ids, skip_special_tokens)).into()
} }
/// Decode a batch of ids back to their corresponding string
///
/// Args:
/// sequences (:obj:`List` of :obj:`List[int]`):
/// The batch of sequences we want to decode
///
/// skip_special_tokens (:obj:`bool`, defaults to :obj:`True`):
/// Whether the special tokens should be removed from the decoded strings
///
/// Returns:
/// :obj:`List[str]`: A list of decoded strings
#[args(skip_special_tokens = true)]
#[text_signature = "($self, sequences, skip_special_tokens=True)"]
fn decode_batch( fn decode_batch(
&self, &self,
sentences: Vec<Vec<u32>>, sequences: Vec<Vec<u32>>,
skip_special_tokens: Option<bool>, skip_special_tokens: bool,
) -> PyResult<Vec<String>> { ) -> PyResult<Vec<String>> {
let gil = Python::acquire_gil(); let gil = Python::acquire_gil();
gil.python().allow_threads(|| { gil.python().allow_threads(|| {
ToPyResult( ToPyResult(self.tokenizer.decode_batch(sequences, skip_special_tokens)).into()
self.tokenizer
.decode_batch(sentences, skip_special_tokens.unwrap_or(true)),
)
.into()
}) })
} }
/// Convert the given token to its corresponding id if it exists
///
/// Args:
/// token (:obj:`str`):
/// The token to convert
///
/// Returns:
/// :obj:`Optional[int]`: An optional id, :obj:`None` if out of vocabulary
#[text_signature = "($self, token)"]
fn token_to_id(&self, token: &str) -> Option<u32> { fn token_to_id(&self, token: &str) -> Option<u32> {
self.tokenizer.token_to_id(token) self.tokenizer.token_to_id(token)
} }
/// Convert the given id to its corresponding token if it exists
///
/// Args:
/// id (:obj:`int`):
/// The id to convert
///
/// Returns:
/// :obj:`Optional[str]`: An optional token, :obj:`None` if out of vocabulary
#[text_signature = "($self, id)"]
fn id_to_token(&self, id: u32) -> Option<&str> { fn id_to_token(&self, id: u32) -> Option<&str> {
self.tokenizer.id_to_token(id) self.tokenizer.id_to_token(id)
} }
/// Add the given tokens to the vocabulary
///
/// The given tokens are added only if they don't already exist in the vocabulary.
/// Each token is then assigned a new id.
///
/// Args:
/// tokens (A :obj:`List` of :class:`~tokenizers.AddedToken` or :obj:`str`):
/// The list of tokens we want to add to the vocabulary. Each token can be either a
/// string or an instance of :class:`~tokenizers.AddedToken` for more customization.
///
/// Returns:
/// :obj:`int`: The number of tokens that were created in the vocabulary
#[text_signature = "($self, tokens)"]
fn add_tokens(&mut self, tokens: &PyList) -> PyResult<usize> { fn add_tokens(&mut self, tokens: &PyList) -> PyResult<usize> {
let tokens = tokens let tokens = tokens
.into_iter() .into_iter()
@@ -942,6 +998,23 @@ impl PyTokenizer {
Ok(self.tokenizer.add_tokens(&tokens)) Ok(self.tokenizer.add_tokens(&tokens))
} }
/// Add the given special tokens to the Tokenizer.
///
/// If these tokens are already part of the vocabulary, it just lets the Tokenizer know about
/// them. If they don't exist, the Tokenizer creates them, giving them a new id.
///
/// These special tokens will never be processed by the model (i.e. they won't be split into
/// multiple tokens), and they can be removed from the output when decoding.
///
/// Args:
/// tokens (A :obj:`List` of :class:`~tokenizers.AddedToken` or :obj:`str`):
/// The list of special tokens we want to add to the vocabulary. Each token can either
/// be a string or an instance of :class:`~tokenizers.AddedToken` for more
/// customization.
///
/// Returns:
/// :obj:`int`: The number of tokens that were created in the vocabulary
#[text_signature = "($self, tokens)"]
fn add_special_tokens(&mut self, tokens: &PyList) -> PyResult<usize> { fn add_special_tokens(&mut self, tokens: &PyList) -> PyResult<usize> {
let tokens = tokens let tokens = tokens
.into_iter() .into_iter()
@@ -968,7 +1041,30 @@ impl PyTokenizer {
.allow_threads(|| ToPyResult(self.tokenizer.train_and_replace(trainer, files)).into()) .allow_threads(|| ToPyResult(self.tokenizer.train_and_replace(trainer, files)).into())
} }
/// Apply all the post-processing steps to the given encodings.
///
/// The various steps are:
///
/// 1. Truncate according to the set truncation params (provided with
/// :meth:`~tokenizers.Tokenizer.enable_truncation`)
/// 2. Apply the :class:`~tokenizers.processors.PostProcessor`
/// 3. Pad according to the set padding params (provided with
/// :meth:`~tokenizers.Tokenizer.enable_padding`)
///
/// Args:
/// encoding (:class:`~tokenizers.Encoding`):
/// The :class:`~tokenizers.Encoding` corresponding to the main sequence.
///
/// pair (:class:`~tokenizers.Encoding`, `optional`):
/// An optional :class:`~tokenizers.Encoding` corresponding to the pair sequence.
///
/// add_special_tokens (:obj:`bool`):
/// Whether to add the special tokens
///
/// Returns:
/// :class:`~tokenizers.Encoding`: The final post-processed encoding
#[args(pair = "None", add_special_tokens = true)] #[args(pair = "None", add_special_tokens = true)]
#[text_signature = "($self, encoding, pair=None, add_special_tokens=True)"]
fn post_process( fn post_process(
&self, &self,
encoding: &PyEncoding, encoding: &PyEncoding,
@@ -987,16 +1083,19 @@ impl PyTokenizer {
.into() .into()
} }
/// The :class:`~tokenizers.models.Model` in use by the Tokenizer
#[getter] #[getter]
fn get_model(&self) -> PyResult<PyObject> { fn get_model(&self) -> PyResult<PyObject> {
self.tokenizer.get_model().get_as_subtype() self.tokenizer.get_model().get_as_subtype()
} }
/// Set the :class:`~tokenizers.models.Model`
#[setter] #[setter]
fn set_model(&mut self, model: PyRef<PyModel>) { fn set_model(&mut self, model: PyRef<PyModel>) {
self.tokenizer.with_model(model.clone()); self.tokenizer.with_model(model.clone());
} }
/// The `optional` :class:`~tokenizers.normalizers.Normalizer` in use by the Tokenizer
#[getter] #[getter]
fn get_normalizer(&self) -> PyResult<PyObject> { fn get_normalizer(&self) -> PyResult<PyObject> {
if let Some(n) = self.tokenizer.get_normalizer() { if let Some(n) = self.tokenizer.get_normalizer() {
@@ -1006,11 +1105,13 @@ impl PyTokenizer {
} }
} }
/// Set the :class:`~tokenizers.normalizers.Normalizer`
#[setter] #[setter]
fn set_normalizer(&mut self, normalizer: PyRef<PyNormalizer>) { fn set_normalizer(&mut self, normalizer: PyRef<PyNormalizer>) {
self.tokenizer.with_normalizer(normalizer.clone()); self.tokenizer.with_normalizer(normalizer.clone());
} }
/// The `optional` :class:`~tokenizers.pre_tokenizers.PreTokenizer` in use by the Tokenizer
#[getter] #[getter]
fn get_pre_tokenizer(&self) -> PyResult<PyObject> { fn get_pre_tokenizer(&self) -> PyResult<PyObject> {
if let Some(pt) = self.tokenizer.get_pre_tokenizer() { if let Some(pt) = self.tokenizer.get_pre_tokenizer() {
@@ -1020,11 +1121,13 @@ impl PyTokenizer {
} }
} }
/// Set the :class:`~tokenizers.pre_tokenizers.PreTokenizer`
#[setter] #[setter]
fn set_pre_tokenizer(&mut self, pretok: PyRef<PyPreTokenizer>) { fn set_pre_tokenizer(&mut self, pretok: PyRef<PyPreTokenizer>) {
self.tokenizer.with_pre_tokenizer(pretok.clone()); self.tokenizer.with_pre_tokenizer(pretok.clone());
} }
/// The `optional` :class:`~tokenizers.processors.PostProcessor` in use by the Tokenizer
#[getter] #[getter]
fn get_post_processor(&self) -> PyResult<PyObject> { fn get_post_processor(&self) -> PyResult<PyObject> {
if let Some(n) = self.tokenizer.get_post_processor() { if let Some(n) = self.tokenizer.get_post_processor() {
@@ -1034,11 +1137,13 @@ impl PyTokenizer {
} }
} }
/// Set the :class:`~tokenizers.processors.PostProcessor`
#[setter] #[setter]
fn set_post_processor(&mut self, processor: PyRef<PyPostProcessor>) { fn set_post_processor(&mut self, processor: PyRef<PyPostProcessor>) {
self.tokenizer.with_post_processor(processor.clone()); self.tokenizer.with_post_processor(processor.clone());
} }
/// The `optional` :class:`~tokenizers.decoders.Decoder` in use by the Tokenizer
#[getter] #[getter]
fn get_decoder(&self) -> PyResult<PyObject> { fn get_decoder(&self) -> PyResult<PyObject> {
if let Some(dec) = self.tokenizer.get_decoder() { if let Some(dec) = self.tokenizer.get_decoder() {
@@ -1048,6 +1153,7 @@ impl PyTokenizer {
} }
} }
/// Set the :class:`~tokenizers.decoders.Decoder`
#[setter] #[setter]
fn set_decoder(&mut self, decoder: PyRef<PyDecoder>) { fn set_decoder(&mut self, decoder: PyRef<PyDecoder>) {
self.tokenizer.with_decoder(decoder.clone()); self.tokenizer.with_decoder(decoder.clone());
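
These getters and setters surface each pipeline step as a Python attribute; a sketch of setting them (the component choices and special-token ids are arbitrary assumptions):

from tokenizers import Tokenizer, normalizers, pre_tokenizers, decoders
from tokenizers.models import BPE
from tokenizers.processors import TemplateProcessing

tokenizer = Tokenizer(BPE(unk_token="[UNK]"))

tokenizer.normalizer = normalizers.Sequence([normalizers.NFD(), normalizers.Lowercase()])
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
tokenizer.post_processor = TemplateProcessing(
    single="[CLS] $A [SEP]",
    special_tokens=[("[CLS]", 1), ("[SEP]", 2)],  # ids are placeholders
)
tokenizer.decoder = decoders.BPEDecoder()

print(tokenizer.model, tokenizer.normalizer, tokenizer.pre_tokenizer)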