Automatically stubbing the pyi files while keeping inspecting ability (#509)

* First pass on automatic stubbing our python files.
* And now modifying all rust docs to be visible in Pyi files.
* Better assert fail message.
* Fixing github workflow.
* Removing types not exported anymore.
* Fixing `Tokenizer` signature.
* Disabling auto __init__.py.
* Re-enabling some types.
* Don't overwrite non automated __init__.py
* Automated most __init__.py
* Restubbing after rebase.
* Fixing env for tests.
* Install black in the env.
* Use PY35 target in stub.py

Co-authored-by: Anthony MOI <m.anthony.moi@gmail.com>
@@ -16,6 +16,10 @@ use tk::processors::PostProcessorWrapper;
use tk::{Encoding, PostProcessor};
use tokenizers as tk;

/// Base class for all post-processors
///
/// This class is not supposed to be instantiated directly. Instead, any implementation of
/// a PostProcessor will return an instance of this class when instantiated.
#[pyclass(dict, module = "tokenizers.processors", name=PostProcessor)]
#[derive(Clone, Deserialize, Serialize)]
pub struct PyPostProcessor {
@@ -88,11 +92,17 @@ impl PyPostProcessor {
        }
    }

    /// Return the number of special tokens that would be added for single/pair sentences.
    /// :param is_pair: Boolean indicating if the input is a pair of sentences (`True`) or a single sentence
    /// :return: The number of special tokens that would be added
    #[text_signature = "(self, is_pair)"]
    fn num_special_tokens_to_add(&self, is_pair: bool) -> usize {
        self.processor.added_tokens(is_pair)
    }

    /// Post-process the given encodings, generating the final one
    #[args(pair = "None", add_special_tokens = "true")]
    #[text_signature = "(self, encoding, pair=None, add_special_tokens=True)"]
    fn process(
        &self,
        encoding: &PyEncoding,
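As a rough sketch of how these two methods surface on the Python side (assuming a build of this crate installed as `tokenizers`; the token ids below are illustrative placeholders, and `BertProcessing` is defined further down in this file):

```python
from tokenizers.processors import BertProcessing

# Illustrative ids; real values come from the tokenizer's vocabulary.
processor = BertProcessing(sep=("[SEP]", 102), cls=("[CLS]", 101))

# [CLS] ... [SEP] adds 2 tokens; [CLS] ... [SEP] ... [SEP] adds 3.
assert processor.num_special_tokens_to_add(is_pair=False) == 2
assert processor.num_special_tokens_to_add(is_pair=True) == 3

# `process` is rarely called by hand: assigning the processor to
# `tokenizer.post_processor` lets `Tokenizer.encode` apply it to every
# Encoding it produces.
```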
@@ -109,7 +119,21 @@ impl PyPostProcessor {
    }
}

/// This post-processor takes care of adding the special tokens needed by
/// a Bert model:
/// - a SEP token
/// - a CLS token
///
/// Args:
///     sep: Tuple[str, int]:
///         A tuple with the string representation of the SEP token, and its id
///
///     cls: Tuple[str, int]:
///         A tuple with the string representation of the CLS token, and its id
///
/// Returns:
///     PostProcessor
#[pyclass(extends=PyPostProcessor, module = "tokenizers.processors", name=BertProcessing)]
#[text_signature = "(self, sep, cls)"]
pub struct PyBertProcessing {}

#[pymethods]
impl PyBertProcessing {
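A hedged usage sketch for this class (the ids follow the usual bert-base-uncased convention and are an assumption, not values mandated by this binding):

```python
from tokenizers.processors import BertProcessing

processor = BertProcessing(sep=("[SEP]", 102), cls=("[CLS]", 101))

# Attached as `tokenizer.post_processor`, it wraps encodings as:
#   single: [CLS] sentence [SEP]
#   pair:   [CLS] sentence A [SEP] sentence B [SEP]
```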
@@ -126,7 +150,33 @@ impl PyBertProcessing {
    }
}

/// This post-processor takes care of adding the special tokens needed by
/// a Roberta model:
/// - a SEP token
/// - a CLS token
///
/// It also takes care of trimming the offsets.
/// By default, the ByteLevel BPE might include whitespaces in the produced tokens. If you don't
/// want the offsets to include these whitespaces, then this PostProcessor should be initialized
/// with `trim_offsets=True`
///
/// Args:
///     sep: Tuple[str, int]:
///         A tuple with the string representation of the SEP token, and its id
///
///     cls: Tuple[str, int]:
///         A tuple with the string representation of the CLS token, and its id
///
///     trim_offsets: bool:
///         Whether to trim the whitespaces from the produced offsets.
///
///     add_prefix_space: bool:
///         Whether the add_prefix_space option was enabled during pre-tokenization. This
///         is relevant because it defines the way the offsets are trimmed out.
///
/// Returns:
///     PostProcessor
#[pyclass(extends=PyPostProcessor, module = "tokenizers.processors", name=RobertaProcessing)]
#[text_signature = "(self, sep, cls, trim_offsets=True, add_prefix_space=True)"]
pub struct PyRobertaProcessing {}

#[pymethods]
impl PyRobertaProcessing {
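Similarly, a minimal sketch for the Roberta variant, assuming the roberta-base convention of `<s>`/`</s>` with ids 0/2:

```python
from tokenizers.processors import RobertaProcessing

processor = RobertaProcessing(
    sep=("</s>", 2),
    cls=("<s>", 0),
    trim_offsets=True,      # drop the whitespaces ByteLevel folds into tokens
    add_prefix_space=True,  # should mirror the ByteLevel pre-tokenizer setting
)
```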
@@ -152,7 +202,15 @@ impl PyRobertaProcessing {
    }
}

/// This post-processor takes care of trimming the offsets.
/// By default, the ByteLevel BPE might include whitespaces in the produced tokens. If you don't
/// want the offsets to include these whitespaces, then this PostProcessor must be used.
///
/// Args:
///     trim_offsets: bool:
///         Whether to trim the whitespaces from the produced offsets.
#[pyclass(extends=PyPostProcessor, module = "tokenizers.processors", name=ByteLevel)]
#[text_signature = "(self, trim_offsets=True)"]
pub struct PyByteLevel {}

#[pymethods]
impl PyByteLevel {
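A small sketch of the offset-trimming behavior described above (the exact offsets are an assumption about ByteLevel BPE output, shown only to illustrate the intent of `trim_offsets`):

```python
from tokenizers.processors import ByteLevel

# With ByteLevel BPE, " world" in "hello world" can be a single token
# whose raw offsets (5, 11) include the leading space; with
# trim_offsets=True the reported offsets shrink to the visible
# characters, roughly (6, 11).
processor = ByteLevel(trim_offsets=True)
```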
@@ -244,7 +302,68 @@ impl FromPyObject<'_> for PyTemplate {
    }
}

/// Provides a way to specify templates in order to add the special tokens to each
/// input sequence as relevant.
///
/// Let's take `BERT` tokenizer as an example. It uses two special tokens, used to
/// delimitate each sequence. `[CLS]` is always used at the beginning of the first
/// sequence, and `[SEP]` is added at the end of both the first, and the pair
/// sequences. The final result looks like this:
/// - Single sequence: `[CLS] Hello there [SEP]`
/// - Pair sequences: `[CLS] My name is Anthony [SEP] What is my name? [SEP]`
///
/// With the type ids as follows:
/// ```markdown
/// [CLS]   ...   [SEP]   ...   [SEP]
///   0      0      0      1      1
/// ```
///
/// You can achieve such behavior using a TemplateProcessing:
/// ```
/// TemplateProcessing(
///     single="[CLS] $0 [SEP]",
///     pair="[CLS] $A [SEP] $B:1 [SEP]:1",
///     special_tokens=[("[CLS]", 1), ("[SEP]", 0)],
/// )
/// ```
///
/// In this example, each input sequence is identified using a `$` construct. This identifier
/// lets us specify each input sequence, and the type_id to use. When nothing is specified,
/// it uses the default values. Here are the different ways to specify it:
/// - Specifying the sequence, with default `type_id == 0`: `$A` or `$B`
/// - Specifying the `type_id` with default `sequence == A`: `$0`, `$1`, `$2`, ...
/// - Specifying both: `$A:0`, `$B:1`, ...
///
/// The same construct is used for special tokens: `<identifier>(:<type_id>)?`.
///
/// **Warning**: You must ensure that you are giving the correct tokens/ids as these
/// will be added to the Encoding without any further check. If the given ids correspond
/// to something totally different in a `Tokenizer` using this `PostProcessor`, it
/// might lead to unexpected results.
///
/// Args:
///     single: Template
///         The template used for single sequences
///
///     pair: Template:
///         The template used when both sequences are specified
///
///     special_tokens: Tokens:
///         The list of special tokens used in each sequence
///
///     Template: Union[str, List[str]]:
///         - If a `str` is provided, the whitespace is used as delimiter between tokens
///         - If a `List[str]` is provided, it is used as the list of tokens directly
///
///     Tokens: List[Union[Tuple[int, str], Tuple[str, int], dict]]:
///         - A Tuple with both a token and its associated ID, in any order
///         - A dict with the following keys:
///             - "id": str => The special token id, as specified in the Template
///             - "ids": List[int] => The associated IDs
///             - "tokens": List[str] => The associated tokens
///
///         The given dict expects the provided `ids` and `tokens` lists to have
///         the same length.
#[pyclass(extends=PyPostProcessor, module = "tokenizers.processors", name=TemplateProcessing)]
#[text_signature = "(self, single, pair, special_tokens)"]
pub struct PyTemplateProcessing {}

#[pymethods]
impl PyTemplateProcessing {
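To make the template grammar above concrete, a hedged sketch using the classic BERT layout (ids 101/102 are the usual bert-base-uncased values, assumed here purely for illustration):

```python
from tokenizers.processors import TemplateProcessing

processor = TemplateProcessing(
    single="[CLS] $A [SEP]",
    pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    special_tokens=[("[CLS]", 101), ("[SEP]", 102)],
)

# Equivalent, using the List[str] template form and the dict form of
# a special token:
processor = TemplateProcessing(
    single=["[CLS]", "$A", "[SEP]"],
    pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    special_tokens=[
        {"id": "[CLS]", "ids": [101], "tokens": ["[CLS]"]},
        ("[SEP]", 102),
    ],
)
```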