Automatically stubbing the pyi files while keeping inspecting ability (#509)
* First pass on automatic stubbing of our python files.
* And now modifying all rust docs to be visible in Pyi files.
* Better assert fail message.
* Fixing github workflow.
* Removing types not exported anymore.
* Fixing `Tokenizer` signature.
* Disabling auto __init__.py.
* Re-enabling some types.
* Don't overwrite non-automated __init__.py.
* Automated most __init__.py.
* Restubbing after rebase.
* Fixing env for tests.
* Install black in the env.
* Use PY35 target in stub.py

Co-authored-by: Anthony MOI <m.anthony.moi@gmail.com>
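For context, the point of this change is to generate `.pyi` stub files from the Rust doc comments and `#[text_signature]` attributes visible in the diff below. A stub entry for these trainer classes might look roughly like the following hand-written sketch (illustrative only, not actual output of stub.py):

    # sketch of a generated .pyi entry -- illustrative only; the real stubs
    # are derived from the Rust docs and text_signatures in this diff
    class Trainer:
        """Base class for all trainers"""

    class BpeTrainer(Trainer):
        def __init__(self, vocab_size=30000, min_frequency=0, show_progress=True,
                     special_tokens=[], limit_alphabet=None, initial_alphabet=[],
                     continuing_subword_prefix=None, end_of_word_suffix=None): ...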
@@ -11,7 +11,43 @@ use tokenizers as tk;

use crate::models::PyModel;
use crate::tokenizer::PyAddedToken;

/// Base class for all trainers
///
/// This class is not supposed to be instantiated directly. Instead, any implementation of a
/// Trainer will return an instance of this class when instantiated.
///
/// Args:
///     vocab_size: unsigned int:
///         The size of the final vocabulary, including all tokens and the alphabet.
///
///     min_frequency: unsigned int:
///         The minimum frequency a pair should have in order to be merged.
///
///     show_progress: boolean:
///         Whether to show progress bars while training.
///
///     special_tokens: List[Union[str, AddedToken]]:
///         A list of special tokens the model should know of.
///
///     limit_alphabet: unsigned int:
///         The maximum number of different characters to keep in the alphabet.
///
///     initial_alphabet: List[str]:
///         A list of characters to include in the initial alphabet, even
///         if not seen in the training dataset.
///         If the strings contain more than one character, only the first one
///         is kept.
///
///     continuing_subword_prefix: Optional[str]:
///         A prefix to be used for every subword that is not a beginning-of-word.
///
///     end_of_word_suffix: Optional[str]:
///         A suffix to be used for every subword that is an end-of-word.
///
/// Returns:
///     Trainer
#[pyclass(name=Trainer)]
#[text_signature = "(self, vocab_size=30000, min_frequency=0, show_progress=True, special_tokens=[], limit_alphabet=None, initial_alphabet=[], continuing_subword_prefix=None, end_of_word_suffix=None)"]
pub struct PyTrainer {
    pub trainer: TrainerWrapper,
}
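Since these classes are exposed to Python, a short usage sketch may help. This is illustrative only: it assumes the `tokenizers` Python API around the time of this commit (the argument order of `Tokenizer.train` has changed in later releases) and uses a placeholder corpus.txt:

    from tokenizers import Tokenizer
    from tokenizers.models import BPE
    from tokenizers.trainers import BpeTrainer

    # Build an empty BPE tokenizer and a trainer configured with the
    # parameters documented above.
    tokenizer = Tokenizer(BPE())
    trainer = BpeTrainer(
        vocab_size=30000,
        min_frequency=2,
        special_tokens=["[UNK]", "[PAD]"],
    )
    # "corpus.txt" is a placeholder path; in this era of the API, train()
    # takes the trainer first, then the list of training files.
    tokenizer.train(trainer, ["corpus.txt"])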
@@ -41,6 +77,7 @@ impl Trainer for PyTrainer {
    }
}

/// Capable of training a BPE model
#[pyclass(extends=PyTrainer, name=BpeTrainer)]
pub struct PyBpeTrainer {}
#[pymethods]
@@ -105,7 +142,39 @@ impl PyBpeTrainer {
    }
}
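The `extends=PyTrainer` attribute above is what makes `BpeTrainer` a Python subclass of the `Trainer` base class, which is observable from Python (a small sketch):

    from tokenizers.trainers import BpeTrainer, Trainer

    trainer = BpeTrainer()
    # BpeTrainer is declared with `extends=PyTrainer`, so it subclasses Trainer
    assert isinstance(trainer, Trainer)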
/// Capable of training a WordPiece model
/// Args:
///     vocab_size: unsigned int:
///         The size of the final vocabulary, including all tokens and the alphabet.
///
///     min_frequency: unsigned int:
///         The minimum frequency a pair should have in order to be merged.
///
///     show_progress: boolean:
///         Whether to show progress bars while training.
///
///     special_tokens: List[Union[str, AddedToken]]:
///         A list of special tokens the model should know of.
///
///     limit_alphabet: unsigned int:
///         The maximum number of different characters to keep in the alphabet.
///
///     initial_alphabet: List[str]:
///         A list of characters to include in the initial alphabet, even
///         if not seen in the training dataset.
///         If the strings contain more than one character, only the first one
///         is kept.
///
///     continuing_subword_prefix: Optional[str]:
///         A prefix to be used for every subword that is not a beginning-of-word.
///
///     end_of_word_suffix: Optional[str]:
///         A suffix to be used for every subword that is an end-of-word.
///
/// Returns:
///     Trainer
#[pyclass(extends=PyTrainer, name=WordPieceTrainer)]
#[text_signature = "(self, vocab_size=30000, min_frequency=0, show_progress=True, special_tokens=[], limit_alphabet=None, initial_alphabet=[], continuing_subword_prefix=\"##\", end_of_word_suffix=None)"]
pub struct PyWordPieceTrainer {}
#[pymethods]
impl PyWordPieceTrainer {
@@ -173,7 +242,28 @@ impl PyWordPieceTrainer {
    }
}
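As with BPE, the WordPiece trainer can be exercised from Python. A minimal sketch under the same assumptions (placeholder corpus.txt, era-specific `train` argument order):

    from tokenizers import Tokenizer
    from tokenizers.models import WordPiece
    from tokenizers.trainers import WordPieceTrainer

    tokenizer = Tokenizer(WordPiece())
    # Per the text_signature above, continuing_subword_prefix defaults to
    # "##", the BERT-style marker for subwords that continue a word.
    trainer = WordPieceTrainer(
        vocab_size=30000,
        special_tokens=["[UNK]", "[CLS]", "[SEP]"],
    )
    tokenizer.train(trainer, ["corpus.txt"])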
/// Capable of training a Unigram model
///
/// Args:
///     vocab_size: unsigned int:
///         The size of the final vocabulary, including all tokens and the alphabet.
///
///     show_progress: boolean:
///         Whether to show progress bars while training.
///
///     special_tokens: List[Union[str, AddedToken]]:
///         A list of special tokens the model should know of.
///
///     initial_alphabet: List[str]:
///         A list of characters to include in the initial alphabet, even
///         if not seen in the training dataset.
///         If the strings contain more than one character, only the first one
///         is kept.
///
/// Returns:
///     Trainer
#[pyclass(extends=PyTrainer, name=UnigramTrainer)]
#[text_signature = "(self, vocab_size=8000, show_progress=True, special_tokens=[])"]
pub struct PyUnigramTrainer {}
#[pymethods]
impl PyUnigramTrainer {
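Finally, a matching sketch for the Unigram trainer, under the same assumptions as the examples above:

    from tokenizers import Tokenizer
    from tokenizers.models import Unigram
    from tokenizers.trainers import UnigramTrainer

    # An empty Unigram model; vocab_size matches the 8000 default in the
    # text_signature above.
    tokenizer = Tokenizer(Unigram())
    trainer = UnigramTrainer(vocab_size=8000, special_tokens=["<unk>"])
    tokenizer.train(trainer, ["corpus.txt"])  # "corpus.txt" is a placeholder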