Automatically stubbing the .pyi files while keeping inspection ability (#509)

* First pass at automatically stubbing our Python files.

* Making all Rust docs visible in the .pyi files.

* Better assert failure messages.

* Fixing the GitHub workflow.

* Removing types that are not exported anymore.

* Fixing the `Tokenizer` signature.

* Disabling the auto-generated __init__.py.

* Re-enabling some types.

* Don't overwrite non-automated __init__.py files.

* Automated most __init__.py files.

* Restubbing after rebase.

* Fixing the env for tests.

* Install black in the env.

* Use the PY35 target in stub.py.

Co-authored-by: Anthony MOI <m.anthony.moi@gmail.com>
Nicolas Patry
2020-11-17 21:13:00 +01:00
committed by GitHub
parent fff856cff7
commit 352c92ad33
25 changed files with 2511 additions and 1426 deletions
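
As context for the stubbing approach described above, here is a minimal sketch of how such a generator can work, assuming it introspects the compiled module and formats the output with black's PY35 target. The helper names (stub_class, format_stub) are illustrative, not the actual contents of stub.py:

# Illustrative sketch only, not the actual stub.py from this commit.
import inspect

import black

def stub_class(cls) -> str:
    # pyo3 exposes #[text_signature] via __text_signature__ and the
    # /// doc comments via __doc__, so both survive into the stub.
    doc = inspect.getdoc(cls) or "..."
    sig = getattr(cls, "__text_signature__", None) or "(self, *args, **kwargs)"
    return 'class {}:\n    """{}"""\n    def __init__{}:\n        pass\n'.format(
        cls.__name__, doc, sig
    )

def format_stub(source: str) -> str:
    # The PY35 target keeps the generated stubs compatible with older Pythons.
    mode = black.FileMode(target_versions={black.TargetVersion.PY35})
    return black.format_str(source, mode=mode)

Because the stubs are regenerated from the live module rather than written by hand, the .pyi files stay in sync with the Rust doc comments while inspect-based tooling keeps working.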


@@ -11,7 +11,43 @@ use tokenizers as tk;
use crate::models::PyModel;
use crate::tokenizer::PyAddedToken;
/// Base class for all trainers
///
/// This class is not supposed to be instantiated directly. Instead, any implementation of a
/// Trainer will return an instance of this class when instantiated.
///
/// Args:
/// vocab_size: unsigned int:
/// The size of the final vocabulary, including all tokens and alphabet.
///
/// min_frequency: unsigned int:
/// The minimum frequency a pair should have in order to be merged.
///
/// show_progress: boolean:
/// Whether to show progress bars while training.
///
/// special_tokens: List[Union[str, AddedToken]]:
/// A list of special tokens the model should know of.
///
/// limit_alphabet: unsigned int:
/// The maximum number of different characters to keep in the alphabet.
///
/// initial_alphabet: List[str]:
/// A list of characters to include in the initial alphabet, even
/// if not seen in the training dataset.
/// If the strings contain more than one character, only the first one
/// is kept.
///
/// continuing_subword_prefix: Optional[str]:
/// A prefix to be used for every subword that is not a beginning-of-word.
///
/// end_of_word_suffix: Optional[str]:
/// A suffix to be used for every subword that is an end-of-word.
///
/// Returns:
/// Trainer
#[pyclass(name=Trainer)]
#[text_signature = "(self, vocab_size=30000, min_frequency=0, show_progress=True, special_tokens=[], limit_alphabet=None, initial_alphabet=[], continuing_subword_prefix=None, end_of_word_suffix=None)"]
pub struct PyTrainer {
pub trainer: TrainerWrapper,
}
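
Since the base Trainer is not meant to be instantiated directly, here is a sketch of what the documented arguments look like on a concrete subclass (BpeTrainer; all values below are illustrative, not recommendations):

from tokenizers.trainers import BpeTrainer

# Arguments mirror the base Trainer docstring above; values are illustrative.
trainer = BpeTrainer(
    vocab_size=30000,
    min_frequency=2,
    show_progress=True,
    special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
    limit_alphabet=1000,
    initial_alphabet=["e"],
    continuing_subword_prefix=None,
    end_of_word_suffix=None,
)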
@@ -41,6 +77,7 @@ impl Trainer for PyTrainer {
}
}
/// Capable of training a BPE model
#[pyclass(extends=PyTrainer, name=BpeTrainer)]
pub struct PyBpeTrainer {}
#[pymethods]
@@ -105,7 +142,39 @@ impl PyBpeTrainer {
}
}
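
As a quick end-to-end usage sketch (the file path is a placeholder, and the train() argument order has shifted between tokenizers releases, so treat this as approximate):

from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer

# "corpus.txt" is a placeholder; pass whatever training files you have.
tokenizer = Tokenizer(BPE())
trainer = BpeTrainer(vocab_size=30000, special_tokens=["[UNK]"])
tokenizer.train(files=["corpus.txt"], trainer=trainer)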
/// Capable of training a WordPiece model
/// Args:
/// vocab_size: unsigned int:
/// The size of the final vocabulary, including all tokens and alphabet.
///
/// min_frequency: unsigned int:
/// The minimum frequency a pair should have in order to be merged.
///
/// show_progress: boolean:
/// Whether to show progress bars while training.
///
/// special_tokens: List[Union[str, AddedToken]]:
/// A list of special tokens the model should know of.
///
/// limit_alphabet: unsigned int:
/// The maximum number of different characters to keep in the alphabet.
///
/// initial_alphabet: List[str]:
/// A list of characters to include in the initial alphabet, even
/// if not seen in the training dataset.
/// If the strings contain more than one character, only the first one
/// is kept.
///
/// continuing_subword_prefix: Optional[str]:
/// A prefix to be used for every subword that is not a beginning-of-word.
///
/// end_of_word_suffix: Optional[str]:
/// A suffix to be used for every subword that is an end-of-word.
///
/// Returns:
/// Trainer
#[pyclass(extends=PyTrainer, name=WordPieceTrainer)]
#[text_signature = "(self, vocab_size=30000, min_frequency=0, show_progress=True, special_tokens=[], limit_alphabet=None, initial_alphabet=[], continuing_subword_prefix=\"##\", end_of_word_suffix=None)"]
pub struct PyWordPieceTrainer {}
#[pymethods]
impl PyWordPieceTrainer {
@@ -173,7 +242,28 @@ impl PyWordPieceTrainer {
}
}
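
A short sketch of the WordPiece signature documented above; note the "##" default for continuing_subword_prefix, the convention BERT-style vocabularies use (values are illustrative):

from tokenizers.trainers import WordPieceTrainer

# The "##" prefix marks non-initial subwords, as in BERT vocabularies.
trainer = WordPieceTrainer(
    vocab_size=30000,
    min_frequency=2,
    special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
    continuing_subword_prefix="##",
)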
/// Capable of training a Unigram model
///
/// Args:
/// vocab_size: unsigned int:
/// The size of the final vocabulary, including all tokens and alphabet.
///
/// show_progress: boolean:
/// Whether to show progress bars while training.
///
/// special_tokens: List[Union[str, AddedToken]]:
/// A list of special tokens the model should know of.
///
/// initial_alphabet: List[str]:
/// A list of characters to include in the initial alphabet, even
/// if not seen in the training dataset.
/// If the strings contain more than one character, only the first one
/// is kept.
///
/// Returns:
/// Trainer
#[pyclass(extends=PyTrainer, name=UnigramTrainer)]
#[text_signature = "(self, vocab_size=8000, show_progress=True, special_tokens=[])"]
pub struct PyUnigramTrainer {}
#[pymethods]
impl PyUnigramTrainer {
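
Finally, a sketch of the narrower Unigram signature documented above (values are illustrative; later releases may accept more options):

from tokenizers.trainers import UnigramTrainer

# Per the signature here, Unigram training exposes fewer knobs than
# BPE/WordPiece: no min_frequency and no subword prefix/suffix options.
trainer = UnigramTrainer(
    vocab_size=8000,
    show_progress=True,
    special_tokens=["<unk>"],
)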