Automatically stubbing the pyi files while keeping inspecting ability (#509)
* First pass on automatic stubbing of our python files.
* And now modifying all rust docs to be visible in Pyi files.
* Better assert fail message.
* Fixing github workflow.
* Removing types not exported anymore.
* Fixing `Tokenizer` signature.
* Disabling auto __init__.py.
* Re-enabling some types.
* Don't overwrite non-automated __init__.py.
* Automated most __init__.py.
* Restubbing after rebase.
* Fixing env for tests.
* Install black in the env.
* Use PY35 target in stub.py

Co-authored-by: Anthony MOI <m.anthony.moi@gmail.com>
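For context, the point of this change is to generate `.pyi` stub files from the Rust doc comments and `#[text_signature]` attributes visible in the diff below. A stub entry for these trainer classes might look roughly like the following hand-written sketch (illustrative only, not actual output of stub.py):

    # sketch of a generated .pyi entry -- illustrative only; the real stubs
    # are derived from the Rust docs and text_signatures in this diff
    class Trainer:
        """Base class for all trainers"""

    class BpeTrainer(Trainer):
        def __init__(self, vocab_size=30000, min_frequency=0, show_progress=True,
                     special_tokens=[], limit_alphabet=None, initial_alphabet=[],
                     continuing_subword_prefix=None, end_of_word_suffix=None): ...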
@@ -11,7 +11,43 @@ use tokenizers as tk;

use crate::models::PyModel;
use crate::tokenizer::PyAddedToken;

/// Base class for all trainers
///
/// This class is not supposed to be instantiated directly. Instead, any implementation of a
/// Trainer will return an instance of this class when instantiated.
///
/// Args:
///     vocab_size: unsigned int:
///         The size of the final vocabulary, including all tokens and the alphabet.
///
///     min_frequency: unsigned int:
///         The minimum frequency a pair should have in order to be merged.
///
///     show_progress: boolean:
///         Whether to show progress bars while training.
///
///     special_tokens: List[Union[str, AddedToken]]:
///         A list of special tokens the model should know of.
///
///     limit_alphabet: unsigned int:
///         The maximum number of different characters to keep in the alphabet.
///
///     initial_alphabet: List[str]:
///         A list of characters to include in the initial alphabet, even
///         if not seen in the training dataset.
///         If the strings contain more than one character, only the first one
///         is kept.
///
///     continuing_subword_prefix: Optional[str]:
///         A prefix to be used for every subword that is not a beginning-of-word.
///
///     end_of_word_suffix: Optional[str]:
///         A suffix to be used for every subword that is an end-of-word.
///
/// Returns:
///     Trainer
#[pyclass(name=Trainer)]
#[text_signature = "(self, vocab_size=30000, min_frequency=0, show_progress=True, special_tokens=[], limit_alphabet=None, initial_alphabet=[], continuing_subword_prefix=None, end_of_word_suffix=None)"]
pub struct PyTrainer {
    pub trainer: TrainerWrapper,
}
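Since these classes are exposed to Python, a short usage sketch may help. This is illustrative only: it assumes the `tokenizers` Python API around the time of this commit (the argument order of `Tokenizer.train` has changed in later releases) and uses a placeholder corpus.txt:

    from tokenizers import Tokenizer
    from tokenizers.models import BPE
    from tokenizers.trainers import BpeTrainer

    # Build an empty BPE tokenizer and a trainer configured with the
    # parameters documented above.
    tokenizer = Tokenizer(BPE())
    trainer = BpeTrainer(
        vocab_size=30000,
        min_frequency=2,
        special_tokens=["[UNK]", "[PAD]"],
    )
    # "corpus.txt" is a placeholder path; in this era of the API, train()
    # takes the trainer first, then the list of training files.
    tokenizer.train(trainer, ["corpus.txt"])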
@@ -41,6 +77,7 @@ impl Trainer for PyTrainer {
    }
}

/// Capable of training a BPE model
#[pyclass(extends=PyTrainer, name=BpeTrainer)]
pub struct PyBpeTrainer {}
#[pymethods]
@@ -105,7 +142,39 @@ impl PyBpeTrainer {
    }
}
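The `extends=PyTrainer` attribute above is what makes `BpeTrainer` a Python subclass of the `Trainer` base class, which is observable from Python (a small sketch):

    from tokenizers.trainers import BpeTrainer, Trainer

    trainer = BpeTrainer()
    # BpeTrainer is declared with `extends=PyTrainer`, so it subclasses Trainer
    assert isinstance(trainer, Trainer)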
/// Capable of training a WordPiece model
/// Args:
///     vocab_size: unsigned int:
///         The size of the final vocabulary, including all tokens and the alphabet.
///
///     min_frequency: unsigned int:
///         The minimum frequency a pair should have in order to be merged.
///
///     show_progress: boolean:
///         Whether to show progress bars while training.
///
///     special_tokens: List[Union[str, AddedToken]]:
///         A list of special tokens the model should know of.
///
///     limit_alphabet: unsigned int:
///         The maximum number of different characters to keep in the alphabet.
///
///     initial_alphabet: List[str]:
///         A list of characters to include in the initial alphabet, even
///         if not seen in the training dataset.
///         If the strings contain more than one character, only the first one
///         is kept.
///
///     continuing_subword_prefix: Optional[str]:
///         A prefix to be used for every subword that is not a beginning-of-word.
///
///     end_of_word_suffix: Optional[str]:
///         A suffix to be used for every subword that is an end-of-word.
///
/// Returns:
///     Trainer
#[pyclass(extends=PyTrainer, name=WordPieceTrainer)]
#[text_signature = "(self, vocab_size=30000, min_frequency=0, show_progress=True, special_tokens=[], limit_alphabet=None, initial_alphabet=[], continuing_subword_prefix=\"##\", end_of_word_suffix=None)"]
pub struct PyWordPieceTrainer {}
#[pymethods]
impl PyWordPieceTrainer {
@@ -173,7 +242,28 @@ impl PyWordPieceTrainer {
    }
}
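As with BPE, the WordPiece trainer can be exercised from Python. A minimal sketch under the same assumptions (placeholder corpus.txt, era-specific `train` argument order):

    from tokenizers import Tokenizer
    from tokenizers.models import WordPiece
    from tokenizers.trainers import WordPieceTrainer

    tokenizer = Tokenizer(WordPiece())
    # Per the text_signature above, continuing_subword_prefix defaults to
    # "##", the BERT-style marker for subwords that continue a word.
    trainer = WordPieceTrainer(
        vocab_size=30000,
        special_tokens=["[UNK]", "[CLS]", "[SEP]"],
    )
    tokenizer.train(trainer, ["corpus.txt"])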
/// Capable of training a Unigram model
///
/// Args:
///     vocab_size: unsigned int:
///         The size of the final vocabulary, including all tokens and the alphabet.
///
///     show_progress: boolean:
///         Whether to show progress bars while training.
///
///     special_tokens: List[Union[str, AddedToken]]:
///         A list of special tokens the model should know of.
///
///     initial_alphabet: List[str]:
///         A list of characters to include in the initial alphabet, even
///         if not seen in the training dataset.
///         If the strings contain more than one character, only the first one
///         is kept.
///
/// Returns:
///     Trainer
#[pyclass(extends=PyTrainer, name=UnigramTrainer)]
#[text_signature = "(self, vocab_size=8000, show_progress=True, special_tokens=[])"]
pub struct PyUnigramTrainer {}
#[pymethods]
impl PyUnigramTrainer {
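Finally, a matching sketch for the Unigram trainer, under the same assumptions as the examples above:

    from tokenizers import Tokenizer
    from tokenizers.models import Unigram
    from tokenizers.trainers import UnigramTrainer

    # An empty Unigram model; vocab_size matches the 8000 default in the
    # text_signature above.
    tokenizer = Tokenizer(Unigram())
    trainer = UnigramTrainer(vocab_size=8000, special_tokens=["<unk>"])
    tokenizer.train(trainer, ["corpus.txt"])  # "corpus.txt" is a placeholder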