From 58e1d8de67bab479a35b3fe1d8eea94a782ee909 Mon Sep 17 00:00:00 2001
From: Anthony MOI
Date: Fri, 20 Nov 2020 17:54:53 -0500
Subject: [PATCH] Python - Improve documentation for trainers

---
 .../py_src/tokenizers/trainers/__init__.pyi | 115 +++++++---------
 bindings/python/src/trainers.rs             | 129 ++++++++----------
 docs/source/api/python.inc                  |  34 +++++
 3 files changed, 140 insertions(+), 138 deletions(-)

diff --git a/bindings/python/py_src/tokenizers/trainers/__init__.pyi b/bindings/python/py_src/tokenizers/trainers/__init__.pyi
index 189b8793..f80995d4 100644
--- a/bindings/python/py_src/tokenizers/trainers/__init__.pyi
+++ b/bindings/python/py_src/tokenizers/trainers/__init__.pyi
@@ -5,37 +5,6 @@ class Trainer:
 
     This class is not supposed to be instantiated directly. Instead, any implementation of a
     Trainer will return an instance of this class when instantiated.
-
-    Args:
-        vocab_size: unsigned int:
-            The size of the final vocabulary, including all tokens and alphabet.
-
-        min_frequency: unsigned int:
-            The minimum frequency a pair should have in order to be merged.
-
-        show_progress: boolean:
-            Whether to show progress bars while training.
-
-        special_tokens: List[Union[str, AddedToken]]:
-            A list of special tokens the model should know of.
-
-        limit_alphabet: unsigned int:
-            The maximum different characters to keep in the alphabet.
-
-        initial_alphabet: List[str]:
-            A list of characters to include in the initial alphabet, even
-            if not seen in the training dataset.
-            If the strings contain more than one character, only the first one
-            is kept.
-
-        continuing_subword_prefix: Optional[str]:
-            A prefix to be used for every subword that is not a beginning-of-word.
-
-        end_of_word_suffix: Optional[str]:
-            A suffix to be used for every subword that is a end-of-word.
-
-    Returns:
-        Trainer
     """
 
     def __init__(
@@ -53,31 +22,56 @@ class Trainer:
 
 class BpeTrainer(Trainer):
     """
-    Capable of training a BPE model
-    """
-
-class UnigramTrainer(Trainer):
-    """
-    Capable of training a Unigram model
+    Trainer capable of training a BPE model
 
     Args:
-        vocab_size: unsigned int:
+        vocab_size (:obj:`int`, `optional`):
             The size of the final vocabulary, including all tokens and alphabet.
 
-        show_progress: boolean:
+        min_frequency (:obj:`int`, `optional`):
+            The minimum frequency a pair should have in order to be merged.
+
+        show_progress (:obj:`bool`, `optional`):
             Whether to show progress bars while training.
 
-        special_tokens: List[Union[str, AddedToken]]:
+        special_tokens (:obj:`List[Union[str, AddedToken]]`, `optional`):
             A list of special tokens the model should know of.
 
-        initial_alphabet: List[str]:
+        limit_alphabet (:obj:`int`, `optional`):
+            The maximum different characters to keep in the alphabet.
+
+        initial_alphabet (:obj:`List[str]`, `optional`):
             A list of characters to include in the initial alphabet, even
             if not seen in the training dataset.
             If the strings contain more than one character, only the first one
             is kept.
 
-    Returns:
-        Trainer
+        continuing_subword_prefix (:obj:`str`, `optional`):
+            A prefix to be used for every subword that is not a beginning-of-word.
+
+        end_of_word_suffix (:obj:`str`, `optional`):
+            A suffix to be used for every subword that is an end-of-word.
+    """
+
+class UnigramTrainer(Trainer):
+    """
+    Trainer capable of training a Unigram model
+
+    Args:
+        vocab_size (:obj:`int`):
+            The size of the final vocabulary, including all tokens and alphabet.
+
+        show_progress (:obj:`bool`):
+            Whether to show progress bars while training.
+
+        special_tokens (:obj:`List[Union[str, AddedToken]]`):
+            A list of special tokens the model should know of.
+
+        initial_alphabet (:obj:`List[str]`):
+            A list of characters to include in the initial alphabet, even
+            if not seen in the training dataset.
+            If the strings contain more than one character, only the first one
+            is kept.
     """
 
     def __init__(self, vocab_size=8000, show_progress=True, special_tokens=[]):
@@ -85,58 +79,53 @@ class UnigramTrainer(Trainer):
 
 class WordLevelTrainer(Trainer):
     """
-    Capable of training a WorldLevel model
+    Trainer capable of training a WordLevel model
 
     Args:
-        vocab_size: unsigned int:
+        vocab_size (:obj:`int`, `optional`):
             The size of the final vocabulary, including all tokens and alphabet.
 
-        min_frequency: unsigned int:
+        min_frequency (:obj:`int`, `optional`):
             The minimum frequency a pair should have in order to be merged.
 
-        show_progress: boolean:
+        show_progress (:obj:`bool`, `optional`):
             Whether to show progress bars while training.
 
-        special_tokens: List[Union[str, AddedToken]]:
+        special_tokens (:obj:`List[Union[str, AddedToken]]`):
             A list of special tokens the model should know of.
-
-    Returns:
-        Trainer
     """
 
 class WordPieceTrainer(Trainer):
     """
-    Capable of training a WordPiece model
+    Trainer capable of training a WordPiece model
+
     Args:
-        vocab_size: unsigned int:
+        vocab_size (:obj:`int`, `optional`):
             The size of the final vocabulary, including all tokens and alphabet.
 
-        min_frequency: unsigned int:
+        min_frequency (:obj:`int`, `optional`):
             The minimum frequency a pair should have in order to be merged.
 
-        show_progress: boolean:
+        show_progress (:obj:`bool`, `optional`):
             Whether to show progress bars while training.
 
-        special_tokens: List[Union[str, AddedToken]]:
+        special_tokens (:obj:`List[Union[str, AddedToken]]`, `optional`):
             A list of special tokens the model should know of.
 
-        limit_alphabet: unsigned int:
+        limit_alphabet (:obj:`int`, `optional`):
            The maximum different characters to keep in the alphabet.
 
-        initial_alphabet: List[str]:
+        initial_alphabet (:obj:`List[str]`, `optional`):
             A list of characters to include in the initial alphabet, even
             if not seen in the training dataset.
             If the strings contain more than one character, only the first one
             is kept.
 
-        continuing_subword_prefix: Optional[str]:
+        continuing_subword_prefix (:obj:`str`, `optional`):
             A prefix to be used for every subword that is not a beginning-of-word.
 
-        end_of_word_suffix: Optional[str]:
+        end_of_word_suffix (:obj:`str`, `optional`):
             A suffix to be used for every subword that is a end-of-word.
-
-    Returns:
-        Trainer
     """
 
     def __init__(
diff --git a/bindings/python/src/trainers.rs b/bindings/python/src/trainers.rs
index e72f1bd8..aeaef862 100644
--- a/bindings/python/src/trainers.rs
+++ b/bindings/python/src/trainers.rs
@@ -15,38 +15,7 @@ use crate::tokenizer::PyAddedToken;
 ///
 /// This class is not supposed to be instantiated directly. Instead, any implementation of a
 /// Trainer will return an instance of this class when instantiated.
-///
-/// Args:
-///     vocab_size: unsigned int:
-///         The size of the final vocabulary, including all tokens and alphabet.
-///
-///     min_frequency: unsigned int:
-///         The minimum frequency a pair should have in order to be merged.
-///
-///     show_progress: boolean:
-///         Whether to show progress bars while training.
-///
-///     special_tokens: List[Union[str, AddedToken]]:
-///         A list of special tokens the model should know of.
-///
-///     limit_alphabet: unsigned int:
-///         The maximum different characters to keep in the alphabet.
-///
-///     initial_alphabet: List[str]:
-///         A list of characters to include in the initial alphabet, even
-///         if not seen in the training dataset.
-///         If the strings contain more than one character, only the first one
-///         is kept.
-///
-///     continuing_subword_prefix: Optional[str]:
-///         A prefix to be used for every subword that is not a beginning-of-word.
-///
-///     end_of_word_suffix: Optional[str]:
-///         A suffix to be used for every subword that is a end-of-word.
-///
-/// Returns:
-///     Trainer
-#[pyclass(name=Trainer)]
+#[pyclass(module = "tokenizers.trainers", name=Trainer)]
 #[derive(Clone)]
 #[text_signature = "(self, vocab_size=30000, min_frequency=0,show_progress=True, special_tokens=[],limit_alphabet=None, initial_alphabet = [], continuing_subword_prefix=None, end_of_word_suffix=None)"]
 pub struct PyTrainer {
@@ -108,15 +77,39 @@ where
     }
 }
 
-/// Capable of training a BPE model
-#[pyclass(extends=PyTrainer, name=BpeTrainer)]
+/// Trainer capable of training a BPE model
+///
+/// Args:
+///     vocab_size (:obj:`int`, `optional`):
+///         The size of the final vocabulary, including all tokens and alphabet.
+///
+///     min_frequency (:obj:`int`, `optional`):
+///         The minimum frequency a pair should have in order to be merged.
+///
+///     show_progress (:obj:`bool`, `optional`):
+///         Whether to show progress bars while training.
+///
+///     special_tokens (:obj:`List[Union[str, AddedToken]]`, `optional`):
+///         A list of special tokens the model should know of.
+///
+///     limit_alphabet (:obj:`int`, `optional`):
+///         The maximum different characters to keep in the alphabet.
+///
+///     initial_alphabet (:obj:`List[str]`, `optional`):
+///         A list of characters to include in the initial alphabet, even
+///         if not seen in the training dataset.
+///         If the strings contain more than one character, only the first one
+///         is kept.
+///
+///     continuing_subword_prefix (:obj:`str`, `optional`):
+///         A prefix to be used for every subword that is not a beginning-of-word.
+///
+///     end_of_word_suffix (:obj:`str`, `optional`):
+///         A suffix to be used for every subword that is an end-of-word.
+#[pyclass(extends=PyTrainer, module = "tokenizers.trainers", name=BpeTrainer)]
 pub struct PyBpeTrainer {}
 #[pymethods]
 impl PyBpeTrainer {
-    /// new(/ vocab_size, min_frequency)
-    /// --
-    ///
-    /// Create a new BpeTrainer with the given configuration
     #[new]
     #[args(kwargs = "**")]
     pub fn new(kwargs: Option<&PyDict>) -> PyResult<(Self, PyTrainer)> {
@@ -176,46 +169,40 @@ impl PyBpeTrainer {
     }
 }
 
-/// Capable of training a WordPiece model
+/// Trainer capable of training a WordPiece model
+///
 /// Args:
-///     vocab_size: unsigned int:
+///     vocab_size (:obj:`int`, `optional`):
 ///         The size of the final vocabulary, including all tokens and alphabet.
 ///
-///     min_frequency: unsigned int:
+///     min_frequency (:obj:`int`, `optional`):
 ///         The minimum frequency a pair should have in order to be merged.
 ///
-///     show_progress: boolean:
+///     show_progress (:obj:`bool`, `optional`):
 ///         Whether to show progress bars while training.
 ///
-///     special_tokens: List[Union[str, AddedToken]]:
+///     special_tokens (:obj:`List[Union[str, AddedToken]]`, `optional`):
 ///         A list of special tokens the model should know of.
 ///
-///     limit_alphabet: unsigned int:
+///     limit_alphabet (:obj:`int`, `optional`):
 ///         The maximum different characters to keep in the alphabet.
 ///
-///     initial_alphabet: List[str]:
+///     initial_alphabet (:obj:`List[str]`, `optional`):
 ///         A list of characters to include in the initial alphabet, even
 ///         if not seen in the training dataset.
 ///         If the strings contain more than one character, only the first one
 ///         is kept.
 ///
-///     continuing_subword_prefix: Optional[str]:
+///     continuing_subword_prefix (:obj:`str`, `optional`):
 ///         A prefix to be used for every subword that is not a beginning-of-word.
 ///
-///     end_of_word_suffix: Optional[str]:
+///     end_of_word_suffix (:obj:`str`, `optional`):
 ///         A suffix to be used for every subword that is a end-of-word.
-///
-/// Returns:
-///     Trainer
-#[pyclass(extends=PyTrainer, name=WordPieceTrainer)]
+#[pyclass(extends=PyTrainer, module = "tokenizers.trainers", name=WordPieceTrainer)]
 #[text_signature = "(self, vocab_size=30000, min_frequency=0, show_progress=True, special_tokens=[], limit_alphabet=None, initial_alphabet= [],continuing_subword_prefix=\"##\", end_of_word_suffix=None)"]
 pub struct PyWordPieceTrainer {}
 #[pymethods]
 impl PyWordPieceTrainer {
-    /// new(/ vocab_size, min_frequency)
-    /// --
-    ///
-    /// Create a new BpeTrainer with the given configuration
     #[new]
     #[args(kwargs = "**")]
     pub fn new(kwargs: Option<&PyDict>) -> PyResult<(Self, PyTrainer)> {
@@ -276,28 +263,24 @@ impl PyWordPieceTrainer {
     }
 }
 
-/// Capable of training a WorldLevel model
+/// Trainer capable of training a WordLevel model
 ///
 /// Args:
-///     vocab_size: unsigned int:
+///     vocab_size (:obj:`int`, `optional`):
 ///         The size of the final vocabulary, including all tokens and alphabet.
 ///
-///     min_frequency: unsigned int:
+///     min_frequency (:obj:`int`, `optional`):
 ///         The minimum frequency a pair should have in order to be merged.
 ///
-///     show_progress: boolean:
+///     show_progress (:obj:`bool`, `optional`):
 ///         Whether to show progress bars while training.
 ///
-///     special_tokens: List[Union[str, AddedToken]]:
+///     special_tokens (:obj:`List[Union[str, AddedToken]]`):
 ///         A list of special tokens the model should know of.
-///
-/// Returns:
-///     Trainer
-#[pyclass(extends=PyTrainer, name=WordLevelTrainer)]
+#[pyclass(extends=PyTrainer, module = "tokenizers.trainers", name=WordLevelTrainer)]
 pub struct PyWordLevelTrainer {}
 #[pymethods]
 impl PyWordLevelTrainer {
-    /// Create a new WordLevelTrainer with the given configuration
     #[new]
     #[args(kwargs = "**")]
     pub fn new(kwargs: Option<&PyDict>) -> PyResult<(Self, PyTrainer)> {
@@ -354,32 +337,28 @@ impl PyWordLevelTrainer {
     }
 }
 
-/// Capable of training a Unigram model
+/// Trainer capable of training a Unigram model
 ///
 /// Args:
-///     vocab_size: unsigned int:
+///     vocab_size (:obj:`int`):
 ///         The size of the final vocabulary, including all tokens and alphabet.
 ///
-///     show_progress: boolean:
+///     show_progress (:obj:`bool`):
 ///         Whether to show progress bars while training.
 ///
-///     special_tokens: List[Union[str, AddedToken]]:
+///     special_tokens (:obj:`List[Union[str, AddedToken]]`):
 ///         A list of special tokens the model should know of.
 ///
-///     initial_alphabet: List[str]:
+///     initial_alphabet (:obj:`List[str]`):
 ///         A list of characters to include in the initial alphabet, even
 ///         if not seen in the training dataset.
 ///         If the strings contain more than one character, only the first one
 ///         is kept.
-///
-/// Returns:
-///     Trainer
-#[pyclass(extends=PyTrainer, name=UnigramTrainer)]
+#[pyclass(extends=PyTrainer, module = "tokenizers.trainers", name=UnigramTrainer)]
 #[text_signature = "(self, vocab_size=8000, show_progress=True, special_tokens= [])"]
 pub struct PyUnigramTrainer {}
 #[pymethods]
 impl PyUnigramTrainer {
-    /// Create a new UnigramTrainer with the given configuration
     #[new]
     #[args(kwargs = "**")]
     pub fn new(kwargs: Option<&PyDict>) -> PyResult<(Self, PyTrainer)> {
diff --git a/docs/source/api/python.inc b/docs/source/api/python.inc
index 3b8df71b..ee4ed8f1 100644
--- a/docs/source/api/python.inc
+++ b/docs/source/api/python.inc
@@ -44,3 +44,37 @@ Added Tokens
 
 .. autoclass:: tokenizers.AddedToken
     :members:
+
+
+Models
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. automodule:: tokenizers.models
+    :members:
+
+Normalizers
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. automodule:: tokenizers.normalizers
+    :members:
+
+
+Pre-tokenizers
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. automodule:: tokenizers.pre_tokenizers
+    :members:
+
+
+Post-processor
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. automodule:: tokenizers.processors
+    :members:
+
+
+Trainers
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. automodule:: tokenizers.trainers
+    :members:
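
For reference, a minimal usage sketch of the trainers documented in this patch. The data file name and hyperparameter values are placeholders, and the exact Tokenizer.train signature may differ slightly between tokenizers versions:

    from tokenizers import Tokenizer
    from tokenizers.models import BPE
    from tokenizers.pre_tokenizers import Whitespace
    from tokenizers.trainers import BpeTrainer

    # A BPE tokenizer plus a trainer configured with the arguments documented above
    tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
    tokenizer.pre_tokenizer = Whitespace()
    trainer = BpeTrainer(
        vocab_size=30000,
        min_frequency=2,
        special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
    )

    # Train from one or more plain-text files ("data.txt" is a placeholder path)
    tokenizer.train(files=["data.txt"], trainer=trainer)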