Python - Improve documentation for trainers

Anthony MOI
2020-11-20 17:54:53 -05:00
committed by Anthony MOI
parent 64441b54b1
commit 58e1d8de67
3 changed files with 140 additions and 138 deletions


@@ -5,37 +5,6 @@ class Trainer:
This class is not supposed to be instantiated directly. Instead, any implementation of a
Trainer will return an instance of this class when instantiated.
Args:
vocab_size: unsigned int:
The size of the final vocabulary, including all tokens and alphabet.
min_frequency: unsigned int:
The minimum frequency a pair should have in order to be merged.
show_progress: boolean:
Whether to show progress bars while training.
special_tokens: List[Union[str, AddedToken]]:
A list of special tokens the model should know of.
limit_alphabet: unsigned int:
The maximum number of different characters to keep in the alphabet.
initial_alphabet: List[str]:
A list of characters to include in the initial alphabet, even
if not seen in the training dataset.
If the strings contain more than one character, only the first one
is kept.
continuing_subword_prefix: Optional[str]:
A prefix to be used for every subword that is not a beginning-of-word.
end_of_word_suffix: Optional[str]:
A suffix to be used for every subword that is an end-of-word.
Returns:
Trainer
"""
def __init__(
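In practice the base Trainer is only obtained through one of its concrete subclasses and then handed to a Tokenizer. A minimal end-to-end sketch, assuming a plain-text file corpus.txt (the path is illustrative; keyword arguments are used because the positional order of train has varied across versions):

    from tokenizers import Tokenizer
    from tokenizers.models import BPE
    from tokenizers.trainers import BpeTrainer

    # Wrap a BPE model in a Tokenizer, then train it with a BpeTrainer.
    tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
    trainer = BpeTrainer(vocab_size=30000, special_tokens=["[UNK]", "[PAD]"])

    # "corpus.txt" is a placeholder path to a plain-text training file.
    tokenizer.train(trainer=trainer, files=["corpus.txt"])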
@@ -53,31 +22,56 @@ class Trainer:
class BpeTrainer(Trainer):
"""
Capable of training a BPE model
"""
class UnigramTrainer(Trainer):
"""
Capable of training a Unigram model
Trainer capable of training a BPE model
Args:
vocab_size: unsigned int:
vocab_size (:obj:`int`, `optional`):
The size of the final vocabulary, including all tokens and alphabet.
show_progress: boolean:
min_frequency (:obj:`int`, `optional`):
The minimum frequency a pair should have in order to be merged.
show_progress (:obj:`bool`, `optional`):
Whether to show progress bars while training.
special_tokens: List[Union[str, AddedToken]]:
special_tokens (:obj:`List[Union[str, AddedToken]]`, `optional`):
A list of special tokens the model should know of.
initial_alphabet: List[str]:
limit_alphabet (:obj:`int`, `optional`):
The maximum number of different characters to keep in the alphabet.
initial_alphabet (:obj:`List[str]`, `optional`):
A list of characters to include in the initial alphabet, even
if not seen in the training dataset.
If the strings contain more than one character, only the first one
is kept.
Returns:
Trainer
continuing_subword_prefix (:obj:`str`, `optional`):
A prefix to be used for every subword that is not a beginning-of-word.
end_of_word_suffix (:obj:`str`, `optional`):
A suffix to be used for every subword that is an end-of-word.
"""
class UnigramTrainer(Trainer):
"""
Trainer capable of training a Unigram model
Args:
vocab_size (:obj:`int`):
The size of the final vocabulary, including all tokens and alphabet.
show_progress (:obj:`bool`):
Whether to show progress bars while training.
special_tokens (:obj:`List[Union[str, AddedToken]]`):
A list of special tokens the model should know of.
initial_alphabet (:obj:`List[str]`):
A list of characters to include in the initial alphabet, even
if not seen in the training dataset.
If the strings contain more than one character, only the first one
is kept.
"""
def __init__(self, vocab_size=8000, show_progress=True, special_tokens=[]):
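Matching the signature above, a minimal UnigramTrainer sketch (the docstring also lists initial_alphabet, which the constructor is assumed to accept as a keyword too; token names are illustrative):

    from tokenizers.trainers import UnigramTrainer

    # Defaults mirror the signature: vocab_size=8000, show_progress=True.
    trainer = UnigramTrainer(
        vocab_size=8000,
        show_progress=True,
        special_tokens=["<unk>", "<s>", "</s>"],
    )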
@@ -85,58 +79,53 @@ class UnigramTrainer(Trainer):
class WordLevelTrainer(Trainer):
"""
Capable of training a WordLevel model
Trainer capable of training a WordLevel model
Args:
vocab_size: unsigned int:
vocab_size (:obj:`int`, `optional`):
The size of the final vocabulary, including all tokens and alphabet.
min_frequency: unsigned int:
min_frequency (:obj:`int`, `optional`):
The minimum frequency a word should have in order to be kept in the vocabulary.
show_progress: boolean:
show_progress (:obj:`bool`, `optional`):
Whether to show progress bars while training.
special_tokens: List[Union[str, AddedToken]]:
special_tokens (:obj:`List[Union[str, AddedToken]]`):
A list of special tokens the model should know of.
Returns:
Trainer
"""
class WordPieceTrainer(Trainer):
"""
Capable of training a WordPiece model
Trainer capable of training a WordPiece model
Args:
vocab_size: unsigned int:
vocab_size (:obj:`int`, `optional`):
The size of the final vocabulary, including all tokens and alphabet.
min_frequency: unsigned int:
min_frequency (:obj:`int`, `optional`):
The minimum frequency a pair should have in order to be merged.
show_progress: boolean:
show_progress (:obj:`bool`, `optional`):
Whether to show progress bars while training.
special_tokens: List[Union[str, AddedToken]]:
special_tokens (:obj:`List[Union[str, AddedToken]]`, `optional`):
A list of special tokens the model should know of.
limit_alphabet: unsigned int:
limit_alphabet (:obj:`int`, `optional`):
The maximum number of different characters to keep in the alphabet.
initial_alphabet: List[str]:
initial_alphabet (:obj:`List[str]`, `optional`):
A list of characters to include in the initial alphabet, even
if not seen in the training dataset.
If the strings contain more than one character, only the first one
is kept.
continuing_subword_prefix: Optional[str]:
continuing_subword_prefix (:obj:`str`, `optional`):
A prefix to be used for every subword that is not a beginning-of-word.
end_of_word_suffix: Optional[str]:
end_of_word_suffix (:obj:`str`, `optional`):
A suffix to be used for every subword that is an end-of-word.
Returns:
Trainer
"""
def __init__(
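WordPieceTrainer accepts the same options as BpeTrainer, with continuing_subword_prefix defaulting to "##"; a sketch:

    from tokenizers.trainers import WordPieceTrainer

    trainer = WordPieceTrainer(
        vocab_size=30000,
        min_frequency=2,
        special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
        continuing_subword_prefix="##",  # the default, shown explicitly
    )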


@@ -15,38 +15,7 @@ use crate::tokenizer::PyAddedToken;
///
/// This class is not supposed to be instantiated directly. Instead, any implementation of a
/// Trainer will return an instance of this class when instantiated.
///
/// Args:
/// vocab_size: unsigned int:
/// The size of the final vocabulary, including all tokens and alphabet.
///
/// min_frequency: unsigned int:
/// The minimum frequency a pair should have in order to be merged.
///
/// show_progress: boolean:
/// Whether to show progress bars while training.
///
/// special_tokens: List[Union[str, AddedToken]]:
/// A list of special tokens the model should know of.
///
/// limit_alphabet: unsigned int:
/// The maximum number of different characters to keep in the alphabet.
///
/// initial_alphabet: List[str]:
/// A list of characters to include in the initial alphabet, even
/// if not seen in the training dataset.
/// If the strings contain more than one character, only the first one
/// is kept.
///
/// continuing_subword_prefix: Optional[str]:
/// A prefix to be used for every subword that is not a beginning-of-word.
///
/// end_of_word_suffix: Optional[str]:
/// A suffix to be used for every subword that is an end-of-word.
///
/// Returns:
/// Trainer
#[pyclass(name=Trainer)]
#[pyclass(module = "tokenizers.trainers", name=Trainer)]
#[derive(Clone)]
#[text_signature = "(self, vocab_size=30000, min_frequency=0, show_progress=True, special_tokens=[], limit_alphabet=None, initial_alphabet=[], continuing_subword_prefix=None, end_of_word_suffix=None)"]
pub struct PyTrainer {
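The module = "tokenizers.trainers" attribute added to each #[pyclass] sets __module__ on the generated Python class, which affects repr() and pickling. Expected behaviour, stated as an assumption about the built extension:

    from tokenizers.trainers import BpeTrainer, Trainer

    trainer = BpeTrainer()
    print(type(trainer).__module__)      # expected: "tokenizers.trainers"
    print(isinstance(trainer, Trainer))  # True, since BpeTrainer extends Trainer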
@@ -108,15 +77,39 @@ where
}
}
/// Capable of training a BPE model
#[pyclass(extends=PyTrainer, name=BpeTrainer)]
/// Trainer capable of training a BPE model
///
/// Args:
/// vocab_size (:obj:`int`, `optional`):
/// The size of the final vocabulary, including all tokens and alphabet.
///
/// min_frequency (:obj:`int`, `optional`):
/// The minimum frequency a pair should have in order to be merged.
///
/// show_progress (:obj:`bool`, `optional`):
/// Whether to show progress bars while training.
///
/// special_tokens (:obj:`List[Union[str, AddedToken]]`, `optional`):
/// A list of special tokens the model should know of.
///
/// limit_alphabet (:obj:`int`, `optional`):
/// The maximum number of different characters to keep in the alphabet.
///
/// initial_alphabet (:obj:`List[str]`, `optional`):
/// A list of characters to include in the initial alphabet, even
/// if not seen in the training dataset.
/// If the strings contain more than one character, only the first one
/// is kept.
///
/// continuing_subword_prefix (:obj:`str`, `optional`):
/// A prefix to be used for every subword that is not a beginning-of-word.
///
/// end_of_word_suffix (:obj:`str`, `optional`):
/// A suffix to be used for every subword that is an end-of-word.
#[pyclass(extends=PyTrainer, module = "tokenizers.trainers", name=BpeTrainer)]
pub struct PyBpeTrainer {}
#[pymethods]
impl PyBpeTrainer {
/// new(/ vocab_size, min_frequency)
/// --
///
/// Create a new BpeTrainer with the given configuration
#[new]
#[args(kwargs = "**")]
pub fn new(kwargs: Option<&PyDict>) -> PyResult<(Self, PyTrainer)> {
@@ -176,46 +169,40 @@ impl PyBpeTrainer {
}
}
/// Capable of training a WordPiece model
/// Trainer capable of training a WordPiece model
///
/// Args:
/// vocab_size: unsigned int:
/// vocab_size (:obj:`int`, `optional`):
/// The size of the final vocabulary, including all tokens and alphabet.
///
/// min_frequency: unsigned int:
/// min_frequency (:obj:`int`, `optional`):
/// The minimum frequency a pair should have in order to be merged.
///
/// show_progress: boolean:
/// show_progress (:obj:`bool`, `optional`):
/// Whether to show progress bars while training.
///
/// special_tokens: List[Union[str, AddedToken]]:
/// special_tokens (:obj:`List[Union[str, AddedToken]]`, `optional`):
/// A list of special tokens the model should know of.
///
/// limit_alphabet: unsigned int:
/// limit_alphabet (:obj:`int`, `optional`):
/// The maximum number of different characters to keep in the alphabet.
///
/// initial_alphabet: List[str]:
/// initial_alphabet (:obj:`List[str]`, `optional`):
/// A list of characters to include in the initial alphabet, even
/// if not seen in the training dataset.
/// If the strings contain more than one character, only the first one
/// is kept.
///
/// continuing_subword_prefix: Optional[str]:
/// continuing_subword_prefix (:obj:`str`, `optional`):
/// A prefix to be used for every subword that is not a beginning-of-word.
///
/// end_of_word_suffix: Optional[str]:
/// end_of_word_suffix (:obj:`str`, `optional`):
/// A suffix to be used for every subword that is an end-of-word.
///
/// Returns:
/// Trainer
#[pyclass(extends=PyTrainer, name=WordPieceTrainer)]
#[pyclass(extends=PyTrainer, module = "tokenizers.trainers", name=WordPieceTrainer)]
#[text_signature = "(self, vocab_size=30000, min_frequency=0, show_progress=True, special_tokens=[], limit_alphabet=None, initial_alphabet=[], continuing_subword_prefix=\"##\", end_of_word_suffix=None)"]
pub struct PyWordPieceTrainer {}
#[pymethods]
impl PyWordPieceTrainer {
/// new(/ vocab_size, min_frequency)
/// --
///
/// Create a new WordPieceTrainer with the given configuration
#[new]
#[args(kwargs = "**")]
pub fn new(kwargs: Option<&PyDict>) -> PyResult<(Self, PyTrainer)> {
@@ -276,28 +263,24 @@ impl PyWordPieceTrainer {
}
}
/// Capable of training a WordLevel model
/// Trainer capable of training a WordLevel model
///
/// Args:
/// vocab_size: unsigned int:
/// vocab_size (:obj:`int`, `optional`):
/// The size of the final vocabulary, including all tokens and alphabet.
///
/// min_frequency: unsigned int:
/// min_frequency (:obj:`int`, `optional`):
/// The minimum frequency a word should have in order to be kept in the vocabulary.
///
/// show_progress: boolean:
/// show_progress (:obj:`bool`, `optional`):
/// Whether to show progress bars while training.
///
/// special_tokens: List[Union[str, AddedToken]]:
/// special_tokens (:obj:`List[Union[str, AddedToken]]`):
/// A list of special tokens the model should know of.
///
/// Returns:
/// Trainer
#[pyclass(extends=PyTrainer, name=WordLevelTrainer)]
#[pyclass(extends=PyTrainer, module = "tokenizers.trainers", name=WordLevelTrainer)]
pub struct PyWordLevelTrainer {}
#[pymethods]
impl PyWordLevelTrainer {
/// Create a new WordLevelTrainer with the given configuration
#[new]
#[args(kwargs = "**")]
pub fn new(kwargs: Option<&PyDict>) -> PyResult<(Self, PyTrainer)> {
@@ -354,32 +337,28 @@ impl PyWordLevelTrainer {
}
}
/// Capable of training a Unigram model
/// Trainer capable of training a Unigram model
///
/// Args:
/// vocab_size: unsigned int:
/// vocab_size (:obj:`int`):
/// The size of the final vocabulary, including all tokens and alphabet.
///
/// show_progress: boolean:
/// show_progress (:obj:`bool`):
/// Whether to show progress bars while training.
///
/// special_tokens: List[Union[str, AddedToken]]:
/// special_tokens (:obj:`List[Union[str, AddedToken]]`):
/// A list of special tokens the model should know of.
///
/// initial_alphabet: List[str]:
/// initial_alphabet (:obj:`List[str]`):
/// A list of characters to include in the initial alphabet, even
/// if not seen in the training dataset.
/// If the strings contain more than one character, only the first one
/// is kept.
///
/// Returns:
/// Trainer
#[pyclass(extends=PyTrainer, name=UnigramTrainer)]
#[pyclass(extends=PyTrainer, module = "tokenizers.trainers", name=UnigramTrainer)]
#[text_signature = "(self, vocab_size=8000, show_progress=True, special_tokens=[])"]
pub struct PyUnigramTrainer {}
#[pymethods]
impl PyUnigramTrainer {
/// Create a new UnigramTrainer with the given configuration
#[new]
#[args(kwargs = "**")]
pub fn new(kwargs: Option<&PyDict>) -> PyResult<(Self, PyTrainer)> {


@@ -44,3 +44,37 @@ Added Tokens
.. autoclass:: tokenizers.AddedToken
:members:
Models
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. automodule:: tokenizers.models
:members:
Normalizers
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. automodule:: tokenizers.normalizers
:members:
Pre-tokenizers
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. automodule:: tokenizers.pre_tokenizers
:members:
Post-processor
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. automodule:: tokenizers.processors
:members:
Trainers
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. automodule:: tokenizers.trainers
:members:
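With these automodule directives in place, Sphinx pulls the docstrings above into the rendered API reference. The same text is reachable interactively:

    import tokenizers.trainers

    # The docstring written in the Rust source is exposed on the Python class.
    help(tokenizers.trainers.BpeTrainer)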