Expand documentation of UnigramTrainer (#770)

* Expand documentation of UnigramTrainer
* Put doc at the source
* Add signature
* make style

Co-authored-by: Anthony Moi <m.anthony.moi@gmail.com>
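A minimal usage sketch of the expanded signature documented by this commit (not part of the diff below; it assumes the tokenizers Python API of this era, and the training file path is hypothetical):

    from tokenizers import Tokenizer, models, trainers

    # Build a Unigram tokenizer and a trainer using the newly documented
    # parameters; the values shown are the defaults from the new signature.
    tokenizer = Tokenizer(models.Unigram())
    trainer = trainers.UnigramTrainer(
        vocab_size=8000,
        show_progress=True,
        special_tokens=["<unk>"],
        shrinking_factor=0.75,   # fraction of the vocabulary kept at each pruning step
        unk_token="<unk>",       # token used for out-of-vocabulary input
        max_piece_length=16,     # maximum length of a learned token
        n_sub_iterations=2,      # EM iterations before each pruning step
    )
    tokenizer.train(["corpus.txt"], trainer=trainer)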
@@ -72,9 +72,32 @@ class UnigramTrainer(Trainer):
             if not seen in the training dataset.
             If the strings contain more than one character, only the first one
             is kept.
+
+        shrinking_factor (:obj:`float`):
+            The shrinking factor used at each step of the training to prune the
+            vocabulary.
+
+        unk_token (:obj:`str`):
+            The token used for out-of-vocabulary tokens.
+
+        max_piece_length (:obj:`int`):
+            The maximum length of a given token.
+
+        n_sub_iterations (:obj:`int`):
+            The number of iterations of the EM algorithm to perform before
+            pruning the vocabulary.
     """

-    def __init__(self, vocab_size=8000, show_progress=True, special_tokens=[]):
+    def __init__(
+        self,
+        vocab_size=8000,
+        show_progress=True,
+        special_tokens=[],
+        shrinking_factor=0.75,
+        unk_token=None,
+        max_piece_length=16,
+        n_sub_iterations=2,
+    ):
         pass

 class WordLevelTrainer(Trainer):
@@ -669,8 +669,22 @@ impl PyWordLevelTrainer {
 ///         if not seen in the training dataset.
 ///         If the strings contain more than one character, only the first one
 ///         is kept.
+///
+///     shrinking_factor (:obj:`float`):
+///         The shrinking factor used at each step of the training to prune the
+///         vocabulary.
+///
+///     unk_token (:obj:`str`):
+///         The token used for out-of-vocabulary tokens.
+///
+///     max_piece_length (:obj:`int`):
+///         The maximum length of a given token.
+///
+///     n_sub_iterations (:obj:`int`):
+///         The number of iterations of the EM algorithm to perform before
+///         pruning the vocabulary.
 #[pyclass(extends=PyTrainer, module = "tokenizers.trainers", name=UnigramTrainer)]
-#[text_signature = "(self, vocab_size=8000, show_progress=True, special_tokens= [])"]
+#[text_signature = "(self, vocab_size=8000, show_progress=True, special_tokens=[], shrinking_factor=0.75, unk_token=None, max_piece_length=16, n_sub_iterations=2)"]
 pub struct PyUnigramTrainer {}
 #[pymethods]
 impl PyUnigramTrainer {
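As a rough illustration of how shrinking_factor and n_sub_iterations interact (an assumption based on how SentencePiece-style unigram training is usually described, not code from this commit): training starts from a large seed vocabulary and alternates EM re-estimation with pruning until vocab_size is reached.

    # Hypothetical numbers: a 100k seed vocabulary pruned toward vocab_size=8000.
    seed_size, vocab_size, shrinking_factor, n_sub_iterations = 100_000, 8_000, 0.75, 2
    size, rounds = seed_size, 0
    while size > vocab_size:
        # n_sub_iterations EM passes would run here to re-estimate piece scores
        size = max(vocab_size, int(size * shrinking_factor))  # keep the top 75%
        rounds += 1
    print(rounds, size)  # 9 pruning rounds to reach 8000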