Expose the trainer to Python bindings.

2025-12-05 04:08:22 +00:00 · 2020-09-01 16:14:12 +02:00
parent 52082b5476
commit 558e76f18e
5 changed files with 63 additions and 0 deletions
--- a/bindings/python/py_src/tokenizers/implementations/sentencepiece_unigram.py
+++ b/bindings/python/py_src/tokenizers/implementations/sentencepiece_unigram.py
@@ -40,3 +40,20 @@ class SentencePieceUnigramTokenizer(BaseTokenizer):
        }

        super().__init__(tokenizer, parameters)
+
+    def train(
+        self,
+        files: Union[str, List[str]],
+        vocab_size: int = 8000,
+        show_progress: bool = True,
+        special_tokens: Union[List[Union[str, AddedToken]], None] = None,
+    ):
+        """ Train the model using the given files
+
+        `files` is one file or a list of files; `special_tokens=None` means no special tokens.
+        """
+        tokens = [] if special_tokens is None else special_tokens  # fresh list per call
+        trainer = trainers.UnigramTrainer(
+            vocab_size=vocab_size, special_tokens=tokens, show_progress=show_progress,
+        )
+        self._tokenizer.train(trainer, [files] if isinstance(files, str) else files)
--- a/bindings/python/py_src/tokenizers/trainers/init.py
+++ b/bindings/python/py_src/tokenizers/trainers/init.py
@@ -3,3 +3,4 @@ from .. import trainers
 Trainer = trainers.Trainer
 BpeTrainer = trainers.BpeTrainer
 WordPieceTrainer = trainers.WordPieceTrainer
+UnigramTrainer = trainers.UnigramTrainer  # re-export from the parent package's `trainers` module
--- a/bindings/python/py_src/tokenizers/trainers/init.pyi
+++ b/bindings/python/py_src/tokenizers/trainers/init.pyi
@@ -111,3 +111,32 @@ class WordPieceTrainer(Trainer):
            Trainer
        """
        pass
+
+class UnigramTrainer(Trainer):
+    """ UnigramTrainer
+
+    Trainer capable of training a Unigram model.
+    """
+
+    def __init__(
+        self,
+        vocab_size: int = 8000,
+        show_progress: bool = True,
+        special_tokens: List[Union[str, AddedToken]] = [],
+    ) -> Trainer:
+        """ Instantiate a new UnigramTrainer with the given options:
+
+        Args:
+            vocab_size: unsigned int (defaults to 8000):
+                The size of the final vocabulary, including all tokens and alphabet.
+
+            show_progress: boolean (defaults to True):
+                Whether to show progress bars while training.
+
+            special_tokens: List[Union[str, AddedToken]] (defaults to an empty list):
+                A list of special tokens the model should know of.
+
+        Returns:
+            Trainer: the newly created UnigramTrainer
+        """
+        pass