Add a way to specify the unknown token in the SentencePieceUnigramTokenizer Python implementation (#762)

* add a way to specify the unknown token in `SentencePieceUnigramTokenizer`
* add a test that verifies that an exception is raised for the missing unknown token
* style
* add test tokens
2025-12-08 05:38:23 +00:00 · 2021-08-12 15:42:44 +02:00
parent 46bed542fa
commit da4c7b10e4
3 changed files with 77 additions and 3 deletions
--- a/bindings/python/py_src/tokenizers/implementations/sentencepiece_unigram.py
+++ b/bindings/python/py_src/tokenizers/implementations/sentencepiece_unigram.py
@@ -49,13 +49,29 @@ class SentencePieceUnigramTokenizer(BaseTokenizer):
        vocab_size: int = 8000,
        show_progress: bool = True,
        special_tokens: List[Union[str, AddedToken]] = [],
+        unk_token: Optional[str] = None,
    ):
-        """ Train the model using the given files """
+        """
+        Train the model using the given files
+
+        Args:
+            files (:obj:`List[str]`):
+                A list of paths to the files that we should use for training
+            vocab_size (:obj:`int`):
+                The size of the final vocabulary, including all tokens and alphabet.
+            show_progress (:obj:`bool`):
+                Whether to show progress bars while training.
+            special_tokens (:obj:`List[Union[str, AddedToken]]`):
+                A list of special tokens the model should know of.
+            unk_token (:obj:`str`, `optional`):
+                The unknown token to be used by the model.
+        """

        trainer = trainers.UnigramTrainer(
            vocab_size=vocab_size,
            special_tokens=special_tokens,
            show_progress=show_progress,
+            unk_token=unk_token,
        )

        if isinstance(files, str):
@@ -68,13 +84,29 @@ class SentencePieceUnigramTokenizer(BaseTokenizer):
        vocab_size: int = 8000,
        show_progress: bool = True,
        special_tokens: List[Union[str, AddedToken]] = [],
+        unk_token: Optional[str] = None,
    ):
-        """ Train the model using the given iterator """
+        """
+        Train the model using the given iterator
+
+        Args:
+            iterator (:obj:`Union[Iterator[str], Iterator[Iterator[str]]]`):
+                Any iterator over strings or list of strings
+            vocab_size (:obj:`int`):
+                The size of the final vocabulary, including all tokens and alphabet.
+            show_progress (:obj:`bool`):
+                Whether to show progress bars while training.
+            special_tokens (:obj:`List[Union[str, AddedToken]]`):
+                A list of special tokens the model should know of.
+            unk_token (:obj:`str`, `optional`):
+                The unknown token to be used by the model.
+        """

        trainer = trainers.UnigramTrainer(
            vocab_size=vocab_size,
            special_tokens=special_tokens,
            show_progress=show_progress,
+            unk_token=unk_token,
        )

        self._tokenizer.train_from_iterator(iterator, trainer=trainer)