Python - add initial_alphabet to spm unigram trainer (#942)

* Python - add initial_alphabet to spm unigram trainer
* Python - use optional instead of mutable defaults in spm unigram trainer
2025-12-03 11:18:29 +00:00 · 2022-03-09 03:54:03 -05:00
parent 98249dfb0f
commit 71ae5421eb
1 changed file with 30 additions and 4 deletions
--- a/bindings/python/py_src/tokenizers/implementations/sentencepiece_unigram.py
+++ b/bindings/python/py_src/tokenizers/implementations/sentencepiece_unigram.py
@@ -48,7 +48,8 @@ class SentencePieceUnigramTokenizer(BaseTokenizer):
        files: Union[str, List[str]],
        vocab_size: int = 8000,
        show_progress: bool = True,
-        special_tokens: List[Union[str, AddedToken]] = [],
+        special_tokens: Optional[List[Union[str, AddedToken]]] = None,
+        initial_alphabet: Optional[List[str]] = None,
        unk_token: Optional[str] = None,
    ):
        """
@@ -61,16 +62,28 @@ class SentencePieceUnigramTokenizer(BaseTokenizer):
                The size of the final vocabulary, including all tokens and alphabet.
            show_progress (:obj:`bool`):
                Whether to show progress bars while training.
-            special_tokens (:obj:`List[Union[str, AddedToken]]`):
+            special_tokens (:obj:`List[Union[str, AddedToken]]`, `optional`):
                A list of special tokens the model should know of.
+            initial_alphabet (:obj:`List[str]`, `optional`):
+                A list of characters to include in the initial alphabet, even
+                if not seen in the training dataset.
+                If the strings contain more than one character, only the first one
+                is kept.
            unk_token (:obj:`str`, `optional`):
                The unknown token to be used by the model.
        """

+        if special_tokens is None:
+            special_tokens = []
+
+        if initial_alphabet is None:
+            initial_alphabet = []
+
        trainer = trainers.UnigramTrainer(
            vocab_size=vocab_size,
            special_tokens=special_tokens,
            show_progress=show_progress,
+            initial_alphabet=initial_alphabet,
            unk_token=unk_token,
        )

@@ -83,7 +96,8 @@ class SentencePieceUnigramTokenizer(BaseTokenizer):
        iterator: Union[Iterator[str], Iterator[Iterator[str]]],
        vocab_size: int = 8000,
        show_progress: bool = True,
-        special_tokens: List[Union[str, AddedToken]] = [],
+        special_tokens: Optional[List[Union[str, AddedToken]]] = None,
+        initial_alphabet: Optional[List[str]] = None,
        unk_token: Optional[str] = None,
        length: Optional[int] = None,
    ):
@@ -97,8 +111,13 @@ class SentencePieceUnigramTokenizer(BaseTokenizer):
                The size of the final vocabulary, including all tokens and alphabet.
            show_progress (:obj:`bool`):
                Whether to show progress bars while training.
-            special_tokens (:obj:`List[Union[str, AddedToken]]`):
+            special_tokens (:obj:`List[Union[str, AddedToken]]`, `optional`):
                A list of special tokens the model should know of.
+            initial_alphabet (:obj:`List[str]`, `optional`):
+                A list of characters to include in the initial alphabet, even
+                if not seen in the training dataset.
+                If the strings contain more than one character, only the first one
+                is kept.
            unk_token (:obj:`str`, `optional`):
                The unknown token to be used by the model.
            length (:obj:`int`, `optional`):
@@ -106,10 +125,17 @@ class SentencePieceUnigramTokenizer(BaseTokenizer):
                provide meaningful progress tracking
        """

+        if special_tokens is None:
+            special_tokens = []
+
+        if initial_alphabet is None:
+            initial_alphabet = []
+
        trainer = trainers.UnigramTrainer(
            vocab_size=vocab_size,
            special_tokens=special_tokens,
            show_progress=show_progress,
+            initial_alphabet=initial_alphabet,
            unk_token=unk_token,
        )