Python - add initial_alphabet to spm unigram trainer (#942)

* Python - add initial_alphabet to spm unigram trainer
* Python - use Optional instead of mutable defaults in spm unigram trainer
2025-12-03 11:18:29 +00:00 · 2022-03-09 03:54:03 -05:00
parent 98249dfb0f
commit 71ae5421eb
1 changed file with 30 additions and 4 deletions
--- a/bindings/python/py_src/tokenizers/implementations/sentencepiece_unigram.py
+++ b/bindings/python/py_src/tokenizers/implementations/sentencepiece_unigram.py
@@ -48,7 +48,8 @@ class SentencePieceUnigramTokenizer(BaseTokenizer):
        files: Union[str, List[str]],
        vocab_size: int = 8000,
        show_progress: bool = True,
-        special_tokens: List[Union[str, AddedToken]] = [],
+        special_tokens: Optional[List[Union[str, AddedToken]]] = None,
        initial_alphabet: Optional[List[str]] = None,
        unk_token: Optional[str] = None,
    ):
        """
@@ -61,16 +62,28 @@ class SentencePieceUnigramTokenizer(BaseTokenizer):
                The size of the final vocabulary, including all tokens and alphabet.
            show_progress (:obj:`bool`):
                Whether to show progress bars while training.
-            special_tokens (:obj:`List[Union[str, AddedToken]]`):
+            special_tokens (:obj:`List[Union[str, AddedToken]]`, `optional`):
                A list of special tokens the model should know of.
            initial_alphabet (:obj:`List[str]`, `optional`):
                A list of characters to include in the initial alphabet, even
                if not seen in the training dataset.
                If the strings contain more than one character, only the first one
                is kept.
            unk_token (:obj:`str`, `optional`):
                The unknown token to be used by the model.
        """
        if special_tokens is None:
            special_tokens = []
        if initial_alphabet is None:
            initial_alphabet = []
        trainer = trainers.UnigramTrainer(
            vocab_size=vocab_size,
            special_tokens=special_tokens,
            show_progress=show_progress,
            initial_alphabet=initial_alphabet,
            unk_token=unk_token,
        )
@@ -83,7 +96,8 @@ class SentencePieceUnigramTokenizer(BaseTokenizer):
        iterator: Union[Iterator[str], Iterator[Iterator[str]]],
        vocab_size: int = 8000,
        show_progress: bool = True,
-        special_tokens: List[Union[str, AddedToken]] = [],
+        special_tokens: Optional[List[Union[str, AddedToken]]] = None,
        initial_alphabet: Optional[List[str]] = None,
        unk_token: Optional[str] = None,
        length: Optional[int] = None,
    ):
@@ -97,8 +111,13 @@ class SentencePieceUnigramTokenizer(BaseTokenizer):
                The size of the final vocabulary, including all tokens and alphabet.
            show_progress (:obj:`bool`):
                Whether to show progress bars while training.
-            special_tokens (:obj:`List[Union[str, AddedToken]]`):
+            special_tokens (:obj:`List[Union[str, AddedToken]]`, `optional`):
                A list of special tokens the model should know of.
            initial_alphabet (:obj:`List[str]`, `optional`):
                A list of characters to include in the initial alphabet, even
                if not seen in the training dataset.
                If the strings contain more than one character, only the first one
                is kept.
            unk_token (:obj:`str`, `optional`):
                The unknown token to be used by the model.
            length (:obj:`int`, `optional`):
@@ -106,10 +125,17 @@ class SentencePieceUnigramTokenizer(BaseTokenizer):
                provide meaningful progress tracking
        """
        if special_tokens is None:
            special_tokens = []
        if initial_alphabet is None:
            initial_alphabet = []
        trainer = trainers.UnigramTrainer(
            vocab_size=vocab_size,
            special_tokens=special_tokens,
            show_progress=show_progress,
            initial_alphabet=initial_alphabet,
            unk_token=unk_token,
        )