Allow initial_alphabet on UnigramTrainer

2025-12-08 05:38:23 +00:00 · 2020-10-22 14:32:40 -04:00
parent f7c61c267a
commit 1a6f4b5204
3 changed files with 44 additions and 4 deletions
--- a/bindings/python/py_src/tokenizers/trainers/__init__.pyi
+++ b/bindings/python/py_src/tokenizers/trainers/__init__.pyi
@@ -46,7 +46,7 @@ class BpeTrainer(Trainer):
            initial_alphabet: List[str]:
                A list of characters to include in the initial alphabet, even
                if not seen in the training dataset.
-                If the strings contains more than one character, only the first one
+                If the strings contain more than one character, only the first one
                is kept.

            continuing_subword_prefix: Optional[str]:
@@ -98,7 +98,7 @@ class WordPieceTrainer(Trainer):
            initial_alphabet: List[str]:
                A list of characters to include in the initial alphabet, even
                if not seen in the training dataset.
-                If the strings contains more than one character, only the first one
+                If the strings contain more than one character, only the first one
                is kept.

            continuing_subword_prefix: Optional[str]:
@@ -136,6 +136,12 @@ class UnigramTrainer(Trainer):
            special_tokens: List[Union[str, AddedToken]]:
                A list of special tokens the model should know of.

+            initial_alphabet: List[str]:
+                A list of characters to include in the initial alphabet, even
+                if not seen in the training dataset.
+                If the strings contain more than one character, only the first one
+                is kept.
+
        Returns:
            Trainer
        """