mirror of
https://github.com/mii443/tokenizers.git
synced 2025-08-22 16:25:30 +00:00
Python - add doctype to length in implementations spm unigram (#943)
This commit is contained in:
@@ -101,6 +101,9 @@ class SentencePieceUnigramTokenizer(BaseTokenizer):
                A list of special tokens the model should know of.
            unk_token (:obj:`str`, `optional`):
                The unknown token to be used by the model.
            length (:obj:`int`, `optional`):
                The total number of sequences in the iterator. This is used to
                provide meaningful progress tracking
        """
        trainer = trainers.UnigramTrainer(
Reference in New Issue
Block a user