Python - Improved stub file for models

2025-08-22 16:25:30 +00:00 · 2020-01-06 15:55:00 -05:00
parent 0e41e0b327
commit 1a083a6e6f
1 changed files with 64 additions and 12 deletions
--- a/bindings/python/tokenizers/models/init.pyi
+++ b/bindings/python/tokenizers/models/init.pyi
@ -1,33 +1,85 @@
-from .. import models
+from typing import List, Optional

 class Model:
-    """Model
+    """ Base class for all models
+
+    This class is not supposed to be instantiated directly. Instead, any implementation of
+    a Model will return an instance of this class when instantiated.
    """

    def save(folder: str, name: str) -> List[str]:
-        """ save
-        Save the current Model in the given folder, using the given name for the various
+        """ Save the current model
+
+        Save the current model in the given folder, using the given name for the various
        files that will get created.
-        Any file with the same name that already exist in this folder will be overwritten
+        Any file with the same name that already exists in this folder will be overwritten.
        """
        pass

-class BPE:
-    """BPE
-    """

-    def from_files(vocab: str, merges: str) -> Model:
+class BPE:
+    """ BytePairEncoding model class """
+
+    def from_files(vocab: str,
+                   merges: str,
+                   cache_capacity: Optional[int],
+                   dropout: Optional[float],
+                   unk_token: Optional[int],
+                   continuing_subword_prefix: Optional[str],
+                   end_of_word_suffix: Optional[str]) -> Model:
+        """ Instantiate a BPE Model from the given vocab and merges files.
+
+        Args:
+            vocab: string:
+                Path to a vocabulary JSON file.
+
+            merges: string:
+                Path to a merge file.
+
+            cache_capacity: (`optional`) int:
+                The number of words that the BPE cache can contain. The cache speeds
+                up the process by keeping the results of the merge operations
+                for a number of words.
+
+            dropout: (`optional`) Optional[float] [0, 1]:
+                The BPE dropout to use. Must be a float between 0 and 1.
+
+            unk_token: (`optional`) int:
+                The unknown token id to be used by the model.
+
+            continuing_subword_prefix: (`optional`) str:
+                The prefix to attach to subword units that don't represent a beginning of word.
+
+            end_of_word_suffix: (`optional`) str:
+                The suffix to attach to subword units that represent an end of word.
+        """
        pass

    def empty() -> Model:
+        """ Instantiate an empty BPE Model. """
        pass

+
 class WordPiece:
-    """WordPiece
-    """
+    """ WordPiece model class """

-    def from_files(vocab: str) -> Model:
+    def from_files(vocab: str,
+                   unk_token: Optional[str],
+                   max_input_chars_per_word: Optional[int]) -> Model:
+        """ Instantiate a WordPiece Model from the given vocab file.
+
+        Args:
+            vocab: string:
+                Path to a vocabulary file.
+
+            unk_token: (`optional`) str:
+                The unknown token to be used by the model.
+
+            max_input_chars_per_word: (`optional`) int:
+                The maximum number of characters to allow in a single word.
+        """
        pass

    def empty() -> Model:
+        """ Instantiate an empty WordPiece Model. """
        pass