Python - Improve documentation for models
@@ -1,65 +1,100 @@
# Generated content DO NOT EDIT
class Model:
    """
-    A Model represents some tokenization algorithm like BPE or Word
+    Base class for all models
+
+    The model represents the actual tokenization algorithm. This is the part that
+    will contain and manage the learned vocabulary.
+
+    This class cannot be constructed directly. Please use one of the concrete models.
    """

    def id_to_token(self, id):
        """
-        Returns the token associated with the given id
+        Get the token associated with an ID
+
+        Args:
+            id (:obj:`int`):
+                An ID to convert to a token
+
+        Returns:
+            :obj:`str`: The token associated with the ID
        """
        pass
-    def save(self, folder, name):
+    def save(self, folder, prefix):
        """
        Save the current model

-        Save the current model in the given folder, using the given name for the various
+        Save the current model in the given folder, using the given prefix for the various
        files that will get created.
-        Any file with the same name that already exist in this folder will be overwritten.
+        Any file with the same name that already exists in this folder will be overwritten.
+
+        Args:
+            folder (:obj:`str`):
+                The path to the target folder in which to save the various files
+
+            prefix (:obj:`str`, `optional`):
+                An optional prefix, used to prefix each file name
+
+        Returns:
+            :obj:`List[str]`: The list of saved files
        """
        pass
    def token_to_id(self, tokens):
        """
-        Returns the id associated with the given token
+        Get the ID associated with a token
+
+        Args:
+            token (:obj:`str`):
+                A token to convert to an ID
+
+        Returns:
+            :obj:`int`: The ID associated with the token
        """
        pass
-    def tokenize(self, tokens):
+    def tokenize(self, sequence):
        """
-        Tokenize the given sequence
+        Tokenize a sequence
+
+        Args:
+            sequence (:obj:`str`):
+                A sequence to tokenize
+
+        Returns:
+            A :obj:`List` of :class:`~tokenizers.Token`: The generated tokens
        """
        pass

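To make this shared API concrete, here is a minimal usage sketch with a tiny hand-built BPE model (the vocab, merges, and target folder are illustrative; real vocabularies come from a trainer)::

    from tokenizers.models import BPE

    # a toy model; real vocabularies are produced by a trainer
    model = BPE(vocab={"a": 0, "b": 1, "ab": 2}, merges=[("a", "b")])

    model.token_to_id("ab")    # -> 2
    model.id_to_token(2)       # -> "ab"
    tokens = model.tokenize("ab")              # -> List[Token]
    files = model.save("./bpe-model", "toy")   # target folder assumed to exist
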
class BPE(Model):
    """
-    Instantiate a BPE Model from the given vocab and merges.
+    An implementation of the BPE (Byte-Pair Encoding) algorithm

    Args:
-        vocab: (`optional`) Dict[str, int]:
-            A dictionnary of string keys and their ids {"am": 0,...}
+        vocab (:obj:`Dict[str, int]`, `optional`):
+            A dictionary of string keys and their ids :obj:`{"am": 0,...}`

-        merges: (`optional`) string:
-            A list of pairs of tokens [("a", "b"),...]
+        merges (:obj:`List[Tuple[str, str]]`, `optional`):
+            A list of pairs of tokens (:obj:`Tuple[str, str]`) :obj:`[("a", "b"),...]`

-        cache_capacity: (`optional`) int:
-            The number of words that the BPE cache can contain. The cache allows
-            to speed-up the process by keeping the result of the merge operations
-            for a number of words.
+        cache_capacity (:obj:`int`, `optional`):
+            The number of words that the BPE cache can contain. The cache speeds up
+            the process by keeping the result of the merge operations for a number
+            of words.

-        dropout: (`optional`) Optional[float] [0, 1]:
-            The BPE dropout to use. Must be an float between 0 and 1
+        dropout (:obj:`float`, `optional`):
+            A float between 0 and 1 that represents the BPE dropout to use.

-        unk_token: (`optional`) str:
-            The unknown token to be used by the model.
+        unk_token (:obj:`str`, `optional`):
+            The unknown token to be used by the model.

-        continuing_subword_prefix: (`optional`) str:
-            The prefix to attach to subword units that don't represent a beginning of word.
+        continuing_subword_prefix (:obj:`str`, `optional`):
+            The prefix to attach to subword units that don't represent a beginning of word.

-        end_of_word_suffix: (`optional`) str:
-            The suffix to attach to subword units that represent an end of word.
+        end_of_word_suffix (:obj:`str`, `optional`):
+            The suffix to attach to subword units that represent an end of word.

-        fuse_unk: (`optional`) bool:
-            Multiple unk tokens get fused into only 1
+        fuse_unk (:obj:`bool`, `optional`):
+            Whether to fuse any subsequent unknown tokens into a single one
    """

    def __init__(
@@ -75,183 +110,372 @@ class BPE(Model):
    ):
        pass
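As a sketch of how some of these arguments combine (all values illustrative, not a recommended configuration)::

    from tokenizers.models import BPE

    model = BPE(
        vocab={"[UNK]": 0, "h": 1, "e": 2, "he": 3},
        merges=[("h", "e")],
        unk_token="[UNK]",  # characters not in the vocab map to this token
        dropout=0.1,        # BPE-dropout: each merge is skipped with probability 0.1
        fuse_unk=True,      # consecutive unknown tokens are fused into a single one
    )
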
    @staticmethod
-    def from_file(vocab_filename, merge_filename, **kwargs):
+    def from_file(cls, vocab, merge, **kwargs):
        """
-        Convenient method to intialize a BPE from files
-        Roughly equivalent to
-
-        def from_file(vocab_filename, merges_filenames, **kwargs):
-            vocab, merges = BPE.read_file(vocab_filename, merges_filename)
-            return BPE(vocab, merges, **kwargs)
+        Instantiate a BPE model from the given files.
+
+        This method is roughly equivalent to doing::
+
+           vocab, merges = BPE.read_file(vocab_filename, merges_filename)
+           bpe = BPE(vocab, merges)
+
+        If you don't need to keep the :obj:`vocab, merges` values lying around,
+        this method is more optimized than manually calling
+        :meth:`~tokenizers.models.BPE.read_file` to initialize a :class:`~tokenizers.models.BPE`
+
+        Args:
+            vocab (:obj:`str`):
+                The path to a :obj:`vocab.json` file
+
+            merges (:obj:`str`):
+                The path to a :obj:`merges.txt` file
+
+        Returns:
+            :class:`~tokenizers.models.BPE`: An instance of BPE loaded from these files
        """
        pass
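For instance, assuming a :obj:`vocab.json` and :obj:`merges.txt` produced by a BPE trainer (paths illustrative)::

    from tokenizers.models import BPE

    # extra keyword arguments are forwarded to the BPE constructor
    bpe = BPE.from_file("vocab.json", "merges.txt", unk_token="[UNK]")
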
    def id_to_token(self, id):
        """
-        Returns the token associated with the given id
+        Get the token associated with an ID
+
+        Args:
+            id (:obj:`int`):
+                An ID to convert to a token
+
+        Returns:
+            :obj:`str`: The token associated with the ID
        """
        pass
    @staticmethod
-    def read_file(self, vocab_filename, merges_filename):
+    def read_file(self, vocab, merges):
        """
-        Read a vocab_filename and merge_filename and stores result in memory
+        Read a :obj:`vocab.json` and a :obj:`merges.txt` file
+
+        This method provides a way to read and parse the content of these files,
+        returning the relevant data structures. If you want to instantiate some BPE models
+        from memory, this method gives you the expected input from the standard files.
+
+        Args:
+            vocab (:obj:`str`):
+                The path to a :obj:`vocab.json` file
+
+            merges (:obj:`str`):
+                The path to a :obj:`merges.txt` file
+
+        Returns:
+            A :obj:`Tuple` with the vocab and the merges:
+                The vocabulary and merges loaded into memory
        """
        pass
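This enables a from-memory path such as the sketch below, e.g. adjusting the vocabulary before building the model (paths and the added token are illustrative)::

    from tokenizers.models import BPE

    vocab, merges = BPE.read_file("vocab.json", "merges.txt")
    vocab["[MASK]"] = len(vocab)  # tweak the in-memory vocab before instantiation
    bpe = BPE(vocab, merges, unk_token="[UNK]")
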
-    def save(self, folder, name):
+    def save(self, folder, prefix):
        """
        Save the current model

-        Save the current model in the given folder, using the given name for the various
+        Save the current model in the given folder, using the given prefix for the various
        files that will get created.
-        Any file with the same name that already exist in this folder will be overwritten.
+        Any file with the same name that already exists in this folder will be overwritten.
+
+        Args:
+            folder (:obj:`str`):
+                The path to the target folder in which to save the various files
+
+            prefix (:obj:`str`, `optional`):
+                An optional prefix, used to prefix each file name
+
+        Returns:
+            :obj:`List[str]`: The list of saved files
        """
        pass
    def token_to_id(self, tokens):
        """
-        Returns the id associated with the given token
+        Get the ID associated with a token
+
+        Args:
+            token (:obj:`str`):
+                A token to convert to an ID
+
+        Returns:
+            :obj:`int`: The ID associated with the token
        """
        pass
-    def tokenize(self, tokens):
+    def tokenize(self, sequence):
        """
-        Tokenize the given sequence
+        Tokenize a sequence
+
+        Args:
+            sequence (:obj:`str`):
+                A sequence to tokenize
+
+        Returns:
+            A :obj:`List` of :class:`~tokenizers.Token`: The generated tokens
        """
        pass

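To make the :class:`~tokenizers.Token` return type concrete, a small self-contained inspection sketch (the :obj:`id`, :obj:`value`, and :obj:`offsets` attribute names are assumed here)::

    from tokenizers.models import BPE

    model = BPE(vocab={"a": 0, "b": 1, "ab": 2}, merges=[("a", "b")])
    for token in model.tokenize("ab"):
        print(token.id, token.value, token.offsets)
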
class Unigram(Model):
    """
-    UnigramEncoding model class
-
-    Instantiate a Unigram Model from the given model file.
+    An implementation of the Unigram algorithm

    Args:
-        vocab: ('`optional`) string:
-            A list of vocabulary items and their relative score [("am", -0.2442),...]
+        vocab (:obj:`List[Tuple[str, float]]`, `optional`):
+            A list of vocabulary items and their relative score :obj:`[("am", -0.2442),...]`
    """

    def __init__(self, vocab):
        pass
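A minimal construction sketch; each entry pairs a token with its relative score (typically a log probability), and all values here are illustrative::

    from tokenizers.models import Unigram

    # higher-scoring segmentations win during tokenization
    model = Unigram([("<unk>", 0.0), ("am", -0.2442), ("a", -1.5), ("m", -1.8)])
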
    def id_to_token(self, id):
        """
-        Returns the token associated with the given id
+        Get the token associated with an ID
+
+        Args:
+            id (:obj:`int`):
+                An ID to convert to a token
+
+        Returns:
+            :obj:`str`: The token associated with the ID
        """
        pass
-    def save(self, folder, name):
+    def save(self, folder, prefix):
        """
        Save the current model

-        Save the current model in the given folder, using the given name for the various
+        Save the current model in the given folder, using the given prefix for the various
        files that will get created.
-        Any file with the same name that already exist in this folder will be overwritten.
+        Any file with the same name that already exists in this folder will be overwritten.
+
+        Args:
+            folder (:obj:`str`):
+                The path to the target folder in which to save the various files
+
+            prefix (:obj:`str`, `optional`):
+                An optional prefix, used to prefix each file name
+
+        Returns:
+            :obj:`List[str]`: The list of saved files
        """
        pass
    def token_to_id(self, tokens):
        """
-        Returns the id associated with the given token
+        Get the ID associated with a token
+
+        Args:
+            token (:obj:`str`):
+                A token to convert to an ID
+
+        Returns:
+            :obj:`int`: The ID associated with the token
        """
        pass
-    def tokenize(self, tokens):
+    def tokenize(self, sequence):
        """
-        Tokenize the given sequence
+        Tokenize a sequence
+
+        Args:
+            sequence (:obj:`str`):
+                A sequence to tokenize
+
+        Returns:
+            A :obj:`List` of :class:`~tokenizers.Token`: The generated tokens
        """
        pass

class WordLevel(Model):
    """
-    Most simple tokenizer model based on mapping token from a vocab file to their corresponding id.
+    An implementation of the WordLevel algorithm

-    Instantiate a WordLevel Model from the given vocab file.
+    The simplest tokenizer model, based on mapping tokens to their corresponding id.

    Args:
-        vocab: (`optional`) string:
-            A dictionnary of string keys and their ids {"am": 0,...}
+        vocab (:obj:`Dict[str, int]`, `optional`):
+            A dictionary of string keys and their ids :obj:`{"am": 0,...}`

-        unk_token: str:
-            The unknown token to be used by the model.
+        unk_token (:obj:`str`, `optional`):
+            The unknown token to be used by the model.
    """

    def __init__(self, vocab, unk_token):
        pass
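A minimal sketch (vocab and unknown token illustrative); any token missing from the mapping is replaced by :obj:`unk_token`::

    from tokenizers.models import WordLevel

    model = WordLevel({"[UNK]": 0, "i": 1, "am": 2}, unk_token="[UNK]")
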
    def id_to_token(self, id):
        """
-        Returns the token associated with the given id
+        Get the token associated with an ID
+
+        Args:
+            id (:obj:`int`):
+                An ID to convert to a token
+
+        Returns:
+            :obj:`str`: The token associated with the ID
        """
        pass
-    def save(self, folder, name):
+    @staticmethod
+    def read_file(vocab):
+        """
+        Read a :obj:`vocab.json`
+
+        This method provides a way to read and parse the content of a vocabulary file,
+        returning the relevant data structures. If you want to instantiate some WordLevel models
+        from memory, this method gives you the expected input from the standard files.
+
+        Args:
+            vocab (:obj:`str`):
+                The path to a :obj:`vocab.json` file
+
+        Returns:
+            :obj:`Dict[str, int]`: The vocabulary as a :obj:`dict`
+        """
+        pass
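Combined with the constructor, this gives a from-file path (path illustrative)::

    from tokenizers.models import WordLevel

    vocab = WordLevel.read_file("vocab.json")  # -> Dict[str, int]
    model = WordLevel(vocab, unk_token="[UNK]")
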
+    def save(self, folder, prefix):
        """
        Save the current model

-        Save the current model in the given folder, using the given name for the various
+        Save the current model in the given folder, using the given prefix for the various
        files that will get created.
-        Any file with the same name that already exist in this folder will be overwritten.
+        Any file with the same name that already exists in this folder will be overwritten.
+
+        Args:
+            folder (:obj:`str`):
+                The path to the target folder in which to save the various files
+
+            prefix (:obj:`str`, `optional`):
+                An optional prefix, used to prefix each file name
+
+        Returns:
+            :obj:`List[str]`: The list of saved files
        """
        pass
    def token_to_id(self, tokens):
        """
-        Returns the id associated with the given token
+        Get the ID associated with a token
+
+        Args:
+            token (:obj:`str`):
+                A token to convert to an ID
+
+        Returns:
+            :obj:`int`: The ID associated with the token
        """
        pass
-    def tokenize(self, tokens):
+    def tokenize(self, sequence):
        """
-        Tokenize the given sequence
+        Tokenize a sequence
+
+        Args:
+            sequence (:obj:`str`):
+                A sequence to tokenize
+
+        Returns:
+            A :obj:`List` of :class:`~tokenizers.Token`: The generated tokens
        """
        pass

class WordPiece(Model):
    """
-    WordPiece model
-    Instantiate a WordPiece Model from the given vocab file.
+    An implementation of the WordPiece algorithm

    Args:
-        vocab: (`optional`) string:
-            A dictionnary of string keys and their ids {"am": 0,...}
+        vocab (:obj:`Dict[str, int]`, `optional`):
+            A dictionary of string keys and their ids :obj:`{"am": 0,...}`

-        unk_token: (`optional`) str:
+        unk_token (:obj:`str`, `optional`):
            The unknown token to be used by the model.

-        max_input_chars_per_word: (`optional`) int:
+        max_input_chars_per_word (:obj:`int`, `optional`):
            The maximum number of characters to authorize in a single word.
    """

    def __init__(self, vocab, unk_token, max_input_chars_per_word):
        pass
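A minimal construction sketch (values illustrative); continuation subwords conventionally carry the :obj:`##` prefix in WordPiece vocabularies::

    from tokenizers.models import WordPiece

    model = WordPiece(
        {"[UNK]": 0, "play": 1, "##ing": 2},
        unk_token="[UNK]",
        max_input_chars_per_word=100,  # longer words are mapped to unk_token
    )
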
    @staticmethod
-    def from_file(vocab_filename, merge_filename, **kwargs):
+    def from_file(vocab, **kwargs):
        """
-        Convenient method to intialize a WordPiece from files
-        Roughly equivalent to
-
-        def from_file(vocab_filename, **kwargs):
-            vocab = WordPiece.read_file(vocab_filename)
-            return WordPiece(vocab, **kwargs)
+        Instantiate a WordPiece model from the given file
+
+        This method is roughly equivalent to doing::
+
+            vocab = WordPiece.read_file(vocab_filename)
+            wordpiece = WordPiece(vocab)
+
+        If you don't need to keep the :obj:`vocab` values lying around, this method is
+        more optimized than manually calling :meth:`~tokenizers.models.WordPiece.read_file` to
+        initialize a :class:`~tokenizers.models.WordPiece`
+
+        Args:
+            vocab (:obj:`str`):
+                The path to a :obj:`vocab.txt` file
+
+        Returns:
+            :class:`~tokenizers.models.WordPiece`: An instance of WordPiece loaded from file
        """
        pass
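Usage mirrors :meth:`~tokenizers.models.BPE.from_file` (path illustrative)::

    from tokenizers.models import WordPiece

    wordpiece = WordPiece.from_file("vocab.txt", unk_token="[UNK]")
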
    def id_to_token(self, id):
        """
-        Returns the token associated with the given id
+        Get the token associated with an ID
+
+        Args:
+            id (:obj:`int`):
+                An ID to convert to a token
+
+        Returns:
+            :obj:`str`: The token associated with the ID
        """
        pass
    @staticmethod
-    def read_file(vocab_filename):
+    def read_file(vocab):
        """
-        Read a vocab_filename and stores result in memory
+        Read a :obj:`vocab.txt` file
+
+        This method provides a way to read and parse the content of a standard `vocab.txt`
+        file as used by the WordPiece Model, returning the relevant data structures. If you
+        want to instantiate some WordPiece models from memory, this method gives you the
+        expected input from the standard files.
+
+        Args:
+            vocab (:obj:`str`):
+                The path to a :obj:`vocab.txt` file
+
+        Returns:
+            :obj:`Dict[str, int]`: The vocabulary as a :obj:`dict`
        """
        pass
-    def save(self, folder, name):
+    def save(self, folder, prefix):
        """
        Save the current model

-        Save the current model in the given folder, using the given name for the various
+        Save the current model in the given folder, using the given prefix for the various
        files that will get created.
-        Any file with the same name that already exist in this folder will be overwritten.
+        Any file with the same name that already exists in this folder will be overwritten.
+
+        Args:
+            folder (:obj:`str`):
+                The path to the target folder in which to save the various files
+
+            prefix (:obj:`str`, `optional`):
+                An optional prefix, used to prefix each file name
+
+        Returns:
+            :obj:`List[str]`: The list of saved files
        """
        pass
    def token_to_id(self, tokens):
        """
-        Returns the id associated with the given token
+        Get the ID associated with a token
+
+        Args:
+            token (:obj:`str`):
+                A token to convert to an ID
+
+        Returns:
+            :obj:`int`: The ID associated with the token
        """
        pass
-    def tokenize(self, tokens):
+    def tokenize(self, sequence):
        """
-        Tokenize the given sequence
+        Tokenize a sequence
+
+        Args:
+            sequence (:obj:`str`):
+                A sequence to tokenize
+
+        Returns:
+            A :obj:`List` of :class:`~tokenizers.Token`: The generated tokens
        """
        pass