Python - Improve documentation for models
@@ -1,65 +1,100 @@
# Generated content DO NOT EDIT
class Model:
    """
-    A Model represents some tokenization algorithm like BPE or Word
+    Base class for all models
+
+    The model represents the actual tokenization algorithm. This is the part that
+    will contain and manage the learned vocabulary.
+
+    This class cannot be constructed directly. Please use one of the concrete models.
    """

    def id_to_token(self, id):
        """
-        Returns the token associated with the given id
+        Get the token associated with an ID
+
+        Args:
+            id (:obj:`int`):
+                An ID to convert to a token
+
+        Returns:
+            :obj:`str`: The token associated with the ID
        """
        pass
-    def save(self, folder, name):
+    def save(self, folder, prefix):
        """
        Save the current model

-        Save the current model in the given folder, using the given name for the various
+        Save the current model in the given folder, using the given prefix for the various
        files that will get created.
-        Any file with the same name that already exist in this folder will be overwritten.
+        Any file with the same name that already exists in this folder will be overwritten.
+
+        Args:
+            folder (:obj:`str`):
+                The path to the target folder in which to save the various files
+
+            prefix (:obj:`str`, `optional`):
+                An optional prefix, used to prefix each file name
+
+        Returns:
+            :obj:`List[str]`: The list of saved files
        """
        pass
    def token_to_id(self, tokens):
        """
-        Returns the id associated with the given token
+        Get the ID associated with a token
+
+        Args:
+            token (:obj:`str`):
+                A token to convert to an ID
+
+        Returns:
+            :obj:`int`: The ID associated with the token
        """
        pass
-    def tokenize(self, tokens):
+    def tokenize(self, sequence):
        """
-        Tokenize the given sequence
+        Tokenize a sequence
+
+        Args:
+            sequence (:obj:`str`):
+                A sequence to tokenize
+
+        Returns:
+            A :obj:`List` of :class:`~tokenizers.Token`: The generated tokens
        """
        pass

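To make this shared API concrete, here is a minimal usage sketch with a tiny hand-built BPE model (the vocab, merges, and target folder are illustrative; real vocabularies come from a trainer)::

    from tokenizers.models import BPE

    # a toy model; real vocabularies are produced by a trainer
    model = BPE(vocab={"a": 0, "b": 1, "ab": 2}, merges=[("a", "b")])

    model.token_to_id("ab")    # -> 2
    model.id_to_token(2)       # -> "ab"
    tokens = model.tokenize("ab")              # -> List[Token]
    files = model.save("./bpe-model", "toy")   # target folder assumed to exist
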
class BPE(Model):
    """
-    Instantiate a BPE Model from the given vocab and merges.
+    An implementation of the BPE (Byte-Pair Encoding) algorithm

    Args:
-        vocab: (`optional`) Dict[str, int]:
-            A dictionnary of string keys and their ids {"am": 0,...}
+        vocab (:obj:`Dict[str, int]`, `optional`):
+            A dictionary of string keys and their ids :obj:`{"am": 0,...}`

-        merges: (`optional`) string:
-            A list of pairs of tokens [("a", "b"),...]
+        merges (:obj:`List[Tuple[str, str]]`, `optional`):
+            A list of pairs of tokens (:obj:`Tuple[str, str]`) :obj:`[("a", "b"),...]`

-        cache_capacity: (`optional`) int:
-            The number of words that the BPE cache can contain. The cache allows
-            to speed-up the process by keeping the result of the merge operations
-            for a number of words.
+        cache_capacity (:obj:`int`, `optional`):
+            The number of words that the BPE cache can contain. The cache speeds up
+            the process by keeping the result of the merge operations for a number
+            of words.

-        dropout: (`optional`) Optional[float] [0, 1]:
-            The BPE dropout to use. Must be an float between 0 and 1
+        dropout (:obj:`float`, `optional`):
+            A float between 0 and 1 that represents the BPE dropout to use.

-        unk_token: (`optional`) str:
-            The unknown token to be used by the model.
+        unk_token (:obj:`str`, `optional`):
+            The unknown token to be used by the model.

-        continuing_subword_prefix: (`optional`) str:
-            The prefix to attach to subword units that don't represent a beginning of word.
+        continuing_subword_prefix (:obj:`str`, `optional`):
+            The prefix to attach to subword units that don't represent a beginning of word.

-        end_of_word_suffix: (`optional`) str:
-            The suffix to attach to subword units that represent an end of word.
+        end_of_word_suffix (:obj:`str`, `optional`):
+            The suffix to attach to subword units that represent an end of word.

-        fuse_unk: (`optional`) bool:
-            Multiple unk tokens get fused into only 1
+        fuse_unk (:obj:`bool`, `optional`):
+            Whether to fuse any subsequent unknown tokens into a single one
    """

    def __init__(
@@ -75,183 +110,372 @@ class BPE(Model):
    ):
        pass
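As a sketch of how some of these arguments combine (all values illustrative, not a recommended configuration)::

    from tokenizers.models import BPE

    model = BPE(
        vocab={"[UNK]": 0, "h": 1, "e": 2, "he": 3},
        merges=[("h", "e")],
        unk_token="[UNK]",  # characters not in the vocab map to this token
        dropout=0.1,        # BPE-dropout: each merge is skipped with probability 0.1
        fuse_unk=True,      # consecutive unknown tokens are fused into a single one
    )
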
    @staticmethod
-    def from_file(vocab_filename, merge_filename, **kwargs):
+    def from_file(cls, vocab, merge, **kwargs):
        """
-        Convenient method to intialize a BPE from files
-        Roughly equivalent to
-
-        def from_file(vocab_filename, merges_filenames, **kwargs):
-            vocab, merges = BPE.read_file(vocab_filename, merges_filename)
-            return BPE(vocab, merges, **kwargs)
+        Instantiate a BPE model from the given files.
+
+        This method is roughly equivalent to doing::
+
+           vocab, merges = BPE.read_file(vocab_filename, merges_filename)
+           bpe = BPE(vocab, merges)
+
+        If you don't need to keep the :obj:`vocab, merges` values lying around,
+        this method is more optimized than manually calling
+        :meth:`~tokenizers.models.BPE.read_file` to initialize a :class:`~tokenizers.models.BPE`
+
+        Args:
+            vocab (:obj:`str`):
+                The path to a :obj:`vocab.json` file
+
+            merges (:obj:`str`):
+                The path to a :obj:`merges.txt` file
+
+        Returns:
+            :class:`~tokenizers.models.BPE`: An instance of BPE loaded from these files
        """
        pass
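For instance, assuming a :obj:`vocab.json` and :obj:`merges.txt` produced by a BPE trainer (paths illustrative)::

    from tokenizers.models import BPE

    # extra keyword arguments are forwarded to the BPE constructor
    bpe = BPE.from_file("vocab.json", "merges.txt", unk_token="[UNK]")
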
    def id_to_token(self, id):
        """
-        Returns the token associated with the given id
+        Get the token associated with an ID
+
+        Args:
+            id (:obj:`int`):
+                An ID to convert to a token
+
+        Returns:
+            :obj:`str`: The token associated with the ID
        """
        pass
    @staticmethod
-    def read_file(self, vocab_filename, merges_filename):
+    def read_file(self, vocab, merges):
        """
-        Read a vocab_filename and merge_filename and stores result in memory
+        Read a :obj:`vocab.json` and a :obj:`merges.txt` file
+
+        This method provides a way to read and parse the content of these files,
+        returning the relevant data structures. If you want to instantiate some BPE models
+        from memory, this method gives you the expected input from the standard files.
+
+        Args:
+            vocab (:obj:`str`):
+                The path to a :obj:`vocab.json` file
+
+            merges (:obj:`str`):
+                The path to a :obj:`merges.txt` file
+
+        Returns:
+            A :obj:`Tuple` with the vocab and the merges:
+                The vocabulary and merges loaded into memory
        """
        pass
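This enables a from-memory path such as the sketch below, e.g. adjusting the vocabulary before building the model (paths and the added token are illustrative)::

    from tokenizers.models import BPE

    vocab, merges = BPE.read_file("vocab.json", "merges.txt")
    vocab["[MASK]"] = len(vocab)  # tweak the in-memory vocab before instantiation
    bpe = BPE(vocab, merges, unk_token="[UNK]")
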
-    def save(self, folder, name):
+    def save(self, folder, prefix):
        """
        Save the current model

-        Save the current model in the given folder, using the given name for the various
+        Save the current model in the given folder, using the given prefix for the various
        files that will get created.
-        Any file with the same name that already exist in this folder will be overwritten.
+        Any file with the same name that already exists in this folder will be overwritten.
+
+        Args:
+            folder (:obj:`str`):
+                The path to the target folder in which to save the various files
+
+            prefix (:obj:`str`, `optional`):
+                An optional prefix, used to prefix each file name
+
+        Returns:
+            :obj:`List[str]`: The list of saved files
        """
        pass
    def token_to_id(self, tokens):
        """
-        Returns the id associated with the given token
+        Get the ID associated with a token
+
+        Args:
+            token (:obj:`str`):
+                A token to convert to an ID
+
+        Returns:
+            :obj:`int`: The ID associated with the token
        """
        pass
-    def tokenize(self, tokens):
+    def tokenize(self, sequence):
        """
-        Tokenize the given sequence
+        Tokenize a sequence
+
+        Args:
+            sequence (:obj:`str`):
+                A sequence to tokenize
+
+        Returns:
+            A :obj:`List` of :class:`~tokenizers.Token`: The generated tokens
        """
        pass

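To make the :class:`~tokenizers.Token` return type concrete, a small self-contained inspection sketch (the :obj:`id`, :obj:`value`, and :obj:`offsets` attribute names are assumed here)::

    from tokenizers.models import BPE

    model = BPE(vocab={"a": 0, "b": 1, "ab": 2}, merges=[("a", "b")])
    for token in model.tokenize("ab"):
        print(token.id, token.value, token.offsets)
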
class Unigram(Model):
    """
-    UnigramEncoding model class
-
-    Instantiate a Unigram Model from the given model file.
+    An implementation of the Unigram algorithm

    Args:
-        vocab: ('`optional`) string:
-            A list of vocabulary items and their relative score [("am", -0.2442),...]
+        vocab (:obj:`List[Tuple[str, float]]`, `optional`):
+            A list of vocabulary items and their relative score :obj:`[("am", -0.2442),...]`
    """

    def __init__(self, vocab):
        pass
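A minimal construction sketch; each entry pairs a token with its relative score (typically a log probability), and all values here are illustrative::

    from tokenizers.models import Unigram

    # higher-scoring segmentations win during tokenization
    model = Unigram([("<unk>", 0.0), ("am", -0.2442), ("a", -1.5), ("m", -1.8)])
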
    def id_to_token(self, id):
        """
-        Returns the token associated with the given id
+        Get the token associated with an ID
+
+        Args:
+            id (:obj:`int`):
+                An ID to convert to a token
+
+        Returns:
+            :obj:`str`: The token associated with the ID
        """
        pass
-    def save(self, folder, name):
+    def save(self, folder, prefix):
        """
        Save the current model

-        Save the current model in the given folder, using the given name for the various
+        Save the current model in the given folder, using the given prefix for the various
        files that will get created.
-        Any file with the same name that already exist in this folder will be overwritten.
+        Any file with the same name that already exists in this folder will be overwritten.
+
+        Args:
+            folder (:obj:`str`):
+                The path to the target folder in which to save the various files
+
+            prefix (:obj:`str`, `optional`):
+                An optional prefix, used to prefix each file name
+
+        Returns:
+            :obj:`List[str]`: The list of saved files
        """
        pass
    def token_to_id(self, tokens):
        """
-        Returns the id associated with the given token
+        Get the ID associated with a token
+
+        Args:
+            token (:obj:`str`):
+                A token to convert to an ID
+
+        Returns:
+            :obj:`int`: The ID associated with the token
        """
        pass
-    def tokenize(self, tokens):
+    def tokenize(self, sequence):
        """
-        Tokenize the given sequence
+        Tokenize a sequence
+
+        Args:
+            sequence (:obj:`str`):
+                A sequence to tokenize
+
+        Returns:
+            A :obj:`List` of :class:`~tokenizers.Token`: The generated tokens
        """
        pass

class WordLevel(Model):
    """
-    Most simple tokenizer model based on mapping token from a vocab file to their corresponding id.
+    An implementation of the WordLevel algorithm

-    Instantiate a WordLevel Model from the given vocab file.
+    The simplest tokenizer model, based on mapping tokens to their corresponding id.

    Args:
-        vocab: (`optional`) string:
-            A dictionnary of string keys and their ids {"am": 0,...}
+        vocab (:obj:`Dict[str, int]`, `optional`):
+            A dictionary of string keys and their ids :obj:`{"am": 0,...}`

-        unk_token: str:
-            The unknown token to be used by the model.
+        unk_token (:obj:`str`, `optional`):
+            The unknown token to be used by the model.
    """

    def __init__(self, vocab, unk_token):
        pass
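A minimal sketch (vocab and unknown token illustrative); any token missing from the mapping is replaced by :obj:`unk_token`::

    from tokenizers.models import WordLevel

    model = WordLevel({"[UNK]": 0, "i": 1, "am": 2}, unk_token="[UNK]")
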
    def id_to_token(self, id):
        """
-        Returns the token associated with the given id
+        Get the token associated with an ID
+
+        Args:
+            id (:obj:`int`):
+                An ID to convert to a token
+
+        Returns:
+            :obj:`str`: The token associated with the ID
        """
        pass
-    def save(self, folder, name):
+    @staticmethod
+    def read_file(vocab):
+        """
+        Read a :obj:`vocab.json`
+
+        This method provides a way to read and parse the content of a vocabulary file,
+        returning the relevant data structures. If you want to instantiate some WordLevel models
+        from memory, this method gives you the expected input from the standard files.
+
+        Args:
+            vocab (:obj:`str`):
+                The path to a :obj:`vocab.json` file
+
+        Returns:
+            :obj:`Dict[str, int]`: The vocabulary as a :obj:`dict`
+        """
+        pass
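Combined with the constructor, this gives a from-file path (path illustrative)::

    from tokenizers.models import WordLevel

    vocab = WordLevel.read_file("vocab.json")  # -> Dict[str, int]
    model = WordLevel(vocab, unk_token="[UNK]")
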
+    def save(self, folder, prefix):
        """
        Save the current model

-        Save the current model in the given folder, using the given name for the various
+        Save the current model in the given folder, using the given prefix for the various
        files that will get created.
-        Any file with the same name that already exist in this folder will be overwritten.
+        Any file with the same name that already exists in this folder will be overwritten.
+
+        Args:
+            folder (:obj:`str`):
+                The path to the target folder in which to save the various files
+
+            prefix (:obj:`str`, `optional`):
+                An optional prefix, used to prefix each file name
+
+        Returns:
+            :obj:`List[str]`: The list of saved files
        """
        pass
    def token_to_id(self, tokens):
        """
-        Returns the id associated with the given token
+        Get the ID associated with a token
+
+        Args:
+            token (:obj:`str`):
+                A token to convert to an ID
+
+        Returns:
+            :obj:`int`: The ID associated with the token
        """
        pass
-    def tokenize(self, tokens):
+    def tokenize(self, sequence):
        """
-        Tokenize the given sequence
+        Tokenize a sequence
+
+        Args:
+            sequence (:obj:`str`):
+                A sequence to tokenize
+
+        Returns:
+            A :obj:`List` of :class:`~tokenizers.Token`: The generated tokens
        """
        pass

class WordPiece(Model):
    """
-    WordPiece model
-    Instantiate a WordPiece Model from the given vocab file.
+    An implementation of the WordPiece algorithm

    Args:
-        vocab: (`optional`) string:
-            A dictionnary of string keys and their ids {"am": 0,...}
+        vocab (:obj:`Dict[str, int]`, `optional`):
+            A dictionary of string keys and their ids :obj:`{"am": 0,...}`

-        unk_token: (`optional`) str:
+        unk_token (:obj:`str`, `optional`):
            The unknown token to be used by the model.

-        max_input_chars_per_word: (`optional`) int:
+        max_input_chars_per_word (:obj:`int`, `optional`):
            The maximum number of characters to authorize in a single word.
    """

    def __init__(self, vocab, unk_token, max_input_chars_per_word):
        pass
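A minimal construction sketch (values illustrative); continuation subwords conventionally carry the :obj:`##` prefix in WordPiece vocabularies::

    from tokenizers.models import WordPiece

    model = WordPiece(
        {"[UNK]": 0, "play": 1, "##ing": 2},
        unk_token="[UNK]",
        max_input_chars_per_word=100,  # longer words are mapped to unk_token
    )
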
    @staticmethod
-    def from_file(vocab_filename, merge_filename, **kwargs):
+    def from_file(vocab, **kwargs):
        """
-        Convenient method to intialize a WordPiece from files
-        Roughly equivalent to
-
-        def from_file(vocab_filename, **kwargs):
-            vocab = WordPiece.read_file(vocab_filename)
-            return WordPiece(vocab, **kwargs)
+        Instantiate a WordPiece model from the given file
+
+        This method is roughly equivalent to doing::
+
+            vocab = WordPiece.read_file(vocab_filename)
+            wordpiece = WordPiece(vocab)
+
+        If you don't need to keep the :obj:`vocab` values lying around, this method is
+        more optimized than manually calling :meth:`~tokenizers.models.WordPiece.read_file` to
+        initialize a :class:`~tokenizers.models.WordPiece`
+
+        Args:
+            vocab (:obj:`str`):
+                The path to a :obj:`vocab.txt` file
+
+        Returns:
+            :class:`~tokenizers.models.WordPiece`: An instance of WordPiece loaded from file
        """
        pass
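Usage mirrors :meth:`~tokenizers.models.BPE.from_file` (path illustrative)::

    from tokenizers.models import WordPiece

    wordpiece = WordPiece.from_file("vocab.txt", unk_token="[UNK]")
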
    def id_to_token(self, id):
        """
-        Returns the token associated with the given id
+        Get the token associated with an ID
+
+        Args:
+            id (:obj:`int`):
+                An ID to convert to a token
+
+        Returns:
+            :obj:`str`: The token associated with the ID
        """
        pass
    @staticmethod
-    def read_file(vocab_filename):
+    def read_file(vocab):
        """
-        Read a vocab_filename and stores result in memory
+        Read a :obj:`vocab.txt` file
+
+        This method provides a way to read and parse the content of a standard `vocab.txt`
+        file as used by the WordPiece Model, returning the relevant data structures. If you
+        want to instantiate some WordPiece models from memory, this method gives you the
+        expected input from the standard files.
+
+        Args:
+            vocab (:obj:`str`):
+                The path to a :obj:`vocab.txt` file
+
+        Returns:
+            :obj:`Dict[str, int]`: The vocabulary as a :obj:`dict`
        """
        pass
-    def save(self, folder, name):
+    def save(self, folder, prefix):
        """
        Save the current model

-        Save the current model in the given folder, using the given name for the various
+        Save the current model in the given folder, using the given prefix for the various
        files that will get created.
-        Any file with the same name that already exist in this folder will be overwritten.
+        Any file with the same name that already exists in this folder will be overwritten.
+
+        Args:
+            folder (:obj:`str`):
+                The path to the target folder in which to save the various files
+
+            prefix (:obj:`str`, `optional`):
+                An optional prefix, used to prefix each file name
+
+        Returns:
+            :obj:`List[str]`: The list of saved files
        """
        pass
    def token_to_id(self, tokens):
        """
-        Returns the id associated with the given token
+        Get the ID associated with a token
+
+        Args:
+            token (:obj:`str`):
+                A token to convert to an ID
+
+        Returns:
+            :obj:`int`: The ID associated with the token
        """
        pass
-    def tokenize(self, tokens):
+    def tokenize(self, sequence):
        """
-        Tokenize the given sequence
+        Tokenize a sequence
+
+        Args:
+            sequence (:obj:`str`):
+                A sequence to tokenize
+
+        Returns:
+            A :obj:`List` of :class:`~tokenizers.Token`: The generated tokens
        """
        pass