Python - Improve documentation for models
bindings/python/py_src/tokenizers/models/__init__.pyi
@@ -1,65 +1,100 @@
 # Generated content DO NOT EDIT
 class Model:
     """
-    A Model represents some tokenization algorithm like BPE or Word
+    Base class for all models
+
+    The model represents the actual tokenization algorithm. This is the part that
+    will contain and manage the learned vocabulary.
+
     This class cannot be constructed directly. Please use one of the concrete models.
     """

     def id_to_token(self, id):
         """
-        Returns the token associated with the given id
+        Get the token associated to an ID
+
+        Args:
+            id (:obj:`int`):
+                An ID to convert to a token
+
+        Returns:
+            :obj:`str`: The token associated to the ID
         """
         pass
-    def save(self, folder, name):
+    def save(self, folder, prefix):
         """
         Save the current model

-        Save the current model in the given folder, using the given name for the various
+        Save the current model in the given folder, using the given prefix for the various
         files that will get created.
-        Any file with the same name that already exist in this folder will be overwritten.
+        Any file with the same name that already exists in this folder will be overwritten.
+
+        Args:
+            folder (:obj:`str`):
+                The path to the target folder in which to save the various files
+
+            prefix (:obj:`str`, `optional`):
+                An optional prefix, used to prefix each file name
+
+        Returns:
+            :obj:`List[str]`: The list of saved files
         """
         pass
     def token_to_id(self, tokens):
         """
-        Returns the id associated with the given token
+        Get the ID associated to a token
+
+        Args:
+            token (:obj:`str`):
+                A token to convert to an ID
+
+        Returns:
+            :obj:`int`: The ID associated to the token
         """
         pass
-    def tokenize(self, tokens):
+    def tokenize(self, sequence):
         """
-        Tokenize the given sequence
+        Tokenize a sequence
+
+        Args:
+            sequence (:obj:`str`):
+                A sequence to tokenize
+
+        Returns:
+            A :obj:`List` of :class:`~tokenizers.Token`: The generated tokens
         """
         pass

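Note: the refreshed `Model` docstring above describes the interface shared by all concrete models. A minimal sketch of how the documented methods fit together, using `BPE` as the concrete model (the tiny in-memory vocab and merges are invented for illustration):

```python
from tokenizers.models import BPE

# Toy vocab/merges, purely illustrative.
model = BPE({"a": 0, "b": 1, "ab": 2}, [("a", "b")])

tokens = model.tokenize("ab")                       # List of Token objects
ids = [model.token_to_id(t.value) for t in tokens]  # tokens back to IDs
assert [model.id_to_token(i) for i in ids] == [t.value for t in tokens]
```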
 class BPE(Model):
     """
-    Instantiate a BPE Model from the given vocab and merges.
+    An implementation of the BPE (Byte-Pair Encoding) algorithm

     Args:
-        vocab: ('`optional`) Dict[str, int]:
-            A dictionnary of string keys and their ids {"am": 0,...}
+        vocab (:obj:`Dict[str, int]`, `optional`):
+            A dictionnary of string keys and their ids :obj:`{"am": 0,...}`

-        merges: (`optional`) string:
-            A list of pairs of tokens [("a", "b"),...]
+        merges (:obj:`List[Tuple[str, str]]`, `optional`):
+            A list of pairs of tokens (:obj:`Tuple[str, str]`) :obj:`[("a", "b"),...]`

-        cache_capacity: (`optional`) int:
+        cache_capacity (:obj:`int`, `optional`):
             The number of words that the BPE cache can contain. The cache allows
             to speed-up the process by keeping the result of the merge operations
             for a number of words.

-        dropout: (`optional`) Optional[float] [0, 1]:
-            The BPE dropout to use. Must be an float between 0 and 1
+        dropout (:obj:`float`, `optional`):
+            A float between 0 and 1 that represents the BPE dropout to use.

-        unk_token: (`optional`) str:
+        unk_token (:obj:`str`, `optional`):
             The unknown token to be used by the model.

-        continuing_subword_prefix: (`optional`) str:
+        continuing_subword_prefix (:obj:`str`, `optional`):
             The prefix to attach to subword units that don't represent a beginning of word.

-        end_of_word_suffix: (`optional`) str:
+        end_of_word_suffix (:obj:`str`, `optional`):
             The suffix to attach to subword units that represent an end of word.

-        fuse_unk: (`optional`) bool:
-            Multiple unk tokens get fused into only 1
+        fuse_unk (:obj:`bool`, `optional`):
+            Whether to fuse any subsequent unknown tokens into a single one
     """

     def __init__(
@@ -75,183 +110,372 @@ class BPE(Model):
     ):
         pass
     @staticmethod
-    def from_file(vocab_filename, merge_filename, **kwargs):
+    def from_file(cls, vocab, merge, **kwargs):
         """
-        Convenient method to intialize a BPE from files
-        Roughly equivalent to
+        Instantiate a BPE model from the given files.

-        def from_file(vocab_filename, merges_filenames, **kwargs):
-            vocab, merges = BPE.read_file(vocab_filename, merges_filename)
-            return BPE(vocab, merges, **kwargs)
+        This method is roughly equivalent to doing::
+
+            vocab, merges = BPE.read_file(vocab_filename, merges_filename)
+            bpe = BPE(vocab, merges)
+
+        If you don't need to keep the :obj:`vocab, merges` values lying around,
+        this method is more optimized than manually calling
+        :meth:`~tokenizers.models.BPE.read_file` to initialize a :class:`~tokenizers.models.BPE`
+
+        Args:
+            vocab (:obj:`str`):
+                The path to a :obj:`vocab.json` file
+
+            merges (:obj:`str`):
+                The path to a :obj:`merges.txt` file
+
+        Returns:
+            :class:`~tokenizers.models.BPE`: An instance of BPE loaded from these files
         """
         pass
     def id_to_token(self, id):
         """
-        Returns the token associated with the given id
+        Get the token associated to an ID
+
+        Args:
+            id (:obj:`int`):
+                An ID to convert to a token
+
+        Returns:
+            :obj:`str`: The token associated to the ID
         """
         pass
     @staticmethod
-    def read_file(self, vocab_filename, merges_filename):
+    def read_file(self, vocab, merges):
         """
-        Read a vocab_filename and merge_filename and stores result in memory
+        Read a :obj:`vocab.json` and a :obj:`merges.txt` files
+
+        This method provides a way to read and parse the content of these files,
+        returning the relevant data structures. If you want to instantiate some BPE models
+        from memory, this method gives you the expected input from the standard files.
+
+        Args:
+            vocab (:obj:`str`):
+                The path to a :obj:`vocab.json` file
+
+            merges (:obj:`str`):
+                The path to a :obj:`merges.txt` file
+
+        Returns:
+            A :obj:`Tuple` with the vocab and the merges:
+                The vocabulary and merges loaded into memory
         """
         pass
-    def save(self, folder, name):
+    def save(self, folder, prefix):
         """
         Save the current model

-        Save the current model in the given folder, using the given name for the various
+        Save the current model in the given folder, using the given prefix for the various
         files that will get created.
-        Any file with the same name that already exist in this folder will be overwritten.
+        Any file with the same name that already exists in this folder will be overwritten.
+
+        Args:
+            folder (:obj:`str`):
+                The path to the target folder in which to save the various files
+
+            prefix (:obj:`str`, `optional`):
+                An optional prefix, used to prefix each file name
+
+        Returns:
+            :obj:`List[str]`: The list of saved files
         """
         pass
     def token_to_id(self, tokens):
         """
-        Returns the id associated with the given token
+        Get the ID associated to a token
+
+        Args:
+            token (:obj:`str`):
+                A token to convert to an ID
+
+        Returns:
+            :obj:`int`: The ID associated to the token
         """
         pass
-    def tokenize(self, tokens):
+    def tokenize(self, sequence):
         """
-        Tokenize the given sequence
+        Tokenize a sequence
+
+        Args:
+            sequence (:obj:`str`):
+                A sequence to tokenize
+
+        Returns:
+            A :obj:`List` of :class:`~tokenizers.Token`: The generated tokens
         """
         pass

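Note: the new `BPE.from_file` docstring spells out its rough equivalence to `read_file` plus the constructor. A hedged sketch of both paths (the file paths are hypothetical; any GPT-2-style `vocab.json`/`merges.txt` pair would do):

```python
from tokenizers.models import BPE

vocab_path, merges_path = "vocab.json", "merges.txt"  # hypothetical paths

# One-step load, as documented:
bpe = BPE.from_file(vocab_path, merges_path, unk_token="<unk>")

# The documented equivalence, spelled out by hand:
vocab, merges = BPE.read_file(vocab_path, merges_path)
bpe_manual = BPE(vocab, merges, unk_token="<unk>")
```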
 class Unigram(Model):
     """
-    UnigramEncoding model class
+    An implementation of the Unigram algorithm

-    Instantiate a Unigram Model from the given model file.
-
     Args:
-        vocab: ('`optional`) string:
+        vocab (:obj:`List[Tuple[str, float]]`, `optional`):
             A list of vocabulary items and their relative score [("am", -0.2442),...]
-
     """

     def __init__(self, vocab):
         pass
     def id_to_token(self, id):
         """
-        Returns the token associated with the given id
+        Get the token associated to an ID
+
+        Args:
+            id (:obj:`int`):
+                An ID to convert to a token
+
+        Returns:
+            :obj:`str`: The token associated to the ID
         """
         pass
-    def save(self, folder, name):
+    def save(self, folder, prefix):
         """
         Save the current model

-        Save the current model in the given folder, using the given name for the various
+        Save the current model in the given folder, using the given prefix for the various
         files that will get created.
-        Any file with the same name that already exist in this folder will be overwritten.
+        Any file with the same name that already exists in this folder will be overwritten.
+
+        Args:
+            folder (:obj:`str`):
+                The path to the target folder in which to save the various files
+
+            prefix (:obj:`str`, `optional`):
+                An optional prefix, used to prefix each file name
+
+        Returns:
+            :obj:`List[str]`: The list of saved files
         """
         pass
     def token_to_id(self, tokens):
         """
-        Returns the id associated with the given token
+        Get the ID associated to a token
+
+        Args:
+            token (:obj:`str`):
+                A token to convert to an ID
+
+        Returns:
+            :obj:`int`: The ID associated to the token
         """
         pass
-    def tokenize(self, tokens):
+    def tokenize(self, sequence):
         """
-        Tokenize the given sequence
+        Tokenize a sequence
+
+        Args:
+            sequence (:obj:`str`):
+                A sequence to tokenize
+
+        Returns:
+            A :obj:`List` of :class:`~tokenizers.Token`: The generated tokens
         """
         pass

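Note: per the corrected `vocab` type above, `Unigram` is constructed from a list of `(token, score)` pairs rather than a file path. A minimal sketch (the scores are made-up log-probabilities; real models learn them during training):

```python
from tokenizers.models import Unigram

# Invented scores, purely illustrative.
model = Unigram([("<unk>", 0.0), ("I", -1.52), ("am", -0.2442), ("Iam", -4.0)])
```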
 class WordLevel(Model):
     """
-    Most simple tokenizer model based on mapping token from a vocab file to their corresponding id.
+    An implementation of the WordLevel algorithm

-    Instantiate a WordLevel Model from the given vocab file.
+    Most simple tokenizer model based on mapping tokens to their corresponding id.

     Args:
-        vocab: (`optional`) string:
-            A dictionnary of string keys and their ids {"am": 0,...}
+        vocab (:obj:`str`, `optional`):
+            A dictionnary of string keys and their ids :obj:`{"am": 0,...}`

-        unk_token: str:
+        unk_token (:obj:`str`, `optional`):
             The unknown token to be used by the model.
     """

     def __init__(self, vocab, unk_token):
         pass
     def id_to_token(self, id):
         """
-        Returns the token associated with the given id
+        Get the token associated to an ID
+
+        Args:
+            id (:obj:`int`):
+                An ID to convert to a token
+
+        Returns:
+            :obj:`str`: The token associated to the ID
         """
         pass
-    def save(self, folder, name):
+    @staticmethod
+    def read_file(vocab):
+        """
+        Read a :obj:`vocab.json`
+
+        This method provides a way to read and parse the content of a vocabulary file,
+        returning the relevant data structures. If you want to instantiate some WordLevel models
+        from memory, this method gives you the expected input from the standard files.
+
+        Args:
+            vocab (:obj:`str`):
+                The path to a :obj:`vocab.json` file
+
+        Returns:
+            :obj:`Dict[str, int]`: The vocabulary as a :obj:`dict`
+        """
+        pass
+    def save(self, folder, prefix):
         """
         Save the current model

-        Save the current model in the given folder, using the given name for the various
+        Save the current model in the given folder, using the given prefix for the various
         files that will get created.
-        Any file with the same name that already exist in this folder will be overwritten.
+        Any file with the same name that already exists in this folder will be overwritten.
+
+        Args:
+            folder (:obj:`str`):
+                The path to the target folder in which to save the various files
+
+            prefix (:obj:`str`, `optional`):
+                An optional prefix, used to prefix each file name
+
+        Returns:
+            :obj:`List[str]`: The list of saved files
         """
         pass
     def token_to_id(self, tokens):
         """
-        Returns the id associated with the given token
+        Get the ID associated to a token
+
+        Args:
+            token (:obj:`str`):
+                A token to convert to an ID
+
+        Returns:
+            :obj:`int`: The ID associated to the token
         """
         pass
-    def tokenize(self, tokens):
+    def tokenize(self, sequence):
         """
-        Tokenize the given sequence
+        Tokenize a sequence
+
+        Args:
+            sequence (:obj:`str`):
+                A sequence to tokenize
+
+        Returns:
+            A :obj:`List` of :class:`~tokenizers.Token`: The generated tokens
         """
         pass

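Note: `WordLevel` is the plain lookup model described above. A minimal sketch with an in-memory vocabulary (the tokens are invented; unknown inputs map to `unk_token` during tokenization):

```python
from tokenizers.models import WordLevel

model = WordLevel({"<unk>": 0, "hello": 1, "world": 2}, unk_token="<unk>")
assert model.token_to_id("hello") == 1
assert model.token_to_id("unseen") is None  # raw lookup; unk_token applies in tokenize
```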
 class WordPiece(Model):
     """
-    WordPiece model
-    Instantiate a WordPiece Model from the given vocab file.
+    An implementation of the WordPiece algorithm

     Args:
-        vocab: (`optional`) string:
-            A dictionnary of string keys and their ids {"am": 0,...}
+        vocab (:obj:`Dict[str, int]`, `optional`):
+            A dictionnary of string keys and their ids :obj:`{"am": 0,...}`

-        unk_token: (`optional`) str:
+        unk_token (:obj:`str`, `optional`):
             The unknown token to be used by the model.

-        max_input_chars_per_word: (`optional`) int:
+        max_input_chars_per_word (:obj:`int`, `optional`):
             The maximum number of characters to authorize in a single word.
     """

     def __init__(self, vocab, unk_token, max_input_chars_per_word):
         pass
     @staticmethod
-    def from_file(vocab_filename, merge_filename, **kwargs):
+    def from_file(vocab, **kwargs):
         """
-        Convenient method to intialize a WordPiece from files
-        Roughly equivalent to
+        Instantiate a WordPiece model from the given file
+
+        This method is roughly equivalent to doing::

-        def from_file(vocab_filename, **kwargs):
             vocab = WordPiece.read_file(vocab_filename)
-            return WordPiece(vocab, **kwargs)
+            wordpiece = WordPiece(vocab)
+
+        If you don't need to keep the :obj:`vocab` values lying around, this method is
+        more optimized than manually calling :meth:`~tokenizers.models.WordPiece.read_file` to
+        initialize a :class:`~tokenizers.models.WordPiece`
+
+        Args:
+            vocab (:obj:`str`):
+                The path to a :obj:`vocab.txt` file
+
+        Returns:
+            :class:`~tokenizers.models.WordPiece`: And instance of WordPiece loaded from file
         """
         pass
     def id_to_token(self, id):
         """
-        Returns the token associated with the given id
+        Get the token associated to an ID
+
+        Args:
+            id (:obj:`int`):
+                An ID to convert to a token
+
+        Returns:
+            :obj:`str`: The token associated to the ID
         """
         pass
     @staticmethod
-    def read_file(vocab_filename):
+    def read_file(vocab):
         """
-        Read a vocab_filename and stores result in memory
+        Read a :obj:`vocab.txt` file
+
+        This method provides a way to read and parse the content of a standard `vocab.txt`
+        file as used by the WordPiece Model, returning the relevant data structures. If you
+        want to instantiate some WordPiece models from memory, this method gives you the
+        expected input from the standard files.
+
+        Args:
+            vocab (:obj:`str`):
+                The path to a :obj:`vocab.txt` file
+
+        Returns:
+            :obj:`Dict[str, int]`: The vocabulary as a :obj:`dict`
         """
         pass
-    def save(self, folder, name):
+    def save(self, folder, prefix):
         """
         Save the current model

-        Save the current model in the given folder, using the given name for the various
+        Save the current model in the given folder, using the given prefix for the various
         files that will get created.
-        Any file with the same name that already exist in this folder will be overwritten.
+        Any file with the same name that already exists in this folder will be overwritten.
+
+        Args:
+            folder (:obj:`str`):
+                The path to the target folder in which to save the various files
+
+            prefix (:obj:`str`, `optional`):
+                An optional prefix, used to prefix each file name
+
+        Returns:
+            :obj:`List[str]`: The list of saved files
         """
         pass
     def token_to_id(self, tokens):
         """
-        Returns the id associated with the given token
+        Get the ID associated to a token
+
+        Args:
+            token (:obj:`str`):
+                A token to convert to an ID
+
+        Returns:
+            :obj:`int`: The ID associated to the token
         """
         pass
-    def tokenize(self, tokens):
+    def tokenize(self, sequence):
         """
-        Tokenize the given sequence
+        Tokenize a sequence
+
+        Args:
+            sequence (:obj:`str`):
+                A sequence to tokenize
+
+        Returns:
+            A :obj:`List` of :class:`~tokenizers.Token`: The generated tokens
         """
         pass
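Note: as with `BPE.from_file`, the new `WordPiece.from_file` docstring documents the shortcut over `read_file` plus the constructor. A hedged sketch (the `vocab.txt` path is hypothetical; any BERT-style vocab file works):

```python
from tokenizers.models import WordPiece

vocab_path = "vocab.txt"  # hypothetical path

wordpiece = WordPiece.from_file(vocab_path, unk_token="[UNK]")

# The documented rough equivalence:
vocab = WordPiece.read_file(vocab_path)  # Dict[str, int]
wordpiece_manual = WordPiece(vocab, unk_token="[UNK]")
```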
bindings/python/src/models.rs
@@ -18,7 +18,11 @@ use tokenizers as tk;

 use super::error::{deprecation_warning, ToPyResult};

-/// A Model represents some tokenization algorithm like BPE or Word
+/// Base class for all models
+///
+/// The model represents the actual tokenization algorithm. This is the part that
+/// will contain and manage the learned vocabulary.
+///
 /// This class cannot be constructed directly. Please use one of the concrete models.
 #[pyclass(module = "tokenizers.models", name=Model)]
 #[derive(Clone, Serialize, Deserialize)]
@@ -113,23 +117,44 @@ impl PyModel {
         }
     }

-    /// Tokenize the given sequence
-    #[text_signature = "(self, tokens)"]
-    fn tokenize(&self, tokens: &str) -> PyResult<Vec<PyToken>> {
-        Ok(ToPyResult(self.model.read().unwrap().tokenize(tokens))
+    /// Tokenize a sequence
+    ///
+    /// Args:
+    ///     sequence (:obj:`str`):
+    ///         A sequence to tokenize
+    ///
+    /// Returns:
+    ///     A :obj:`List` of :class:`~tokenizers.Token`: The generated tokens
+    #[text_signature = "(self, sequence)"]
+    fn tokenize(&self, sequence: &str) -> PyResult<Vec<PyToken>> {
+        Ok(ToPyResult(self.model.read().unwrap().tokenize(sequence))
             .into_py()?
             .into_iter()
             .map(|t| t.into())
             .collect())
     }

-    /// Returns the id associated with the given token
+    /// Get the ID associated to a token
+    ///
+    /// Args:
+    ///     token (:obj:`str`):
+    ///         A token to convert to an ID
+    ///
+    /// Returns:
+    ///     :obj:`int`: The ID associated to the token
     #[text_signature = "(self, tokens)"]
     fn token_to_id(&self, token: &str) -> Option<u32> {
         self.model.read().unwrap().token_to_id(token)
     }

-    /// Returns the token associated with the given id
+    /// Get the token associated to an ID
+    ///
+    /// Args:
+    ///     id (:obj:`int`):
+    ///         An ID to convert to a token
+    ///
+    /// Returns:
+    ///     :obj:`str`: The token associated to the ID
     #[text_signature = "(self, id)"]
     fn id_to_token(&self, id: u32) -> Option<String> {
         self.model.read().unwrap().id_to_token(id)
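Note: these are the Rust bindings behind the stub methods above. A sketch of the round trip the docs describe, reading the fields exposed by the bindings' `PyToken` getters (`.id` and `.value`; the toy model is invented):

```python
from tokenizers.models import BPE

model = BPE({"h": 0, "i": 1, "hi": 2}, [("h", "i")])  # toy model
for token in model.tokenize("hi"):
    assert model.token_to_id(token.value) == token.id
    assert model.id_to_token(token.id) == token.value
```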
@@ -137,13 +162,23 @@ impl PyModel {

     /// Save the current model
     ///
-    /// Save the current model in the given folder, using the given name for the various
+    /// Save the current model in the given folder, using the given prefix for the various
     /// files that will get created.
-    /// Any file with the same name that already exist in this folder will be overwritten.
-    #[text_signature = "(self, folder, name)"]
-    fn save(&self, folder: &str, name: Option<&str>) -> PyResult<Vec<String>> {
+    /// Any file with the same name that already exists in this folder will be overwritten.
+    ///
+    /// Args:
+    ///     folder (:obj:`str`):
+    ///         The path to the target folder in which to save the various files
+    ///
+    ///     prefix (:obj:`str`, `optional`):
+    ///         An optional prefix, used to prefix each file name
+    ///
+    /// Returns:
+    ///     :obj:`List[str]`: The list of saved files
+    #[text_signature = "(self, folder, prefix)"]
+    fn save(&self, folder: &str, prefix: Option<&str>) -> PyResult<Vec<String>> {
         let saved: PyResult<Vec<_>> =
-            ToPyResult(self.model.read().unwrap().save(Path::new(folder), name)).into();
+            ToPyResult(self.model.read().unwrap().save(Path::new(folder), prefix)).into();

         Ok(saved?
             .into_iter()
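Note: `save` now takes an optional `prefix` instead of `name`, as documented above. A sketch of the documented behavior; the exact file names emitted depend on the concrete model, so the ones in the comment are an assumption for BPE:

```python
import tempfile
from tokenizers.models import BPE

model = BPE({"a": 0, "b": 1, "ab": 2}, [("a", "b")])  # toy model
with tempfile.TemporaryDirectory() as folder:
    files = model.save(folder, "my-model")
    # Expect something like ["<folder>/my-model-vocab.json", "<folder>/my-model-merges.txt"]
    print(files)
```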
@@ -151,39 +186,46 @@ impl PyModel {
             .collect())
     }

+    /// Get the associated :class:`~tokenizers.trainers.Trainer`
+    ///
+    /// Retrieve the :class:`~tokenizers.trainers.Trainer` associated to this
+    /// :class:`~tokenizers.models.Model`.
+    ///
+    /// Returns:
+    ///     :class:`~tokenizers.trainers.Trainer`: The Trainer used to train this model
     fn get_trainer(&self) -> PyResult<PyObject> {
         PyTrainer::from(self.model.read().unwrap().get_trainer()).get_as_subtype()
     }
 }

-/// Instantiate a BPE Model from the given vocab and merges.
+/// An implementation of the BPE (Byte-Pair Encoding) algorithm
 ///
 /// Args:
-///     vocab: ('`optional`) Dict[str, int]:
-///         A dictionnary of string keys and their ids {"am": 0,...}
+///     vocab (:obj:`Dict[str, int]`, `optional`):
+///         A dictionnary of string keys and their ids :obj:`{"am": 0,...}`
 ///
-///     merges: (`optional`) string:
-///         A list of pairs of tokens [("a", "b"),...]
+///     merges (:obj:`List[Tuple[str, str]]`, `optional`):
+///         A list of pairs of tokens (:obj:`Tuple[str, str]`) :obj:`[("a", "b"),...]`
 ///
-///     cache_capacity: (`optional`) int:
+///     cache_capacity (:obj:`int`, `optional`):
 ///         The number of words that the BPE cache can contain. The cache allows
 ///         to speed-up the process by keeping the result of the merge operations
 ///         for a number of words.
 ///
-///     dropout: (`optional`) Optional[float] [0, 1]:
-///         The BPE dropout to use. Must be an float between 0 and 1
+///     dropout (:obj:`float`, `optional`):
+///         A float between 0 and 1 that represents the BPE dropout to use.
 ///
-///     unk_token: (`optional`) str:
+///     unk_token (:obj:`str`, `optional`):
 ///         The unknown token to be used by the model.
 ///
-///     continuing_subword_prefix: (`optional`) str:
+///     continuing_subword_prefix (:obj:`str`, `optional`):
 ///         The prefix to attach to subword units that don't represent a beginning of word.
 ///
-///     end_of_word_suffix: (`optional`) str:
+///     end_of_word_suffix (:obj:`str`, `optional`):
 ///         The suffix to attach to subword units that represent an end of word.
 ///
-///     fuse_unk: (`optional`) bool:
-///         Multiple unk tokens get fused into only 1
+///     fuse_unk (:obj:`bool`, `optional`):
+///         Whether to fuse any subsequent unknown tokens into a single one
 #[pyclass(extends=PyModel, module = "tokenizers.models", name=BPE)]
 #[text_signature = "(self, vocab=None, merges=None, cache_capacity=None, dropout=None, unk_token=None, continuing_subword_prefix=None, end_of_word_suffix=None, fuse_unk=None)"]
 pub struct PyBPE {}
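Note: `get_trainer` gains documentation in this hunk. A minimal sketch of the documented behavior: each concrete model is expected to hand back its matching trainer subtype (for `BPE`, presumably a `trainers.BpeTrainer`):

```python
from tokenizers.models import BPE

model = BPE()                  # all constructor arguments are optional
trainer = model.get_trainer()  # for a BPE model, expected to be a BpeTrainer
print(type(trainer))
```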
@@ -276,35 +318,65 @@ impl PyBPE {
         PyBPE::with_builder(builder, kwargs)
     }

-    /// Read a vocab_filename and merge_filename and stores result in memory
+    /// Read a :obj:`vocab.json` and a :obj:`merges.txt` files
+    ///
+    /// This method provides a way to read and parse the content of these files,
+    /// returning the relevant data structures. If you want to instantiate some BPE models
+    /// from memory, this method gives you the expected input from the standard files.
+    ///
+    /// Args:
+    ///     vocab (:obj:`str`):
+    ///         The path to a :obj:`vocab.json` file
+    ///
+    ///     merges (:obj:`str`):
+    ///         The path to a :obj:`merges.txt` file
+    ///
+    /// Returns:
+    ///     A :obj:`Tuple` with the vocab and the merges:
+    ///         The vocabulary and merges loaded into memory
     #[staticmethod]
-    #[text_signature = "(self, vocab_filename, merges_filename)"]
-    fn read_file(vocab_filename: &str, merges_filename: &str) -> PyResult<(Vocab, Merges)> {
-        BPE::read_file(vocab_filename, merges_filename).map_err(|e| {
-            exceptions::PyValueError::new_err(format!(
-                "Error while reading vocab&merges files: {}",
+    #[text_signature = "(self, vocab, merges)"]
+    fn read_file(vocab: &str, merges: &str) -> PyResult<(Vocab, Merges)> {
+        BPE::read_file(vocab, merges).map_err(|e| {
+            exceptions::PyException::new_err(format!(
+                "Error while reading vocab & merges files: {}",
                 e
             ))
         })
     }

-    /// Convenient method to intialize a BPE from files
-    /// Roughly equivalent to
+    /// Instantiate a BPE model from the given files.
     ///
-    /// def from_file(vocab_filename, merges_filenames, **kwargs):
-    ///     vocab, merges = BPE.read_file(vocab_filename, merges_filename)
-    ///     return BPE(vocab, merges, **kwargs)
-    #[staticmethod]
+    /// This method is roughly equivalent to doing::
+    ///
+    ///     vocab, merges = BPE.read_file(vocab_filename, merges_filename)
+    ///     bpe = BPE(vocab, merges)
+    ///
+    /// If you don't need to keep the :obj:`vocab, merges` values lying around,
+    /// this method is more optimized than manually calling
+    /// :meth:`~tokenizers.models.BPE.read_file` to initialize a :class:`~tokenizers.models.BPE`
+    ///
+    /// Args:
+    ///     vocab (:obj:`str`):
+    ///         The path to a :obj:`vocab.json` file
+    ///
+    ///     merges (:obj:`str`):
+    ///         The path to a :obj:`merges.txt` file
+    ///
+    /// Returns:
+    ///     :class:`~tokenizers.models.BPE`: An instance of BPE loaded from these files
+    #[classmethod]
     #[args(kwargs = "**")]
-    #[text_signature = "(vocab_filename, merge_filename, **kwargs)"]
+    #[text_signature = "(cls, vocab, merge, **kwargs)"]
     fn from_file(
+        _cls: &PyType,
         py: Python,
-        vocab_filename: &str,
-        merges_filename: &str,
+        vocab: &str,
+        merges: &str,
         kwargs: Option<&PyDict>,
     ) -> PyResult<Py<Self>> {
-        let (vocab, merges) = BPE::read_file(vocab_filename, merges_filename).map_err(|e| {
-            exceptions::PyValueError::new_err(format!("Error while reading BPE files: {}", e))
+        let (vocab, merges) = BPE::read_file(vocab, merges).map_err(|e| {
+            exceptions::PyException::new_err(format!("Error while reading BPE files: {}", e))
         })?;
         Py::new(
             py,
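Note: alongside the docs, this hunk swaps the error raised while reading files from `PyValueError` to `PyException`, so on the Python side a bare `except ValueError` clause no longer matches. A sketch:

```python
from tokenizers.models import BPE

try:
    BPE.read_file("missing-vocab.json", "missing-merges.txt")  # hypothetical bad paths
except Exception as e:  # after this change, a plain Exception, not ValueError
    print("read failed:", e)
```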
@@ -317,17 +389,16 @@ impl PyBPE {
     }
 }

-/// WordPiece model
-/// Instantiate a WordPiece Model from the given vocab file.
+/// An implementation of the WordPiece algorithm
 ///
 /// Args:
-///     vocab: (`optional`) string:
-///         A dictionnary of string keys and their ids {"am": 0,...}
+///     vocab (:obj:`Dict[str, int]`, `optional`):
+///         A dictionnary of string keys and their ids :obj:`{"am": 0,...}`
 ///
-///     unk_token: (`optional`) str:
+///     unk_token (:obj:`str`, `optional`):
 ///         The unknown token to be used by the model.
 ///
-///     max_input_chars_per_word: (`optional`) int:
+///     max_input_chars_per_word (:obj:`int`, `optional`):
 ///         The maximum number of characters to authorize in a single word.
 #[pyclass(extends=PyModel, module = "tokenizers.models", name=WordPiece)]
 #[text_signature = "(self, vocab, unk_token, max_input_chars_per_word)"]
@@ -394,42 +465,70 @@ impl PyWordPiece {
         PyWordPiece::with_builder(builder, kwargs)
     }

-    /// Read a vocab_filename and stores result in memory
+    /// Read a :obj:`vocab.txt` file
+    ///
+    /// This method provides a way to read and parse the content of a standard `vocab.txt`
+    /// file as used by the WordPiece Model, returning the relevant data structures. If you
+    /// want to instantiate some WordPiece models from memory, this method gives you the
+    /// expected input from the standard files.
+    ///
+    /// Args:
+    ///     vocab (:obj:`str`):
+    ///         The path to a :obj:`vocab.txt` file
+    ///
+    /// Returns:
+    ///     :obj:`Dict[str, int]`: The vocabulary as a :obj:`dict`
     #[staticmethod]
-    #[text_signature = "(vocab_filename)"]
-    fn read_file(vocab_filename: &str) -> PyResult<Vocab> {
-        WordPiece::read_file(vocab_filename).map_err(|e| {
-            exceptions::PyValueError::new_err(format!("Error while reading WordPiece file: {}", e))
+    #[text_signature = "(vocab)"]
+    fn read_file(vocab: &str) -> PyResult<Vocab> {
+        WordPiece::read_file(vocab).map_err(|e| {
+            exceptions::PyException::new_err(format!("Error while reading WordPiece file: {}", e))
         })
     }

-    /// Convenient method to intialize a WordPiece from files
-    /// Roughly equivalent to
+    /// Instantiate a WordPiece model from the given file
+    ///
+    /// This method is roughly equivalent to doing::
     ///
-    /// def from_file(vocab_filename, **kwargs):
     ///     vocab = WordPiece.read_file(vocab_filename)
-    ///     return WordPiece(vocab, **kwargs)
-    #[staticmethod]
+    ///     wordpiece = WordPiece(vocab)
+    ///
+    /// If you don't need to keep the :obj:`vocab` values lying around, this method is
+    /// more optimized than manually calling :meth:`~tokenizers.models.WordPiece.read_file` to
+    /// initialize a :class:`~tokenizers.models.WordPiece`
+    ///
+    /// Args:
+    ///     vocab (:obj:`str`):
+    ///         The path to a :obj:`vocab.txt` file
+    ///
+    /// Returns:
+    ///     :class:`~tokenizers.models.WordPiece`: And instance of WordPiece loaded from file
+    #[classmethod]
     #[args(kwargs = "**")]
-    #[text_signature = "(vocab_filename, merge_filename, **kwargs)"]
-    fn from_file(py: Python, vocab_filename: &str, kwargs: Option<&PyDict>) -> PyResult<Py<Self>> {
-        let vocab = WordPiece::read_file(vocab_filename).map_err(|e| {
-            exceptions::PyValueError::new_err(format!("Error while reading WordPiece file: {}", e))
+    #[text_signature = "(vocab, **kwargs)"]
+    fn from_file(
+        _cls: &PyType,
+        py: Python,
+        vocab: &str,
+        kwargs: Option<&PyDict>,
+    ) -> PyResult<Py<Self>> {
+        let vocab = WordPiece::read_file(vocab).map_err(|e| {
+            exceptions::PyException::new_err(format!("Error while reading WordPiece file: {}", e))
         })?;
         Py::new(py, PyWordPiece::new(Some(PyVocab::Vocab(vocab)), kwargs)?)
     }
 }

-/// Most simple tokenizer model based on mapping token from a vocab file to their corresponding id.
+/// An implementation of the WordLevel algorithm
 ///
-/// Instantiate a WordLevel Model from the given vocab file.
+/// Most simple tokenizer model based on mapping tokens to their corresponding id.
 ///
 /// Args:
-///     vocab: (`optional`) string:
-///         A dictionnary of string keys and their ids {"am": 0,...}
+///     vocab (:obj:`str`, `optional`):
+///         A dictionnary of string keys and their ids :obj:`{"am": 0,...}`
 ///
-///     unk_token: str:
+///     unk_token (:obj:`str`, `optional`):
 ///         The unknown token to be used by the model.
 #[pyclass(extends=PyModel, module = "tokenizers.models", name=WordLevel)]
 #[text_signature = "(self, vocab, unk_token)"]
 pub struct PyWordLevel {}
@@ -492,31 +591,63 @@ impl PyWordLevel {
         }
     }

+    /// Read a :obj:`vocab.json`
+    ///
+    /// This method provides a way to read and parse the content of a vocabulary file,
+    /// returning the relevant data structures. If you want to instantiate some WordLevel models
+    /// from memory, this method gives you the expected input from the standard files.
+    ///
+    /// Args:
+    ///     vocab (:obj:`str`):
+    ///         The path to a :obj:`vocab.json` file
+    ///
+    /// Returns:
+    ///     :obj:`Dict[str, int]`: The vocabulary as a :obj:`dict`
     #[staticmethod]
-    fn read_file(vocab_filename: &str) -> PyResult<Vocab> {
-        WordLevel::read_file(vocab_filename).map_err(|e| {
-            exceptions::PyValueError::new_err(format!("Error while reading WordLevel file: {}", e))
+    #[text_signature = "(vocab)"]
+    fn read_file(vocab: &str) -> PyResult<Vocab> {
+        WordLevel::read_file(vocab).map_err(|e| {
+            exceptions::PyException::new_err(format!("Error while reading WordLevel file: {}", e))
         })
     }

-    #[staticmethod]
+    /// Instantiate a WordLevel model from the given file
+    ///
+    /// This method is roughly equivalent to doing::
+    ///
+    ///     vocab = WordLevel.read_file(vocab_filename)
+    ///     wordlevel = WordLevel(vocab)
+    ///
+    /// If you don't need to keep the :obj:`vocab` values lying around, this method is
+    /// more optimized than manually calling :meth:`~tokenizers.models.WordLevel.read_file` to
+    /// initialize a :class:`~tokenizers.models.WordLevel`
+    ///
+    /// Args:
+    ///     vocab (:obj:`str`):
+    ///         The path to a :obj:`vocab.json` file
+    ///
+    /// Returns:
+    ///     :class:`~tokenizers.models.WordLevel`: And instance of WordLevel loaded from file
+    #[classmethod]
     #[args(kwargs = "**")]
-    fn from_file(py: Python, vocab_filename: &str, kwargs: Option<&PyDict>) -> PyResult<Py<Self>> {
-        let vocab = WordLevel::read_file(vocab_filename).map_err(|e| {
-            exceptions::PyValueError::new_err(format!("Error while reading WordLevel file: {}", e))
+    fn from_file(
+        _cls: &PyType,
+        py: Python,
+        vocab: &str,
+        kwargs: Option<&PyDict>,
+    ) -> PyResult<Py<Self>> {
+        let vocab = WordLevel::read_file(vocab).map_err(|e| {
+            exceptions::PyException::new_err(format!("Error while reading WordLevel file: {}", e))
         })?;
         Py::new(py, PyWordLevel::new(Some(PyVocab::Vocab(vocab)), kwargs)?)
     }
 }

-/// UnigramEncoding model class
-///
-/// Instantiate a Unigram Model from the given model file.
+/// An implementation of the Unigram algorithm
 ///
 /// Args:
-///     vocab: ('`optional`) string:
+///     vocab (:obj:`List[Tuple[str, float]]`, `optional`):
 ///         A list of vocabulary items and their relative score [("am", -0.2442),...]
-///
 #[pyclass(extends=PyModel, module = "tokenizers.models", name=Unigram)]
 #[text_signature = "(self, vocab)"]
 pub struct PyUnigram {}