Mirror of https://github.com/mii443/tokenizers.git (synced 2025-12-17 01:28:46 +00:00).
Commit: "Python - Improved stub file for models".
This commit is contained in:
@@ -1,33 +1,85 @@
|
|||||||
from .. import models
|
from typing import List, Optional
|
||||||
|
|
||||||
class Model:
    """Base class for all models.

    This class is not supposed to be instantiated directly. Instead, any implementation
    of a Model will return an instance of this class when instantiated.
    """

    def save(self, folder: str, name: str) -> List[str]:
        """Save the current model.

        Save the current model in the given folder, using the given name for the
        various files that will get created.
        Any file with the same name that already exists in this folder will be
        overwritten.

        Args:
            folder: string:
                Path to the folder where the model files will be written.

            name: string:
                Base name used for the created files.

        Returns:
            The list of paths to the files that were created.
        """
        # Stub only: the concrete implementation lives in the native extension.
        pass
class BPE:
    """BytePairEncoding model class."""

    # Alternate constructors are invoked on the class itself, so they must be
    # static methods: without the decorator, `instance.from_files(...)` would
    # mis-bind the instance to `vocab`.
    @staticmethod
    def from_files(vocab: str,
                   merges: str,
                   cache_capacity: Optional[int],
                   dropout: Optional[float],
                   unk_token: Optional[int],
                   continuing_subword_prefix: Optional[str],
                   end_of_word_suffix: Optional[str]) -> Model:
        """Instantiate a BPE Model from the given vocab and merges files.

        Args:
            vocab: string:
                Path to a vocabulary JSON file.

            merges: string:
                Path to a merge file.

            cache_capacity: (`optional`) int:
                The number of words that the BPE cache can contain. The cache allows
                to speed-up the process by keeping the result of the merge operations
                for a number of words.

            dropout: (`optional`) float [0, 1]:
                The BPE dropout to use. Must be a float between 0 and 1.

            unk_token: (`optional`) int:
                The unknown token id to be used by the model.

            continuing_subword_prefix: (`optional`) str:
                The prefix to attach to subword units that don't represent a beginning of word.

            end_of_word_suffix: (`optional`) str:
                The suffix to attach to subword units that represent an end of word.
        """
        # Stub only: the concrete implementation lives in the native extension.
        pass

    @staticmethod
    def empty() -> Model:
        """Instantiate an empty BPE Model."""
        # Stub only: the concrete implementation lives in the native extension.
        pass
class WordPiece:
    """WordPiece model class."""

    # Alternate constructors are invoked on the class itself, so they must be
    # static methods: without the decorator, `instance.from_files(...)` would
    # mis-bind the instance to `vocab`.
    @staticmethod
    def from_files(vocab: str,
                   unk_token: Optional[str],
                   max_input_chars_per_word: Optional[int]) -> Model:
        """Instantiate a WordPiece Model from the given vocab file.

        Args:
            vocab: string:
                Path to a vocabulary file.

            unk_token: (`optional`) str:
                The unknown token to be used by the model.

            max_input_chars_per_word: (`optional`) int:
                The maximum number of characters to authorize in a single word.
        """
        # Stub only: the concrete implementation lives in the native extension.
        pass

    @staticmethod
    def empty() -> Model:
        """Instantiate an empty WordPiece Model."""
        # Stub only: the concrete implementation lives in the native extension.
        pass
Reference in New Issue
Block a user