Mirror of https://github.com/mii443/tokenizers.git (synced 2025-12-08 05:38:23 +00:00)
Automatically stubbing the pyi files while keeping inspecting ability (#509)
* First pass on automatic stubbing of our Python files.
* And now modifying all Rust docs to be visible in .pyi files.
* Better assert fail message.
* Fixing GitHub workflow.
* Removing types not exported anymore.
* Fixing `Tokenizer` signature.
* Disabling auto __init__.py.
* Re-enabling some types.
* Don't overwrite non-automated __init__.py.
* Automated most __init__.py.
* Restubbing after rebase.
* Fixing env for tests.
* Install black in the env.
* Use PY35 target in stub.py.

Co-authored-by: Anthony MOI <m.anthony.moi@gmail.com>
File diff suppressed because it is too large
@@ -1,65 +1,87 @@
|
||||
from typing import List
|
||||
|
||||
# Generated content DO NOT EDIT
|
||||
class Decoder:
|
||||
"""Base class for all decoders
|
||||
"""
|
||||
Base class for all decoders
|
||||
|
||||
This class is not supposed to be instantiated directly. Instead, any implementation of
|
||||
a Decoder will return an instance of this class when instantiated.
|
||||
"""
|
||||
|
||||
def decode(self, tokens: List[str]) -> str:
|
||||
""" Decode the given list of string to a final string """
|
||||
pass
|
||||
|
||||
class ByteLevel(Decoder):
|
||||
""" ByteLevel Decoder """
|
||||
|
||||
def __init__(self) -> None:
|
||||
""" Instantiate a new ByteLevel Decoder """
|
||||
pass
|
||||
|
||||
class WordPiece(Decoder):
|
||||
""" WordPiece Decoder """
|
||||
|
||||
@staticmethod
|
||||
def __init__(self, prefix: str = "##", cleanup: bool = True) -> Decoder:
|
||||
"""Instantiate a new WordPiece Decoder
|
||||
|
||||
Args:
|
||||
prefix: str:
|
||||
The prefix to use for subwords that are not a beginning-of-word
|
||||
cleanup: bool:
|
||||
Whether to clean up some tokenization artifacts. Mainly spaces before punctuation,
|
||||
and some abbreviated English forms.
|
||||
def decode(self, tokens):
|
||||
"""
|
||||
pass
|
||||
|
||||
class Metaspace(Decoder):
|
||||
""" Metaspace decoder """
|
||||
|
||||
def __init__(self, replacement: str = "▁", add_prefix_space: bool = True) -> None:
|
||||
"""Instantiate a new Metaspace
|
||||
|
||||
Args:
|
||||
replacement: str:
|
||||
The replacement character. Must be exactly one character. By default we
|
||||
use the `▁` (U+2581) meta symbol (Same as in SentencePiece).
|
||||
|
||||
add_prefix_space: boolean:
|
||||
Whether to add a space to the first word if there isn't already one. This
|
||||
lets us treat `hello` exactly like `say hello`.
|
||||
Decode the given list of string to a final string
|
||||
"""
|
||||
pass
|
||||
|
||||
class BPEDecoder(Decoder):
|
||||
""" BPEDecoder """
|
||||
"""
|
||||
Instantiate a new BPEDecoder
|
||||
|
||||
def __init__(self, suffix: str = "</w>") -> None:
|
||||
"""Instantiate a new BPEDecoder
|
||||
Args:
|
||||
suffix: str:
|
||||
The suffix that was used to characterize an end-of-word. This suffix will
|
||||
be replaced by whitespaces during the decoding
|
||||
"""
|
||||
|
||||
Args:
|
||||
suffix: str:
|
||||
The suffix that was used to characterize an end-of-word. This suffix will
|
||||
be replaced by whitespaces during the decoding
|
||||
def __init__(self, suffix="</w>"):
|
||||
pass
|
||||
def decode(self, tokens):
|
||||
"""
|
||||
Decode the given list of string to a final string
|
||||
"""
|
||||
pass
|
||||
|
||||
class ByteLevel(Decoder):
|
||||
"""
|
||||
ByteLevel Decoder
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
pass
|
||||
def decode(self, tokens):
|
||||
"""
|
||||
Decode the given list of string to a final string
|
||||
"""
|
||||
pass
|
||||
|
||||
class Metaspace(Decoder):
|
||||
"""
|
||||
Instantiate a new Metaspace
|
||||
|
||||
Args:
|
||||
replacement: str:
|
||||
The replacement character. Must be exactly one character. By default we
|
||||
use the `▁` (U+2581) meta symbol (Same as in SentencePiece).
|
||||
|
||||
add_prefix_space: boolean:
|
||||
Whether to add a space to the first word if there isn't already one. This
|
||||
lets us treat `hello` exactly like `say hello`.
|
||||
"""
|
||||
|
||||
def __init__(self, replacement="▁", add_prefix_space=True):
|
||||
pass
|
||||
def decode(self, tokens):
|
||||
"""
|
||||
Decode the given list of string to a final string
|
||||
"""
|
||||
pass
|
||||
|
||||
class WordPiece(Decoder):
|
||||
"""
|
||||
Instantiate a new WordPiece Decoder
|
||||
|
||||
Args:
|
||||
prefix: str:
|
||||
The prefix to use for subwords that are not a beginning-of-word
|
||||
cleanup: bool:
|
||||
Whether to clean up some tokenization artifacts. Mainly spaces before punctuation,
|
||||
and some abbreviated English forms.
|
||||
"""
|
||||
|
||||
def __init__(self, prefix="##", cleanup=True):
|
||||
pass
|
||||
def decode(self, tokens):
|
||||
"""
|
||||
Decode the given list of string to a final string
|
||||
"""
|
||||
pass
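
As a quick, hedged illustration of the decoder interface stubbed above (the token list and expected output are illustrative assumptions, not part of this diff), a WordPiece decoder merges "##"-prefixed sub-tokens back into words:

# Illustrative sketch only: assumes the standard `tokenizers.decoders` import path.
from tokenizers.decoders import WordPiece

decoder = WordPiece(prefix="##", cleanup=True)
# Sub-tokens carrying the "##" continuation prefix are merged back into one word.
print(decoder.decode(["un", "##aff", "##able"]))  # expected: "unaffable"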
|
||||
|
||||
@@ -1,9 +1,8 @@
|
||||
from typing import List, Tuple
|
||||
|
||||
from .. import models, Offsets
|
||||
# Generated content DO NOT EDIT
|
||||
from .. import models
|
||||
|
||||
Model = models.Model
|
||||
BPE = models.BPE
|
||||
WordPiece = models.WordPiece
|
||||
WordLevel = models.WordLevel
|
||||
Unigram = models.Unigram
|
||||
WordLevel = models.WordLevel
|
||||
WordPiece = models.WordPiece
|
||||
|
||||
@@ -1,34 +1,37 @@
|
||||
from .. import Encoding, Offsets, Token
|
||||
from typing import List, Optional, Union, Tuple, Dict
|
||||
|
||||
# Generated content DO NOT EDIT
|
||||
class Model:
|
||||
"""Base class for all models
|
||||
|
||||
This class is not supposed to be instantiated directly. Instead, any implementation of
|
||||
a Model will return an instance of this class when instantiated.
|
||||
"""
|
||||
A Model represents some tokenization algorithm like BPE or Word
|
||||
This class cannot be constructed directly. Please use one of the concrete models.
|
||||
"""
|
||||
|
||||
def tokenize(self, sequence: str) -> List[Token]:
|
||||
""" Tokenize the given sequence """
|
||||
def id_to_token(self, id):
|
||||
"""
|
||||
Returns the token associated with the given id
|
||||
"""
|
||||
pass
|
||||
def token_to_id(self, token: str) -> Optional[int]:
|
||||
""" Returns the id associated with the given token """
|
||||
pass
|
||||
def id_to_token(self, id: int) -> Optional[str]:
|
||||
""" Returns the token associated with the given id """
|
||||
pass
|
||||
def save(self, folder: str, name: Optional[str] = None) -> List[str]:
|
||||
"""Save the current model
|
||||
def save(self, folder, name):
|
||||
"""
|
||||
Save the current model
|
||||
|
||||
Save the current model in the given folder, using the given name for the various
|
||||
files that will get created.
|
||||
Any file with the same name that already exists in this folder will be overwritten.
|
||||
"""
|
||||
pass
|
||||
def token_to_id(self, tokens):
|
||||
"""
|
||||
Returns the id associated with the given token
|
||||
"""
|
||||
pass
|
||||
def tokenize(self, tokens):
|
||||
"""
|
||||
Tokenize the given sequence
|
||||
"""
|
||||
pass
|
||||
|
||||
class BPE(Model):
|
||||
"""BytePairEncoding model class
|
||||
|
||||
"""
|
||||
Instantiate a BPE Model from the given vocab and merges.
|
||||
|
||||
Args:
|
||||
@@ -61,21 +64,18 @@ class BPE(Model):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
vocab: Optional[Union[str, Dict[str, int]]],
|
||||
merges: Optional[Union[str, List[Tuple[str, str]]]],
|
||||
cache_capacity: Optional[int],
|
||||
dropout: Optional[float],
|
||||
unk_token: Optional[str],
|
||||
continuing_subword_prefix: Optional[str],
|
||||
end_of_word_suffix: Optional[str],
|
||||
fuse_unk: Optional[bool],
|
||||
vocab=None,
|
||||
merges=None,
|
||||
cache_capacity=None,
|
||||
dropout=None,
|
||||
unk_token=None,
|
||||
continuing_subword_prefix=None,
|
||||
end_of_word_suffix=None,
|
||||
fuse_unk=None,
|
||||
):
|
||||
pass
|
||||
@staticmethod
|
||||
def read_file(vocab_filename: str, merges_filename: str) -> Tuple[Vocab, Merges]:
|
||||
pass
|
||||
@staticmethod
|
||||
def from_file(vocab_filename: str, merges_filename: str, **kwargs) -> BPE:
|
||||
def from_file(vocab_filename, merge_filename, **kwargs):
|
||||
"""
|
||||
Convenient method to initialize a BPE from files
|
||||
Roughly equivalent to
|
||||
@@ -85,42 +85,73 @@ class BPE(Model):
|
||||
return BPE(vocab, merges, **kwargs)
|
||||
"""
|
||||
pass
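
A hedged sketch of the `from_file` convenience documented above; the file names and `unk_token` value are placeholders, not taken from the diff:

from tokenizers.models import BPE

# Load a BPE model from vocab/merges files (hypothetical paths).
bpe = BPE.from_file("vocab.json", "merges.txt", unk_token="<unk>")
# Per the docstring, this is roughly equivalent to:
#   vocab, merges = BPE.read_file("vocab.json", "merges.txt")
#   bpe = BPE(vocab, merges, unk_token="<unk>")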
|
||||
def id_to_token(self, id):
|
||||
"""
|
||||
Returns the token associated with the given id
|
||||
"""
|
||||
pass
|
||||
@staticmethod
|
||||
def read_file(self, vocab_filename, merges_filename):
|
||||
"""
|
||||
Read a vocab_filename and merge_filename and stores result in memory
|
||||
"""
|
||||
pass
|
||||
def save(self, folder, name):
|
||||
"""
|
||||
Save the current model
|
||||
|
||||
class WordPiece(Model):
|
||||
"""WordPiece model class
|
||||
Save the current model in the given folder, using the given name for the various
|
||||
files that will get created.
|
||||
Any file with the same name that already exists in this folder will be overwritten.
|
||||
"""
|
||||
pass
|
||||
def token_to_id(self, tokens):
|
||||
"""
|
||||
Returns the id associated with the given token
|
||||
"""
|
||||
pass
|
||||
def tokenize(self, tokens):
|
||||
"""
|
||||
Tokenize the given sequence
|
||||
"""
|
||||
pass
|
||||
|
||||
Instantiate a WordPiece Model from the given vocab file.
|
||||
class Unigram(Model):
|
||||
"""
|
||||
UnigramEncoding model class
|
||||
|
||||
Args:
|
||||
vocab: (`optional`) string:
|
||||
A dictionary of string keys and their ids {"am": 0,...}
|
||||
Instantiate a Unigram Model from the given model file.
|
||||
|
||||
unk_token: (`optional`) str:
|
||||
The unknown token to be used by the model.
|
||||
Args:
|
||||
vocab: (`optional`) string:
|
||||
A list of vocabulary items and their relative score [("am", -0.2442),...]
|
||||
|
||||
max_input_chars_per_word: (`optional`) int:
|
||||
The maximum number of characters to authorize in a single word.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
vocab: Optional[Union[str, Dict[str, int]]],
|
||||
unk_token: Optional[str],
|
||||
max_input_chars_per_word: Optional[int],
|
||||
):
|
||||
def __init__(self, vocab):
|
||||
pass
|
||||
@staticmethod
|
||||
def read_file(vocab_filename: str) -> Vocab:
|
||||
pass
|
||||
@staticmethod
|
||||
def from_file(vocab_filename: str, **kwargs) -> WordPiece:
|
||||
def id_to_token(self, id):
|
||||
"""
|
||||
Convenient method to initialize a WordPiece from file
|
||||
Roughly equivalent to
|
||||
Returns the token associated with the given id
|
||||
"""
|
||||
pass
|
||||
def save(self, folder, name):
|
||||
"""
|
||||
Save the current model
|
||||
|
||||
def from_file(vocab_filename, **kwargs):
|
||||
vocab, merges = WordPiece.read_file(vocab_filename)
|
||||
return WordPiece(vocab, **kwargs)
|
||||
Save the current model in the given folder, using the given name for the various
|
||||
files that will get created.
|
||||
Any file with the same name that already exists in this folder will be overwritten.
|
||||
"""
|
||||
pass
|
||||
def token_to_id(self, tokens):
|
||||
"""
|
||||
Returns the id associated with the given token
|
||||
"""
|
||||
pass
|
||||
def tokenize(self, tokens):
|
||||
"""
|
||||
Tokenize the given sequence
|
||||
"""
|
||||
pass
|
||||
|
||||
@@ -138,34 +169,89 @@ class WordLevel(Model):
|
||||
The unknown token to be used by the model.
|
||||
"""
|
||||
|
||||
def __init__(self, vocab: Optional[Union[str, Dict[str, int]]], unk_token: Optional[str]):
|
||||
def __init__(self, vocab, unk_token):
|
||||
pass
|
||||
@staticmethod
|
||||
def read_file(vocab_filename: str) -> Vocab:
|
||||
pass
|
||||
@staticmethod
|
||||
def from_file(vocab_filename: str, **kwargs) -> WordLevel:
|
||||
def id_to_token(self, id):
|
||||
"""
|
||||
Convenient method to initialize a WordLevel from file
|
||||
Returns the token associated with the given id
|
||||
"""
|
||||
pass
|
||||
def save(self, folder, name):
|
||||
"""
|
||||
Save the current model
|
||||
|
||||
Save the current model in the given folder, using the given name for the various
|
||||
files that will get created.
|
||||
Any file with the same name that already exists in this folder will be overwritten.
|
||||
"""
|
||||
pass
|
||||
def token_to_id(self, tokens):
|
||||
"""
|
||||
Returns the id associated with the given token
|
||||
"""
|
||||
pass
|
||||
def tokenize(self, tokens):
|
||||
"""
|
||||
Tokenize the given sequence
|
||||
"""
|
||||
pass
|
||||
|
||||
class WordPiece(Model):
|
||||
"""
|
||||
WordPiece model
|
||||
Instantiate a WordPiece Model from the given vocab file.
|
||||
|
||||
Args:
|
||||
vocab: (`optional`) string:
|
||||
A dictionary of string keys and their ids {"am": 0,...}
|
||||
|
||||
unk_token: (`optional`) str:
|
||||
The unknown token to be used by the model.
|
||||
|
||||
max_input_chars_per_word: (`optional`) int:
|
||||
The maximum number of characters to authorize in a single word.
|
||||
"""
|
||||
|
||||
def __init__(self, vocab, unk_token, max_input_chars_per_word):
|
||||
pass
|
||||
@staticmethod
|
||||
def from_file(vocab_filename, merge_filename, **kwargs):
|
||||
"""
|
||||
Convenient method to initialize a WordPiece from files
|
||||
Roughly equivalent to
|
||||
|
||||
def from_file(vocab_filename, **kwargs):
|
||||
vocab, merges = WordLevelg.read_file(vocab_filename)
|
||||
return WordLevelg(vocab, **kwargs)
|
||||
vocab = WordPiece.read_file(vocab_filename)
|
||||
return WordPiece(vocab, **kwargs)
|
||||
"""
|
||||
pass
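
Similarly, a hedged sketch of `WordPiece.from_file`; the file name and `unk_token` are illustrative placeholders:

from tokenizers.models import WordPiece

wp = WordPiece.from_file("vocab.txt", unk_token="[UNK]")
# Per the docstring, roughly equivalent to:
#   vocab = WordPiece.read_file("vocab.txt")
#   wp = WordPiece(vocab, unk_token="[UNK]")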
|
||||
|
||||
class Unigram(Model):
|
||||
"""UnigramEncoding model class
|
||||
|
||||
Instantiate a Unigram Model from the given model file.
|
||||
|
||||
Args:
|
||||
vocab: (`optional`) string:
|
||||
A list of vocabulary items and their relative score [("am", -0.2442),...]
|
||||
|
||||
"""
|
||||
|
||||
@staticmethod
|
||||
def __init__(self, vocab: Optional[List[Tuple[str, float]]]):
|
||||
def id_to_token(self, id):
|
||||
"""
|
||||
Returns the token associated with the given id
|
||||
"""
|
||||
pass
|
||||
@staticmethod
|
||||
def read_file(vocab_filename):
|
||||
"""
|
||||
Read a vocab_filename and stores result in memory
|
||||
"""
|
||||
pass
|
||||
def save(self, folder, name):
|
||||
"""
|
||||
Save the current model
|
||||
|
||||
Save the current model in the given folder, using the given name for the various
|
||||
files that will get created.
|
||||
Any file with the same name that already exists in this folder will be overwritten.
|
||||
"""
|
||||
pass
|
||||
def token_to_id(self, tokens):
|
||||
"""
|
||||
Returns the id associated with the given token
|
||||
"""
|
||||
pass
|
||||
def tokenize(self, tokens):
|
||||
"""
|
||||
Tokenize the given sequence
|
||||
"""
|
||||
pass
|
||||
|
||||
@@ -1,140 +1,258 @@
|
||||
from .. import NormalizedString
|
||||
from typing import Optional, List
|
||||
|
||||
# Generated content DO NOT EDIT
|
||||
class Normalizer:
|
||||
"""Base class for all normalizers
|
||||
"""
|
||||
Base class for all normalizers
|
||||
|
||||
This class is not supposed to be instantiated directly. Instead, any implementation of a
|
||||
Normalizer will return an instance of this class when instantiated.
|
||||
"""
|
||||
|
||||
def normalize(self, normalized: NormalizedString):
|
||||
""" Normalize the given NormalizedString in-place """
|
||||
def normalize(self, normalized):
|
||||
"""
|
||||
Normalize the given NormalizedString in-place
|
||||
"""
|
||||
pass
|
||||
def normalize_str(self, sequence: str) -> str:
|
||||
""" Normalize the given str """
|
||||
pass
|
||||
|
||||
class BertNormalizer(Normalizer):
|
||||
"""BertNormalizer
|
||||
|
||||
Takes care of normalizing raw text before giving it to a Bert model.
|
||||
This includes cleaning the text, handling accents, Chinese chars and lowercasing
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
clean_text: Optional[bool] = True,
|
||||
handle_chinese_chars: Optional[bool] = True,
|
||||
strip_accents: Optional[bool] = None,
|
||||
lowercase: Optional[bool] = True,
|
||||
) -> None:
|
||||
"""Instantiate a BertNormalizer with the given options.
|
||||
|
||||
Args:
|
||||
clean_text: (`optional`) boolean:
|
||||
Whether to clean the text, by removing any control characters
|
||||
and replacing all whitespaces by the classic one.
|
||||
|
||||
handle_chinese_chars: (`optional`) boolean:
|
||||
Whether to handle Chinese chars by putting spaces around them.
|
||||
|
||||
strip_accents: (`optional`) boolean:
|
||||
Whether to strip all accents. If this option is not specified (ie == None),
|
||||
then it will be determined by the value for `lowercase` (as in the original Bert).
|
||||
|
||||
lowercase: (`optional`) boolean:
|
||||
Whether to lowercase.
|
||||
|
||||
Returns:
|
||||
Normalizer
|
||||
def normalize_str(self, sequence):
|
||||
"""
|
||||
Normalize the given str
|
||||
"""
|
||||
pass
|
||||
|
||||
class NFD(Normalizer):
|
||||
""" NFD Unicode Normalizer """
|
||||
class BertNormalizer(Normalizer):
|
||||
"""
|
||||
BertNormalizer
|
||||
|
||||
def __init__(self) -> None:
|
||||
""" Instantiate a new NFD Normalizer """
|
||||
pass
|
||||
Takes care of normalizing raw text before giving it to a Bert model.
|
||||
This includes cleaning the text, handling accents, Chinese chars and lowercasing
|
||||
|
||||
class NFKD(Normalizer):
|
||||
""" NFKD Unicode Normalizer """
|
||||
Args:
|
||||
clean_text: (`optional`) boolean:
|
||||
Whether to clean the text, by removing any control characters
|
||||
and replacing all whitespaces by the classic one.
|
||||
|
||||
def __init__(self) -> None:
|
||||
""" Instantiate a new NFKD Normalizer """
|
||||
pass
|
||||
handle_chinese_chars: (`optional`) boolean:
|
||||
Whether to handle Chinese chars by putting spaces around them.
|
||||
|
||||
class NFC(Normalizer):
|
||||
""" NFC Unicode Normalizer """
|
||||
strip_accents: (`optional`) boolean:
|
||||
Whether to strip all accents. If this option is not specified (ie == None),
|
||||
then it will be determined by the value for `lowercase` (as in the original Bert).
|
||||
|
||||
def __init__(self) -> None:
|
||||
""" Instantiate a new NFC Normalizer """
|
||||
pass
|
||||
lowercase: (`optional`) boolean:
|
||||
Whether to lowercase.
|
||||
|
||||
class NFKC(Normalizer):
|
||||
""" NFKC Unicode Normalizer """
|
||||
|
||||
def __init__(self) -> None:
|
||||
""" Instantiate a new NFKC Normalizer """
|
||||
pass
|
||||
|
||||
class Sequence(Normalizer):
|
||||
"""Allows concatenating multiple other Normalizer as a Sequence.
|
||||
|
||||
All the normalizers run in sequence in the given order
|
||||
Returns:
|
||||
Normalizer
|
||||
"""
|
||||
|
||||
def __init__(self, normalizers: List[Normalizer]) -> None:
|
||||
"""Instantiate a new normalization Sequence using the given normalizers
|
||||
|
||||
Args:
|
||||
normalizers: List[Normalizer]:
|
||||
A list of Normalizer to be run as a sequence
|
||||
def __init__(
|
||||
self, clean_text=True, handle_chinese_chars=True, strip_accents=None, lowercase=True
|
||||
):
|
||||
pass
|
||||
def normalize(self, normalized):
|
||||
"""
|
||||
Normalize the given NormalizedString in-place
|
||||
"""
|
||||
pass
|
||||
def normalize_str(self, sequence):
|
||||
"""
|
||||
Normalize the given str
|
||||
"""
|
||||
pass
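
A minimal, hedged sketch of the BertNormalizer options listed above (the input string is illustrative):

from tokenizers.normalizers import BertNormalizer

norm = BertNormalizer(clean_text=True, handle_chinese_chars=True,
                      strip_accents=None, lowercase=True)
# With strip_accents left as None, accent stripping follows `lowercase`,
# as in the original Bert.
print(norm.normalize_str("Héllo World"))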
|
||||
|
||||
class Lowercase(Normalizer):
|
||||
""" Lowercase Normalizer """
|
||||
"""
|
||||
Lowercase Normalizer
|
||||
"""
|
||||
|
||||
def __init__(self) -> None:
|
||||
""" Instantiate a new Lowercase Normalizer """
|
||||
def __init__(self):
|
||||
pass
|
||||
def normalize(self, normalized):
|
||||
"""
|
||||
Normalize the given NormalizedString in-place
|
||||
"""
|
||||
pass
|
||||
def normalize_str(self, sequence):
|
||||
"""
|
||||
Normalize the given str
|
||||
"""
|
||||
pass
|
||||
|
||||
class Strip(Normalizer):
|
||||
""" Strip normalizer """
|
||||
class NFC(Normalizer):
|
||||
"""
|
||||
NFC Unicode Normalizer
|
||||
"""
|
||||
|
||||
def __init__(self, left: bool = True, right: bool = True) -> Normalizer:
|
||||
def __init__(self):
|
||||
pass
|
||||
def normalize(self, normalized):
|
||||
"""
|
||||
Normalize the given NormalizedString in-place
|
||||
"""
|
||||
pass
|
||||
def normalize_str(self, sequence):
|
||||
"""
|
||||
Normalize the given str
|
||||
"""
|
||||
pass
|
||||
|
||||
class StripAccents(Normalizer):
|
||||
""" StripAccents normalizer """
|
||||
class NFD(Normalizer):
|
||||
"""
|
||||
NFD Unicode Normalizer
|
||||
"""
|
||||
|
||||
def __init__(self) -> Normalizer:
|
||||
def __init__(self):
|
||||
pass
|
||||
def normalize(self, normalized):
|
||||
"""
|
||||
Normalize the given NormalizedString in-place
|
||||
"""
|
||||
pass
|
||||
def normalize_str(self, sequence):
|
||||
"""
|
||||
Normalize the given str
|
||||
"""
|
||||
pass
|
||||
|
||||
class NFKC(Normalizer):
|
||||
"""
|
||||
NFKC Unicode Normalizer
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
pass
|
||||
def normalize(self, normalized):
|
||||
"""
|
||||
Normalize the given NormalizedString in-place
|
||||
"""
|
||||
pass
|
||||
def normalize_str(self, sequence):
|
||||
"""
|
||||
Normalize the given str
|
||||
"""
|
||||
pass
|
||||
|
||||
class NFKD(Normalizer):
|
||||
"""
|
||||
NFKD Unicode Normalizer
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
pass
|
||||
def normalize(self, normalized):
|
||||
"""
|
||||
Normalize the given NormalizedString in-place
|
||||
"""
|
||||
pass
|
||||
def normalize_str(self, sequence):
|
||||
"""
|
||||
Normalize the given str
|
||||
"""
|
||||
pass
|
||||
|
||||
class Nmt(Normalizer):
|
||||
""" Nmt normalizer """
|
||||
"""
|
||||
Nmt normalizer
|
||||
"""
|
||||
|
||||
def __init__(self) -> Normalizer:
|
||||
def __init__(self):
|
||||
pass
|
||||
def normalize(self, normalized):
|
||||
"""
|
||||
Normalize the given NormalizedString in-place
|
||||
"""
|
||||
pass
|
||||
def normalize_str(self, sequence):
|
||||
"""
|
||||
Normalize the given str
|
||||
"""
|
||||
pass
|
||||
|
||||
class Precompiled(Normalizer):
|
||||
""" Precompiled normalizer """
|
||||
"""
|
||||
Precompiled normalizer
|
||||
Don't use manually; it is used for compatibility with SentencePiece.
|
||||
"""
|
||||
|
||||
def __init__(self, precompiled_charsmap: bytes) -> Normalizer:
|
||||
def __init__(self, precompiled_charsmap):
|
||||
pass
|
||||
def normalize(self, normalized):
|
||||
"""
|
||||
Normalize the given NormalizedString in-place
|
||||
"""
|
||||
pass
|
||||
def normalize_str(self, sequence):
|
||||
"""
|
||||
Normalize the given str
|
||||
"""
|
||||
pass
|
||||
|
||||
class Replace(Normalizer):
|
||||
""" Replace normalizer """
|
||||
"""
|
||||
Replace normalizer
|
||||
"""
|
||||
|
||||
def __init__(self, pattern: str, content: str) -> Normalizer:
|
||||
def __init__(self, pattern, content):
|
||||
pass
|
||||
def normalize(self, normalized):
|
||||
"""
|
||||
Normalize the given NormalizedString in-place
|
||||
"""
|
||||
pass
|
||||
def normalize_str(self, sequence):
|
||||
"""
|
||||
Normalize the given str
|
||||
"""
|
||||
pass
|
||||
|
||||
def unicode_normalizer_from_str(normalizer: str) -> Normalizer:
|
||||
class Sequence(Normalizer):
|
||||
"""
|
||||
Instantiate a unicode normalizer from the normalizer name
|
||||
:param normalizer: Name of the normalizer
|
||||
:return:
|
||||
Allows concatenating multiple other Normalizer as a Sequence.
|
||||
All the normalizers run in sequence in the given order
|
||||
|
||||
Args:
|
||||
normalizers: List[Normalizer]:
|
||||
A list of Normalizer to be run as a sequence
|
||||
"""
|
||||
pass
|
||||
|
||||
def normalize(self, normalized):
|
||||
"""
|
||||
Normalize the given NormalizedString in-place
|
||||
"""
|
||||
pass
|
||||
def normalize_str(self, sequence):
|
||||
"""
|
||||
Normalize the given str
|
||||
"""
|
||||
pass
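
A short, hedged sketch of composing normalizers with `Sequence`, as described above:

from tokenizers.normalizers import NFD, StripAccents, Lowercase, Sequence

norm = Sequence([NFD(), StripAccents(), Lowercase()])
# NFD decomposition, then accent stripping, then lowercasing, run in order.
print(norm.normalize_str("Héllo"))  # expected: "hello"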
|
||||
|
||||
class Strip(Normalizer):
|
||||
"""
|
||||
Strip normalizer
|
||||
"""
|
||||
|
||||
def __init__(self, left=True, right=True):
|
||||
pass
|
||||
def normalize(self, normalized):
|
||||
"""
|
||||
Normalize the given NormalizedString in-place
|
||||
"""
|
||||
pass
|
||||
def normalize_str(self, sequence):
|
||||
"""
|
||||
Normalize the given str
|
||||
"""
|
||||
pass
|
||||
|
||||
class StripAccents(Normalizer):
|
||||
def __init__(self):
|
||||
pass
|
||||
def normalize(self, normalized):
|
||||
"""
|
||||
Normalize the given NormalizedString in-place
|
||||
"""
|
||||
pass
|
||||
def normalize_str(self, sequence):
|
||||
"""
|
||||
Normalize the given str
|
||||
"""
|
||||
pass
|
||||
|
||||
@@ -1,13 +1,14 @@
|
||||
# Generated content DO NOT EDIT
|
||||
from .. import pre_tokenizers
|
||||
|
||||
PreTokenizer = pre_tokenizers.PreTokenizer
|
||||
ByteLevel = pre_tokenizers.ByteLevel
|
||||
Whitespace = pre_tokenizers.Whitespace
|
||||
Punctuation = pre_tokenizers.Punctuation
|
||||
Sequence = pre_tokenizers.Sequence
|
||||
WhitespaceSplit = pre_tokenizers.WhitespaceSplit
|
||||
BertPreTokenizer = pre_tokenizers.BertPreTokenizer
|
||||
Metaspace = pre_tokenizers.Metaspace
|
||||
ByteLevel = pre_tokenizers.ByteLevel
|
||||
CharDelimiterSplit = pre_tokenizers.CharDelimiterSplit
|
||||
Digits = pre_tokenizers.Digits
|
||||
Metaspace = pre_tokenizers.Metaspace
|
||||
Punctuation = pre_tokenizers.Punctuation
|
||||
Sequence = pre_tokenizers.Sequence
|
||||
UnicodeScripts = pre_tokenizers.UnicodeScripts
|
||||
Whitespace = pre_tokenizers.Whitespace
|
||||
WhitespaceSplit = pre_tokenizers.WhitespaceSplit
|
||||
|
||||
@@ -1,163 +1,242 @@
|
||||
from .. import PreTokenizedString
|
||||
from typing import Optional, List, Tuple
|
||||
|
||||
Offsets = Tuple[int, int]
|
||||
|
||||
# Generated content DO NOT EDIT
|
||||
class PreTokenizer:
|
||||
"""Base class for all pre-tokenizers
|
||||
"""
|
||||
Base class for all pre-tokenizers
|
||||
|
||||
This class is not supposed to be instantiated directly. Instead, any implementation of a
|
||||
PreTokenizer will return an instance of this class when instantiated.
|
||||
"""
|
||||
|
||||
def pre_tokenize(self, pretokenized: PreTokenizedString):
|
||||
""" Pre tokenize the given PreTokenizedString in-place """
|
||||
def pre_tokenize(self, pretok):
|
||||
"""
|
||||
Pre tokenize the given PreTokenizedString in-place
|
||||
"""
|
||||
pass
|
||||
def pre_tokenize_str(self, sequence: str) -> List[Tuple[str, Offsets]]:
|
||||
""" Pre tokenize the given sequence """
|
||||
def pre_tokenize_str(self, sequence):
|
||||
"""
|
||||
Pre tokenize the given sequence
|
||||
"""
|
||||
pass
|
||||
|
||||
class BertPreTokenizer(PreTokenizer):
|
||||
"""
|
||||
BertPreTokenizer
|
||||
|
||||
This pre-tokenizer splits tokens on spaces, and also on punctuation.
|
||||
Each occurrence of a punctuation character will be treated separately.
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
pass
|
||||
def pre_tokenize(self, pretok):
|
||||
"""
|
||||
Pre tokenize the given PreTokenizedString in-place
|
||||
"""
|
||||
pass
|
||||
def pre_tokenize_str(self, sequence):
|
||||
"""
|
||||
Pre tokenize the given sequence
|
||||
"""
|
||||
pass
|
||||
|
||||
class ByteLevel(PreTokenizer):
|
||||
"""ByteLevel PreTokenizer
|
||||
"""
|
||||
ByteLevel PreTokenizer
|
||||
|
||||
This pre-tokenizer takes care of replacing all bytes of the given string
|
||||
with a corresponding representation, as well as splitting into words.
|
||||
|
||||
Args:
|
||||
add_prefix_space: (`optional`) boolean:
|
||||
Whether to add a space to the first word if there isn't already one. This
|
||||
lets us treat `hello` exactly like `say hello`.
|
||||
Returns:
|
||||
PreTokenizer
|
||||
"""
|
||||
|
||||
def __init__(self, add_prefix_space: bool = True) -> None:
|
||||
"""Instantiate a new ByteLevel PreTokenizer
|
||||
Args:
|
||||
add_prefix_space: (`optional`) boolean:
|
||||
Whether to add a space to the first word if there isn't already one. This
|
||||
lets us treat `hello` exactly like `say hello`.
|
||||
Returns:
|
||||
PreTokenizer
|
||||
"""
|
||||
def __init__(self, add_prefix_space=True):
|
||||
pass
|
||||
@staticmethod
|
||||
def alphabet() -> List[str]:
|
||||
"""Returns the alphabet used by this PreTokenizer.
|
||||
def alphabet():
|
||||
"""
|
||||
Returns the alphabet used by this PreTokenizer.
|
||||
|
||||
Since the ByteLevel works as its name suggests, at the byte level, it
|
||||
encodes any byte to one visible character. This means that there is a
|
||||
total of 256 different characters composing this alphabet.
|
||||
"""
|
||||
pass
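
The docstring above states that the byte-level alphabet maps every byte to one visible character, so it should contain 256 entries; a quick sketch to check:

from tokenizers.pre_tokenizers import ByteLevel

alphabet = ByteLevel.alphabet()
print(len(alphabet))  # expected: 256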
|
||||
|
||||
class Whitespace(PreTokenizer):
|
||||
"""Whitespace PreTokenizer
|
||||
|
||||
This pre-tokenizer simply splits using the following regex: `\w+|[^\w\s]+`
|
||||
"""
|
||||
|
||||
def __init__(self) -> None:
|
||||
""" Instantiate a new Whitespace PreTokenizer """
|
||||
def pre_tokenize(self, pretok):
|
||||
"""
|
||||
Pre tokenize the given PreTokenizedString in-place
|
||||
"""
|
||||
pass
|
||||
|
||||
class WhitespaceSplit(PreTokenizer):
|
||||
"""Whitespace PreTokenizer
|
||||
|
||||
This pre-tokenizer simply splits on the whitespace. Works like `.split()`
|
||||
"""
|
||||
|
||||
def __init__(self) -> None:
|
||||
""" Instantiate a new WhitespaceSplit PreTokenizer """
|
||||
pass
|
||||
|
||||
class BertPreTokenizer(PreTokenizer):
|
||||
"""BertPreTokenizer
|
||||
|
||||
This pre-tokenizer splits tokens on spaces, and also on punctuation.
|
||||
Each occurrence of a punctuation character will be treated separately.
|
||||
"""
|
||||
|
||||
def __init__(self) -> None:
|
||||
""" Instantiate a new BertPreTokenizer """
|
||||
pass
|
||||
|
||||
class Metaspace(PreTokenizer):
|
||||
"""Metaspace pre-tokenizer
|
||||
|
||||
This pre-tokenizer replaces any whitespace by the provided replacement character.
|
||||
It then tries to split on these spaces.
|
||||
"""
|
||||
|
||||
def __init__(self, replacement: str = "▁", add_prefix_space: bool = True) -> None:
|
||||
"""Instantiate a new Metaspace
|
||||
|
||||
Args:
|
||||
replacement: str:
|
||||
The replacement character. Must be exactly one character. By default we
|
||||
use the `▁` (U+2581) meta symbol (Same as in SentencePiece).
|
||||
|
||||
add_prefix_space: boolean:
|
||||
Whether to add a space to the first word if there isn't already one. This
|
||||
lets us treat `hello` exactly like `say hello`.
|
||||
def pre_tokenize_str(self, sequence):
|
||||
"""
|
||||
Pre tokenize the given sequence
|
||||
"""
|
||||
pass
|
||||
|
||||
class CharDelimiterSplit(PreTokenizer):
|
||||
"""CharDelimiterSplit PreTokenizer
|
||||
|
||||
"""
|
||||
This pre-tokenizer simply splits on the provided char. Works like `.split(delimiter)`
|
||||
|
||||
Args:
|
||||
delimiter: str:
|
||||
The delimiter char that will be used to split input
|
||||
"""
|
||||
|
||||
@staticmethod
|
||||
def __init__(self, delimiter: str) -> None:
|
||||
"""Instantiate a new CharDelimiterSplit PreTokenizer
|
||||
def pre_tokenize(self, pretok):
|
||||
"""
|
||||
Pre tokenize the given PreTokenizedString in-place
|
||||
"""
|
||||
pass
|
||||
def pre_tokenize_str(self, sequence):
|
||||
"""
|
||||
Pre tokenize the given sequence
|
||||
"""
|
||||
pass
|
||||
|
||||
Args:
|
||||
delimiter: str:
|
||||
The delimiter char that will be used to split input
|
||||
class Digits(PreTokenizer):
|
||||
"""
|
||||
This pre-tokenizer simply splits the digits into separate tokens
|
||||
Args:
|
||||
individual_digits: bool:
|
||||
If set to True, digits will each be separated: "Call 123 please" -> "Call ", "1", "2", "3", " please"
|
||||
If set to False, digits will be grouped: "Call 123 please" -> "Call ", "123", " please"
|
||||
"""
|
||||
|
||||
def __init__(self, individual_digits=False):
|
||||
pass
|
||||
def pre_tokenize(self, pretok):
|
||||
"""
|
||||
Pre tokenize the given PreTokenizedString in-place
|
||||
"""
|
||||
pass
|
||||
def pre_tokenize_str(self, sequence):
|
||||
"""
|
||||
Pre tokenize the given sequence
|
||||
"""
|
||||
pass
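
A hedged sketch of the `Digits` behaviour documented above (offsets are returned alongside each piece but omitted from the comments):

from tokenizers.pre_tokenizers import Digits

print(Digits(individual_digits=True).pre_tokenize_str("Call 123 please"))
# expected pieces: "Call ", "1", "2", "3", " please"
print(Digits(individual_digits=False).pre_tokenize_str("Call 123 please"))
# expected pieces: "Call ", "123", " please"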
|
||||
|
||||
class Metaspace(PreTokenizer):
|
||||
"""
|
||||
Metaspace pre-tokenizer
|
||||
|
||||
This pre-tokenizer replaces any whitespace by the provided replacement character.
|
||||
It then tries to split on these spaces.
|
||||
Args:
|
||||
replacement: str:
|
||||
The replacement character. Must be exactly one character. By default we
|
||||
use the `▁` (U+2581) meta symbol (Same as in SentencePiece).
|
||||
|
||||
add_prefix_space: boolean:
|
||||
Whether to add a space to the first word if there isn't already one. This
|
||||
lets us treat `hello` exactly like `say hello`.
|
||||
"""
|
||||
|
||||
def __init__(self, replacement="▁", add_prefix_space=True):
|
||||
pass
|
||||
def pre_tokenize(self, pretok):
|
||||
"""
|
||||
Pre tokenize the given PreTokenizedString in-place
|
||||
"""
|
||||
pass
|
||||
def pre_tokenize_str(self, sequence):
|
||||
"""
|
||||
Pre tokenize the given sequence
|
||||
"""
|
||||
pass
|
||||
|
||||
class Punctuation(PreTokenizer):
|
||||
"""Punctuation PreTokenizer
|
||||
|
||||
"""
|
||||
This pre-tokenizer simply splits on punctuation as individual characters.
|
||||
"""
|
||||
|
||||
def __init__(self) -> None:
|
||||
""" Instantiate a new Punctuation PreTokenizer """
|
||||
def __init__(self):
|
||||
pass
|
||||
def pre_tokenize(self, pretok):
|
||||
"""
|
||||
Pre tokenize the given PreTokenizedString in-place
|
||||
"""
|
||||
pass
|
||||
def pre_tokenize_str(self, sequence):
|
||||
"""
|
||||
Pre tokenize the given sequence
|
||||
"""
|
||||
pass
|
||||
|
||||
class Sequence(PreTokenizer):
|
||||
"""Sequence PreTokenizer
|
||||
|
||||
This pre-tokenizer composes other pre_tokenizers and applies them in sequence
|
||||
"""
|
||||
This pre-tokenizer composes other pre_tokenizers and applies them in sequence
|
||||
"""
|
||||
|
||||
def __init__(self) -> None:
|
||||
""" Instantiate a new Sequence PreTokenizer """
|
||||
def __init__(self, pretokenizers):
|
||||
pass
|
||||
|
||||
class Digits(PreTokenizer):
|
||||
"""Digits PreTokenizer
|
||||
|
||||
This pre-tokenizer simply splits the digits into separate tokens
|
||||
"""
|
||||
|
||||
def __init__(self, individual_digits: bool) -> None:
|
||||
"""Instantiate a new Digits
|
||||
|
||||
Args:
|
||||
individual_digits: bool:
|
||||
If set to True, digits will each be separated: "Call 123 please" -> "Call ", "1", "2", "3", " please"
|
||||
If set to False, digits will be grouped: "Call 123 please" -> "Call ", "123", " please"
|
||||
|
||||
def pre_tokenize(self, pretok):
|
||||
"""
|
||||
Pre tokenize the given PreTokenizedString in-place
|
||||
"""
|
||||
pass
|
||||
def pre_tokenize_str(self, sequence):
|
||||
"""
|
||||
Pre tokenize the given sequence
|
||||
"""
|
||||
pass
|
||||
|
||||
class UnicodeScripts(PreTokenizer):
|
||||
"""UnicodeScripts PreTokenizer
|
||||
|
||||
"""
|
||||
This pre-tokenizer splits on characters that belong to different language families
|
||||
It roughly follows https://github.com/google/sentencepiece/blob/master/data/Scripts.txt
|
||||
Actually Hiragana and Katakana are fused with Han, and 0x30FC is Han too.
|
||||
This mimics the SentencePiece Unigram implementation.
|
||||
"""
|
||||
|
||||
def __init__(self) -> None:
|
||||
""" Instantiate a new UnicodeScripts """
|
||||
def __init__(self):
|
||||
pass
|
||||
def pre_tokenize(self, pretok):
|
||||
"""
|
||||
Pre tokenize the given PreTokenizedString in-place
|
||||
"""
|
||||
pass
|
||||
def pre_tokenize_str(self, sequence):
|
||||
"""
|
||||
Pre tokenize the given sequence
|
||||
"""
|
||||
pass
|
||||
|
||||
class Whitespace(PreTokenizer):
|
||||
"""
|
||||
This pre-tokenizer simply splits using the following regex: `\w+|[^\w\s]+`
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
pass
|
||||
def pre_tokenize(self, pretok):
|
||||
"""
|
||||
Pre tokenize the given PreTokenizedString in-place
|
||||
"""
|
||||
pass
|
||||
def pre_tokenize_str(self, sequence):
|
||||
"""
|
||||
Pre tokenize the given sequence
|
||||
"""
|
||||
pass
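
A hedged sketch of the `\w+|[^\w\s]+` splitting behaviour described above:

from tokenizers.pre_tokenizers import Whitespace

print(Whitespace().pre_tokenize_str("Hello, world!"))
# expected pieces: "Hello", ",", "world", "!" each with its (start, end) offsets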
|
||||
|
||||
class WhitespaceSplit(PreTokenizer):
|
||||
"""
|
||||
This pre-tokenizer simply splits on the whitespace. Works like `.split()`
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
pass
|
||||
def pre_tokenize(self, pretok):
|
||||
"""
|
||||
Pre tokenize the given PreTokenizedString in-place
|
||||
"""
|
||||
pass
|
||||
def pre_tokenize_str(self, sequence):
|
||||
"""
|
||||
Pre tokenize the given sequence
|
||||
"""
|
||||
pass
|
||||
|
||||
@@ -1,7 +1,8 @@
|
||||
# Generated content DO NOT EDIT
|
||||
from .. import processors
|
||||
|
||||
PostProcessor = processors.PostProcessor
|
||||
BertProcessing = processors.BertProcessing
|
||||
RobertaProcessing = processors.RobertaProcessing
|
||||
ByteLevel = processors.ByteLevel
|
||||
RobertaProcessing = processors.RobertaProcessing
|
||||
TemplateProcessing = processors.TemplateProcessing
|
||||
|
||||
@@ -1,53 +1,85 @@
|
||||
from .. import Encoding
|
||||
from typing import Tuple, Union, List
|
||||
|
||||
# Generated content DO NOT EDIT
|
||||
class PostProcessor:
|
||||
"""Base class for all post-processors
|
||||
"""
|
||||
Base class for all post-processors
|
||||
|
||||
This class is not supposed to be instantiated directly. Instead, any implementation of
|
||||
a PostProcessor will return an instance of this class when instantiated.
|
||||
"""
|
||||
|
||||
def num_special_tokens_to_add(self, is_pair: bool) -> int:
|
||||
def num_special_tokens_to_add(self, is_pair):
|
||||
"""
|
||||
Return the number of special tokens that would be added for single/pair sentences.
|
||||
:param is_pair: Boolean indicating if the input would be a single sentence or a pair
|
||||
:return:
|
||||
"""
|
||||
pass
|
||||
def process(
|
||||
self, encoding: Encoding, pair: Optional[Encoding] = None, add_special_tokens: bool = True
|
||||
) -> Encoding:
|
||||
""" Post-process the given encodings, generating the final one """
|
||||
def process(self, encoding, pair=None, add_special_tokens=True):
|
||||
"""
|
||||
Post-process the given encodings, generating the final one
|
||||
"""
|
||||
pass
|
||||
|
||||
class BertProcessing(PostProcessor):
|
||||
"""BertProcessing
|
||||
|
||||
"""
|
||||
This post-processor takes care of adding the special tokens needed by
|
||||
a Bert model:
|
||||
- a SEP token
|
||||
- a CLS token
|
||||
Args:
|
||||
sep: Tuple[str, int]:
|
||||
A tuple with the string representation of the SEP token, and its id
|
||||
|
||||
cls: Tuple[str, int]:
|
||||
A tuple with the string representation of the CLS token, and its id
|
||||
|
||||
Returns:
|
||||
PostProcessor
|
||||
"""
|
||||
|
||||
def __init__(self, sep: Tuple[str, int], cls: Tuple[str, int]) -> None:
|
||||
"""Instantiate a new BertProcessing with the given tokens
|
||||
def __init__(self, sep, cls):
|
||||
pass
|
||||
def num_special_tokens_to_add(self, is_pair):
|
||||
"""
|
||||
Return the number of special tokens that would be added for single/pair sentences.
|
||||
:param is_pair: Boolean indicating if the input would be a single sentence or a pair
|
||||
:return:
|
||||
"""
|
||||
pass
|
||||
def process(self, encoding, pair=None, add_special_tokens=True):
|
||||
"""
|
||||
Post-process the given encodings, generating the final one
|
||||
"""
|
||||
pass
|
||||
|
||||
Args:
|
||||
sep: Tuple[str, int]:
|
||||
A tuple with the string representation of the SEP token, and its id
|
||||
class ByteLevel(PostProcessor):
|
||||
"""
|
||||
This post-processor takes care of trimming the offsets.
|
||||
By default, the ByteLevel BPE might include whitespaces in the produced tokens. If you don't
|
||||
want the offsets to include these whitespaces, then this PostProcessor must be used.
|
||||
|
||||
cls: Tuple[str, int]:
|
||||
A tuple with the string representation of the CLS token, and its id
|
||||
Args:
|
||||
trim_offsets: bool:
|
||||
Whether to trim the whitespaces from the produced offsets.
|
||||
"""
|
||||
|
||||
Returns:
|
||||
PostProcessor
|
||||
def __init__(self, trim_offsets=True):
|
||||
pass
|
||||
def num_special_tokens_to_add(self, is_pair):
|
||||
"""
|
||||
Return the number of special tokens that would be added for single/pair sentences.
|
||||
:param is_pair: Boolean indicating if the input would be a single sentence or a pair
|
||||
:return:
|
||||
"""
|
||||
pass
|
||||
def process(self, encoding, pair=None, add_special_tokens=True):
|
||||
"""
|
||||
Post-process the given encodings, generating the final one
|
||||
"""
|
||||
pass
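
A hedged sketch of the two post-processors above; the [CLS]/[SEP] ids are illustrative, not taken from this diff:

from tokenizers.processors import BertProcessing, ByteLevel

# Adds the Bert special tokens around encoded sequences.
bert_post = BertProcessing(("[SEP]", 102), ("[CLS]", 101))
# Trims whitespace out of the offsets produced by a byte-level BPE.
trim_offsets = ByteLevel(trim_offsets=True)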
|
||||
|
||||
class RobertaProcessing(PostProcessor):
|
||||
"""RobertaProcessing
|
||||
|
||||
"""
|
||||
This post-processor takes care of adding the special tokens needed by
|
||||
a Roberta model:
|
||||
- a SEP token
|
||||
@@ -57,59 +89,41 @@ class RobertaProcessing(PostProcessor):
|
||||
By default, the ByteLevel BPE might include whitespaces in the produced tokens. If you don't
|
||||
want the offsets to include these whitespaces, then this PostProcessor should be initialized
|
||||
with `trim_offsets=True`
|
||||
Args:
|
||||
sep: Tuple[str, int]:
|
||||
A tuple with the string representation of the SEP token, and its id
|
||||
|
||||
cls: Tuple[str, int]:
|
||||
A tuple with the string representation of the CLS token, and its id
|
||||
|
||||
trim_offsets: bool:
|
||||
Whether to trim the whitespaces from the produced offsets.
|
||||
|
||||
add_prefix_space: bool:
|
||||
Whether the add_prefix_space option was enabled during pre-tokenization. This
|
||||
is relevant because it defines the way the offsets are trimmed out.
|
||||
|
||||
Returns:
|
||||
PostProcessor
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
sep: Tuple[str, int],
|
||||
cls: Tuple[str, int],
|
||||
trim_offsets: bool = True,
|
||||
add_prefix_space: bool = True,
|
||||
) -> None:
|
||||
"""Instantiate a new RobertaProcessing with the given tokens
|
||||
|
||||
Args:
|
||||
sep: Tuple[str, int]:
|
||||
A tuple with the string representation of the SEP token, and its id
|
||||
|
||||
cls: Tuple[str, int]:
|
||||
A tuple with the string representation of the CLS token, and its id
|
||||
|
||||
trim_offsets: bool:
|
||||
Whether to trim the whitespaces from the produced offsets.
|
||||
|
||||
add_prefix_space: bool:
|
||||
Whether the add_prefix_space option was enabled during pre-tokenization. This
|
||||
is relevant because it defines the way the offsets are trimmed out.
|
||||
|
||||
Returns:
|
||||
PostProcessor
|
||||
def __init__(self, sep, cls, trim_offsets=True, add_prefix_space=True):
|
||||
pass
|
||||
def num_special_tokens_to_add(self, is_pair):
|
||||
"""
|
||||
Return the number of special tokens that would be added for single/pair sentences.
|
||||
:param is_pair: Boolean indicating if the input would be a single sentence or a pair
|
||||
:return:
|
||||
"""
|
||||
pass
|
||||
|
||||
class ByteLevel(PostProcessor):
|
||||
"""ByteLevel Post processing
|
||||
|
||||
This post-processor takes care of trimming the offsets.
|
||||
By default, the ByteLevel BPE might include whitespaces in the produced tokens. If you don't
|
||||
want the offsets to include these whitespaces, then this PostProcessor must be used.
|
||||
"""
|
||||
|
||||
def __init__(self, trim_offsets: bool = True) -> None:
|
||||
"""Instantiate a new ByteLevel
|
||||
|
||||
Args:
|
||||
trim_offsets: bool:
|
||||
Whether to trim the whitespaces from the produced offsets.
|
||||
def process(self, encoding, pair=None, add_special_tokens=True):
|
||||
"""
|
||||
Post-process the given encodings, generating the final one
|
||||
"""
|
||||
pass
|
||||
|
||||
Template = Union[str, List[str]]
|
||||
Tokens = List[Union[Tuple[int, str], Tuple[str, int], dict]]
|
||||
|
||||
class TemplateProcessing(PostProcessor):
|
||||
"""TemplateProcessing
|
||||
|
||||
"""
|
||||
Provides a way to specify templates in order to add the special tokens to each
|
||||
input sequence as relevant.
|
||||
|
||||
@@ -147,32 +161,42 @@ class TemplateProcessing(PostProcessor):
|
||||
will be added to the Encoding without any further check. If the given ids correspond
|
||||
to something totally different in a `Tokenizer` using this `PostProcessor`, it
|
||||
might lead to unexpected results.
|
||||
|
||||
Args:
|
||||
single: Template
|
||||
The template used for single sequences
|
||||
|
||||
pair: Template:
|
||||
The template used when both sequences are specified
|
||||
|
||||
special_tokens: Tokens:
|
||||
The list of special tokens used in each sequences
|
||||
|
||||
Template: Union[str, List[str]]:
|
||||
- If a `str` is provided, the whitespace is used as delimiter between tokens
|
||||
- If a `List[str]` is provided, a list of tokens
|
||||
|
||||
Tokens: List[Union[Tuple[int, str], Tuple[str, int], dict]]:
|
||||
- A Tuple with both a token and its associated ID, in any order
|
||||
- A dict with the following keys:
|
||||
- "id": str => The special token id, as specified in the Template
|
||||
- "ids": List[int] => The associated IDs
|
||||
- "tokens": List[str] => The associated tokens
|
||||
The given dict expects the provided `ids` and `tokens` lists to have
|
||||
the same length.
|
||||
"""
|
||||
|
||||
def __init__(self, single: Template, pair: Template, special_tokens: Tokens) -> None:
|
||||
"""Instantiate a new TemplateProcessing
|
||||
|
||||
Args:
|
||||
single: Template
|
||||
The template used for single sequences
|
||||
|
||||
pair: Template:
|
||||
The template used when both sequences are specified
|
||||
|
||||
special_tokens: Tokens:
|
||||
The list of special tokens used in each sequences
|
||||
|
||||
Template: Union[str, List[str]]:
|
||||
- If a `str` is provided, the whitespace is used as delimiter between tokens
|
||||
- If a `List[str]` is provided, a list of tokens
|
||||
|
||||
Tokens: List[Union[Tuple[int, str], Tuple[str, int], dict]]:
|
||||
- A Tuple with both a token and its associated ID, in any order
|
||||
- A dict with the following keys:
|
||||
- "id": str => The special token id, as specified in the Template
|
||||
- "ids": List[int] => The associated IDs
|
||||
- "tokens": List[str] => The associated tokens
|
||||
The given dict expects the provided `ids` and `tokens` lists to have
|
||||
the same length.
|
||||
def __init__(self, single, pair, special_tokens):
|
||||
pass
|
||||
def num_special_tokens_to_add(self, is_pair):
|
||||
"""
|
||||
Return the number of special tokens that would be added for single/pair sentences.
|
||||
:param is_pair: Boolean indicating if the input would be a single sentence or a pair
|
||||
:return:
|
||||
"""
|
||||
pass
|
||||
def process(self, encoding, pair=None, add_special_tokens=True):
|
||||
"""
|
||||
Post-process the given encodings, generating the final one
|
||||
"""
|
||||
pass
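
A hedged sketch of `TemplateProcessing`; the `$A`/`$B` placeholder syntax and the token ids are assumptions for illustration, not stated in this diff:

from tokenizers.processors import TemplateProcessing

post = TemplateProcessing(
    single="[CLS] $A [SEP]",                      # template for single sequences
    pair="[CLS] $A [SEP] $B:1 [SEP]",             # template when a pair is given
    special_tokens=[("[CLS]", 1), ("[SEP]", 2)],  # (token, id) pairs, assumed ids
)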
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
# Generated content DO NOT EDIT
|
||||
from .. import trainers
|
||||
|
||||
Trainer = trainers.Trainer
|
||||
BpeTrainer = trainers.BpeTrainer
|
||||
WordPieceTrainer = trainers.WordPieceTrainer
|
||||
UnigramTrainer = trainers.UnigramTrainer
|
||||
WordPieceTrainer = trainers.WordPieceTrainer
|
||||
|
||||
@@ -1,148 +1,132 @@
|
||||
from .. import AddedToken
|
||||
from typing import Optional, List, Union
|
||||
|
||||
# Generated content DO NOT EDIT
|
||||
class Trainer:
|
||||
"""Base class for all trainers
|
||||
"""
|
||||
Base class for all trainers
|
||||
|
||||
This class is not supposed to be instantiated directly. Instead, any implementation of a
|
||||
Trainer will return an instance of this class when instantiated.
|
||||
|
||||
Args:
|
||||
vocab_size: unsigned int:
|
||||
The size of the final vocabulary, including all tokens and alphabet.
|
||||
|
||||
min_frequency: unsigned int:
|
||||
The minimum frequency a pair should have in order to be merged.
|
||||
|
||||
show_progress: boolean:
|
||||
Whether to show progress bars while training.
|
||||
|
||||
special_tokens: List[Union[str, AddedToken]]:
|
||||
A list of special tokens the model should know of.
|
||||
|
||||
limit_alphabet: unsigned int:
|
||||
The maximum different characters to keep in the alphabet.
|
||||
|
||||
initial_alphabet: List[str]:
|
||||
A list of characters to include in the initial alphabet, even
|
||||
if not seen in the training dataset.
|
||||
If the strings contain more than one character, only the first one
|
||||
is kept.
|
||||
|
||||
continuing_subword_prefix: Optional[str]:
|
||||
A prefix to be used for every subword that is not a beginning-of-word.
|
||||
|
||||
end_of_word_suffix: Optional[str]:
|
||||
A suffix to be used for every subword that is an end-of-word.
|
||||
|
||||
Returns:
|
||||
Trainer
|
||||
"""
|
||||
|
||||
class BpeTrainer(Trainer):
|
||||
"""BpeTrainer
|
||||
def __init__(
|
||||
self,
|
||||
vocab_size=30000,
|
||||
min_frequency=0,
|
||||
show_progress=True,
|
||||
special_tokens=[],
|
||||
limit_alphabet=None,
|
||||
initial_alphabet=[],
|
||||
continuing_subword_prefix=None,
|
||||
end_of_word_suffix=None,
|
||||
):
|
||||
pass
|
||||
|
||||
class BpeTrainer(Trainer):
|
||||
"""
|
||||
Capable of training a BPE model
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
vocab_size: int = 30000,
|
||||
min_frequency: int = 0,
|
||||
show_progress: bool = True,
|
||||
special_tokens: List[Union[str, AddedToken]] = [],
|
||||
limit_alphabet: Optional[int] = None,
|
||||
initial_alphabet: List[str] = [],
|
||||
continuing_subword_prefix: Optional[str] = None,
|
||||
end_of_word_suffix: Optional[str] = None,
|
||||
) -> None:
|
||||
"""Instantiate a new BpeTrainer with the given options:
|
||||
class UnigramTrainer(Trainer):
|
||||
"""
|
||||
Capable of training a Unigram model
|
||||
|
||||
Args:
|
||||
vocab_size: unsigned int:
|
||||
The size of the final vocabulary, including all tokens and alphabet.
|
||||
Args:
|
||||
vocab_size: unsigned int:
|
||||
The size of the final vocabulary, including all tokens and alphabet.
|
||||
|
||||
min_frequency: unsigned int:
|
||||
The minimum frequency a pair should have in order to be merged.
|
||||
show_progress: boolean:
|
||||
Whether to show progress bars while training.
|
||||
|
||||
show_progress: boolean:
|
||||
Whether to show progress bars while training.
|
||||
special_tokens: List[Union[str, AddedToken]]:
|
||||
A list of special tokens the model should know of.
|
||||
|
||||
special_tokens: List[Union[str, AddedToken]]:
|
||||
A list of special tokens the model should know of.
|
||||
initial_alphabet: List[str]:
|
||||
A list of characters to include in the initial alphabet, even
|
||||
if not seen in the training dataset.
|
||||
If the strings contain more than one character, only the first one
|
||||
is kept.
|
||||
|
||||
limit_alphabet: unsigned int:
|
||||
The maximum different characters to keep in the alphabet.
|
||||
Returns:
|
||||
Trainer
|
||||
"""
|
||||
|
||||
initial_alphabet: List[str]:
|
||||
A list of characters to include in the initial alphabet, even
|
||||
if not seen in the training dataset.
|
||||
If the strings contain more than one character, only the first one
|
||||
is kept.
|
||||
|
||||
continuing_subword_prefix: Optional[str]:
|
||||
A prefix to be used for every subword that is not a beginning-of-word.
|
||||
|
||||
end_of_word_suffix: Optional[str]:
|
||||
A suffix to be used for every subword that is an end-of-word.
|
||||
|
||||
Returns:
|
||||
Trainer
|
||||
"""
|
||||
def __init__(self, vocab_size=8000, show_progress=True, special_tokens=[]):
|
||||
pass
|
||||
|
||||
class WordPieceTrainer(Trainer):
|
||||
"""WordPieceTrainer
|
||||
|
||||
"""
|
||||
Capable of training a WordPiece model
|
||||
Args:
|
||||
vocab_size: unsigned int:
|
||||
The size of the final vocabulary, including all tokens and alphabet.
|
||||
|
||||
min_frequency: unsigned int:
|
||||
The minimum frequency a pair should have in order to be merged.
|
||||
|
||||
show_progress: boolean:
|
||||
Whether to show progress bars while training.
|
||||
|
||||
special_tokens: List[Union[str, AddedToken]]:
|
||||
A list of special tokens the model should know of.
|
||||
|
||||
limit_alphabet: unsigned int:
|
||||
The maximum different characters to keep in the alphabet.
|
||||
|
||||
initial_alphabet: List[str]:
|
||||
A list of characters to include in the initial alphabet, even
|
||||
if not seen in the training dataset.
|
||||
If the strings contain more than one character, only the first one
|
||||
is kept.
|
||||
|
||||
continuing_subword_prefix: Optional[str]:
|
||||
A prefix to be used for every subword that is not a beginning-of-word.
|
||||
|
||||
end_of_word_suffix: Optional[str]:
|
||||
A suffix to be used for every subword that is an end-of-word.
|
||||
|
||||
Returns:
|
||||
Trainer
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
vocab_size: int = 30000,
|
||||
min_frequency: int = 0,
|
||||
show_progress: bool = True,
|
||||
special_tokens: List[Union[str, AddedToken]] = [],
|
||||
limit_alphabet: Optional[int] = None,
|
||||
initial_alphabet: List[str] = [],
|
||||
continuing_subword_prefix: Optional[str] = "##",
|
||||
end_of_word_suffix: Optional[str] = None,
|
||||
) -> Trainer:
|
||||
"""Instantiate a new WordPieceTrainer with the given options:
|
||||
|
||||
Args:
|
||||
vocab_size: unsigned int:
|
||||
The size of the final vocabulary, including all tokens and alphabet.
|
||||
|
||||
min_frequency: unsigned int:
|
||||
The minimum frequency a pair should have in order to be merged.
|
||||
|
||||
show_progress: boolean:
|
||||
Whether to show progress bars while training.
|
||||
|
||||
special_tokens: List[Union[str, AddedToken]]:
|
||||
A list of special tokens the model should know of.
|
||||
|
||||
limit_alphabet: unsigned int:
|
||||
The maximum different characters to keep in the alphabet.
|
||||
|
||||
initial_alphabet: List[str]:
|
||||
A list of characters to include in the initial alphabet, even
|
||||
if not seen in the training dataset.
|
||||
If the strings contain more than one character, only the first one
|
||||
is kept.
|
||||
|
||||
continuing_subword_prefix: Optional[str]:
|
||||
A prefix to be used for every subword that is not a beginning-of-word.
|
||||
|
||||
end_of_word_suffix: Optional[str]:
|
||||
A suffix to be used for every subword that is an end-of-word.
|
||||
|
||||
Returns:
|
||||
Trainer
|
||||
"""
|
||||
pass
|
||||
|
||||
class UnigramTrainer(Trainer):
|
||||
"""UnigramTrainer
|
||||
|
||||
Capable of training a Unigram model
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
vocab_size: int = 8000,
|
||||
show_progress: bool = True,
|
||||
special_tokens: List[Union[str, AddedToken]] = [],
|
||||
) -> Trainer:
|
||||
"""Instantiate a new UnigramTrainer with the given options:
|
||||
|
||||
Args:
|
||||
vocab_size: unsigned int:
|
||||
The size of the final vocabulary, including all tokens and alphabet.
|
||||
|
||||
show_progress: boolean:
|
||||
Whether to show progress bars while training.
|
||||
|
||||
special_tokens: List[Union[str, AddedToken]]:
|
||||
A list of special tokens the model should know of.
|
||||
|
||||
initial_alphabet: List[str]:
|
||||
A list of characters to include in the initial alphabet, even
|
||||
if not seen in the training dataset.
|
||||
If the strings contain more than one character, only the first one
|
||||
is kept.
|
||||
|
||||
Returns:
|
||||
Trainer
|
||||
"""
|
||||
vocab_size=30000,
|
||||
min_frequency=0,
|
||||
show_progress=True,
|
||||
special_tokens=[],
|
||||
limit_alphabet=None,
|
||||
initial_alphabet=[],
|
||||
continuing_subword_prefix="##",
|
||||
end_of_word_suffix=None,
|
||||
):
|
||||
pass
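
Finally, a hedged sketch of building a `BpeTrainer` with the options documented above; the special tokens and values are illustrative, and the trainer is then passed to a `Tokenizer`'s training call:

from tokenizers.trainers import BpeTrainer

trainer = BpeTrainer(
    vocab_size=30000,
    min_frequency=2,
    show_progress=True,
    special_tokens=["[UNK]", "[PAD]", "[CLS]", "[SEP]"],
)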
|
||||
|
||||