Mirror of https://github.com/mii443/tokenizers.git, synced 2025-12-05 04:08:22 +00:00
Move Python source to a subdirectory.
This allows testing versions that are not built in place; otherwise, importing (or testing) from the package root fails without a develop build. Replace maturin with setuptools_rust, since maturin fails with the proper project structure.
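For context on the build-system switch, here is a minimal sketch of what a setuptools_rust-based setup.py for a PyO3 extension living under py_src can look like. The file name, paths, and metadata below are assumptions for illustration, not the commit's actual configuration.

# Hypothetical minimal setup.py (illustrative; the commit's real configuration may differ).
from setuptools import setup
from setuptools_rust import Binding, RustExtension

setup(
    name="tokenizers",
    version="0.8.1",
    package_dir={"": "py_src"},  # Python sources now live under py_src/
    packages=["tokenizers"],
    rust_extensions=[
        # Compile the Rust crate as the native module `tokenizers.tokenizers`
        RustExtension("tokenizers.tokenizers", binding=Binding.PyO3, debug=False),
    ],
    zip_safe=False,  # native extensions cannot be imported from a zip
)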
bindings/python/py_src/tokenizers/__init__.py (new file, 29 lines)
@@ -0,0 +1,29 @@
__version__ = "0.8.1"

from typing import Tuple, Union, List

Offsets = Tuple[int, int]

TextInputSequence = str
PreTokenizedInputSequence = Union[List[str], Tuple[str]]
TextEncodeInput = Union[TextInputSequence, Tuple[TextInputSequence, TextInputSequence]]
PreTokenizedEncodeInput = Union[
    PreTokenizedInputSequence, Tuple[PreTokenizedInputSequence, PreTokenizedInputSequence]
]

InputSequence = Union[TextInputSequence, PreTokenizedInputSequence]
EncodeInput = Union[TextEncodeInput, PreTokenizedEncodeInput]

from .tokenizers import Tokenizer, Encoding, AddedToken
from .tokenizers import decoders
from .tokenizers import models
from .tokenizers import normalizers
from .tokenizers import pre_tokenizers
from .tokenizers import processors
from .tokenizers import trainers
from .implementations import (
    ByteLevelBPETokenizer,
    CharBPETokenizer,
    SentencePieceBPETokenizer,
    BertWordPieceTokenizer,
)
bindings/python/py_src/tokenizers/__init__.pyi (new file, 636 lines)
@@ -0,0 +1,636 @@
|
||||
from .decoders import *
|
||||
from .models import *
|
||||
from .normalizers import *
|
||||
from .pre_tokenizers import *
|
||||
from .processors import *
|
||||
from .trainers import *
|
||||
|
||||
from .implementations import (
|
||||
ByteLevelBPETokenizer as ByteLevelBPETokenizer,
|
||||
CharBPETokenizer as CharBPETokenizer,
|
||||
SentencePieceBPETokenizer as SentencePieceBPETokenizer,
|
||||
BertWordPieceTokenizer as BertWordPieceTokenizer,
|
||||
)
|
||||
|
||||
from typing import Optional, Union, List, Tuple
|
||||
|
||||
Offsets = Tuple[int, int]
|
||||
|
||||
TextInputSequence = str
|
||||
PreTokenizedInputSequence = Union[List[str], Tuple[str]]
|
||||
TextEncodeInput = Union[TextInputSequence, Tuple[TextInputSequence, TextInputSequence]]
|
||||
PreTokenizedEncodeInput = Union[
|
||||
PreTokenizedInputSequence, Tuple[PreTokenizedInputSequence, PreTokenizedInputSequence],
|
||||
]
|
||||
|
||||
InputSequence = Union[TextInputSequence, PreTokenizedInputSequence]
|
||||
EncodeInput = Union[TextEncodeInput, PreTokenizedEncodeInput]
|
||||
|
||||
class Encoding:
|
||||
""" An Encoding as returned by the Tokenizer """
|
||||
|
||||
@staticmethod
|
||||
def merge(encodings: List[Encoding], growing_offsets: bool = True) -> Encoding:
|
||||
""" Merge the list of Encoding into one final Encoding
|
||||
|
||||
Args:
|
||||
encodings: List[Encoding]:
|
||||
The list of encodings
|
||||
|
||||
growing_offsets: bool:
|
||||
Whether the offsets should accumulate while merging
|
||||
|
||||
Returns:
|
||||
The resulting Encoding
|
||||
"""
|
||||
pass
|
||||
@property
|
||||
def ids(self) -> List[int]:
|
||||
""" The tokenized ids """
|
||||
pass
|
||||
@property
|
||||
def tokens(self) -> List[str]:
|
||||
""" The tokenized strings """
|
||||
pass
|
||||
@property
|
||||
def words(self) -> List[Optional[int]]:
|
||||
""" The tokenized words index """
|
||||
pass
|
||||
@property
|
||||
def type_ids(self) -> List[int]:
|
||||
""" The type ids """
|
||||
pass
|
||||
@property
|
||||
def offsets(self) -> List[Offsets]:
|
||||
""" The offsets.
|
||||
These offsets can be used to index any `IndexableString` directly. If you want to
|
||||
index the original `str`, make sure to retrieve the converted offsets using the `.offsets`
|
||||
method on the `original_str`.
|
||||
"""
|
||||
pass
|
||||
@property
|
||||
def special_tokens_mask(self) -> List[int]:
|
||||
""" The special tokens mask """
|
||||
pass
|
||||
@property
|
||||
def attention_mask(self) -> List[int]:
|
||||
""" The attention mask """
|
||||
pass
|
||||
@property
|
||||
def overflowing(self) -> Optional[Encoding]:
|
||||
""" The overflowing encoding, after truncation """
|
||||
pass
|
||||
def word_to_tokens(self, word_index: int) -> Optional[Tuple[int, int]]:
|
||||
"""
|
||||
Get the encoded tokens corresponding to the word at the given index in the input
|
||||
sequence, with the form [start_token, end_token + 1]
|
||||
|
||||
Args:
|
||||
word_index: int:
|
||||
The index of the word in the input sequence.
|
||||
|
||||
Returns:
|
||||
The range of tokens with the form [start_token, end_token + 1]
|
||||
"""
|
||||
pass
|
||||
def word_to_chars(self, word_index: int) -> Optional[Offsets]:
|
||||
"""
|
||||
Get the offsets of the word at the given index in the input sequence.
|
||||
|
||||
Args:
|
||||
word_index: int:
|
||||
The index of the word in the input sequence.
|
||||
|
||||
Returns:
|
||||
The word offsets
|
||||
"""
|
||||
pass
|
||||
def token_to_chars(self, token_index: int) -> Optional[Offsets]:
|
||||
"""
|
||||
Get the offsets of the token at the given index
|
||||
|
||||
Args:
|
||||
token_index: int:
|
||||
The index of the token in the encoded sequence.
|
||||
|
||||
Returns:
|
||||
The token offsets
|
||||
"""
|
||||
pass
|
||||
def token_to_word(self, token_index: int) -> Optional[int]:
|
||||
"""
|
||||
Get the word that contains the token at the given index
|
||||
|
||||
Args:
|
||||
token_index: int:
|
||||
The index of the token in the encoded sequence.
|
||||
|
||||
Returns:
|
||||
The index of the word in the input sequence.
|
||||
"""
|
||||
pass
|
||||
def char_to_token(self, pos: int) -> Optional[int]:
|
||||
"""
|
||||
Get the token that contains the char at the given position
|
||||
|
||||
Args:
|
||||
pos: int:
|
||||
The position of a char in the input string
|
||||
|
||||
Returns:
|
||||
The index of the token that contains this char
|
||||
"""
|
||||
pass
|
||||
def char_to_word(self, pos: int) -> Optional[int]:
|
||||
"""
|
||||
Get the word that contains the given char.
|
||||
|
||||
Args:
|
||||
pos: int:
|
||||
The position of a char in the input string
|
||||
|
||||
Returns:
|
||||
The index of the word that contains this char
|
||||
"""
|
||||
pass
|
||||
def pad(
|
||||
self,
|
||||
length: int,
|
||||
pad_id: Optional[int] = 0,
|
||||
pad_type_id: Optional[int] = 0,
|
||||
pad_token: Optional[str] = "[PAD]",
|
||||
direction: Optional[str] = "right",
|
||||
):
|
||||
""" Pad the current Encoding at the given length
|
||||
|
||||
Args:
|
||||
length: int:
|
||||
The length at which to pad
|
||||
|
||||
direction: (`optional`) str:
|
||||
Can be one of: `right` or `left`
|
||||
|
||||
pad_id: (`optional`) unsigned int:
|
||||
The index to be used when padding
|
||||
|
||||
pad_type_id: (`optional`) unsigned int:
|
||||
The type index to be used when padding
|
||||
|
||||
pad_token: (`optional`) str:
|
||||
The pad token to be used when padding
|
||||
"""
|
||||
pass
|
||||
def truncate(self, max_length: int, stride: Optional[int] = 0):
|
||||
""" Truncate the current Encoding at the given max_length
|
||||
|
||||
Args:
|
||||
max_length: int:
|
||||
The maximum length to be kept
|
||||
|
||||
stride: (`optional`) unsigned int:
|
||||
The length of the previous first sequence to be included
|
||||
in the overflowing sequence
|
||||
"""
|
||||
pass
|
||||
|
||||
class AddedToken:
|
||||
""" AddedToken represents a token to be added to a Tokenizer
|
||||
|
||||
An AddedToken can have special options defining the way it should behave.
|
||||
"""
|
||||
|
||||
def __new__(
|
||||
cls,
|
||||
content: str = "",
|
||||
single_word: bool = False,
|
||||
lstrip: bool = False,
|
||||
rstrip: bool = False,
|
||||
normalized: bool = True,
|
||||
) -> AddedToken:
|
||||
""" Instantiate a new AddedToken
|
||||
|
||||
Args:
|
||||
content: str:
|
||||
The content of the token
|
||||
|
||||
single_word: bool
|
||||
Whether this token should only match against single words. If True,
|
||||
this token will never match inside of a word. For example the token `ing` would
|
||||
match on `tokenizing` if this option is False, but not if it is True.
|
||||
|
||||
lstrip: bool
|
||||
Whether this token should strip all potential whitespaces on the left side.
|
||||
If True, this token will greedily match any whitespace on the left. For example,
|
||||
if we try to match the token `[MASK]` with lstrip=True, in the text `I saw a [MASK]`
|
||||
we will match on ` [MASK]`.
|
||||
|
||||
rstrip: bool
|
||||
Whether this token should strip all potential whitespaces on the right side.
|
||||
If True, this token will greedily match any whitespace on the right. It works just
|
||||
like lstrip, but on the right.
|
||||
|
||||
normalized: bool:
|
||||
Whether this token should match against the normalized version of the input text. For
|
||||
example, with the added token `yesterday` and a normalizer in charge of lowercasing
|
||||
the text, the token could be extracted from the input `I saw a lion Yesterday`.
|
||||
"""
|
||||
pass
|
||||
|
||||
class Tokenizer:
|
||||
""" Tokenizer
|
||||
|
||||
A Tokenizer works as a pipeline, it processes some raw text as input and outputs
|
||||
an `Encoding`.
|
||||
|
||||
The various steps of the pipeline are:
|
||||
1. The `Normalizer`: in charge of normalizing the text. Common examples of
|
||||
normalization are the unicode normalization standards, such as NFD or NFKC.
|
||||
2. The `PreTokenizer`: in charge of creating initial words splits in the text.
|
||||
The most common way of splitting text is simply on whitespace.
|
||||
3. The `Model`: in charge of doing the actual tokenization. An example of a
|
||||
`Model` would be `BPE` or `WordPiece`.
|
||||
4. The `PostProcessor`: in charge of post-processing the `Encoding` to add anything
|
||||
relevant that, for example, a language model would need, such as special tokens.
|
||||
"""
|
||||
|
||||
def __new__(cls, model: models.Model) -> Tokenizer:
|
||||
""" Instantiate a new Tokenizer using the given Model
|
||||
|
||||
Args:
|
||||
model: models.Model:
|
||||
The model to be used with this Tokenizer
|
||||
|
||||
Returns:
|
||||
Tokenizer
|
||||
"""
|
||||
pass
|
||||
@staticmethod
|
||||
def from_str(s: str) -> Tokenizer:
|
||||
""" Instantiate a new Tokenizer from the given JSON string
|
||||
|
||||
Args:
|
||||
s: str:
|
||||
A JSON string representation of the Tokenizer
|
||||
|
||||
Returns:
|
||||
Tokenizer
|
||||
"""
|
||||
pass
|
||||
@staticmethod
|
||||
def from_file(path: str) -> Tokenizer:
|
||||
""" Instantiate a new Tokenizer from the given file
|
||||
|
||||
Args:
|
||||
path: str:
|
||||
Path to a file containing a Tokenizer
|
||||
|
||||
Returns:
|
||||
Tokenizer
|
||||
"""
|
||||
pass
|
||||
@staticmethod
|
||||
def from_buffer(buffer: bytes) -> Tokenizer:
|
||||
""" Instantiate a new Tokenizer from the given buffer
|
||||
|
||||
Args:
|
||||
buffer: bytes:
|
||||
A buffer used to instantiate a new Tokenizer
|
||||
|
||||
Returns:
|
||||
Tokenizer
|
||||
"""
|
||||
pass
|
||||
def to_str(self, pretty: bool = False) -> str:
|
||||
""" Get a serialized JSON version of the Tokenizer as a str
|
||||
|
||||
Args:
|
||||
pretty: bool:
|
||||
Whether the JSON string should be prettified
|
||||
|
||||
Returns:
|
||||
str
|
||||
"""
|
||||
pass
|
||||
def save(self, path: str, pretty: bool = False):
|
||||
""" Save the Tokenizer as JSON to the given path
|
||||
|
||||
Args:
|
||||
pretty: bool:
|
||||
Whether the JSON string should be prettified
|
||||
"""
|
||||
pass
|
||||
@property
|
||||
def model(self) -> Model:
|
||||
""" Get the model in use with this Tokenizer """
|
||||
pass
|
||||
@model.setter
|
||||
def model(self, model: models.Model):
|
||||
""" Change the model to use with this Tokenizer """
|
||||
pass
|
||||
@property
|
||||
def pre_tokenizer(self) -> Optional[PreTokenizer]:
|
||||
""" Get the pre-tokenizer in use with this model """
|
||||
pass
|
||||
@pre_tokenizer.setter
|
||||
def pre_tokenizer(self, pre_tokenizer: pre_tokenizers.PreTokenizer):
|
||||
""" Change the pre tokenizer to use with this Tokenizer """
|
||||
pass
|
||||
@property
|
||||
def decoder(self) -> Optional[Decoder]:
|
||||
""" Get the decoder in use with this model """
|
||||
pass
|
||||
@decoder.setter
|
||||
def decoder(self, decoder: decoders.Decoder):
|
||||
""" Change the decoder to use with this Tokenizer """
|
||||
pass
|
||||
@property
|
||||
def post_processor(self) -> Optional[PostProcessor]:
|
||||
""" Get the post-processor in use with this Tokenizer """
|
||||
pass
|
||||
@post_processor.setter
|
||||
def post_processor(self, processor: processors.PostProcessor):
|
||||
""" Change the post processor to use with this Tokenizer """
|
||||
@property
|
||||
def normalizer(self) -> Optional[Normalizer]:
|
||||
""" Get the normalizer in use with this Tokenizer """
|
||||
pass
|
||||
@normalizer.setter
|
||||
def normalizer(self, normalizer: normalizers.Normalizer):
|
||||
""" Change the normalizer to use with this Tokenizer """
|
||||
def num_special_tokens_to_add(self, is_pair: bool) -> int:
|
||||
"""
|
||||
Return the number of special tokens that would be added for single/pair sentences.
|
||||
:param is_pair: Boolean indicating whether the input is a single sentence or a pair
|
||||
:return:
|
||||
"""
|
||||
pass
|
||||
def get_vocab(self, with_added_tokens: bool = True) -> Dict[str, int]:
|
||||
""" Returns the vocabulary
|
||||
|
||||
Args:
|
||||
with_added_tokens: boolean:
|
||||
Whether to include the added tokens in the vocabulary
|
||||
|
||||
Returns:
|
||||
The vocabulary
|
||||
"""
|
||||
pass
|
||||
def get_vocab_size(self, with_added_tokens: bool = True) -> int:
|
||||
""" Returns the size of the vocabulary
|
||||
|
||||
Args:
|
||||
with_added_tokens: boolean:
|
||||
Whether to include the added tokens in the vocabulary's size
|
||||
|
||||
Returns:
|
||||
The size of the vocabulary
|
||||
"""
|
||||
pass
|
||||
def enable_truncation(self, max_length: int, stride: Optional[int], strategy: Optional[str]):
|
||||
""" Enable the truncation
|
||||
|
||||
Args:
|
||||
max_length: unsigned int:
|
||||
The maximum length at which to truncate
|
||||
|
||||
stride: (`optional`) unsigned int:
|
||||
The length of the previous first sequence to be included
|
||||
in the overflowing sequence
|
||||
|
||||
strategy: (`optional`) str:
|
||||
Can be one of `longest_first`, `only_first` or `only_second`
|
||||
"""
|
||||
pass
|
||||
def no_truncation(self):
|
||||
""" Disable truncation """
|
||||
pass
|
||||
@property
|
||||
def truncation(self) -> Optional[dict]:
|
||||
""" Get the current truncation parameters
|
||||
|
||||
Returns:
|
||||
None if truncation is disabled, a dict with the current truncation parameters if
|
||||
truncation is enabled
|
||||
"""
|
||||
pass
|
||||
def enable_padding(
|
||||
self,
|
||||
direction: Optional[str] = "right",
|
||||
pad_to_multiple_of: Optional[int] = None,
|
||||
pad_id: Optional[int] = 0,
|
||||
pad_type_id: Optional[int] = 0,
|
||||
pad_token: Optional[str] = "[PAD]",
|
||||
length: Optional[int] = None,
|
||||
):
|
||||
""" Enable the padding
|
||||
|
||||
Args:
|
||||
direction: (`optional`) str:
|
||||
Can be one of: `right` or `left`
|
||||
|
||||
pad_to_multiple_of: (`optional`) unsigned int:
|
||||
If specified, the padding length should always snap to the next multiple of
|
||||
the given value. For example if we were going to pad with a length of 250 but
|
||||
`pad_to_multiple_of=8` then we will pad to 256.
|
||||
|
||||
pad_id: (`optional`) unsigned int:
|
||||
The index to be used when padding
|
||||
|
||||
pad_type_id: (`optional`) unsigned int:
|
||||
The type index to be used when padding
|
||||
|
||||
pad_token: (`optional`) str:
|
||||
The pad token to be used when padding
|
||||
|
||||
length: (`optional`) unsigned int:
|
||||
If specified, the length at which to pad. If not specified
|
||||
we pad using the size of the longest sequence in a batch
|
||||
"""
|
||||
pass
|
||||
def no_padding(self):
|
||||
""" Disable padding """
|
||||
pass
|
||||
@property
|
||||
def padding(self) -> Optional[dict]:
|
||||
""" Get the current padding parameters
|
||||
|
||||
Returns:
|
||||
None if padding is disabled, a dict with the currently set parameters
|
||||
if the padding is enabled.
|
||||
"""
|
||||
pass
|
||||
def normalize(self, sequence: str) -> str:
|
||||
""" Normalize the given sequence
|
||||
|
||||
Args:
|
||||
sequence: str:
|
||||
The sequence to normalize
|
||||
|
||||
Returns:
|
||||
The normalized string
|
||||
"""
|
||||
pass
|
||||
def encode(
|
||||
self,
|
||||
sequence: InputSequence,
|
||||
pair: Optional[InputSequence],
|
||||
is_pretokenized: bool = False,
|
||||
add_special_tokens: bool = True,
|
||||
) -> Encoding:
|
||||
""" Encode the given sequence and pair. This method can process raw text sequences as well
|
||||
as already pre-tokenized sequences.
|
||||
|
||||
Args:
|
||||
sequence: InputSequence:
|
||||
The sequence we want to encode. This sequence can be either raw text or
|
||||
pre-tokenized, according to the `is_pretokenized` argument:
|
||||
|
||||
- If `is_pretokenized=False`: `InputSequence` is expected to be `str`
|
||||
- If `is_pretokenized=True`: `InputSequence` is expected to be
|
||||
`Union[List[str], Tuple[str]]`
|
||||
|
||||
is_pretokenized: bool:
|
||||
Whether the input is already pre-tokenized
|
||||
|
||||
add_special_tokens: bool:
|
||||
Whether to add the special tokens while encoding.
|
||||
|
||||
Returns:
|
||||
An Encoding
|
||||
"""
|
||||
pass
|
||||
def encode_batch(
|
||||
self,
|
||||
inputs: List[EncodeInput],
|
||||
is_pretokenized: bool = False,
|
||||
add_special_tokens: bool = True,
|
||||
) -> List[Encoding]:
|
||||
""" Encode the given inputs. This method accept both raw text sequences as well as already
|
||||
pre-tokenized sequences.
|
||||
|
||||
Args:
|
||||
inputs: List[EncodeInput]:
|
||||
A list of single sequences or pair sequences to encode. Each `EncodeInput` is
|
||||
expected to be of the following form:
|
||||
`Union[InputSequence, Tuple[InputSequence, InputSequence]]`
|
||||
|
||||
Each `InputSequence` can either be raw text or pre-tokenized,
|
||||
according to the `is_pretokenized` argument:
|
||||
|
||||
- If `is_pretokenized=False`: `InputSequence` is expected to be `str`
|
||||
- If `is_pretokenized=True`: `InputSequence` is expected to be
|
||||
`Union[List[str], Tuple[str]]`
|
||||
|
||||
is_pretokenized: bool:
|
||||
Whether the input is already pre-tokenized.
|
||||
|
||||
add_special_tokens: bool:
|
||||
Whether to add the special tokens while encoding.
|
||||
|
||||
Returns:
|
||||
A list of Encoding
|
||||
"""
|
||||
pass
|
||||
def decode(self, ids: List[int], skip_special_tokens: Optional[bool] = True) -> str:
|
||||
""" Decode the given list of ids to a string sequence
|
||||
|
||||
Args:
|
||||
ids: List[unsigned int]:
|
||||
A list of ids to be decoded
|
||||
|
||||
skip_special_tokens: (`optional`) boolean:
|
||||
Whether to remove all the special tokens from the output string
|
||||
|
||||
Returns:
|
||||
The decoded string
|
||||
"""
|
||||
pass
|
||||
def decode_batch(
|
||||
self, sequences: List[List[int]], skip_special_tokens: Optional[bool] = True
|
||||
) -> str:
|
||||
""" Decode the list of sequences to a list of string sequences
|
||||
|
||||
Args:
|
||||
sequences: List[List[unsigned int]]:
|
||||
A list of sequence of ids to be decoded
|
||||
|
||||
skip_special_tokens: (`optional`) boolean:
|
||||
Whether to remove all the special tokens from the output strings
|
||||
|
||||
Returns:
|
||||
A list of decoded strings
|
||||
"""
|
||||
pass
|
||||
def token_to_id(self, token: str) -> Optional[int]:
|
||||
""" Convert the given token to its corresponding id
|
||||
|
||||
Args:
|
||||
token: str:
|
||||
The token to convert
|
||||
|
||||
Returns:
|
||||
The corresponding id if it exists, None otherwise
|
||||
"""
|
||||
pass
|
||||
def id_to_token(self, id: int) -> Optional[str]:
|
||||
""" Convert the given token id to its corresponding string
|
||||
|
||||
Args:
|
||||
id: int:
|
||||
The token id to convert
|
||||
|
||||
Returns:
|
||||
The corresponding string if it exists, None otherwise
|
||||
"""
|
||||
pass
|
||||
def add_tokens(self, tokens: List[Union[str, AddedToken]]) -> int:
|
||||
""" Add the given tokens to the vocabulary
|
||||
|
||||
Args:
|
||||
tokens: List[Union[str, AddedToken]]:
|
||||
A list of tokens to add to the vocabulary. Each token can either be
|
||||
a string, or an instance of AddedToken
|
||||
|
||||
Returns:
|
||||
The number of tokens that were added to the vocabulary
|
||||
"""
|
||||
pass
|
||||
def add_special_tokens(self, tokens: List[Union[str, AddedToken]]) -> int:
|
||||
""" Add the given special tokens to the vocabulary, and treat them as special tokens.
|
||||
|
||||
The special tokens will never be processed by the model, and will be
|
||||
removed while decoding.
|
||||
|
||||
Args:
|
||||
tokens: List[Union[str, AddedToken]]:
|
||||
The list of special tokens to add. Each token can either be a string
|
||||
or an instance of AddedToken
|
||||
|
||||
Returns:
|
||||
The number of tokens that were added to the vocabulary
|
||||
"""
|
||||
pass
|
||||
def post_process(
|
||||
self, encoding: Encoding, pair: Optional[Encoding] = None, add_special_tokens: bool = True,
|
||||
) -> Encoding:
|
||||
""" Apply all the post-processing steps to the given encodings.
|
||||
|
||||
The various steps are:
|
||||
1. Truncate according to global params (provided to `enable_truncation`)
|
||||
2. Apply the PostProcessor
|
||||
3. Pad according to global params. (provided to `enable_padding`)
|
||||
|
||||
Args:
|
||||
encoding: Encoding:
|
||||
The main Encoding to post process
|
||||
|
||||
pair: Optional[Encoding]:
|
||||
An optional pair Encoding
|
||||
|
||||
add_special_tokens: bool:
|
||||
Whether to add special tokens
|
||||
|
||||
Returns:
|
||||
The resulting Encoding
|
||||
"""
|
||||
pass
|
||||
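As a usage sketch of the API stubbed above (a Tokenizer built around a Model, encode/decode, and training via a trainer, as the implementation files further below do): the training file, vocabulary size, and special tokens here are placeholders.

# Illustrative sketch; "data.txt" and the trainer settings are made up.
from tokenizers import Tokenizer, pre_tokenizers, trainers
from tokenizers.models import BPE

tokenizer = Tokenizer(BPE())  # a Tokenizer is always built around a Model
tokenizer.pre_tokenizer = pre_tokenizers.WhitespaceSplit()

trainer = trainers.BpeTrainer(vocab_size=1000, special_tokens=["<unk>"])
tokenizer.train(trainer, ["data.txt"])  # assumes a local plain-text training file

encoding = tokenizer.encode("Hello, tokenizers!")
print(encoding.tokens, encoding.ids)  # fields documented on Encoding above
print(tokenizer.decode(encoding.ids))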
bindings/python/py_src/tokenizers/decoders/__init__.py (new file, 7 lines)
@@ -0,0 +1,7 @@
from .. import decoders

Decoder = decoders.Decoder
ByteLevel = decoders.ByteLevel
WordPiece = decoders.WordPiece
Metaspace = decoders.Metaspace
BPEDecoder = decoders.BPEDecoder
bindings/python/py_src/tokenizers/decoders/__init__.pyi (new file, 65 lines)
@@ -0,0 +1,65 @@
|
||||
from typing import List
|
||||
|
||||
class Decoder:
|
||||
""" Base class for all decoders
|
||||
|
||||
This class is not supposed to be instantiated directly. Instead, any implementation of
|
||||
a Decoder will return an instance of this class when instantiated.
|
||||
"""
|
||||
|
||||
def decode(self, tokens: List[str]) -> str:
|
||||
""" Decode the given list of string to a final string """
|
||||
pass
|
||||
|
||||
class ByteLevel(Decoder):
|
||||
""" ByteLevel Decoder """
|
||||
|
||||
def __init__(self) -> None:
|
||||
""" Instantiate a new ByteLevel Decoder """
|
||||
pass
|
||||
|
||||
class WordPiece(Decoder):
|
||||
""" WordPiece Decoder """
|
||||
|
||||
def __init__(self, prefix: str = "##", cleanup: bool = True) -> None:
|
||||
""" Instantiate a new WordPiece Decoder
|
||||
|
||||
Args:
|
||||
prefix: str:
|
||||
The prefix to use for subwords that are not a beginning-of-word
|
||||
cleanup: bool:
|
||||
Whether to clean up some tokenization artifacts, mainly spaces before punctuation
|
||||
and some abbreviated English forms.
|
||||
"""
|
||||
pass
|
||||
|
||||
class Metaspace(Decoder):
|
||||
""" Metaspace decoder """
|
||||
|
||||
def __init__(self, replacement: str = "▁", add_prefix_space: bool = True) -> None:
|
||||
""" Instantiate a new Metaspace
|
||||
|
||||
Args:
|
||||
replacement: str:
|
||||
The replacement character. Must be exactly one character. By default we
|
||||
use the `▁` (U+2581) meta symbol (Same as in SentencePiece).
|
||||
|
||||
add_prefix_space: boolean:
|
||||
Whether to add a space to the first word if there isn't already one. This
|
||||
lets us treat `hello` exactly like `say hello`.
|
||||
"""
|
||||
pass
|
||||
|
||||
class BPEDecoder(Decoder):
|
||||
""" BPEDecoder """
|
||||
|
||||
def __init__(self, suffix: str = "</w>") -> None:
|
||||
""" Instantiate a new BPEDecoder
|
||||
|
||||
Args:
|
||||
suffix: str:
|
||||
The suffix that was used to characterize an end-of-word. This suffix will
|
||||
be replaced by whitespaces during the decoding
|
||||
"""
|
||||
pass
|
||||
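A short sketch of how a decoder from this stub is used in practice; the token list is invented for illustration.

from tokenizers import decoders

# WordPiece output where "##" marks a continuation of the previous word.
pieces = ["token", "##izers", "are", "fast", "."]

decoder = decoders.WordPiece(prefix="##", cleanup=True)
print(decoder.decode(pieces))  # "tokenizers are fast." (cleanup drops the space before ".")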
@@ -0,0 +1,5 @@
from .base_tokenizer import BaseTokenizer
from .byte_level_bpe import ByteLevelBPETokenizer
from .char_level_bpe import CharBPETokenizer
from .sentencepiece_bpe import SentencePieceBPETokenizer
from .bert_wordpiece import BertWordPieceTokenizer
@@ -0,0 +1,369 @@
|
||||
from tokenizers import Tokenizer, Encoding, AddedToken, InputSequence, EncodeInput
|
||||
from tokenizers.models import TokenizedSequence, TokenizedSequenceWithOffsets
|
||||
|
||||
from typing import List, Union, Tuple, Optional, Dict
|
||||
|
||||
Offsets = Tuple[int, int]
|
||||
|
||||
|
||||
class BaseTokenizer:
|
||||
def __init__(self, tokenizer: Tokenizer, parameters=None):
|
||||
self._tokenizer = tokenizer
|
||||
self._parameters = parameters if parameters is not None else {}
|
||||
|
||||
def __repr__(self):
|
||||
return "Tokenizer(vocabulary_size={}, {})".format(
|
||||
self._tokenizer.get_vocab_size(),
|
||||
", ".join(k + "=" + str(v) for k, v in self._parameters.items()),
|
||||
)
|
||||
|
||||
def num_special_tokens_to_add(self, is_pair: bool) -> int:
|
||||
"""
|
||||
Return the number of special tokens that would be added for single/pair sentences.
|
||||
:param is_pair: Boolean indicating whether the input is a single sentence or a pair
|
||||
:return:
|
||||
"""
|
||||
return self._tokenizer.num_special_tokens_to_add(is_pair)
|
||||
|
||||
def get_vocab(self, with_added_tokens: bool = True) -> Dict[str, int]:
|
||||
""" Returns the vocabulary
|
||||
|
||||
Args:
|
||||
with_added_tokens: boolean:
|
||||
Whether to include the added tokens in the vocabulary
|
||||
|
||||
Returns:
|
||||
The vocabulary
|
||||
"""
|
||||
return self._tokenizer.get_vocab(with_added_tokens=with_added_tokens)
|
||||
|
||||
def get_vocab_size(self, with_added_tokens: bool = True) -> int:
|
||||
""" Return the size of vocabulary, with or without added tokens.
|
||||
|
||||
Args:
|
||||
with_added_tokens: (`optional`) bool:
|
||||
Whether to count in added special tokens or not
|
||||
|
||||
Returns:
|
||||
Size of vocabulary
|
||||
"""
|
||||
return self._tokenizer.get_vocab_size(with_added_tokens=with_added_tokens)
|
||||
|
||||
def enable_padding(
|
||||
self,
|
||||
direction: Optional[str] = "right",
|
||||
pad_to_multiple_of: Optional[int] = None,
|
||||
pad_id: Optional[int] = 0,
|
||||
pad_type_id: Optional[int] = 0,
|
||||
pad_token: Optional[str] = "[PAD]",
|
||||
length: Optional[int] = None,
|
||||
):
|
||||
""" Change the padding strategy
|
||||
|
||||
Args:
|
||||
direction: (`optional`) str:
|
||||
Can be one of: `right` or `left`
|
||||
|
||||
pad_to_multiple_of: (`optional`) unsigned int:
|
||||
If specified, the padding length should always snap to the next multiple of
|
||||
the given value. For example if we were going to pad with a length of 250 but
|
||||
`pad_to_multiple_of=8` then we will pad to 256.
|
||||
|
||||
pad_id: (`optional`) unsigned int:
|
||||
The index to be used when padding
|
||||
|
||||
pad_type_id: (`optional`) unsigned int:
|
||||
The type index to be used when padding
|
||||
|
||||
pad_token: (`optional`) str:
|
||||
The pad token to be used when padding
|
||||
|
||||
length: (`optional`) unsigned int:
|
||||
If specified, the length at which to pad. If not specified
|
||||
we pad using the size of the longest sequence in a batch
|
||||
"""
|
||||
return self._tokenizer.enable_padding(
|
||||
direction=direction,
|
||||
pad_to_multiple_of=pad_to_multiple_of,
|
||||
pad_id=pad_id,
|
||||
pad_type_id=pad_type_id,
|
||||
pad_token=pad_token,
|
||||
length=length,
|
||||
)
|
||||
|
||||
def no_padding(self):
|
||||
""" Disable padding """
|
||||
return self._tokenizer.no_padding()
|
||||
|
||||
@property
|
||||
def padding(self) -> Optional[dict]:
|
||||
""" Get the current padding parameters
|
||||
|
||||
Returns:
|
||||
None if padding is disabled, a dict with the currently set parameters
|
||||
if the padding is enabled.
|
||||
"""
|
||||
return self._tokenizer.padding
|
||||
|
||||
def enable_truncation(
|
||||
self, max_length: int, stride: Optional[int] = 0, strategy: Optional[str] = "longest_first"
|
||||
):
|
||||
""" Change the truncation options
|
||||
|
||||
Args:
|
||||
max_length: unsigned int:
|
||||
The maximum length at which to truncate
|
||||
|
||||
stride: (`optional`) unsigned int:
|
||||
The length of the previous first sequence to be included
|
||||
in the overflowing sequence
|
||||
|
||||
strategy: (`optional`) str:
|
||||
Can be one of `longest_first`, `only_first` or `only_second`
|
||||
"""
|
||||
return self._tokenizer.enable_truncation(max_length, stride=stride, strategy=strategy)
|
||||
|
||||
def no_truncation(self):
|
||||
""" Disable truncation """
|
||||
return self._tokenizer.no_truncation()
|
||||
|
||||
@property
|
||||
def truncation(self) -> Optional[dict]:
|
||||
""" Get the current truncation parameters
|
||||
|
||||
Returns:
|
||||
None if truncation is disabled, a dict with the current truncation parameters if
|
||||
truncation is enabled
|
||||
"""
|
||||
return self._tokenizer.truncation
|
||||
|
||||
def add_tokens(self, tokens: List[Union[str, AddedToken]]) -> int:
|
||||
""" Add the given tokens to the vocabulary
|
||||
|
||||
Args:
|
||||
tokens: List[Union[str, AddedToken]]:
|
||||
A list of tokens to add to the vocabulary. Each token can either be
|
||||
a string, or an instance of AddedToken
|
||||
|
||||
Returns:
|
||||
The number of tokens that were added to the vocabulary
|
||||
"""
|
||||
return self._tokenizer.add_tokens(tokens)
|
||||
|
||||
def add_special_tokens(self, special_tokens: List[Union[str, AddedToken]]) -> int:
|
||||
""" Add the given special tokens to the vocabulary, and treat them as special tokens.
|
||||
|
||||
The special tokens will never be processed by the model, and will be
|
||||
removed while decoding.
|
||||
|
||||
Args:
|
||||
special_tokens: List[Union[str, AddedToken]]:
|
||||
A list of special tokens to add to the vocabulary. Each token can either be
|
||||
a string, or an instance of AddedToken
|
||||
|
||||
Returns:
|
||||
The number of tokens that were added to the vocabulary
|
||||
"""
|
||||
return self._tokenizer.add_special_tokens(special_tokens)
|
||||
|
||||
def normalize(self, sequence: str) -> str:
|
||||
""" Normalize the given sequence
|
||||
|
||||
Args:
|
||||
sequence: str:
|
||||
The sequence to normalize
|
||||
|
||||
Returns:
|
||||
The normalized string
|
||||
"""
|
||||
return self._tokenizer.normalize(sequence)
|
||||
|
||||
def encode(
|
||||
self,
|
||||
sequence: InputSequence,
|
||||
pair: Optional[InputSequence] = None,
|
||||
is_pretokenized: bool = False,
|
||||
add_special_tokens: bool = True,
|
||||
) -> Encoding:
|
||||
""" Encode the given sequence and pair. This method can process raw text sequences as well
|
||||
as already pre-tokenized sequences.
|
||||
|
||||
Args:
|
||||
sequence: InputSequence:
|
||||
The sequence we want to encode. This sequence can be either raw text or
|
||||
pre-tokenized, according to the `is_pretokenized` argument:
|
||||
|
||||
- If `is_pretokenized=False`: `InputSequence` is expected to be `str`
|
||||
- If `is_pretokenized=True`: `InputSequence` is expected to be
|
||||
`Union[List[str], Tuple[str]]`
|
||||
|
||||
is_pretokenized: bool:
|
||||
Whether the input is already pre-tokenized.
|
||||
|
||||
add_special_tokens: bool:
|
||||
Whether to add the special tokens while encoding.
|
||||
|
||||
Returns:
|
||||
An Encoding
|
||||
"""
|
||||
if sequence is None:
|
||||
raise ValueError("encode: `sequence` can't be `None`")
|
||||
|
||||
return self._tokenizer.encode(sequence, pair, is_pretokenized, add_special_tokens)
|
||||
|
||||
def encode_batch(
|
||||
self,
|
||||
inputs: List[EncodeInput],
|
||||
is_pretokenized: bool = False,
|
||||
add_special_tokens: bool = True,
|
||||
) -> List[Encoding]:
|
||||
""" Encode the given inputs. This method accept both raw text sequences as well as already
|
||||
pre-tokenized sequences.
|
||||
|
||||
Args:
|
||||
inputs: List[EncodeInput]:
|
||||
A list of single sequences or pair sequences to encode. Each `EncodeInput` is
|
||||
expected to be of the following form:
|
||||
`Union[InputSequence, Tuple[InputSequence, InputSequence]]`
|
||||
|
||||
Each `InputSequence` can either be raw text or pre-tokenized,
|
||||
according to the `is_pretokenized` argument:
|
||||
|
||||
- If `is_pretokenized=False`: `InputSequence` is expected to be `str`
|
||||
- If `is_pretokenized=True`: `InputSequence` is expected to be
|
||||
`Union[List[str], Tuple[str]]`
|
||||
|
||||
is_pretokenized: bool:
|
||||
Whether the input is already pre-tokenized.
|
||||
|
||||
add_special_tokens: bool:
|
||||
Whether to add the special tokens while encoding.
|
||||
|
||||
Returns:
|
||||
A list of Encoding
|
||||
"""
|
||||
|
||||
if inputs is None:
|
||||
raise ValueError("encode_batch: `inputs` can't be `None`")
|
||||
|
||||
return self._tokenizer.encode_batch(inputs, is_pretokenized, add_special_tokens)
|
||||
|
||||
def decode(self, ids: List[int], skip_special_tokens: Optional[bool] = True) -> str:
|
||||
""" Decode the given list of ids to a string sequence
|
||||
|
||||
Args:
|
||||
ids: List[unsigned int]:
|
||||
A list of ids to be decoded
|
||||
|
||||
skip_special_tokens: (`optional`) boolean:
|
||||
Whether to remove all the special tokens from the output string
|
||||
|
||||
Returns:
|
||||
The decoded string
|
||||
"""
|
||||
if ids is None:
|
||||
raise ValueError("None input is not valid. Should be a list of integers.")
|
||||
|
||||
return self._tokenizer.decode(ids, skip_special_tokens=skip_special_tokens)
|
||||
|
||||
def decode_batch(
|
||||
self, sequences: List[List[int]], skip_special_tokens: Optional[bool] = True
|
||||
) -> str:
|
||||
""" Decode the list of sequences to a list of string sequences
|
||||
|
||||
Args:
|
||||
sequences: List[List[unsigned int]]:
|
||||
A list of sequence of ids to be decoded
|
||||
|
||||
skip_special_tokens: (`optional`) boolean:
|
||||
Whether to remove all the special tokens from the output strings
|
||||
|
||||
Returns:
|
||||
A list of decoded strings
|
||||
"""
|
||||
if sequences is None:
|
||||
raise ValueError("None input is not valid. Should be list of list of integers.")
|
||||
|
||||
return self._tokenizer.decode_batch(sequences, skip_special_tokens=skip_special_tokens)
|
||||
|
||||
def token_to_id(self, token: str) -> Optional[int]:
|
||||
""" Convert the given token to its corresponding id
|
||||
|
||||
Args:
|
||||
token: str:
|
||||
The token to convert
|
||||
|
||||
Returns:
|
||||
The corresponding id if it exists, None otherwise
|
||||
"""
|
||||
return self._tokenizer.token_to_id(token)
|
||||
|
||||
def id_to_token(self, id: int) -> Optional[str]:
|
||||
""" Convert the given token id to its corresponding string
|
||||
|
||||
Args:
|
||||
id: int:
|
||||
The token id to convert
|
||||
|
||||
Returns:
|
||||
The corresponding string if it exists, None otherwise
|
||||
"""
|
||||
return self._tokenizer.id_to_token(id)
|
||||
|
||||
def save_model(self, directory: str, name: Optional[str] = None):
|
||||
""" Save the current model to the given directory
|
||||
|
||||
Args:
|
||||
directory: str:
|
||||
A path to the destination directory
|
||||
|
||||
name: (Optional) str:
|
||||
The name of the tokenizer, to be used in the saved files
|
||||
"""
|
||||
return self._tokenizer.model.save(directory, name=name)
|
||||
|
||||
def save(self, path: str, pretty: bool = False):
|
||||
""" Save the current Tokenizer at the given path
|
||||
|
||||
Args:
|
||||
path: str:
|
||||
A path to the destination Tokenizer file
|
||||
"""
|
||||
return self._tokenizer.save(path, pretty)
|
||||
|
||||
def to_str(self, pretty: bool = False):
|
||||
""" Get a serialized JSON version of the Tokenizer as a str
|
||||
|
||||
Args:
|
||||
pretty: bool:
|
||||
Whether the JSON string should be prettified
|
||||
|
||||
Returns:
|
||||
str
|
||||
"""
|
||||
return self._tokenizer.to_str(pretty)
|
||||
|
||||
def post_process(
|
||||
self, encoding: Encoding, pair: Optional[Encoding] = None, add_special_tokens: bool = True
|
||||
) -> Encoding:
|
||||
""" Apply all the post-processing steps to the given encodings.
|
||||
|
||||
The various steps are:
|
||||
1. Truncate according to global params (provided to `enable_truncation`)
|
||||
2. Apply the PostProcessor
|
||||
3. Pad according to global params. (provided to `enable_padding`)
|
||||
|
||||
Args:
|
||||
encoding: Encoding:
|
||||
The main Encoding to post process
|
||||
|
||||
pair: Optional[Encoding]:
|
||||
An optional pair Encoding
|
||||
|
||||
add_special_tokens: bool:
|
||||
Whether to add special tokens
|
||||
|
||||
Returns:
|
||||
The resulting Encoding
|
||||
"""
|
||||
return self._tokenizer.post_process(encoding, pair, add_special_tokens)
|
||||
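A sketch of how the wrapper methods above compose, assuming a tokenizer built from one of the implementations below and previously trained vocab/merges files (the file paths are placeholders):

from tokenizers import ByteLevelBPETokenizer

tok = ByteLevelBPETokenizer("vocab.json", "merges.txt")  # placeholder files
tok.enable_truncation(max_length=16)
tok.enable_padding(pad_token="<pad>", pad_id=0, length=16)

encodings = tok.encode_batch(["short text", "a somewhat longer piece of text"])
for enc in encodings:
    print(len(enc.ids), enc.attention_mask)  # every sequence padded/truncated to 16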
@@ -0,0 +1,113 @@
|
||||
from tokenizers import Tokenizer, AddedToken, decoders, trainers
|
||||
from tokenizers.models import WordPiece
|
||||
from tokenizers.normalizers import BertNormalizer
|
||||
from tokenizers.pre_tokenizers import BertPreTokenizer
|
||||
from tokenizers.processors import BertProcessing
|
||||
from .base_tokenizer import BaseTokenizer
|
||||
|
||||
from typing import Optional, List, Union
|
||||
|
||||
|
||||
class BertWordPieceTokenizer(BaseTokenizer):
|
||||
""" Bert WordPiece Tokenizer """
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
vocab_file: Optional[str] = None,
|
||||
unk_token: Union[str, AddedToken] = "[UNK]",
|
||||
sep_token: Union[str, AddedToken] = "[SEP]",
|
||||
cls_token: Union[str, AddedToken] = "[CLS]",
|
||||
pad_token: Union[str, AddedToken] = "[PAD]",
|
||||
mask_token: Union[str, AddedToken] = "[MASK]",
|
||||
clean_text: bool = True,
|
||||
handle_chinese_chars: bool = True,
|
||||
strip_accents: Optional[bool] = None,
|
||||
lowercase: bool = True,
|
||||
wordpieces_prefix: str = "##",
|
||||
):
|
||||
|
||||
if vocab_file is not None:
|
||||
tokenizer = Tokenizer(WordPiece(vocab_file, unk_token=str(unk_token)))
|
||||
else:
|
||||
tokenizer = Tokenizer(WordPiece(unk_token=str(unk_token)))
|
||||
|
||||
# Let the tokenizer know about special tokens if they are part of the vocab
|
||||
if tokenizer.token_to_id(str(unk_token)) is not None:
|
||||
tokenizer.add_special_tokens([str(unk_token)])
|
||||
if tokenizer.token_to_id(str(sep_token)) is not None:
|
||||
tokenizer.add_special_tokens([str(sep_token)])
|
||||
if tokenizer.token_to_id(str(cls_token)) is not None:
|
||||
tokenizer.add_special_tokens([str(cls_token)])
|
||||
if tokenizer.token_to_id(str(pad_token)) is not None:
|
||||
tokenizer.add_special_tokens([str(pad_token)])
|
||||
if tokenizer.token_to_id(str(mask_token)) is not None:
|
||||
tokenizer.add_special_tokens([str(mask_token)])
|
||||
|
||||
tokenizer.normalizer = BertNormalizer(
|
||||
clean_text=clean_text,
|
||||
handle_chinese_chars=handle_chinese_chars,
|
||||
strip_accents=strip_accents,
|
||||
lowercase=lowercase,
|
||||
)
|
||||
tokenizer.pre_tokenizer = BertPreTokenizer()
|
||||
|
||||
if vocab_file is not None:
|
||||
sep_token_id = tokenizer.token_to_id(str(sep_token))
|
||||
if sep_token_id is None:
|
||||
raise TypeError("sep_token not found in the vocabulary")
|
||||
cls_token_id = tokenizer.token_to_id(str(cls_token))
|
||||
if cls_token_id is None:
|
||||
raise TypeError("cls_token not found in the vocabulary")
|
||||
|
||||
tokenizer.post_processor = BertProcessing(
|
||||
(str(sep_token), sep_token_id), (str(cls_token), cls_token_id)
|
||||
)
|
||||
tokenizer.decoder = decoders.WordPiece(prefix=wordpieces_prefix)
|
||||
|
||||
parameters = {
|
||||
"model": "BertWordPiece",
|
||||
"unk_token": unk_token,
|
||||
"sep_token": sep_token,
|
||||
"cls_token": cls_token,
|
||||
"pad_token": pad_token,
|
||||
"mask_token": mask_token,
|
||||
"clean_text": clean_text,
|
||||
"handle_chinese_chars": handle_chinese_chars,
|
||||
"strip_accents": strip_accents,
|
||||
"lowercase": lowercase,
|
||||
"wordpieces_prefix": wordpieces_prefix,
|
||||
}
|
||||
|
||||
super().__init__(tokenizer, parameters)
|
||||
|
||||
def train(
|
||||
self,
|
||||
files: Union[str, List[str]],
|
||||
vocab_size: int = 30000,
|
||||
min_frequency: int = 2,
|
||||
limit_alphabet: int = 1000,
|
||||
initial_alphabet: List[str] = [],
|
||||
special_tokens: List[Union[str, AddedToken]] = [
|
||||
"[PAD]",
|
||||
"[UNK]",
|
||||
"[CLS]",
|
||||
"[SEP]",
|
||||
"[MASK]",
|
||||
],
|
||||
show_progress: bool = True,
|
||||
wordpieces_prefix: str = "##",
|
||||
):
|
||||
""" Train the model using the given files """
|
||||
|
||||
trainer = trainers.WordPieceTrainer(
|
||||
vocab_size=vocab_size,
|
||||
min_frequency=min_frequency,
|
||||
limit_alphabet=limit_alphabet,
|
||||
initial_alphabet=initial_alphabet,
|
||||
special_tokens=special_tokens,
|
||||
show_progress=show_progress,
|
||||
continuing_subword_prefix=wordpieces_prefix,
|
||||
)
|
||||
if isinstance(files, str):
|
||||
files = [files]
|
||||
self._tokenizer.train(trainer, files)
|
||||
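A minimal training sketch for the class above; the corpus paths are placeholders, and the saved vocabulary file name is assumed to follow the model's `{name}-vocab.txt` convention.

from tokenizers import BertWordPieceTokenizer

# Train a fresh WordPiece vocabulary from plain-text files (placeholder paths).
tok = BertWordPieceTokenizer(lowercase=True)
tok.train(["corpus_a.txt", "corpus_b.txt"], vocab_size=30000, min_frequency=2)
tok.save_model(".", "my-bert")  # writes the trained vocabulary into the current directory

# Reload with the vocab file so the [CLS]/[SEP] post-processor is wired up (see __init__ above).
tok = BertWordPieceTokenizer("my-bert-vocab.txt", lowercase=True)
print(tok.encode("Tokenization is fun.").tokens)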
@@ -0,0 +1,92 @@
|
||||
from tokenizers import Tokenizer, AddedToken, pre_tokenizers, decoders, trainers, processors
|
||||
from tokenizers.models import BPE
|
||||
from tokenizers.normalizers import unicode_normalizer_from_str, Lowercase, Sequence
|
||||
from .base_tokenizer import BaseTokenizer
|
||||
|
||||
from typing import Optional, List, Union
|
||||
|
||||
|
||||
class ByteLevelBPETokenizer(BaseTokenizer):
|
||||
""" ByteLevelBPETokenizer
|
||||
|
||||
Represents a Byte-level BPE as introduced by OpenAI with their GPT-2 model
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
vocab_file: Optional[str] = None,
|
||||
merges_file: Optional[str] = None,
|
||||
add_prefix_space: bool = False,
|
||||
lowercase: bool = False,
|
||||
dropout: Optional[float] = None,
|
||||
unicode_normalizer: Optional[str] = None,
|
||||
continuing_subword_prefix: Optional[str] = None,
|
||||
end_of_word_suffix: Optional[str] = None,
|
||||
trim_offsets: bool = False,
|
||||
):
|
||||
if vocab_file is not None and merges_file is not None:
|
||||
tokenizer = Tokenizer(
|
||||
BPE(
|
||||
vocab_file,
|
||||
merges_file,
|
||||
dropout=dropout,
|
||||
continuing_subword_prefix=continuing_subword_prefix or "",
|
||||
end_of_word_suffix=end_of_word_suffix or "",
|
||||
)
|
||||
)
|
||||
else:
|
||||
tokenizer = Tokenizer(BPE())
|
||||
|
||||
# Check for Unicode normalization first (before everything else)
|
||||
normalizers = []
|
||||
|
||||
if unicode_normalizer:
|
||||
normalizers += [unicode_normalizer_from_str(unicode_normalizer)]
|
||||
|
||||
if lowercase:
|
||||
normalizers += [Lowercase()]
|
||||
|
||||
# Create the normalizer structure
|
||||
if len(normalizers) > 0:
|
||||
if len(normalizers) > 1:
|
||||
tokenizer.normalizer = Sequence(normalizers)
|
||||
else:
|
||||
tokenizer.normalizer = normalizers[0]
|
||||
|
||||
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=add_prefix_space)
|
||||
tokenizer.decoder = decoders.ByteLevel()
|
||||
tokenizer.post_processor = processors.ByteLevel(trim_offsets=trim_offsets)
|
||||
|
||||
parameters = {
|
||||
"model": "ByteLevelBPE",
|
||||
"add_prefix_space": add_prefix_space,
|
||||
"lowercase": lowercase,
|
||||
"dropout": dropout,
|
||||
"unicode_normalizer": unicode_normalizer,
|
||||
"continuing_subword_prefix": continuing_subword_prefix,
|
||||
"end_of_word_suffix": end_of_word_suffix,
|
||||
"trim_offsets": trim_offsets,
|
||||
}
|
||||
|
||||
super().__init__(tokenizer, parameters)
|
||||
|
||||
def train(
|
||||
self,
|
||||
files: Union[str, List[str]],
|
||||
vocab_size: int = 30000,
|
||||
min_frequency: int = 2,
|
||||
show_progress: bool = True,
|
||||
special_tokens: List[Union[str, AddedToken]] = [],
|
||||
):
|
||||
""" Train the model using the given files """
|
||||
|
||||
trainer = trainers.BpeTrainer(
|
||||
vocab_size=vocab_size,
|
||||
min_frequency=min_frequency,
|
||||
show_progress=show_progress,
|
||||
special_tokens=special_tokens,
|
||||
initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
|
||||
)
|
||||
if isinstance(files, str):
|
||||
files = [files]
|
||||
self._tokenizer.train(trainer, files)
|
||||
@@ -0,0 +1,116 @@
|
||||
from .. import Tokenizer, AddedToken, pre_tokenizers, decoders, trainers
|
||||
from ..models import BPE
|
||||
from ..normalizers import Sequence, Lowercase, unicode_normalizer_from_str, BertNormalizer
|
||||
from .base_tokenizer import BaseTokenizer
|
||||
|
||||
from typing import Optional, List, Union
|
||||
|
||||
|
||||
class CharBPETokenizer(BaseTokenizer):
|
||||
""" Original BPE Tokenizer
|
||||
|
||||
Represents the BPE algorithm, as introduced by Rico Sennrich
|
||||
(https://arxiv.org/abs/1508.07909)
|
||||
|
||||
The default settings correspond to the OpenAI GPT BPE tokenizer and differ from the original
|
||||
Sennrich subword-nmt implementation by the following options that you can deactivate:
|
||||
- adding a normalizer to clean up the text (deactivate with `bert_normalizer=False`) by:
|
||||
* removing any control characters and replacing all whitespace characters with a plain space.
|
||||
* handling Chinese characters by putting spaces around them.
|
||||
* stripping all accents.
|
||||
- splitting on punctuation in addition to whitespace (deactivate it with
|
||||
`split_on_whitespace_only=True`)
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
vocab_file: Optional[str] = None,
|
||||
merges_file: Optional[str] = None,
|
||||
unk_token: Union[str, AddedToken] = "<unk>",
|
||||
suffix: str = "</w>",
|
||||
dropout: Optional[float] = None,
|
||||
lowercase: bool = False,
|
||||
unicode_normalizer: Optional[str] = None,
|
||||
bert_normalizer: bool = True,
|
||||
split_on_whitespace_only: bool = False,
|
||||
):
|
||||
if vocab_file is not None and merges_file is not None:
|
||||
tokenizer = Tokenizer(
|
||||
BPE(
|
||||
vocab_file,
|
||||
merges_file,
|
||||
dropout=dropout,
|
||||
unk_token=str(unk_token),
|
||||
end_of_word_suffix=suffix,
|
||||
)
|
||||
)
|
||||
else:
|
||||
tokenizer = Tokenizer(BPE())
|
||||
|
||||
if tokenizer.token_to_id(str(unk_token)) is not None:
|
||||
tokenizer.add_special_tokens([str(unk_token)])
|
||||
|
||||
# Check for Unicode normalization first (before everything else)
|
||||
normalizers = []
|
||||
|
||||
if unicode_normalizer:
|
||||
normalizers += [unicode_normalizer_from_str(unicode_normalizer)]
|
||||
|
||||
if bert_normalizer:
|
||||
normalizers += [BertNormalizer(lowercase=False)]
|
||||
|
||||
if lowercase:
|
||||
normalizers += [Lowercase()]
|
||||
|
||||
# Create the normalizer structure
|
||||
if len(normalizers) > 0:
|
||||
if len(normalizers) > 1:
|
||||
tokenizer.normalizer = Sequence(normalizers)
|
||||
else:
|
||||
tokenizer.normalizer = normalizers[0]
|
||||
|
||||
if split_on_whitespace_only:
|
||||
tokenizer.pre_tokenizer = pre_tokenizers.WhitespaceSplit()
|
||||
else:
|
||||
tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()
|
||||
|
||||
tokenizer.decoder = decoders.BPEDecoder(suffix=suffix)
|
||||
|
||||
parameters = {
|
||||
"model": "BPE",
|
||||
"unk_token": unk_token,
|
||||
"suffix": suffix,
|
||||
"dropout": dropout,
|
||||
"lowercase": lowercase,
|
||||
"unicode_normalizer": unicode_normalizer,
|
||||
"bert_normalizer": bert_normalizer,
|
||||
"split_on_whitespace_only": split_on_whitespace_only,
|
||||
}
|
||||
|
||||
super().__init__(tokenizer, parameters)
|
||||
|
||||
def train(
|
||||
self,
|
||||
files: Union[str, List[str]],
|
||||
vocab_size: int = 30000,
|
||||
min_frequency: int = 2,
|
||||
special_tokens: List[Union[str, AddedToken]] = ["<unk>"],
|
||||
limit_alphabet: int = 1000,
|
||||
initial_alphabet: List[str] = [],
|
||||
suffix: Optional[str] = "</w>",
|
||||
show_progress: bool = True,
|
||||
):
|
||||
""" Train the model using the given files """
|
||||
|
||||
trainer = trainers.BpeTrainer(
|
||||
vocab_size=vocab_size,
|
||||
min_frequency=min_frequency,
|
||||
special_tokens=special_tokens,
|
||||
limit_alphabet=limit_alphabet,
|
||||
initial_alphabet=initial_alphabet,
|
||||
end_of_word_suffix=suffix,
|
||||
show_progress=show_progress,
|
||||
)
|
||||
if isinstance(files, str):
|
||||
files = [files]
|
||||
self._tokenizer.train(trainer, files)
|
||||
@@ -0,0 +1,74 @@
|
||||
from tokenizers import Tokenizer, AddedToken, pre_tokenizers, decoders, trainers
|
||||
from tokenizers.models import BPE
|
||||
from tokenizers.normalizers import NFKC
|
||||
from .base_tokenizer import BaseTokenizer
|
||||
|
||||
from typing import Optional, List, Union
|
||||
|
||||
|
||||
class SentencePieceBPETokenizer(BaseTokenizer):
|
||||
""" SentencePiece BPE Tokenizer
|
||||
|
||||
Represents the BPE algorithm, with the pretokenization used by SentencePiece
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
vocab_file: Optional[str] = None,
|
||||
merges_file: Optional[str] = None,
|
||||
unk_token: Union[str, AddedToken] = "<unk>",
|
||||
replacement: str = "▁",
|
||||
add_prefix_space: bool = True,
|
||||
dropout: Optional[float] = None,
|
||||
):
|
||||
if vocab_file is not None and merges_file is not None:
|
||||
tokenizer = Tokenizer(
|
||||
BPE(vocab_file, merges_file, dropout=dropout, unk_token=unk_token)
|
||||
)
|
||||
else:
|
||||
tokenizer = Tokenizer(BPE())
|
||||
|
||||
if tokenizer.token_to_id(str(unk_token)) is not None:
|
||||
tokenizer.add_special_tokens([str(unk_token)])
|
||||
|
||||
tokenizer.normalizer = NFKC()
|
||||
tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(
|
||||
replacement=replacement, add_prefix_space=add_prefix_space
|
||||
)
|
||||
tokenizer.decoder = decoders.Metaspace(
|
||||
replacement=replacement, add_prefix_space=add_prefix_space
|
||||
)
|
||||
|
||||
parameters = {
|
||||
"model": "SentencePieceBPE",
|
||||
"unk_token": unk_token,
|
||||
"replacement": replacement,
|
||||
"add_prefix_space": add_prefix_space,
|
||||
"dropout": dropout,
|
||||
}
|
||||
|
||||
super().__init__(tokenizer, parameters)
|
||||
|
||||
def train(
|
||||
self,
|
||||
files: Union[str, List[str]],
|
||||
vocab_size: int = 30000,
|
||||
min_frequency: int = 2,
|
||||
special_tokens: List[Union[str, AddedToken]] = ["<unk>"],
|
||||
limit_alphabet: int = 1000,
|
||||
initial_alphabet: List[str] = [],
|
||||
show_progress: bool = True,
|
||||
):
|
||||
""" Train the model using the given files """
|
||||
|
||||
trainer = trainers.BpeTrainer(
|
||||
vocab_size=vocab_size,
|
||||
min_frequency=min_frequency,
|
||||
special_tokens=special_tokens,
|
||||
limit_alphabet=limit_alphabet,
|
||||
initial_alphabet=initial_alphabet,
|
||||
show_progress=show_progress,
|
||||
)
|
||||
if isinstance(files, str):
|
||||
files = [files]
|
||||
self._tokenizer.train(trainer, files)
|
||||
bindings/python/py_src/tokenizers/models/__init__.py (new file, 11 lines)
@@ -0,0 +1,11 @@
from typing import List, Tuple

from .. import models, Offsets

TokenizedSequence = List[str]
TokenizedSequenceWithOffsets = List[Tuple[str, Offsets]]

Model = models.Model
BPE = models.BPE
WordPiece = models.WordPiece
WordLevel = models.WordLevel
bindings/python/py_src/tokenizers/models/__init__.pyi (new file, 156 lines)
@@ -0,0 +1,156 @@
|
||||
from .. import Encoding, Offsets
|
||||
from typing import List, Optional, Union, Tuple
|
||||
|
||||
TokenizedSequence = List[str]
|
||||
TokenizedSequenceWithOffsets = List[Tuple[str, Offsets]]
|
||||
|
||||
class Model:
|
||||
""" Base class for all models
|
||||
|
||||
This class is not supposed to be instantiated directly. Instead, any implementation of
|
||||
a Model will return an instance of this class when instantiated.
|
||||
"""
|
||||
|
||||
def save(self, folder: str, name: Optional[str] = None) -> List[str]:
|
||||
""" Save the current model
|
||||
|
||||
Save the current model in the given folder, using the given name for the various
|
||||
files that will get created.
|
||||
Any file with the same name that already exist in this folder will be overwritten.
|
||||
"""
|
||||
pass
|
||||
def encode(
|
||||
self, sequence: Union[TokenizedSequence, TokenizedSequenceWithOffsets], type_id: int = 0
|
||||
) -> Encoding:
|
||||
""" Encode the given sequence.
|
||||
|
||||
A sequence can either be:
|
||||
- `TokenizedSequence`: (`List[str]`)
|
||||
- `TokenizedSequenceWithOffsets`: (`List[Tuple[str, Offsets]]`) where Offsets is
|
||||
a Tuple[int, int].
|
||||
|
||||
If the Offsets are not provided, they will be automatically generated, assuming
|
||||
that all the tokens in the `TokenizedSequence` are contiguous in the original string.
|
||||
|
||||
Args:
|
||||
sequence: Union[TokenizedSequence, TokenizedSequenceWithOffsets]
|
||||
Either a TokenizedSequence or a TokenizedSequenceWithOffsets
|
||||
|
||||
type_id: int:
|
||||
The type id of the given sequence
|
||||
|
||||
Returns:
|
||||
An Encoding
|
||||
"""
|
||||
pass
|
||||
def encode_batch(
|
||||
self,
|
||||
sequences: Union[List[TokenizedSequence], List[TokenizedSequenceWithOffsets]],
|
||||
type_id: int = 0,
|
||||
) -> List[Encoding]:
|
||||
""" Encode the given batch of sequence.
|
||||
|
||||
A sequence can either be:
|
||||
- `TokenizedSequence`: (`List[str]`)
|
||||
- `TokenizedSequenceWithOffsets`: (`List[Tuple[str, Offsets]]`) where Offsets is
|
||||
a Tuple[int, int].
|
||||
|
||||
If the Offsets are not provided, they will be automatically generated, assuming
|
||||
that all the tokens in the `TokenizedSequence` are contiguous in the original string.
|
||||
|
||||
Args:
|
||||
sequences: Union[List[TokenizedSequence], List[TokenizedSequenceWithOffsets]]
|
||||
A list of sequences. Each sequence is either a TokenizedSequence or a
|
||||
TokenizedSequenceWithOffsets
|
||||
|
||||
type_id: int:
|
||||
The type id of the given sequence
|
||||
|
||||
Returns:
|
||||
A list of Encoding
|
||||
"""
|
||||
pass
|
||||
|
||||
class BPE(Model):
|
||||
"""BytePairEncoding model class
|
||||
|
||||
Instantiate a BPE Model from the given vocab and merges files.
|
||||
|
||||
Args:
|
||||
vocab: (`optional`) string:
|
||||
Path to a vocabulary JSON file.
|
||||
|
||||
merges: (`optional`) string:
|
||||
Path to a merge file.
|
||||
|
||||
cache_capacity: (`optional`) int:
|
||||
The number of words that the BPE cache can contain. The cache allows
|
||||
to speed up the process by keeping the results of the merge operations
|
||||
for a number of words.
|
||||
|
||||
dropout: (`optional`) Optional[float] [0, 1]:
|
||||
The BPE dropout to use. Must be a float between 0 and 1
|
||||
|
||||
unk_token: (`optional`) str:
|
||||
The unknown token to be used by the model.
|
||||
|
||||
continuing_subword_prefix: (`optional`) str:
|
||||
The prefix to attach to subword units that don't represent a beginning of word.
|
||||
|
||||
end_of_word_suffix: (`optional`) str:
|
||||
The suffix to attach to subword units that represent an end of word.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
vocab: Optional[str],
|
||||
merges: Optional[str],
|
||||
cache_capacity: Optional[int],
|
||||
dropout: Optional[float],
|
||||
unk_token: Optional[str],
|
||||
continuing_subword_prefix: Optional[str],
|
||||
end_of_word_suffix: Optional[str],
|
||||
):
|
||||
pass
|
||||
|
||||
class WordPiece(Model):
|
||||
""" WordPiece model class
|
||||
|
||||
Instantiate a WordPiece Model from the given vocab file.
|
||||
|
||||
Args:
|
||||
vocab: (`optional`) string:
|
||||
Path to a vocabulary file.
|
||||
|
||||
unk_token: (`optional`) str:
|
||||
The unknown token to be used by the model.
|
||||
|
||||
max_input_chars_per_word: (`optional`) int:
|
||||
The maximum number of characters to authorize in a single word.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
vocab: Optional[str],
|
||||
unk_token: Optional[str],
|
||||
max_input_chars_per_word: Optional[int],
|
||||
):
|
||||
pass
|
||||

class WordLevel(Model):
    """
    The simplest tokenizer model, mapping tokens from a vocab file to their corresponding ids.

    Instantiate a WordLevel Model from the given vocab file.

    Args:
        vocab: (`optional`) string:
            Path to a vocabulary file.

        unk_token: str:
            The unknown token to be used by the model.
    """

    def __init__(self, vocab: Optional[str], unk_token: Optional[str]):
        pass
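As a minimal usage sketch of these model classes (not part of the diff itself; "vocab.json" and "merges.txt" are placeholder paths, and the encode call assumes the standard Tokenizer API of this release):

from tokenizers import Tokenizer
from tokenizers.models import BPE

# Placeholder paths: point these at a real BPE vocabulary and merges file.
tokenizer = Tokenizer(BPE("vocab.json", "merges.txt"))
encoding = tokenizer.encode("Hello world")
print(encoding.tokens)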
25
bindings/python/py_src/tokenizers/normalizers/__init__.py
Normal file
25
bindings/python/py_src/tokenizers/normalizers/__init__.py
Normal file
@@ -0,0 +1,25 @@
from .. import normalizers

Normalizer = normalizers.Normalizer
BertNormalizer = normalizers.BertNormalizer
NFD = normalizers.NFD
NFKD = normalizers.NFKD
NFC = normalizers.NFC
NFKC = normalizers.NFKC
Sequence = normalizers.Sequence
Lowercase = normalizers.Lowercase
Strip = normalizers.Strip


NORMALIZERS = {"nfc": NFC, "nfd": NFD, "nfkc": NFKC, "nfkd": NFKD}


def unicode_normalizer_from_str(normalizer: str) -> Normalizer:
    if normalizer not in NORMALIZERS:
        raise ValueError(
            "{} is not a known unicode normalizer. Available are {}".format(
                normalizer, NORMALIZERS.keys()
            )
        )

    return NORMALIZERS[normalizer]()
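The helper above maps a lowercase name to one of the unicode normalizer classes, for example:

from tokenizers.normalizers import unicode_normalizer_from_str

nfkc = unicode_normalizer_from_str("nfkc")  # returns an NFKC() instance
# Any name outside {"nfc", "nfd", "nfkc", "nfkd"} raises a ValueError.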
108
bindings/python/py_src/tokenizers/normalizers/__init__.pyi
Normal file
108
bindings/python/py_src/tokenizers/normalizers/__init__.pyi
Normal file
@@ -0,0 +1,108 @@
from typing import Optional, List


class Normalizer:
    """ Base class for all normalizers

    This class is not supposed to be instantiated directly. Instead, any implementation of a
    Normalizer will return an instance of this class when instantiated.
    """

class BertNormalizer(Normalizer):
    """ BertNormalizer

    Takes care of normalizing raw text before giving it to a Bert model.
    This includes cleaning the text, handling accents, Chinese chars and lowercasing.
    """

    def __init__(
        self,
        clean_text: Optional[bool] = True,
        handle_chinese_chars: Optional[bool] = True,
        strip_accents: Optional[bool] = None,
        lowercase: Optional[bool] = True,
    ) -> None:
        """ Instantiate a BertNormalizer with the given options.

        Args:
            clean_text: (`optional`) boolean:
                Whether to clean the text, by removing any control characters
                and replacing all whitespaces by the classic one.

            handle_chinese_chars: (`optional`) boolean:
                Whether to handle Chinese chars by putting spaces around them.

            strip_accents: (`optional`) boolean:
                Whether to strip all accents. If this option is not specified (i.e. == None),
                then it will be determined by the value for `lowercase` (as in the original Bert).

            lowercase: (`optional`) boolean:
                Whether to lowercase.

        Returns:
            Normalizer
        """
        pass

class NFD(Normalizer):
    """ NFD Unicode Normalizer """

    def __init__(self) -> None:
        """ Instantiate a new NFD Normalizer """
        pass

class NFKD(Normalizer):
    """ NFKD Unicode Normalizer """

    def __init__(self) -> None:
        """ Instantiate a new NFKD Normalizer """
        pass

class NFC(Normalizer):
    """ NFC Unicode Normalizer """

    def __init__(self) -> None:
        """ Instantiate a new NFC Normalizer """
        pass

class NFKC(Normalizer):
    """ NFKC Unicode Normalizer """

    def __init__(self) -> None:
        """ Instantiate a new NFKC Normalizer """
        pass

class Sequence(Normalizer):
    """ Allows concatenating multiple other Normalizers as a Sequence.

    All the normalizers run in sequence in the given order.
    """

    def __init__(self, normalizers: List[Normalizer]) -> None:
        """ Instantiate a new normalization Sequence using the given normalizers

        Args:
            normalizers: List[Normalizer]:
                A list of Normalizer to be run as a sequence
        """
        pass

class Lowercase(Normalizer):
    """ Lowercase Normalizer """

    def __init__(self) -> None:
        """ Instantiate a new Lowercase Normalizer """
        pass

class Strip(Normalizer):
    """ Strip normalizer """

    def __init__(self, left: bool = True, right: bool = True) -> None:
        pass

def unicode_normalizer_from_str(normalizer: str) -> Normalizer:
    """
    Instantiate a unicode normalizer from the normalizer name

    :param normalizer: Name of the normalizer
    :return: The corresponding Normalizer instance
    """
    pass
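As a short usage sketch of the classes stubbed above (attaching the result to a tokenizer is only indicated as a comment, it is not part of this diff):

from tokenizers.normalizers import NFD, Lowercase, Strip, Sequence, BertNormalizer

normalizer = Sequence([NFD(), Lowercase(), Strip()])
bert_normalizer = BertNormalizer(strip_accents=None, lowercase=True)
# A normalizer is typically assigned to a Tokenizer, e.g. tokenizer.normalizer = normalizer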
9
bindings/python/py_src/tokenizers/pre_tokenizers/__init__.py
Normal file
9
bindings/python/py_src/tokenizers/pre_tokenizers/__init__.py
Normal file
@@ -0,0 +1,9 @@
from .. import pre_tokenizers

PreTokenizer = pre_tokenizers.PreTokenizer
ByteLevel = pre_tokenizers.ByteLevel
Whitespace = pre_tokenizers.Whitespace
WhitespaceSplit = pre_tokenizers.WhitespaceSplit
BertPreTokenizer = pre_tokenizers.BertPreTokenizer
Metaspace = pre_tokenizers.Metaspace
CharDelimiterSplit = pre_tokenizers.CharDelimiterSplit
109
bindings/python/py_src/tokenizers/pre_tokenizers/__init__.pyi
Normal file
109
bindings/python/py_src/tokenizers/pre_tokenizers/__init__.pyi
Normal file
@@ -0,0 +1,109 @@
from typing import Optional, List, Tuple

Offsets = Tuple[int, int]

class PreTokenizer:
    """ Base class for all pre-tokenizers

    This class is not supposed to be instantiated directly. Instead, any implementation of a
    PreTokenizer will return an instance of this class when instantiated.
    """

    def pre_tokenize(self, sequence: str) -> List[Tuple[str, Offsets]]:
        """ Pre-tokenize the given sequence """
        pass

class ByteLevel(PreTokenizer):
    """ ByteLevel PreTokenizer

    This pre-tokenizer takes care of replacing all bytes of the given string
    with a corresponding representation, as well as splitting into words.
    """

    def __init__(self, add_prefix_space: bool = True) -> None:
        """ Instantiate a new ByteLevel PreTokenizer

        Args:
            add_prefix_space: (`optional`) boolean:
                Whether to add a space to the first word if there isn't already one. This
                lets us treat `hello` exactly like `say hello`.

        Returns:
            PreTokenizer
        """
        pass

    @staticmethod
    def alphabet() -> List[str]:
        """ Returns the alphabet used by this PreTokenizer.

        Since the ByteLevel works, as its name suggests, at the byte level, it
        encodes each byte as one visible character. This means that there is a
        total of 256 different characters composing this alphabet.
        """
        pass

class Whitespace(PreTokenizer):
    """ Whitespace PreTokenizer

    This pre-tokenizer simply splits using the following regex: `\w+|[^\w\s]+`
    """

    def __init__(self) -> None:
        """ Instantiate a new Whitespace PreTokenizer """
        pass

class WhitespaceSplit(PreTokenizer):
    """ WhitespaceSplit PreTokenizer

    This pre-tokenizer simply splits on whitespace. Works like `.split()`
    """

    def __init__(self) -> None:
        """ Instantiate a new WhitespaceSplit PreTokenizer """
        pass

class BertPreTokenizer(PreTokenizer):
    """ BertPreTokenizer

    This pre-tokenizer splits tokens on spaces, and also on punctuation.
    Each occurrence of a punctuation character is treated separately.
    """

    def __init__(self) -> None:
        """ Instantiate a new BertPreTokenizer """
        pass

class Metaspace(PreTokenizer):
    """ Metaspace pre-tokenizer

    This pre-tokenizer replaces any whitespace with the provided replacement character.
    It then tries to split on these spaces.
    """

    def __init__(self, replacement: str = "▁", add_prefix_space: bool = True) -> None:
        """ Instantiate a new Metaspace

        Args:
            replacement: str:
                The replacement character. Must be exactly one character. By default we
                use the `▁` (U+2581) meta symbol (same as in SentencePiece).

            add_prefix_space: boolean:
                Whether to add a space to the first word if there isn't already one. This
                lets us treat `hello` exactly like `say hello`.
        """
        pass

class CharDelimiterSplit(PreTokenizer):
    """ CharDelimiterSplit PreTokenizer

    This pre-tokenizer simply splits on the provided char. Works like `.split(delimiter)`
    """

    def __init__(self, delimiter: str) -> None:
        """ Instantiate a new CharDelimiterSplit PreTokenizer

        Args:
            delimiter: str:
                The delimiter char that will be used to split the input
        """
        pass
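A brief sketch of using these pre-tokenizers on their own, following the `pre_tokenize` signature above (the printed output is indicative; exact values depend on the installed version):

from tokenizers.pre_tokenizers import Whitespace, ByteLevel

pre_tokenizer = Whitespace()
print(pre_tokenizer.pre_tokenize("Hello, world!"))
# e.g. [("Hello", (0, 5)), (",", (5, 6)), ("world", (7, 12)), ("!", (12, 13))]

print(len(ByteLevel.alphabet()))  # 256, as documented above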
6
bindings/python/py_src/tokenizers/processors/__init__.py
Normal file
6
bindings/python/py_src/tokenizers/processors/__init__.py
Normal file
@@ -0,0 +1,6 @@
from .. import processors

PostProcessor = processors.PostProcessor
BertProcessing = processors.BertProcessing
RobertaProcessing = processors.RobertaProcessing
ByteLevel = processors.ByteLevel
99
bindings/python/py_src/tokenizers/processors/__init__.pyi
Normal file
99
bindings/python/py_src/tokenizers/processors/__init__.pyi
Normal file
@@ -0,0 +1,99 @@
from typing import Tuple


class PostProcessor:
    """ Base class for all post-processors

    This class is not supposed to be instantiated directly. Instead, any implementation of
    a PostProcessor will return an instance of this class when instantiated.
    """

    def num_special_tokens_to_add(self, is_pair: bool) -> int:
        """
        Return the number of special tokens that would be added for single/pair sentences.

        :param is_pair: Boolean indicating if the input would be a single sentence or a pair
        :return: The number of special tokens added
        """
        pass

class BertProcessing(PostProcessor):
    """ BertProcessing

    This post-processor takes care of adding the special tokens needed by
    a Bert model:
        - a SEP token
        - a CLS token
    """

    def __init__(self, sep: Tuple[str, int], cls: Tuple[str, int]) -> None:
        """ Instantiate a new BertProcessing with the given tokens

        Args:
            sep: Tuple[str, int]:
                A tuple with the string representation of the SEP token, and its id

            cls: Tuple[str, int]:
                A tuple with the string representation of the CLS token, and its id

        Returns:
            PostProcessor
        """
        pass

class RobertaProcessing(PostProcessor):
    """ RobertaProcessing

    This post-processor takes care of adding the special tokens needed by
    a Roberta model:
        - a SEP token
        - a CLS token

    It also takes care of trimming the offsets.
    By default, the ByteLevel BPE might include whitespaces in the produced tokens. If you don't
    want the offsets to include these whitespaces, then this PostProcessor should be initialized
    with `trim_offsets=True`
    """

    def __init__(
        self,
        sep: Tuple[str, int],
        cls: Tuple[str, int],
        trim_offsets: bool = True,
        add_prefix_space: bool = True,
    ) -> None:
        """ Instantiate a new RobertaProcessing with the given tokens

        Args:
            sep: Tuple[str, int]:
                A tuple with the string representation of the SEP token, and its id

            cls: Tuple[str, int]:
                A tuple with the string representation of the CLS token, and its id

            trim_offsets: bool:
                Whether to trim the whitespaces from the produced offsets.

            add_prefix_space: bool:
                Whether the add_prefix_space option was enabled during pre-tokenization. This
                is relevant because it defines the way the offsets are trimmed out.

        Returns:
            PostProcessor
        """
        pass

class ByteLevel(PostProcessor):
    """ ByteLevel Post processing

    This post-processor takes care of trimming the offsets.
    By default, the ByteLevel BPE might include whitespaces in the produced tokens. If you don't
    want the offsets to include these whitespaces, then this PostProcessor must be used.
    """

    def __init__(self, trim_offsets: bool = True) -> None:
        """ Instantiate a new ByteLevel

        Args:
            trim_offsets: bool:
                Whether to trim the whitespaces from the produced offsets.
        """
        pass
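For illustration, a BertProcessing post-processor is built from the (token, id) pairs of the SEP and CLS tokens; the ids 102 and 101 below are the usual bert-base-uncased values and are placeholders only:

from tokenizers.processors import BertProcessing

post_processor = BertProcessing(("[SEP]", 102), ("[CLS]", 101))
# Typically assigned to a tokenizer, e.g. tokenizer.post_processor = post_processor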
5
bindings/python/py_src/tokenizers/trainers/__init__.py
Normal file
5
bindings/python/py_src/tokenizers/trainers/__init__.py
Normal file
@@ -0,0 +1,5 @@
from .. import trainers

Trainer = trainers.Trainer
BpeTrainer = trainers.BpeTrainer
WordPieceTrainer = trainers.WordPieceTrainer
113
bindings/python/py_src/tokenizers/trainers/__init__.pyi
Normal file
113
bindings/python/py_src/tokenizers/trainers/__init__.pyi
Normal file
@@ -0,0 +1,113 @@
from .. import AddedToken
from typing import Optional, List, Union


class Trainer:
    """ Base class for all trainers

    This class is not supposed to be instantiated directly. Instead, any implementation of a
    Trainer will return an instance of this class when instantiated.
    """

class BpeTrainer(Trainer):
    """ BpeTrainer

    Capable of training a BPE model
    """

    def __init__(
        self,
        vocab_size: int = 30000,
        min_frequency: int = 0,
        show_progress: bool = True,
        special_tokens: List[Union[str, AddedToken]] = [],
        limit_alphabet: Optional[int] = None,
        initial_alphabet: List[str] = [],
        continuing_subword_prefix: Optional[str] = None,
        end_of_word_suffix: Optional[str] = None,
    ) -> None:
        """ Instantiate a new BpeTrainer with the given options:

        Args:
            vocab_size: unsigned int:
                The size of the final vocabulary, including all tokens and alphabet.

            min_frequency: unsigned int:
                The minimum frequency a pair should have in order to be merged.

            show_progress: boolean:
                Whether to show progress bars while training.

            special_tokens: List[Union[str, AddedToken]]:
                A list of special tokens the model should know of.

            limit_alphabet: unsigned int:
                The maximum number of different characters to keep in the alphabet.

            initial_alphabet: List[str]:
                A list of characters to include in the initial alphabet, even
                if not seen in the training dataset.
                If a string contains more than one character, only the first one
                is kept.

            continuing_subword_prefix: Optional[str]:
                A prefix to be used for every subword that is not a beginning-of-word.

            end_of_word_suffix: Optional[str]:
                A suffix to be used for every subword that is an end-of-word.

        Returns:
            Trainer
        """
        pass

class WordPieceTrainer(Trainer):
    """ WordPieceTrainer

    Capable of training a WordPiece model
    """

    def __init__(
        self,
        vocab_size: int = 30000,
        min_frequency: int = 0,
        show_progress: bool = True,
        special_tokens: List[Union[str, AddedToken]] = [],
        limit_alphabet: Optional[int] = None,
        initial_alphabet: List[str] = [],
        continuing_subword_prefix: Optional[str] = "##",
        end_of_word_suffix: Optional[str] = None,
    ) -> None:
        """ Instantiate a new WordPieceTrainer with the given options:

        Args:
            vocab_size: unsigned int:
                The size of the final vocabulary, including all tokens and alphabet.

            min_frequency: unsigned int:
                The minimum frequency a pair should have in order to be merged.

            show_progress: boolean:
                Whether to show progress bars while training.

            special_tokens: List[Union[str, AddedToken]]:
                A list of special tokens the model should know of.

            limit_alphabet: unsigned int:
                The maximum number of different characters to keep in the alphabet.

            initial_alphabet: List[str]:
                A list of characters to include in the initial alphabet, even
                if not seen in the training dataset.
                If a string contains more than one character, only the first one
                is kept.

            continuing_subword_prefix: Optional[str]:
                A prefix to be used for every subword that is not a beginning-of-word.

            end_of_word_suffix: Optional[str]:
                A suffix to be used for every subword that is an end-of-word.

        Returns:
            Trainer
        """
        pass
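A minimal sketch of building a trainer from these stubs; the special tokens below are placeholders, and the exact Tokenizer.train() signature should be checked against the installed version:

from tokenizers.trainers import BpeTrainer

trainer = BpeTrainer(
    vocab_size=30000,
    min_frequency=2,
    special_tokens=["<unk>", "<pad>"],  # placeholder special tokens
)
# The trainer is then passed to Tokenizer.train() together with the training files.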