Move Python source to subdirectory.

This allows testing versions that are not built in place. Otherwise,
importing (or testing) from the package root fails unless a develop
build has been made.
Replace maturin with setuptools_rust, since maturin does not work with
the proper project structure.
Sebastian Pütz
2020-07-25 19:41:18 +02:00
parent e9a2e63a67
commit 0d7c232f95
23 changed files with 5 additions and 4 deletions
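
The commit message mentions switching the build from maturin to setuptools_rust. As a rough orientation only — a hedged sketch, not the commit's actual setup.py — a setuptools_rust-based build for this layout typically looks like the following. The "py_src" directory name is an assumption; the extension name "tokenizers.tokenizers" follows from the `from .tokenizers import ...` lines in the package `__init__` below.

from setuptools import setup
from setuptools_rust import Binding, RustExtension

setup(
    name="tokenizers",
    version="0.8.1",
    # Compile the PyO3 crate and install it as the tokenizers.tokenizers module
    rust_extensions=[RustExtension("tokenizers.tokenizers", binding=Binding.PyO3)],
    package_dir={"": "py_src"},  # assumed name of the new Python source subdirectory
    packages=["tokenizers"],
    zip_safe=False,  # native extensions cannot be imported from a zip
)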

View File

@@ -0,0 +1,29 @@
__version__ = "0.8.1"
from typing import List, Tuple, Union
Offsets = Tuple[int, int]
TextInputSequence = str
PreTokenizedInputSequence = Union[List[str], Tuple[str]]
TextEncodeInput = Union[TextInputSequence, Tuple[TextInputSequence, TextInputSequence]]
PreTokenizedEncodeInput = Union[
PreTokenizedInputSequence, Tuple[PreTokenizedInputSequence, PreTokenizedInputSequence]
]
InputSequence = Union[TextInputSequence, PreTokenizedInputSequence]
EncodeInput = Union[TextEncodeInput, PreTokenizedEncodeInput]
from .tokenizers import Tokenizer, Encoding, AddedToken
from .tokenizers import decoders
from .tokenizers import models
from .tokenizers import normalizers
from .tokenizers import pre_tokenizers
from .tokenizers import processors
from .tokenizers import trainers
from .implementations import (
ByteLevelBPETokenizer,
CharBPETokenizer,
SentencePieceBPETokenizer,
BertWordPieceTokenizer,
)

View File

@@ -0,0 +1,636 @@
from .decoders import *
from .models import *
from .normalizers import *
from .pre_tokenizers import *
from .processors import *
from .trainers import *
from .implementations import (
ByteLevelBPETokenizer as ByteLevelBPETokenizer,
CharBPETokenizer as CharBPETokenizer,
SentencePieceBPETokenizer as SentencePieceBPETokenizer,
BertWordPieceTokenizer as BertWordPieceTokenizer,
)
from typing import Dict, List, Optional, Tuple, Union
Offsets = Tuple[int, int]
TextInputSequence = str
PreTokenizedInputSequence = Union[List[str], Tuple[str]]
TextEncodeInput = Union[TextInputSequence, Tuple[TextInputSequence, TextInputSequence]]
PreTokenizedEncodeInput = Union[
PreTokenizedInputSequence, Tuple[PreTokenizedInputSequence, PreTokenizedInputSequence],
]
InputSequence = Union[TextInputSequence, PreTokenizedInputSequence]
EncodeInput = Union[TextEncodeInput, PreTokenizedEncodeInput]
class Encoding:
""" An Encoding as returned by the Tokenizer """
@staticmethod
def merge(encodings: List[Encoding], growing_offsets: bool = True) -> Encoding:
""" Merge the list of Encoding into one final Encoding
Args:
encodings: List[Encoding]:
The list of encodings
growing_offsets: bool:
Whether the offsets should accumulate while merging
Returns:
The resulting Encoding
"""
pass
@property
def ids(self) -> List[int]:
""" The tokenized ids """
pass
@property
def tokens(self) -> List[str]:
""" The tokenized strings """
pass
@property
def words(self) -> List[Optional[int]]:
""" The tokenized words index """
pass
@property
def type_ids(self) -> List[int]:
""" The type ids """
pass
@property
def offsets(self) -> List[Offsets]:
""" The offsets.
These offsets can be used to index any `IndexableString` directly. If you want to
index the original `str`, make sure to retrieve the converted offsets using the `.offsets`
method on the `original_str`.
"""
pass
@property
def special_tokens_mask(self) -> List[int]:
""" The special tokens mask """
pass
@property
def attention_mask(self) -> List[int]:
""" The attention mask """
pass
@property
def overflowing(self) -> Optional[Encoding]:
""" The overflowing encoding, after truncation """
pass
def word_to_tokens(self, word_index: int) -> Optional[Tuple[int, int]]:
"""
Get the encoded tokens corresponding to the word at the given index in the input
sequence, with the form [start_token, end_token + 1]
Args:
word_index: int:
The index of the word in the input sequence.
Returns:
The range of tokens with the form [start_token, end_token + 1]
"""
pass
def word_to_chars(self, word_index: int) -> Optional[Offsets]:
"""
Get the offsets of the word at the given index in the input sequence.
Args:
word_index: int:
The index of the word in the input sequence.
Returns:
The word offsets
"""
pass
def token_to_chars(self, token_index: int) -> Optional[Offsets]:
"""
Get the offsets of the token at the given index
Args:
token_index: int:
The index of the token in the encoded sequence.
Returns:
The token offsets
"""
pass
def token_to_word(self, token_index: int) -> Optional[int]:
"""
Get the word that contains the token at the given index
Args:
token_index: int:
The index of the token in the encoded sequence.
Returns:
The index of the word in the input sequence.
"""
pass
def char_to_token(self, pos: int) -> Optional[int]:
"""
Get the token that contains the char at the given position
Args:
pos: int:
The position of a char in the input string
Returns:
The index of the token that contains this char
"""
pass
def char_to_word(self, pos: int) -> Optional[int]:
"""
Get the word that contains the given char.
Args:
pos: int:
The position of a char in the input string
Returns:
The index of the word that contains this char
"""
pass
def pad(
self,
length: int,
pad_id: Optional[int] = 0,
pad_type_id: Optional[int] = 0,
pad_token: Optional[str] = "[PAD]",
direction: Optional[str] = "right",
):
""" Pad the current Encoding at the given length
Args:
length: int:
The length at which to pad
direction: (`optional`) str:
Can be one of: `right` or `left`
            pad_id: (`optional`) unsigned int:
                The id to be used when padding
            pad_type_id: (`optional`) unsigned int:
                The type id to be used when padding
pad_token: (`optional`) str:
The pad token to be used when padding
"""
pass
def truncate(self, max_length: int, stride: Optional[int] = 0):
""" Truncate the current Encoding at the given max_length
Args:
max_length: int:
The maximum length to be kept
stride: (`optional`) unsigned int:
The length of the previous first sequence to be included
in the overflowing sequence
"""
pass
class AddedToken:
""" AddedToken represents a token to be added to a Tokenizer
An AddedToken can have special options defining the way it should behave.
"""
def __new__(
cls,
content: str = "",
single_word: bool = False,
lstrip: bool = False,
rstrip: bool = False,
normalized: bool = True,
) -> AddedToken:
""" Instantiate a new AddedToken
Args:
content: str:
The content of the token
single_word: bool
Whether this token should only match against single words. If True,
                this token will never match inside of a word. For example, the token `ing` would
                match on `tokenizing` if this option is False, but not if it is True.
lstrip: bool
Whether this token should strip all potential whitespaces on the left side.
If True, this token will greedily match any whitespace on the left. For example,
if we try to match the token `[MASK]` with lstrip=True, in the text `I saw a [MASK]`
we will match on ` [MASK]`.
rstrip: bool
Whether this token should strip all potential whitespaces on the right side.
If True, this token will greedily match any whitespace on the right. It works just
like lstrip, but on the right.
normalized: bool:
                Whether this token should match the normalized version of the input text. For
                example, with the added token `yesterday` and a normalizer in charge of lowercasing
                the text, the token could be extracted from the input `I saw a lion Yesterday`.
"""
pass
class Tokenizer:
""" Tokenizer
A Tokenizer works as a pipeline, it processes some raw text as input and outputs
an `Encoding`.
The various steps of the pipeline are:
1. The `Normalizer`: in charge of normalizing the text. Common examples of
normalization are the unicode normalization standards, such as NFD or NFKC.
2. The `PreTokenizer`: in charge of creating initial words splits in the text.
The most common way of splitting text is simply on whitespace.
3. The `Model`: in charge of doing the actual tokenization. An example of a
`Model` would be `BPE` or `WordPiece`.
4. The `PostProcessor`: in charge of post-processing the `Encoding` to add anything
relevant that, for example, a language model would need, such as special tokens.
"""
def __new__(cls, model: models.Model) -> Tokenizer:
""" Instantiate a new Tokenizer using the given Model
Args:
model: models.Model:
The model to be used with this Tokenizer
Returns:
Tokenizer
"""
pass
@staticmethod
def from_str(s: str) -> Tokenizer:
""" Instantiate a new Tokenizer from the given JSON string
Args:
s: str:
A JSON string representation of the Tokenizer
Returns:
Tokenizer
"""
pass
@staticmethod
def from_file(path: str) -> Tokenizer:
""" Instantiate a new Tokenizer from the given file
Args:
path: str:
Path to a file containing a Tokenizer
Returns:
Tokenizer
"""
pass
@staticmethod
def from_buffer(buffer: bytes) -> Tokenizer:
""" Instantiate a new Tokenizer from the given buffer
Args:
buffer: bytes:
A buffer used to instantiate a new Tokenizer
Returns:
Tokenizer
"""
pass
def to_str(self, pretty: bool = False) -> str:
""" Get a serialized JSON version of the Tokenizer as a str
Args:
pretty: bool:
Whether the JSON string should be prettified
Returns:
str
"""
pass
def save(self, path: str, pretty: bool = False):
""" Save the Tokenizer as JSON to the given path
Args:
pretty: bool:
Whether the JSON string should be prettified
"""
pass
@property
def model(self) -> Model:
""" Get the model in use with this Tokenizer """
pass
@model.setter
def model(self, model: models.Model):
""" Change the model to use with this Tokenizer """
pass
@property
def pre_tokenizer(self) -> Optional[PreTokenizer]:
""" Get the pre-tokenizer in use with this model """
pass
@pre_tokenizer.setter
def pre_tokenizer(self, pre_tokenizer: pre_tokenizers.PreTokenizer):
""" Change the pre tokenizer to use with this Tokenizer """
pass
@property
def decoder(self) -> Optional[Decoder]:
""" Get the decoder in use with this model """
pass
@decoder.setter
def decoder(self, decoder: decoders.Decoder):
""" Change the decoder to use with this Tokenizer """
pass
@property
def post_processor(self) -> Optional[PostProcessor]:
""" Get the post-processor in use with this Tokenizer """
pass
@post_processor.setter
def post_processor(self, processor: processors.PostProcessor):
""" Change the post processor to use with this Tokenizer """
@property
def normalizer(self) -> Optional[Normalizer]:
""" Get the normalizer in use with this Tokenizer """
pass
@normalizer.setter
def normalizer(self, normalizer: normalizers.Normalizer):
""" Change the normalizer to use with this Tokenizer """
def num_special_tokens_to_add(self, is_pair: bool) -> int:
"""
Return the number of special tokens that would be added for single/pair sentences.
:param is_pair: Boolean indicating if the input would be a single sentence or a pair
:return:
"""
pass
def get_vocab(self, with_added_tokens: bool = True) -> Dict[str, int]:
""" Returns the vocabulary
Args:
with_added_tokens: boolean:
Whether to include the added tokens in the vocabulary
Returns:
The vocabulary
"""
pass
def get_vocab_size(self, with_added_tokens: bool = True) -> int:
""" Returns the size of the vocabulary
Args:
with_added_tokens: boolean:
Whether to include the added tokens in the vocabulary's size
Returns:
The size of the vocabulary
"""
pass
    def enable_truncation(self, max_length: int, stride: Optional[int] = 0, strategy: Optional[str] = "longest_first"):
""" Enable the truncation
Args:
max_length: unsigned int:
The maximum length at which to truncate
stride: (`optional`) unsigned int:
The length of the previous first sequence to be included
in the overflowing sequence
            strategy: (`optional`) str:
Can be one of `longest_first`, `only_first` or `only_second`
"""
pass
def no_truncation(self):
""" Disable truncation """
pass
@property
def truncation(self) -> Optional[dict]:
""" Get the current truncation parameters
Returns:
None if truncation is disabled, a dict with the current truncation parameters if
truncation is enabled
"""
pass
def enable_padding(
self,
direction: Optional[str] = "right",
pad_to_multiple_of: Optional[int] = None,
pad_id: Optional[int] = 0,
pad_type_id: Optional[int] = 0,
pad_token: Optional[str] = "[PAD]",
length: Optional[int] = None,
):
""" Enable the padding
Args:
direction: (`optional`) str:
Can be one of: `right` or `left`
pad_to_multiple_of: (`optional`) unsigned int:
If specified, the padding length should always snap to the next multiple of
the given value. For example if we were going to pad with a length of 250 but
`pad_to_multiple_of=8` then we will pad to 256.
            pad_id: (`optional`) unsigned int:
                The id to be used when padding
            pad_type_id: (`optional`) unsigned int:
                The type id to be used when padding
pad_token: (`optional`) str:
The pad token to be used when padding
length: (`optional`) unsigned int:
If specified, the length at which to pad. If not specified
we pad using the size of the longest sequence in a batch
"""
pass
def no_padding(self):
""" Disable padding """
pass
@property
def padding(self) -> Optional[dict]:
""" Get the current padding parameters
Returns:
None if padding is disabled, a dict with the currently set parameters
if the padding is enabled.
"""
pass
def normalize(self, sequence: str) -> str:
""" Normalize the given sequence
Args:
sequence: str:
The sequence to normalize
Returns:
The normalized string
"""
pass
def encode(
self,
sequence: InputSequence,
        pair: Optional[InputSequence] = None,
is_pretokenized: bool = False,
add_special_tokens: bool = True,
) -> Encoding:
""" Encode the given sequence and pair. This method can process raw text sequences as well
as already pre-tokenized sequences.
Args:
sequence: InputSequence:
The sequence we want to encode. This sequence can be either raw text or
pre-tokenized, according to the `is_pretokenized` argument:
- If `is_pretokenized=False`: `InputSequence` is expected to be `str`
- If `is_pretokenized=True`: `InputSequence` is expected to be
`Union[List[str], Tuple[str]]`
is_pretokenized: bool:
Whether the input is already pre-tokenized
add_special_tokens: bool:
Whether to add the special tokens while encoding.
Returns:
An Encoding
"""
pass
def encode_batch(
self,
inputs: List[EncodeInput],
is_pretokenized: bool = False,
add_special_tokens: bool = True,
) -> List[Encoding]:
""" Encode the given inputs. This method accept both raw text sequences as well as already
pre-tokenized sequences.
Args:
inputs: List[EncodeInput]:
A list of single sequences or pair sequences to encode. Each `EncodeInput` is
expected to be of the following form:
`Union[InputSequence, Tuple[InputSequence, InputSequence]]`
Each `InputSequence` can either be raw text or pre-tokenized,
according to the `is_pretokenized` argument:
- If `is_pretokenized=False`: `InputSequence` is expected to be `str`
- If `is_pretokenized=True`: `InputSequence` is expected to be
`Union[List[str], Tuple[str]]`
is_pretokenized: bool:
Whether the input is already pre-tokenized.
add_special_tokens: bool:
Whether to add the special tokens while encoding.
Returns:
A list of Encoding
"""
pass
def decode(self, ids: List[int], skip_special_tokens: Optional[bool] = True) -> str:
""" Decode the given list of ids to a string sequence
Args:
ids: List[unsigned int]:
A list of ids to be decoded
skip_special_tokens: (`optional`) boolean:
Whether to remove all the special tokens from the output string
Returns:
The decoded string
"""
pass
def decode_batch(
self, sequences: List[List[int]], skip_special_tokens: Optional[bool] = True
    ) -> List[str]:
""" Decode the list of sequences to a list of string sequences
Args:
sequences: List[List[unsigned int]]:
A list of sequence of ids to be decoded
skip_special_tokens: (`optional`) boolean:
Whether to remove all the special tokens from the output strings
Returns:
A list of decoded strings
"""
pass
def token_to_id(self, token: str) -> Optional[int]:
""" Convert the given token to its corresponding id
Args:
token: str:
The token to convert
Returns:
The corresponding id if it exists, None otherwise
"""
pass
def id_to_token(self, id: int) -> Optional[str]:
""" Convert the given token id to its corresponding string
Args:
            id: int:
The token id to convert
Returns:
The corresponding string if it exists, None otherwise
"""
pass
def add_tokens(self, tokens: List[Union[str, AddedToken]]) -> int:
""" Add the given tokens to the vocabulary
Args:
tokens: List[Union[str, AddedToken]]:
A list of tokens to add to the vocabulary. Each token can either be
a string, or an instance of AddedToken
Returns:
The number of tokens that were added to the vocabulary
"""
pass
def add_special_tokens(self, tokens: List[Union[str, AddedToken]]) -> int:
""" Add the given special tokens to the vocabulary, and treat them as special tokens.
The special tokens will never be processed by the model, and will be
removed while decoding.
Args:
tokens: List[Union[str, AddedToken]]:
The list of special tokens to add. Each token can either be a string
or an instance of AddedToken
Returns:
The number of tokens that were added to the vocabulary
"""
pass
def post_process(
self, encoding: Encoding, pair: Optional[Encoding] = None, add_special_tokens: bool = True,
) -> Encoding:
""" Apply all the post-processing steps to the given encodings.
The various steps are:
1. Truncate according to global params (provided to `enable_truncation`)
2. Apply the PostProcessor
3. Pad according to global params. (provided to `enable_padding`)
Args:
encoding: Encoding:
The main Encoding to post process
pair: Optional[Encoding]:
An optional pair Encoding
add_special_tokens: bool:
Whether to add special tokens
Returns:
The resulting Encoding
"""
pass
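
To make the Tokenizer API above concrete, here is a minimal usage sketch based purely on the signatures in this stub; the vocab/merges paths are placeholders, not files from this commit.

from tokenizers import Tokenizer
from tokenizers.models import BPE

# Build a tokenizer around a BPE model loaded from placeholder files
tokenizer = Tokenizer(BPE("vocab.json", "merges.txt"))
tokenizer.enable_truncation(max_length=128, stride=0, strategy="longest_first")
tokenizer.enable_padding(length=128, pad_id=0, pad_token="[PAD]")

encoding = tokenizer.encode("Hello, y'all!", "How are you?")
print(encoding.tokens, encoding.ids, encoding.attention_mask)

# encode_batch accepts single sequences as well as (sequence, pair) tuples
batch = tokenizer.encode_batch(["Hello there", ("How are you?", "Fine, thanks")])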

View File

@@ -0,0 +1,7 @@
from .. import decoders
Decoder = decoders.Decoder
ByteLevel = decoders.ByteLevel
WordPiece = decoders.WordPiece
Metaspace = decoders.Metaspace
BPEDecoder = decoders.BPEDecoder

View File

@@ -0,0 +1,65 @@
from typing import List
class Decoder:
""" Base class for all decoders
This class is not supposed to be instantiated directly. Instead, any implementation of
a Decoder will return an instance of this class when instantiated.
"""
def decode(self, tokens: List[str]) -> str:
""" Decode the given list of string to a final string """
pass
class ByteLevel(Decoder):
""" ByteLevel Decoder """
def __init__(self) -> None:
""" Instantiate a new ByteLevel Decoder """
pass
class WordPiece(Decoder):
""" WordPiece Decoder """
    def __init__(self, prefix: str = "##", cleanup: bool = True) -> None:
""" Instantiate a new WordPiece Decoder
Args:
prefix: str:
The prefix to use for subwords that are not a beginning-of-word
cleanup: bool:
                Whether to clean up some tokenization artifacts, mainly spaces before punctuation
                and some abbreviated English forms.
"""
pass
class Metaspace(Decoder):
""" Metaspace decoder """
    def __init__(self, replacement: str = "▁", add_prefix_space: bool = True) -> None:
""" Instantiate a new Metaspace
Args:
replacement: str:
The replacement character. Must be exactly one character. By default we
use the `▁` (U+2581) meta symbol (Same as in SentencePiece).
add_prefix_space: boolean:
Whether to add a space to the first word if there isn't already one. This
lets us treat `hello` exactly like `say hello`.
"""
pass
class BPEDecoder(Decoder):
""" BPEDecoder """
def __init__(self, suffix: str = "</w>") -> None:
""" Instantiate a new BPEDecoder
Args:
suffix: str:
                The suffix that was used to characterize an end-of-word. This suffix will
                be replaced by whitespace during decoding
"""
pass
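
A quick sketch of how these decoders are meant to be used, assuming a list of WordPiece-style tokens; the exact output string is illustrative.

from tokenizers import decoders

decoder = decoders.WordPiece(prefix="##", cleanup=True)
# Joins the tokens and removes the continuing-subword prefix,
# e.g. ["un", "##believ", "##able"] should come back as "unbelievable".
print(decoder.decode(["un", "##believ", "##able"]))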

View File

@@ -0,0 +1,5 @@
from .base_tokenizer import BaseTokenizer
from .byte_level_bpe import ByteLevelBPETokenizer
from .char_level_bpe import CharBPETokenizer
from .sentencepiece_bpe import SentencePieceBPETokenizer
from .bert_wordpiece import BertWordPieceTokenizer

View File

@@ -0,0 +1,369 @@
from tokenizers import Tokenizer, Encoding, AddedToken, InputSequence, EncodeInput
from tokenizers.models import TokenizedSequence, TokenizedSequenceWithOffsets
from typing import List, Union, Tuple, Optional, Dict
Offsets = Tuple[int, int]
class BaseTokenizer:
def __init__(self, tokenizer: Tokenizer, parameters=None):
self._tokenizer = tokenizer
self._parameters = parameters if parameters is not None else {}
def __repr__(self):
return "Tokenizer(vocabulary_size={}, {})".format(
self._tokenizer.get_vocab_size(),
", ".join(k + "=" + str(v) for k, v in self._parameters.items()),
)
def num_special_tokens_to_add(self, is_pair: bool) -> int:
"""
Return the number of special tokens that would be added for single/pair sentences.
:param is_pair: Boolean indicating if the input would be a single sentence or a pair
:return:
"""
return self._tokenizer.num_special_tokens_to_add(is_pair)
def get_vocab(self, with_added_tokens: bool = True) -> Dict[str, int]:
""" Returns the vocabulary
Args:
with_added_tokens: boolean:
Whether to include the added tokens in the vocabulary
Returns:
The vocabulary
"""
return self._tokenizer.get_vocab(with_added_tokens=with_added_tokens)
def get_vocab_size(self, with_added_tokens: bool = True) -> int:
""" Return the size of vocabulary, with or without added tokens.
Args:
with_added_tokens: (`optional`) bool:
Whether to count in added special tokens or not
Returns:
Size of vocabulary
"""
return self._tokenizer.get_vocab_size(with_added_tokens=with_added_tokens)
def enable_padding(
self,
direction: Optional[str] = "right",
pad_to_multiple_of: Optional[int] = None,
pad_id: Optional[int] = 0,
pad_type_id: Optional[int] = 0,
pad_token: Optional[str] = "[PAD]",
length: Optional[int] = None,
):
""" Change the padding strategy
Args:
direction: (`optional`) str:
Can be one of: `right` or `left`
pad_to_multiple_of: (`optional`) unsigned int:
If specified, the padding length should always snap to the next multiple of
the given value. For example if we were going to pad with a length of 250 but
`pad_to_multiple_of=8` then we will pad to 256.
            pad_id: (`optional`) unsigned int:
                The id to be used when padding
            pad_type_id: (`optional`) unsigned int:
                The type id to be used when padding
pad_token: (`optional`) str:
The pad token to be used when padding
length: (`optional`) unsigned int:
If specified, the length at which to pad. If not specified
we pad using the size of the longest sequence in a batch
"""
return self._tokenizer.enable_padding(
direction=direction,
pad_to_multiple_of=pad_to_multiple_of,
pad_id=pad_id,
pad_type_id=pad_type_id,
pad_token=pad_token,
length=length,
)
def no_padding(self):
""" Disable padding """
return self._tokenizer.no_padding()
@property
def padding(self) -> Optional[dict]:
""" Get the current padding parameters
Returns:
None if padding is disabled, a dict with the currently set parameters
if the padding is enabled.
"""
return self._tokenizer.padding
def enable_truncation(
self, max_length: int, stride: Optional[int] = 0, strategy: Optional[str] = "longest_first"
):
""" Change the truncation options
Args:
max_length: unsigned int:
The maximum length at which to truncate
stride: (`optional`) unsigned int:
The length of the previous first sequence to be included
in the overflowing sequence
            strategy: (`optional`) str:
Can be one of `longest_first`, `only_first` or `only_second`
"""
return self._tokenizer.enable_truncation(max_length, stride=stride, strategy=strategy)
def no_truncation(self):
""" Disable truncation """
return self._tokenizer.no_truncation()
@property
def truncation(self) -> Optional[dict]:
""" Get the current truncation parameters
Returns:
None if truncation is disabled, a dict with the current truncation parameters if
truncation is enabled
"""
return self._tokenizer.truncation
def add_tokens(self, tokens: List[Union[str, AddedToken]]) -> int:
""" Add the given tokens to the vocabulary
Args:
tokens: List[Union[str, AddedToken]]:
A list of tokens to add to the vocabulary. Each token can either be
a string, or an instance of AddedToken
Returns:
The number of tokens that were added to the vocabulary
"""
return self._tokenizer.add_tokens(tokens)
def add_special_tokens(self, special_tokens: List[Union[str, AddedToken]]) -> int:
""" Add the given special tokens to the vocabulary, and treat them as special tokens.
The special tokens will never be processed by the model, and will be
removed while decoding.
Args:
            special_tokens: List[Union[str, AddedToken]]:
A list of special tokens to add to the vocabulary. Each token can either be
a string, or an instance of AddedToken
Returns:
The number of tokens that were added to the vocabulary
"""
return self._tokenizer.add_special_tokens(special_tokens)
def normalize(self, sequence: str) -> str:
""" Normalize the given sequence
Args:
sequence: str:
The sequence to normalize
Returns:
The normalized string
"""
return self._tokenizer.normalize(sequence)
def encode(
self,
sequence: InputSequence,
pair: Optional[InputSequence] = None,
is_pretokenized: bool = False,
add_special_tokens: bool = True,
) -> Encoding:
""" Encode the given sequence and pair. This method can process raw text sequences as well
as already pre-tokenized sequences.
Args:
sequence: InputSequence:
The sequence we want to encode. This sequence can be either raw text or
pre-tokenized, according to the `is_pretokenized` argument:
- If `is_pretokenized=False`: `InputSequence` is expected to be `str`
- If `is_pretokenized=True`: `InputSequence` is expected to be
`Union[List[str], Tuple[str]]`
is_pretokenized: bool:
Whether the input is already pre-tokenized.
add_special_tokens: bool:
Whether to add the special tokens while encoding.
Returns:
An Encoding
"""
if sequence is None:
raise ValueError("encode: `sequence` can't be `None`")
return self._tokenizer.encode(sequence, pair, is_pretokenized, add_special_tokens)
def encode_batch(
self,
inputs: List[EncodeInput],
is_pretokenized: bool = False,
add_special_tokens: bool = True,
) -> List[Encoding]:
""" Encode the given inputs. This method accept both raw text sequences as well as already
pre-tokenized sequences.
Args:
inputs: List[EncodeInput]:
A list of single sequences or pair sequences to encode. Each `EncodeInput` is
expected to be of the following form:
`Union[InputSequence, Tuple[InputSequence, InputSequence]]`
Each `InputSequence` can either be raw text or pre-tokenized,
according to the `is_pretokenized` argument:
- If `is_pretokenized=False`: `InputSequence` is expected to be `str`
- If `is_pretokenized=True`: `InputSequence` is expected to be
`Union[List[str], Tuple[str]]`
is_pretokenized: bool:
Whether the input is already pre-tokenized.
add_special_tokens: bool:
Whether to add the special tokens while encoding.
Returns:
A list of Encoding
"""
if inputs is None:
raise ValueError("encode_batch: `inputs` can't be `None`")
return self._tokenizer.encode_batch(inputs, is_pretokenized, add_special_tokens)
def decode(self, ids: List[int], skip_special_tokens: Optional[bool] = True) -> str:
""" Decode the given list of ids to a string sequence
Args:
ids: List[unsigned int]:
A list of ids to be decoded
skip_special_tokens: (`optional`) boolean:
Whether to remove all the special tokens from the output string
Returns:
The decoded string
"""
if ids is None:
raise ValueError("None input is not valid. Should be a list of integers.")
return self._tokenizer.decode(ids, skip_special_tokens=skip_special_tokens)
def decode_batch(
self, sequences: List[List[int]], skip_special_tokens: Optional[bool] = True
    ) -> List[str]:
""" Decode the list of sequences to a list of string sequences
Args:
sequences: List[List[unsigned int]]:
A list of sequence of ids to be decoded
skip_special_tokens: (`optional`) boolean:
Whether to remove all the special tokens from the output strings
Returns:
A list of decoded strings
"""
if sequences is None:
raise ValueError("None input is not valid. Should be list of list of integers.")
return self._tokenizer.decode_batch(sequences, skip_special_tokens=skip_special_tokens)
def token_to_id(self, token: str) -> Optional[int]:
""" Convert the given token to its corresponding id
Args:
token: str:
The token to convert
Returns:
The corresponding id if it exists, None otherwise
"""
return self._tokenizer.token_to_id(token)
def id_to_token(self, id: int) -> Optional[str]:
""" Convert the given token id to its corresponding string
Args:
            id: int:
The token id to convert
Returns:
The corresponding string if it exists, None otherwise
"""
return self._tokenizer.id_to_token(id)
def save_model(self, directory: str, name: Optional[str] = None):
""" Save the current model to the given directory
Args:
directory: str:
A path to the destination directory
name: (Optional) str:
The name of the tokenizer, to be used in the saved files
"""
return self._tokenizer.model.save(directory, name=name)
def save(self, path: str, pretty: bool = False):
""" Save the current Tokenizer at the given path
Args:
path: str:
A path to the destination Tokenizer file
"""
return self._tokenizer.save(path, pretty)
def to_str(self, pretty: bool = False):
""" Get a serialized JSON version of the Tokenizer as a str
Args:
pretty: bool:
Whether the JSON string should be prettified
Returns:
str
"""
return self._tokenizer.to_str(pretty)
def post_process(
self, encoding: Encoding, pair: Optional[Encoding] = None, add_special_tokens: bool = True
) -> Encoding:
""" Apply all the post-processing steps to the given encodings.
The various steps are:
1. Truncate according to global params (provided to `enable_truncation`)
2. Apply the PostProcessor
3. Pad according to global params. (provided to `enable_padding`)
Args:
encoding: Encoding:
The main Encoding to post process
pair: Optional[Encoding]:
An optional pair Encoding
add_special_tokens: bool:
Whether to add special tokens
Returns:
The resulting Encoding
"""
return self._tokenizer.post_process(encoding, pair, add_special_tokens)

View File

@@ -0,0 +1,113 @@
from tokenizers import Tokenizer, AddedToken, decoders, trainers
from tokenizers.models import WordPiece
from tokenizers.normalizers import BertNormalizer
from tokenizers.pre_tokenizers import BertPreTokenizer
from tokenizers.processors import BertProcessing
from .base_tokenizer import BaseTokenizer
from typing import Optional, List, Union
class BertWordPieceTokenizer(BaseTokenizer):
""" Bert WordPiece Tokenizer """
def __init__(
self,
vocab_file: Optional[str] = None,
unk_token: Union[str, AddedToken] = "[UNK]",
sep_token: Union[str, AddedToken] = "[SEP]",
cls_token: Union[str, AddedToken] = "[CLS]",
pad_token: Union[str, AddedToken] = "[PAD]",
mask_token: Union[str, AddedToken] = "[MASK]",
clean_text: bool = True,
handle_chinese_chars: bool = True,
strip_accents: Optional[bool] = None,
lowercase: bool = True,
wordpieces_prefix: str = "##",
):
if vocab_file is not None:
tokenizer = Tokenizer(WordPiece(vocab_file, unk_token=str(unk_token)))
else:
tokenizer = Tokenizer(WordPiece(unk_token=str(unk_token)))
# Let the tokenizer know about special tokens if they are part of the vocab
if tokenizer.token_to_id(str(unk_token)) is not None:
tokenizer.add_special_tokens([str(unk_token)])
if tokenizer.token_to_id(str(sep_token)) is not None:
tokenizer.add_special_tokens([str(sep_token)])
if tokenizer.token_to_id(str(cls_token)) is not None:
tokenizer.add_special_tokens([str(cls_token)])
if tokenizer.token_to_id(str(pad_token)) is not None:
tokenizer.add_special_tokens([str(pad_token)])
if tokenizer.token_to_id(str(mask_token)) is not None:
tokenizer.add_special_tokens([str(mask_token)])
tokenizer.normalizer = BertNormalizer(
clean_text=clean_text,
handle_chinese_chars=handle_chinese_chars,
strip_accents=strip_accents,
lowercase=lowercase,
)
tokenizer.pre_tokenizer = BertPreTokenizer()
if vocab_file is not None:
sep_token_id = tokenizer.token_to_id(str(sep_token))
if sep_token_id is None:
raise TypeError("sep_token not found in the vocabulary")
cls_token_id = tokenizer.token_to_id(str(cls_token))
if cls_token_id is None:
raise TypeError("cls_token not found in the vocabulary")
tokenizer.post_processor = BertProcessing(
(str(sep_token), sep_token_id), (str(cls_token), cls_token_id)
)
tokenizer.decoder = decoders.WordPiece(prefix=wordpieces_prefix)
parameters = {
"model": "BertWordPiece",
"unk_token": unk_token,
"sep_token": sep_token,
"cls_token": cls_token,
"pad_token": pad_token,
"mask_token": mask_token,
"clean_text": clean_text,
"handle_chinese_chars": handle_chinese_chars,
"strip_accents": strip_accents,
"lowercase": lowercase,
"wordpieces_prefix": wordpieces_prefix,
}
super().__init__(tokenizer, parameters)
def train(
self,
files: Union[str, List[str]],
vocab_size: int = 30000,
min_frequency: int = 2,
limit_alphabet: int = 1000,
initial_alphabet: List[str] = [],
special_tokens: List[Union[str, AddedToken]] = [
"[PAD]",
"[UNK]",
"[CLS]",
"[SEP]",
"[MASK]",
],
show_progress: bool = True,
wordpieces_prefix: str = "##",
):
""" Train the model using the given files """
trainer = trainers.WordPieceTrainer(
vocab_size=vocab_size,
min_frequency=min_frequency,
limit_alphabet=limit_alphabet,
initial_alphabet=initial_alphabet,
special_tokens=special_tokens,
show_progress=show_progress,
continuing_subword_prefix=wordpieces_prefix,
)
if isinstance(files, str):
files = [files]
self._tokenizer.train(trainer, files)
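
A hedged usage sketch for the class above; "vocab.txt", "corpus.txt" and "out_dir" are placeholder paths, not files from this repository.

from tokenizers import BertWordPieceTokenizer

# Either load an existing WordPiece vocabulary...
tokenizer = BertWordPieceTokenizer("vocab.txt", lowercase=True)
print(tokenizer.encode("Tokenize me, please.").tokens)

# ...or train one from raw text files with the train() method above.
trained = BertWordPieceTokenizer()
trained.train(files=["corpus.txt"], vocab_size=30000)
trained.save_model("out_dir")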

View File

@@ -0,0 +1,92 @@
from tokenizers import Tokenizer, AddedToken, pre_tokenizers, decoders, trainers, processors
from tokenizers.models import BPE
from tokenizers.normalizers import unicode_normalizer_from_str, Lowercase, Sequence
from .base_tokenizer import BaseTokenizer
from typing import Optional, List, Union
class ByteLevelBPETokenizer(BaseTokenizer):
""" ByteLevelBPETokenizer
Represents a Byte-level BPE as introduced by OpenAI with their GPT-2 model
"""
def __init__(
self,
vocab_file: Optional[str] = None,
merges_file: Optional[str] = None,
add_prefix_space: bool = False,
lowercase: bool = False,
dropout: Optional[float] = None,
unicode_normalizer: Optional[str] = None,
continuing_subword_prefix: Optional[str] = None,
end_of_word_suffix: Optional[str] = None,
trim_offsets: bool = False,
):
if vocab_file is not None and merges_file is not None:
tokenizer = Tokenizer(
BPE(
vocab_file,
merges_file,
dropout=dropout,
continuing_subword_prefix=continuing_subword_prefix or "",
end_of_word_suffix=end_of_word_suffix or "",
)
)
else:
tokenizer = Tokenizer(BPE())
# Check for Unicode normalization first (before everything else)
normalizers = []
if unicode_normalizer:
normalizers += [unicode_normalizer_from_str(unicode_normalizer)]
if lowercase:
normalizers += [Lowercase()]
# Create the normalizer structure
if len(normalizers) > 0:
if len(normalizers) > 1:
tokenizer.normalizer = Sequence(normalizers)
else:
tokenizer.normalizer = normalizers[0]
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=add_prefix_space)
tokenizer.decoder = decoders.ByteLevel()
tokenizer.post_processor = processors.ByteLevel(trim_offsets=trim_offsets)
parameters = {
"model": "ByteLevelBPE",
"add_prefix_space": add_prefix_space,
"lowercase": lowercase,
"dropout": dropout,
"unicode_normalizer": unicode_normalizer,
"continuing_subword_prefix": continuing_subword_prefix,
"end_of_word_suffix": end_of_word_suffix,
"trim_offsets": trim_offsets,
}
super().__init__(tokenizer, parameters)
def train(
self,
files: Union[str, List[str]],
vocab_size: int = 30000,
min_frequency: int = 2,
show_progress: bool = True,
special_tokens: List[Union[str, AddedToken]] = [],
):
""" Train the model using the given files """
trainer = trainers.BpeTrainer(
vocab_size=vocab_size,
min_frequency=min_frequency,
show_progress=show_progress,
special_tokens=special_tokens,
initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
)
if isinstance(files, str):
files = [files]
self._tokenizer.train(trainer, files)
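
Likewise for the byte-level tokenizer, a short hedged sketch with placeholder file names.

from tokenizers import ByteLevelBPETokenizer

tokenizer = ByteLevelBPETokenizer()
tokenizer.train(files=["corpus.txt"], vocab_size=30000, special_tokens=["<s>", "</s>"])
encoding = tokenizer.encode("Byte-level BPE handles any input string.")
print(encoding.tokens)
print(tokenizer.decode(encoding.ids))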

View File

@@ -0,0 +1,116 @@
from .. import Tokenizer, AddedToken, pre_tokenizers, decoders, trainers
from ..models import BPE
from ..normalizers import Sequence, Lowercase, unicode_normalizer_from_str, BertNormalizer
from .base_tokenizer import BaseTokenizer
from typing import Optional, List, Union
class CharBPETokenizer(BaseTokenizer):
""" Original BPE Tokenizer
Represents the BPE algorithm, as introduced by Rico Sennrich
(https://arxiv.org/abs/1508.07909)
    The default settings correspond to the OpenAI GPT BPE tokenizer and differ from the original
    Sennrich subword-nmt implementation by the following options, which you can deactivate:
      - adding a normalizer to clean up the text (deactivate with `bert_normalizer=False`) by:
        * removing any control characters and replacing all whitespaces by the classic one.
        * handling Chinese chars by putting spaces around them.
        * stripping all accents.
      - splitting on punctuation in addition to whitespaces (deactivate it with
`split_on_whitespace_only=True`)
"""
def __init__(
self,
vocab_file: Optional[str] = None,
merges_file: Optional[str] = None,
unk_token: Union[str, AddedToken] = "<unk>",
suffix: str = "</w>",
dropout: Optional[float] = None,
lowercase: bool = False,
unicode_normalizer: Optional[str] = None,
bert_normalizer: bool = True,
split_on_whitespace_only: bool = False,
):
if vocab_file is not None and merges_file is not None:
tokenizer = Tokenizer(
BPE(
vocab_file,
merges_file,
dropout=dropout,
unk_token=str(unk_token),
end_of_word_suffix=suffix,
)
)
else:
tokenizer = Tokenizer(BPE())
if tokenizer.token_to_id(str(unk_token)) is not None:
tokenizer.add_special_tokens([str(unk_token)])
# Check for Unicode normalization first (before everything else)
normalizers = []
if unicode_normalizer:
normalizers += [unicode_normalizer_from_str(unicode_normalizer)]
if bert_normalizer:
normalizers += [BertNormalizer(lowercase=False)]
if lowercase:
normalizers += [Lowercase()]
# Create the normalizer structure
if len(normalizers) > 0:
if len(normalizers) > 1:
tokenizer.normalizer = Sequence(normalizers)
else:
tokenizer.normalizer = normalizers[0]
if split_on_whitespace_only:
tokenizer.pre_tokenizer = pre_tokenizers.WhitespaceSplit()
else:
tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()
tokenizer.decoder = decoders.BPEDecoder(suffix=suffix)
parameters = {
"model": "BPE",
"unk_token": unk_token,
"suffix": suffix,
"dropout": dropout,
"lowercase": lowercase,
"unicode_normalizer": unicode_normalizer,
"bert_normalizer": bert_normalizer,
"split_on_whitespace_only": split_on_whitespace_only,
}
super().__init__(tokenizer, parameters)
def train(
self,
files: Union[str, List[str]],
vocab_size: int = 30000,
min_frequency: int = 2,
special_tokens: List[Union[str, AddedToken]] = ["<unk>"],
limit_alphabet: int = 1000,
initial_alphabet: List[str] = [],
suffix: Optional[str] = "</w>",
show_progress: bool = True,
):
""" Train the model using the given files """
trainer = trainers.BpeTrainer(
vocab_size=vocab_size,
min_frequency=min_frequency,
special_tokens=special_tokens,
limit_alphabet=limit_alphabet,
initial_alphabet=initial_alphabet,
end_of_word_suffix=suffix,
show_progress=show_progress,
)
if isinstance(files, str):
files = [files]
self._tokenizer.train(trainer, files)
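
And the same pattern for the character-level BPE tokenizer; again the paths are placeholders.

from tokenizers import CharBPETokenizer

tokenizer = CharBPETokenizer(lowercase=True)
tokenizer.train(files=["corpus.txt"], vocab_size=30000, special_tokens=["<unk>"])
print(tokenizer.encode("A classic BPE with an end-of-word suffix.").tokens)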

View File

@@ -0,0 +1,74 @@
from tokenizers import Tokenizer, AddedToken, pre_tokenizers, decoders, trainers
from tokenizers.models import BPE
from tokenizers.normalizers import NFKC
from .base_tokenizer import BaseTokenizer
from typing import Optional, List, Union
class SentencePieceBPETokenizer(BaseTokenizer):
""" SentencePiece BPE Tokenizer
Represents the BPE algorithm, with the pretokenization used by SentencePiece
"""
def __init__(
self,
vocab_file: Optional[str] = None,
merges_file: Optional[str] = None,
unk_token: Union[str, AddedToken] = "<unk>",
        replacement: str = "▁",
add_prefix_space: bool = True,
dropout: Optional[float] = None,
):
if vocab_file is not None and merges_file is not None:
tokenizer = Tokenizer(
BPE(vocab_file, merges_file, dropout=dropout, unk_token=unk_token)
)
else:
tokenizer = Tokenizer(BPE())
if tokenizer.token_to_id(str(unk_token)) is not None:
tokenizer.add_special_tokens([str(unk_token)])
tokenizer.normalizer = NFKC()
tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(
replacement=replacement, add_prefix_space=add_prefix_space
)
tokenizer.decoder = decoders.Metaspace(
replacement=replacement, add_prefix_space=add_prefix_space
)
parameters = {
"model": "SentencePieceBPE",
"unk_token": unk_token,
"replacement": replacement,
"add_prefix_space": add_prefix_space,
"dropout": dropout,
}
super().__init__(tokenizer, parameters)
def train(
self,
files: Union[str, List[str]],
vocab_size: int = 30000,
min_frequency: int = 2,
special_tokens: List[Union[str, AddedToken]] = ["<unk>"],
limit_alphabet: int = 1000,
initial_alphabet: List[str] = [],
show_progress: bool = True,
):
""" Train the model using the given files """
trainer = trainers.BpeTrainer(
vocab_size=vocab_size,
min_frequency=min_frequency,
special_tokens=special_tokens,
limit_alphabet=limit_alphabet,
initial_alphabet=initial_alphabet,
show_progress=show_progress,
)
if isinstance(files, str):
files = [files]
self._tokenizer.train(trainer, files)

View File

@@ -0,0 +1,11 @@
from typing import List, Tuple
from .. import models, Offsets
TokenizedSequence = List[str]
TokenizedSequenceWithOffsets = List[Tuple[str, Offsets]]
Model = models.Model
BPE = models.BPE
WordPiece = models.WordPiece
WordLevel = models.WordLevel

View File

@@ -0,0 +1,156 @@
from .. import Encoding, Offsets
from typing import List, Optional, Union, Tuple
TokenizedSequence = List[str]
TokenizedSequenceWithOffsets = List[Tuple[str, Offsets]]
class Model:
""" Base class for all models
This class is not supposed to be instantiated directly. Instead, any implementation of
    a Model will return an instance of this class when instantiated.
"""
def save(self, folder: str, name: Optional[str] = None) -> List[str]:
""" Save the current model
Save the current model in the given folder, using the given name for the various
files that will get created.
Any file with the same name that already exist in this folder will be overwritten.
"""
pass
def encode(
self, sequence: Union[TokenizedSequence, TokenizedSequenceWithOffsets], type_id: int = 0
) -> Encoding:
""" Encode the given sequence.
A sequence can either be:
- `TokenizedSequence`: (`List[str]`)
        - `TokenizedSequenceWithOffsets`: (`List[Tuple[str, Offsets]]`) where Offsets is
          a Tuple[int, int].
        If the Offsets are not provided, they will be automatically generated, assuming that
        all the tokens in the `TokenizedSequence` are contiguous in the original string.
Args:
sequence: Union[TokenizedSequence, TokenizedSequenceWithOffsets]
Either a TokenizedSequence or a TokenizedSequenceWithOffsets
type_id: int:
The type id of the given sequence
Returns:
An Encoding
"""
pass
def encode_batch(
self,
sequences: Union[List[TokenizedSequence], List[TokenizedSequenceWithOffsets]],
type_id: int = 0,
) -> List[Encoding]:
""" Encode the given batch of sequence.
A sequence can either be:
- `TokenizedSequence`: (`List[str]`)
        - `TokenizedSequenceWithOffsets`: (`List[Tuple[str, Offsets]]`) where Offsets is
          a Tuple[int, int].
        If the Offsets are not provided, they will be automatically generated, assuming that
        all the tokens in the `TokenizedSequence` are contiguous in the original string.
Args:
sequences: Union[List[TokenizedSequence], List[TokenizedSequenceWithOffsets]]
                A list of sequences. Each sequence is either a TokenizedSequence or a
TokenizedSequenceWithOffsets
type_id: int:
                The type id of the given sequence
Returns:
A list of Encoding
"""
pass
class BPE(Model):
"""BytePairEncoding model class
Instantiate a BPE Model from the given vocab and merges files.
Args:
        vocab: (`optional`) string:
Path to a vocabulary JSON file.
merges: (`optional`) string:
Path to a merge file.
cache_capacity: (`optional`) int:
            The number of words that the BPE cache can contain. The cache speeds up
            the process by keeping the results of the merge operations for a number
            of words.
        dropout: (`optional`) float [0, 1]:
            The BPE dropout to use. Must be a float between 0 and 1.
unk_token: (`optional`) str:
The unknown token to be used by the model.
continuing_subword_prefix: (`optional`) str:
The prefix to attach to subword units that don't represent a beginning of word.
end_of_word_suffix: (`optional`) str:
The suffix to attach to subword units that represent an end of word.
"""
    def __init__(
self,
vocab: Optional[str],
merges: Optional[str],
cache_capacity: Optional[int],
dropout: Optional[float],
unk_token: Optional[str],
continuing_subword_prefix: Optional[str],
end_of_word_suffix: Optional[str],
):
pass
class WordPiece(Model):
""" WordPiece model class
Instantiate a WordPiece Model from the given vocab file.
Args:
vocab: (`optional`) string:
Path to a vocabulary file.
unk_token: (`optional`) str:
The unknown token to be used by the model.
max_input_chars_per_word: (`optional`) int:
            The maximum number of characters to allow in a single word.
"""
def __init__(
self,
vocab: Optional[str],
unk_token: Optional[str],
max_input_chars_per_word: Optional[int],
):
pass
class WordLevel(Model):
"""
    The simplest tokenizer model, mapping tokens from a vocab file to their corresponding ids.
Instantiate a WordLevel Model from the given vocab file.
Args:
vocab: (`optional`) string:
Path to a vocabulary file.
unk_token: str:
The unknown token to be used by the model.
"""
def __init__(self, vocab: Optional[str], unk_token: Optional[str]):
pass

View File

@@ -0,0 +1,25 @@
from .. import normalizers
Normalizer = normalizers.Normalizer
BertNormalizer = normalizers.BertNormalizer
NFD = normalizers.NFD
NFKD = normalizers.NFKD
NFC = normalizers.NFC
NFKC = normalizers.NFKC
Sequence = normalizers.Sequence
Lowercase = normalizers.Lowercase
Strip = normalizers.Strip
NORMALIZERS = {"nfc": NFC, "nfd": NFD, "nfkc": NFKC, "nfkd": NFKD}
def unicode_normalizer_from_str(normalizer: str) -> Normalizer:
if normalizer not in NORMALIZERS:
raise ValueError(
"{} is not a known unicode normalizer. Available are {}".format(
normalizer, NORMALIZERS.keys()
)
)
return NORMALIZERS[normalizer]()
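
For example, based on the helper above, `unicode_normalizer_from_str("nfkc")` returns an `NFKC()` instance, while an unknown name raises a ValueError:

from tokenizers.normalizers import unicode_normalizer_from_str

normalizer = unicode_normalizer_from_str("nfkc")  # returns an NFKC() instance
# unicode_normalizer_from_str("latin-1") would raise ValueError (unknown normalizer)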

View File

@@ -0,0 +1,108 @@
from typing import Optional, List
class Normalizer:
""" Base class for all normalizers
This class is not supposed to be instantiated directly. Instead, any implementation of a
Normalizer will return an instance of this class when instantiated.
"""
class BertNormalizer(Normalizer):
""" BertNormalizer
Takes care of normalizing raw text before giving it to a Bert model.
    This includes cleaning the text, handling accents and Chinese chars, and lowercasing
"""
def __init__(
self,
clean_text: Optional[bool] = True,
handle_chinese_chars: Optional[bool] = True,
strip_accents: Optional[bool] = None,
lowercase: Optional[bool] = True,
) -> None:
""" Instantiate a BertNormalizer with the given options.
Args:
clean_text: (`optional`) boolean:
Whether to clean the text, by removing any control characters
and replacing all whitespaces by the classic one.
handle_chinese_chars: (`optional`) boolean:
Whether to handle chinese chars by putting spaces around them.
strip_accents: (`optional`) boolean:
                Whether to strip all accents. If this option is not specified (i.e. is None),
then it will be determined by the value for `lowercase` (as in the original Bert).
lowercase: (`optional`) boolean:
Whether to lowercase.
Returns:
Normalizer
"""
pass
class NFD(Normalizer):
""" NFD Unicode Normalizer """
def __init__(self) -> None:
""" Instantiate a new NFD Normalizer """
pass
class NFKD(Normalizer):
""" NFKD Unicode Normalizer """
def __init__(self) -> None:
""" Instantiate a new NFKD Normalizer """
pass
class NFC(Normalizer):
""" NFC Unicode Normalizer """
def __init__(self) -> None:
""" Instantiate a new NFC Normalizer """
pass
class NFKC(Normalizer):
""" NFKC Unicode Normalizer """
def __init__(self) -> None:
""" Instantiate a new NFKC Normalizer """
pass
class Sequence(Normalizer):
""" Allows concatenating multiple other Normalizer as a Sequence.
All the normalizers run in sequence in the given order
"""
def __init__(self, normalizers: List[Normalizer]) -> None:
""" Instantiate a new normalization Sequence using the given normalizers
Args:
normalizers: List[Normalizer]:
A list of Normalizer to be run as a sequence
"""
pass
class Lowercase(Normalizer):
""" Lowercase Normalizer """
def __init__(self) -> None:
""" Instantiate a new Lowercase Normalizer """
pass
class Strip(Normalizer):
""" Strip normalizer """
    def __init__(self, left: bool = True, right: bool = True) -> None:
pass
def unicode_normalizer_from_str(normalizer: str) -> Normalizer:
"""
    Instantiate a unicode normalizer from the normalizer name
:param normalizer: Name of the normalizer
:return:
"""
pass
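
A sketch of composing normalizers with Sequence, per the classes above; the pipeline decomposes, lowercases, and strips surrounding whitespace.

from tokenizers.normalizers import NFD, Lowercase, Strip, Sequence

normalizer = Sequence([NFD(), Lowercase(), Strip()])
# Attach it to a tokenizer with: tokenizer.normalizer = normalizer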

View File

@@ -0,0 +1,9 @@
from .. import pre_tokenizers
PreTokenizer = pre_tokenizers.PreTokenizer
ByteLevel = pre_tokenizers.ByteLevel
Whitespace = pre_tokenizers.Whitespace
WhitespaceSplit = pre_tokenizers.WhitespaceSplit
BertPreTokenizer = pre_tokenizers.BertPreTokenizer
Metaspace = pre_tokenizers.Metaspace
CharDelimiterSplit = pre_tokenizers.CharDelimiterSplit

View File

@@ -0,0 +1,109 @@
from typing import Optional, List, Tuple
Offsets = Tuple[int, int]
class PreTokenizer:
""" Base class for all pre-tokenizers
This class is not supposed to be instantiated directly. Instead, any implementation of a
PreTokenizer will return an instance of this class when instantiated.
"""
def pre_tokenize(self, sequence: str) -> List[Tuple[str, Offsets]]:
""" Pre tokenize the given sequence """
pass
class ByteLevel(PreTokenizer):
""" ByteLevel PreTokenizer
This pre-tokenizer takes care of replacing all bytes of the given string
with a corresponding representation, as well as splitting into words.
"""
def __init__(self, add_prefix_space: bool = True) -> None:
""" Instantiate a new ByteLevel PreTokenizer
Args:
add_prefix_space: (`optional`) boolean:
Whether to add a space to the first word if there isn't already one. This
lets us treat `hello` exactly like `say hello`.
Returns:
PreTokenizer
"""
pass
@staticmethod
def alphabet() -> List[str]:
""" Returns the alphabet used by this PreTokenizer.
Since the ByteLevel works as its name suggests, at the byte level, it
encodes any byte to one visible character. This means that there is a
total of 256 different characters composing this alphabet.
"""
pass
class Whitespace(PreTokenizer):
""" Whitespace PreTokenizer
This pre-tokenizer simply splits using the following regex: `\w+|[^\w\s]+`
"""
def __init__(self) -> None:
""" Instantiate a new Whitespace PreTokenizer """
pass
class WhitespaceSplit(PreTokenizer):
""" Whitespace PreTokenizer
    This pre-tokenizer simply splits on whitespace. Works like `.split()`
"""
def __init__(self) -> None:
""" Instantiate a new WhitespaceSplit PreTokenizer """
pass
class BertPreTokenizer(PreTokenizer):
""" BertPreTokenizer
This pre-tokenizer splits tokens on spaces, and also on punctuation.
    Each occurrence of a punctuation character will be treated separately.
"""
def __init__(self) -> None:
""" Instantiate a new BertPreTokenizer """
pass
class Metaspace(PreTokenizer):
""" Metaspace pre-tokenizer
This pre-tokenizer replaces any whitespace by the provided replacement character.
It then tries to split on these spaces.
"""
    def __init__(self, replacement: str = "▁", add_prefix_space: bool = True) -> None:
""" Instantiate a new Metaspace
Args:
replacement: str:
The replacement character. Must be exactly one character. By default we
use the `▁` (U+2581) meta symbol (Same as in SentencePiece).
add_prefix_space: boolean:
Whether to add a space to the first word if there isn't already one. This
lets us treat `hello` exactly like `say hello`.
"""
pass
class CharDelimiterSplit(PreTokenizer):
""" CharDelimiterSplit PreTokenizer
This pre-tokenizer simply splits on the provided char. Works like `.split(delimiter)`
"""
    def __init__(self, delimiter: str) -> None:
""" Instantiate a new CharDelimiterSplit PreTokenizer
Args:
delimiter: str:
The delimiter char that will be used to split input
"""
pass
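
Per the `pre_tokenize` signature above, pre-tokenizers return (token, offsets) pairs; a small sketch, with the expected output described rather than guaranteed:

from tokenizers.pre_tokenizers import Whitespace

pre_tokenizer = Whitespace()
# Expected to yield pairs such as ("Hello", (0, 5)) and ("world", (6, 11))
print(pre_tokenizer.pre_tokenize("Hello world"))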

View File

@@ -0,0 +1,6 @@
from .. import processors
PostProcessor = processors.PostProcessor
BertProcessing = processors.BertProcessing
RobertaProcessing = processors.RobertaProcessing
ByteLevel = processors.ByteLevel

View File

@@ -0,0 +1,99 @@
from typing import Tuple
class PostProcessor:
""" Base class for all post-processors
This class is not supposed to be instantiated directly. Instead, any implementation of
a PostProcessor will return an instance of this class when instantiated.
"""
def num_special_tokens_to_add(self, is_pair: bool) -> int:
"""
Return the number of special tokens that would be added for single/pair sentences.
:param is_pair: Boolean indicating if the input would be a single sentence or a pair
:return:
"""
pass
class BertProcessing(PostProcessor):
""" BertProcessing
This post-processor takes care of adding the special tokens needed by
a Bert model:
- a SEP token
- a CLS token
"""
def __init__(self, sep: Tuple[str, int], cls: Tuple[str, int]) -> None:
""" Instantiate a new BertProcessing with the given tokens
Args:
sep: Tuple[str, int]:
A tuple with the string representation of the SEP token, and its id
cls: Tuple[str, int]:
A tuple with the string representation of the CLS token, and its id
Returns:
PostProcessor
"""
pass
class RobertaProcessing(PostProcessor):
""" RobertaProcessing
This post-processor takes care of adding the special tokens needed by
a Roberta model:
- a SEP token
- a CLS token
It also takes care of trimming the offsets.
By default, the ByteLevel BPE might include whitespaces in the produced tokens. If you don't
want the offsets to include these whitespaces, then this PostProcessor should be initialized
with `trim_offsets=True`
"""
def __init__(
self,
sep: Tuple[str, int],
cls: Tuple[str, int],
trim_offsets: bool = True,
add_prefix_space: bool = True,
) -> None:
""" Instantiate a new RobertaProcessing with the given tokens
Args:
sep: Tuple[str, int]:
A tuple with the string representation of the SEP token, and its id
cls: Tuple[str, int]:
A tuple with the string representation of the CLS token, and its id
trim_offsets: bool:
Whether to trim the whitespaces from the produced offsets.
add_prefix_space: bool:
Whether the add_prefix_space option was enabled during pre-tokenization. This
is relevant because it defines the way the offsets are trimmed out.
Returns:
PostProcessor
"""
pass
class ByteLevel(PostProcessor):
""" ByteLevel Post processing
This post-processor takes care of trimming the offsets.
By default, the ByteLevel BPE might include whitespaces in the produced tokens. If you don't
want the offsets to include these whitespaces, then this PostProcessor must be used.
"""
    def __init__(self, trim_offsets: bool = True) -> None:
""" Instantiate a new ByteLevel
Args:
trim_offsets: bool:
Whether to trim the whitespaces from the produced offsets.
"""
pass

View File

@@ -0,0 +1,5 @@
from .. import trainers
Trainer = trainers.Trainer
BpeTrainer = trainers.BpeTrainer
WordPieceTrainer = trainers.WordPieceTrainer

View File

@@ -0,0 +1,113 @@
from .. import AddedToken
from typing import Optional, List, Union
class Trainer:
""" Base class for all trainers
This class is not supposed to be instantiated directly. Instead, any implementation of a
Trainer will return an instance of this class when instantiated.
"""
class BpeTrainer(Trainer):
""" BpeTrainer
Capable of training a BPE model
"""
def __init__(
self,
vocab_size: int = 30000,
min_frequency: int = 0,
show_progress: bool = True,
special_tokens: List[Union[str, AddedToken]] = [],
limit_alphabet: Optional[int] = None,
initial_alphabet: List[str] = [],
continuing_subword_prefix: Optional[str] = None,
end_of_word_suffix: Optional[str] = None,
) -> None:
""" Instantiate a new BpeTrainer with the given options:
Args:
vocab_size: unsigned int:
The size of the final vocabulary, including all tokens and alphabet.
min_frequency: unsigned int:
The minimum frequency a pair should have in order to be merged.
show_progress: boolean:
Whether to show progress bars while training.
special_tokens: List[Union[str, AddedToken]]:
A list of special tokens the model should know of.
limit_alphabet: unsigned int:
The maximum different characters to keep in the alphabet.
initial_alphabet: List[str]:
A list of characters to include in the initial alphabet, even
if not seen in the training dataset.
                If a string contains more than one character, only the first one
is kept.
continuing_subword_prefix: Optional[str]:
A prefix to be used for every subword that is not a beginning-of-word.
end_of_word_suffix: Optional[str]:
                A suffix to be used for every subword that is an end-of-word.
Returns:
Trainer
"""
pass
class WordPieceTrainer(Trainer):
""" WordPieceTrainer
Capable of training a WordPiece model
"""
def __init__(
self,
vocab_size: int = 30000,
min_frequency: int = 0,
show_progress: bool = True,
special_tokens: List[Union[str, AddedToken]] = [],
limit_alphabet: Optional[int] = None,
initial_alphabet: List[str] = [],
continuing_subword_prefix: Optional[str] = "##",
end_of_word_suffix: Optional[str] = None,
    ) -> None:
""" Instantiate a new WordPieceTrainer with the given options:
Args:
vocab_size: unsigned int:
The size of the final vocabulary, including all tokens and alphabet.
min_frequency: unsigned int:
The minimum frequency a pair should have in order to be merged.
show_progress: boolean:
Whether to show progress bars while training.
special_tokens: List[Union[str, AddedToken]]:
A list of special tokens the model should know of.
limit_alphabet: unsigned int:
The maximum different characters to keep in the alphabet.
initial_alphabet: List[str]:
A list of characters to include in the initial alphabet, even
if not seen in the training dataset.
                If a string contains more than one character, only the first one
is kept.
continuing_subword_prefix: Optional[str]:
A prefix to be used for every subword that is not a beginning-of-word.
end_of_word_suffix: Optional[str]:
                A suffix to be used for every subword that is an end-of-word.
Returns:
Trainer
"""
pass
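
Tying it together, training a bare Tokenizer with one of these trainers follows the `tokenizer.train(trainer, files)` pattern used by the implementation classes above; the file path is a placeholder.

from tokenizers import Tokenizer, trainers
from tokenizers.models import BPE

tokenizer = Tokenizer(BPE())
trainer = trainers.BpeTrainer(vocab_size=30000, min_frequency=2, special_tokens=["<unk>"])
tokenizer.train(trainer, ["corpus.txt"])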