[remove black] And use ruff (#1436)

* nits

* Fixing deps.

* Ruff update.

* Import order matters.

* Fix.

* Revert ruff fix.

* Visualizer.

* Putting back the imports.

---------

Co-authored-by: Nicolas Patry <patry.nicolas@protonmail.com>
Author: Arthur
Date: 2024-03-12 21:24:21 +11:00
Committed by: GitHub
Parent: 72a1973cd1
Commit: 29fef1e7aa
29 changed files with 258 additions and 169 deletions

View File

@ -8,12 +8,14 @@ check_dirs := examples py_src/tokenizers tests

# Format source code automatically
style:
	python stub.py
-	black --line-length 119 --target-version py35 $(check_dirs)
+	ruff check $(check_dirs) --fix
+	ruff format $(check_dirs)

# Check the source code is formatted correctly
check-style:
	python stub.py --check
-	black --check --line-length 119 --target-version py35 examples py_src/tokenizers tests
+	ruff check examples py_src/tokenizers tests
+	ruff format --check examples py_src/tokenizers tests

TESTS_RESOURCES = $(DATA_DIR)/small.txt $(DATA_DIR)/roberta.json
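For anyone reproducing these targets without make, here is a minimal Python sketch of what the new recipes run (the `python stub.py` step is elided, and ruff is assumed to be installed in the active environment):

import subprocess

check_dirs = ["examples", "py_src/tokenizers", "tests"]

def style():
    # Apply auto-fixable lint rules, then reformat (this replaces the old
    # `black --line-length 119 --target-version py35` invocation).
    subprocess.run(["ruff", "check", *check_dirs, "--fix"], check=True)
    subprocess.run(["ruff", "format", *check_dirs], check=True)

def check_style():
    # CI mode: fail if any lint error remains or any file would be reformatted.
    subprocess.run(["ruff", "check", *check_dirs], check=True)
    subprocess.run(["ruff", "format", "--check", *check_dirs], check=True)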

View File

@ -4,16 +4,15 @@ import time

from tqdm import tqdm

+logging.getLogger("transformers").disabled = True
+logging.getLogger("transformers.tokenization_utils").disabled = True

from tokenizers import Tokenizer, decoders, pre_tokenizers
from tokenizers.models import BPE, WordPiece
from tokenizers.normalizers import BertNormalizer
from tokenizers.processors import BertProcessing
from transformers import BertTokenizer, GPT2Tokenizer

-logging.getLogger("transformers").disabled = True
-logging.getLogger("transformers.tokenization_utils").disabled = True

parser = argparse.ArgumentParser()
parser.add_argument("--type", default="gpt2", type=str, help="The type of tokenizer (bert|gpt2)")

@ -51,9 +50,7 @@ Although never is often better than *right* now.
If the implementation is hard to explain, it's a bad idea.
If the implementation is easy to explain, it may be a good idea.
Namespaces are one honking great idea -- let's do more of those!
-""".split(
-    "\n"
-)
+""".split("\n")

if args.type == "gpt2":
    print("Running GPT-2 tokenizer")
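For context, the tokenizers side of this benchmark boils down to something like the sketch below (the vocab.json/merges.txt paths are placeholders, not files from this repository):

import time

from tokenizers import Tokenizer, decoders, pre_tokenizers
from tokenizers.models import BPE

# Placeholder paths for a GPT-2 style vocabulary and merges file.
tok = Tokenizer(BPE.from_file("vocab.json", "merges.txt"))
tok.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
tok.decoder = decoders.ByteLevel()

lines = ["Beautiful is better than ugly.", "Explicit is better than implicit."]

start = time.time()
encodings = tok.encode_batch(lines)
print(f"Encoded {len(encodings)} lines in {time.time() - start:.4f}s")
print(encodings[0].tokens)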

View File

@ -1,6 +1,6 @@
import datasets
-from tokenizers import Tokenizer, models, normalizers, pre_tokenizers, trainers
+from tokenizers import Tokenizer, models, normalizers, pre_tokenizers

# Build a tokenizer

View File

@ -34,39 +34,44 @@ class AddedToken:
Defines whether this token should be skipped when decoding. Defines whether this token should be skipped when decoding.
""" """
def __init__(self, content, single_word=False, lstrip=False, rstrip=False, normalized=True, special=False): def __init__(self, content, single_word=False, lstrip=False, rstrip=False, normalized=True, special=False):
pass pass
@property @property
def content(self): def content(self):
""" """
Get the content of this :obj:`AddedToken` Get the content of this :obj:`AddedToken`
""" """
pass pass
@property @property
def lstrip(self): def lstrip(self):
""" """
Get the value of the :obj:`lstrip` option Get the value of the :obj:`lstrip` option
""" """
pass pass
@property @property
def normalized(self): def normalized(self):
""" """
Get the value of the :obj:`normalized` option Get the value of the :obj:`normalized` option
""" """
pass pass
@property @property
def rstrip(self): def rstrip(self):
""" """
Get the value of the :obj:`rstrip` option Get the value of the :obj:`rstrip` option
""" """
pass pass
@property @property
def single_word(self): def single_word(self):
""" """
Get the value of the :obj:`single_word` option Get the value of the :obj:`single_word` option
""" """
pass pass
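A short sketch of how the options documented above are typically used when registering tokens (the token strings are arbitrary examples):

from tokenizers import AddedToken, Tokenizer
from tokenizers.models import BPE

tok = Tokenizer(BPE())

# `special=True` lets the token be skipped when decoding with
# skip_special_tokens=True; lstrip/rstrip control whitespace matching.
mask = AddedToken("[MASK]", lstrip=True, rstrip=False, special=True)
tok.add_special_tokens([mask])
tok.add_tokens([AddedToken("<new_word>", single_word=True)])

print(mask.content, mask.special, mask.single_word)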
@property @property
def special(self): def special(self):
""" """
@ -78,7 +83,6 @@ class Encoding:
""" """
The :class:`~tokenizers.Encoding` represents the output of a :class:`~tokenizers.Tokenizer`. The :class:`~tokenizers.Encoding` represents the output of a :class:`~tokenizers.Tokenizer`.
""" """
@property @property
def attention_mask(self): def attention_mask(self):
""" """
@ -92,6 +96,7 @@ class Encoding:
:obj:`List[int]`: The attention mask :obj:`List[int]`: The attention mask
""" """
pass pass
def char_to_token(self, char_pos, sequence_index=0): def char_to_token(self, char_pos, sequence_index=0):
""" """
Get the token that contains the char at the given position in the input sequence. Get the token that contains the char at the given position in the input sequence.
@ -106,6 +111,7 @@ class Encoding:
:obj:`int`: The index of the token that contains this char in the encoded sequence :obj:`int`: The index of the token that contains this char in the encoded sequence
""" """
pass pass
def char_to_word(self, char_pos, sequence_index=0): def char_to_word(self, char_pos, sequence_index=0):
""" """
Get the word that contains the char at the given position in the input sequence. Get the word that contains the char at the given position in the input sequence.
@ -120,6 +126,7 @@ class Encoding:
:obj:`int`: The index of the word that contains this char in the input sequence :obj:`int`: The index of the word that contains this char in the input sequence
""" """
pass pass
@property @property
def ids(self): def ids(self):
""" """
@ -132,6 +139,7 @@ class Encoding:
:obj:`List[int]`: The list of IDs :obj:`List[int]`: The list of IDs
""" """
pass pass
@staticmethod @staticmethod
def merge(encodings, growing_offsets=True): def merge(encodings, growing_offsets=True):
""" """
@ -148,6 +156,7 @@ class Encoding:
:class:`~tokenizers.Encoding`: The resulting Encoding :class:`~tokenizers.Encoding`: The resulting Encoding
""" """
pass pass
@property @property
def n_sequences(self): def n_sequences(self):
""" """
@ -157,6 +166,7 @@ class Encoding:
:obj:`int`: The number of sequences in this :class:`~tokenizers.Encoding` :obj:`int`: The number of sequences in this :class:`~tokenizers.Encoding`
""" """
pass pass
@property @property
def offsets(self): def offsets(self):
""" """
@ -169,6 +179,7 @@ class Encoding:
A :obj:`List` of :obj:`Tuple[int, int]`: The list of offsets A :obj:`List` of :obj:`Tuple[int, int]`: The list of offsets
""" """
pass pass
@property @property
def overflowing(self): def overflowing(self):
""" """
@ -183,6 +194,7 @@ class Encoding:
maximum length. maximum length.
""" """
pass pass
def pad(self, length, direction="right", pad_id=0, pad_type_id=0, pad_token="[PAD]"): def pad(self, length, direction="right", pad_id=0, pad_type_id=0, pad_token="[PAD]"):
""" """
Pad the :class:`~tokenizers.Encoding` at the given length Pad the :class:`~tokenizers.Encoding` at the given length
@ -204,6 +216,7 @@ class Encoding:
The pad token to use The pad token to use
""" """
pass pass
@property @property
def sequence_ids(self): def sequence_ids(self):
""" """
@ -217,6 +230,7 @@ class Encoding:
A :obj:`List` of :obj:`Optional[int]`: A list of optional sequence index. A :obj:`List` of :obj:`Optional[int]`: A list of optional sequence index.
""" """
pass pass
def set_sequence_id(self, sequence_id): def set_sequence_id(self, sequence_id):
""" """
Set the given sequence index Set the given sequence index
@ -225,6 +239,7 @@ class Encoding:
:class:`~tokenizers.Encoding`. :class:`~tokenizers.Encoding`.
""" """
pass pass
@property @property
def special_tokens_mask(self): def special_tokens_mask(self):
""" """
@ -236,6 +251,7 @@ class Encoding:
:obj:`List[int]`: The special tokens mask :obj:`List[int]`: The special tokens mask
""" """
pass pass
def token_to_chars(self, token_index): def token_to_chars(self, token_index):
""" """
Get the offsets of the token at the given index. Get the offsets of the token at the given index.
@ -252,6 +268,7 @@ class Encoding:
:obj:`Tuple[int, int]`: The token offsets :obj:`(first, last + 1)` :obj:`Tuple[int, int]`: The token offsets :obj:`(first, last + 1)`
""" """
pass pass
def token_to_sequence(self, token_index): def token_to_sequence(self, token_index):
""" """
Get the index of the sequence represented by the given token. Get the index of the sequence represented by the given token.
@ -267,6 +284,7 @@ class Encoding:
:obj:`int`: The sequence id of the given token :obj:`int`: The sequence id of the given token
""" """
pass pass
def token_to_word(self, token_index): def token_to_word(self, token_index):
""" """
Get the index of the word that contains the token in one of the input sequences. Get the index of the word that contains the token in one of the input sequences.
@ -283,6 +301,7 @@ class Encoding:
:obj:`int`: The index of the word in the relevant input sequence. :obj:`int`: The index of the word in the relevant input sequence.
""" """
pass pass
@property @property
def tokens(self): def tokens(self):
""" """
@ -294,6 +313,7 @@ class Encoding:
:obj:`List[str]`: The list of tokens :obj:`List[str]`: The list of tokens
""" """
pass pass
def truncate(self, max_length, stride=0, direction="right"): def truncate(self, max_length, stride=0, direction="right"):
""" """
Truncate the :class:`~tokenizers.Encoding` at the given length Truncate the :class:`~tokenizers.Encoding` at the given length
@ -312,6 +332,7 @@ class Encoding:
Truncate direction Truncate direction
""" """
pass pass
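Taken together, `truncate` and `pad` let you post-process a single Encoding in place. A minimal sketch, using a tiny hypothetical WordLevel vocabulary just to obtain an Encoding:

from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.pre_tokenizers import Whitespace

vocab = {"[UNK]": 0, "[PAD]": 1, "hello": 2, "world": 3, "again": 4}
tok = Tokenizer(WordLevel(vocab, unk_token="[UNK]"))
tok.pre_tokenizer = Whitespace()

enc = tok.encode("hello world again")
enc.truncate(2, direction="right")           # keep only the first two tokens
enc.pad(4, pad_id=1, pad_token="[PAD]")      # right-pad up to length 4
print(enc.tokens)           # ['hello', 'world', '[PAD]', '[PAD]']
print(enc.attention_mask)   # [1, 1, 0, 0]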
@property @property
def type_ids(self): def type_ids(self):
""" """
@ -324,6 +345,7 @@ class Encoding:
:obj:`List[int]`: The list of type ids :obj:`List[int]`: The list of type ids
""" """
pass pass
@property @property
def word_ids(self): def word_ids(self):
""" """
@ -341,6 +363,7 @@ class Encoding:
A :obj:`List` of :obj:`Optional[int]`: A list of optional word index. A :obj:`List` of :obj:`Optional[int]`: A list of optional word index.
""" """
pass pass
def word_to_chars(self, word_index, sequence_index=0): def word_to_chars(self, word_index, sequence_index=0):
""" """
Get the offsets of the word at the given index in one of the input sequences. Get the offsets of the word at the given index in one of the input sequences.
@ -355,6 +378,7 @@ class Encoding:
:obj:`Tuple[int, int]`: The range of characters (span) :obj:`(first, last + 1)` :obj:`Tuple[int, int]`: The range of characters (span) :obj:`(first, last + 1)`
""" """
pass pass
def word_to_tokens(self, word_index, sequence_index=0): def word_to_tokens(self, word_index, sequence_index=0):
""" """
Get the encoded tokens corresponding to the word at the given index Get the encoded tokens corresponding to the word at the given index
@ -370,6 +394,7 @@ class Encoding:
:obj:`Tuple[int, int]`: The range of tokens: :obj:`(first, last + 1)` :obj:`Tuple[int, int]`: The range of tokens: :obj:`(first, last + 1)`
""" """
pass pass
@property @property
def words(self): def words(self):
""" """
@ -404,37 +429,42 @@ class NormalizedString:
sequence: str: sequence: str:
The string sequence used to initialize this NormalizedString The string sequence used to initialize this NormalizedString
""" """
def append(self, s): def append(self, s):
""" """
Append the given sequence to the string Append the given sequence to the string
""" """
pass pass
def clear(self): def clear(self):
""" """
Clears the string Clears the string
""" """
pass pass
def filter(self, func): def filter(self, func):
""" """
Filter each character of the string using the given func Filter each character of the string using the given func
""" """
pass pass
def for_each(self, func): def for_each(self, func):
""" """
Calls the given function for each character of the string Calls the given function for each character of the string
""" """
pass pass
def lowercase(self): def lowercase(self):
""" """
Lowercase the string Lowercase the string
""" """
pass pass
def lstrip(self): def lstrip(self):
""" """
Strip the left of the string Strip the left of the string
""" """
pass pass
def map(self, func): def map(self, func):
""" """
Calls the given function for each character of the string Calls the given function for each character of the string
@ -443,37 +473,44 @@ class NormalizedString:
returned value **must** be a str of length 1 (ie a character). returned value **must** be a str of length 1 (ie a character).
""" """
pass pass
def nfc(self): def nfc(self):
""" """
Runs the NFC normalization Runs the NFC normalization
""" """
pass pass
def nfd(self): def nfd(self):
""" """
Runs the NFD normalization Runs the NFD normalization
""" """
pass pass
def nfkc(self): def nfkc(self):
""" """
Runs the NFKC normalization Runs the NFKC normalization
""" """
pass pass
def nfkd(self): def nfkd(self):
""" """
Runs the NFKD normalization Runs the NFKD normalization
""" """
pass pass
@property @property
def normalized(self): def normalized(self):
""" """
The normalized part of the string The normalized part of the string
""" """
pass pass
def prepend(self, s): def prepend(self, s):
""" """
Prepend the given sequence to the string Prepend the given sequence to the string
""" """
pass pass
def replace(self, pattern, content): def replace(self, pattern, content):
""" """
Replace the content of the given pattern with the provided content Replace the content of the given pattern with the provided content
@ -486,16 +523,19 @@ class NormalizedString:
The content to be used as replacement The content to be used as replacement
""" """
pass pass
def rstrip(self): def rstrip(self):
""" """
Strip the right of the string Strip the right of the string
""" """
pass pass
def slice(self, range): def slice(self, range):
""" """
Slice the string using the given range Slice the string using the given range
""" """
pass pass
def split(self, pattern, behavior): def split(self, pattern, behavior):
""" """
Split the NormalizedString using the given pattern and the specified behavior Split the NormalizedString using the given pattern and the specified behavior
@ -513,11 +553,13 @@ class NormalizedString:
A list of NormalizedString, representing each split A list of NormalizedString, representing each split
""" """
pass pass
def strip(self): def strip(self):
""" """
Strip both ends of the string Strip both ends of the string
""" """
pass pass
def uppercase(self): def uppercase(self):
""" """
Uppercase the string Uppercase the string
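These methods all mutate the NormalizedString in place while keeping alignment with the original text, which is what makes custom normalizers possible. A minimal sketch (the class name is arbitrary):

from tokenizers import NormalizedString, Regex

class CustomNormalizer:
    def normalize(self, normalized: NormalizedString):
        normalized.nfkc()
        normalized.lowercase()
        normalized.filter(lambda c: not c.isnumeric())   # drop digits
        normalized.replace(Regex(r"\s+"), " ")           # squeeze whitespace
        normalized.strip()

n = NormalizedString("  Héllo   WORLD 123 ")
CustomNormalizer().normalize(n)
print(n.normalized)

Such an object can then be attached to a tokenizer through `tokenizers.normalizers.Normalizer.custom`, as in the custom-components example shipped with the bindings.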
@ -542,9 +584,9 @@ class PreTokenizedString:
sequence: str: sequence: str:
The string sequence used to initialize this PreTokenizedString The string sequence used to initialize this PreTokenizedString
""" """
def __init__(self, sequence): def __init__(self, sequence):
pass pass
def get_splits(self, offset_referential="original", offset_type="char"): def get_splits(self, offset_referential="original", offset_type="char"):
""" """
Get the splits currently managed by the PreTokenizedString Get the splits currently managed by the PreTokenizedString
@ -565,6 +607,7 @@ class PreTokenizedString:
A list of splits A list of splits
""" """
pass pass
def normalize(self, func): def normalize(self, func):
""" """
Normalize each split of the `PreTokenizedString` using the given `func` Normalize each split of the `PreTokenizedString` using the given `func`
@ -576,6 +619,7 @@ class PreTokenizedString:
NormalizedString allow its modification. NormalizedString allow its modification.
""" """
pass pass
def split(self, func): def split(self, func):
""" """
Split the PreTokenizedString using the given `func` Split the PreTokenizedString using the given `func`
@ -590,6 +634,7 @@ class PreTokenizedString:
should come from calling either `.split` or `.slice` on the received one. should come from calling either `.split` or `.slice` on the received one.
""" """
pass pass
def to_encoding(self, type_id=0, word_idx=None): def to_encoding(self, type_id=0, word_idx=None):
""" """
Return an Encoding generated from this PreTokenizedString Return an Encoding generated from this PreTokenizedString
@ -607,6 +652,7 @@ class PreTokenizedString:
An Encoding An Encoding
""" """
pass pass
def tokenize(self, func): def tokenize(self, func):
""" """
Tokenize each split of the `PreTokenizedString` using the given `func` Tokenize each split of the `PreTokenizedString` using the given `func`
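In practice, `split`, `normalize` and `tokenize` each take a callback over the current splits; for `split` the callback receives `(index, NormalizedString)` and must return a list of NormalizedString. A small sketch (the dash-splitting rule is just an example):

from tokenizers import NormalizedString, PreTokenizedString

def split_on_dash(i: int, s: NormalizedString):
    # Return a list of NormalizedString; "removed" drops the delimiter.
    return s.split("-", "removed")

pretok = PreTokenizedString("state-of-the-art tokenizers")
pretok.split(split_on_dash)
print(pretok.get_splits())   # [('state', (0, 5)), ('of', (6, 8)), ...]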
@ -622,7 +668,6 @@ class Regex:
""" """
Instantiate a new Regex with the given pattern Instantiate a new Regex with the given pattern
""" """
def __init__(self, pattern): def __init__(self, pattern):
pass pass
@ -639,9 +684,9 @@ class Tokenizer:
The core algorithm that this :obj:`Tokenizer` should be using. The core algorithm that this :obj:`Tokenizer` should be using.
""" """
def __init__(self, model): def __init__(self, model):
pass pass
def add_special_tokens(self, tokens): def add_special_tokens(self, tokens):
""" """
Add the given special tokens to the Tokenizer. Add the given special tokens to the Tokenizer.
@ -662,6 +707,7 @@ class Tokenizer:
:obj:`int`: The number of tokens that were created in the vocabulary :obj:`int`: The number of tokens that were created in the vocabulary
""" """
pass pass
def add_tokens(self, tokens): def add_tokens(self, tokens):
""" """
Add the given tokens to the vocabulary Add the given tokens to the vocabulary
@ -678,6 +724,7 @@ class Tokenizer:
:obj:`int`: The number of tokens that were created in the vocabulary :obj:`int`: The number of tokens that were created in the vocabulary
""" """
pass pass
def decode(self, ids, skip_special_tokens=True): def decode(self, ids, skip_special_tokens=True):
""" """
Decode the given list of ids back to a string Decode the given list of ids back to a string
@ -695,6 +742,7 @@ class Tokenizer:
:obj:`str`: The decoded string :obj:`str`: The decoded string
""" """
pass pass
def decode_batch(self, sequences, skip_special_tokens=True): def decode_batch(self, sequences, skip_special_tokens=True):
""" """
Decode a batch of ids back to their corresponding string Decode a batch of ids back to their corresponding string
@ -710,12 +758,14 @@ class Tokenizer:
:obj:`List[str]`: A list of decoded strings :obj:`List[str]`: A list of decoded strings
""" """
pass pass
@property @property
def decoder(self): def decoder(self):
""" """
The `optional` :class:`~tokenizers.decoders.Decoder` in use by the Tokenizer The `optional` :class:`~tokenizers.decoders.Decoder` in use by the Tokenizer
""" """
pass pass
def enable_padding( def enable_padding(
self, direction="right", pad_id=0, pad_type_id=0, pad_token="[PAD]", length=None, pad_to_multiple_of=None self, direction="right", pad_id=0, pad_type_id=0, pad_token="[PAD]", length=None, pad_to_multiple_of=None
): ):
@ -745,6 +795,7 @@ class Tokenizer:
the longest sequence in a batch. the longest sequence in a batch.
""" """
pass pass
def enable_truncation(self, max_length, stride=0, strategy="longest_first", direction="right"): def enable_truncation(self, max_length, stride=0, strategy="longest_first", direction="right"):
""" """
Enable truncation Enable truncation
@ -765,6 +816,7 @@ class Tokenizer:
Truncate direction Truncate direction
""" """
pass pass
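A minimal sketch of batch padding and truncation together (tiny hypothetical vocabulary):

from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.pre_tokenizers import Whitespace

vocab = {"[UNK]": 0, "[PAD]": 1, "the": 2, "cat": 3, "sat": 4, "down": 5}
tok = Tokenizer(WordLevel(vocab, unk_token="[UNK]"))
tok.pre_tokenizer = Whitespace()

tok.enable_padding(pad_id=1, pad_token="[PAD]")   # length=None: pad to longest in batch
tok.enable_truncation(max_length=4)

for enc in tok.encode_batch(["the cat sat", "the cat sat down the cat"]):
    print(enc.tokens, enc.attention_mask)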
def encode(self, sequence, pair=None, is_pretokenized=False, add_special_tokens=True): def encode(self, sequence, pair=None, is_pretokenized=False, add_special_tokens=True):
""" """
Encode the given sequence and pair. This method can process raw text sequences Encode the given sequence and pair. This method can process raw text sequences
@ -803,6 +855,7 @@ class Tokenizer:
""" """
pass pass
def encode_batch(self, input, is_pretokenized=False, add_special_tokens=True): def encode_batch(self, input, is_pretokenized=False, add_special_tokens=True):
""" """
Encode the given batch of inputs. This method accept both raw text sequences Encode the given batch of inputs. This method accept both raw text sequences
@ -838,6 +891,7 @@ class Tokenizer:
""" """
pass pass
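The same Tokenizer accepts raw strings, (sequence, pair) tuples and pre-tokenized input. A small sketch, training a throwaway BPE on a toy corpus first so the calls are self-contained:

from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import BpeTrainer

tok = Tokenizer(BPE(unk_token="[UNK]"))
tok.pre_tokenizer = Whitespace()
tok.train_from_iterator(
    ["a tiny toy corpus", "just enough text to build a vocab"],
    trainer=BpeTrainer(special_tokens=["[UNK]"]),
)

print(tok.encode("a toy corpus").tokens)                       # single sequence
print(tok.encode("a toy", "a corpus").tokens)                  # sequence + pair
print(tok.encode(["a", "toy"], is_pretokenized=True).tokens)   # pre-tokenized

batch = tok.encode_batch([("a toy", "a corpus"), ("more", "text")])
print(len(batch))   # 2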
@property @property
def encode_special_tokens(self): def encode_special_tokens(self):
""" """
@ -850,6 +904,7 @@ class Tokenizer:
""" """
pass pass
@staticmethod @staticmethod
def from_buffer(buffer): def from_buffer(buffer):
""" """
@ -863,6 +918,7 @@ class Tokenizer:
:class:`~tokenizers.Tokenizer`: The new tokenizer :class:`~tokenizers.Tokenizer`: The new tokenizer
""" """
pass pass
@staticmethod @staticmethod
def from_file(path): def from_file(path):
""" """
@ -877,6 +933,7 @@ class Tokenizer:
:class:`~tokenizers.Tokenizer`: The new tokenizer :class:`~tokenizers.Tokenizer`: The new tokenizer
""" """
pass pass
@staticmethod @staticmethod
def from_pretrained(identifier, revision="main", auth_token=None): def from_pretrained(identifier, revision="main", auth_token=None):
""" """
@ -897,6 +954,7 @@ class Tokenizer:
:class:`~tokenizers.Tokenizer`: The new tokenizer :class:`~tokenizers.Tokenizer`: The new tokenizer
""" """
pass pass
@staticmethod @staticmethod
def from_str(json): def from_str(json):
""" """
@ -911,6 +969,7 @@ class Tokenizer:
:class:`~tokenizers.Tokenizer`: The new tokenizer :class:`~tokenizers.Tokenizer`: The new tokenizer
""" """
pass pass
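The loaders above cover the common cases; for example (network access is needed for `from_pretrained`, and tokenizer.json is a placeholder path):

from tokenizers import Tokenizer

# From the Hugging Face Hub:
tok = Tokenizer.from_pretrained("bert-base-uncased")

# From a local serialized tokenizer, or from its JSON content:
tok = Tokenizer.from_file("tokenizer.json")
with open("tokenizer.json", "r", encoding="utf-8") as f:
    tok = Tokenizer.from_str(f.read())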
def get_added_tokens_decoder(self): def get_added_tokens_decoder(self):
""" """
Get the underlying vocabulary Get the underlying vocabulary
@ -919,6 +978,7 @@ class Tokenizer:
:obj:`Dict[int, AddedToken]`: The vocabulary :obj:`Dict[int, AddedToken]`: The vocabulary
""" """
pass pass
def get_vocab(self, with_added_tokens=True): def get_vocab(self, with_added_tokens=True):
""" """
Get the underlying vocabulary Get the underlying vocabulary
@ -931,6 +991,7 @@ class Tokenizer:
:obj:`Dict[str, int]`: The vocabulary :obj:`Dict[str, int]`: The vocabulary
""" """
pass pass
def get_vocab_size(self, with_added_tokens=True): def get_vocab_size(self, with_added_tokens=True):
""" """
Get the size of the underlying vocabulary Get the size of the underlying vocabulary
@ -943,6 +1004,7 @@ class Tokenizer:
:obj:`int`: The size of the vocabulary :obj:`int`: The size of the vocabulary
""" """
pass pass
def id_to_token(self, id): def id_to_token(self, id):
""" """
Convert the given id to its corresponding token if it exists Convert the given id to its corresponding token if it exists
@ -955,28 +1017,33 @@ class Tokenizer:
:obj:`Optional[str]`: An optional token, :obj:`None` if out of vocabulary :obj:`Optional[str]`: An optional token, :obj:`None` if out of vocabulary
""" """
pass pass
@property @property
def model(self): def model(self):
""" """
The :class:`~tokenizers.models.Model` in use by the Tokenizer The :class:`~tokenizers.models.Model` in use by the Tokenizer
""" """
pass pass
def no_padding(self): def no_padding(self):
""" """
Disable padding Disable padding
""" """
pass pass
def no_truncation(self): def no_truncation(self):
""" """
Disable truncation Disable truncation
""" """
pass pass
@property @property
def normalizer(self): def normalizer(self):
""" """
The `optional` :class:`~tokenizers.normalizers.Normalizer` in use by the Tokenizer The `optional` :class:`~tokenizers.normalizers.Normalizer` in use by the Tokenizer
""" """
pass pass
def num_special_tokens_to_add(self, is_pair): def num_special_tokens_to_add(self, is_pair):
""" """
Return the number of special tokens that would be added for single/pair sentences. Return the number of special tokens that would be added for single/pair sentences.
@ -984,6 +1051,7 @@ class Tokenizer:
:return: :return:
""" """
pass pass
@property @property
def padding(self): def padding(self):
""" """
@ -996,6 +1064,7 @@ class Tokenizer:
A dict with the current padding parameters if padding is enabled A dict with the current padding parameters if padding is enabled
""" """
pass pass
def post_process(self, encoding, pair=None, add_special_tokens=True): def post_process(self, encoding, pair=None, add_special_tokens=True):
""" """
Apply all the post-processing steps to the given encodings. Apply all the post-processing steps to the given encodings.
@ -1022,18 +1091,21 @@ class Tokenizer:
:class:`~tokenizers.Encoding`: The final post-processed encoding :class:`~tokenizers.Encoding`: The final post-processed encoding
""" """
pass pass
@property @property
def post_processor(self): def post_processor(self):
""" """
The `optional` :class:`~tokenizers.processors.PostProcessor` in use by the Tokenizer The `optional` :class:`~tokenizers.processors.PostProcessor` in use by the Tokenizer
""" """
pass pass
@property @property
def pre_tokenizer(self): def pre_tokenizer(self):
""" """
The `optional` :class:`~tokenizers.pre_tokenizers.PreTokenizer` in use by the Tokenizer The `optional` :class:`~tokenizers.pre_tokenizers.PreTokenizer` in use by the Tokenizer
""" """
pass pass
def save(self, path, pretty=True): def save(self, path, pretty=True):
""" """
Save the :class:`~tokenizers.Tokenizer` to the file at the given path. Save the :class:`~tokenizers.Tokenizer` to the file at the given path.
@ -1046,6 +1118,7 @@ class Tokenizer:
Whether the JSON file should be pretty formatted. Whether the JSON file should be pretty formatted.
""" """
pass pass
def to_str(self, pretty=False): def to_str(self, pretty=False):
""" """
Gets a serialized string representing this :class:`~tokenizers.Tokenizer`. Gets a serialized string representing this :class:`~tokenizers.Tokenizer`.
@ -1058,6 +1131,7 @@ class Tokenizer:
:obj:`str`: A string representing the serialized Tokenizer :obj:`str`: A string representing the serialized Tokenizer
""" """
pass pass
def token_to_id(self, token): def token_to_id(self, token):
""" """
Convert the given token to its corresponding id if it exists Convert the given token to its corresponding id if it exists
@ -1070,6 +1144,7 @@ class Tokenizer:
:obj:`Optional[int]`: An optional id, :obj:`None` if out of vocabulary :obj:`Optional[int]`: An optional id, :obj:`None` if out of vocabulary
""" """
pass pass
def train(self, files, trainer=None): def train(self, files, trainer=None):
""" """
Train the Tokenizer using the given files. Train the Tokenizer using the given files.
@ -1086,6 +1161,7 @@ class Tokenizer:
An optional trainer that should be used to train our Model An optional trainer that should be used to train our Model
""" """
pass pass
def train_from_iterator(self, iterator, trainer=None, length=None): def train_from_iterator(self, iterator, trainer=None, length=None):
""" """
Train the Tokenizer using the provided iterator. Train the Tokenizer using the provided iterator.
@ -1109,6 +1185,7 @@ class Tokenizer:
provide meaningful progress tracking provide meaningful progress tracking
""" """
pass pass
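A minimal training sketch showing both entry points (the file paths are placeholders):

from tokenizers import Tokenizer
from tokenizers.models import WordPiece
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import WordPieceTrainer

tok = Tokenizer(WordPiece(unk_token="[UNK]"))
tok.pre_tokenizer = Whitespace()
trainer = WordPieceTrainer(vocab_size=1000, special_tokens=["[UNK]", "[PAD]"])

# From files on disk (placeholder paths):
# tok.train(["data/part1.txt", "data/part2.txt"], trainer=trainer)

# From any iterator of strings; `length` is only used for progress reporting.
corpus = ["first training sentence", "second training sentence"]
tok.train_from_iterator(corpus, trainer=trainer, length=len(corpus))
print(tok.get_vocab_size())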
@property @property
def truncation(self): def truncation(self):
""" """

View File

@ -6,7 +6,6 @@ class Decoder:
This class is not supposed to be instantiated directly. Instead, any implementation of This class is not supposed to be instantiated directly. Instead, any implementation of
a Decoder will return an instance of this class when instantiated. a Decoder will return an instance of this class when instantiated.
""" """
def decode(self, tokens): def decode(self, tokens):
""" """
Decode the given list of tokens to a final string Decode the given list of tokens to a final string
@ -29,9 +28,9 @@ class BPEDecoder(Decoder):
The suffix that was used to characterize an end-of-word. This suffix will
be replaced by whitespaces during the decoding
""" """
def __init__(self, suffix="</w>"): def __init__(self, suffix="</w>"):
pass pass
def decode(self, tokens): def decode(self, tokens):
""" """
Decode the given list of tokens to a final string Decode the given list of tokens to a final string
@ -53,9 +52,9 @@ class ByteFallback(Decoder):
cannot be decoded you will get � instead for each inconvertible byte token
""" """
def __init__(self): def __init__(self):
pass pass
def decode(self, tokens): def decode(self, tokens):
""" """
Decode the given list of tokens to a final string Decode the given list of tokens to a final string
@ -76,9 +75,9 @@ class ByteLevel(Decoder):
This decoder is to be used in tandem with the :class:`~tokenizers.pre_tokenizers.ByteLevel` This decoder is to be used in tandem with the :class:`~tokenizers.pre_tokenizers.ByteLevel`
:class:`~tokenizers.pre_tokenizers.PreTokenizer`. :class:`~tokenizers.pre_tokenizers.PreTokenizer`.
""" """
def __init__(self): def __init__(self):
pass pass
def decode(self, tokens): def decode(self, tokens):
""" """
Decode the given list of tokens to a final string Decode the given list of tokens to a final string
@ -105,9 +104,9 @@ class CTC(Decoder):
Whether to cleanup some tokenization artifacts. Whether to cleanup some tokenization artifacts.
Mainly spaces before punctuation, and some abbreviated english forms. Mainly spaces before punctuation, and some abbreviated english forms.
""" """
def __init__(self, pad_token="<pad>", word_delimiter_token="|", cleanup=True): def __init__(self, pad_token="<pad>", word_delimiter_token="|", cleanup=True):
pass pass
def decode(self, tokens): def decode(self, tokens):
""" """
Decode the given list of tokens to a final string Decode the given list of tokens to a final string
@ -128,9 +127,9 @@ class Fuse(Decoder):
This is the last step of decoding, this decoder exists only if This is the last step of decoding, this decoder exists only if
there is need to add other decoders *after* the fusion there is need to add other decoders *after* the fusion
""" """
def __init__(self): def __init__(self):
pass pass
def decode(self, tokens): def decode(self, tokens):
""" """
Decode the given list of tokens to a final string Decode the given list of tokens to a final string
@ -157,9 +156,9 @@ class Metaspace(Decoder):
Whether to add a space to the first word if there isn't already one. This Whether to add a space to the first word if there isn't already one. This
lets us treat `hello` exactly like `say hello`. lets us treat `hello` exactly like `say hello`.
""" """
def __init__(self, replacement="▁", add_prefix_space=True):
pass pass
def decode(self, tokens): def decode(self, tokens):
""" """
Decode the given list of tokens to a final string Decode the given list of tokens to a final string
@ -180,9 +179,9 @@ class Replace(Decoder):
This decoder is to be used in tandem with the :class:`~tokenizers.pre_tokenizers.Replace` This decoder is to be used in tandem with the :class:`~tokenizers.pre_tokenizers.Replace`
:class:`~tokenizers.pre_tokenizers.PreTokenizer`. :class:`~tokenizers.pre_tokenizers.PreTokenizer`.
""" """
def __init__(self, pattern, content): def __init__(self, pattern, content):
pass pass
def decode(self, tokens): def decode(self, tokens):
""" """
Decode the given list of tokens to a final string Decode the given list of tokens to a final string
@ -204,9 +203,9 @@ class Sequence(Decoder):
decoders (:obj:`List[Decoder]`) decoders (:obj:`List[Decoder]`)
The decoders that need to be chained The decoders that need to be chained
""" """
def __init__(self, decoders): def __init__(self, decoders):
pass pass
def decode(self, tokens): def decode(self, tokens):
""" """
Decode the given list of tokens to a final string Decode the given list of tokens to a final string
@ -225,9 +224,9 @@ class Strip(Decoder):
Strip normalizer Strip normalizer
Strips n left characters of each token, or n right characters of each token Strips n left characters of each token, or n right characters of each token
""" """
def __init__(self, content, left=0, right=0): def __init__(self, content, left=0, right=0):
pass pass
def decode(self, tokens): def decode(self, tokens):
""" """
Decode the given list of tokens to a final string Decode the given list of tokens to a final string
@ -253,9 +252,9 @@ class WordPiece(Decoder):
Whether to cleanup some tokenization artifacts. Mainly spaces before punctuation, Whether to cleanup some tokenization artifacts. Mainly spaces before punctuation,
and some abbreviated english forms. and some abbreviated english forms.
""" """
def __init__(self, prefix="##", cleanup=True): def __init__(self, prefix="##", cleanup=True):
pass pass
def decode(self, tokens): def decode(self, tokens):
""" """
Decode the given list of tokens to a final string Decode the given list of tokens to a final string
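Decoders undo what the model and pre-tokenizer did to the surface form. A small sketch with the WordPiece decoder and a tiny hypothetical vocabulary:

from tokenizers import Tokenizer, decoders
from tokenizers.models import WordPiece
from tokenizers.pre_tokenizers import Whitespace

vocab = {"[UNK]": 0, "un": 1, "##believ": 2, "##able": 3}
tok = Tokenizer(WordPiece(vocab, unk_token="[UNK]"))
tok.pre_tokenizer = Whitespace()
tok.decoder = decoders.WordPiece(prefix="##", cleanup=True)

enc = tok.encode("unbelievable")
print(enc.tokens)            # ['un', '##believ', '##able']
print(tok.decode(enc.ids))   # 'unbelievable'

Several decoders can also be chained with `decoders.Sequence([...])`.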

View File

@ -8,7 +8,6 @@ class Model:
This class cannot be constructed directly. Please use one of the concrete models. This class cannot be constructed directly. Please use one of the concrete models.
""" """
def get_trainer(self): def get_trainer(self):
""" """
Get the associated :class:`~tokenizers.trainers.Trainer` Get the associated :class:`~tokenizers.trainers.Trainer`
@ -20,6 +19,7 @@ class Model:
:class:`~tokenizers.trainers.Trainer`: The Trainer used to train this model :class:`~tokenizers.trainers.Trainer`: The Trainer used to train this model
""" """
pass pass
def id_to_token(self, id): def id_to_token(self, id):
""" """
Get the token associated to an ID Get the token associated to an ID
@ -32,6 +32,7 @@ class Model:
:obj:`str`: The token associated to the ID :obj:`str`: The token associated to the ID
""" """
pass pass
def save(self, folder, prefix): def save(self, folder, prefix):
""" """
Save the current model Save the current model
@ -51,6 +52,7 @@ class Model:
:obj:`List[str]`: The list of saved files :obj:`List[str]`: The list of saved files
""" """
pass pass
def token_to_id(self, tokens): def token_to_id(self, tokens):
""" """
Get the ID associated to a token Get the ID associated to a token
@ -63,6 +65,7 @@ class Model:
:obj:`int`: The ID associated to the token :obj:`int`: The ID associated to the token
""" """
pass pass
def tokenize(self, sequence): def tokenize(self, sequence):
""" """
Tokenize a sequence Tokenize a sequence
@ -110,7 +113,6 @@ class BPE(Model):
byte_fallback (:obj:`bool`, `optional`): byte_fallback (:obj:`bool`, `optional`):
Whether to use spm byte-fallback trick (defaults to False) Whether to use spm byte-fallback trick (defaults to False)
""" """
def __init__( def __init__(
self, self,
vocab=None, vocab=None,
@ -124,6 +126,7 @@ class BPE(Model):
byte_fallback=False, byte_fallback=False,
): ):
pass pass
@staticmethod @staticmethod
def from_file(cls, vocab, merge, **kwargs): def from_file(cls, vocab, merge, **kwargs):
""" """
@ -149,6 +152,7 @@ class BPE(Model):
:class:`~tokenizers.models.BPE`: An instance of BPE loaded from these files :class:`~tokenizers.models.BPE`: An instance of BPE loaded from these files
""" """
pass pass
def get_trainer(self): def get_trainer(self):
""" """
Get the associated :class:`~tokenizers.trainers.Trainer` Get the associated :class:`~tokenizers.trainers.Trainer`
@ -160,6 +164,7 @@ class BPE(Model):
:class:`~tokenizers.trainers.Trainer`: The Trainer used to train this model :class:`~tokenizers.trainers.Trainer`: The Trainer used to train this model
""" """
pass pass
def id_to_token(self, id): def id_to_token(self, id):
""" """
Get the token associated to an ID Get the token associated to an ID
@ -172,6 +177,7 @@ class BPE(Model):
:obj:`str`: The token associated to the ID :obj:`str`: The token associated to the ID
""" """
pass pass
@staticmethod @staticmethod
def read_file(self, vocab, merges): def read_file(self, vocab, merges):
""" """
@ -193,6 +199,7 @@ class BPE(Model):
The vocabulary and merges loaded into memory The vocabulary and merges loaded into memory
""" """
pass pass
def save(self, folder, prefix): def save(self, folder, prefix):
""" """
Save the current model Save the current model
@ -212,6 +219,7 @@ class BPE(Model):
:obj:`List[str]`: The list of saved files :obj:`List[str]`: The list of saved files
""" """
pass pass
def token_to_id(self, tokens): def token_to_id(self, tokens):
""" """
Get the ID associated to a token Get the ID associated to a token
@ -224,6 +232,7 @@ class BPE(Model):
:obj:`int`: The ID associated to the token :obj:`int`: The ID associated to the token
""" """
pass pass
def tokenize(self, sequence): def tokenize(self, sequence):
""" """
Tokenize a sequence Tokenize a sequence
@ -245,9 +254,9 @@ class Unigram(Model):
vocab (:obj:`List[Tuple[str, float]]`, `optional`):
A list of vocabulary items and their relative score [("am", -0.2442),...]
""" """
def __init__(self, vocab, unk_id, byte_fallback): def __init__(self, vocab, unk_id, byte_fallback):
pass pass
def get_trainer(self): def get_trainer(self):
""" """
Get the associated :class:`~tokenizers.trainers.Trainer` Get the associated :class:`~tokenizers.trainers.Trainer`
@ -259,6 +268,7 @@ class Unigram(Model):
:class:`~tokenizers.trainers.Trainer`: The Trainer used to train this model :class:`~tokenizers.trainers.Trainer`: The Trainer used to train this model
""" """
pass pass
def id_to_token(self, id): def id_to_token(self, id):
""" """
Get the token associated to an ID Get the token associated to an ID
@ -271,6 +281,7 @@ class Unigram(Model):
:obj:`str`: The token associated to the ID :obj:`str`: The token associated to the ID
""" """
pass pass
def save(self, folder, prefix): def save(self, folder, prefix):
""" """
Save the current model Save the current model
@ -290,6 +301,7 @@ class Unigram(Model):
:obj:`List[str]`: The list of saved files :obj:`List[str]`: The list of saved files
""" """
pass pass
def token_to_id(self, tokens): def token_to_id(self, tokens):
""" """
Get the ID associated to a token Get the ID associated to a token
@ -302,6 +314,7 @@ class Unigram(Model):
:obj:`int`: The ID associated to the token :obj:`int`: The ID associated to the token
""" """
pass pass
def tokenize(self, sequence): def tokenize(self, sequence):
""" """
Tokenize a sequence Tokenize a sequence
@ -328,9 +341,9 @@ class WordLevel(Model):
unk_token (:obj:`str`, `optional`): unk_token (:obj:`str`, `optional`):
The unknown token to be used by the model. The unknown token to be used by the model.
""" """
def __init__(self, vocab, unk_token): def __init__(self, vocab, unk_token):
pass pass
@staticmethod @staticmethod
def from_file(vocab, unk_token): def from_file(vocab, unk_token):
""" """
@ -353,6 +366,7 @@ class WordLevel(Model):
:class:`~tokenizers.models.WordLevel`: An instance of WordLevel loaded from file :class:`~tokenizers.models.WordLevel`: An instance of WordLevel loaded from file
""" """
pass pass
def get_trainer(self): def get_trainer(self):
""" """
Get the associated :class:`~tokenizers.trainers.Trainer` Get the associated :class:`~tokenizers.trainers.Trainer`
@ -364,6 +378,7 @@ class WordLevel(Model):
:class:`~tokenizers.trainers.Trainer`: The Trainer used to train this model :class:`~tokenizers.trainers.Trainer`: The Trainer used to train this model
""" """
pass pass
def id_to_token(self, id): def id_to_token(self, id):
""" """
Get the token associated to an ID Get the token associated to an ID
@ -376,6 +391,7 @@ class WordLevel(Model):
:obj:`str`: The token associated to the ID :obj:`str`: The token associated to the ID
""" """
pass pass
@staticmethod @staticmethod
def read_file(vocab): def read_file(vocab):
""" """
@ -393,6 +409,7 @@ class WordLevel(Model):
:obj:`Dict[str, int]`: The vocabulary as a :obj:`dict` :obj:`Dict[str, int]`: The vocabulary as a :obj:`dict`
""" """
pass pass
def save(self, folder, prefix): def save(self, folder, prefix):
""" """
Save the current model Save the current model
@ -412,6 +429,7 @@ class WordLevel(Model):
:obj:`List[str]`: The list of saved files :obj:`List[str]`: The list of saved files
""" """
pass pass
def token_to_id(self, tokens): def token_to_id(self, tokens):
""" """
Get the ID associated to a token Get the ID associated to a token
@ -424,6 +442,7 @@ class WordLevel(Model):
:obj:`int`: The ID associated to the token :obj:`int`: The ID associated to the token
""" """
pass pass
def tokenize(self, sequence): def tokenize(self, sequence):
""" """
Tokenize a sequence Tokenize a sequence
@ -451,9 +470,9 @@ class WordPiece(Model):
max_input_chars_per_word (:obj:`int`, `optional`): max_input_chars_per_word (:obj:`int`, `optional`):
The maximum number of characters to authorize in a single word. The maximum number of characters to authorize in a single word.
""" """
def __init__(self, vocab, unk_token, max_input_chars_per_word): def __init__(self, vocab, unk_token, max_input_chars_per_word):
pass pass
@staticmethod @staticmethod
def from_file(vocab, **kwargs): def from_file(vocab, **kwargs):
""" """
@ -476,6 +495,7 @@ class WordPiece(Model):
:class:`~tokenizers.models.WordPiece`: An instance of WordPiece loaded from file :class:`~tokenizers.models.WordPiece`: An instance of WordPiece loaded from file
""" """
pass pass
def get_trainer(self): def get_trainer(self):
""" """
Get the associated :class:`~tokenizers.trainers.Trainer` Get the associated :class:`~tokenizers.trainers.Trainer`
@ -487,6 +507,7 @@ class WordPiece(Model):
:class:`~tokenizers.trainers.Trainer`: The Trainer used to train this model :class:`~tokenizers.trainers.Trainer`: The Trainer used to train this model
""" """
pass pass
def id_to_token(self, id): def id_to_token(self, id):
""" """
Get the token associated to an ID Get the token associated to an ID
@ -499,6 +520,7 @@ class WordPiece(Model):
:obj:`str`: The token associated to the ID :obj:`str`: The token associated to the ID
""" """
pass pass
@staticmethod @staticmethod
def read_file(vocab): def read_file(vocab):
""" """
@ -517,6 +539,7 @@ class WordPiece(Model):
:obj:`Dict[str, int]`: The vocabulary as a :obj:`dict` :obj:`Dict[str, int]`: The vocabulary as a :obj:`dict`
""" """
pass pass
def save(self, folder, prefix): def save(self, folder, prefix):
""" """
Save the current model Save the current model
@ -536,6 +559,7 @@ class WordPiece(Model):
:obj:`List[str]`: The list of saved files :obj:`List[str]`: The list of saved files
""" """
pass pass
def token_to_id(self, tokens): def token_to_id(self, tokens):
""" """
Get the ID associated to a token Get the ID associated to a token
@ -548,6 +572,7 @@ class WordPiece(Model):
:obj:`int`: The ID associated to the token :obj:`int`: The ID associated to the token
""" """
pass pass
def tokenize(self, sequence): def tokenize(self, sequence):
""" """
Tokenize a sequence Tokenize a sequence
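The concrete models above are either built from in-memory vocabularies or loaded from files; a short sketch (vocab.json/merges.txt are placeholder GPT-2 style files):

from tokenizers import Tokenizer
from tokenizers.models import BPE, WordLevel

# Load a BPE model from a vocab/merges pair on disk (placeholder paths):
tok = Tokenizer(BPE.from_file("vocab.json", "merges.txt", unk_token="[UNK]"))

# Or build a model directly from an in-memory vocabulary:
tok = Tokenizer(WordLevel({"[UNK]": 0, "hello": 1, "world": 2}, unk_token="[UNK]"))
print(tok.get_vocab_size())   # 3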

View File

@ -6,7 +6,6 @@ class Normalizer:
This class is not supposed to be instantiated directly. Instead, any implementation of a This class is not supposed to be instantiated directly. Instead, any implementation of a
Normalizer will return an instance of this class when instantiated. Normalizer will return an instance of this class when instantiated.
""" """
def normalize(self, normalized): def normalize(self, normalized):
""" """
Normalize a :class:`~tokenizers.NormalizedString` in-place Normalize a :class:`~tokenizers.NormalizedString` in-place
@ -22,6 +21,7 @@ class Normalizer:
:class:`~tokenizers.normalizers.Normalizer` :class:`~tokenizers.normalizers.Normalizer`
""" """
pass pass
def normalize_str(self, sequence): def normalize_str(self, sequence):
""" """
Normalize the given string Normalize the given string
@ -62,9 +62,9 @@ class BertNormalizer(Normalizer):
lowercase (:obj:`bool`, `optional`, defaults to :obj:`True`): lowercase (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether to lowercase. Whether to lowercase.
""" """
def __init__(self, clean_text=True, handle_chinese_chars=True, strip_accents=None, lowercase=True): def __init__(self, clean_text=True, handle_chinese_chars=True, strip_accents=None, lowercase=True):
pass pass
def normalize(self, normalized): def normalize(self, normalized):
""" """
Normalize a :class:`~tokenizers.NormalizedString` in-place Normalize a :class:`~tokenizers.NormalizedString` in-place
@ -80,6 +80,7 @@ class BertNormalizer(Normalizer):
:class:`~tokenizers.normalizers.Normalizer` :class:`~tokenizers.normalizers.Normalizer`
""" """
pass pass
def normalize_str(self, sequence): def normalize_str(self, sequence):
""" """
Normalize the given string Normalize the given string
@ -102,9 +103,9 @@ class Lowercase(Normalizer):
""" """
Lowercase Normalizer Lowercase Normalizer
""" """
def __init__(self): def __init__(self):
pass pass
def normalize(self, normalized): def normalize(self, normalized):
""" """
Normalize a :class:`~tokenizers.NormalizedString` in-place Normalize a :class:`~tokenizers.NormalizedString` in-place
@ -120,6 +121,7 @@ class Lowercase(Normalizer):
:class:`~tokenizers.normalizers.Normalizer` :class:`~tokenizers.normalizers.Normalizer`
""" """
pass pass
def normalize_str(self, sequence): def normalize_str(self, sequence):
""" """
Normalize the given string Normalize the given string
@ -142,9 +144,9 @@ class NFC(Normalizer):
""" """
NFC Unicode Normalizer NFC Unicode Normalizer
""" """
def __init__(self): def __init__(self):
pass pass
def normalize(self, normalized): def normalize(self, normalized):
""" """
Normalize a :class:`~tokenizers.NormalizedString` in-place Normalize a :class:`~tokenizers.NormalizedString` in-place
@ -160,6 +162,7 @@ class NFC(Normalizer):
:class:`~tokenizers.normalizers.Normalizer` :class:`~tokenizers.normalizers.Normalizer`
""" """
pass pass
def normalize_str(self, sequence): def normalize_str(self, sequence):
""" """
Normalize the given string Normalize the given string
@ -182,9 +185,9 @@ class NFD(Normalizer):
""" """
NFD Unicode Normalizer NFD Unicode Normalizer
""" """
def __init__(self): def __init__(self):
pass pass
def normalize(self, normalized): def normalize(self, normalized):
""" """
Normalize a :class:`~tokenizers.NormalizedString` in-place Normalize a :class:`~tokenizers.NormalizedString` in-place
@ -200,6 +203,7 @@ class NFD(Normalizer):
:class:`~tokenizers.normalizers.Normalizer` :class:`~tokenizers.normalizers.Normalizer`
""" """
pass pass
def normalize_str(self, sequence): def normalize_str(self, sequence):
""" """
Normalize the given string Normalize the given string
@ -222,9 +226,9 @@ class NFKC(Normalizer):
""" """
NFKC Unicode Normalizer NFKC Unicode Normalizer
""" """
def __init__(self): def __init__(self):
pass pass
def normalize(self, normalized): def normalize(self, normalized):
""" """
Normalize a :class:`~tokenizers.NormalizedString` in-place Normalize a :class:`~tokenizers.NormalizedString` in-place
@ -240,6 +244,7 @@ class NFKC(Normalizer):
:class:`~tokenizers.normalizers.Normalizer` :class:`~tokenizers.normalizers.Normalizer`
""" """
pass pass
def normalize_str(self, sequence): def normalize_str(self, sequence):
""" """
Normalize the given string Normalize the given string
@ -262,9 +267,9 @@ class NFKD(Normalizer):
""" """
NFKD Unicode Normalizer NFKD Unicode Normalizer
""" """
def __init__(self): def __init__(self):
pass pass
def normalize(self, normalized): def normalize(self, normalized):
""" """
Normalize a :class:`~tokenizers.NormalizedString` in-place Normalize a :class:`~tokenizers.NormalizedString` in-place
@ -280,6 +285,7 @@ class NFKD(Normalizer):
:class:`~tokenizers.normalizers.Normalizer` :class:`~tokenizers.normalizers.Normalizer`
""" """
pass pass
def normalize_str(self, sequence): def normalize_str(self, sequence):
""" """
Normalize the given string Normalize the given string
@ -302,9 +308,9 @@ class Nmt(Normalizer):
""" """
Nmt normalizer Nmt normalizer
""" """
def __init__(self): def __init__(self):
pass pass
def normalize(self, normalized): def normalize(self, normalized):
""" """
Normalize a :class:`~tokenizers.NormalizedString` in-place Normalize a :class:`~tokenizers.NormalizedString` in-place
@ -320,6 +326,7 @@ class Nmt(Normalizer):
:class:`~tokenizers.normalizers.Normalizer` :class:`~tokenizers.normalizers.Normalizer`
""" """
pass pass
def normalize_str(self, sequence): def normalize_str(self, sequence):
""" """
Normalize the given string Normalize the given string
@ -343,9 +350,9 @@ class Precompiled(Normalizer):
Precompiled normalizer Precompiled normalizer
Don't use manually; it is used for compatibility with SentencePiece.
""" """
def __init__(self, precompiled_charsmap): def __init__(self, precompiled_charsmap):
pass pass
def normalize(self, normalized): def normalize(self, normalized):
""" """
Normalize a :class:`~tokenizers.NormalizedString` in-place Normalize a :class:`~tokenizers.NormalizedString` in-place
@ -361,6 +368,7 @@ class Precompiled(Normalizer):
:class:`~tokenizers.normalizers.Normalizer` :class:`~tokenizers.normalizers.Normalizer`
""" """
pass pass
def normalize_str(self, sequence): def normalize_str(self, sequence):
""" """
Normalize the given string Normalize the given string
@ -383,9 +391,9 @@ class Prepend(Normalizer):
""" """
Prepend normalizer Prepend normalizer
""" """
def __init__(self, prepend): def __init__(self, prepend):
pass pass
def normalize(self, normalized): def normalize(self, normalized):
""" """
Normalize a :class:`~tokenizers.NormalizedString` in-place Normalize a :class:`~tokenizers.NormalizedString` in-place
@ -401,6 +409,7 @@ class Prepend(Normalizer):
:class:`~tokenizers.normalizers.Normalizer` :class:`~tokenizers.normalizers.Normalizer`
""" """
pass pass
def normalize_str(self, sequence): def normalize_str(self, sequence):
""" """
Normalize the given string Normalize the given string
@ -423,9 +432,9 @@ class Replace(Normalizer):
""" """
Replace normalizer Replace normalizer
""" """
def __init__(self, pattern, content): def __init__(self, pattern, content):
pass pass
def normalize(self, normalized): def normalize(self, normalized):
""" """
Normalize a :class:`~tokenizers.NormalizedString` in-place Normalize a :class:`~tokenizers.NormalizedString` in-place
@ -441,6 +450,7 @@ class Replace(Normalizer):
:class:`~tokenizers.normalizers.Normalizer` :class:`~tokenizers.normalizers.Normalizer`
""" """
pass pass
def normalize_str(self, sequence): def normalize_str(self, sequence):
""" """
Normalize the given string Normalize the given string
@ -468,7 +478,6 @@ class Sequence(Normalizer):
normalizers (:obj:`List[Normalizer]`): normalizers (:obj:`List[Normalizer]`):
A list of Normalizer to be run as a sequence A list of Normalizer to be run as a sequence
""" """
def normalize(self, normalized): def normalize(self, normalized):
""" """
Normalize a :class:`~tokenizers.NormalizedString` in-place Normalize a :class:`~tokenizers.NormalizedString` in-place
@ -484,6 +493,7 @@ class Sequence(Normalizer):
:class:`~tokenizers.normalizers.Normalizer` :class:`~tokenizers.normalizers.Normalizer`
""" """
pass pass
def normalize_str(self, sequence): def normalize_str(self, sequence):
""" """
Normalize the given string Normalize the given string
@ -506,9 +516,9 @@ class Strip(Normalizer):
""" """
Strip normalizer Strip normalizer
""" """
def __init__(self, left=True, right=True): def __init__(self, left=True, right=True):
pass pass
def normalize(self, normalized): def normalize(self, normalized):
""" """
Normalize a :class:`~tokenizers.NormalizedString` in-place Normalize a :class:`~tokenizers.NormalizedString` in-place
@ -524,6 +534,7 @@ class Strip(Normalizer):
:class:`~tokenizers.normalizers.Normalizer` :class:`~tokenizers.normalizers.Normalizer`
""" """
pass pass
def normalize_str(self, sequence): def normalize_str(self, sequence):
""" """
Normalize the given string Normalize the given string
@ -546,9 +557,9 @@ class StripAccents(Normalizer):
""" """
StripAccents normalizer StripAccents normalizer
""" """
def __init__(self): def __init__(self):
pass pass
def normalize(self, normalized): def normalize(self, normalized):
""" """
Normalize a :class:`~tokenizers.NormalizedString` in-place Normalize a :class:`~tokenizers.NormalizedString` in-place
@ -564,6 +575,7 @@ class StripAccents(Normalizer):
:class:`~tokenizers.normalizers.Normalizer` :class:`~tokenizers.normalizers.Normalizer`
""" """
pass pass
def normalize_str(self, sequence): def normalize_str(self, sequence):
""" """
Normalize the given string Normalize the given string

View File

@ -6,7 +6,6 @@ class PreTokenizer:
This class is not supposed to be instantiated directly. Instead, any implementation of a This class is not supposed to be instantiated directly. Instead, any implementation of a
PreTokenizer will return an instance of this class when instantiated. PreTokenizer will return an instance of this class when instantiated.
""" """
def pre_tokenize(self, pretok): def pre_tokenize(self, pretok):
""" """
Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
@ -23,6 +22,7 @@ class PreTokenizer:
:class:`~tokenizers.pre_tokenizers.PreTokenizer` :class:`~tokenizers.pre_tokenizers.PreTokenizer`
""" """
pass pass
def pre_tokenize_str(self, sequence): def pre_tokenize_str(self, sequence):
""" """
Pre tokenize the given string Pre tokenize the given string
@ -50,9 +50,9 @@ class BertPreTokenizer(PreTokenizer):
This pre-tokenizer splits tokens on spaces, and also on punctuation. This pre-tokenizer splits tokens on spaces, and also on punctuation.
Each occurrence of a punctuation character will be treated separately.
""" """
def __init__(self): def __init__(self):
pass pass
def pre_tokenize(self, pretok): def pre_tokenize(self, pretok):
""" """
Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
@ -69,6 +69,7 @@ class BertPreTokenizer(PreTokenizer):
:class:`~tokenizers.pre_tokenizers.PreTokenizer` :class:`~tokenizers.pre_tokenizers.PreTokenizer`
""" """
pass pass
def pre_tokenize_str(self, sequence): def pre_tokenize_str(self, sequence):
""" """
Pre tokenize the given string Pre tokenize the given string
@ -104,9 +105,9 @@ class ByteLevel(PreTokenizer):
Set this to :obj:`False` to prevent this `pre_tokenizer` from using
the GPT2 specific regexp for splitting on whitespace.
""" """
def __init__(self, add_prefix_space=True, use_regex=True): def __init__(self, add_prefix_space=True, use_regex=True):
pass pass
@staticmethod @staticmethod
def alphabet(): def alphabet():
""" """
@ -120,6 +121,7 @@ class ByteLevel(PreTokenizer):
:obj:`List[str]`: A list of characters that compose the alphabet :obj:`List[str]`: A list of characters that compose the alphabet
""" """
pass pass
def pre_tokenize(self, pretok): def pre_tokenize(self, pretok):
""" """
Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
@ -136,6 +138,7 @@ class ByteLevel(PreTokenizer):
:class:`~tokenizers.pre_tokenizers.PreTokenizer` :class:`~tokenizers.pre_tokenizers.PreTokenizer`
""" """
pass pass
def pre_tokenize_str(self, sequence): def pre_tokenize_str(self, sequence):
""" """
Pre tokenize the given string Pre tokenize the given string
@ -164,7 +167,6 @@ class CharDelimiterSplit(PreTokenizer):
delimiter: str: delimiter: str:
The delimiter char that will be used to split input The delimiter char that will be used to split input
""" """
def pre_tokenize(self, pretok): def pre_tokenize(self, pretok):
""" """
Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
@ -181,6 +183,7 @@ class CharDelimiterSplit(PreTokenizer):
:class:`~tokenizers.pre_tokenizers.PreTokenizer` :class:`~tokenizers.pre_tokenizers.PreTokenizer`
""" """
pass pass
def pre_tokenize_str(self, sequence): def pre_tokenize_str(self, sequence):
""" """
Pre tokenize the given string Pre tokenize the given string
@ -215,9 +218,9 @@ class Digits(PreTokenizer):
"Call 123 please" -> "Call ", "123", " please" "Call 123 please" -> "Call ", "123", " please"
""" """
def __init__(self, individual_digits=False): def __init__(self, individual_digits=False):
pass pass
def pre_tokenize(self, pretok): def pre_tokenize(self, pretok):
""" """
Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
@ -234,6 +237,7 @@ class Digits(PreTokenizer):
:class:`~tokenizers.pre_tokenizers.PreTokenizer` :class:`~tokenizers.pre_tokenizers.PreTokenizer`
""" """
pass pass
def pre_tokenize_str(self, sequence): def pre_tokenize_str(self, sequence):
""" """
Pre tokenize the given string Pre tokenize the given string
@ -270,9 +274,9 @@ class Metaspace(PreTokenizer):
Whether to add a space to the first word if there isn't already one. This Whether to add a space to the first word if there isn't already one. This
lets us treat `hello` exactly like `say hello`. lets us treat `hello` exactly like `say hello`.
""" """
def __init__(self, replacement="_", add_prefix_space=True): def __init__(self, replacement="_", add_prefix_space=True):
pass pass
def pre_tokenize(self, pretok): def pre_tokenize(self, pretok):
""" """
Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
@ -289,6 +293,7 @@ class Metaspace(PreTokenizer):
:class:`~tokenizers.pre_tokenizers.PreTokenizer` :class:`~tokenizers.pre_tokenizers.PreTokenizer`
""" """
pass pass
def pre_tokenize_str(self, sequence): def pre_tokenize_str(self, sequence):
""" """
Pre tokenize the given string Pre tokenize the given string
@ -319,9 +324,9 @@ class Punctuation(PreTokenizer):
Choices: "removed", "isolated" (default), "merged_with_previous", "merged_with_next", Choices: "removed", "isolated" (default), "merged_with_previous", "merged_with_next",
"contiguous" "contiguous"
""" """
def __init__(self, behavior="isolated"): def __init__(self, behavior="isolated"):
pass pass
def pre_tokenize(self, pretok): def pre_tokenize(self, pretok):
""" """
Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
@ -338,6 +343,7 @@ class Punctuation(PreTokenizer):
:class:`~tokenizers.pre_tokenizers.PreTokenizer` :class:`~tokenizers.pre_tokenizers.PreTokenizer`
""" """
pass pass
def pre_tokenize_str(self, sequence): def pre_tokenize_str(self, sequence):
""" """
Pre tokenize the given string Pre tokenize the given string
@ -362,9 +368,9 @@ class Sequence(PreTokenizer):
""" """
This pre-tokenizer composes other pre_tokenizers and applies them in sequence This pre-tokenizer composes other pre_tokenizers and applies them in sequence
""" """
def __init__(self, pretokenizers): def __init__(self, pretokenizers):
pass pass
def pre_tokenize(self, pretok): def pre_tokenize(self, pretok):
""" """
Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
@ -381,6 +387,7 @@ class Sequence(PreTokenizer):
:class:`~tokenizers.pre_tokenizers.PreTokenizer` :class:`~tokenizers.pre_tokenizers.PreTokenizer`
""" """
pass pass
def pre_tokenize_str(self, sequence): def pre_tokenize_str(self, sequence):
""" """
Pre tokenize the given string Pre tokenize the given string
@ -421,9 +428,9 @@ class Split(PreTokenizer):
invert (:obj:`bool`, `optional`, defaults to :obj:`False`): invert (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether to invert the pattern. Whether to invert the pattern.
""" """
def __init__(self, pattern, behavior, invert=False): def __init__(self, pattern, behavior, invert=False):
pass pass
def pre_tokenize(self, pretok): def pre_tokenize(self, pretok):
""" """
Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
@ -440,6 +447,7 @@ class Split(PreTokenizer):
:class:`~tokenizers.pre_tokenizers.PreTokenizer` :class:`~tokenizers.pre_tokenizers.PreTokenizer`
""" """
pass pass
def pre_tokenize_str(self, sequence): def pre_tokenize_str(self, sequence):
""" """
Pre tokenize the given string Pre tokenize the given string
@ -467,9 +475,9 @@ class UnicodeScripts(PreTokenizer):
Actually Hiragana and Katakana are fused with Han, and 0x30FC is Han too. Actually Hiragana and Katakana are fused with Han, and 0x30FC is Han too.
This mimics the SentencePiece Unigram implementation. This mimics the SentencePiece Unigram implementation.
""" """
def __init__(self): def __init__(self):
pass pass
def pre_tokenize(self, pretok): def pre_tokenize(self, pretok):
""" """
Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
@ -486,6 +494,7 @@ class UnicodeScripts(PreTokenizer):
:class:`~tokenizers.pre_tokenizers.PreTokenizer` :class:`~tokenizers.pre_tokenizers.PreTokenizer`
""" """
pass pass
def pre_tokenize_str(self, sequence): def pre_tokenize_str(self, sequence):
""" """
Pre tokenize the given string Pre tokenize the given string
@ -510,9 +519,9 @@ class Whitespace(PreTokenizer):
""" """
This pre-tokenizer simply splits using the following regex: `\w+|[^\w\s]+` This pre-tokenizer simply splits using the following regex: `\w+|[^\w\s]+`
""" """
def __init__(self): def __init__(self):
pass pass
def pre_tokenize(self, pretok): def pre_tokenize(self, pretok):
""" """
Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
@ -529,6 +538,7 @@ class Whitespace(PreTokenizer):
:class:`~tokenizers.pre_tokenizers.PreTokenizer` :class:`~tokenizers.pre_tokenizers.PreTokenizer`
""" """
pass pass
def pre_tokenize_str(self, sequence): def pre_tokenize_str(self, sequence):
""" """
Pre tokenize the given string Pre tokenize the given string
@ -553,9 +563,9 @@ class WhitespaceSplit(PreTokenizer):
""" """
This pre-tokenizer simply splits on whitespace. Works like `.split()` This pre-tokenizer simply splits on whitespace. Works like `.split()`
""" """
def __init__(self): def __init__(self):
pass pass
def pre_tokenize(self, pretok): def pre_tokenize(self, pretok):
""" """
Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
@ -572,6 +582,7 @@ class WhitespaceSplit(PreTokenizer):
:class:`~tokenizers.pre_tokenizers.PreTokenizer` :class:`~tokenizers.pre_tokenizers.PreTokenizer`
""" """
pass pass
def pre_tokenize_str(self, sequence): def pre_tokenize_str(self, sequence):
""" """
Pre tokenize the given string Pre tokenize the given string
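For a concrete sense of the pre-tokenizer API documented in these stubs, here is a minimal sketch chaining two of the classes above; the printed offsets are indicative of the expected output, not copied from a test run.

from tokenizers.pre_tokenizers import Digits, Sequence, Whitespace

# Chain two of the pre-tokenizers documented above; Sequence applies them in order.
pre_tokenizer = Sequence([Whitespace(), Digits(individual_digits=True)])

# pre_tokenize_str returns (piece, (start, end)) tuples over the original string.
print(pre_tokenizer.pre_tokenize_str("Call 123 please"))
# Roughly: [('Call', (0, 4)), ('1', (5, 6)), ('2', (6, 7)), ('3', (7, 8)), ('please', (9, 15))]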

View File

@ -6,7 +6,6 @@ class PostProcessor:
This class is not supposed to be instantiated directly. Instead, any implementation of This class is not supposed to be instantiated directly. Instead, any implementation of
a PostProcessor will return an instance of this class when instantiated. a PostProcessor will return an instance of this class when instantiated.
""" """
def num_special_tokens_to_add(self, is_pair): def num_special_tokens_to_add(self, is_pair):
""" """
Return the number of special tokens that would be added for single/pair sentences. Return the number of special tokens that would be added for single/pair sentences.
@ -19,6 +18,7 @@ class PostProcessor:
:obj:`int`: The number of tokens to add :obj:`int`: The number of tokens to add
""" """
pass pass
def process(self, encoding, pair=None, add_special_tokens=True): def process(self, encoding, pair=None, add_special_tokens=True):
""" """
Post-process the given encodings, generating the final one Post-process the given encodings, generating the final one
@ -53,9 +53,9 @@ class BertProcessing(PostProcessor):
cls (:obj:`Tuple[str, int]`): cls (:obj:`Tuple[str, int]`):
A tuple with the string representation of the CLS token, and its id A tuple with the string representation of the CLS token, and its id
""" """
def __init__(self, sep, cls): def __init__(self, sep, cls):
pass pass
def num_special_tokens_to_add(self, is_pair): def num_special_tokens_to_add(self, is_pair):
""" """
Return the number of special tokens that would be added for single/pair sentences. Return the number of special tokens that would be added for single/pair sentences.
@ -68,6 +68,7 @@ class BertProcessing(PostProcessor):
:obj:`int`: The number of tokens to add :obj:`int`: The number of tokens to add
""" """
pass pass
def process(self, encoding, pair=None, add_special_tokens=True): def process(self, encoding, pair=None, add_special_tokens=True):
""" """
Post-process the given encodings, generating the final one Post-process the given encodings, generating the final one
@ -98,9 +99,9 @@ class ByteLevel(PostProcessor):
trim_offsets (:obj:`bool`): trim_offsets (:obj:`bool`):
Whether to trim the whitespaces from the produced offsets. Whether to trim the whitespaces from the produced offsets.
""" """
def __init__(self, trim_offsets=True): def __init__(self, trim_offsets=True):
pass pass
def num_special_tokens_to_add(self, is_pair): def num_special_tokens_to_add(self, is_pair):
""" """
Return the number of special tokens that would be added for single/pair sentences. Return the number of special tokens that would be added for single/pair sentences.
@ -113,6 +114,7 @@ class ByteLevel(PostProcessor):
:obj:`int`: The number of tokens to add :obj:`int`: The number of tokens to add
""" """
pass pass
def process(self, encoding, pair=None, add_special_tokens=True): def process(self, encoding, pair=None, add_special_tokens=True):
""" """
Post-process the given encodings, generating the final one Post-process the given encodings, generating the final one
@ -159,9 +161,9 @@ class RobertaProcessing(PostProcessor):
Whether the add_prefix_space option was enabled during pre-tokenization. This Whether the add_prefix_space option was enabled during pre-tokenization. This
is relevant because it defines the way the offsets are trimmed out. is relevant because it defines the way the offsets are trimmed out.
""" """
def __init__(self, sep, cls, trim_offsets=True, add_prefix_space=True): def __init__(self, sep, cls, trim_offsets=True, add_prefix_space=True):
pass pass
def num_special_tokens_to_add(self, is_pair): def num_special_tokens_to_add(self, is_pair):
""" """
Return the number of special tokens that would be added for single/pair sentences. Return the number of special tokens that would be added for single/pair sentences.
@ -174,6 +176,7 @@ class RobertaProcessing(PostProcessor):
:obj:`int`: The number of tokens to add :obj:`int`: The number of tokens to add
""" """
pass pass
def process(self, encoding, pair=None, add_special_tokens=True): def process(self, encoding, pair=None, add_special_tokens=True):
""" """
Post-process the given encodings, generating the final one Post-process the given encodings, generating the final one
@ -201,9 +204,9 @@ class Sequence(PostProcessor):
processors (:obj:`List[PostProcessor]`) processors (:obj:`List[PostProcessor]`)
The processors that need to be chained The processors that need to be chained
""" """
def __init__(self, processors): def __init__(self, processors):
pass pass
def num_special_tokens_to_add(self, is_pair): def num_special_tokens_to_add(self, is_pair):
""" """
Return the number of special tokens that would be added for single/pair sentences. Return the number of special tokens that would be added for single/pair sentences.
@ -216,6 +219,7 @@ class Sequence(PostProcessor):
:obj:`int`: The number of tokens to add :obj:`int`: The number of tokens to add
""" """
pass pass
def process(self, encoding, pair=None, add_special_tokens=True): def process(self, encoding, pair=None, add_special_tokens=True):
""" """
Post-process the given encodings, generating the final one Post-process the given encodings, generating the final one
@ -302,9 +306,9 @@ class TemplateProcessing(PostProcessor):
The given dict expects the provided :obj:`ids` and :obj:`tokens` lists to have The given dict expects the provided :obj:`ids` and :obj:`tokens` lists to have
the same length. the same length.
""" """
def __init__(self, single, pair, special_tokens): def __init__(self, single, pair, special_tokens):
pass pass
def num_special_tokens_to_add(self, is_pair): def num_special_tokens_to_add(self, is_pair):
""" """
Return the number of special tokens that would be added for single/pair sentences. Return the number of special tokens that would be added for single/pair sentences.
@ -317,6 +321,7 @@ class TemplateProcessing(PostProcessor):
:obj:`int`: The number of tokens to add :obj:`int`: The number of tokens to add
""" """
pass pass
def process(self, encoding, pair=None, add_special_tokens=True): def process(self, encoding, pair=None, add_special_tokens=True):
""" """
Post-process the given encodings, generating the final one Post-process the given encodings, generating the final one
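The post-processor stubs above follow the same pattern; a minimal TemplateProcessing sketch showing how the documented methods are used (the token ids 1 and 2 are placeholders for the example, not a real vocabulary):

from tokenizers.processors import TemplateProcessing

processor = TemplateProcessing(
    single="[CLS] $A [SEP]",
    pair="[CLS] $A [SEP] $B:1 [SEP]",
    special_tokens=[("[CLS]", 1), ("[SEP]", 2)],  # placeholder ids for the example
)

# Three special tokens ([CLS], [SEP], [SEP]) wrap a sentence pair with this template.
print(processor.num_special_tokens_to_add(is_pair=True))  # 3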

View File

@ -92,7 +92,7 @@ class EncodingVisualizer:
if default_to_notebook: if default_to_notebook:
try: try:
from IPython.core.display import HTML, display from IPython.core.display import HTML, display
except ImportError as e: except ImportError:
raise Exception( raise Exception(
"""We couldn't import IPython utils for html display. """We couldn't import IPython utils for html display.
Are you running in a notebook? Are you running in a notebook?
@ -136,7 +136,7 @@ class EncodingVisualizer:
if final_default_to_notebook: if final_default_to_notebook:
try: try:
from IPython.core.display import HTML, display from IPython.core.display import HTML, display
except ImportError as e: except ImportError:
raise Exception( raise Exception(
"""We couldn't import IPython utils for html display. """We couldn't import IPython utils for html display.
Are you running in a notebook?""" Are you running in a notebook?"""
@ -170,7 +170,7 @@ class EncodingVisualizer:
if h_step < 20: if h_step < 20:
h_step = 20 h_step = 20
s = 32 s = 32
l = 64 l = 64 # noqa: E741
h = 10 h = 10
colors = {} colors = {}
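The `# noqa: E741` added above is the one inline suppression in this file: ruff flags single-letter names that are easy to misread, which the `l` used alongside `h` and `s` in the color helper trips. A tiny illustration (the assignments below exist only for the example):

l = 64  # E741: ambiguous variable name 'l' -- the case silenced above with `# noqa: E741`
I = 1   # E741 would flag 'I' the same way
O = 0   # ...and 'O'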

View File

@ -80,7 +80,6 @@ class UnigramTrainer(Trainer):
The number of iterations of the EM algorithm to perform before The number of iterations of the EM algorithm to perform before
pruning the vocabulary. pruning the vocabulary.
""" """
def __init__( def __init__(
self, self,
vocab_size=8000, vocab_size=8000,
@ -143,7 +142,6 @@ class WordPieceTrainer(Trainer):
end_of_word_suffix (:obj:`str`, `optional`): end_of_word_suffix (:obj:`str`, `optional`):
A suffix to be used for every subword that is an end-of-word. A suffix to be used for every subword that is an end-of-word.
""" """
def __init__( def __init__(
self, self,
vocab_size=30000, vocab_size=30000,

View File

@ -34,7 +34,7 @@ Source = 'https://github.com/huggingface/tokenizers'
[project.optional-dependencies] [project.optional-dependencies]
testing = ["pytest", "requests", "numpy", "datasets", "black==22.3"] testing = ["pytest", "requests", "numpy", "datasets", "black==22.3", "ruff"]
docs = ["sphinx", "sphinx_rtd_theme", "setuptools_rust"] docs = ["sphinx", "sphinx_rtd_theme", "setuptools_rust"]
dev = ["tokenizers[testing]"] dev = ["tokenizers[testing]"]
@ -52,3 +52,21 @@ features = ["pyo3/extension-module"]
[tool.black] [tool.black]
line-length = 119 line-length = 119
target-version = ['py35'] target-version = ['py35']
[tool.ruff]
line-length = 119
target-version = "py311"
lint.ignore = [
# a == None in tests vs is None.
"E711",
# a == False in tests vs is False.
"E712",
# try.. import except.. pattern without using the lib.
"F401",
# Raw type equality is required in asserts
"E721",
# Import order
"E402",
# Fixtures unused import
"F811",
]
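The ignore list keeps ruff quiet about patterns that are deliberate in the tests and scripts rather than bugs. A self-contained illustration of what the first few codes would otherwise flag (every name below is invented for the example):

result, flag = None, False

assert result == None            # E711: `== None` instead of `is None`, common in the tests
assert flag == False             # E712: `== False` instead of `is False`
assert type(flag) == type(True)  # E721: raw type equality kept on purpose in asserts

try:
    import datasets  # F401: imported only to check that the optional dependency exists
except ImportError:
    datasets = None

# E402 (imports placed after module-level statements) and F811 (pytest fixtures that
# shadow the names they were imported as) are silenced for the same reason: the layout is intentional.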

View File

@ -80,9 +80,7 @@ class SpmConverter(Converter):
tokenizer = Tokenizer(Unigram(vocab, unk_id)) tokenizer = Tokenizer(Unigram(vocab, unk_id))
elif model_type == 2: elif model_type == 2:
vocab, merges = SentencePieceExtractor(self.original_tokenizer.vocab_file).extract() vocab, merges = SentencePieceExtractor(self.original_tokenizer.vocab_file).extract()
tokenizer = Tokenizer( tokenizer = Tokenizer(BPE(vocab, merges, unk_token=proto.trainer_spec.unk_piece, fuse_unk=True))
BPE(vocab, merges, unk_token=proto.trainer_spec.unk_piece, fuse_unk=True)
)
else: else:
raise Exception( raise Exception(
"You're trying to run a `Unigram` model but you're file was trained with a different algorithm" "You're trying to run a `Unigram` model but you're file was trained with a different algorithm"
@ -105,12 +103,8 @@ class SpmConverter(Converter):
replacement = "" replacement = ""
add_prefix_space = True add_prefix_space = True
tokenizer.pre_tokenizer = Metaspace( tokenizer.pre_tokenizer = Metaspace(replacement=replacement, add_prefix_space=add_prefix_space)
replacement=replacement, add_prefix_space=add_prefix_space tokenizer.decoder = decoders.Metaspace(replacement=replacement, add_prefix_space=add_prefix_space)
)
tokenizer.decoder = decoders.Metaspace(
replacement=replacement, add_prefix_space=add_prefix_space
)
post_processor = self.post_processor(tokenizer) post_processor = self.post_processor(tokenizer)
if post_processor: if post_processor:
tokenizer.post_processor = post_processor tokenizer.post_processor = post_processor
@ -124,9 +118,7 @@ class SpmConverter(Converter):
class AlbertConverter(SpmConverter): class AlbertConverter(SpmConverter):
def vocab(self, proto): def vocab(self, proto):
return [ return [
(piece.piece, piece.score) (piece.piece, piece.score) if check_number_comma(piece.piece) else (piece.piece, piece.score - 100)
if check_number_comma(piece.piece)
else (piece.piece, piece.score - 100)
for piece in proto.pieces for piece in proto.pieces
] ]
@ -261,9 +253,7 @@ class XLMRobertaConverter(SpmConverter):
class XLNetConverter(SpmConverter): class XLNetConverter(SpmConverter):
def vocab(self, proto): def vocab(self, proto):
return [ return [
(piece.piece, piece.score) (piece.piece, piece.score) if check_number_comma(piece.piece) else (piece.piece, piece.score - 100)
if check_number_comma(piece.piece)
else (piece.piece, piece.score - 100)
for piece in proto.pieces for piece in proto.pieces
] ]
@ -420,9 +410,7 @@ def main():
print(f"|{'-'*model_len}|{'-'*status_len}|{'-'*speedup_len}|") print(f"|{'-'*model_len}|{'-'*status_len}|{'-'*speedup_len}|")
for pretrained in args.models: for pretrained in args.models:
status, speedup = check(pretrained, args.filename) status, speedup = check(pretrained, args.filename)
print( print(f"|{pretrained:<{model_len}}|{status:^{status_len}}|{speedup:^{speedup_len - 1}.2f}x|")
f"|{pretrained:<{model_len}}|{status:^{status_len}}|{speedup:^{speedup_len - 1}.2f}x|"
)
if __name__ == "__main__": if __name__ == "__main__":

View File

@ -59,7 +59,6 @@ class YouTokenToMeExtractor:
def extract(self) -> Tuple[Dict[str, int], List[Tuple]]: def extract(self) -> Tuple[Dict[str, int], List[Tuple]]:
with open(self._model, "r") as model_f: with open(self._model, "r") as model_f:
# Retrieve information # Retrieve information
nb_pieces, nb_merges = map(int, model_f.readline().split()) nb_pieces, nb_merges = map(int, model_f.readline().split())
vocab, merges = {}, [] vocab, merges = {}, []
@ -97,9 +96,7 @@ if __name__ == "__main__":
choices=["sentencepiece", "youtokentome"], choices=["sentencepiece", "youtokentome"],
help="Indicate the format of the file.", help="Indicate the format of the file.",
) )
parser.add_argument( parser.add_argument("--model", type=str, required=True, help="SentencePiece model to extract vocab from.")
"--model", type=str, required=True, help="SentencePiece model to extract vocab from."
)
parser.add_argument( parser.add_argument(
"--vocab-output-path", "--vocab-output-path",
type=str, type=str,
@ -128,9 +125,7 @@ if __name__ == "__main__":
args.model = f.name args.model = f.name
# Allocate extractor # Allocate extractor
extractor = ( extractor = SentencePieceExtractor if args.provider == "sentencepiece" else YouTokenToMeExtractor
SentencePieceExtractor if args.provider == "sentencepiece" else YouTokenToMeExtractor
)
extractor = extractor(args.model) extractor = extractor(args.model)
logger.info(f"Using {type(extractor).__name__}") logger.info(f"Using {type(extractor).__name__}")

View File

@ -121,9 +121,7 @@ def check_train(args):
break break
print(f"Tokenizer used {tokenizer_tokens}, where spm used {spm_tokens}") print(f"Tokenizer used {tokenizer_tokens}, where spm used {spm_tokens}")
assert ( assert tokenizer_tokens < spm_tokens, "Our trainer should be at least more efficient than the SPM one"
tokenizer_tokens < spm_tokens
), "Our trainer should be at least more efficient than the SPM one"
print("Ok our trainer is at least more efficient than the SPM one") print("Ok our trainer is at least more efficient than the SPM one")
@ -131,9 +129,7 @@ def check_diff(spm_diff, tok_diff, sp, tok):
if spm_diff == list(reversed(tok_diff)): if spm_diff == list(reversed(tok_diff)):
# AAA -> AA+A vs A+AA case. # AAA -> AA+A vs A+AA case.
return True return True
elif len(spm_diff) == len(tok_diff) and tok.decode(spm_diff) == tok.decode( elif len(spm_diff) == len(tok_diff) and tok.decode(spm_diff) == tok.decode(tok_diff):
tok_diff
):
# Second order OK # Second order OK
# Barrich -> Barr + ich vs Bar + rich # Barrich -> Barr + ich vs Bar + rich
return True return True
@ -173,24 +169,17 @@ def check_details(line, spm_ids, tok_ids, sp, tok):
spms = Counter(spm_ids[first:last]) spms = Counter(spm_ids[first:last])
toks = Counter(tok_ids[first:last]) toks = Counter(tok_ids[first:last])
removable_tokens = { removable_tokens = {spm_ for (spm_, si) in spms.items() if toks.get(spm_, 0) == si}
spm_ for (spm_, si) in spms.items() if toks.get(spm_, 0) == si
}
min_width = 3 min_width = 3
for i in range(last - first - min_width): for i in range(last - first - min_width):
if all( if all(spm_ids[first + i + j] in removable_tokens for j in range(min_width)):
spm_ids[first + i + j] in removable_tokens for j in range(min_width)
):
possible_matches = [ possible_matches = [
k k
for k in range(last - first - min_width) for k in range(last - first - min_width)
if tok_ids[first + k : first + k + min_width] if tok_ids[first + k : first + k + min_width] == spm_ids[first + i : first + i + min_width]
== spm_ids[first + i : first + i + min_width]
] ]
for j in possible_matches: for j in possible_matches:
if check_diff( if check_diff(spm_ids[first : first + i], tok_ids[first : first + j], sp, tok) and check_details(
spm_ids[first : first + i], tok_ids[first : first + j], sp, tok
) and check_details(
line, line,
spm_ids[first + i : last], spm_ids[first + i : last],
tok_ids[first + j : last], tok_ids[first + j : last],
@ -210,9 +199,7 @@ def check_details(line, spm_ids, tok_ids, sp, tok):
wrong = tok.decode(spm_ids[first:last]) wrong = tok.decode(spm_ids[first:last])
print() print()
if has_color: if has_color:
print( print(f"{colored(ok_start, 'grey')}{colored(wrong, 'red')}{colored(ok_end, 'grey')}")
f"{colored(ok_start, 'grey')}{colored(wrong, 'red')}{colored(ok_end, 'grey')}"
)
else: else:
print(wrong) print(wrong)
return False return False
@ -251,9 +238,7 @@ def check_encode(args):
if args.verbose: if args.verbose:
if i % 10000 == 0: if i % 10000 == 0:
print( print(f"({perfect} / {imperfect} / {wrong} ----- {perfect + imperfect + wrong})")
f"({perfect} / {imperfect} / {wrong} ----- {perfect + imperfect + wrong})"
)
print(f"SPM: {spm_total_time} - TOK: {tok_total_time}") print(f"SPM: {spm_total_time} - TOK: {tok_total_time}")
if ids != encoded.ids: if ids != encoded.ids:
@ -265,13 +250,13 @@ def check_encode(args):
else: else:
perfect += 1 perfect += 1
assert ids == encoded.ids, f"line {i}: {line} : \n\n{ids}\n{encoded.ids}\n{list(zip(encoded.ids, encoded.tokens))}" assert (
ids == encoded.ids
), f"line {i}: {line} : \n\n{ids}\n{encoded.ids}\n{list(zip(encoded.ids, encoded.tokens))}"
print(f"({perfect} / {imperfect} / {wrong} ----- {perfect + imperfect + wrong})") print(f"({perfect} / {imperfect} / {wrong} ----- {perfect + imperfect + wrong})")
total = perfect + imperfect + wrong total = perfect + imperfect + wrong
print( print(f"Accuracy {perfect * 100 / total:.2f} Slowdown : {tok_total_time/ spm_total_time:.2f}")
f"Accuracy {perfect * 100 / total:.2f} Slowdown : {tok_total_time/ spm_total_time:.2f}"
)
if __name__ == "__main__": if __name__ == "__main__":
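The check_diff hunk above treats mirror-image segmentations as equivalent; a toy sketch of that first-order case, with invented ids:

# SentencePiece may split "AAA" as AA + A while the tokenizer picks A + AA;
# the differing id spans are then exact mirror images of each other.
spm_diff = [17, 4]  # hypothetical ids for ("AA", "A")
tok_diff = [4, 17]  # hypothetical ids for ("A", "AA")
assert spm_diff == list(reversed(tok_diff))  # the first-order case check_diff accepts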

View File

@ -3,8 +3,6 @@ import inspect
import os import os
from pathlib import Path from pathlib import Path
import black
INDENT = " " * 4 INDENT = " " * 4
GENERATED_COMMENT = "# Generated content DO NOT EDIT\n" GENERATED_COMMENT = "# Generated content DO NOT EDIT\n"
@ -85,7 +83,7 @@ def pyi_file(obj, indent=""):
body += f"{indent+INDENT}pass\n" body += f"{indent+INDENT}pass\n"
body += "\n" body += "\n"
for (name, fn) in fns: for name, fn in fns:
body += pyi_file(fn, indent=indent) body += pyi_file(fn, indent=indent)
if not body: if not body:
@ -122,18 +120,17 @@ def py_file(module, origin):
return string return string
def do_black(content, is_pyi): import subprocess
mode = black.Mode( from typing import List, Optional, Tuple
target_versions={black.TargetVersion.PY35},
line_length=119,
is_pyi=is_pyi, def do_ruff(code, is_pyi: bool):
string_normalization=True, command = ["ruff", "format", "--config", "pyproject.toml", "--silent", "-"]
experimental_string_processing=False, if is_pyi:
) command.extend(["--stdin-filename", "test.pyi"])
try: process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
return black.format_file_contents(content, fast=True, mode=mode) stdout, _ = process.communicate(input=code.encode("utf-8"))
except black.NothingChanged: return stdout.decode("utf-8")
return content
def write(module, directory, origin, check=False): def write(module, directory, origin, check=False):
@ -141,7 +138,7 @@ def write(module, directory, origin, check=False):
filename = os.path.join(directory, "__init__.pyi") filename = os.path.join(directory, "__init__.pyi")
pyi_content = pyi_file(module) pyi_content = pyi_file(module)
pyi_content = do_black(pyi_content, is_pyi=True) pyi_content = do_ruff(pyi_content, is_pyi=True)
os.makedirs(directory, exist_ok=True) os.makedirs(directory, exist_ok=True)
if check: if check:
with open(filename, "r") as f: with open(filename, "r") as f:
@ -153,7 +150,7 @@ def write(module, directory, origin, check=False):
filename = os.path.join(directory, "__init__.py") filename = os.path.join(directory, "__init__.py")
py_content = py_file(module, origin) py_content = py_file(module, origin)
py_content = do_black(py_content, is_pyi=False) py_content = do_ruff(py_content, is_pyi=False)
os.makedirs(directory, exist_ok=True) os.makedirs(directory, exist_ok=True)
is_auto = False is_auto = False
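The new do_ruff helper shells out to ruff over stdin instead of calling black as a library. A small standalone sketch of the same invocation (it assumes the `ruff` binary is on PATH and is run from the repository root so `pyproject.toml` resolves):

import subprocess

code = "def   f(x )  :\n        return   x\n"
# Same command stub.py now builds: format the code read from stdin with the repo config.
proc = subprocess.run(
    ["ruff", "format", "--config", "pyproject.toml", "--silent", "-"],
    input=code.encode("utf-8"),
    capture_output=True,
)
print(proc.stdout.decode("utf-8"))
# Roughly:
# def f(x):
#     return x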

View File

@ -3,7 +3,6 @@ import pickle
import pytest import pytest
from tokenizers.models import BPE, Model, WordLevel, WordPiece from tokenizers.models import BPE, Model, WordLevel, WordPiece
from ..utils import bert_files, data_dir, roberta_files from ..utils import bert_files, data_dir, roberta_files

View File

@ -2,8 +2,7 @@ import pickle
import pytest import pytest
from tokenizers import NormalizedString, Tokenizer from tokenizers import NormalizedString
from tokenizers.models import BPE
from tokenizers.normalizers import BertNormalizer, Lowercase, Normalizer, Sequence, Strip, Prepend from tokenizers.normalizers import BertNormalizer, Lowercase, Normalizer, Sequence, Strip, Prepend

View File

@ -146,18 +146,18 @@ class TestTemplateProcessing:
assert isinstance(pickle.loads(pickle.dumps(bert)), TemplateProcessing) assert isinstance(pickle.loads(pickle.dumps(bert)), TemplateProcessing)
# It is absolutely legal to have tokens with spaces in the name: # It is absolutely legal to have tokens with spaces in the name:
processor = TemplateProcessing( TemplateProcessing(
single=["[ C L S ]", "Token with space"], single=["[ C L S ]", "Token with space"],
special_tokens=[("[ C L S ]", 0), ("Token with space", 1)], special_tokens=[("[ C L S ]", 0), ("Token with space", 1)],
) )
# Sequence identifiers must be well formed: # Sequence identifiers must be well formed:
with pytest.raises(Exception, match="Cannot build Piece"): with pytest.raises(Exception, match="Cannot build Piece"):
processor = TemplateProcessing(single="[CLS] $$ [SEP]") TemplateProcessing(single="[CLS] $$ [SEP]")
with pytest.raises(Exception, match="Cannot build Piece"): with pytest.raises(Exception, match="Cannot build Piece"):
processor = TemplateProcessing(single="[CLS] $A: [SEP]") TemplateProcessing(single="[CLS] $A: [SEP]")
# Special tokens must be provided when used in template: # Special tokens must be provided when used in template:
with pytest.raises(Exception, match="Missing SpecialToken\\(s\\) with id\\(s\\)"): with pytest.raises(Exception, match="Missing SpecialToken\\(s\\) with id\\(s\\)"):
processor = TemplateProcessing(single=["[CLS]"]) TemplateProcessing(single=["[CLS]"])
def test_bert_parity(self): def test_bert_parity(self):
tokenizer = Tokenizer(BPE()) tokenizer = Tokenizer(BPE())

View File

@ -5,10 +5,9 @@ import pytest
from tokenizers import AddedToken, Encoding, Tokenizer from tokenizers import AddedToken, Encoding, Tokenizer
from tokenizers.implementations import BertWordPieceTokenizer from tokenizers.implementations import BertWordPieceTokenizer
from tokenizers.models import BPE, Model, WordPiece, Unigram from tokenizers.models import BPE, Model, Unigram
from tokenizers.normalizers import Lowercase
from tokenizers.pre_tokenizers import ByteLevel from tokenizers.pre_tokenizers import ByteLevel
from tokenizers.processors import BertProcessing, RobertaProcessing from tokenizers.processors import RobertaProcessing
from ..utils import bert_files, data_dir, multiprocessing_with_parallelism, roberta_files from ..utils import bert_files, data_dir, multiprocessing_with_parallelism, roberta_files

View File

@ -2,7 +2,6 @@ from tokenizers import Tokenizer
from ..utils import data_dir, doc_pipeline_bert_tokenizer, doc_wiki_tokenizer from ..utils import data_dir, doc_pipeline_bert_tokenizer, doc_wiki_tokenizer
disable_printing = True disable_printing = True
original_print = print original_print = print

View File

@ -1,8 +1,4 @@
from tokenizers import Tokenizer from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import BpeTrainer
from ..utils import data_dir, doc_wiki_tokenizer from ..utils import data_dir, doc_wiki_tokenizer

View File

@ -1,3 +1,4 @@
# flake8: noqa
import gzip import gzip
import os import os

View File

@ -1,5 +1,3 @@
import pytest
from tokenizers import Tokenizer, decoders, models, normalizers, pre_tokenizers, processors from tokenizers import Tokenizer, decoders, models, normalizers, pre_tokenizers, processors
from tokenizers.implementations import BaseTokenizer from tokenizers.implementations import BaseTokenizer

View File

@ -1,5 +1,3 @@
import pytest
from tokenizers import BertWordPieceTokenizer from tokenizers import BertWordPieceTokenizer
from ..utils import bert_files, data_dir, multiprocessing_with_parallelism from ..utils import bert_files, data_dir, multiprocessing_with_parallelism

View File

@ -1,5 +1,3 @@
import pytest
from tokenizers import ByteLevelBPETokenizer from tokenizers import ByteLevelBPETokenizer
from ..utils import data_dir, multiprocessing_with_parallelism, roberta_files from ..utils import data_dir, multiprocessing_with_parallelism, roberta_files

View File

@ -1,5 +1,3 @@
import pytest
from tokenizers import CharBPETokenizer from tokenizers import CharBPETokenizer
from ..utils import data_dir, multiprocessing_with_parallelism, openai_files from ..utils import data_dir, multiprocessing_with_parallelism, openai_files

View File

@ -1,5 +1,3 @@
import os
import pytest import pytest
from tokenizers import SentencePieceBPETokenizer, SentencePieceUnigramTokenizer from tokenizers import SentencePieceBPETokenizer, SentencePieceUnigramTokenizer

View File

@ -6,7 +6,6 @@ import tqdm
from huggingface_hub import HfApi, cached_download, hf_hub_url from huggingface_hub import HfApi, cached_download, hf_hub_url
from tokenizers import Tokenizer from tokenizers import Tokenizer
from .utils import albert_base, data_dir from .utils import albert_base, data_dir
@ -15,7 +14,7 @@ class TestSerialization:
# Check we can read this file. # Check we can read this file.
# This used to fail because of BufReader that would fail because the # This used to fail because of BufReader that would fail because the
# file exceeds the buffer capacity # file exceeds the buffer capacity
tokenizer = Tokenizer.from_file(albert_base) Tokenizer.from_file(albert_base)
def check(tokenizer_file) -> bool: def check(tokenizer_file) -> bool:
@ -51,8 +50,6 @@ class TestFullDeserialization(unittest.TestCase):
# Check we can read this file. # Check we can read this file.
# This used to fail because of BufReader that would fail because the # This used to fail because of BufReader that would fail because the
# file exceeds the buffer capacity # file exceeds the buffer capacity
api = HfApi()
not_loadable = [] not_loadable = []
invalid_pre_tokenizer = [] invalid_pre_tokenizer = []
@ -77,7 +74,7 @@ class TestFullDeserialization(unittest.TestCase):
except Exception as e: except Exception as e:
print(f"{model_id} is not loadable: {e}") print(f"{model_id} is not loadable: {e}")
not_loadable.append(model_id) not_loadable.append(model_id)
except: except: # noqa: E722
print(f"{model_id} is not loadable: Rust error") print(f"{model_id} is not loadable: Rust error")
not_loadable.append(model_id) not_loadable.append(model_id)