[remove black] And use ruff (#1436)

* nits

* Fixing deps.

* Ruff update.

* Import order matters.

* Fix.

* Revert ruff fix.

* Visualizer.

* Putting back the imports.

---------

Co-authored-by: Nicolas Patry <patry.nicolas@protonmail.com>
Author: Arthur
Date: 2024-03-12 21:24:21 +11:00
Committed by: GitHub
Parent: 72a1973cd1
Commit: 29fef1e7aa
29 changed files with 258 additions and 169 deletions

View File

@ -8,12 +8,14 @@ check_dirs := examples py_src/tokenizers tests
# Format source code automatically
style:
python stub.py
black --line-length 119 --target-version py35 $(check_dirs)
ruff check $(check_dirs) --fix
ruff format $(check_dirs)
# Check the source code is formatted correctly
check-style:
python stub.py --check
black --check --line-length 119 --target-version py35 examples py_src/tokenizers tests
ruff check examples py_src/tokenizers tests
ruff format --check examples py_src/tokenizers tests
TESTS_RESOURCES = $(DATA_DIR)/small.txt $(DATA_DIR)/roberta.json
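For reference, a rough Python equivalent of the two Make targets above (a sketch only; it assumes `ruff` is on PATH and is run from the bindings directory):

import subprocess

CHECK_DIRS = ["examples", "py_src/tokenizers", "tests"]

def style():
    # Regenerate the stubs, then auto-fix lint issues and reformat in place.
    subprocess.run(["python", "stub.py"], check=True)
    subprocess.run(["ruff", "check", *CHECK_DIRS, "--fix"], check=True)
    subprocess.run(["ruff", "format", *CHECK_DIRS], check=True)

def check_style():
    # Verification-only mode; a non-zero exit code means formatting drift.
    subprocess.run(["python", "stub.py", "--check"], check=True)
    subprocess.run(["ruff", "check", *CHECK_DIRS], check=True)
    subprocess.run(["ruff", "format", "--check", *CHECK_DIRS], check=True)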

View File

@ -4,16 +4,15 @@ import time
from tqdm import tqdm
logging.getLogger("transformers").disabled = True
logging.getLogger("transformers.tokenization_utils").disabled = True
from tokenizers import Tokenizer, decoders, pre_tokenizers
from tokenizers.models import BPE, WordPiece
from tokenizers.normalizers import BertNormalizer
from tokenizers.processors import BertProcessing
from transformers import BertTokenizer, GPT2Tokenizer
logging.getLogger("transformers").disabled = True
logging.getLogger("transformers.tokenization_utils").disabled = True
parser = argparse.ArgumentParser()
parser.add_argument("--type", default="gpt2", type=str, help="The type of tokenizer (bert|gpt2)")
@ -51,9 +50,7 @@ Although never is often better than *right* now.
If the implementation is hard to explain, it's a bad idea.
If the implementation is easy to explain, it may be a good idea.
Namespaces are one honking great idea -- let's do more of those!
""".split(
"\n"
)
""".split("\n")
if args.type == "gpt2":
print("Running GPT-2 tokenizer")

View File

@ -1,6 +1,6 @@
import datasets
from tokenizers import Tokenizer, models, normalizers, pre_tokenizers, trainers
from tokenizers import Tokenizer, models, normalizers, pre_tokenizers
# Build a tokenizer

View File

@ -34,39 +34,44 @@ class AddedToken:
Defines whether this token should be skipped when decoding.
"""
def __init__(self, content, single_word=False, lstrip=False, rstrip=False, normalized=True, special=False):
pass
@property
def content(self):
"""
Get the content of this :obj:`AddedToken`
"""
pass
@property
def lstrip(self):
"""
Get the value of the :obj:`lstrip` option
"""
pass
@property
def normalized(self):
"""
Get the value of the :obj:`normalized` option
"""
pass
@property
def rstrip(self):
"""
Get the value of the :obj:`rstrip` option
"""
pass
@property
def single_word(self):
"""
Get the value of the :obj:`single_word` option
"""
pass
@property
def special(self):
"""
@ -78,7 +83,6 @@ class Encoding:
"""
The :class:`~tokenizers.Encoding` represents the output of a :class:`~tokenizers.Tokenizer`.
"""
@property
def attention_mask(self):
"""
@ -92,6 +96,7 @@ class Encoding:
:obj:`List[int]`: The attention mask
"""
pass
def char_to_token(self, char_pos, sequence_index=0):
"""
Get the token that contains the char at the given position in the input sequence.
@ -106,6 +111,7 @@ class Encoding:
:obj:`int`: The index of the token that contains this char in the encoded sequence
"""
pass
def char_to_word(self, char_pos, sequence_index=0):
"""
Get the word that contains the char at the given position in the input sequence.
@ -120,6 +126,7 @@ class Encoding:
:obj:`int`: The index of the word that contains this char in the input sequence
"""
pass
@property
def ids(self):
"""
@ -132,6 +139,7 @@ class Encoding:
:obj:`List[int]`: The list of IDs
"""
pass
@staticmethod
def merge(encodings, growing_offsets=True):
"""
@ -148,6 +156,7 @@ class Encoding:
:class:`~tokenizers.Encoding`: The resulting Encoding
"""
pass
@property
def n_sequences(self):
"""
@ -157,6 +166,7 @@ class Encoding:
:obj:`int`: The number of sequences in this :class:`~tokenizers.Encoding`
"""
pass
@property
def offsets(self):
"""
@ -169,6 +179,7 @@ class Encoding:
A :obj:`List` of :obj:`Tuple[int, int]`: The list of offsets
"""
pass
@property
def overflowing(self):
"""
@ -183,6 +194,7 @@ class Encoding:
maximum length.
"""
pass
def pad(self, length, direction="right", pad_id=0, pad_type_id=0, pad_token="[PAD]"):
"""
Pad the :class:`~tokenizers.Encoding` at the given length
@ -204,6 +216,7 @@ class Encoding:
The pad token to use
"""
pass
@property
def sequence_ids(self):
"""
@ -217,6 +230,7 @@ class Encoding:
A :obj:`List` of :obj:`Optional[int]`: A list of optional sequence index.
"""
pass
def set_sequence_id(self, sequence_id):
"""
Set the given sequence index
@ -225,6 +239,7 @@ class Encoding:
:class:`~tokenizers.Encoding`.
"""
pass
@property
def special_tokens_mask(self):
"""
@ -236,6 +251,7 @@ class Encoding:
:obj:`List[int]`: The special tokens mask
"""
pass
def token_to_chars(self, token_index):
"""
Get the offsets of the token at the given index.
@ -252,6 +268,7 @@ class Encoding:
:obj:`Tuple[int, int]`: The token offsets :obj:`(first, last + 1)`
"""
pass
def token_to_sequence(self, token_index):
"""
Get the index of the sequence represented by the given token.
@ -267,6 +284,7 @@ class Encoding:
:obj:`int`: The sequence id of the given token
"""
pass
def token_to_word(self, token_index):
"""
Get the index of the word that contains the token in one of the input sequences.
@ -283,6 +301,7 @@ class Encoding:
:obj:`int`: The index of the word in the relevant input sequence.
"""
pass
@property
def tokens(self):
"""
@ -294,6 +313,7 @@ class Encoding:
:obj:`List[str]`: The list of tokens
"""
pass
def truncate(self, max_length, stride=0, direction="right"):
"""
Truncate the :class:`~tokenizers.Encoding` at the given length
@ -312,6 +332,7 @@ class Encoding:
Truncate direction
"""
pass
@property
def type_ids(self):
"""
@ -324,6 +345,7 @@ class Encoding:
:obj:`List[int]`: The list of type ids
"""
pass
@property
def word_ids(self):
"""
@ -341,6 +363,7 @@ class Encoding:
A :obj:`List` of :obj:`Optional[int]`: A list of optional word index.
"""
pass
def word_to_chars(self, word_index, sequence_index=0):
"""
Get the offsets of the word at the given index in one of the input sequences.
@ -355,6 +378,7 @@ class Encoding:
:obj:`Tuple[int, int]`: The range of characters (span) :obj:`(first, last + 1)`
"""
pass
def word_to_tokens(self, word_index, sequence_index=0):
"""
Get the encoded tokens corresponding to the word at the given index
@ -370,6 +394,7 @@ class Encoding:
:obj:`Tuple[int, int]`: The range of tokens: :obj:`(first, last + 1)`
"""
pass
@property
def words(self):
"""
@ -404,37 +429,42 @@ class NormalizedString:
sequence: str:
The string sequence used to initialize this NormalizedString
"""
def append(self, s):
"""
Append the given sequence to the string
"""
pass
def clear(self):
"""
Clears the string
"""
pass
def filter(self, func):
"""
Filter each character of the string using the given func
"""
pass
def for_each(self, func):
"""
Calls the given function for each character of the string
"""
pass
def lowercase(self):
"""
Lowercase the string
"""
pass
def lstrip(self):
"""
Strip the left of the string
"""
pass
def map(self, func):
"""
Calls the given function for each character of the string
@ -443,37 +473,44 @@ class NormalizedString:
returned value **must** be a str of length 1 (ie a character).
"""
pass
def nfc(self):
"""
Runs the NFC normalization
"""
pass
def nfd(self):
"""
Runs the NFD normalization
"""
pass
def nfkc(self):
"""
Runs the NFKC normalization
"""
pass
def nfkd(self):
"""
Runs the NFKD normalization
"""
pass
@property
def normalized(self):
"""
The normalized part of the string
"""
pass
def prepend(self, s):
"""
Prepend the given sequence to the string
"""
pass
def replace(self, pattern, content):
"""
Replace the content of the given pattern with the provided content
@ -486,16 +523,19 @@ class NormalizedString:
The content to be used as replacement
"""
pass
def rstrip(self):
"""
Strip the right of the string
"""
pass
def slice(self, range):
"""
Slice the string using the given range
"""
pass
def split(self, pattern, behavior):
"""
Split the NormalizedString using the given pattern and the specified behavior
@ -513,11 +553,13 @@ class NormalizedString:
A list of NormalizedString, representing each split
"""
pass
def strip(self):
"""
Strip both ends of the string
"""
pass
def uppercase(self):
"""
Uppercase the string
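These in-place operations are the toolbox for custom normalizers. A hedged sketch of one built on this interface (Normalizer.custom is assumed from the Python bindings; the class name is made up):

from tokenizers.normalizers import Normalizer

class DigitStripper:
    def normalize(self, normalized):
        # The NormalizedString is mutated in place; nothing is returned.
        normalized.nfkc()
        normalized.lowercase()
        normalized.filter(lambda c: not c.isdigit())

# tokenizer.normalizer = Normalizer.custom(DigitStripper())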
@ -542,9 +584,9 @@ class PreTokenizedString:
sequence: str:
The string sequence used to initialize this PreTokenizedString
"""
def __init__(self, sequence):
pass
def get_splits(self, offset_referential="original", offset_type="char"):
"""
Get the splits currently managed by the PreTokenizedString
@ -565,6 +607,7 @@ class PreTokenizedString:
A list of splits
"""
pass
def normalize(self, func):
"""
Normalize each split of the `PreTokenizedString` using the given `func`
@ -576,6 +619,7 @@ class PreTokenizedString:
NormalizedString allows its modification.
"""
pass
def split(self, func):
"""
Split the PreTokenizedString using the given `func`
@ -590,6 +634,7 @@ class PreTokenizedString:
should come from calling either `.split` or `.slice` on the received one.
"""
pass
def to_encoding(self, type_id=0, word_idx=None):
"""
Return an Encoding generated from this PreTokenizedString
@ -607,6 +652,7 @@ class PreTokenizedString:
An Encoding
"""
pass
def tokenize(self, func):
"""
Tokenize each split of the `PreTokenizedString` using the given `func`
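A sketch of a custom pre-tokenizer driving these methods (PreTokenizer.custom is assumed from the Python bindings; the class name is made up):

from tokenizers.pre_tokenizers import PreTokenizer

class SpaceSplitter:
    def pre_tokenize(self, pretok):
        # The callback receives (index, NormalizedString) and must return
        # NormalizedStrings produced via .split or .slice, as noted above.
        pretok.split(lambda i, normalized: normalized.split(" ", "removed"))

# tokenizer.pre_tokenizer = PreTokenizer.custom(SpaceSplitter())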
@ -622,7 +668,6 @@ class Regex:
"""
Instantiate a new Regex with the given pattern
"""
def __init__(self, pattern):
pass
@ -639,9 +684,9 @@ class Tokenizer:
The core algorithm that this :obj:`Tokenizer` should be using.
"""
def __init__(self, model):
pass
def add_special_tokens(self, tokens):
"""
Add the given special tokens to the Tokenizer.
@ -662,6 +707,7 @@ class Tokenizer:
:obj:`int`: The number of tokens that were created in the vocabulary
"""
pass
def add_tokens(self, tokens):
"""
Add the given tokens to the vocabulary
@ -678,6 +724,7 @@ class Tokenizer:
:obj:`int`: The number of tokens that were created in the vocabulary
"""
pass
def decode(self, ids, skip_special_tokens=True):
"""
Decode the given list of ids back to a string
@ -695,6 +742,7 @@ class Tokenizer:
:obj:`str`: The decoded string
"""
pass
def decode_batch(self, sequences, skip_special_tokens=True):
"""
Decode a batch of ids back to their corresponding string
@ -710,12 +758,14 @@ class Tokenizer:
:obj:`List[str]`: A list of decoded strings
"""
pass
@property
def decoder(self):
"""
The `optional` :class:`~tokenizers.decoders.Decoder` in use by the Tokenizer
"""
pass
def enable_padding(
self, direction="right", pad_id=0, pad_type_id=0, pad_token="[PAD]", length=None, pad_to_multiple_of=None
):
@ -745,6 +795,7 @@ class Tokenizer:
the longest sequence in a batch.
"""
pass
def enable_truncation(self, max_length, stride=0, strategy="longest_first", direction="right"):
"""
Enable truncation
@ -765,6 +816,7 @@ class Tokenizer:
Truncate direction
"""
pass
def encode(self, sequence, pair=None, is_pretokenized=False, add_special_tokens=True):
"""
Encode the given sequence and pair. This method can process raw text sequences
@ -803,6 +855,7 @@ class Tokenizer:
"""
pass
def encode_batch(self, input, is_pretokenized=False, add_special_tokens=True):
"""
Encode the given batch of inputs. This method accept both raw text sequences
@ -838,6 +891,7 @@ class Tokenizer:
"""
pass
@property
def encode_special_tokens(self):
"""
@ -850,6 +904,7 @@ class Tokenizer:
"""
pass
@staticmethod
def from_buffer(buffer):
"""
@ -863,6 +918,7 @@ class Tokenizer:
:class:`~tokenizers.Tokenizer`: The new tokenizer
"""
pass
@staticmethod
def from_file(path):
"""
@ -877,6 +933,7 @@ class Tokenizer:
:class:`~tokenizers.Tokenizer`: The new tokenizer
"""
pass
@staticmethod
def from_pretrained(identifier, revision="main", auth_token=None):
"""
@ -897,6 +954,7 @@ class Tokenizer:
:class:`~tokenizers.Tokenizer`: The new tokenizer
"""
pass
@staticmethod
def from_str(json):
"""
@ -911,6 +969,7 @@ class Tokenizer:
:class:`~tokenizers.Tokenizer`: The new tokenizer
"""
pass
def get_added_tokens_decoder(self):
"""
Get the underlying vocabulary
@ -919,6 +978,7 @@ class Tokenizer:
:obj:`Dict[int, AddedToken]`: The vocabulary
"""
pass
def get_vocab(self, with_added_tokens=True):
"""
Get the underlying vocabulary
@ -931,6 +991,7 @@ class Tokenizer:
:obj:`Dict[str, int]`: The vocabulary
"""
pass
def get_vocab_size(self, with_added_tokens=True):
"""
Get the size of the underlying vocabulary
@ -943,6 +1004,7 @@ class Tokenizer:
:obj:`int`: The size of the vocabulary
"""
pass
def id_to_token(self, id):
"""
Convert the given id to its corresponding token if it exists
@ -955,28 +1017,33 @@ class Tokenizer:
:obj:`Optional[str]`: An optional token, :obj:`None` if out of vocabulary
"""
pass
@property
def model(self):
"""
The :class:`~tokenizers.models.Model` in use by the Tokenizer
"""
pass
def no_padding(self):
"""
Disable padding
"""
pass
def no_truncation(self):
"""
Disable truncation
"""
pass
@property
def normalizer(self):
"""
The `optional` :class:`~tokenizers.normalizers.Normalizer` in use by the Tokenizer
"""
pass
def num_special_tokens_to_add(self, is_pair):
"""
Return the number of special tokens that would be added for single/pair sentences.
@ -984,6 +1051,7 @@ class Tokenizer:
:return:
"""
pass
@property
def padding(self):
"""
@ -996,6 +1064,7 @@ class Tokenizer:
A dict with the current padding parameters if padding is enabled
"""
pass
def post_process(self, encoding, pair=None, add_special_tokens=True):
"""
Apply all the post-processing steps to the given encodings.
@ -1022,18 +1091,21 @@ class Tokenizer:
:class:`~tokenizers.Encoding`: The final post-processed encoding
"""
pass
@property
def post_processor(self):
"""
The `optional` :class:`~tokenizers.processors.PostProcessor` in use by the Tokenizer
"""
pass
@property
def pre_tokenizer(self):
"""
The `optional` :class:`~tokenizers.pre_tokenizers.PreTokenizer` in use by the Tokenizer
"""
pass
def save(self, path, pretty=True):
"""
Save the :class:`~tokenizers.Tokenizer` to the file at the given path.
@ -1046,6 +1118,7 @@ class Tokenizer:
Whether the JSON file should be pretty formatted.
"""
pass
def to_str(self, pretty=False):
"""
Gets a serialized string representing this :class:`~tokenizers.Tokenizer`.
@ -1058,6 +1131,7 @@ class Tokenizer:
:obj:`str`: A string representing the serialized Tokenizer
"""
pass
def token_to_id(self, token):
"""
Convert the given token to its corresponding id if it exists
@ -1070,6 +1144,7 @@ class Tokenizer:
:obj:`Optional[int]`: An optional id, :obj:`None` if out of vocabulary
"""
pass
def train(self, files, trainer=None):
"""
Train the Tokenizer using the given files.
@ -1086,6 +1161,7 @@ class Tokenizer:
An optional trainer that should be used to train our Model
"""
pass
def train_from_iterator(self, iterator, trainer=None, length=None):
"""
Train the Tokenizer using the provided iterator.
@ -1109,6 +1185,7 @@ class Tokenizer:
provide meaningful progress tracking
"""
pass
@property
def truncation(self):
"""

View File

@ -6,7 +6,6 @@ class Decoder:
This class is not supposed to be instantiated directly. Instead, any implementation of
a Decoder will return an instance of this class when instantiated.
"""
def decode(self, tokens):
"""
Decode the given list of tokens to a final string
@ -29,9 +28,9 @@ class BPEDecoder(Decoder):
The suffix that was used to characterize an end-of-word. This suffix will
be replaced by whitespaces during the decoding
"""
def __init__(self, suffix="</w>"):
pass
def decode(self, tokens):
"""
Decode the given list of tokens to a final string
@ -53,9 +52,9 @@ class ByteFallback(Decoder):
cannot be decoded, you will get � instead for each inconvertible byte token
"""
def __init__(self):
pass
def decode(self, tokens):
"""
Decode the given list of tokens to a final string
@ -76,9 +75,9 @@ class ByteLevel(Decoder):
This decoder is to be used in tandem with the :class:`~tokenizers.pre_tokenizers.ByteLevel`
:class:`~tokenizers.pre_tokenizers.PreTokenizer`.
"""
def __init__(self):
pass
def decode(self, tokens):
"""
Decode the given list of tokens to a final string
@ -105,9 +104,9 @@ class CTC(Decoder):
Whether to cleanup some tokenization artifacts.
Mainly spaces before punctuation, and some abbreviated English forms.
"""
def __init__(self, pad_token="<pad>", word_delimiter_token="|", cleanup=True):
pass
def decode(self, tokens):
"""
Decode the given list of tokens to a final string
@ -128,9 +127,9 @@ class Fuse(Decoder):
This is the last step of decoding, this decoder exists only if
there is need to add other decoders *after* the fusion
"""
def __init__(self):
pass
def decode(self, tokens):
"""
Decode the given list of tokens to a final string
@ -157,9 +156,9 @@ class Metaspace(Decoder):
Whether to add a space to the first word if there isn't already one. This
lets us treat `hello` exactly like `say hello`.
"""
def __init__(self, replacement="", add_prefix_space=True):
pass
def decode(self, tokens):
"""
Decode the given list of tokens to a final string
@ -180,9 +179,9 @@ class Replace(Decoder):
This decoder is to be used in tandem with the :class:`~tokenizers.pre_tokenizers.Replace`
:class:`~tokenizers.pre_tokenizers.PreTokenizer`.
"""
def __init__(self, pattern, content):
pass
def decode(self, tokens):
"""
Decode the given list of tokens to a final string
@ -204,9 +203,9 @@ class Sequence(Decoder):
decoders (:obj:`List[Decoder]`)
The decoders that need to be chained
"""
def __init__(self, decoders):
pass
def decode(self, tokens):
"""
Decode the given list of tokens to a final string
@ -225,9 +224,9 @@ class Strip(Decoder):
Strip normalizer
Strips n left characters of each token, or n right characters of each token
"""
def __init__(self, content, left=0, right=0):
pass
def decode(self, tokens):
"""
Decode the given list of tokens to a final string
@ -253,9 +252,9 @@ class WordPiece(Decoder):
Whether to cleanup some tokenization artifacts. Mainly spaces before punctuation,
and some abbreviated English forms.
"""
def __init__(self, prefix="##", cleanup=True):
pass
def decode(self, tokens):
"""
Decode the given list of tokens to a final string
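Every decoder above exposes the same decode(tokens) surface; a short sketch:

from tokenizers import decoders

wordpiece = decoders.WordPiece(prefix="##", cleanup=True)
print(wordpiece.decode(["hug", "##ging", "face"]))  # -> "hugging face"

metaspace = decoders.Metaspace()
print(metaspace.decode(["▁Hello", "▁world"]))       # -> "Hello world"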

View File

@ -8,7 +8,6 @@ class Model:
This class cannot be constructed directly. Please use one of the concrete models.
"""
def get_trainer(self):
"""
Get the associated :class:`~tokenizers.trainers.Trainer`
@ -20,6 +19,7 @@ class Model:
:class:`~tokenizers.trainers.Trainer`: The Trainer used to train this model
"""
pass
def id_to_token(self, id):
"""
Get the token associated to an ID
@ -32,6 +32,7 @@ class Model:
:obj:`str`: The token associated to the ID
"""
pass
def save(self, folder, prefix):
"""
Save the current model
@ -51,6 +52,7 @@ class Model:
:obj:`List[str]`: The list of saved files
"""
pass
def token_to_id(self, tokens):
"""
Get the ID associated to a token
@ -63,6 +65,7 @@ class Model:
:obj:`int`: The ID associated to the token
"""
pass
def tokenize(self, sequence):
"""
Tokenize a sequence
@ -110,7 +113,6 @@ class BPE(Model):
byte_fallback (:obj:`bool`, `optional`):
Whether to use spm byte-fallback trick (defaults to False)
"""
def __init__(
self,
vocab=None,
@ -124,6 +126,7 @@ class BPE(Model):
byte_fallback=False,
):
pass
@staticmethod
def from_file(cls, vocab, merge, **kwargs):
"""
@ -149,6 +152,7 @@ class BPE(Model):
:class:`~tokenizers.models.BPE`: An instance of BPE loaded from these files
"""
pass
def get_trainer(self):
"""
Get the associated :class:`~tokenizers.trainers.Trainer`
@ -160,6 +164,7 @@ class BPE(Model):
:class:`~tokenizers.trainers.Trainer`: The Trainer used to train this model
"""
pass
def id_to_token(self, id):
"""
Get the token associated to an ID
@ -172,6 +177,7 @@ class BPE(Model):
:obj:`str`: The token associated to the ID
"""
pass
@staticmethod
def read_file(self, vocab, merges):
"""
@ -193,6 +199,7 @@ class BPE(Model):
The vocabulary and merges loaded into memory
"""
pass
def save(self, folder, prefix):
"""
Save the current model
@ -212,6 +219,7 @@ class BPE(Model):
:obj:`List[str]`: The list of saved files
"""
pass
def token_to_id(self, tokens):
"""
Get the ID associated to a token
@ -224,6 +232,7 @@ class BPE(Model):
:obj:`int`: The ID associated to the token
"""
pass
def tokenize(self, sequence):
"""
Tokenize a sequence
@ -245,9 +254,9 @@ class Unigram(Model):
vocab (:obj:`List[Tuple[str, float]]`, `optional`):
A list of vocabulary items and their relative score [("am", -0.2442),...]
"""
def __init__(self, vocab, unk_id, byte_fallback):
pass
def get_trainer(self):
"""
Get the associated :class:`~tokenizers.trainers.Trainer`
@ -259,6 +268,7 @@ class Unigram(Model):
:class:`~tokenizers.trainers.Trainer`: The Trainer used to train this model
"""
pass
def id_to_token(self, id):
"""
Get the token associated to an ID
@ -271,6 +281,7 @@ class Unigram(Model):
:obj:`str`: The token associated to the ID
"""
pass
def save(self, folder, prefix):
"""
Save the current model
@ -290,6 +301,7 @@ class Unigram(Model):
:obj:`List[str]`: The list of saved files
"""
pass
def token_to_id(self, tokens):
"""
Get the ID associated to a token
@ -302,6 +314,7 @@ class Unigram(Model):
:obj:`int`: The ID associated to the token
"""
pass
def tokenize(self, sequence):
"""
Tokenize a sequence
@ -328,9 +341,9 @@ class WordLevel(Model):
unk_token (:obj:`str`, `optional`):
The unknown token to be used by the model.
"""
def __init__(self, vocab, unk_token):
pass
@staticmethod
def from_file(vocab, unk_token):
"""
@ -353,6 +366,7 @@ class WordLevel(Model):
:class:`~tokenizers.models.WordLevel`: An instance of WordLevel loaded from file
"""
pass
def get_trainer(self):
"""
Get the associated :class:`~tokenizers.trainers.Trainer`
@ -364,6 +378,7 @@ class WordLevel(Model):
:class:`~tokenizers.trainers.Trainer`: The Trainer used to train this model
"""
pass
def id_to_token(self, id):
"""
Get the token associated to an ID
@ -376,6 +391,7 @@ class WordLevel(Model):
:obj:`str`: The token associated to the ID
"""
pass
@staticmethod
def read_file(vocab):
"""
@ -393,6 +409,7 @@ class WordLevel(Model):
:obj:`Dict[str, int]`: The vocabulary as a :obj:`dict`
"""
pass
def save(self, folder, prefix):
"""
Save the current model
@ -412,6 +429,7 @@ class WordLevel(Model):
:obj:`List[str]`: The list of saved files
"""
pass
def token_to_id(self, tokens):
"""
Get the ID associated to a token
@ -424,6 +442,7 @@ class WordLevel(Model):
:obj:`int`: The ID associated to the token
"""
pass
def tokenize(self, sequence):
"""
Tokenize a sequence
@ -451,9 +470,9 @@ class WordPiece(Model):
max_input_chars_per_word (:obj:`int`, `optional`):
The maximum number of characters to authorize in a single word.
"""
def __init__(self, vocab, unk_token, max_input_chars_per_word):
pass
@staticmethod
def from_file(vocab, **kwargs):
"""
@ -476,6 +495,7 @@ class WordPiece(Model):
:class:`~tokenizers.models.WordPiece`: An instance of WordPiece loaded from file
"""
pass
def get_trainer(self):
"""
Get the associated :class:`~tokenizers.trainers.Trainer`
@ -487,6 +507,7 @@ class WordPiece(Model):
:class:`~tokenizers.trainers.Trainer`: The Trainer used to train this model
"""
pass
def id_to_token(self, id):
"""
Get the token associated to an ID
@ -499,6 +520,7 @@ class WordPiece(Model):
:obj:`str`: The token associated to the ID
"""
pass
@staticmethod
def read_file(vocab):
"""
@ -517,6 +539,7 @@ class WordPiece(Model):
:obj:`Dict[str, int]`: The vocabulary as a :obj:`dict`
"""
pass
def save(self, folder, prefix):
"""
Save the current model
@ -536,6 +559,7 @@ class WordPiece(Model):
:obj:`List[str]`: The list of saved files
"""
pass
def token_to_id(self, tokens):
"""
Get the ID associated to a token
@ -548,6 +572,7 @@ class WordPiece(Model):
:obj:`int`: The ID associated to the token
"""
pass
def tokenize(self, sequence):
"""
Tokenize a sequence
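All four models share the tokenize/save/from_file surface documented above. A small sketch with an in-memory WordLevel vocabulary (the file paths in the comment are placeholders):

from tokenizers.models import BPE, WordLevel

model = WordLevel({"hello": 0, "world": 1, "[UNK]": 2}, unk_token="[UNK]")
tokens = model.tokenize("hello")
print([(t.id, t.value) for t in tokens])  # Token objects expose id and value

# Reloading a trained BPE from the files written by model.save(folder, prefix):
# bpe = BPE.from_file("vocab.json", "merges.txt", unk_token="[UNK]")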

View File

@ -6,7 +6,6 @@ class Normalizer:
This class is not supposed to be instantiated directly. Instead, any implementation of a
Normalizer will return an instance of this class when instantiated.
"""
def normalize(self, normalized):
"""
Normalize a :class:`~tokenizers.NormalizedString` in-place
@ -22,6 +21,7 @@ class Normalizer:
:class:`~tokenizers.normalizers.Normalizer`
"""
pass
def normalize_str(self, sequence):
"""
Normalize the given string
@ -62,9 +62,9 @@ class BertNormalizer(Normalizer):
lowercase (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether to lowercase.
"""
def __init__(self, clean_text=True, handle_chinese_chars=True, strip_accents=None, lowercase=True):
pass
def normalize(self, normalized):
"""
Normalize a :class:`~tokenizers.NormalizedString` in-place
@ -80,6 +80,7 @@ class BertNormalizer(Normalizer):
:class:`~tokenizers.normalizers.Normalizer`
"""
pass
def normalize_str(self, sequence):
"""
Normalize the given string
@ -102,9 +103,9 @@ class Lowercase(Normalizer):
"""
Lowercase Normalizer
"""
def __init__(self):
pass
def normalize(self, normalized):
"""
Normalize a :class:`~tokenizers.NormalizedString` in-place
@ -120,6 +121,7 @@ class Lowercase(Normalizer):
:class:`~tokenizers.normalizers.Normalizer`
"""
pass
def normalize_str(self, sequence):
"""
Normalize the given string
@ -142,9 +144,9 @@ class NFC(Normalizer):
"""
NFC Unicode Normalizer
"""
def __init__(self):
pass
def normalize(self, normalized):
"""
Normalize a :class:`~tokenizers.NormalizedString` in-place
@ -160,6 +162,7 @@ class NFC(Normalizer):
:class:`~tokenizers.normalizers.Normalizer`
"""
pass
def normalize_str(self, sequence):
"""
Normalize the given string
@ -182,9 +185,9 @@ class NFD(Normalizer):
"""
NFD Unicode Normalizer
"""
def __init__(self):
pass
def normalize(self, normalized):
"""
Normalize a :class:`~tokenizers.NormalizedString` in-place
@ -200,6 +203,7 @@ class NFD(Normalizer):
:class:`~tokenizers.normalizers.Normalizer`
"""
pass
def normalize_str(self, sequence):
"""
Normalize the given string
@ -222,9 +226,9 @@ class NFKC(Normalizer):
"""
NFKC Unicode Normalizer
"""
def __init__(self):
pass
def normalize(self, normalized):
"""
Normalize a :class:`~tokenizers.NormalizedString` in-place
@ -240,6 +244,7 @@ class NFKC(Normalizer):
:class:`~tokenizers.normalizers.Normalizer`
"""
pass
def normalize_str(self, sequence):
"""
Normalize the given string
@ -262,9 +267,9 @@ class NFKD(Normalizer):
"""
NFKD Unicode Normalizer
"""
def __init__(self):
pass
def normalize(self, normalized):
"""
Normalize a :class:`~tokenizers.NormalizedString` in-place
@ -280,6 +285,7 @@ class NFKD(Normalizer):
:class:`~tokenizers.normalizers.Normalizer`
"""
pass
def normalize_str(self, sequence):
"""
Normalize the given string
@ -302,9 +308,9 @@ class Nmt(Normalizer):
"""
Nmt normalizer
"""
def __init__(self):
pass
def normalize(self, normalized):
"""
Normalize a :class:`~tokenizers.NormalizedString` in-place
@ -320,6 +326,7 @@ class Nmt(Normalizer):
:class:`~tokenizers.normalizers.Normalizer`
"""
pass
def normalize_str(self, sequence):
"""
Normalize the given string
@ -343,9 +350,9 @@ class Precompiled(Normalizer):
Precompiled normalizer
Don't use this manually; it is used for compatibility with SentencePiece.
"""
def __init__(self, precompiled_charsmap):
pass
def normalize(self, normalized):
"""
Normalize a :class:`~tokenizers.NormalizedString` in-place
@ -361,6 +368,7 @@ class Precompiled(Normalizer):
:class:`~tokenizers.normalizers.Normalizer`
"""
pass
def normalize_str(self, sequence):
"""
Normalize the given string
@ -383,9 +391,9 @@ class Prepend(Normalizer):
"""
Prepend normalizer
"""
def __init__(self, prepend):
pass
def normalize(self, normalized):
"""
Normalize a :class:`~tokenizers.NormalizedString` in-place
@ -401,6 +409,7 @@ class Prepend(Normalizer):
:class:`~tokenizers.normalizers.Normalizer`
"""
pass
def normalize_str(self, sequence):
"""
Normalize the given string
@ -423,9 +432,9 @@ class Replace(Normalizer):
"""
Replace normalizer
"""
def __init__(self, pattern, content):
pass
def normalize(self, normalized):
"""
Normalize a :class:`~tokenizers.NormalizedString` in-place
@ -441,6 +450,7 @@ class Replace(Normalizer):
:class:`~tokenizers.normalizers.Normalizer`
"""
pass
def normalize_str(self, sequence):
"""
Normalize the given string
@ -468,7 +478,6 @@ class Sequence(Normalizer):
normalizers (:obj:`List[Normalizer]`):
A list of Normalizer to be run as a sequence
"""
def normalize(self, normalized):
"""
Normalize a :class:`~tokenizers.NormalizedString` in-place
@ -484,6 +493,7 @@ class Sequence(Normalizer):
:class:`~tokenizers.normalizers.Normalizer`
"""
pass
def normalize_str(self, sequence):
"""
Normalize the given string
@ -506,9 +516,9 @@ class Strip(Normalizer):
"""
Strip normalizer
"""
def __init__(self, left=True, right=True):
pass
def normalize(self, normalized):
"""
Normalize a :class:`~tokenizers.NormalizedString` in-place
@ -524,6 +534,7 @@ class Strip(Normalizer):
:class:`~tokenizers.normalizers.Normalizer`
"""
pass
def normalize_str(self, sequence):
"""
Normalize the given string
@ -546,9 +557,9 @@ class StripAccents(Normalizer):
"""
StripAccents normalizer
"""
def __init__(self):
pass
def normalize(self, normalized):
"""
Normalize a :class:`~tokenizers.NormalizedString` in-place
@ -564,6 +575,7 @@ class StripAccents(Normalizer):
:class:`~tokenizers.normalizers.Normalizer`
"""
pass
def normalize_str(self, sequence):
"""
Normalize the given string
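Normalizers compose via Sequence; a minimal sketch:

from tokenizers.normalizers import NFD, Lowercase, Sequence, StripAccents

normalizer = Sequence([NFD(), StripAccents(), Lowercase()])
print(normalizer.normalize_str("Héllò Wörld"))  # -> "hello world"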

View File

@ -6,7 +6,6 @@ class PreTokenizer:
This class is not supposed to be instantiated directly. Instead, any implementation of a
PreTokenizer will return an instance of this class when instantiated.
"""
def pre_tokenize(self, pretok):
"""
Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
@ -23,6 +22,7 @@ class PreTokenizer:
:class:`~tokenizers.pre_tokenizers.PreTokenizer`
"""
pass
def pre_tokenize_str(self, sequence):
"""
Pre tokenize the given string
@ -50,9 +50,9 @@ class BertPreTokenizer(PreTokenizer):
This pre-tokenizer splits tokens on spaces, and also on punctuation.
Each occurrence of a punctuation character will be treated separately.
"""
def __init__(self):
pass
def pre_tokenize(self, pretok):
"""
Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
@ -69,6 +69,7 @@ class BertPreTokenizer(PreTokenizer):
:class:`~tokenizers.pre_tokenizers.PreTokenizer`
"""
pass
def pre_tokenize_str(self, sequence):
"""
Pre tokenize the given string
@ -104,9 +105,9 @@ class ByteLevel(PreTokenizer):
Set this to :obj:`False` to prevent this `pre_tokenizer` from using
the GPT2 specific regexp for splitting on whitespace.
"""
def __init__(self, add_prefix_space=True, use_regex=True):
pass
@staticmethod
def alphabet():
"""
@ -120,6 +121,7 @@ class ByteLevel(PreTokenizer):
:obj:`List[str]`: A list of characters that compose the alphabet
"""
pass
def pre_tokenize(self, pretok):
"""
Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
@ -136,6 +138,7 @@ class ByteLevel(PreTokenizer):
:class:`~tokenizers.pre_tokenizers.PreTokenizer`
"""
pass
def pre_tokenize_str(self, sequence):
"""
Pre tokenize the given string
@ -164,7 +167,6 @@ class CharDelimiterSplit(PreTokenizer):
delimiter: str:
The delimiter char that will be used to split input
"""
def pre_tokenize(self, pretok):
"""
Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
@ -181,6 +183,7 @@ class CharDelimiterSplit(PreTokenizer):
:class:`~tokenizers.pre_tokenizers.PreTokenizer`
"""
pass
def pre_tokenize_str(self, sequence):
"""
Pre tokenize the given string
@ -215,9 +218,9 @@ class Digits(PreTokenizer):
"Call 123 please" -> "Call ", "123", " please"
"""
def __init__(self, individual_digits=False):
pass
def pre_tokenize(self, pretok):
"""
Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
@ -234,6 +237,7 @@ class Digits(PreTokenizer):
:class:`~tokenizers.pre_tokenizers.PreTokenizer`
"""
pass
def pre_tokenize_str(self, sequence):
"""
Pre tokenize the given string
@ -270,9 +274,9 @@ class Metaspace(PreTokenizer):
Whether to add a space to the first word if there isn't already one. This
lets us treat `hello` exactly like `say hello`.
"""
def __init__(self, replacement="_", add_prefix_space=True):
pass
def pre_tokenize(self, pretok):
"""
Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
@ -289,6 +293,7 @@ class Metaspace(PreTokenizer):
:class:`~tokenizers.pre_tokenizers.PreTokenizer`
"""
pass
def pre_tokenize_str(self, sequence):
"""
Pre tokenize the given string
@ -319,9 +324,9 @@ class Punctuation(PreTokenizer):
Choices: "removed", "isolated" (default), "merged_with_previous", "merged_with_next",
"contiguous"
"""
def __init__(self, behavior="isolated"):
pass
def pre_tokenize(self, pretok):
"""
Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
@ -338,6 +343,7 @@ class Punctuation(PreTokenizer):
:class:`~tokenizers.pre_tokenizers.PreTokenizer`
"""
pass
def pre_tokenize_str(self, sequence):
"""
Pre tokenize the given string
@ -362,9 +368,9 @@ class Sequence(PreTokenizer):
"""
This pre-tokenizer composes other pre_tokenizers and applies them in sequence
"""
def __init__(self, pretokenizers):
pass
def pre_tokenize(self, pretok):
"""
Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
@ -381,6 +387,7 @@ class Sequence(PreTokenizer):
:class:`~tokenizers.pre_tokenizers.PreTokenizer`
"""
pass
def pre_tokenize_str(self, sequence):
"""
Pre tokenize the given string
@ -421,9 +428,9 @@ class Split(PreTokenizer):
invert (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether to invert the pattern.
"""
def __init__(self, pattern, behavior, invert=False):
pass
def pre_tokenize(self, pretok):
"""
Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
@ -440,6 +447,7 @@ class Split(PreTokenizer):
:class:`~tokenizers.pre_tokenizers.PreTokenizer`
"""
pass
def pre_tokenize_str(self, sequence):
"""
Pre tokenize the given string
@ -467,9 +475,9 @@ class UnicodeScripts(PreTokenizer):
Actually Hiragana and Katakana are fused with Han, and 0x30FC is Han too.
This mimics the SentencePiece Unigram implementation.
"""
def __init__(self):
pass
def pre_tokenize(self, pretok):
"""
Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
@ -486,6 +494,7 @@ class UnicodeScripts(PreTokenizer):
:class:`~tokenizers.pre_tokenizers.PreTokenizer`
"""
pass
def pre_tokenize_str(self, sequence):
"""
Pre tokenize the given string
@ -510,9 +519,9 @@ class Whitespace(PreTokenizer):
"""
This pre-tokenizer simply splits using the following regex: `\w+|[^\w\s]+`
"""
def __init__(self):
pass
def pre_tokenize(self, pretok):
"""
Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
@ -529,6 +538,7 @@ class Whitespace(PreTokenizer):
:class:`~tokenizers.pre_tokenizers.PreTokenizer`
"""
pass
def pre_tokenize_str(self, sequence):
"""
Pre tokenize the given string
@ -553,9 +563,9 @@ class WhitespaceSplit(PreTokenizer):
"""
This pre-tokenizer simply splits on whitespace. It works like `.split()`
"""
def __init__(self):
pass
def pre_tokenize(self, pretok):
"""
Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
@ -572,6 +582,7 @@ class WhitespaceSplit(PreTokenizer):
:class:`~tokenizers.pre_tokenizers.PreTokenizer`
"""
pass
def pre_tokenize_str(self, sequence):
"""
Pre tokenize the given string
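Pre-tokenizers also chain through Sequence; a sketch echoing the Digits docstring above:

from tokenizers.pre_tokenizers import Digits, Sequence, Whitespace

pre_tok = Sequence([Whitespace(), Digits(individual_digits=True)])
print(pre_tok.pre_tokenize_str("Call 123 please"))
# -> [('Call', (0, 4)), ('1', (5, 6)), ('2', (6, 7)), ('3', (7, 8)), ('please', (9, 15))]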

View File

@ -6,7 +6,6 @@ class PostProcessor:
This class is not supposed to be instantiated directly. Instead, any implementation of
a PostProcessor will return an instance of this class when instantiated.
"""
def num_special_tokens_to_add(self, is_pair):
"""
Return the number of special tokens that would be added for single/pair sentences.
@ -19,6 +18,7 @@ class PostProcessor:
:obj:`int`: The number of tokens to add
"""
pass
def process(self, encoding, pair=None, add_special_tokens=True):
"""
Post-process the given encodings, generating the final one
@ -53,9 +53,9 @@ class BertProcessing(PostProcessor):
cls (:obj:`Tuple[str, int]`):
A tuple with the string representation of the CLS token, and its id
"""
def __init__(self, sep, cls):
pass
def num_special_tokens_to_add(self, is_pair):
"""
Return the number of special tokens that would be added for single/pair sentences.
@ -68,6 +68,7 @@ class BertProcessing(PostProcessor):
:obj:`int`: The number of tokens to add
"""
pass
def process(self, encoding, pair=None, add_special_tokens=True):
"""
Post-process the given encodings, generating the final one
@ -98,9 +99,9 @@ class ByteLevel(PostProcessor):
trim_offsets (:obj:`bool`):
Whether to trim the whitespaces from the produced offsets.
"""
def __init__(self, trim_offsets=True):
pass
def num_special_tokens_to_add(self, is_pair):
"""
Return the number of special tokens that would be added for single/pair sentences.
@ -113,6 +114,7 @@ class ByteLevel(PostProcessor):
:obj:`int`: The number of tokens to add
"""
pass
def process(self, encoding, pair=None, add_special_tokens=True):
"""
Post-process the given encodings, generating the final one
@ -159,9 +161,9 @@ class RobertaProcessing(PostProcessor):
Whether the add_prefix_space option was enabled during pre-tokenization. This
is relevant because it defines the way the offsets are trimmed out.
"""
def __init__(self, sep, cls, trim_offsets=True, add_prefix_space=True):
pass
def num_special_tokens_to_add(self, is_pair):
"""
Return the number of special tokens that would be added for single/pair sentences.
@ -174,6 +176,7 @@ class RobertaProcessing(PostProcessor):
:obj:`int`: The number of tokens to add
"""
pass
def process(self, encoding, pair=None, add_special_tokens=True):
"""
Post-process the given encodings, generating the final one
@ -201,9 +204,9 @@ class Sequence(PostProcessor):
processors (:obj:`List[PostProcessor]`)
The processors that need to be chained
"""
def __init__(self, processors):
pass
def num_special_tokens_to_add(self, is_pair):
"""
Return the number of special tokens that would be added for single/pair sentences.
@ -216,6 +219,7 @@ class Sequence(PostProcessor):
:obj:`int`: The number of tokens to add
"""
pass
def process(self, encoding, pair=None, add_special_tokens=True):
"""
Post-process the given encodings, generating the final one
@ -302,9 +306,9 @@ class TemplateProcessing(PostProcessor):
The given dict expects the provided :obj:`ids` and :obj:`tokens` lists to have
the same length.
"""
def __init__(self, single, pair, special_tokens):
pass
def num_special_tokens_to_add(self, is_pair):
"""
Return the number of special tokens that would be added for single/pair sentences.
@ -317,6 +321,7 @@ class TemplateProcessing(PostProcessor):
:obj:`int`: The number of tokens to add
"""
pass
def process(self, encoding, pair=None, add_special_tokens=True):
"""
Post-process the given encodings, generating the final one
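A sketch of the template syntax referenced above ($A and $B stand for the sequences; the special-token ids are illustrative):

from tokenizers.processors import TemplateProcessing

processor = TemplateProcessing(
    single="[CLS] $A [SEP]",
    pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    special_tokens=[("[CLS]", 1), ("[SEP]", 2)],
)
# tokenizer.post_processor = processor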

View File

@ -92,7 +92,7 @@ class EncodingVisualizer:
if default_to_notebook:
try:
from IPython.core.display import HTML, display
except ImportError as e:
except ImportError:
raise Exception(
"""We couldn't import IPython utils for html display.
Are you running in a notebook?
@ -136,7 +136,7 @@ class EncodingVisualizer:
if final_default_to_notebook:
try:
from IPython.core.display import HTML, display
except ImportError as e:
except ImportError:
raise Exception(
"""We couldn't import IPython utils for html display.
Are you running in a notebook?"""
@ -170,7 +170,7 @@ class EncodingVisualizer:
if h_step < 20:
h_step = 20
s = 32
l = 64
l = 64 # noqa: E741
h = 10
colors = {}
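For context, h_step, s, and l feed an HSL palette; a hypothetical reconstruction of how such a loop spaces hues per label (names are illustrative, not the module's actual code):

labels = ["special", "word", "subword"]  # illustrative annotation labels
h_step, s, l = 20, 32, 64  # noqa: E741 -- mirrors the values in the diff
colors = {label: f"hsl({10 + i * h_step},{s}%,{l}%)" for i, label in enumerate(labels)}
print(colors)  # evenly spaced hues at fixed saturation and lightness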

View File

@ -80,7 +80,6 @@ class UnigramTrainer(Trainer):
The number of iterations of the EM algorithm to perform before
pruning the vocabulary.
"""
def __init__(
self,
vocab_size=8000,
@ -143,7 +142,6 @@ class WordPieceTrainer(Trainer):
end_of_word_suffix (:obj:`str`, `optional`):
A suffix to be used for every subword that is a end-of-word.
"""
def __init__(
self,
vocab_size=30000,

View File

@ -34,7 +34,7 @@ Source = 'https://github.com/huggingface/tokenizers'
[project.optional-dependencies]
testing = ["pytest", "requests", "numpy", "datasets", "black==22.3"]
testing = ["pytest", "requests", "numpy", "datasets", "black==22.3", "ruff"]
docs = ["sphinx", "sphinx_rtd_theme", "setuptools_rust"]
dev = ["tokenizers[testing]"]
@ -52,3 +52,21 @@ features = ["pyo3/extension-module"]
[tool.black]
line-length = 119
target-version = ['py35']
[tool.ruff]
line-length = 119
target-version = "py311"
lint.ignore = [
# a == None in tests vs is None.
"E711",
# a == False in tests vs is False.
"E712",
# try/except import pattern without using the lib.
"F401",
# Raw type equality is required in asserts
"E721",
# Import order
"E402",
# Fixture imports that appear unused or redefined
"F811",
]
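For reference, the kind of code each ignored rule would otherwise flag (illustrative snippets, not from the repository; F811 covers pytest fixtures shadowing their own imports and is omitted here):

a, flag, x, y = None, False, 1, 2.0
assert a == None            # E711: tests deliberately compare with == None
assert flag == False        # E712: likewise for == False
try:
    import datasets         # F401: availability probe, import left unused
except ImportError:
    datasets = None
assert type(x) != type(y)   # E721: raw type comparison kept in asserts
import os                   # E402: import below code, common in these tests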

View File

@ -80,9 +80,7 @@ class SpmConverter(Converter):
tokenizer = Tokenizer(Unigram(vocab, unk_id))
elif model_type == 2:
vocab, merges = SentencePieceExtractor(self.original_tokenizer.vocab_file).extract()
tokenizer = Tokenizer(
BPE(vocab, merges, unk_token=proto.trainer_spec.unk_piece, fuse_unk=True)
)
tokenizer = Tokenizer(BPE(vocab, merges, unk_token=proto.trainer_spec.unk_piece, fuse_unk=True))
else:
raise Exception(
"You're trying to run a `Unigram` model but you're file was trained with a different algorithm"
@ -105,12 +103,8 @@ class SpmConverter(Converter):
replacement = ""
add_prefix_space = True
tokenizer.pre_tokenizer = Metaspace(
replacement=replacement, add_prefix_space=add_prefix_space
)
tokenizer.decoder = decoders.Metaspace(
replacement=replacement, add_prefix_space=add_prefix_space
)
tokenizer.pre_tokenizer = Metaspace(replacement=replacement, add_prefix_space=add_prefix_space)
tokenizer.decoder = decoders.Metaspace(replacement=replacement, add_prefix_space=add_prefix_space)
post_processor = self.post_processor(tokenizer)
if post_processor:
tokenizer.post_processor = post_processor
@ -124,9 +118,7 @@ class SpmConverter(Converter):
class AlbertConverter(SpmConverter):
def vocab(self, proto):
return [
(piece.piece, piece.score)
if check_number_comma(piece.piece)
else (piece.piece, piece.score - 100)
(piece.piece, piece.score) if check_number_comma(piece.piece) else (piece.piece, piece.score - 100)
for piece in proto.pieces
]
@ -261,9 +253,7 @@ class XLMRobertaConverter(SpmConverter):
class XLNetConverter(SpmConverter):
def vocab(self, proto):
return [
(piece.piece, piece.score)
if check_number_comma(piece.piece)
else (piece.piece, piece.score - 100)
(piece.piece, piece.score) if check_number_comma(piece.piece) else (piece.piece, piece.score - 100)
for piece in proto.pieces
]
@ -420,9 +410,7 @@ def main():
print(f"|{'-'*model_len}|{'-'*status_len}|{'-'*speedup_len}|")
for pretrained in args.models:
status, speedup = check(pretrained, args.filename)
print(
f"|{pretrained:<{model_len}}|{status:^{status_len}}|{speedup:^{speedup_len - 1}.2f}x|"
)
print(f"|{pretrained:<{model_len}}|{status:^{status_len}}|{speedup:^{speedup_len - 1}.2f}x|")
if __name__ == "__main__":

View File

@ -59,7 +59,6 @@ class YouTokenToMeExtractor:
def extract(self) -> Tuple[Dict[str, int], List[Tuple]]:
with open(self._model, "r") as model_f:
# Retrieve information
nb_pieces, nb_merges = map(int, model_f.readline().split())
vocab, merges = {}, []
@ -97,9 +96,7 @@ if __name__ == "__main__":
choices=["sentencepiece", "youtokentome"],
help="Indicate the format of the file.",
)
parser.add_argument(
"--model", type=str, required=True, help="SentencePiece model to extract vocab from."
)
parser.add_argument("--model", type=str, required=True, help="SentencePiece model to extract vocab from.")
parser.add_argument(
"--vocab-output-path",
type=str,
@ -128,9 +125,7 @@ if __name__ == "__main__":
args.model = f.name
# Allocate extractor
extractor = (
SentencePieceExtractor if args.provider == "sentencepiece" else YouTokenToMeExtractor
)
extractor = SentencePieceExtractor if args.provider == "sentencepiece" else YouTokenToMeExtractor
extractor = extractor(args.model)
logger.info(f"Using {type(extractor).__name__}")

View File

@ -121,9 +121,7 @@ def check_train(args):
break
print(f"Tokenizer used {tokenizer_tokens}, where spm used {spm_tokens}")
assert (
tokenizer_tokens < spm_tokens
), "Our trainer should be at least more efficient than the SPM one"
assert tokenizer_tokens < spm_tokens, "Our trainer should be at least more efficient than the SPM one"
print("Ok our trainer is at least more efficient than the SPM one")
@ -131,9 +129,7 @@ def check_diff(spm_diff, tok_diff, sp, tok):
if spm_diff == list(reversed(tok_diff)):
# AAA -> AA+A vs A+AA case.
return True
elif len(spm_diff) == len(tok_diff) and tok.decode(spm_diff) == tok.decode(
tok_diff
):
elif len(spm_diff) == len(tok_diff) and tok.decode(spm_diff) == tok.decode(tok_diff):
# Second order OK
# Barrich -> Barr + ich vs Bar + rich
return True
@ -173,24 +169,17 @@ def check_details(line, spm_ids, tok_ids, sp, tok):
spms = Counter(spm_ids[first:last])
toks = Counter(tok_ids[first:last])
removable_tokens = {
spm_ for (spm_, si) in spms.items() if toks.get(spm_, 0) == si
}
removable_tokens = {spm_ for (spm_, si) in spms.items() if toks.get(spm_, 0) == si}
min_width = 3
for i in range(last - first - min_width):
if all(
spm_ids[first + i + j] in removable_tokens for j in range(min_width)
):
if all(spm_ids[first + i + j] in removable_tokens for j in range(min_width)):
possible_matches = [
k
for k in range(last - first - min_width)
if tok_ids[first + k : first + k + min_width]
== spm_ids[first + i : first + i + min_width]
if tok_ids[first + k : first + k + min_width] == spm_ids[first + i : first + i + min_width]
]
for j in possible_matches:
if check_diff(
spm_ids[first : first + i], tok_ids[first : first + j], sp, tok
) and check_details(
if check_diff(spm_ids[first : first + i], tok_ids[first : first + j], sp, tok) and check_details(
line,
spm_ids[first + i : last],
tok_ids[first + j : last],
@ -210,9 +199,7 @@ def check_details(line, spm_ids, tok_ids, sp, tok):
wrong = tok.decode(spm_ids[first:last])
print()
if has_color:
print(
f"{colored(ok_start, 'grey')}{colored(wrong, 'red')}{colored(ok_end, 'grey')}"
)
print(f"{colored(ok_start, 'grey')}{colored(wrong, 'red')}{colored(ok_end, 'grey')}")
else:
print(wrong)
return False
@ -251,9 +238,7 @@ def check_encode(args):
if args.verbose:
if i % 10000 == 0:
print(
f"({perfect} / {imperfect} / {wrong} ----- {perfect + imperfect + wrong})"
)
print(f"({perfect} / {imperfect} / {wrong} ----- {perfect + imperfect + wrong})")
print(f"SPM: {spm_total_time} - TOK: {tok_total_time}")
if ids != encoded.ids:
@ -265,13 +250,13 @@ def check_encode(args):
else:
perfect += 1
assert ids == encoded.ids, f"line {i}: {line} : \n\n{ids}\n{encoded.ids}\n{list(zip(encoded.ids, encoded.tokens))}"
assert (
ids == encoded.ids
), f"line {i}: {line} : \n\n{ids}\n{encoded.ids}\n{list(zip(encoded.ids, encoded.tokens))}"
print(f"({perfect} / {imperfect} / {wrong} ----- {perfect + imperfect + wrong})")
total = perfect + imperfect + wrong
print(
f"Accuracy {perfect * 100 / total:.2f} Slowdown : {tok_total_time/ spm_total_time:.2f}"
)
print(f"Accuracy {perfect * 100 / total:.2f} Slowdown : {tok_total_time/ spm_total_time:.2f}")
if __name__ == "__main__":

View File

@ -3,8 +3,6 @@ import inspect
import os
from pathlib import Path
import black
INDENT = " " * 4
GENERATED_COMMENT = "# Generated content DO NOT EDIT\n"
@ -85,7 +83,7 @@ def pyi_file(obj, indent=""):
body += f"{indent+INDENT}pass\n"
body += "\n"
for (name, fn) in fns:
for name, fn in fns:
body += pyi_file(fn, indent=indent)
if not body:
@ -122,18 +120,17 @@ def py_file(module, origin):
return string
def do_black(content, is_pyi):
mode = black.Mode(
target_versions={black.TargetVersion.PY35},
line_length=119,
is_pyi=is_pyi,
string_normalization=True,
experimental_string_processing=False,
)
try:
return black.format_file_contents(content, fast=True, mode=mode)
except black.NothingChanged:
return content
import subprocess
from typing import List, Optional, Tuple
def do_ruff(code, is_pyi: bool):
command = ["ruff", "format", "--config", "pyproject.toml", "--silent", "-"]
if is_pyi:
command.extend(["--stdin-filename", "test.pyi"])
process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
stdout, _ = process.communicate(input=code.encode("utf-8"))
return stdout.decode("utf-8")
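A quick sanity check of do_ruff (assuming ruff is installed and the repository's pyproject.toml is in the working directory):

if __name__ == "__main__":
    print(do_ruff("x=1;y=2\n", is_pyi=False))  # expected: "x = 1" and "y = 2" on separate lines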
def write(module, directory, origin, check=False):
@ -141,7 +138,7 @@ def write(module, directory, origin, check=False):
filename = os.path.join(directory, "__init__.pyi")
pyi_content = pyi_file(module)
pyi_content = do_black(pyi_content, is_pyi=True)
pyi_content = do_ruff(pyi_content, is_pyi=True)
os.makedirs(directory, exist_ok=True)
if check:
with open(filename, "r") as f:
@ -153,7 +150,7 @@ def write(module, directory, origin, check=False):
filename = os.path.join(directory, "__init__.py")
py_content = py_file(module, origin)
py_content = do_black(py_content, is_pyi=False)
py_content = do_ruff(py_content, is_pyi=False)
os.makedirs(directory, exist_ok=True)
is_auto = False

View File

@ -3,7 +3,6 @@ import pickle
import pytest
from tokenizers.models import BPE, Model, WordLevel, WordPiece
from ..utils import bert_files, data_dir, roberta_files

View File

@ -2,8 +2,7 @@ import pickle
import pytest
from tokenizers import NormalizedString, Tokenizer
from tokenizers.models import BPE
from tokenizers import NormalizedString
from tokenizers.normalizers import BertNormalizer, Lowercase, Normalizer, Sequence, Strip, Prepend

View File

@ -146,18 +146,18 @@ class TestTemplateProcessing:
assert isinstance(pickle.loads(pickle.dumps(bert)), TemplateProcessing)
# It is absolutely legal to have tokens with spaces in the name:
processor = TemplateProcessing(
TemplateProcessing(
single=["[ C L S ]", "Token with space"],
special_tokens=[("[ C L S ]", 0), ("Token with space", 1)],
)
# Sequence identifiers must be well formed:
with pytest.raises(Exception, match="Cannot build Piece"):
processor = TemplateProcessing(single="[CLS] $$ [SEP]")
TemplateProcessing(single="[CLS] $$ [SEP]")
with pytest.raises(Exception, match="Cannot build Piece"):
processor = TemplateProcessing(single="[CLS] $A: [SEP]")
TemplateProcessing(single="[CLS] $A: [SEP]")
# Special tokens must be provided when used in template:
with pytest.raises(Exception, match="Missing SpecialToken\\(s\\) with id\\(s\\)"):
processor = TemplateProcessing(single=["[CLS]"])
TemplateProcessing(single=["[CLS]"])
def test_bert_parity(self):
tokenizer = Tokenizer(BPE())

View File

@ -5,10 +5,9 @@ import pytest
from tokenizers import AddedToken, Encoding, Tokenizer
from tokenizers.implementations import BertWordPieceTokenizer
from tokenizers.models import BPE, Model, WordPiece, Unigram
from tokenizers.normalizers import Lowercase
from tokenizers.models import BPE, Model, Unigram
from tokenizers.pre_tokenizers import ByteLevel
from tokenizers.processors import BertProcessing, RobertaProcessing
from tokenizers.processors import RobertaProcessing
from ..utils import bert_files, data_dir, multiprocessing_with_parallelism, roberta_files

View File

@ -2,7 +2,6 @@ from tokenizers import Tokenizer
from ..utils import data_dir, doc_pipeline_bert_tokenizer, doc_wiki_tokenizer
disable_printing = True
original_print = print

View File

@ -1,8 +1,4 @@
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import BpeTrainer
from ..utils import data_dir, doc_wiki_tokenizer

View File

@ -1,3 +1,4 @@
# flake8: noqa
import gzip
import os

View File

@ -1,5 +1,3 @@
import pytest
from tokenizers import Tokenizer, decoders, models, normalizers, pre_tokenizers, processors
from tokenizers.implementations import BaseTokenizer

View File

@ -1,5 +1,3 @@
import pytest
from tokenizers import BertWordPieceTokenizer
from ..utils import bert_files, data_dir, multiprocessing_with_parallelism

View File

@ -1,5 +1,3 @@
import pytest
from tokenizers import ByteLevelBPETokenizer
from ..utils import data_dir, multiprocessing_with_parallelism, roberta_files

View File

@ -1,5 +1,3 @@
import pytest
from tokenizers import CharBPETokenizer
from ..utils import data_dir, multiprocessing_with_parallelism, openai_files

View File

@ -1,5 +1,3 @@
import os
import pytest
from tokenizers import SentencePieceBPETokenizer, SentencePieceUnigramTokenizer

View File

@ -6,7 +6,6 @@ import tqdm
from huggingface_hub import HfApi, cached_download, hf_hub_url
from tokenizers import Tokenizer
from .utils import albert_base, data_dir
@ -15,7 +14,7 @@ class TestSerialization:
# Check we can read this file.
# This used to fail because BufReader would fail when the
# file exceeds the buffer capacity
tokenizer = Tokenizer.from_file(albert_base)
Tokenizer.from_file(albert_base)
def check(tokenizer_file) -> bool:
@ -51,8 +50,6 @@ class TestFullDeserialization(unittest.TestCase):
# Check we can read this file.
# This used to fail because BufReader would fail when the
# file exceeds the buffer capacity
api = HfApi()
not_loadable = []
invalid_pre_tokenizer = []
@ -77,7 +74,7 @@ class TestFullDeserialization(unittest.TestCase):
except Exception as e:
print(f"{model_id} is not loadable: {e}")
not_loadable.append(model_id)
except:
except: # noqa: E722
print(f"{model_id} is not loadable: Rust error")
not_loadable.append(model_id)