Mirror of https://github.com/mii443/tokenizers.git (synced 2025-08-23 00:35:35 +00:00)
[remove black] And use ruff (#1436)

* nits
* Fixing deps.
* Ruff update.
* Import order matters.
* Fix.
* Revert ruff fix.
* Visualizer.
* Putting back the imports.

Co-authored-by: Nicolas Patry <patry.nicolas@protonmail.com>
@@ -3,7 +3,6 @@ import pickle
 import pytest

 from tokenizers.models import BPE, Model, WordLevel, WordPiece

 from ..utils import bert_files, data_dir, roberta_files
@@ -2,8 +2,7 @@ import pickle
 import pytest

+from tokenizers import NormalizedString, Tokenizer
 from tokenizers.models import BPE
-from tokenizers import NormalizedString
 from tokenizers.normalizers import BertNormalizer, Lowercase, Normalizer, Sequence, Strip, Prepend
@@ -146,18 +146,18 @@ class TestTemplateProcessing:
         assert isinstance(pickle.loads(pickle.dumps(bert)), TemplateProcessing)

         # It is absolutely legal to have tokens with spaces in the name:
-        processor = TemplateProcessing(
+        TemplateProcessing(
             single=["[ C L S ]", "Token with space"],
             special_tokens=[("[ C L S ]", 0), ("Token with space", 1)],
         )
         # Sequence identifiers must be well formed:
         with pytest.raises(Exception, match="Cannot build Piece"):
-            processor = TemplateProcessing(single="[CLS] $$ [SEP]")
+            TemplateProcessing(single="[CLS] $$ [SEP]")
         with pytest.raises(Exception, match="Cannot build Piece"):
-            processor = TemplateProcessing(single="[CLS] $A: [SEP]")
+            TemplateProcessing(single="[CLS] $A: [SEP]")
         # Special tokens must be provided when used in template:
         with pytest.raises(Exception, match="Missing SpecialToken\\(s\\) with id\\(s\\)"):
-            processor = TemplateProcessing(single=["[CLS]"])
+            TemplateProcessing(single=["[CLS]"])

     def test_bert_parity(self):
         tokenizer = Tokenizer(BPE())
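The hunk above is a lint-driven cleanup: TemplateProcessing validates its template in the constructor, so the tests can call it inside pytest.raises without binding the result to an unused processor variable (the kind of assignment ruff flags, e.g. rule F841). As a rough, self-contained sketch of the same API outside the test suite (the token ids below are illustrative, not taken from this PR):

    import pytest

    from tokenizers.processors import TemplateProcessing

    # A well-formed template: "$A" marks the sequence, and every special token
    # used in the template is declared together with an illustrative id.
    TemplateProcessing(
        single="[CLS] $A [SEP]",
        pair="[CLS] $A [SEP] $B:1 [SEP]",
        special_tokens=[("[CLS]", 1), ("[SEP]", 2)],
    )

    # Validation happens at construction time, so a test only needs the call
    # itself; keeping the return value is unnecessary.
    with pytest.raises(Exception, match="Cannot build Piece"):
        TemplateProcessing(single="[CLS] $$ [SEP]")  # "$$" is not a valid sequence identifier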
@@ -5,10 +5,9 @@ import pytest
 from tokenizers import AddedToken, Encoding, Tokenizer
 from tokenizers.implementations import BertWordPieceTokenizer
-from tokenizers.models import BPE, Model, WordPiece, Unigram
-from tokenizers.normalizers import Lowercase
+from tokenizers.models import BPE, Model, Unigram
 from tokenizers.pre_tokenizers import ByteLevel
-from tokenizers.processors import BertProcessing, RobertaProcessing
+from tokenizers.processors import RobertaProcessing

 from ..utils import bert_files, data_dir, multiprocessing_with_parallelism, roberta_files
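Several of the import hunks in this commit, like the one above where WordPiece and BertProcessing disappear from the import lists, are the kind of cleanup ruff's pyflakes rules automate: F401 flags imports that are never referenced, and `ruff check --fix` can delete them. A minimal hypothetical module illustrating the rule (the names here are examples, not code from this PR):

    # Running `ruff check --fix` on this file would drop the unused `Unigram`
    # import (pyflakes rule F401) and keep `BPE`, which is actually used.
    from tokenizers.models import BPE, Unigram

    model = BPE()  # only BPE is referenced, so Unigram is flagged as unused
    print(type(model).__name__)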
@@ -2,7 +2,6 @@ from tokenizers import Tokenizer
 from ..utils import data_dir, doc_pipeline_bert_tokenizer, doc_wiki_tokenizer

 disable_printing = True
 original_print = print
@@ -1,8 +1,4 @@
 from tokenizers import Tokenizer
 from tokenizers.models import BPE
 from tokenizers.pre_tokenizers import Whitespace
 from tokenizers.trainers import BpeTrainer

 from ..utils import data_dir, doc_wiki_tokenizer
@@ -1,3 +1,4 @@
 # flake8: noqa
 import gzip
 import os
@@ -1,5 +1,3 @@
 import pytest

 from tokenizers import Tokenizer, decoders, models, normalizers, pre_tokenizers, processors
 from tokenizers.implementations import BaseTokenizer
@@ -1,5 +1,3 @@
 import pytest

 from tokenizers import BertWordPieceTokenizer

 from ..utils import bert_files, data_dir, multiprocessing_with_parallelism
@@ -1,5 +1,3 @@
 import pytest

 from tokenizers import ByteLevelBPETokenizer

 from ..utils import data_dir, multiprocessing_with_parallelism, roberta_files
@@ -1,5 +1,3 @@
 import pytest

 from tokenizers import CharBPETokenizer

 from ..utils import data_dir, multiprocessing_with_parallelism, openai_files
@@ -1,5 +1,3 @@
 import os

 import pytest

 from tokenizers import SentencePieceBPETokenizer, SentencePieceUnigramTokenizer
@@ -6,7 +6,6 @@ import tqdm
 from huggingface_hub import HfApi, cached_download, hf_hub_url
 from tokenizers import Tokenizer

 from .utils import albert_base, data_dir
@@ -15,7 +14,7 @@ class TestSerialization:
         # Check we can read this file.
         # This used to fail because of BufReader that would fail because the
         # file exceeds the buffer capacity
-        tokenizer = Tokenizer.from_file(albert_base)
+        Tokenizer.from_file(albert_base)


 def check(tokenizer_file) -> bool:
@@ -51,8 +50,6 @@ class TestFullDeserialization(unittest.TestCase):
         # Check we can read this file.
         # This used to fail because of BufReader that would fail because the
         # file exceeds the buffer capacity
         api = HfApi()

         not_loadable = []
         invalid_pre_tokenizer = []
@@ -77,7 +74,7 @@ class TestFullDeserialization(unittest.TestCase):
             except Exception as e:
                 print(f"{model_id} is not loadable: {e}")
                 not_loadable.append(model_id)
-            except:
+            except: # noqa: E722
                 print(f"{model_id} is not loadable: Rust error")
                 not_loadable.append(model_id)
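The last hunk keeps the bare `except:` and silences ruff's pycodestyle rule E722 with an inline `# noqa`, presumably because failures surfacing from the Rust side may not be ordinary `Exception` subclasses (the branch prints "Rust error"). A hedged alternative sketch, not what the PR does, is to name `BaseException` explicitly, which sidesteps E722 itself, though other lint rules may still object to catching it:

    from tokenizers import Tokenizer  # assumes the `tokenizers` package is installed


    def try_load(path: str) -> bool:
        """Return True if `path` (a tokenizer.json file) can be deserialized."""
        try:
            Tokenizer.from_file(path)
            return True
        except Exception as e:
            # Ordinary Python-level failures (bad JSON, unknown fields, ...).
            print(f"{path} is not loadable: {e}")
            return False
        except BaseException:
            # Anything that does not derive from Exception, e.g. an error
            # propagated from the Rust side; this replaces the bare `except:`.
            print(f"{path} is not loadable: Rust error")
            return False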