[remove black] And use ruff (#1436)

* nits

* Fixing deps.

* Ruff update.

* Import order matters.

* Fix.

* Revert ruff fix.

* Visualizer.

* Putting back the imports.

---------

Co-authored-by: Nicolas Patry <patry.nicolas@protonmail.com>
Author: Arthur
Date: 2024-03-12 21:24:21 +11:00
Committed by: GitHub
Parent: 72a1973cd1
Commit: 29fef1e7aa
29 changed files with 258 additions and 169 deletions
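
Most of the hunks below are mechanical lint fixes rather than formatting changes: imports get merged and sorted, unused imports and never-read local bindings are dropped, and the one bare `except:` that is kept gets an explicit suppression. A minimal sketch of the recurring patterns, assuming ruff's default F/E rules plus import sorting are enabled; the `check_parses` helper and the commented "before" lines are illustrative, not taken from this diff:

```python
# Before, the top of a test module might read:
#   from tokenizers import Tokenizer
#   from tokenizers import NormalizedString
#   from tokenizers.normalizers import Lowercase   # never referenced below
# Ruff's import rules (I001) merge and sort the statements and F401 drops the
# dead names; F841 removes local bindings that are assigned but never read.
from tokenizers import Tokenizer


def check_parses(path: str) -> None:
    # was: tokenizer = Tokenizer.from_file(path)
    # The binding was never read, so only the call under test remains.
    Tokenizer.from_file(path)
```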


@@ -3,7 +3,6 @@ import pickle
import pytest
from tokenizers.models import BPE, Model, WordLevel, WordPiece
from ..utils import bert_files, data_dir, roberta_files


@@ -2,8 +2,7 @@ import pickle
import pytest
from tokenizers import NormalizedString, Tokenizer
from tokenizers.models import BPE
from tokenizers import NormalizedString
from tokenizers.normalizers import BertNormalizer, Lowercase, Normalizer, Sequence, Strip, Prepend


@@ -146,18 +146,18 @@ class TestTemplateProcessing:
assert isinstance(pickle.loads(pickle.dumps(bert)), TemplateProcessing)
# It is absolutely legal to have tokens with spaces in the name:
processor = TemplateProcessing(
TemplateProcessing(
single=["[ C L S ]", "Token with space"],
special_tokens=[("[ C L S ]", 0), ("Token with space", 1)],
)
# Sequence identifiers must be well formed:
with pytest.raises(Exception, match="Cannot build Piece"):
processor = TemplateProcessing(single="[CLS] $$ [SEP]")
TemplateProcessing(single="[CLS] $$ [SEP]")
with pytest.raises(Exception, match="Cannot build Piece"):
processor = TemplateProcessing(single="[CLS] $A: [SEP]")
TemplateProcessing(single="[CLS] $A: [SEP]")
# Special tokens must be provided when used in template:
with pytest.raises(Exception, match="Missing SpecialToken\\(s\\) with id\\(s\\)"):
processor = TemplateProcessing(single=["[CLS]"])
TemplateProcessing(single=["[CLS]"])
def test_bert_parity(self):
tokenizer = Tokenizer(BPE())
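
For context on the API under test: a well-formed template declares every special token it references together with a token id, and sequence identifiers are written `$A`/`$B`, optionally with a type id such as `$B:1`. The malformed cases above (`$$`, a trailing `:`, an undeclared `[CLS]`) are exactly what the test expects to fail. A minimal well-formed counterpart (the ids 1 and 2 are illustrative):

```python
from tokenizers.processors import TemplateProcessing

# $A / $B name the first and second sequence; ":1" assigns type id 1 to a piece.
# Every special token used in the template must be declared with its id, or
# construction fails with "Missing SpecialToken(s) with id(s)".
processor = TemplateProcessing(
    single="[CLS] $A [SEP]",
    pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    special_tokens=[("[CLS]", 1), ("[SEP]", 2)],
)
```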


@@ -5,10 +5,9 @@ import pytest
from tokenizers import AddedToken, Encoding, Tokenizer
from tokenizers.implementations import BertWordPieceTokenizer
from tokenizers.models import BPE, Model, WordPiece, Unigram
from tokenizers.normalizers import Lowercase
from tokenizers.models import BPE, Model, Unigram
from tokenizers.pre_tokenizers import ByteLevel
from tokenizers.processors import BertProcessing, RobertaProcessing
from tokenizers.processors import RobertaProcessing
from ..utils import bert_files, data_dir, multiprocessing_with_parallelism, roberta_files


@@ -2,7 +2,6 @@ from tokenizers import Tokenizer
from ..utils import data_dir, doc_pipeline_bert_tokenizer, doc_wiki_tokenizer
disable_printing = True
original_print = print


@@ -1,8 +1,4 @@
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import BpeTrainer
from ..utils import data_dir, doc_wiki_tokenizer


@@ -1,3 +1,4 @@
# flake8: noqa
import gzip
import os


@@ -1,5 +1,3 @@
import pytest
from tokenizers import Tokenizer, decoders, models, normalizers, pre_tokenizers, processors
from tokenizers.implementations import BaseTokenizer


@@ -1,5 +1,3 @@
import pytest
from tokenizers import BertWordPieceTokenizer
from ..utils import bert_files, data_dir, multiprocessing_with_parallelism


@@ -1,5 +1,3 @@
import pytest
from tokenizers import ByteLevelBPETokenizer
from ..utils import data_dir, multiprocessing_with_parallelism, roberta_files


@@ -1,5 +1,3 @@
import pytest
from tokenizers import CharBPETokenizer
from ..utils import data_dir, multiprocessing_with_parallelism, openai_files


@@ -1,5 +1,3 @@
import os
import pytest
from tokenizers import SentencePieceBPETokenizer, SentencePieceUnigramTokenizer


@@ -6,7 +6,6 @@ import tqdm
from huggingface_hub import HfApi, cached_download, hf_hub_url
from tokenizers import Tokenizer
from .utils import albert_base, data_dir
@@ -15,7 +14,7 @@ class TestSerialization:
# Check we can read this file.
# This used to fail because of BufReader that would fail because the
# file exceeds the buffer capacity
tokenizer = Tokenizer.from_file(albert_base)
Tokenizer.from_file(albert_base)
def check(tokenizer_file) -> bool:
@@ -51,8 +50,6 @@ class TestFullDeserialization(unittest.TestCase):
# Check we can read this file.
# This used to fail because of BufReader that would fail because the
# file exceeds the buffer capacity
api = HfApi()
not_loadable = []
invalid_pre_tokenizer = []
@@ -77,7 +74,7 @@ class TestFullDeserialization(unittest.TestCase):
except Exception as e:
print(f"{model_id} is not loadable: {e}")
not_loadable.append(model_id)
except:
except: # noqa: E722
print(f"{model_id} is not loadable: Rust error")
not_loadable.append(model_id)
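
The last hunk keeps the deliberately bare `except:` but now spells out the suppression of ruff's E722. Presumably the bare clause is there to catch errors that do not derive from `Exception`, e.g. a panic crossing the Rust/Python boundary, which is what the "Rust error" message hints at. A minimal sketch of that pattern; the `try_load` helper is hypothetical, not part of the test suite:

```python
from tokenizers import Tokenizer


def try_load(model_id: str, tokenizer_file: str):
    try:
        return Tokenizer.from_file(tokenizer_file)
    except Exception as e:
        # Ordinary failures: malformed JSON, unsupported options, ...
        print(f"{model_id} is not loadable: {e}")
    except:  # noqa: E722
        # Also catch things that are not Exception subclasses (e.g. a native
        # panic surfacing as a BaseException); the noqa makes the choice explicit.
        print(f"{model_id} is not loadable: Rust error")
    return None
```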