Updating python formatting. (#1079)

* Updating python formatting.

* Forgot gh action.

* Skipping isort to prevent circular imports.

* Updating stub.

* Removing `isort` (it contradicts `stub.py`).

* Fixing weird stub black/isort disagreement.
Nicolas Patry
2022-10-05 15:29:33 +02:00
committed by GitHub
parent 5f6e978452
commit 6113666624
43 changed files with 280 additions and 306 deletions
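
A note on the "circular imports" bullet above: re-sorting the imports in a package's __init__.py can break packages whose submodules import names back from the package itself, because a partially initialised package only exposes what has already been bound. A minimal hypothetical sketch (module names are illustrative, not the actual tokenizers layout):

# mypackage/models.py
class Model:
    pass

# mypackage/decoders.py -- imports back from the package, so it only works
# if __init__ has already bound Model by the time this module runs.
from mypackage import Model

class Decoder:
    model_cls = Model

# mypackage/__init__.py -- original, working order
from .models import Model      # binds mypackage.Model first
from .decoders import Decoder  # decoders.py can now see it

# mypackage/__init__.py -- after a purely alphabetical re-sort (broken)
from .decoders import Decoder  # ImportError: cannot import name 'Model' (circular import)
from .models import Model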

View File

@@ -1,6 +1,7 @@
-from ..utils import data_dir, doc_wiki_tokenizer, doc_pipeline_bert_tokenizer
 from tokenizers import Tokenizer
+from ..utils import data_dir, doc_pipeline_bert_tokenizer, doc_wiki_tokenizer
 disable_printing = True
 original_print = print
@@ -112,7 +113,7 @@ class TestPipeline:
 # END bert_setup_tokenizer
 # START bert_setup_normalizer
 from tokenizers import normalizers
-from tokenizers.normalizers import Lowercase, NFD, StripAccents
+from tokenizers.normalizers import NFD, Lowercase, StripAccents
 bert_tokenizer.normalizer = normalizers.Sequence([NFD(), Lowercase(), StripAccents()])
 # END bert_setup_normalizer
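
As a quick usage sketch of the normalizer sequence shown above (the sample string is illustrative): chaining NFD, Lowercase and StripAccents lowercases the text and drops the decomposed accents.

# Illustrative check, not part of this diff: apply the sequence directly to a string.
from tokenizers import normalizers
from tokenizers.normalizers import NFD, Lowercase, StripAccents

normalizer = normalizers.Sequence([NFD(), Lowercase(), StripAccents()])
print(normalizer.normalize_str("Héllò hôw are ü?"))  # -> "hello how are u?"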
@@ -136,9 +137,7 @@ class TestPipeline:
 # START bert_train_tokenizer
 from tokenizers.trainers import WordPieceTrainer
-trainer = WordPieceTrainer(
-    vocab_size=30522, special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]
-)
+trainer = WordPieceTrainer(vocab_size=30522, special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
 files = [f"data/wikitext-103-raw/wiki.{split}.raw" for split in ["test", "train", "valid"]]
 bert_tokenizer.train(files, trainer)
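
A small usage sketch following the training call above (the save path and sample sentence are illustrative, not taken from this diff):

# Illustrative follow-up: persist the trained tokenizer and sanity-check an encoding.
bert_tokenizer.save("data/bert-wiki.json")
output = bert_tokenizer.encode("Welcome to the Tokenizers library.")
print(output.tokens)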
@@ -171,9 +170,9 @@ class TestPipeline:
 if __name__ == "__main__":
+    import os
     from urllib import request
     from zipfile import ZipFile
-    import os
     disable_printing = False
     if not os.path.isdir("data/wikitext-103-raw"):
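
The hunk stops at the download guard; given the request, ZipFile and os imports it reorders, a body like this typically fetches and unpacks the corpus once (the URL and paths below are placeholders, not taken from the diff):

# Hypothetical body for the guard above: download and extract the dataset if it is missing.
os.makedirs("data", exist_ok=True)
zip_path = "data/wikitext-103-raw-v1.zip"
request.urlretrieve("https://example.com/wikitext-103-raw-v1.zip", zip_path)  # placeholder URL
with ZipFile(zip_path) as archive:
    archive.extractall("data")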

View File

@@ -1,8 +1,10 @@
-from ..utils import data_dir, doc_wiki_tokenizer
 from tokenizers import Tokenizer
 from tokenizers.models import BPE
-from tokenizers.trainers import BpeTrainer
 from tokenizers.pre_tokenizers import Whitespace
+from tokenizers.trainers import BpeTrainer
+from ..utils import data_dir, doc_wiki_tokenizer
 disable_printing = True
 original_print = print
@@ -181,9 +183,9 @@ class TestQuicktour:
 if __name__ == "__main__":
+    import os
     from urllib import request
     from zipfile import ZipFile
-    import os
     disable_printing = False
     if not os.path.isdir("data/wikitext-103-raw"):

View File

@@ -1,15 +1,17 @@
-from ..utils import data_dir, train_files
-import os
-import pytest
-import datasets
 import gzip
+import os
+import datasets
+import pytest
+from ..utils import data_dir, train_files
 class TestTrainFromIterators:
     @staticmethod
     def get_tokenizer_trainer():
         # START init_tokenizer_trainer
-        from tokenizers import Tokenizer, models, normalizers, pre_tokenizers, decoders, trainers
+        from tokenizers import Tokenizer, decoders, models, normalizers, pre_tokenizers, trainers
         tokenizer = Tokenizer(models.Unigram())
         tokenizer.normalizer = normalizers.NFKC()
@@ -31,9 +33,7 @@ class TestTrainFromIterators:
         # START load_dataset
         import datasets
-        dataset = datasets.load_dataset(
-            "wikitext", "wikitext-103-raw-v1", split="train+test+validation"
-        )
+        dataset = datasets.load_dataset("wikitext", "wikitext-103-raw-v1", split="train+test+validation")
         # END load_dataset
     @pytest.fixture(scope="class")
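
To round off the picture for TestTrainFromIterators, here is a rough, self-contained sketch of how a dataset loaded this way is usually streamed into training; the Unigram/NFKC setup mirrors the hunk above, while the trainer settings and batch size are illustrative assumptions rather than values from this diff:

# Illustrative sketch, not code from this diff: train a tokenizer from an iterator over the dataset.
import datasets
from tokenizers import Tokenizer, models, normalizers, trainers

dataset = datasets.load_dataset("wikitext", "wikitext-103-raw-v1", split="train+test+validation")

tokenizer = Tokenizer(models.Unigram())
tokenizer.normalizer = normalizers.NFKC()
trainer = trainers.UnigramTrainer(vocab_size=20000, special_tokens=["<unk>"])  # illustrative settings

def batch_iterator(batch_size=1000):
    # Yield raw text in slices so the whole corpus never has to be materialised as one list.
    for i in range(0, len(dataset), batch_size):
        yield dataset[i : i + batch_size]["text"]

tokenizer.train_from_iterator(batch_iterator(), trainer=trainer, length=len(dataset))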