Mirror of https://github.com/mii443/tokenizers.git (synced 2025-09-04 00:09:34 +00:00)
Updating python formatting. (#1079)
* Updating python formatting.
* Forgot gh action.
* Skipping isort to prevent circular imports.
* Updating stub.
* Removing `isort` (it contradicts `stub.py`).
* Fixing weird stub black/isort disagreement.
@@ -1,6 +1,7 @@
-from ..utils import data_dir, doc_wiki_tokenizer, doc_pipeline_bert_tokenizer
 from tokenizers import Tokenizer
+
+from ..utils import data_dir, doc_pipeline_bert_tokenizer, doc_wiki_tokenizer
 
 
 disable_printing = True
 original_print = print
@@ -112,7 +113,7 @@ class TestPipeline:
         # END bert_setup_tokenizer
         # START bert_setup_normalizer
         from tokenizers import normalizers
-        from tokenizers.normalizers import Lowercase, NFD, StripAccents
+        from tokenizers.normalizers import NFD, Lowercase, StripAccents
 
         bert_tokenizer.normalizer = normalizers.Sequence([NFD(), Lowercase(), StripAccents()])
         # END bert_setup_normalizer
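For context, the normalizer configured in this hunk can be exercised on its own. A minimal sketch; the sample string is only an illustration, and `normalize_str` is the bindings' helper for applying a normalizer to raw text:

    from tokenizers import normalizers
    from tokenizers.normalizers import NFD, Lowercase, StripAccents

    # Compose the same pipeline the test builds: decompose, lowercase, strip accents.
    normalizer = normalizers.Sequence([NFD(), Lowercase(), StripAccents()])

    # Apply the sequence to a raw string (illustrative input).
    print(normalizer.normalize_str("Héllò hôw are ü?"))  # -> "hello how are u?"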
@@ -136,9 +137,7 @@ class TestPipeline:
         # START bert_train_tokenizer
         from tokenizers.trainers import WordPieceTrainer
 
-        trainer = WordPieceTrainer(
-            vocab_size=30522, special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]
-        )
+        trainer = WordPieceTrainer(vocab_size=30522, special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
         files = [f"data/wikitext-103-raw/wiki.{split}.raw" for split in ["test", "train", "valid"]]
         bert_tokenizer.train(files, trainer)
 
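The reformatted trainer line belongs to a larger BERT WordPiece setup. A minimal, hedged sketch of how the pieces fit together, assuming the wikitext-103-raw files are already on disk (the full test also configures the normalizer above and a post-processor not shown in this hunk):

    from tokenizers import Tokenizer
    from tokenizers.models import WordPiece
    from tokenizers.pre_tokenizers import Whitespace
    from tokenizers.trainers import WordPieceTrainer

    # Build a WordPiece tokenizer and train it on the raw wikitext files.
    bert_tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
    bert_tokenizer.pre_tokenizer = Whitespace()

    trainer = WordPieceTrainer(vocab_size=30522, special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
    files = [f"data/wikitext-103-raw/wiki.{split}.raw" for split in ["test", "train", "valid"]]
    bert_tokenizer.train(files, trainer)

    # Sanity check on the trained tokenizer.
    print(bert_tokenizer.encode("Welcome to the Tokenizers library.").tokens)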
@@ -171,9 +170,9 @@ class TestPipeline:
 
 
 if __name__ == "__main__":
+    import os
     from urllib import request
     from zipfile import ZipFile
-    import os
 
     disable_printing = False
     if not os.path.isdir("data/wikitext-103-raw"):
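The `__main__` block whose imports are reordered here is the helper that fetches the training data. A rough sketch of that download step; the S3 URL is shown only as an illustration of the kind of archive being fetched:

    import os
    from urllib import request
    from zipfile import ZipFile

    # Fetch and unpack wikitext-103-raw if it is not present yet.
    if not os.path.isdir("data/wikitext-103-raw"):
        os.makedirs("data", exist_ok=True)
        url = "https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-raw-v1.zip"  # illustrative URL
        request.urlretrieve(url, "data/wikitext-103-raw-v1.zip")
        with ZipFile("data/wikitext-103-raw-v1.zip") as archive:
            archive.extractall("data")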
@@ -1,8 +1,10 @@
-from ..utils import data_dir, doc_wiki_tokenizer
 from tokenizers import Tokenizer
 from tokenizers.models import BPE
-from tokenizers.trainers import BpeTrainer
 from tokenizers.pre_tokenizers import Whitespace
+from tokenizers.trainers import BpeTrainer
+
+from ..utils import data_dir, doc_wiki_tokenizer
+
 
 disable_printing = True
 original_print = print
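These imports belong to the quicktour example, which builds and trains a BPE tokenizer. A hedged sketch of that flow, assuming the wikitext files are present under data/wikitext-103-raw:

    from tokenizers import Tokenizer
    from tokenizers.models import BPE
    from tokenizers.pre_tokenizers import Whitespace
    from tokenizers.trainers import BpeTrainer

    # Quicktour-style BPE tokenizer trained on the raw wikitext files.
    tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
    tokenizer.pre_tokenizer = Whitespace()

    trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
    files = [f"data/wikitext-103-raw/wiki.{split}.raw" for split in ["test", "train", "valid"]]
    tokenizer.train(files, trainer)

    # Encode a sample sentence and inspect the produced tokens.
    output = tokenizer.encode("Hello, y'all! How are you 😁 ?")
    print(output.tokens)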
@@ -181,9 +183,9 @@ class TestQuicktour:
 
 
 if __name__ == "__main__":
+    import os
     from urllib import request
     from zipfile import ZipFile
-    import os
 
     disable_printing = False
     if not os.path.isdir("data/wikitext-103-raw"):
@@ -1,15 +1,17 @@
-from ..utils import data_dir, train_files
-import os
-import pytest
-import datasets
 import gzip
+import os
+
+import datasets
+import pytest
+
+from ..utils import data_dir, train_files
 
 
 class TestTrainFromIterators:
     @staticmethod
     def get_tokenizer_trainer():
         # START init_tokenizer_trainer
-        from tokenizers import Tokenizer, models, normalizers, pre_tokenizers, decoders, trainers
+        from tokenizers import Tokenizer, decoders, models, normalizers, pre_tokenizers, trainers
 
         tokenizer = Tokenizer(models.Unigram())
         tokenizer.normalizer = normalizers.NFKC()
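The hunk ends just as `get_tokenizer_trainer` starts assembling a Unigram tokenizer. A sketch of a typical completion of that setup; the byte-level pre-tokenizer/decoder and the trainer arguments are assumptions for illustration, not necessarily the lines hidden below this hunk:

    from tokenizers import Tokenizer, decoders, models, normalizers, pre_tokenizers, trainers

    # Unigram model with NFKC normalization and byte-level pre-tokenization/decoding.
    tokenizer = Tokenizer(models.Unigram())
    tokenizer.normalizer = normalizers.NFKC()
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
    tokenizer.decoder = decoders.ByteLevel()

    # Matching trainer; vocab size and special tokens are illustrative values.
    trainer = trainers.UnigramTrainer(
        vocab_size=20000,
        initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
        special_tokens=["<PAD>", "<BOS>", "<EOS>"],
    )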
@@ -31,9 +33,7 @@ class TestTrainFromIterators:
         # START load_dataset
         import datasets
 
-        dataset = datasets.load_dataset(
-            "wikitext", "wikitext-103-raw-v1", split="train+test+validation"
-        )
+        dataset = datasets.load_dataset("wikitext", "wikitext-103-raw-v1", split="train+test+validation")
         # END load_dataset
 
     @pytest.fixture(scope="class")
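Once the dataset is loaded in memory, these tests train without writing intermediate files by streaming batches into `train_from_iterator`. A minimal sketch under that assumption; the batch size, vocab size, and special tokens are illustrative values:

    import datasets
    from tokenizers import Tokenizer, models, trainers

    dataset = datasets.load_dataset("wikitext", "wikitext-103-raw-v1", split="train+test+validation")

    tokenizer = Tokenizer(models.Unigram())
    trainer = trainers.UnigramTrainer(vocab_size=20000, special_tokens=["<PAD>", "<BOS>", "<EOS>"])

    def batch_iterator(batch_size=1000):
        # Yield slices of the "text" column instead of materializing files on disk.
        for i in range(0, len(dataset), batch_size):
            yield dataset[i : i + batch_size]["text"]

    # length lets the progress bar report the total number of examples.
    tokenizer.train_from_iterator(batch_iterator(), trainer=trainer, length=len(dataset))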