Updating python formatting. (#1079)

* Updating python formatting.

* Forgot gh action.

* Skipping isort to prevent circular imports.

* Updating stub.

* Removing `isort` (it contradicts `stub.py`).

* Fixing weird stub black/isort disagreement.
Nicolas Patry
2022-10-05 15:29:33 +02:00
committed by GitHub
parent 5f6e978452
commit 6113666624
43 changed files with 280 additions and 306 deletions
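
The one-line signatures and add_argument calls throughout this diff indicate black is run with an extended line length. A minimal sketch of reproducing that effect through black's Python API; the 119-character limit is an assumption inferred from the reflowed lines, not taken from this commit:

import black

SRC = '''def odd_number_split(
    self, i: int, normalized_string: NormalizedString
) -> List[NormalizedString]:
    pass
'''

# With a wide enough line limit, black collapses the multi-line signature
# into a single line, which is exactly the reflow seen in the hunks below.
formatted = black.format_str(SRC, mode=black.Mode(line_length=119))
print(formatted)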

View File

@@ -1,12 +1,11 @@
-import jieba
 from typing import List
-from tokenizers import Tokenizer, Regex, NormalizedString, PreTokenizedString
-from tokenizers.models import BPE
-from tokenizers.pre_tokenizers import PreTokenizer
-from tokenizers.normalizers import Normalizer
+import jieba
+from tokenizers import NormalizedString, PreTokenizedString, Regex, Tokenizer
 from tokenizers.decoders import Decoder
+from tokenizers.models import BPE
+from tokenizers.normalizers import Normalizer
+from tokenizers.pre_tokenizers import PreTokenizer
 class JiebaPreTokenizer:
@@ -21,9 +20,7 @@ class JiebaPreTokenizer:
     # We can also easily do it in one line:
     # return [normalized_string[w[1] : w[2]] for w in jieba.tokenize(str(normalized_string))]
 
-    def odd_number_split(
-        self, i: int, normalized_string: NormalizedString
-    ) -> List[NormalizedString]:
+    def odd_number_split(self, i: int, normalized_string: NormalizedString) -> List[NormalizedString]:
         # Just an odd example...
         splits = []
         last = 0
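
For context, the example this hunk touches plugs the class into a tokenizer via PreTokenizer.custom. A minimal self-contained sketch along those lines; the empty BPE model is purely illustrative:

import jieba

from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import PreTokenizer


class JiebaPreTokenizer:
    def jieba_split(self, i, normalized_string):
        # Let jieba segment the text; slicing the NormalizedString keeps offsets intact.
        return [normalized_string[w[1] : w[2]] for w in jieba.tokenize(str(normalized_string))]

    def pre_tokenize(self, pretok):
        pretok.split(self.jieba_split)


tok = Tokenizer(BPE())
tok.pre_tokenizer = PreTokenizer.custom(JiebaPreTokenizer())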

View File

@@ -1,18 +1,19 @@
-import time
 import argparse
+import logging
+import time
 
 from tqdm import tqdm
-import logging
 
 logging.getLogger("transformers").disabled = True
 logging.getLogger("transformers.tokenization_utils").disabled = True
 
-from tokenizers import Tokenizer, pre_tokenizers, decoders
+from tokenizers import Tokenizer, decoders, pre_tokenizers
 from tokenizers.models import BPE, WordPiece
-from tokenizers.processors import BertProcessing
 from tokenizers.normalizers import BertNormalizer
+from tokenizers.processors import BertProcessing
+from transformers import BertTokenizer, GPT2Tokenizer
+
 
-from transformers import GPT2Tokenizer, BertTokenizer
 
 parser = argparse.ArgumentParser()
 parser.add_argument("--type", default="gpt2", type=str, help="The type of tokenizer (bert|gpt2)")

View File

@@ -3,6 +3,7 @@ import glob
 
 from tokenizers import BertWordPieceTokenizer
 
+
 parser = argparse.ArgumentParser()
 parser.add_argument(
     "--files",
@@ -19,9 +20,7 @@ parser.add_argument(
     type=str,
     help="Path to the output directory, where the files will be saved",
 )
-parser.add_argument(
-    "--name", default="bert-wordpiece", type=str, help="The name of the output vocab files"
-)
+parser.add_argument("--name", default="bert-wordpiece", type=str, help="The name of the output vocab files")
 
 args = parser.parse_args()
 files = glob.glob(args.files)
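
Downstream of these arguments, a script like this trains and saves the vocabulary. A hedged sketch of that flow; the data glob and hyperparameters are illustrative, not from this commit:

from glob import glob

from tokenizers import BertWordPieceTokenizer

files = glob("data/*.txt")  # hypothetical input files
tokenizer = BertWordPieceTokenizer()
tokenizer.train(files=files, vocab_size=30000, min_frequency=2)
tokenizer.save_model("out", "bert-wordpiece")  # writes out/bert-wordpiece-vocab.txt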

View File

@@ -4,6 +4,7 @@ from os.path import join
 
 from tokenizers import ByteLevelBPETokenizer
 
+
 parser = argparse.ArgumentParser()
 parser.add_argument(
     "--files",
@@ -20,9 +21,7 @@ parser.add_argument(
     type=str,
     help="Path to the output directory, where the files will be saved",
 )
-parser.add_argument(
-    "--name", default="bpe-bytelevel", type=str, help="The name of the output vocab files"
-)
+parser.add_argument("--name", default="bpe-bytelevel", type=str, help="The name of the output vocab files")
 
 args = parser.parse_args()
 files = glob.glob(args.files)
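
The byte-level variant follows the same pattern. A brief sketch; the RoBERTa-style special tokens and vocab size are illustrative assumptions:

from glob import glob

from tokenizers import ByteLevelBPETokenizer

files = glob("data/*.txt")  # hypothetical input files
tokenizer = ByteLevelBPETokenizer()
tokenizer.train(files=files, vocab_size=52000, min_frequency=2,
                special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"])
tokenizer.save_model("out", "bpe-bytelevel")  # writes a vocab.json + merges.txt pair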

View File

@@ -1,5 +1,7 @@
 import datasets
-from tokenizers import normalizers, pre_tokenizers, Tokenizer, models, trainers
+
+from tokenizers import Tokenizer, models, normalizers, pre_tokenizers, trainers
+
 
 # Build a tokenizer
 bpe_tokenizer = Tokenizer(models.BPE())
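
Since this example pairs datasets with a trainer, the usual next step is feeding batches to train_from_iterator. A sketch under assumed choices; the dataset name, vocab size, and special tokens are placeholders:

import datasets

from tokenizers import Tokenizer, models, trainers

bpe_tokenizer = Tokenizer(models.BPE())
# Hypothetical dataset; any dataset exposing a "text" column works the same way.
dataset = datasets.load_dataset("wikitext", "wikitext-103-raw-v1", split="train")


def batch_iterator(batch_size=1000):
    # Yield lists of raw strings so training streams without materializing the corpus.
    for i in range(0, len(dataset), batch_size):
        yield dataset[i : i + batch_size]["text"]


trainer = trainers.BpeTrainer(vocab_size=25000, special_tokens=["<unk>"])
bpe_tokenizer.train_from_iterator(batch_iterator(), trainer=trainer, length=len(dataset))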