Upgrading to black 20.8b1

Nicolas Patry
2020-09-24 15:00:20 +02:00
committed by Anthony MOI
parent dc1d0711cf
commit a410903051
11 changed files with 66 additions and 32 deletions
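
Almost every hunk below is mechanical: black 20.8b1 introduces the "magic trailing comma", so any call or literal that already ends in a trailing comma is now exploded to one element per line instead of being packed onto as few lines as possible. A minimal sketch of the effect (the `configure` helper is made up purely for illustration):

    def configure(**kwargs):
        # Hypothetical stand-in; only the formatting of the two call sites matters here.
        return kwargs

    # black 19.10b0 left a wrapped call with a trailing comma like this:
    configure(
        clean_text=True, handle_chinese_chars=True, strip_accents=True, lowercase=True,
    )

    # black 20.8b1 honours the magic trailing comma and puts one argument per line:
    configure(
        clean_text=True,
        handle_chinese_chars=True,
        strip_accents=True,
        lowercase=True,
    )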

View File

@@ -25,7 +25,7 @@ jobs:
           architecture: "x64"
       - name: Install dependencies
-        run: pip install black==19.10b0
+        run: pip install black==20.8b1
       - name: Check style
         working-directory: ./bindings/python

View File

@@ -70,13 +70,17 @@ elif args.type == "bert":
     tok_r = Tokenizer(WordPiece(args.vocab, unk_token="[UNK]", max_input_chars_per_word=100))
     tok_r.normalizer = BertNormalizer(
-        clean_text=True, handle_chinese_chars=True, strip_accents=True, lowercase=True,
+        clean_text=True,
+        handle_chinese_chars=True,
+        strip_accents=True,
+        lowercase=True,
     )
     # tok_r.pre_tokenizer = pre_tokenizers.Whitespace()
     tok_r.pre_tokenizer = pre_tokenizers.BertPreTokenizer()
     tok_r.decoder = decoders.WordPiece()
     tok_r.post_processor = BertProcessing(
-        ("[SEP]", tok_r.token_to_id("[SEP]")), ("[CLS]", tok_r.token_to_id("[CLS]")),
+        ("[SEP]", tok_r.token_to_id("[SEP]")),
+        ("[CLS]", tok_r.token_to_id("[CLS]")),
     )
 else:
     raise Exception(f"Unknown type {args.type}")

View File

@@ -32,7 +32,10 @@ if not files:
 # Initialize an empty tokenizer
 tokenizer = BertWordPieceTokenizer(
-    clean_text=True, handle_chinese_chars=True, strip_accents=True, lowercase=True,
+    clean_text=True,
+    handle_chinese_chars=True,
+    strip_accents=True,
+    lowercase=True,
 )
 # And then train

View File

@@ -9,7 +9,8 @@ TextInputSequence = str
 PreTokenizedInputSequence = Union[List[str], Tuple[str]]
 TextEncodeInput = Union[TextInputSequence, Tuple[TextInputSequence, TextInputSequence]]
 PreTokenizedEncodeInput = Union[
-    PreTokenizedInputSequence, Tuple[PreTokenizedInputSequence, PreTokenizedInputSequence],
+    PreTokenizedInputSequence,
+    Tuple[PreTokenizedInputSequence, PreTokenizedInputSequence],
 ]
 InputSequence = Union[TextInputSequence, PreTokenizedInputSequence]
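
As a reading aid only (not part of the diff), here is a sketch of the kinds of values these aliases describe; the variable names are made up:

    from typing import List, Tuple, Union

    TextInputSequence = str
    PreTokenizedInputSequence = Union[List[str], Tuple[str]]
    TextEncodeInput = Union[TextInputSequence, Tuple[TextInputSequence, TextInputSequence]]
    PreTokenizedEncodeInput = Union[
        PreTokenizedInputSequence,
        Tuple[PreTokenizedInputSequence, PreTokenizedInputSequence],
    ]

    single: TextEncodeInput = "Hello world"                     # one raw sentence
    pair: TextEncodeInput = ("Hello world", "How are you?")     # a sentence pair
    pretok_pair: PreTokenizedEncodeInput = (["Hello", "world"], ["How", "are", "you", "?"])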

View File

@@ -21,7 +21,8 @@ TextInputSequence = str
 PreTokenizedInputSequence = Union[List[str], Tuple[str]]
 TextEncodeInput = Union[TextInputSequence, Tuple[TextInputSequence, TextInputSequence]]
 PreTokenizedEncodeInput = Union[
-    PreTokenizedInputSequence, Tuple[PreTokenizedInputSequence, PreTokenizedInputSequence],
+    PreTokenizedInputSequence,
+    Tuple[PreTokenizedInputSequence, PreTokenizedInputSequence],
 ]
 InputSequence = Union[TextInputSequence, PreTokenizedInputSequence]
@@ -827,7 +828,10 @@ class Tokenizer:
         """
         pass
     def post_process(
-        self, encoding: Encoding, pair: Optional[Encoding] = None, add_special_tokens: bool = True,
+        self,
+        encoding: Encoding,
+        pair: Optional[Encoding] = None,
+        add_special_tokens: bool = True,
     ) -> Encoding:
         """Apply all the post-processing steps to the given encodings.

View File

@@ -21,7 +21,10 @@ class SentencePieceUnigramTokenizer(BaseTokenizer):
     """
     def __init__(
-        self, vocab: Optional[str] = None, replacement: str = "▁", add_prefix_space: bool = True,
+        self,
+        vocab: Optional[str] = None,
+        replacement: str = "▁",
+        add_prefix_space: bool = True,
     ):
         if vocab is not None:
             # Let Unigram(..) fail if only one of them is None
@@ -29,7 +32,12 @@ class SentencePieceUnigramTokenizer(BaseTokenizer):
         else:
             tokenizer = Tokenizer(Unigram())
-        tokenizer.normalizer = normalizers.Sequence([normalizers.Nmt(), normalizers.NFKC(),])
+        tokenizer.normalizer = normalizers.Sequence(
+            [
+                normalizers.Nmt(),
+                normalizers.NFKC(),
+            ]
+        )
         tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
             [
                 pre_tokenizers.WhitespaceSplit(),
@@ -60,7 +68,9 @@ class SentencePieceUnigramTokenizer(BaseTokenizer):
         """ Train the model using the given files """
         trainer = trainers.UnigramTrainer(
-            vocab_size=vocab_size, special_tokens=special_tokens, show_progress=show_progress,
+            vocab_size=vocab_size,
+            special_tokens=special_tokens,
+            show_progress=show_progress,
         )
         if isinstance(files, str):
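
For context (not part of the commit), a hedged sketch of how this implementation is typically driven; the corpus path and vocabulary size below are made up:

    from tokenizers import SentencePieceUnigramTokenizer

    # Uses the defaults reformatted above: vocab=None, replacement="▁", add_prefix_space=True.
    tokenizer = SentencePieceUnigramTokenizer()
    tokenizer.train(
        files=["data/corpus.txt"],  # hypothetical path
        vocab_size=8000,
        show_progress=True,
    )
    print(tokenizer.encode("Hello world").tokens)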

View File

@@ -19,7 +19,10 @@ class TestBPE:
         BPE(vocab=vocab)
         BPE(merges=merges)
-        assert isinstance(pickle.loads(pickle.dumps(BPE(vocab, merges))), BPE,)
+        assert isinstance(
+            pickle.loads(pickle.dumps(BPE(vocab, merges))),
+            BPE,
+        )
         # Deprecated calls in 0.9
         with pytest.deprecated_call():

View File

@@ -22,7 +22,8 @@ class TestBertProcessing:
         assert isinstance(processor, PostProcessor)
         assert isinstance(processor, BertProcessing)
         assert isinstance(
-            pickle.loads(pickle.dumps(BertProcessing(("[SEP]", 0), ("[CLS]", 1)))), BertProcessing,
+            pickle.loads(pickle.dumps(BertProcessing(("[SEP]", 0), ("[CLS]", 1)))),
+            BertProcessing,
         )
     def test_processing(self):
@@ -94,7 +95,9 @@ class TestTemplateProcessing:
     def get_roberta(self):
         return TemplateProcessing(
-            seq_a="<s> $0 </s>", seq_b="</s> $0 </s>", special_tokens=[("<s>", 0), ("</s>", 1)],
+            seq_a="<s> $0 </s>",
+            seq_b="</s> $0 </s>",
+            special_tokens=[("<s>", 0), ("</s>", 1)],
         )
     def get_t5_squad(self):

View File

@@ -232,10 +232,12 @@ class TestTokenizer:
         # Numpy
         test_single(
-            np.array([["My", "name", "is", "John"], ["My", "name", "is", "Georges"]]), True,
+            np.array([["My", "name", "is", "John"], ["My", "name", "is", "Georges"]]),
+            True,
         )
         test_single(
-            np.array((("My", "name", "is", "John"), ("My", "name", "is", "Georges"))), True,
+            np.array((("My", "name", "is", "John"), ("My", "name", "is", "Georges"))),
+            True,
         )
         test_pair(
             np.array(
@@ -276,7 +278,8 @@ class TestTokenizer:
         tokenizer.pre_tokenizer = ByteLevel(add_prefix_space=True)
         tokenizer.post_processor = RobertaProcessing(
-            ("</s>", tokenizer.token_to_id("</s>")), ("<s>", tokenizer.token_to_id("<s>")),
+            ("</s>", tokenizer.token_to_id("</s>")),
+            ("<s>", tokenizer.token_to_id("<s>")),
         )
         # Can encode with special tokens
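
The numpy cases in the first hunk of this file exercise batch encoding of pre-tokenized input. As a rough, hedged sketch (assuming `tokenizer` is an already-configured Tokenizer and that the boolean flag in the test maps to `is_pretokenized`), the equivalent direct call has this shape:

    import numpy as np

    batch = np.array([["My", "name", "is", "John"], ["My", "name", "is", "Georges"]])
    encodings = tokenizer.encode_batch(batch, is_pretokenized=True)
    print([e.tokens for e in encodings])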

View File

@@ -65,7 +65,10 @@ class TestByteLevelBPE:
     def test_lowerspace(self, roberta_files):
         tokenizer = ByteLevelBPETokenizer.from_file(
-            roberta_files["vocab"], roberta_files["merges"], add_prefix_space=True, lowercase=True,
+            roberta_files["vocab"],
+            roberta_files["merges"],
+            add_prefix_space=True,
+            lowercase=True,
         )
         output = tokenizer.encode("The Quick Brown Fox Jumps Over The Lazy Dog")