Upgrading to black 20.8b1

Nicolas Patry
2020-09-24 15:00:20 +02:00
committed by Anthony MOI
parent dc1d0711cf
commit a410903051
11 changed files with 66 additions and 32 deletions
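
Nearly every hunk below is black's new "magic trailing comma" at work: starting with 20.8b1, any call or literal that already ends in a trailing comma is kept exploded, one element per line, instead of being collapsed back onto a single line. A minimal sketch of the behaviour, assuming black==20.8b1 is installed (format_str and FileMode are black's public programmatic entry points):

# pip install black==20.8b1
import black

src = "f(\n    a=1, b=2, c=3,\n)\n"
print(black.format_str(src, mode=black.FileMode()))
# 19.10b0 would collapse this call onto a single line; 20.8b1 respects the
# trailing comma and keeps one argument per line:
# f(
#     a=1,
#     b=2,
#     c=3,
# )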

View File

@@ -25,7 +25,7 @@ jobs:
           architecture: "x64"
       - name: Install dependencies
-        run: pip install black==19.10b0
+        run: pip install black==20.8b1
      - name: Check style
        working-directory: ./bindings/python

View File

@@ -70,13 +70,17 @@ elif args.type == "bert":
     tok_r = Tokenizer(WordPiece(args.vocab, unk_token="[UNK]", max_input_chars_per_word=100))
     tok_r.normalizer = BertNormalizer(
-        clean_text=True, handle_chinese_chars=True, strip_accents=True, lowercase=True,
+        clean_text=True,
+        handle_chinese_chars=True,
+        strip_accents=True,
+        lowercase=True,
     )
     # tok_r.pre_tokenizer = pre_tokenizers.Whitespace()
     tok_r.pre_tokenizer = pre_tokenizers.BertPreTokenizer()
     tok_r.decoder = decoders.WordPiece()
     tok_r.post_processor = BertProcessing(
-        ("[SEP]", tok_r.token_to_id("[SEP]")), ("[CLS]", tok_r.token_to_id("[CLS]")),
+        ("[SEP]", tok_r.token_to_id("[SEP]")),
+        ("[CLS]", tok_r.token_to_id("[CLS]")),
     )
 else:
     raise Exception(f"Unknown type {args.type}")

View File

@@ -32,7 +32,10 @@ if not files:
 # Initialize an empty tokenizer
 tokenizer = BertWordPieceTokenizer(
-    clean_text=True, handle_chinese_chars=True, strip_accents=True, lowercase=True,
+    clean_text=True,
+    handle_chinese_chars=True,
+    strip_accents=True,
+    lowercase=True,
 )

 # And then train

View File

@@ -9,7 +9,8 @@ TextInputSequence = str
 PreTokenizedInputSequence = Union[List[str], Tuple[str]]
 TextEncodeInput = Union[TextInputSequence, Tuple[TextInputSequence, TextInputSequence]]
 PreTokenizedEncodeInput = Union[
-    PreTokenizedInputSequence, Tuple[PreTokenizedInputSequence, PreTokenizedInputSequence],
+    PreTokenizedInputSequence,
+    Tuple[PreTokenizedInputSequence, PreTokenizedInputSequence],
 ]
 InputSequence = Union[TextInputSequence, PreTokenizedInputSequence]
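
For reference, a short sketch of the inputs these aliases describe, assuming tokenizers >= 0.9 and a serialized tokenizer at "tokenizer.json" (hypothetical path):

from tokenizers import Tokenizer

tokenizer = Tokenizer.from_file("tokenizer.json")

# TextEncodeInput: a raw string, or a pair of raw strings
tokenizer.encode("My name is John")
tokenizer.encode("My name is John", "My name is Georges")

# PreTokenizedEncodeInput: sequences already split into words,
# passed with is_pretokenized=True
tokenizer.encode(["My", "name", "is", "John"], is_pretokenized=True)
tokenizer.encode(
    ["My", "name", "is", "John"],
    ["My", "name", "is", "Georges"],
    is_pretokenized=True,
)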

View File

@@ -21,7 +21,8 @@ TextInputSequence = str
 PreTokenizedInputSequence = Union[List[str], Tuple[str]]
 TextEncodeInput = Union[TextInputSequence, Tuple[TextInputSequence, TextInputSequence]]
 PreTokenizedEncodeInput = Union[
-    PreTokenizedInputSequence, Tuple[PreTokenizedInputSequence, PreTokenizedInputSequence],
+    PreTokenizedInputSequence,
+    Tuple[PreTokenizedInputSequence, PreTokenizedInputSequence],
 ]
 InputSequence = Union[TextInputSequence, PreTokenizedInputSequence]
@@ -827,7 +828,10 @@ class Tokenizer:
         """
         pass
     def post_process(
-        self, encoding: Encoding, pair: Optional[Encoding] = None, add_special_tokens: bool = True,
+        self,
+        encoding: Encoding,
+        pair: Optional[Encoding] = None,
+        add_special_tokens: bool = True,
     ) -> Encoding:
         """Apply all the post-processing steps to the given encodings.

View File

@@ -21,7 +21,10 @@ class SentencePieceUnigramTokenizer(BaseTokenizer):
     """

     def __init__(
-        self, vocab: Optional[str] = None, replacement: str = "▁", add_prefix_space: bool = True,
+        self,
+        vocab: Optional[str] = None,
+        replacement: str = "▁",
+        add_prefix_space: bool = True,
     ):
         if vocab is not None:
             # Let Unigram(..) fail if only one of them is None
@@ -29,7 +32,12 @@ class SentencePieceUnigramTokenizer(BaseTokenizer):
         else:
             tokenizer = Tokenizer(Unigram())

-        tokenizer.normalizer = normalizers.Sequence([normalizers.Nmt(), normalizers.NFKC(),])
+        tokenizer.normalizer = normalizers.Sequence(
+            [
+                normalizers.Nmt(),
+                normalizers.NFKC(),
+            ]
+        )
         tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
             [
                 pre_tokenizers.WhitespaceSplit(),
@@ -60,7 +68,9 @@ class SentencePieceUnigramTokenizer(BaseTokenizer):
         """ Train the model using the given files """
         trainer = trainers.UnigramTrainer(
-            vocab_size=vocab_size, special_tokens=special_tokens, show_progress=show_progress,
+            vocab_size=vocab_size,
+            special_tokens=special_tokens,
+            show_progress=show_progress,
         )
         if isinstance(files, str):
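
A minimal usage sketch for the class touched above, assuming a plain-text corpus at "corpus.txt" (hypothetical path) and the train() parameters forwarded to the trainer in this file:

from tokenizers.implementations import SentencePieceUnigramTokenizer

# Starts from an empty Unigram model; a vocab file can be passed instead.
tokenizer = SentencePieceUnigramTokenizer()
tokenizer.train(
    files=["corpus.txt"],
    vocab_size=8000,
    special_tokens=["<unk>"],
)
print(tokenizer.encode("My name is John").tokens)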

View File

@@ -19,7 +19,10 @@ class TestBPE:
         BPE(vocab=vocab)
         BPE(merges=merges)
-        assert isinstance(pickle.loads(pickle.dumps(BPE(vocab, merges))), BPE,)
+        assert isinstance(
+            pickle.loads(pickle.dumps(BPE(vocab, merges))),
+            BPE,
+        )

         # Deprecated calls in 0.9
         with pytest.deprecated_call():
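
The pickle round-trip asserted above is what lets a model cross process boundaries, e.g. with multiprocessing. A sketch, assuming vocab/merges files on disk (hypothetical paths) and the BPE.from_file constructor that replaces the deprecated path-based BPE(...) call:

import pickle

from tokenizers.models import BPE

bpe = BPE.from_file("vocab.json", "merges.txt")
restored = pickle.loads(pickle.dumps(bpe))
assert isinstance(restored, BPE)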

View File

@@ -22,7 +22,8 @@ class TestBertProcessing:
         assert isinstance(processor, PostProcessor)
         assert isinstance(processor, BertProcessing)
         assert isinstance(
-            pickle.loads(pickle.dumps(BertProcessing(("[SEP]", 0), ("[CLS]", 1)))), BertProcessing,
+            pickle.loads(pickle.dumps(BertProcessing(("[SEP]", 0), ("[CLS]", 1)))),
+            BertProcessing,
         )

     def test_processing(self):
@@ -94,7 +95,9 @@ class TestTemplateProcessing:
     def get_roberta(self):
         return TemplateProcessing(
-            seq_a="<s> $0 </s>", seq_b="</s> $0 </s>", special_tokens=[("<s>", 0), ("</s>", 1)],
+            seq_a="<s> $0 </s>",
+            seq_b="</s> $0 </s>",
+            special_tokens=[("<s>", 0), ("</s>", 1)],
         )

     def get_t5_squad(self):
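
For context, a sketch of the RoBERTa-style template built by get_roberta above, using the seq_a/seq_b keyword names as they appear in this revision (later releases rename these parameters):

from tokenizers.processors import TemplateProcessing

processor = TemplateProcessing(
    seq_a="<s> $0 </s>",                       # template for a single sequence
    seq_b="</s> $0 </s>",                      # appended for the second sequence of a pair
    special_tokens=[("<s>", 0), ("</s>", 1)],  # (token, id) pairs referenced by the templates
)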

View File

@@ -232,10 +232,12 @@ class TestTokenizer:
         # Numpy
         test_single(
-            np.array([["My", "name", "is", "John"], ["My", "name", "is", "Georges"]]), True,
+            np.array([["My", "name", "is", "John"], ["My", "name", "is", "Georges"]]),
+            True,
         )
         test_single(
-            np.array((("My", "name", "is", "John"), ("My", "name", "is", "Georges"))), True,
+            np.array((("My", "name", "is", "John"), ("My", "name", "is", "Georges"))),
+            True,
         )
         test_pair(
             np.array(
@@ -276,7 +278,8 @@ class TestTokenizer:
         tokenizer.pre_tokenizer = ByteLevel(add_prefix_space=True)
         tokenizer.post_processor = RobertaProcessing(
-            ("</s>", tokenizer.token_to_id("</s>")), ("<s>", tokenizer.token_to_id("<s>")),
+            ("</s>", tokenizer.token_to_id("</s>")),
+            ("<s>", tokenizer.token_to_id("<s>")),
         )

         # Can encode with special tokens

View File

@@ -65,7 +65,10 @@ class TestByteLevelBPE:
     def test_lowerspace(self, roberta_files):
         tokenizer = ByteLevelBPETokenizer.from_file(
-            roberta_files["vocab"], roberta_files["merges"], add_prefix_space=True, lowercase=True,
+            roberta_files["vocab"],
+            roberta_files["merges"],
+            add_prefix_space=True,
+            lowercase=True,
         )
         output = tokenizer.encode("The Quick Brown Fox Jumps Over The Lazy Dog")