Upgrading to black 20.8b1

Author: Nicolas Patry
Date: 2020-09-24 15:00:20 +02:00
Committed by: Anthony MOI
Parent: dc1d0711cf
Commit: a410903051
11 changed files with 66 additions and 32 deletions
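
Nearly every hunk below is mechanical reformatting from the black upgrade rather than a functional change. Two behaviours of the black 20.8 series account for the pattern: the "magic trailing comma" (a call or literal that already ends with a trailing comma is kept exploded, one element per line, even when it would fit on a single line) and docstring normalization (the whitespace right after the opening triple quotes is stripped). A minimal sketch of the trailing-comma rule, using hypothetical values rather than code from this repository:

# Input accepted by both versions:
xs = [1, 2, 3,]

# black 19.10 joins the elements and drops the trailing comma:
xs = [1, 2, 3]

# black 20.8 reads the pre-existing trailing comma as a request to
# keep one element per line:
xs = [
    1,
    2,
    3,
]

Dropping the trailing comma is how a call can still be collapsed back onto one line under black 20.8, provided it fits the configured line length.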

View File

@@ -25,7 +25,7 @@ jobs:
architecture: "x64"
- name: Install dependencies
run: pip install black==19.10b0
run: pip install black==20.8b1
- name: Check style
working-directory: ./bindings/python

View File

@@ -70,13 +70,17 @@ elif args.type == "bert":
tok_r = Tokenizer(WordPiece(args.vocab, unk_token="[UNK]", max_input_chars_per_word=100))
tok_r.normalizer = BertNormalizer(
clean_text=True, handle_chinese_chars=True, strip_accents=True, lowercase=True,
clean_text=True,
handle_chinese_chars=True,
strip_accents=True,
lowercase=True,
)
# tok_r.pre_tokenizer = pre_tokenizers.Whitespace()
tok_r.pre_tokenizer = pre_tokenizers.BertPreTokenizer()
tok_r.decoder = decoders.WordPiece()
tok_r.post_processor = BertProcessing(
("[SEP]", tok_r.token_to_id("[SEP]")), ("[CLS]", tok_r.token_to_id("[CLS]")),
("[SEP]", tok_r.token_to_id("[SEP]")),
("[CLS]", tok_r.token_to_id("[CLS]")),
)
else:
raise Exception(f"Unknown type {args.type}")

View File

@@ -32,7 +32,10 @@ if not files:
# Initialize an empty tokenizer
tokenizer = BertWordPieceTokenizer(
clean_text=True, handle_chinese_chars=True, strip_accents=True, lowercase=True,
clean_text=True,
handle_chinese_chars=True,
strip_accents=True,
lowercase=True,
)
# And then train

View File

@@ -9,7 +9,8 @@ TextInputSequence = str
PreTokenizedInputSequence = Union[List[str], Tuple[str]]
TextEncodeInput = Union[TextInputSequence, Tuple[TextInputSequence, TextInputSequence]]
PreTokenizedEncodeInput = Union[
PreTokenizedInputSequence, Tuple[PreTokenizedInputSequence, PreTokenizedInputSequence],
PreTokenizedInputSequence,
Tuple[PreTokenizedInputSequence, PreTokenizedInputSequence],
]
InputSequence = Union[TextInputSequence, PreTokenizedInputSequence]

View File

@@ -21,7 +21,8 @@ TextInputSequence = str
PreTokenizedInputSequence = Union[List[str], Tuple[str]]
TextEncodeInput = Union[TextInputSequence, Tuple[TextInputSequence, TextInputSequence]]
PreTokenizedEncodeInput = Union[
PreTokenizedInputSequence, Tuple[PreTokenizedInputSequence, PreTokenizedInputSequence],
PreTokenizedInputSequence,
Tuple[PreTokenizedInputSequence, PreTokenizedInputSequence],
]
InputSequence = Union[TextInputSequence, PreTokenizedInputSequence]
@@ -52,7 +53,7 @@ Range = Union[int, Tuple[int, int], slice]
Pattern = Union[str, Regex]
class PreTokenizedString:
""" PreTokenizedString
"""PreTokenizedString
Wrapper over a string, that provides a way to normalize, pre-tokenize, tokenize the
underlying string, while keeping track of the alignment information (offsets).
@@ -66,7 +67,7 @@ class PreTokenizedString:
"""
def __new__(sequence: str) -> PreTokenizedString:
""" Instantiate a new PreTokenizedString using the given str
"""Instantiate a new PreTokenizedString using the given str
Args:
sequence: str:
@@ -74,7 +75,7 @@ class PreTokenizedString:
"""
pass
def split(self, func: Callable[[index, NormalizedString], List[NormalizedString]]):
""" Split the PreTokenizedString using the given `func`
"""Split the PreTokenizedString using the given `func`
Args:
func: Callable[[index, NormalizedString], List[NormalizedString]]:
@@ -87,7 +88,7 @@ class PreTokenizedString:
"""
pass
def normalize(self, func: Callable[[NormalizedString], None]):
""" Normalize each split of the `PreTokenizedString` using the given `func`
"""Normalize each split of the `PreTokenizedString` using the given `func`
Args:
func: Callable[[NormalizedString], None]:
@@ -97,7 +98,7 @@ class PreTokenizedString:
"""
pass
def tokenize(self, func: Callable[[str], List[Token]]):
""" Tokenize each split of the `PreTokenizedString` using the given `func`
"""Tokenize each split of the `PreTokenizedString` using the given `func`
Args:
func: Callable[[str], List[Token]]:
@@ -106,7 +107,7 @@ class PreTokenizedString:
"""
pass
def to_encoding(self, type_id: int = 0, word_idx: Optional[int] = None) -> Encoding:
""" Return an Encoding generated from this PreTokenizedString
"""Return an Encoding generated from this PreTokenizedString
Args:
type_id: int = 0:
@@ -126,7 +127,7 @@ class PreTokenizedString:
offset_referential: OffsetReferential = OffsetReferential.ORIGINAL,
offset_type: OffsetType = OffsetType.CHAR,
) -> List[Split]:
""" Get the splits currently managed by the PreTokenizedString
"""Get the splits currently managed by the PreTokenizedString
Args:
offset_referential: OffsetReferential:
@@ -145,7 +146,7 @@ class PreTokenizedString:
pass
class NormalizedString:
""" NormalizedString
"""NormalizedString
A NormalizedString takes care of modifying an "original" string, to obtain a "normalized" one.
While making all the requested modifications, it keeps track of the alignment information
@@ -153,7 +154,7 @@ class NormalizedString:
"""
def __new__(sequence: str) -> NormalizedString:
""" Instantiate a new NormalizedString using the given str
"""Instantiate a new NormalizedString using the given str
Args:
sequence: str:
@@ -214,14 +215,14 @@ class NormalizedString:
""" Calls the given function for each character of the string """
pass
def map(self, func: Callable[[str], str]):
""" Calls the given function for each character of the string
"""Calls the given function for each character of the string
Replaces each character of the string using the returned value. Each
returned value **must** be a str of length 1 (ie a character).
"""
pass
def split(self, pattern: Pattern, behavior: SplitDelimiterBehavior) -> List[NormalizedString]:
""" Split the NormalizedString using the given pattern and the specified behavior
"""Split the NormalizedString using the given pattern and the specified behavior
Args:
pattern: Pattern:
@@ -235,7 +236,7 @@ class NormalizedString:
"""
pass
def replace(self, pattern: Pattern, content: str):
""" Replace the content of the given pattern with the provided content
"""Replace the content of the given pattern with the provided content
Args:
pattern: Pattern:
@@ -827,7 +828,10 @@ class Tokenizer:
"""
pass
def post_process(
self, encoding: Encoding, pair: Optional[Encoding] = None, add_special_tokens: bool = True,
self,
encoding: Encoding,
pair: Optional[Encoding] = None,
add_special_tokens: bool = True,
) -> Encoding:
"""Apply all the post-processing steps to the given encodings.

View File

@@ -21,7 +21,10 @@ class SentencePieceUnigramTokenizer(BaseTokenizer):
"""
def __init__(
self, vocab: Optional[str] = None, replacement: str = "▁", add_prefix_space: bool = True,
self,
vocab: Optional[str] = None,
replacement: str = "▁",
add_prefix_space: bool = True,
):
if vocab is not None:
# Let Unigram(..) fail if only one of them is None
@@ -29,7 +32,12 @@ class SentencePieceUnigramTokenizer(BaseTokenizer):
else:
tokenizer = Tokenizer(Unigram())
tokenizer.normalizer = normalizers.Sequence([normalizers.Nmt(), normalizers.NFKC(),])
tokenizer.normalizer = normalizers.Sequence(
[
normalizers.Nmt(),
normalizers.NFKC(),
]
)
tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
[
pre_tokenizers.WhitespaceSplit(),
@@ -60,7 +68,9 @@ class SentencePieceUnigramTokenizer(BaseTokenizer):
""" Train the model using the given files """
trainer = trainers.UnigramTrainer(
vocab_size=vocab_size, special_tokens=special_tokens, show_progress=show_progress,
vocab_size=vocab_size,
special_tokens=special_tokens,
show_progress=show_progress,
)
if isinstance(files, str):

View File

@@ -78,7 +78,7 @@ class BPE(Model):
def from_file(vocab_filename: str, merges_filename: str, **kwargs) -> BPE:
"""
Convenient method to initialize a BPE from files
Roughly equivalent to
Roughly equivalent to
def from_file(vocab_filename, merges_filename, **kwargs):
vocab, merges = BPE.read_file(vocab_filename, merges_filename)
@@ -116,7 +116,7 @@ class WordPiece(Model):
def from_file(vocab_filename: str, **kwargs) -> WordPiece:
"""
Convenient method to initialize a WordPiece from file
Roughly equivalent to
Roughly equivalent to
def from_file(vocab_filename, **kwargs):
vocab = WordPiece.read_file(vocab_filename)
@@ -147,7 +147,7 @@ class WordLevel(Model):
def from_file(vocab_filename: str, **kwargs) -> WordLevel:
"""
Convenient method to initialize a WordLevel from file
Roughly equivalent to
Roughly equivalent to
def from_file(vocab_filename, **kwargs):
vocab = WordLevel.read_file(vocab_filename)

View File

@@ -19,7 +19,10 @@ class TestBPE:
BPE(vocab=vocab)
BPE(merges=merges)
assert isinstance(pickle.loads(pickle.dumps(BPE(vocab, merges))), BPE,)
assert isinstance(
pickle.loads(pickle.dumps(BPE(vocab, merges))),
BPE,
)
# Deprecated calls in 0.9
with pytest.deprecated_call():

View File

@@ -22,7 +22,8 @@ class TestBertProcessing:
assert isinstance(processor, PostProcessor)
assert isinstance(processor, BertProcessing)
assert isinstance(
pickle.loads(pickle.dumps(BertProcessing(("[SEP]", 0), ("[CLS]", 1)))), BertProcessing,
pickle.loads(pickle.dumps(BertProcessing(("[SEP]", 0), ("[CLS]", 1)))),
BertProcessing,
)
def test_processing(self):
@@ -94,7 +95,9 @@ class TestTemplateProcessing:
def get_roberta(self):
return TemplateProcessing(
seq_a="<s> $0 </s>", seq_b="</s> $0 </s>", special_tokens=[("<s>", 0), ("</s>", 1)],
seq_a="<s> $0 </s>",
seq_b="</s> $0 </s>",
special_tokens=[("<s>", 0), ("</s>", 1)],
)
def get_t5_squad(self):

View File

@@ -232,10 +232,12 @@ class TestTokenizer:
# Numpy
test_single(
np.array([["My", "name", "is", "John"], ["My", "name", "is", "Georges"]]), True,
np.array([["My", "name", "is", "John"], ["My", "name", "is", "Georges"]]),
True,
)
test_single(
np.array((("My", "name", "is", "John"), ("My", "name", "is", "Georges"))), True,
np.array((("My", "name", "is", "John"), ("My", "name", "is", "Georges"))),
True,
)
test_pair(
np.array(
@@ -276,7 +278,8 @@ class TestTokenizer:
tokenizer.pre_tokenizer = ByteLevel(add_prefix_space=True)
tokenizer.post_processor = RobertaProcessing(
("</s>", tokenizer.token_to_id("</s>")), ("<s>", tokenizer.token_to_id("<s>")),
("</s>", tokenizer.token_to_id("</s>")),
("<s>", tokenizer.token_to_id("<s>")),
)
# Can encode with special tokens

View File

@@ -65,7 +65,10 @@ class TestByteLevelBPE:
def test_lowerspace(self, roberta_files):
tokenizer = ByteLevelBPETokenizer.from_file(
roberta_files["vocab"], roberta_files["merges"], add_prefix_space=True, lowercase=True,
roberta_files["vocab"],
roberta_files["merges"],
add_prefix_space=True,
lowercase=True,
)
output = tokenizer.encode("The Quick Brown Fox Jumps Over The Lazy Dog")