Upgrading to black 20.8b1

Author: Nicolas Patry
Date: 2020-09-24 15:00:20 +02:00
Committed by: Anthony MOI
Parent: dc1d0711cf
Commit: a410903051

11 changed files with 66 additions and 32 deletions
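Nearly every hunk below is mechanical: black 20.8b1 honours a pre-existing ("magic") trailing comma by exploding the bracket pair onto one element per line, and the reformat also drops the space that used to follow the opening triple quotes of multi-line docstrings. A minimal sketch of both effects, using a hypothetical configure() helper rather than code from this repository:

# Black 19.10b0 left this call on one line despite the trailing comma:
#   options = configure(clean_text=True, handle_chinese_chars=True, lowercase=True,)
# Black 20.8b1 treats the pre-existing trailing comma as a request to explode
# the call, one argument per line, as in the hunks below.


def configure(**kwargs):
    """Collect keyword options.

    Note the docstring starts right after the opening quotes; the space that
    the old code carried there is removed in this upgrade.
    """
    return kwargs


options = configure(
    clean_text=True,
    handle_chinese_chars=True,
    lowercase=True,
)

In this diff the docstring change shows up in the .pyi stubs; the remaining hunks are the trailing-comma rule applied across the Python bindings, examples, and tests.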

View File

@@ -25,7 +25,7 @@ jobs:
           architecture: "x64"
       - name: Install dependencies
-        run: pip install black==19.10b0
+        run: pip install black==20.8b1
       - name: Check style
         working-directory: ./bindings/python

View File

@@ -70,13 +70,17 @@ elif args.type == "bert":
     tok_r = Tokenizer(WordPiece(args.vocab, unk_token="[UNK]", max_input_chars_per_word=100))
     tok_r.normalizer = BertNormalizer(
-        clean_text=True, handle_chinese_chars=True, strip_accents=True, lowercase=True,
+        clean_text=True,
+        handle_chinese_chars=True,
+        strip_accents=True,
+        lowercase=True,
     )
     # tok_r.pre_tokenizer = pre_tokenizers.Whitespace()
     tok_r.pre_tokenizer = pre_tokenizers.BertPreTokenizer()
     tok_r.decoder = decoders.WordPiece()
     tok_r.post_processor = BertProcessing(
-        ("[SEP]", tok_r.token_to_id("[SEP]")), ("[CLS]", tok_r.token_to_id("[CLS]")),
+        ("[SEP]", tok_r.token_to_id("[SEP]")),
+        ("[CLS]", tok_r.token_to_id("[CLS]")),
     )
 else:
     raise Exception(f"Unknown type {args.type}")

View File

@@ -32,7 +32,10 @@ if not files:
 # Initialize an empty tokenizer
 tokenizer = BertWordPieceTokenizer(
-    clean_text=True, handle_chinese_chars=True, strip_accents=True, lowercase=True,
+    clean_text=True,
+    handle_chinese_chars=True,
+    strip_accents=True,
+    lowercase=True,
 )
 # And then train

View File

@@ -9,7 +9,8 @@ TextInputSequence = str
 PreTokenizedInputSequence = Union[List[str], Tuple[str]]
 TextEncodeInput = Union[TextInputSequence, Tuple[TextInputSequence, TextInputSequence]]
 PreTokenizedEncodeInput = Union[
-    PreTokenizedInputSequence, Tuple[PreTokenizedInputSequence, PreTokenizedInputSequence],
+    PreTokenizedInputSequence,
+    Tuple[PreTokenizedInputSequence, PreTokenizedInputSequence],
 ]
 InputSequence = Union[TextInputSequence, PreTokenizedInputSequence]

View File

@@ -21,7 +21,8 @@ TextInputSequence = str
 PreTokenizedInputSequence = Union[List[str], Tuple[str]]
 TextEncodeInput = Union[TextInputSequence, Tuple[TextInputSequence, TextInputSequence]]
 PreTokenizedEncodeInput = Union[
-    PreTokenizedInputSequence, Tuple[PreTokenizedInputSequence, PreTokenizedInputSequence],
+    PreTokenizedInputSequence,
+    Tuple[PreTokenizedInputSequence, PreTokenizedInputSequence],
 ]
 InputSequence = Union[TextInputSequence, PreTokenizedInputSequence]
@@ -52,7 +53,7 @@ Range = Union[int, Tuple[int, int], slice]
 Pattern = Union[str, Regex]
 class PreTokenizedString:
-    """ PreTokenizedString
+    """PreTokenizedString
     Wrapper over a string, that provides a way to normalize, pre-tokenize, tokenize the
     underlying string, while keeping track of the alignment information (offsets).
@@ -66,7 +67,7 @@ class PreTokenizedString:
     """
     def __new__(sequence: str) -> PreTokenizedString:
-        """ Instantiate a new PreTokenizedString using the given str
+        """Instantiate a new PreTokenizedString using the given str
         Args:
             sequence: str:
@@ -74,7 +75,7 @@ class PreTokenizedString:
         """
         pass
     def split(self, func: Callable[[index, NormalizedString], List[NormalizedString]]):
-        """ Split the PreTokenizedString using the given `func`
+        """Split the PreTokenizedString using the given `func`
         Args:
             func: Callable[[index, NormalizedString], List[NormalizedString]]:
@@ -87,7 +88,7 @@ class PreTokenizedString:
         """
         pass
     def normalize(self, func: Callable[[NormalizedString], None]):
-        """ Normalize each split of the `PreTokenizedString` using the given `func`
+        """Normalize each split of the `PreTokenizedString` using the given `func`
         Args:
             func: Callable[[NormalizedString], None]:
@@ -97,7 +98,7 @@ class PreTokenizedString:
         """
         pass
     def tokenize(self, func: Callable[[str], List[Token]]):
-        """ Tokenize each split of the `PreTokenizedString` using the given `func`
+        """Tokenize each split of the `PreTokenizedString` using the given `func`
         Args:
             func: Callable[[str], List[Token]]:
@@ -106,7 +107,7 @@ class PreTokenizedString:
         """
         pass
     def to_encoding(self, type_id: int = 0, word_idx: Optional[int] = None) -> Encoding:
-        """ Return an Encoding generated from this PreTokenizedString
+        """Return an Encoding generated from this PreTokenizedString
         Args:
             type_id: int = 0:
@@ -126,7 +127,7 @@ class PreTokenizedString:
         offset_referential: OffsetReferential = OffsetReferential.ORIGINAL,
         offset_type: OffsetType = OffsetType.CHAR,
     ) -> List[Split]:
-        """ Get the splits currently managed by the PreTokenizedString
+        """Get the splits currently managed by the PreTokenizedString
         Args:
             offset_referential: OffsetReferential:
@@ -145,7 +146,7 @@ class PreTokenizedString:
         pass
 class NormalizedString:
-    """ NormalizedString
+    """NormalizedString
     A NormalizedString takes care of modifying an "original" string, to obtain a "normalized" one.
     While making all the requested modifications, it keeps track of the alignment information
@@ -153,7 +154,7 @@ class NormalizedString:
     """
     def __new__(sequence: str) -> NormalizedString:
-        """ Instantiate a new NormalizedString using the given str
+        """Instantiate a new NormalizedString using the given str
         Args:
             sequence: str:
@@ -214,14 +215,14 @@ class NormalizedString:
         """ Calls the given function for each character of the string """
         pass
     def map(self, func: Callable[[str], str]):
-        """ Calls the given function for each character of the string
+        """Calls the given function for each character of the string
         Replaces each character of the string using the returned value. Each
         returned value **must** be a str of length 1 (ie a character).
         """
         pass
     def split(self, pattern: Pattern, behavior: SplitDelimiterBehavior) -> List[NormalizedString]:
-        """ Split the NormalizedString using the given pattern and the specified behavior
+        """Split the NormalizedString using the given pattern and the specified behavior
         Args:
             pattern: Pattern:
@@ -235,7 +236,7 @@ class NormalizedString:
         """
         pass
     def replace(self, pattern: Pattern, content: str):
-        """ Replace the content of the given pattern with the provided content
+        """Replace the content of the given pattern with the provided content
         Args:
             pattern: Pattern:
@@ -827,7 +828,10 @@ class Tokenizer:
         """
         pass
     def post_process(
-        self, encoding: Encoding, pair: Optional[Encoding] = None, add_special_tokens: bool = True,
+        self,
+        encoding: Encoding,
+        pair: Optional[Encoding] = None,
+        add_special_tokens: bool = True,
     ) -> Encoding:
         """Apply all the post-processing steps to the given encodings.

View File

@@ -21,7 +21,10 @@ class SentencePieceUnigramTokenizer(BaseTokenizer):
     """
     def __init__(
-        self, vocab: Optional[str] = None, replacement: str = "▁", add_prefix_space: bool = True,
+        self,
+        vocab: Optional[str] = None,
+        replacement: str = "▁",
+        add_prefix_space: bool = True,
     ):
         if vocab is not None:
             # Let Unigram(..) fail if only one of them is None
@@ -29,7 +32,12 @@ class SentencePieceUnigramTokenizer(BaseTokenizer):
         else:
             tokenizer = Tokenizer(Unigram())
-        tokenizer.normalizer = normalizers.Sequence([normalizers.Nmt(), normalizers.NFKC(),])
+        tokenizer.normalizer = normalizers.Sequence(
+            [
+                normalizers.Nmt(),
+                normalizers.NFKC(),
+            ]
+        )
         tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
             [
                 pre_tokenizers.WhitespaceSplit(),
@@ -60,7 +68,9 @@ class SentencePieceUnigramTokenizer(BaseTokenizer):
         """ Train the model using the given files """
         trainer = trainers.UnigramTrainer(
-            vocab_size=vocab_size, special_tokens=special_tokens, show_progress=show_progress,
+            vocab_size=vocab_size,
+            special_tokens=special_tokens,
+            show_progress=show_progress,
         )
         if isinstance(files, str):

View File

@@ -19,7 +19,10 @@ class TestBPE:
         BPE(vocab=vocab)
         BPE(merges=merges)
-        assert isinstance(pickle.loads(pickle.dumps(BPE(vocab, merges))), BPE,)
+        assert isinstance(
+            pickle.loads(pickle.dumps(BPE(vocab, merges))),
+            BPE,
+        )
         # Deprecated calls in 0.9
         with pytest.deprecated_call():

View File

@@ -22,7 +22,8 @@ class TestBertProcessing:
         assert isinstance(processor, PostProcessor)
         assert isinstance(processor, BertProcessing)
         assert isinstance(
-            pickle.loads(pickle.dumps(BertProcessing(("[SEP]", 0), ("[CLS]", 1)))), BertProcessing,
+            pickle.loads(pickle.dumps(BertProcessing(("[SEP]", 0), ("[CLS]", 1)))),
+            BertProcessing,
         )
     def test_processing(self):
@@ -94,7 +95,9 @@ class TestTemplateProcessing:
     def get_roberta(self):
         return TemplateProcessing(
-            seq_a="<s> $0 </s>", seq_b="</s> $0 </s>", special_tokens=[("<s>", 0), ("</s>", 1)],
+            seq_a="<s> $0 </s>",
+            seq_b="</s> $0 </s>",
+            special_tokens=[("<s>", 0), ("</s>", 1)],
         )
     def get_t5_squad(self):

View File

@@ -232,10 +232,12 @@ class TestTokenizer:
         # Numpy
         test_single(
-            np.array([["My", "name", "is", "John"], ["My", "name", "is", "Georges"]]), True,
+            np.array([["My", "name", "is", "John"], ["My", "name", "is", "Georges"]]),
+            True,
         )
         test_single(
-            np.array((("My", "name", "is", "John"), ("My", "name", "is", "Georges"))), True,
+            np.array((("My", "name", "is", "John"), ("My", "name", "is", "Georges"))),
+            True,
         )
         test_pair(
             np.array(
@@ -276,7 +278,8 @@ class TestTokenizer:
         tokenizer.pre_tokenizer = ByteLevel(add_prefix_space=True)
         tokenizer.post_processor = RobertaProcessing(
-            ("</s>", tokenizer.token_to_id("</s>")), ("<s>", tokenizer.token_to_id("<s>")),
+            ("</s>", tokenizer.token_to_id("</s>")),
+            ("<s>", tokenizer.token_to_id("<s>")),
         )
         # Can encode with special tokens

View File

@@ -65,7 +65,10 @@ class TestByteLevelBPE:
     def test_lowerspace(self, roberta_files):
         tokenizer = ByteLevelBPETokenizer.from_file(
-            roberta_files["vocab"], roberta_files["merges"], add_prefix_space=True, lowercase=True,
+            roberta_files["vocab"],
+            roberta_files["merges"],
+            add_prefix_space=True,
+            lowercase=True,
         )
         output = tokenizer.encode("The Quick Brown Fox Jumps Over The Lazy Dog")