Removed now-wrong code in convert.py, fixed strange black magic.

Nicolas Patry
2020-09-24 08:40:48 +02:00
parent 4aa750bb47
commit 598ce61229
2 changed files with 9 additions and 59 deletions

@@ -79,24 +79,9 @@ class SpmConverter(Converter):
         if model_type == 1:
             tokenizer = Tokenizer(Unigram(vocab, unk_id))
         elif model_type == 2:
-            vocab, merges = SentencePieceExtractor(
-                self.original_tokenizer.vocab_file
-            ).extract()
-            # Open output files and let's extract model information
-            actual_merges = {}
-            for id_merge, (a, b) in enumerate(merges):
-                id_a = vocab[a]
-                id_b = vocab[b]
-                id_ab = vocab[a + b]
-                id_ab = vocab[a + b]
-                actual_merges[(id_a, id_b)] = (id_merge, id_ab)
+            vocab, merges = SentencePieceExtractor(self.original_tokenizer.vocab_file).extract()
             tokenizer = Tokenizer(
-                BPE(
-                    vocab,
-                    actual_merges,
-                    unk_token=proto.trainer_spec.unk_piece,
-                    fuse_unk=True,
-                )
+                BPE(vocab, merges, unk_token=proto.trainer_spec.unk_piece, fuse_unk=True,)
             )
         else:
             raise Exception(
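
The rewritten branch feeds the string-keyed vocab and merges from SentencePieceExtractor straight into BPE, instead of re-encoding each merge pair into ids by hand (the deleted actual_merges "black magic"). A minimal sketch of the new path, with a hypothetical toy vocab and merge list standing in for the extractor's output:

from tokenizers import Tokenizer
from tokenizers.models import BPE

# Hypothetical stand-ins for SentencePieceExtractor(...).extract():
# vocab maps token strings to ids; merges is an ordered list of string pairs.
vocab = {"<unk>": 0, "l": 1, "o": 2, "w": 3, "lo": 4, "low": 5}
merges = [("l", "o"), ("lo", "w")]

# BPE consumes the string-keyed structures directly; no manual id remapping.
tokenizer = Tokenizer(BPE(vocab, merges, unk_token="<unk>", fuse_unk=True))
print(tokenizer.encode("low").tokens)  # ['low']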
@@ -331,9 +316,7 @@ class PegasusConverter(SpmConverter):
         return TemplateProcessing(
             seq_a=["$0", eos],
             seq_b=["$1", eos],
-            special_tokens=[
-                (eos, tokenizer.get_vocab()[eos]),
-            ],
+            special_tokens=[(eos, tokenizer.get_vocab()[eos]),],
         )
@@ -342,9 +325,7 @@ class T5Converter(SpmConverter):
         return TemplateProcessing(
             seq_a=["$0", "</s>"],
             seq_b=["$1", "</s>"],
-            special_tokens=[
-                ("</s>", tokenizer.get_vocab()["</s>"]),
-            ],
+            special_tokens=[("</s>", tokenizer.get_vocab()["</s>"]),],
         )
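
The two TemplateProcessing hunks above (Pegasus and T5) follow the same pattern: append the EOS token after each sequence and register it as a special token paired with its vocabulary id. A sketch of that pattern, mirroring the seq_a/seq_b call form used in this diff and using a hypothetical id for </s>:

from tokenizers.processors import TemplateProcessing

eos_id = 1  # hypothetical; the converters look it up via tokenizer.get_vocab()["</s>"]

# "$0"/"$1" stand for the first and second input sequence; "</s>" is appended
# after each and declared as a special token together with its id.
post_processor = TemplateProcessing(
    seq_a=["$0", "</s>"],
    seq_b=["$1", "</s>"],
    special_tokens=[("</s>", eos_id)],
)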
@@ -436,9 +417,7 @@ def main():
     model_len = 50
     status_len = 6
     speedup_len = 8
-    print(
-        f"|{'Model':^{model_len}}|{'Status':^{status_len}}|{'Speedup':^{speedup_len}}|"
-    )
+    print(f"|{'Model':^{model_len}}|{'Status':^{status_len}}|{'Speedup':^{speedup_len}}|")
     print(f"|{'-'*model_len}|{'-'*status_len}|{'-'*speedup_len}|")
     for pretrained in args.models:
         status, speedup = check(pretrained, args.filename)
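
The collapsed print relies on nested f-string format specs: {'Model':^{model_len}} centers a literal in a field whose width is itself interpolated from a variable, so the header and separator rows line up. A self-contained illustration:

# Nested format specs: the column width is itself an interpolated variable.
model_len, status_len, speedup_len = 50, 6, 8
print(f"|{'Model':^{model_len}}|{'Status':^{status_len}}|{'Speedup':^{speedup_len}}|")
print(f"|{'-'*model_len}|{'-'*status_len}|{'-'*speedup_len}|")
# 'Model' is centered in a 50-character column; the second row draws the
# markdown-style separator at matching widths.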