mirror of
https://github.com/mii443/tokenizers.git
synced 2025-08-22 16:25:30 +00:00
Removed now-wrong code in convert.py, fixed strange black magic.
This commit is contained in:
@@ -79,24 +79,9 @@ class SpmConverter(Converter):
|
||||
if model_type == 1:
|
||||
tokenizer = Tokenizer(Unigram(vocab, unk_id))
|
||||
elif model_type == 2:
|
||||
vocab, merges = SentencePieceExtractor(
|
||||
self.original_tokenizer.vocab_file
|
||||
).extract()
|
||||
# Open output files and let's extract model information
|
||||
actual_merges = {}
|
||||
for id_merge, (a, b) in enumerate(merges):
|
||||
id_a = vocab[a]
|
||||
id_b = vocab[b]
|
||||
id_ab = vocab[a + b]
|
||||
id_ab = vocab[a + b]
|
||||
actual_merges[(id_a, id_b)] = (id_merge, id_ab)
|
||||
vocab, merges = SentencePieceExtractor(self.original_tokenizer.vocab_file).extract()
|
||||
tokenizer = Tokenizer(
|
||||
BPE(
|
||||
vocab,
|
||||
actual_merges,
|
||||
unk_token=proto.trainer_spec.unk_piece,
|
||||
fuse_unk=True,
|
||||
)
|
||||
BPE(vocab, merges, unk_token=proto.trainer_spec.unk_piece, fuse_unk=True,)
|
||||
)
|
||||
else:
|
||||
raise Exception(
|
||||
@@ -331,9 +316,7 @@ class PegasusConverter(SpmConverter):
|
||||
return TemplateProcessing(
|
||||
seq_a=["$0", eos],
|
||||
seq_b=["$1", eos],
|
||||
special_tokens=[
|
||||
(eos, tokenizer.get_vocab()[eos]),
|
||||
],
|
||||
special_tokens=[(eos, tokenizer.get_vocab()[eos]),],
|
||||
)
|
||||
|
||||
|
||||
@@ -342,9 +325,7 @@ class T5Converter(SpmConverter):
|
||||
return TemplateProcessing(
|
||||
seq_a=["$0", "</s>"],
|
||||
seq_b=["$1", "</s>"],
|
||||
special_tokens=[
|
||||
("</s>", tokenizer.get_vocab()["</s>"]),
|
||||
],
|
||||
special_tokens=[("</s>", tokenizer.get_vocab()["</s>"]),],
|
||||
)
|
||||
|
||||
|
||||
@@ -436,9 +417,7 @@ def main():
|
||||
model_len = 50
|
||||
status_len = 6
|
||||
speedup_len = 8
|
||||
print(
|
||||
f"|{'Model':^{model_len}}|{'Status':^{status_len}}|{'Speedup':^{speedup_len}}|"
|
||||
)
|
||||
print(f"|{'Model':^{model_len}}|{'Status':^{status_len}}|{'Speedup':^{speedup_len}}|")
|
||||
print(f"|{'-'*model_len}|{'-'*status_len}|{'-'*speedup_len}|")
|
||||
for pretrained in args.models:
|
||||
status, speedup = check(pretrained, args.filename)
|
||||
|
Reference in New Issue
Block a user