Python - Update example
@@ -6,7 +6,7 @@ import logging
 logging.getLogger('transformers').disabled = True
 logging.getLogger('transformers.tokenization_utils').disabled = True
 
-from tokenizers import Tokenizer, models, pre_tokenizers, decoders
+from tokenizers import Tokenizer, models, pre_tokenizers, decoders, processors
 from transformers import GPT2Tokenizer, BertTokenizer
 
 parser = argparse.ArgumentParser()
@@ -64,6 +64,10 @@ elif args.type == "bert":
     tok_r = Tokenizer(models.WordPiece.from_files(args.vocab, unk_token="[UNK]", max_input_chars_per_word=100))
     tok_r.with_pre_tokenizer(pre_tokenizers.BasicPreTokenizer.new(do_lower_case=True, tokenize_chinese_chars=True, never_split=[]))
     tok_r.with_decoder(decoders.WordPiece.new())
+    tok_r.with_post_processor(processors.BertProcessing.new(
+        ("[SEP]", tok_r.token_to_id("[SEP]")),
+        ("[CLS]", tok_r.token_to_id("[CLS]")),
+    ))
 else:
     raise Exception(f"Unknown type {args.type}")
 
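Note that the `.with_*` / `.new(...)` calls above come from an early pre-release of the Python bindings. For orientation only, a minimal sketch of the same BERT-style setup against a recent `tokenizers` release (the `vocab.txt` path is a placeholder, not a file from this repository) might look like this:

from tokenizers import Tokenizer, decoders, normalizers, pre_tokenizers, processors
from tokenizers.models import WordPiece

# Build a WordPiece tokenizer from a vocab file (placeholder path).
tok = Tokenizer(WordPiece.from_file("vocab.txt", unk_token="[UNK]"))
tok.normalizer = normalizers.BertNormalizer(lowercase=True)
tok.pre_tokenizer = pre_tokenizers.BertPreTokenizer()
tok.decoder = decoders.WordPiece()

# Post-processor wrapping each sequence as [CLS] ... [SEP],
# the same role the added with_post_processor call plays in the diff.
tok.post_processor = processors.BertProcessing(
    ("[SEP]", tok.token_to_id("[SEP]")),
    ("[CLS]", tok.token_to_id("[CLS]")),
)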
@@ -97,8 +101,10 @@ for i in range(0, len(encoded_r)):
     if encoded_r[i].ids != encoded_p[i]:
         diff_ids += 1
         if args.debug:
-            print("".join([ token.value for token in encoded_r[i] ]))
-            print("".join(tok_p.tokenize(text[i])))
+            print(encoded_r[i].ids)
+            print(encoded_p[i])
+            print(encoded_r[i].tokens)
+            print(tok_p.tokenize(text[i]))
             print(text[i])
             print("")
 print(f"Ids differences: {diff_ids}")
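The updated debug block prints the raw ids and token strings from both libraries instead of joined strings. A hedged sketch of that comparison loop against the current APIs (assuming `tok` is the tokenizer built above, `texts` is a list of input strings, and `bert-base-uncased` is only an illustrative checkpoint):

from transformers import BertTokenizer

tok_p = BertTokenizer.from_pretrained("bert-base-uncased")  # illustrative checkpoint

encoded_r = tok.encode_batch(texts)            # tokenizers: list of Encoding objects
encoded_p = [tok_p.encode(t) for t in texts]   # transformers: list of id lists

diff_ids = 0
for enc_r, ids_p, text in zip(encoded_r, encoded_p, texts):
    if enc_r.ids != ids_p:
        diff_ids += 1
        print(enc_r.ids)           # ids from the Rust-backed tokenizer
        print(ids_p)               # ids from the Python tokenizer
        print(enc_r.tokens)        # token strings, useful to spot where they diverge
        print(tok_p.tokenize(text))
        print(text)
        print("")
print(f"Ids differences: {diff_ids}")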