Python - Update examples to use new models API

Bjarte Johansen
2020-04-06 21:18:28 +02:00
parent 823066fea9
commit fab97475e5
2 changed files with 4 additions and 6 deletions
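
The change is mechanical: the from_files class methods on the tokenizer model classes are gone, and the vocab/merges paths are passed straight to the model constructors. A minimal, hedged sketch of the migration for the BPE case (the vocab.json / merges.txt file names and the final encode call are illustrative placeholders, not part of this commit):

from tokenizers import Tokenizer, models

# Old API removed by this commit:
#   bpe = models.BPE.from_files("vocab.json", "merges.txt")
# New API: pass the paths directly to the constructor.
bpe = models.BPE("vocab.json", "merges.txt")
tokenizer = Tokenizer(bpe)

# Illustrative usage with any matching vocab/merges pair.
print(tokenizer.encode("Hello, world!").tokens)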

View File

@@ -41,8 +41,8 @@ def tokenize(sentence):
 # Create a Tokenizer using a BPE model
-bpe = models.BPE.from_files(args.vocab, args.merges)
-tokenizer = Tokenizer(models.BPE.from_files(args.vocab, args.merges))
+bpe = models.BPE(args.vocab, args.merges)
+tokenizer = Tokenizer(bpe)
 # Test the good custom classes
 good_custom = GoodCustom()

View File

@@ -59,7 +59,7 @@ if args.type == "gpt2":
     tok_p = GPT2Tokenizer.from_pretrained("gpt2")
     # Create a Tokenizer using BPE
-    tok_r = Tokenizer(BPE.from_files(args.vocab, args.merges))
+    tok_r = Tokenizer(BPE(args.vocab, args.merges))
     # Use ByteLevel PreTokenizer
     tok_r.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
     # Use ByteLevel Decoder
@@ -68,9 +68,7 @@ elif args.type == "bert":
print("Running Bert tokenizer")
tok_p = BertTokenizer.from_pretrained(args.vocab)
tok_r = Tokenizer(
WordPiece.from_files(args.vocab, unk_token="[UNK]", max_input_chars_per_word=100)
)
tok_r = Tokenizer(WordPiece(args.vocab, unk_token="[UNK]", max_input_chars_per_word=100))
tok_r.normalizer = BertNormalizer(
clean_text=True, handle_chinese_chars=True, strip_accents=True, lowercase=True,
)
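
The BERT branch follows the same pattern for WordPiece: the vocab path moves from the removed from_files helper into the constructor, which also lets the call fit on a single line. A sketch of the updated construction, assuming a placeholder vocab.txt file; the keyword arguments and normalizer settings are taken from the diff above:

from tokenizers import Tokenizer
from tokenizers.models import WordPiece
from tokenizers.normalizers import BertNormalizer

# Old API removed by this commit:
#   WordPiece.from_files("vocab.txt", unk_token="[UNK]", max_input_chars_per_word=100)
# New API: the vocab path is the first constructor argument.
tok_r = Tokenizer(WordPiece("vocab.txt", unk_token="[UNK]", max_input_chars_per_word=100))
tok_r.normalizer = BertNormalizer(
    clean_text=True, handle_chinese_chars=True, strip_accents=True, lowercase=True,
)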