Python - More updates to the new API

This commit is contained in:
Anthony MOI
2020-02-10 11:57:30 -05:00
parent 505c428f72
commit 8585b761d1
2 changed files with 6 additions and 6 deletions

View File

@@ -132,8 +132,8 @@ bpe = models.BPE.from_files(vocab, merges)
 tokenizer = Tokenizer(bpe)
 # Customize pre-tokenization and decoding
-tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel.new(add_prefix_space=True)
-tokenizer.decoder = decoders.ByteLevel.new()
+tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=True)
+tokenizer.decoder = decoders.ByteLevel()
 # And then encode:
 encoded = tokenizer.encode("I can feel the magic, can you?")
@@ -157,11 +157,11 @@ from tokenizers import Tokenizer, models, pre_tokenizers, decoders, trainers
 tokenizer = Tokenizer(models.BPE.empty())
 # Customize pre-tokenization and decoding
-tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel.new(add_prefix_space=True)
-tokenizer.decoder = decoders.ByteLevel.new()
+tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=True)
+tokenizer.decoder = decoders.ByteLevel()
 # And then train
-trainer = trainers.BpeTrainer.new(vocab_size=20000, min_frequency=2)
+trainer = trainers.BpeTrainer(vocab_size=20000, min_frequency=2)
 tokenizer.train(trainer, [
     "./path/to/dataset/1.txt",
     "./path/to/dataset/2.txt",

View File

@@ -76,7 +76,7 @@ elif args.type == "bert":
     strip_accents=True,
     lowercase=True,
 )
-# tok_r.pre_tokenizer = pre_tokenizers.Whitespace.new()
+# tok_r.pre_tokenizer = pre_tokenizers.Whitespace()
 tok_r.pre_tokenizer = pre_tokenizers.BertPreTokenizer()
 tok_r.decoder = decoders.WordPiece()
 tok_r.post_processor = BertProcessing(