Python - Make the trainer optional on Tokenizer.train

This commit is contained in:
Anthony MOI
2020-10-07 21:25:32 -04:00
committed by Anthony MOI
parent c230183cf6
commit 224862fe0c
7 changed files with 15 additions and 12 deletions

View File

@@ -138,11 +138,11 @@ tokenizer.post_processor = processors.ByteLevel(trim_offsets=True)
 # And then train
 trainer = trainers.BpeTrainer(vocab_size=20000, min_frequency=2)
-tokenizer.train(trainer, [
+tokenizer.train([
 "./path/to/dataset/1.txt",
 "./path/to/dataset/2.txt",
 "./path/to/dataset/3.txt"
-])
+], trainer=trainer)
 # And Save it
 tokenizer.save("byte-level-bpe.tokenizer.json", pretty=True)