mirror of
https://github.com/mii443/tokenizers.git
synced 2025-08-23 16:49:27 +00:00
Python - Add bert wordpiece training example
This commit is contained in:
@@ -40,7 +40,7 @@ trainer = trainers.BpeTrainer.new(
     vocab_size=50000,
     min_frequency=2,
     show_progress=True,
-    special_tokens=[ "<s>", "<pad>", "</s" ],
+    special_tokens=[ "<s>", "<pad>", "</s>" ],
     initial_alphabet=pre_tokenizers.ByteLevel.alphabet()
 )
 tokenizer.train(trainer, files)
Reference in New Issue
Block a user