Fixing missing direction in TruncationParams. (#868)

This commit is contained in:
Nicolas Patry
2022-01-04 14:21:46 +01:00
committed by GitHub
parent 7069988ffe
commit 4122a33f09
2 changed files with 23 additions and 14 deletions

View File

@@ -1,12 +0,0 @@
from tokenizers import ByteLevelBPETokenizer
from tokenizers import pre_tokenizers, models, Tokenizer, trainers
# Configure the Unigram trainer first; it does not depend on the tokenizer object.
# NOTE(review): a vocab_size of 400 million looks unusually large — confirm it is intended.
# NOTE(review): "mask" lacks the angle brackets used by the other special tokens —
# confirm whether "<mask>" was meant.
trainer = trainers.UnigramTrainer(
    special_tokens=["<s>", "<pad>", "</s>", "<unk>", "mask"],
    show_progress=True,
    vocab_size=400_000_000,
)

# Tokenizer wrapping a default-constructed Unigram model, with a
# WhitespaceSplit pre-tokenizer attached.
tokenizer = Tokenizer(models.Unigram())
tokenizer.pre_tokenizer = pre_tokenizers.WhitespaceSplit()

# Train the tokenizer on the single corpus file using the trainer configured above.
tokenizer.train(["data/big.txt"], trainer)