Mirror of https://github.com/mii443/tokenizers.git, synced 2025-08-22 16:25:30 +00:00.
Commit: "Fixing missing direction in TruncationParams." (#868)
This commit is contained in:
@ -1,12 +0,0 @@
|
||||
"""Train a Unigram tokenizer on data/big.txt using whitespace pre-tokenization."""
from tokenizers import ByteLevelBPETokenizer  # NOTE(review): unused here; kept from original file
from tokenizers import pre_tokenizers, models, Tokenizer, trainers

# Special tokens reserved ahead of the learned vocabulary pieces.
SPECIAL_TOKENS = ["<s>", "<pad>", "</s>", "<unk>", "mask"]

# A Unigram-model tokenizer that splits raw input on whitespace before training.
tokenizer = Tokenizer(models.Unigram())
tokenizer.pre_tokenizer = pre_tokenizers.WhitespaceSplit()

# NOTE(review): a vocab_size of 400,000,000 is far beyond any realistic
# vocabulary -- presumably a deliberate stress/benchmark setting; confirm
# before reusing this script for real training.
trainer = trainers.UnigramTrainer(
    vocab_size=400000000,
    show_progress=True,
    special_tokens=SPECIAL_TOKENS,
)

# Train in place from the single corpus file.
tokenizer.train(["data/big.txt"], trainer)
|
Reference in New Issue
Block a user