Mirror of https://github.com/mii443/tokenizers.git (synced 2025-08-22 16:25:30 +00:00)
* Add truncation to enable_truncation
* Fix typo
* Adding truncation_side within `TruncationParams`.
* Node serialization of this direction param.
* Update the test.
* Fixing warnings/lint.
* Adding stuff (can't local debug :( )
* Slow loop... ;(
* Stub.py.

Co-authored-by: Niels Rogge <niels.rogge1@gmail.com>
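The PR above exposes the truncation side as a direction parameter on truncation. As a rough sketch of how this could be used from the Python bindings, assuming the option is exposed as `direction` on `enable_truncation` with values "left" or "right" (this is an illustration, not the exact code from the PR):

from tokenizers import Tokenizer

# Sketch only: load any serialized tokenizer and request left-sided truncation.
# `direction` is assumed to accept "left" or "right" (default "right").
tokenizer = Tokenizer.from_pretrained("bert-base-uncased")
tokenizer.enable_truncation(max_length=8, direction="left")

encoding = tokenizer.encode("a fairly long sentence that will be cut from the left side")
print(encoding.tokens)  # with direction="left", only the trailing tokens are kept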
13 lines · 475 B · Python
from tokenizers import ByteLevelBPETokenizer  # imported but unused in this script
from tokenizers import pre_tokenizers, models, Tokenizer, trainers

# Build a Unigram tokenizer that splits on whitespace before training.
tokenizer = Tokenizer(models.Unigram())
tokenizer.pre_tokenizer = pre_tokenizers.WhitespaceSplit()

# Trainer configuration: the (deliberately huge) vocab_size and the special
# tokens are taken from the original script.
trainer = trainers.UnigramTrainer(
    vocab_size=400000000,
    show_progress=True,
    special_tokens=["<s>", "<pad>", "</s>", "<unk>", "mask"],
)

# Train from a single plain-text corpus file.
tokenizer.train(["data/big.txt"], trainer)
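Not part of the original script, but as a usage sketch: once training finishes, the tokenizer can be serialized and exercised with the standard `Tokenizer.save` and `Tokenizer.encode` methods (the output filename below is hypothetical):

# Hypothetical follow-up: persist the trained tokenizer and run a quick sanity check.
tokenizer.save("unigram-big.json")

encoding = tokenizer.encode("The quick brown fox")
print(encoding.tokens)
print(encoding.ids)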