from tokenizers import Tokenizer, models, pre_tokenizers, trainers
# Build a Unigram tokenizer and split the raw text on whitespace before training.
tokenizer = Tokenizer(models.Unigram())
tokenizer.pre_tokenizer = pre_tokenizers.WhitespaceSplit()
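# Optional sanity check (illustration only): the whitespace pre-tokenizer just
# breaks the text on spaces; the Unigram model later segments each piece.
print(pre_tokenizers.WhitespaceSplit().pre_tokenize_str("Hello world!"))
# -> [('Hello', (0, 5)), ('world!', (6, 12))]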
# Configure the Unigram trainer. The vocabulary size and special tokens below
# are typical example values; adjust them for your corpus.
trainer = trainers.UnigramTrainer(
    vocab_size=8000,
    show_progress=True,
    special_tokens=["<unk>", "<s>", "</s>", "<pad>", "<mask>"],
    unk_token="<unk>",
)
# Train from the raw text file(s); paths are relative to the working directory.
tokenizer.train(["data/big.txt"], trainer)
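# After training, the tokenizer can be tried out and persisted. A minimal
# follow-up sketch; the exact tokens and ids depend on the training corpus.
encoding = tokenizer.encode("Hello world!")
print(encoding.tokens)
print(encoding.ids)

# Save everything (model, pre-tokenizer, special tokens) to one JSON file,
# which can later be reloaded with Tokenizer.from_file("unigram.json").
tokenizer.save("unigram.json")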