Mirror of https://github.com/mii443/tokenizers.git (synced 2025-08-23 16:49:27 +00:00)
Add example for training with datasets
bindings/python/examples/train_with_datasets.py (new file, 19 lines)
@@ -0,0 +1,19 @@
import datasets
from tokenizers import normalizers, pre_tokenizers, Tokenizer, models, trainers

# Build a tokenizer
bpe_tokenizer = Tokenizer(models.BPE())
bpe_tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
bpe_tokenizer.normalizer = normalizers.Lowercase()

# Initialize a dataset
dataset = datasets.load_dataset("wikitext", "wikitext-103-raw-v1")

# Build an iterator over this dataset
def batch_iterator():
    batch_length = 1000
    for i in range(0, len(dataset["train"]), batch_length):
        yield dataset["train"][i : i + batch_length]["text"]

# And finally train
bpe_tokenizer.train_from_iterator(batch_iterator(), length=len(dataset["train"]))
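For readers who want to go a step further, here is a minimal follow-up sketch (not part of the committed example) showing why `trainers` is imported above: you can pass an explicit `trainers.BpeTrainer` to customize training instead of relying on the default, then encode a sample sentence and save the tokenizer. The `vocab_size`, `special_tokens`, sample text, and output filename below are illustrative assumptions, not values from the original commit.

# Follow-up sketch (assumed values, not part of the committed example):
# customize training with an explicit trainer, then use and persist the result.
trainer = trainers.BpeTrainer(vocab_size=25000, special_tokens=["[UNK]"])
bpe_tokenizer.train_from_iterator(batch_iterator(), trainer=trainer, length=len(dataset["train"]))

# Encode a sample sentence and save the trained tokenizer to disk.
encoding = bpe_tokenizer.encode("Training tokenizers from datasets is easy.")
print(encoding.tokens)
bpe_tokenizer.save("bpe-wikitext.json")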