Mirror of https://github.com/mii443/tokenizers.git, synced 2025-08-23 00:35:35 +00:00
* Updating python formatting. * Forgot gh action. * Skipping isort to prevent circular imports. * Updating stub. * Removing `isort` (it contradicts `stub.py`). * Fixing weird stub black/isort disagreement.
23 lines · 669 B · Python
import datasets

from tokenizers import Tokenizer, models, normalizers, pre_tokenizers, trainers

# Build a tokenizer
bpe_tokenizer = Tokenizer(models.BPE())
bpe_tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
bpe_tokenizer.normalizer = normalizers.Lowercase()
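
# Optionally, a trainer can be configured to control vocabulary size and special
# tokens, then passed to `train_from_iterator` further below. A minimal sketch
# with illustrative values (the example itself trains with the defaults):
#
#     trainer = trainers.BpeTrainer(vocab_size=30000, special_tokens=["[UNK]"])
#     bpe_tokenizer.train_from_iterator(
#         batch_iterator(), trainer=trainer, length=len(dataset["train"])
#     )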

# Initialize a dataset
dataset = datasets.load_dataset("wikitext", "wikitext-103-raw-v1")
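
# The loaded DatasetDict has "train", "validation" and "test" splits, and each
# row is a dict with a single "text" field; the iterator below reads that column.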

# Build an iterator over this dataset
def batch_iterator():
    batch_length = 1000
    for i in range(0, len(dataset["train"]), batch_length):
        yield dataset["train"][i : i + batch_length]["text"]

# And finally train; `length` is only used so the trainer can report meaningful
# progress over the split, it does not change the resulting vocabulary.
bpe_tokenizer.train_from_iterator(batch_iterator(), length=len(dataset["train"]))
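
# A minimal usage sketch (hypothetical sentence and output path; the exact
# tokens depend on the vocabulary learned above):
encoding = bpe_tokenizer.encode("hello world, this is a test")
print(encoding.tokens)
bpe_tokenizer.save("bpe-wikitext.json")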