diff --git a/bindings/python/examples/train_with_datasets.py b/bindings/python/examples/train_with_datasets.py
index 7f95ccd2..7c316834 100644
--- a/bindings/python/examples/train_with_datasets.py
+++ b/bindings/python/examples/train_with_datasets.py
@@ -9,15 +9,15 @@ bpe_tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
 bpe_tokenizer.normalizer = normalizers.Lowercase()

 # Initialize a dataset
-dataset = datasets.load_dataset("wikitext", "wikitext-103-raw-v1")
+dataset = datasets.load_dataset("wikitext", "wikitext-103-raw-v1", split="train")


 # Build an iterator over this dataset
 def batch_iterator():
-    batch_length = 1000
-    for i in range(0, len(dataset["train"]), batch_length):
-        yield dataset["train"][i : i + batch_length]["text"]
+    batch_size = 1000
+    for batch in dataset.iter(batch_size=batch_size):
+        yield batch["text"]


 # And finally train
-bpe_tokenizer.train_from_iterator(batch_iterator(), length=len(dataset["train"]))
+bpe_tokenizer.train_from_iterator(batch_iterator(), length=len(dataset))
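
For reference, datasets.Dataset.iter(batch_size=...) yields each batch as a dict of columns, so batch["text"] is a list of strings, which is the shape train_from_iterator expects. A minimal sketch of that behavior, using a hypothetical toy dataset rather than wikitext:

    import datasets

    # Hypothetical toy dataset, just to illustrate the batching behavior
    toy = datasets.Dataset.from_dict({"text": [f"line {i}" for i in range(5)]})
    for batch in toy.iter(batch_size=2):
        # Each batch is a dict of columns, e.g. {"text": ["line 0", "line 1"]}
        print(batch["text"])

Unlike the old manual slicing over dataset["train"], this form avoids indexing the dataset by hand and leaves batching to the datasets library.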