Faster datasets train example

Using .iter() is much faster than accessing rows by id
Author: Quentin Lhoest
Date: 2023-03-23 11:24:30 +01:00
Committed by: GitHub
Parent: b8fbea00a9
Commit: e76f900bc0


@@ -9,15 +9,15 @@ bpe_tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
 bpe_tokenizer.normalizer = normalizers.Lowercase()
 # Initialize a dataset
-dataset = datasets.load_dataset("wikitext", "wikitext-103-raw-v1")
+dataset = datasets.load_dataset("wikitext", "wikitext-103-raw-v1", split="train")
 # Build an iterator over this dataset
 def batch_iterator():
-    batch_length = 1000
-    for i in range(0, len(dataset["train"]), batch_length):
-        yield dataset["train"][i : i + batch_length]["text"]
+    batch_size = 1000
+    for batch in dataset.iter(batch_size=batch_size):
+        yield batch["text"]
 # And finally train
-bpe_tokenizer.train_from_iterator(batch_iterator(), length=len(dataset["train"]))
+bpe_tokenizer.train_from_iterator(batch_iterator(), length=len(dataset))
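For reference, a minimal self-contained version of the updated example might look like the sketch below. The diff only shows the file from line 9 onward, so the imports and the Tokenizer(models.BPE()) construction are assumptions inferred from the identifiers used in the snippet, not part of this commit.

# Runnable sketch of the updated example. The imports and the BPE model
# construction are assumed; everything from the normalizer down mirrors
# the new side of the diff.
from tokenizers import Tokenizer, models, normalizers, pre_tokenizers
import datasets

bpe_tokenizer = Tokenizer(models.BPE())
bpe_tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
bpe_tokenizer.normalizer = normalizers.Lowercase()

# Initialize a dataset (loading the train split directly removes the
# need for dataset["train"] indexing later)
dataset = datasets.load_dataset("wikitext", "wikitext-103-raw-v1", split="train")

# Build an iterator over this dataset: .iter() streams contiguous batches,
# avoiding the repeated row-id slicing of dataset[i : i + batch_size]
def batch_iterator():
    batch_size = 1000
    for batch in dataset.iter(batch_size=batch_size):
        yield batch["text"]

# And finally train; length is used for progress reporting
bpe_tokenizer.train_from_iterator(batch_iterator(), length=len(dataset))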