Faster HF dataset iteration in docs (#1414)
* Faster HF dataset iteration in docs

* Nit
@@ -70,8 +70,10 @@ class TestTrainFromIterators:
         # START def_batch_iterator
         def batch_iterator(batch_size=1000):
-            for i in range(0, len(dataset), batch_size):
-                yield dataset[i : i + batch_size]["text"]
+            # Only keep the text column to avoid decoding the rest of the columns unnecessarily
+            tok_dataset = dataset.select_columns("text")
+            for batch in tok_dataset.iter(batch_size):
+                yield batch["text"]
 
         # END def_batch_iterator
 
 
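For context, here is a minimal self-contained sketch of how the revised batch_iterator is used to train a tokenizer from an iterator. The dataset name, vocabulary size, and trainer settings are illustrative assumptions, not part of this commit; `Dataset.select_columns` and `Dataset.iter` are the 🤗 Datasets APIs the new code relies on.

from datasets import load_dataset
from tokenizers import Tokenizer, models, trainers

# Any dataset with a "text" column works; wikitext is an assumption here.
dataset = load_dataset("wikitext", "wikitext-103-raw-v1", split="train")

def batch_iterator(batch_size=1000):
    # Only keep the text column to avoid decoding the rest of the columns unnecessarily
    tok_dataset = dataset.select_columns("text")
    for batch in tok_dataset.iter(batch_size):
        yield batch["text"]

tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))
trainer = trainers.BpeTrainer(vocab_size=25000, special_tokens=["[UNK]"])
# length only lets the progress bar report accurately; it does not affect training.
tokenizer.train_from_iterator(batch_iterator(), trainer=trainer, length=len(dataset))

Compared with the removed slicing loop, which decoded every column of each slice, iterating over only the selected "text" column avoids that unnecessary decoding work, which is what makes the iteration faster.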