mirror of https://github.com/mii443/tokenizers.git
synced 2025-08-22 16:25:30 +00:00
Doc - Add documentation for training from iterators
.github/workflows/python.yml (vendored, 2 changed lines)

@@ -100,7 +100,7 @@ jobs:
        run: |
          python -m venv .env
          source .env/bin/activate
-         pip install pytest requests setuptools_rust numpy
+         pip install pytest requests setuptools_rust numpy datasets
          python setup.py develop

      - name: Check style
bindings/python/tests/documentation/test_tutorial_train_from_iterators.py (new file, 101 lines)

@@ -0,0 +1,101 @@
from ..utils import data_dir, train_files

import os
import pytest
import datasets
import gzip


class TestTrainFromIterators:
    @staticmethod
    def get_tokenizer_trainer():
        # START init_tokenizer_trainer
        from tokenizers import Tokenizer, models, normalizers, pre_tokenizers, decoders, trainers

        tokenizer = Tokenizer(models.Unigram())
        tokenizer.normalizer = normalizers.NFKC()
        tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
        tokenizer.decoder = decoders.ByteLevel()

        trainer = trainers.UnigramTrainer(
            vocab_size=20000,
            initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
            special_tokens=["<PAD>", "<BOS>", "<EOS>"],
        )
        # END init_tokenizer_trainer
        trainer.show_progress = False

        return tokenizer, trainer

    @staticmethod
    def load_dummy_dataset():
        # Only used for the documentation snippet; the tests below load a truncated split.
        # START load_dataset
        import datasets

        dataset = datasets.load_dataset(
            "wikitext", "wikitext-103-raw-v1", split="train+test+validation"
        )
        # END load_dataset

    @pytest.fixture(scope="class")
    def setup_gzip_files(self, train_files):
        # Write three gzip copies of the small training file for the gzip examples.
        with open(train_files["small"], "rt") as small:
            for n in range(3):
                path = f"data/my-file.{n}.gz"
                with gzip.open(path, "wt") as f:
                    f.write(small.read())

    def test_train_basic(self):
        tokenizer, trainer = self.get_tokenizer_trainer()

        # START train_basic
        # First few lines of the "Zen of Python" https://www.python.org/dev/peps/pep-0020/
        data = [
            "Beautiful is better than ugly.",
            "Explicit is better than implicit.",
            "Simple is better than complex.",
            "Complex is better than complicated.",
            "Flat is better than nested.",
            "Sparse is better than dense.",
            "Readability counts.",
        ]
        tokenizer.train_from_iterator(data, trainer=trainer)
        # END train_basic

    def test_datasets(self):
        tokenizer, trainer = self.get_tokenizer_trainer()

        # In order to keep tests fast, we only use the first 100 examples
        os.environ["TOKENIZERS_PARALLELISM"] = "true"
        dataset = datasets.load_dataset("wikitext", "wikitext-103-raw-v1", split="train[0:100]")

        # START def_batch_iterator
        def batch_iterator(batch_size=1000):
            for i in range(0, len(dataset), batch_size):
                yield dataset[i : i + batch_size]["text"]

        # END def_batch_iterator

        # START train_datasets
        tokenizer.train_from_iterator(batch_iterator(), trainer=trainer, length=len(dataset))
        # END train_datasets

    def test_gzip(self, setup_gzip_files):
        tokenizer, trainer = self.get_tokenizer_trainer()

        # START single_gzip
        import gzip

        with gzip.open("data/my-file.0.gz", "rt") as f:
            tokenizer.train_from_iterator(f, trainer=trainer)
        # END single_gzip

        # START multi_gzip
        files = ["data/my-file.0.gz", "data/my-file.1.gz", "data/my-file.2.gz"]

        def gzip_iterator():
            for path in files:
                with gzip.open(path, "rt") as f:
                    for line in f:
                        yield line

        tokenizer.train_from_iterator(gzip_iterator(), trainer=trainer)
        # END multi_gzip
docs/source/_ext/toctree_tags.py (new file, 29 lines)

@@ -0,0 +1,29 @@
import re

from sphinx.directives.other import TocTree


class TocTreeTags(TocTree):
    hasPat = re.compile(r"^\s*:(.+):(.+)$")

    def filter_entries(self, entries):
        filtered = []
        for e in entries:
            m = self.hasPat.match(e)
            if m is not None:
                # A `:tag:entry` line is kept (minus its prefix) only when the tag is set.
                if self.env.app.tags.has(m.groups()[0]):
                    filtered.append(m.groups()[1])
            else:
                filtered.append(e)
        return filtered

    def run(self):
        self.content = self.filter_entries(self.content)
        return super().run()


def setup(app):
    app.add_directive("toctree-tags", TocTreeTags)

    return {
        "version": "0.1",
    }
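
To make the new directive concrete, here is a small standalone sketch (not part of the commit) of the filtering it performs, with a plain set standing in for Sphinx's tag registry; in a real build, tags are enabled on the command line, e.g. sphinx-build -t python:

import re

has_pat = re.compile(r"^\s*:(.+):(.+)$")
active_tags = {"python"}  # stand-in for app.tags

entries = [":python:tutorials/python/*", ":rust:tutorials/rust/*", "quicktour"]
filtered = []
for entry in entries:
    m = has_pat.match(entry)
    if m is not None:
        # A tagged entry is kept (minus its prefix) only when its tag is active.
        if m.group(1) in active_tags:
            filtered.append(m.group(2))
    else:
        filtered.append(entry)

print(filtered)  # ['tutorials/python/*', 'quicktour']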
docs/source/conf.py

@@ -39,7 +39,7 @@ rust_version = "latest"
 # Add any Sphinx extension module names here, as strings. They can be
 # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
 # ones.
-extensions = ["sphinx.ext.autodoc", "sphinx.ext.napoleon", "entities", "rust_doc"]
+extensions = ["sphinx.ext.autodoc", "sphinx.ext.napoleon", "entities", "rust_doc", "toctree_tags"]
 
 # Add any paths that contain templates here, relative to this directory.
 templates_path = ["_templates"]

@@ -49,7 +49,6 @@ templates_path = ["_templates"]
 # This pattern also affects html_static_path and html_extra_path.
 exclude_patterns = []
 
-
 # -- Options for HTML output -------------------------------------------------
 
 # The theme to use for HTML and HTML Help pages. See the documentation for

@@ -70,6 +69,10 @@ html_static_path = ["_static"]
 
 
 def setup(app):
+    for language in languages:
+        if not tags.has(language):
+            exclude_patterns.append(f"tutorials/{language}/*")
+
     app.add_css_file("css/huggingface.css")
     app.add_css_file("css/code-snippets.css")
     app.add_js_file("js/custom.js")
docs/source/index.rst

@@ -32,6 +32,13 @@ Main features:
     pipeline
     components
 
+.. toctree-tags::
+    :maxdepth: 3
+    :caption: Using 🤗 Tokenizers
+    :glob:
+
+    :python:tutorials/python/*
+
 .. toctree::
     :maxdepth: 3
     :caption: API Reference
docs/source/tutorials/python/training_from_memory.rst (new file, 95 lines)

@@ -0,0 +1,95 @@
Training from memory
----------------------------------------------------------------------------------------------------

In the `Quicktour <quicktour.html>`__, we saw how to build and train a tokenizer using text files,
but we can actually use any Python Iterator. In this section we'll see a few different ways of
training our tokenizer.

For all the examples listed below, we'll use the same :class:`~tokenizers.Tokenizer` and
:class:`~tokenizers.trainers.Trainer`, built as follows:

.. literalinclude:: ../../../../bindings/python/tests/documentation/test_tutorial_train_from_iterators.py
    :language: python
    :start-after: START init_tokenizer_trainer
    :end-before: END init_tokenizer_trainer
    :dedent: 8

This tokenizer is based on the :class:`~tokenizers.models.Unigram` model. It takes care of
normalizing the input using the NFKC Unicode normalization method, and uses a
:class:`~tokenizers.pre_tokenizers.ByteLevel` pre-tokenizer with the corresponding decoder.

For more information on the components used here, you can check `here <components.html>`__.

The most basic way
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

As you probably guessed already, the easiest way to train our tokenizer is by using a :obj:`List`:

.. literalinclude:: ../../../../bindings/python/tests/documentation/test_tutorial_train_from_iterators.py
    :language: python
    :start-after: START train_basic
    :end-before: END train_basic
    :dedent: 8

Easy, right? You can use anything that works as an iterator here, be it a :obj:`List`, a :obj:`Tuple`,
or a :obj:`np.ndarray`. Anything works as long as it provides strings.
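
For instance, here is a small sketch (not taken from the test file above) that trains from a
generator expression, reusing the ``tokenizer`` and ``trainer`` built at the top of this page:

.. code-block:: python

    # Any iterable of strings is accepted, e.g. a generator expression
    # over the values of an in-memory corpus.
    corpus = {
        "doc1": "Beautiful is better than ugly.",
        "doc2": "Explicit is better than implicit.",
    }
    tokenizer.train_from_iterator((text for text in corpus.values()), trainer=trainer)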

Using the 🤗 Datasets library
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

An awesome way to access one of the many datasets that exist out there is by using the 🤗 Datasets
library. For more information about it, you should check
`the official documentation here <https://huggingface.co/docs/datasets/>`__.

Let's start by loading our dataset:

.. literalinclude:: ../../../../bindings/python/tests/documentation/test_tutorial_train_from_iterators.py
    :language: python
    :start-after: START load_dataset
    :end-before: END load_dataset
    :dedent: 8

The next step is to build an iterator over this dataset. The easiest way to do this is probably by
using a generator:

.. literalinclude:: ../../../../bindings/python/tests/documentation/test_tutorial_train_from_iterators.py
    :language: python
    :start-after: START def_batch_iterator
    :end-before: END def_batch_iterator
    :dedent: 8

As you can see here, for improved efficiency we can actually provide a batch of examples used
to train, instead of iterating over them one by one. By doing so, we can expect performance very
similar to what we got while training directly from files.

With our iterator ready, we just need to launch the training. In order to improve the look of our
progress bars, we can specify the total length of the dataset:

.. literalinclude:: ../../../../bindings/python/tests/documentation/test_tutorial_train_from_iterators.py
    :language: python
    :start-after: START train_datasets
    :end-before: END train_datasets
    :dedent: 8

And that's it!
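
If downloading the full dataset up front is not an option, a similar pattern should work with a
streamed dataset. Here is a sketch, assuming a version of 🤗 Datasets that supports
``streaming=True`` and reusing the same ``tokenizer`` and ``trainer`` as above; a streamed dataset
has no length, so we simply omit the ``length`` argument:

.. code-block:: python

    import datasets

    streamed = datasets.load_dataset(
        "wikitext", "wikitext-103-raw-v1", split="train", streaming=True
    )

    def streamed_iterator():
        # Yield the raw text of each example, one at a time.
        for example in streamed:
            yield example["text"]

    tokenizer.train_from_iterator(streamed_iterator(), trainer=trainer)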

Using gzip files
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Since gzip files opened in text mode can be used as iterators in Python, it is extremely simple to
train on such files:

.. literalinclude:: ../../../../bindings/python/tests/documentation/test_tutorial_train_from_iterators.py
    :language: python
    :start-after: START single_gzip
    :end-before: END single_gzip
    :dedent: 8

Now if we wanted to train from multiple gzip files, it wouldn't be much harder:

.. literalinclude:: ../../../../bindings/python/tests/documentation/test_tutorial_train_from_iterators.py
    :language: python
    :start-after: START multi_gzip
    :end-before: END multi_gzip
    :dedent: 8

And voilà!
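
The same generator pattern extends to any mix of compressed and plain-text files. Here is a sketch
(with hypothetical file names) that picks an opener per path, again reusing the ``tokenizer`` and
``trainer`` from above:

.. code-block:: python

    import gzip

    # Hypothetical paths: swap in your own files.
    paths = ["data/my-file.0.gz", "data/extra-corpus.txt"]

    def line_iterator():
        for path in paths:
            opener = gzip.open if path.endswith(".gz") else open
            with opener(path, "rt") as f:
                for line in f:
                    yield line

    tokenizer.train_from_iterator(line_iterator(), trainer=trainer)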