Doc - Update Normalizer part of the Pipeline page
@@ -12,7 +12,7 @@ style:
 check-style:
 	npm run lint-check
 
-TESTS_RESOURCES = $(DATA_DIR)/small.txt $(DATA_DIR)/roberta.json
+TESTS_RESOURCES = $(DATA_DIR)/small.txt $(DATA_DIR)/roberta.json $(DATA_DIR)/tokenizer-wiki.json
 
 # Launch the test suite
 test: $(TESTS_RESOURCES)
@@ -28,3 +28,7 @@ $(DATA_DIR)/small.txt : $(DATA_DIR)/big.txt
 $(DATA_DIR)/roberta.json :
 	$(dir_guard)
 	wget https://storage.googleapis.com/tokenizers/roberta.json -O $@
+
+$(DATA_DIR)/tokenizer-wiki.json :
+	$(dir_guard)
+	wget https://s3.amazonaws.com/models.huggingface.co/bert/anthony/doc-quicktour/tokenizer.json -O $@
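For anyone reproducing the test setup without make, here is a rough Python sketch of what the new rule does: it fetches the quicktour tokenizer JSON into the data directory when it is missing. The DATA_DIR value and the use of urllib are illustrative assumptions; the Makefile itself relies on wget and $(dir_guard).

import os
import urllib.request

# Hypothetical stand-in for the Makefile's $(DATA_DIR); adjust to your layout.
DATA_DIR = "data"
URL = "https://s3.amazonaws.com/models.huggingface.co/bert/anthony/doc-quicktour/tokenizer.json"
target = os.path.join(DATA_DIR, "tokenizer-wiki.json")

if not os.path.exists(target):             # mirrors the make dependency check
    os.makedirs(DATA_DIR, exist_ok=True)   # mirrors $(dir_guard)
    urllib.request.urlretrieve(URL, target)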
bindings/node/examples/documentation/pipeline.test.ts (new file, 30 lines)
@@ -0,0 +1,30 @@
+const globRequire = require;
+
+describe("pipelineExample", () => {
+    // This is a hack to let us require using path similar to what the user has to use
+    function require(mod: string) {
+        let path = mod.slice("tokenizers/".length);
+        return globRequire("../../lib/" + path);
+    }
+
+    it("", async () => {
+        // START reload_tokenizer
+        const { Tokenizer } = require("tokenizers/bindings/tokenizer");
+
+        const tokenizer = Tokenizer.fromFile("data/tokenizer-wiki.json");
+        // END reload_tokenizer
+        // START setup_normalizer
+        const { sequenceNormalizer, nfdNormalizer, stripAccentsNormalizer } = require("tokenizers/bindings/normalizers");
+
+        const normalizer = sequenceNormalizer([nfdNormalizer(), stripAccentsNormalizer()]);
+        // END setup_normalizer
+        // START test_normalizer
+        let normalized = normalizer.normalizeStr("Héllò hôw are ü?")
+        // "Hello how are u?"
+        // END test_normalizer
+        expect(normalized).toEqual("Hello how are u?");
+        // START replace_normalizer
+        tokenizer.setNormalizer(normalizer)
+        // END replace_normalizer
+    });
+});
bindings/python/tests/documentation/test_pipeline.py (new file, 32 lines)
@@ -0,0 +1,32 @@
+from ..utils import data_dir, doc_wiki_tokenizer
+from tokenizers import Tokenizer
+
+
+class TestPipeline:
+    def test_pipeline(self, doc_wiki_tokenizer):
+        def print(*args, **kwargs):
+            pass
+
+        try:
+            # START reload_tokenizer
+            from tokenizers import Tokenizer
+
+            tokenizer = Tokenizer.from_file("data/tokenizer-wiki.json")
+            # END reload_tokenizer
+        except Exception:
+            tokenizer = Tokenizer.from_file(doc_wiki_tokenizer)
+
+        # START setup_normalizer
+        from tokenizers import normalizers
+        from tokenizers.normalizers import NFD, StripAccents
+
+        normalizer = normalizers.Sequence([NFD(), StripAccents()])
+        # END setup_normalizer
+        # START test_normalizer
+        normalizer.normalize_str("Héllò hôw are ü?")
+        # "Hello how are u?"
+        # END test_normalizer
+        assert normalizer.normalize_str("Héllò hôw are ü?") == "Hello how are u?"
+        # START replace_normalizer
+        tokenizer.normalizer = normalizer
+        # END replace_normalizer
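These test files double as the source of the documentation snippets: Sphinx's literalinclude pulls out everything between a START/END comment pair and strips the indentation given by :dedent:. As a rough illustration of that mechanism (not the Sphinx implementation itself), a marker extractor could look like the sketch below; the function name and exact behaviour are illustrative assumptions.

def extract_snippet(path, name, dedent=8):
    # Illustrative only: mimics what literalinclude's :start-after:/:end-before:/:dedent:
    # options do with the "# START name" / "# END name" markers in the test files.
    lines, keep = [], False
    with open(path, encoding="utf-8") as f:
        for line in f:
            if f"END {name}" in line:
                keep = False
            if keep:
                lines.append(line[dedent:] if line.strip() else line)
            if f"START {name}" in line:
                keep = True
    return "".join(lines)

# Example call (hypothetical path, relative to the repository root):
# print(extract_snippet("bindings/python/tests/documentation/test_pipeline.py", "setup_normalizer"))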
@@ -26,6 +26,8 @@ class RustRef:
             l, title = self.make_func_link(parts, title)
         if doctype == "meth":
             l, title = self.make_meth_link(parts, title)
+        if doctype == "trait":
+            l, title = self.make_trait_link(parts, title)
         link += l
 
         node = nodes.reference(internal=False, refuri=link, text=title)
@@ -72,11 +74,23 @@ class RustRef:
 
         return link, title
 
+    def make_trait_link(self, parts, title):
+        link = ""
+        trait_name = parts[-1]
+
+        path = parts[:-1]
+        for p in path:
+            link += f"/{p}"
+        link += f"/trait.{trait_name}.html"
+
+        return link, title
+
 
 def setup(app):
     app.add_role("rust:struct", RustRef())
     app.add_role("rust:func", RustRef())
     app.add_role("rust:meth", RustRef())
+    app.add_role("rust:trait", RustRef())
 
     return {
         "version": "0.1",
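The new rust:trait role builds links of the form /<module path>/trait.<Name>.html, in the same spirit as the existing struct, func and meth helpers. A quick sketch of that link construction outside Sphinx, with a made-up parts list as input:

def make_trait_link(parts):
    # Same construction as RustRef.make_trait_link: path segments, then trait.<Name>.html
    *path, trait_name = parts
    link = "".join(f"/{p}" for p in path)
    return link + f"/trait.{trait_name}.html"

# e.g. a role target already split on "::" (hypothetical example):
print(make_trait_link(["tokenizers", "normalizers", "Normalizer"]))
# -> /tokenizers/normalizers/trait.Normalizer.html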
@@ -1,8 +1,8 @@
 The tokenization pipeline
 ====================================================================================================
 
-When calling :meth:`~tokenizers.Tokenizer.encode` or :meth:`~tokenizers.Tokenizer.encode_batch`, the
-input text(s) go through the following pipeline:
+When calling :entity:`Tokenizer.encode` or :entity:`Tokenizer.encode_batch`, the input text(s) go
+through the following pipeline:
 
 - :ref:`normalization`
 - :ref:`pre-tokenization`
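The paragraph above describes the stages that run inside encode. As a minimal Python sketch of what that looks like end to end, assuming the data/tokenizer-wiki.json file downloaded by the Makefile rule earlier:

from tokenizers import Tokenizer

# Normalization, pre-tokenization, the model and post-processing all run inside encode().
tokenizer = Tokenizer.from_file("data/tokenizer-wiki.json")
encoding = tokenizer.encode("Héllò hôw are ü?")
print(encoding.tokens)  # tokens produced after the full pipeline
print(encoding.ids)     # their ids in the vocabulary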
@@ -14,14 +14,32 @@ We'll see in details what happens during each of those steps in detail, as well
 each of those steps to your needs. If you're already familiar with those steps and want to learn by
 seeing some code, jump to :ref:`our BERT from scratch example <example>`.
 
-For the examples that require a :class:`~tokenizers.Tokenizer`, we will use the tokenizer we trained
+For the examples that require a :entity:`Tokenizer`, we will use the tokenizer we trained
 in the :doc:`quicktour`, which you can load with:
 
-.. code-block:: python
+.. only:: python
 
-    from tokenizers import Tokenizer
+    .. literalinclude:: ../../bindings/python/tests/documentation/test_pipeline.py
+        :language: python
+        :start-after: START reload_tokenizer
+        :end-before: END reload_tokenizer
+        :dedent: 8
 
-    tokenizer = Tokenizer.from_file("pretrained/wiki.json")
+.. only:: rust
+
+    .. literalinclude:: ../../tokenizers/tests/documentation.rs
+        :language: rust
+        :start-after: START pipeline_reload_tokenizer
+        :end-before: END pipeline_reload_tokenizer
+        :dedent: 4
+
+.. only:: node
+
+    .. literalinclude:: ../../bindings/node/examples/documentation/pipeline.test.ts
+        :language: javascript
+        :start-after: START reload_tokenizer
+        :end-before: END reload_tokenizer
+        :dedent: 8
 
 
 .. _normalization:
@@ -36,31 +54,88 @@ or lowercasing all text. If you're familiar with `Unicode normalization
 in most tokenizers.
 
 Each normalization operation is represented in the 🤗 Tokenizers library by a
-:class:`~tokenizers.normalizers.Normalizer`, and you can combine several of those by using a
-:class:`~tokenizers.normalizers.Sequence`. Here is a normalizer applying NFD Unicode normalization
+:entity:`Normalizer`, and you can combine several of those by using a
+:entity:`normalizers.Sequence`. Here is a normalizer applying NFD Unicode normalization
 and removing accents as an example:
 
-.. code-block:: python
+.. only:: python
 
-    import tokenizers
-    from tokenizers.normalizers import NFD, StripAccents
-
-    normalizer = tokenizers.normalizers.Sequence([NFD(), StripAccents()])
-
-You can apply that normalizer to any string with the
-:meth:`~tokenizers.normalizers.Normalizer.normalize_str` method:
-
-.. code-block:: python
-
-    normalizer.normalize_str("Héllò hôw are ü?")
-    # "Hello how are u?"
-
-When building a :class:`~tokenizers.Tokenizer`, you can customize its normalizer by just changing
+    .. literalinclude:: ../../bindings/python/tests/documentation/test_pipeline.py
+        :language: python
+        :start-after: START setup_normalizer
+        :end-before: END setup_normalizer
+        :dedent: 8
+
+.. only:: rust
+
+    .. literalinclude:: ../../tokenizers/tests/documentation.rs
+        :language: rust
+        :start-after: START pipeline_setup_normalizer
+        :end-before: END pipeline_setup_normalizer
+        :dedent: 4
+
+.. only:: node
+
+    .. literalinclude:: ../../bindings/node/examples/documentation/pipeline.test.ts
+        :language: javascript
+        :start-after: START setup_normalizer
+        :end-before: END setup_normalizer
+        :dedent: 8
+
+You can manually test that normalizer by applying it to any string:
+
+.. only:: python
+
+    .. literalinclude:: ../../bindings/python/tests/documentation/test_pipeline.py
+        :language: python
+        :start-after: START test_normalizer
+        :end-before: END test_normalizer
+        :dedent: 8
+
+.. only:: rust
+
+    .. literalinclude:: ../../tokenizers/tests/documentation.rs
+        :language: rust
+        :start-after: START pipeline_test_normalizer
+        :end-before: END pipeline_test_normalizer
+        :dedent: 4
+
+.. only:: node
+
+    .. literalinclude:: ../../bindings/node/examples/documentation/pipeline.test.ts
+        :language: javascript
+        :start-after: START test_normalizer
+        :end-before: END test_normalizer
+        :dedent: 8
+
+
+When building a :entity:`Tokenizer`, you can customize its normalizer by just changing
 the corresponding attribute:
 
-.. code-block:: python
+.. only:: python
 
-    tokenizer.normalizer = normalizer
+    .. literalinclude:: ../../bindings/python/tests/documentation/test_pipeline.py
+        :language: python
+        :start-after: START replace_normalizer
+        :end-before: END replace_normalizer
+        :dedent: 8
+
+.. only:: rust
+
+    .. literalinclude:: ../../tokenizers/tests/documentation.rs
+        :language: rust
+        :start-after: START pipeline_replace_normalizer
+        :end-before: END pipeline_replace_normalizer
+        :dedent: 4
+
+.. only:: node
+
+    .. literalinclude:: ../../bindings/node/examples/documentation/pipeline.test.ts
+        :language: javascript
+        :start-after: START replace_normalizer
+        :end-before: END replace_normalizer
+        :dedent: 8
 
 Of course, if you change the way a tokenizer applies normalization, you should probably retrain it
 from scratch afterward.
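Since the rendered page now pulls these snippets from the test files rather than showing them inline in the source, here is the Python variant gathered in one place. It is simply the calls that appear between the setup_normalizer, test_normalizer and replace_normalizer markers above; loading the tokenizer assumes the downloaded data/tokenizer-wiki.json.

from tokenizers import Tokenizer, normalizers
from tokenizers.normalizers import NFD, StripAccents

# Combine several normalization operations into a single Normalizer.
normalizer = normalizers.Sequence([NFD(), StripAccents()])

# Try it on a raw string.
assert normalizer.normalize_str("Héllò hôw are ü?") == "Hello how are u?"

# Attach it to a tokenizer by replacing the corresponding attribute.
tokenizer = Tokenizer.from_file("data/tokenizer-wiki.json")
tokenizer.normalizer = normalizer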
@@ -262,3 +262,34 @@ fn quicktour() -> tokenizers::Result<()> {
     assert_eq!(output[1].get_attention_mask(), [1, 1, 1, 1, 1, 1, 1, 0]);
     Ok(())
 }
+
+#[test]
+fn pipeline() -> tokenizers::Result<()> {
+    // START pipeline_reload_tokenizer
+    use tokenizers::Tokenizer;
+
+    let mut tokenizer = Tokenizer::from_file("data/tokenizer-wiki.json")?;
+    // END pipeline_reload_tokenizer
+    // START pipeline_setup_normalizer
+    use tokenizers::normalizers::{
+        strip::StripAccents, unicode::NFD, utils::Sequence as NormalizerSequence,
+    };
+
+    let normalizer = NormalizerSequence::new(vec![NFD.into(), StripAccents.into()]);
+    // END pipeline_setup_normalizer
+    // START pipeline_test_normalizer
+    use tokenizers::{NormalizedString, Normalizer};
+
+    let mut normalized = NormalizedString::from("Héllò hôw are ü?");
+    normalizer.normalize(&mut normalized)?;
+
+    println!("{}", normalized.get());
+    // "Hello how are u?"
+    // END pipeline_test_normalizer
+    assert_eq!(normalized.get(), "Hello how are u?");
+    // START pipeline_replace_normalizer
+    tokenizer.with_normalizer(normalizer);
+    // END pipeline_replace_normalizer
+
+    Ok(())
+}