Doc - Update Normalizer part of the Pipeline page
@@ -12,7 +12,7 @@ style:

 check-style:
	npm run lint-check

-TESTS_RESOURCES = $(DATA_DIR)/small.txt $(DATA_DIR)/roberta.json
+TESTS_RESOURCES = $(DATA_DIR)/small.txt $(DATA_DIR)/roberta.json $(DATA_DIR)/tokenizer-wiki.json

 # Launch the test suite
 test: $(TESTS_RESOURCES)
@@ -28,3 +28,7 @@ $(DATA_DIR)/small.txt : $(DATA_DIR)/big.txt
 $(DATA_DIR)/roberta.json :
	$(dir_guard)
	wget https://storage.googleapis.com/tokenizers/roberta.json -O $@
+
+$(DATA_DIR)/tokenizer-wiki.json :
+	$(dir_guard)
+	wget https://s3.amazonaws.com/models.huggingface.co/bert/anthony/doc-quicktour/tokenizer.json -O $@

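The new Makefile target fetches the tokenizer file the documentation tests load. As a rough Python equivalent (not part of the commit), this sketch downloads the same file into the data/ directory the tests expect:

import os
import urllib.request

# Mirrors what the $(DATA_DIR)/tokenizer-wiki.json target does ($(dir_guard) then wget),
# assuming DATA_DIR is the local "data" directory.
url = "https://s3.amazonaws.com/models.huggingface.co/bert/anthony/doc-quicktour/tokenizer.json"
os.makedirs("data", exist_ok=True)
urllib.request.urlretrieve(url, "data/tokenizer-wiki.json")
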
bindings/node/examples/documentation/pipeline.test.ts (new file, 30 lines)

@@ -0,0 +1,30 @@
+const globRequire = require;
+
+describe("pipelineExample", () => {
+  // This is a hack to let us require using path similar to what the user has to use
+  function require(mod: string) {
+    let path = mod.slice("tokenizers/".length);
+    return globRequire("../../lib/" + path);
+  }
+
+  it("", async () => {
+    // START reload_tokenizer
+    const { Tokenizer } = require("tokenizers/bindings/tokenizer");
+
+    const tokenizer = Tokenizer.fromFile("data/tokenizer-wiki.json");
+    // END reload_tokenizer
+    // START setup_normalizer
+    const { sequenceNormalizer, nfdNormalizer, stripAccentsNormalizer } = require("tokenizers/bindings/normalizers");
+
+    const normalizer = sequenceNormalizer([nfdNormalizer(), stripAccentsNormalizer()]);
+    // END setup_normalizer
+    // START test_normalizer
+    let normalized = normalizer.normalizeStr("Héllò hôw are ü?")
+    // "Hello how are u?"
+    // END test_normalizer
+    expect(normalized).toEqual("Hello how are u?");
+    // START replace_normalizer
+    tokenizer.setNormalizer(normalizer)
+    // END replace_normalizer
+  });
+});

bindings/python/tests/documentation/test_pipeline.py (new file, 32 lines)

@@ -0,0 +1,32 @@
+from ..utils import data_dir, doc_wiki_tokenizer
+from tokenizers import Tokenizer
+
+
+class TestPipeline:
+    def test_pipeline(self, doc_wiki_tokenizer):
+        def print(*args, **kwargs):
+            pass
+
+        try:
+            # START reload_tokenizer
+            from tokenizers import Tokenizer
+
+            tokenizer = Tokenizer.from_file("data/tokenizer-wiki.json")
+            # END reload_tokenizer
+        except Exception:
+            tokenizer = Tokenizer.from_file(doc_wiki_tokenizer)
+
+        # START setup_normalizer
+        from tokenizers import normalizers
+        from tokenizers.normalizers import NFD, StripAccents
+
+        normalizer = normalizers.Sequence([NFD(), StripAccents()])
+        # END setup_normalizer
+        # START test_normalizer
+        normalizer.normalize_str("Héllò hôw are ü?")
+        # "Hello how are u?"
+        # END test_normalizer
+        assert normalizer.normalize_str("Héllò hôw are ü?") == "Hello how are u?"
+        # START replace_normalizer
+        tokenizer.normalizer = normalizer
+        # END replace_normalizer

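Taken out of the pytest harness, the documented steps amount to a few lines. A minimal usage sketch, assuming the data/tokenizer-wiki.json file fetched by the new Makefile target is present; the exact tokens depend on the vocabulary trained in the quicktour:

from tokenizers import Tokenizer, normalizers
from tokenizers.normalizers import NFD, StripAccents

# Reload the tokenizer trained in the quicktour and swap in a custom normalizer.
tokenizer = Tokenizer.from_file("data/tokenizer-wiki.json")
tokenizer.normalizer = normalizers.Sequence([NFD(), StripAccents()])

# The normalizer now runs as the first step of every encode call.
output = tokenizer.encode("Héllò hôw are ü?")
print(output.tokens)  # accent-free tokens; exact values depend on the trained vocabulary
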
@@ -26,6 +26,8 @@ class RustRef:
             l, title = self.make_func_link(parts, title)
         if doctype == "meth":
             l, title = self.make_meth_link(parts, title)
+        if doctype == "trait":
+            l, title = self.make_trait_link(parts, title)
         link += l

         node = nodes.reference(internal=False, refuri=link, text=title)
@@ -72,11 +74,23 @@ class RustRef:

         return link, title

+    def make_trait_link(self, parts, title):
+        link = ""
+        trait_name = parts[-1]
+
+        path = parts[:-1]
+        for p in path:
+            link += f"/{p}"
+        link += f"/trait.{trait_name}.html"
+
+        return link, title
+

 def setup(app):
     app.add_role("rust:struct", RustRef())
     app.add_role("rust:func", RustRef())
     app.add_role("rust:meth", RustRef())
+    app.add_role("rust:trait", RustRef())

     return {
         "version": "0.1",

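The new make_trait_link helper follows the same pattern as the existing struct/func/meth helpers, so a rust:trait role resolves to a trait.<Name>.html page. A small standalone sketch of that logic (the example parts value is illustrative, not taken from the commit):

def make_trait_link(parts, title):
    # Same construction as in the RustRef extension above: every leading part
    # becomes a path segment, the last part names the trait page.
    link = ""
    trait_name = parts[-1]
    for p in parts[:-1]:
        link += f"/{p}"
    link += f"/trait.{trait_name}.html"
    return link, title

# e.g. a role like :rust:trait:`tokenizers::Normalizer` would yield:
print(make_trait_link(["tokenizers", "Normalizer"], "Normalizer")[0])
# /tokenizers/trait.Normalizer.html
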
@@ -1,8 +1,8 @@
 The tokenization pipeline
 ====================================================================================================

-When calling :meth:`~tokenizers.Tokenizer.encode` or :meth:`~tokenizers.Tokenizer.encode_batch`, the
-input text(s) go through the following pipeline:
+When calling :entity:`Tokenizer.encode` or :entity:`Tokenizer.encode_batch`, the input text(s) go
+through the following pipeline:

 - :ref:`normalization`
 - :ref:`pre-tokenization`
@@ -14,14 +14,32 @@ We'll see in details what happens during each of those steps in detail, as well
 each of those steps to your needs. If you're already familiar with those steps and want to learn by
 seeing some code, jump to :ref:`our BERT from scratch example <example>`.

-For the examples that require a :class:`~tokenizers.Tokenizer`, we will use the tokenizer we trained
+For the examples that require a :entity:`Tokenizer`, we will use the tokenizer we trained
 in the :doc:`quicktour`, which you can load with:

-.. code-block:: python
+.. only:: python

-    from tokenizers import Tokenizer
+    .. literalinclude:: ../../bindings/python/tests/documentation/test_pipeline.py
+        :language: python
+        :start-after: START reload_tokenizer
+        :end-before: END reload_tokenizer
+        :dedent: 8

-    tokenizer = Tokenizer.from_file("pretrained/wiki.json")
+.. only:: rust
+
+    .. literalinclude:: ../../tokenizers/tests/documentation.rs
+        :language: rust
+        :start-after: START pipeline_reload_tokenizer
+        :end-before: END pipeline_reload_tokenizer
+        :dedent: 4
+
+.. only:: node
+
+    .. literalinclude:: ../../bindings/node/examples/documentation/pipeline.test.ts
+        :language: javascript
+        :start-after: START reload_tokenizer
+        :end-before: END reload_tokenizer
+        :dedent: 8


 .. _normalization:
@@ -36,31 +54,88 @@ or lowercasing all text. If you're familiar with `Unicode normalization
 in most tokenizers.

 Each normalization operation is represented in the 🤗 Tokenizers library by a
-:class:`~tokenizers.normalizers.Normalizer`, and you can combine several of those by using a
-:class:`~tokenizers.normalizers.Sequence`. Here is a normalizer applying NFD Unicode normalization
+:entity:`Normalizer`, and you can combine several of those by using a
+:entity:`normalizers.Sequence`. Here is a normalizer applying NFD Unicode normalization
 and removing accents as an example:

-.. code-block:: python
+.. only:: python

-    import tokenizers
-    from tokenizers.normalizers import NFD, StripAccents
+    .. literalinclude:: ../../bindings/python/tests/documentation/test_pipeline.py
+        :language: python
+        :start-after: START setup_normalizer
+        :end-before: END setup_normalizer
+        :dedent: 8

-    normalizer = tokenizers.normalizers.Sequence([NFD(), StripAccents()])
+.. only:: rust

-You can apply that normalizer to any string with the
-:meth:`~tokenizers.normalizers.Normalizer.normalize_str` method:
+    .. literalinclude:: ../../tokenizers/tests/documentation.rs
+        :language: rust
+        :start-after: START pipeline_setup_normalizer
+        :end-before: END pipeline_setup_normalizer
+        :dedent: 4

-.. code-block:: python
+.. only:: node

-    normalizer.normalize_str("Héllò hôw are ü?")
-    # "Hello how are u?"
+    .. literalinclude:: ../../bindings/node/examples/documentation/pipeline.test.ts
+        :language: javascript
+        :start-after: START setup_normalizer
+        :end-before: END setup_normalizer
+        :dedent: 8

-When building a :class:`~tokenizers.Tokenizer`, you can customize its normalizer by just changing
+
+You can manually test that normalizer by applying it to any string:
+
+.. only:: python
+
+    .. literalinclude:: ../../bindings/python/tests/documentation/test_pipeline.py
+        :language: python
+        :start-after: START test_normalizer
+        :end-before: END test_normalizer
+        :dedent: 8
+
+.. only:: rust
+
+    .. literalinclude:: ../../tokenizers/tests/documentation.rs
+        :language: rust
+        :start-after: START pipeline_test_normalizer
+        :end-before: END pipeline_test_normalizer
+        :dedent: 4
+
+.. only:: node
+
+    .. literalinclude:: ../../bindings/node/examples/documentation/pipeline.test.ts
+        :language: javascript
+        :start-after: START test_normalizer
+        :end-before: END test_normalizer
+        :dedent: 8
+
+
+When building a :entity:`Tokenizer`, you can customize its normalizer by just changing
 the corresponding attribute:

-.. code-block:: python
+.. only:: python

-    tokenizer.normalizer = normalizer
+    .. literalinclude:: ../../bindings/python/tests/documentation/test_pipeline.py
+        :language: python
+        :start-after: START replace_normalizer
+        :end-before: END replace_normalizer
+        :dedent: 8
+
+.. only:: rust
+
+    .. literalinclude:: ../../tokenizers/tests/documentation.rs
+        :language: rust
+        :start-after: START pipeline_replace_normalizer
+        :end-before: END pipeline_replace_normalizer
+        :dedent: 4
+
+.. only:: node
+
+    .. literalinclude:: ../../bindings/node/examples/documentation/pipeline.test.ts
+        :language: javascript
+        :start-after: START replace_normalizer
+        :end-before: END replace_normalizer
+        :dedent: 8

 Of course, if you change the way a tokenizer applies normalization, you should probably retrain it
 from scratch afterward.

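The updated page documents the NFD + StripAccents sequence through the literalinclude blocks above. For intuition only, here is a pure standard-library Python sketch (not the 🤗 Tokenizers implementation) of what that normalization does to a string:

import unicodedata

def nfd_strip_accents(text: str) -> str:
    # NFD splits precomposed characters such as "é" into "e" plus a combining accent,
    # then the combining marks (Unicode category Mn) are dropped, mirroring StripAccents.
    decomposed = unicodedata.normalize("NFD", text)
    return "".join(ch for ch in decomposed if unicodedata.category(ch) != "Mn")

print(nfd_strip_accents("Héllò hôw are ü?"))  # Hello how are u?
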
@@ -262,3 +262,34 @@ fn quicktour() -> tokenizers::Result<()> {
     assert_eq!(output[1].get_attention_mask(), [1, 1, 1, 1, 1, 1, 1, 0]);
     Ok(())
 }
+
+#[test]
+fn pipeline() -> tokenizers::Result<()> {
+    // START pipeline_reload_tokenizer
+    use tokenizers::Tokenizer;
+
+    let mut tokenizer = Tokenizer::from_file("data/tokenizer-wiki.json")?;
+    // END pipeline_reload_tokenizer
+    // START pipeline_setup_normalizer
+    use tokenizers::normalizers::{
+        strip::StripAccents, unicode::NFD, utils::Sequence as NormalizerSequence,
+    };
+
+    let normalizer = NormalizerSequence::new(vec![NFD.into(), StripAccents.into()]);
+    // END pipeline_setup_normalizer
+    // START pipeline_test_normalizer
+    use tokenizers::{NormalizedString, Normalizer};
+
+    let mut normalized = NormalizedString::from("Héllò hôw are ü?");
+    normalizer.normalize(&mut normalized)?;
+
+    println!("{}", normalized.get());
+    // "Hello how are u?"
+    // END pipeline_test_normalizer
+    assert_eq!(normalized.get(), "Hello how are u?");
+    // START pipeline_replace_normalizer
+    tokenizer.with_normalizer(normalizer);
+    // END pipeline_replace_normalizer
+
+    Ok(())
+}