diff --git a/bindings/node/Makefile b/bindings/node/Makefile
index 7998e0a2..60b23c7e 100644
--- a/bindings/node/Makefile
+++ b/bindings/node/Makefile
@@ -12,7 +12,7 @@ style:
 check-style:
 	npm run lint-check
 
-TESTS_RESOURCES = $(DATA_DIR)/small.txt $(DATA_DIR)/roberta.json
+TESTS_RESOURCES = $(DATA_DIR)/small.txt $(DATA_DIR)/roberta.json $(DATA_DIR)/tokenizer-wiki.json
 
 # Launch the test suite
 test: $(TESTS_RESOURCES)
@@ -28,3 +28,7 @@ $(DATA_DIR)/small.txt : $(DATA_DIR)/big.txt
 $(DATA_DIR)/roberta.json :
 	$(dir_guard)
 	wget https://storage.googleapis.com/tokenizers/roberta.json -O $@
+
+$(DATA_DIR)/tokenizer-wiki.json :
+	$(dir_guard)
+	wget https://s3.amazonaws.com/models.huggingface.co/bert/anthony/doc-quicktour/tokenizer.json -O $@
diff --git a/bindings/node/examples/documentation/pipeline.test.ts b/bindings/node/examples/documentation/pipeline.test.ts
new file mode 100644
index 00000000..cefd8c7c
--- /dev/null
+++ b/bindings/node/examples/documentation/pipeline.test.ts
@@ -0,0 +1,30 @@
+const globRequire = require;
+
+describe("pipelineExample", () => {
+  // This is a hack to let us require using path similar to what the user has to use
+  function require(mod: string) {
+    let path = mod.slice("tokenizers/".length);
+    return globRequire("../../lib/" + path);
+  }
+
+  it("", async () => {
+    // START reload_tokenizer
+    const { Tokenizer } = require("tokenizers/bindings/tokenizer");
+
+    const tokenizer = Tokenizer.fromFile("data/tokenizer-wiki.json");
+    // END reload_tokenizer
+    // START setup_normalizer
+    const { sequenceNormalizer, nfdNormalizer, stripAccentsNormalizer } = require("tokenizers/bindings/normalizers");
+
+    const normalizer = sequenceNormalizer([nfdNormalizer(), stripAccentsNormalizer()]);
+    // END setup_normalizer
+    // START test_normalizer
+    let normalized = normalizer.normalizeStr("Héllò hôw are ü?")
+    // "Hello how are u?"
+    // END test_normalizer
+    expect(normalized).toEqual("Hello how are u?");
+    // START replace_normalizer
+    tokenizer.setNormalizer(normalizer)
+    // END replace_normalizer
+  });
+});
diff --git a/bindings/python/tests/documentation/test_pipeline.py b/bindings/python/tests/documentation/test_pipeline.py
new file mode 100644
index 00000000..30c58f93
--- /dev/null
+++ b/bindings/python/tests/documentation/test_pipeline.py
@@ -0,0 +1,32 @@
+from ..utils import data_dir, doc_wiki_tokenizer
+from tokenizers import Tokenizer
+
+
+class TestPipeline:
+    def test_pipeline(self, doc_wiki_tokenizer):
+        def print(*args, **kwargs):
+            pass
+
+        try:
+            # START reload_tokenizer
+            from tokenizers import Tokenizer
+
+            tokenizer = Tokenizer.from_file("data/tokenizer-wiki.json")
+            # END reload_tokenizer
+        except Exception:
+            tokenizer = Tokenizer.from_file(doc_wiki_tokenizer)
+
+        # START setup_normalizer
+        from tokenizers import normalizers
+        from tokenizers.normalizers import NFD, StripAccents
+
+        normalizer = normalizers.Sequence([NFD(), StripAccents()])
+        # END setup_normalizer
+        # START test_normalizer
+        normalizer.normalize_str("Héllò hôw are ü?")
+        # "Hello how are u?"
+        # END test_normalizer
+        assert normalizer.normalize_str("Héllò hôw are ü?") == "Hello how are u?"
+        # START replace_normalizer
+        tokenizer.normalizer = normalizer
+        # END replace_normalizer
diff --git a/docs/source/_ext/rust_doc.py b/docs/source/_ext/rust_doc.py
index 699462d6..ca549d49 100644
--- a/docs/source/_ext/rust_doc.py
+++ b/docs/source/_ext/rust_doc.py
@@ -26,6 +26,8 @@ class RustRef:
             l, title = self.make_func_link(parts, title)
         if doctype == "meth":
             l, title = self.make_meth_link(parts, title)
+        if doctype == "trait":
+            l, title = self.make_trait_link(parts, title)
 
         link += l
         node = nodes.reference(internal=False, refuri=link, text=title)
@@ -72,11 +74,23 @@ class RustRef:
 
         return link, title
 
+    def make_trait_link(self, parts, title):
+        link = ""
+        trait_name = parts[-1]
+
+        path = parts[:-1]
+        for p in path:
+            link += f"/{p}"
+        link += f"/trait.{trait_name}.html"
+
+        return link, title
+
 
 def setup(app):
     app.add_role("rust:struct", RustRef())
     app.add_role("rust:func", RustRef())
     app.add_role("rust:meth", RustRef())
+    app.add_role("rust:trait", RustRef())
 
     return {
         "version": "0.1",
diff --git a/docs/source/pipeline.rst b/docs/source/pipeline.rst
index 6f106fd5..153140ba 100644
--- a/docs/source/pipeline.rst
+++ b/docs/source/pipeline.rst
@@ -1,8 +1,8 @@
 The tokenization pipeline
 ====================================================================================================
 
-When calling :meth:`~tokenizers.Tokenizer.encode` or :meth:`~tokenizers.Tokenizer.encode_batch`, the
-input text(s) go through the following pipeline:
+When calling :entity:`Tokenizer.encode` or :entity:`Tokenizer.encode_batch`, the input text(s) go
+through the following pipeline:
 
 - :ref:`normalization`
 - :ref:`pre-tokenization`
@@ -14,14 +14,32 @@ We'll see in details what happens during each of those steps in detail, as well
 each of those steps to your needs. If you're already familiar with those steps and want to learn by
 seeing some code, jump to :ref:`our BERT from scratch example <example>`.
 
-For the examples that require a :class:`~tokenizers.Tokenizer`, we will use the tokenizer we trained
+For the examples that require a :entity:`Tokenizer`, we will use the tokenizer we trained
 in the :doc:`quicktour`, which you can load with:
 
-.. code-block:: python
+.. only:: python
 
-    from tokenizers import Tokenizer
+    .. literalinclude:: ../../bindings/python/tests/documentation/test_pipeline.py
+        :language: python
+        :start-after: START reload_tokenizer
+        :end-before: END reload_tokenizer
+        :dedent: 8
 
-    tokenizer = Tokenizer.from_file("pretrained/wiki.json")
+.. only:: rust
+
+    .. literalinclude:: ../../tokenizers/tests/documentation.rs
+        :language: rust
+        :start-after: START pipeline_reload_tokenizer
+        :end-before: END pipeline_reload_tokenizer
+        :dedent: 4
+
+.. only:: node
+
+    .. literalinclude:: ../../bindings/node/examples/documentation/pipeline.test.ts
+        :language: javascript
+        :start-after: START reload_tokenizer
+        :end-before: END reload_tokenizer
+        :dedent: 8
 
 .. _normalization:
 
@@ -36,31 +54,88 @@ or lowercasing all text. If you're familiar with `Unicode normalization
 in most tokenizers.
 
 Each normalization operation is represented in the 🤗 Tokenizers library by a
-:class:`~tokenizers.normalizers.Normalizer`, and you can combine several of those by using a
-:class:`~tokenizers.normalizers.Sequence`. Here is a normalizer applying NFD Unicode normalization
+:entity:`Normalizer`, and you can combine several of those by using a
+:entity:`normalizers.Sequence`. Here is a normalizer applying NFD Unicode normalization
 and removing accents as an example:
 
-.. code-block:: python
+.. only:: python
 
-    import tokenizers
-    from tokenizers.normalizers import NFD, StripAccents
+    .. literalinclude:: ../../bindings/python/tests/documentation/test_pipeline.py
+        :language: python
+        :start-after: START setup_normalizer
+        :end-before: END setup_normalizer
+        :dedent: 8
 
-    normalizer = tokenizers.normalizers.Sequence([NFD(), StripAccents()])
+.. only:: rust
 
-You can apply that normalizer to any string with the
-:meth:`~tokenizers.normalizers.Normalizer.normalize_str` method:
+    .. literalinclude:: ../../tokenizers/tests/documentation.rs
+        :language: rust
+        :start-after: START pipeline_setup_normalizer
+        :end-before: END pipeline_setup_normalizer
+        :dedent: 4
 
-.. code-block:: python
+.. only:: node
 
-    normalizer.normalize_str("Héllò hôw are ü?")
-    # "Hello how are u?"
+    .. literalinclude:: ../../bindings/node/examples/documentation/pipeline.test.ts
+        :language: javascript
+        :start-after: START setup_normalizer
+        :end-before: END setup_normalizer
+        :dedent: 8
 
-When building a :class:`~tokenizers.Tokenizer`, you can customize its normalizer by just changing
+
+You can manually test that normalizer by applying it to any string:
+
+.. only:: python
+
+    .. literalinclude:: ../../bindings/python/tests/documentation/test_pipeline.py
+        :language: python
+        :start-after: START test_normalizer
+        :end-before: END test_normalizer
+        :dedent: 8
+
+.. only:: rust
+
+    .. literalinclude:: ../../tokenizers/tests/documentation.rs
+        :language: rust
+        :start-after: START pipeline_test_normalizer
+        :end-before: END pipeline_test_normalizer
+        :dedent: 4
+
+.. only:: node
+
+    .. literalinclude:: ../../bindings/node/examples/documentation/pipeline.test.ts
+        :language: javascript
+        :start-after: START test_normalizer
+        :end-before: END test_normalizer
+        :dedent: 8
+
+
+When building a :entity:`Tokenizer`, you can customize its normalizer by just changing
 the corresponding attribute:
 
-.. code-block:: python
+.. only:: python
 
-    tokenizer.normalizer = normalizer
+    .. literalinclude:: ../../bindings/python/tests/documentation/test_pipeline.py
+        :language: python
+        :start-after: START replace_normalizer
+        :end-before: END replace_normalizer
+        :dedent: 8
+
+.. only:: rust
+
+    .. literalinclude:: ../../tokenizers/tests/documentation.rs
+        :language: rust
+        :start-after: START pipeline_replace_normalizer
+        :end-before: END pipeline_replace_normalizer
+        :dedent: 4
+
+.. only:: node
+
+    .. literalinclude:: ../../bindings/node/examples/documentation/pipeline.test.ts
+        :language: javascript
+        :start-after: START replace_normalizer
+        :end-before: END replace_normalizer
+        :dedent: 8
 
 Of course, if you change the way a tokenizer applies normalization, you should probably retrain it
 from scratch afterward.
diff --git a/tokenizers/tests/documentation.rs b/tokenizers/tests/documentation.rs
index 8f1a8bf5..fb03b247 100644
--- a/tokenizers/tests/documentation.rs
+++ b/tokenizers/tests/documentation.rs
@@ -262,3 +262,34 @@ fn quicktour() -> tokenizers::Result<()> {
     assert_eq!(output[1].get_attention_mask(), [1, 1, 1, 1, 1, 1, 1, 0]);
     Ok(())
 }
+
+#[test]
+fn pipeline() -> tokenizers::Result<()> {
+    // START pipeline_reload_tokenizer
+    use tokenizers::Tokenizer;
+
+    let mut tokenizer = Tokenizer::from_file("data/tokenizer-wiki.json")?;
+    // END pipeline_reload_tokenizer
+    // START pipeline_setup_normalizer
+    use tokenizers::normalizers::{
+        strip::StripAccents, unicode::NFD, utils::Sequence as NormalizerSequence,
+    };
+
+    let normalizer = NormalizerSequence::new(vec![NFD.into(), StripAccents.into()]);
+    // END pipeline_setup_normalizer
+    // START pipeline_test_normalizer
+    use tokenizers::{NormalizedString, Normalizer};
+
+    let mut normalized = NormalizedString::from("Héllò hôw are ü?");
+    normalizer.normalize(&mut normalized)?;
+
+    println!("{}", normalized.get());
+    // "Hello how are u?"
+    // END pipeline_test_normalizer
+    assert_eq!(normalized.get(), "Hello how are u?");
+    // START pipeline_replace_normalizer
+    tokenizer.with_normalizer(normalizer);
+    // END pipeline_replace_normalizer
+
+    Ok(())
+}
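
Note (not part of the patch): taken together, the pieces above wire one example into the docs three times over. The Makefile fetches the data/tokenizer-wiki.json fixture, the Python, node and Rust tests exercise the START/END snippet markers, and pipeline.rst pulls those marked regions in through literalinclude. As a rough, self-contained sketch of the Python workflow those snippets document, using only calls that appear in the tests above and assuming data/tokenizer-wiki.json has already been downloaded:

# Sketch only, not part of this diff: the end-to-end pipeline the doc snippets describe.
# Assumes data/tokenizer-wiki.json has been fetched (see the Makefile target above).
from tokenizers import Tokenizer, normalizers
from tokenizers.normalizers import NFD, StripAccents

# Reload the tokenizer trained in the quicktour.
tokenizer = Tokenizer.from_file("data/tokenizer-wiki.json")

# Chain two normalization steps: NFD Unicode decomposition, then accent stripping.
normalizer = normalizers.Sequence([NFD(), StripAccents()])

# A normalizer can be tried out on a raw string before being attached to a tokenizer.
print(normalizer.normalize_str("Héllò hôw are ü?"))  # "Hello how are u?"

# Replace the tokenizer's normalizer with the new sequence.
tokenizer.normalizer = normalizer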