Doc - Update Normalizer part of the Pipeline page

Anthony MOI
2020-10-27 18:45:16 -04:00
committed by Anthony MOI
parent ab7bae466a
commit 13a80050f0
6 changed files with 207 additions and 21 deletions

View File

@@ -12,7 +12,7 @@ style:

check-style:
	npm run lint-check

TESTS_RESOURCES = $(DATA_DIR)/small.txt $(DATA_DIR)/roberta.json $(DATA_DIR)/tokenizer-wiki.json

# Launch the test suite
test: $(TESTS_RESOURCES)
@@ -28,3 +28,7 @@ $(DATA_DIR)/small.txt : $(DATA_DIR)/big.txt

$(DATA_DIR)/roberta.json :
	$(dir_guard)
	wget https://storage.googleapis.com/tokenizers/roberta.json -O $@

$(DATA_DIR)/tokenizer-wiki.json :
	$(dir_guard)
	wget https://s3.amazonaws.com/models.huggingface.co/bert/anthony/doc-quicktour/tokenizer.json -O $@

View File

@@ -0,0 +1,30 @@
const globRequire = require;

describe("pipelineExample", () => {
    // This is a hack to let us require using path similar to what the user has to use
    function require(mod: string) {
        let path = mod.slice("tokenizers/".length);
        return globRequire("../../lib/" + path);
    }

    it("", async () => {
        // START reload_tokenizer
        const { Tokenizer } = require("tokenizers/bindings/tokenizer");

        const tokenizer = Tokenizer.fromFile("data/tokenizer-wiki.json");
        // END reload_tokenizer
        // START setup_normalizer
        const { sequenceNormalizer, nfdNormalizer, stripAccentsNormalizer } = require("tokenizers/bindings/normalizers");

        const normalizer = sequenceNormalizer([nfdNormalizer(), stripAccentsNormalizer()]);
        // END setup_normalizer
        // START test_normalizer
        let normalized = normalizer.normalizeStr("Héllò hôw are ü?");
        // "Hello how are u?"
        // END test_normalizer
        expect(normalized).toEqual("Hello how are u?");
        // START replace_normalizer
        tokenizer.setNormalizer(normalizer);
        // END replace_normalizer
    });
});

View File

@@ -0,0 +1,32 @@
from ..utils import data_dir, doc_wiki_tokenizer

from tokenizers import Tokenizer


class TestPipeline:
    def test_pipeline(self, doc_wiki_tokenizer):
        def print(*args, **kwargs):
            pass

        try:
            # START reload_tokenizer
            from tokenizers import Tokenizer

            tokenizer = Tokenizer.from_file("data/tokenizer-wiki.json")
            # END reload_tokenizer
        except Exception:
            tokenizer = Tokenizer.from_file(doc_wiki_tokenizer)

        # START setup_normalizer
        from tokenizers import normalizers
        from tokenizers.normalizers import NFD, StripAccents

        normalizer = normalizers.Sequence([NFD(), StripAccents()])
        # END setup_normalizer
        # START test_normalizer
        normalizer.normalize_str("Héllò hôw are ü?")
        # "Hello how are u?"
        # END test_normalizer
        assert normalizer.normalize_str("Héllò hôw are ü?") == "Hello how are u?"
        # START replace_normalizer
        tokenizer.normalizer = normalizer
        # END replace_normalizer
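
Note that data_dir and doc_wiki_tokenizer imported from ..utils above are pytest fixtures defined elsewhere in the test suite (presumably bindings/python/tests/utils.py); they are not part of this diff. As a rough idea of what the fallback path relies on, here is a minimal, hypothetical sketch of such a fixture, assuming it simply makes the tokenizer JSON (the same file fetched by the Makefile rule above) available locally and returns its path:

    # Hypothetical sketch only: the real fixture may differ. It assumes
    # data_dir is a fixture that provides a writable directory, and uses
    # the requests library to download the file if it is missing.
    import os

    import pytest
    import requests


    @pytest.fixture(scope="session")
    def doc_wiki_tokenizer(data_dir):
        path = os.path.join(data_dir, "tokenizer-wiki.json")
        if not os.path.isfile(path):
            # Same URL as the Makefile rule that provisions the Node test data
            url = "https://s3.amazonaws.com/models.huggingface.co/bert/anthony/doc-quicktour/tokenizer.json"
            with open(path, "wb") as f:
                f.write(requests.get(url).content)
        return path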

View File

@@ -26,6 +26,8 @@ class RustRef:
            l, title = self.make_func_link(parts, title)
        if doctype == "meth":
            l, title = self.make_meth_link(parts, title)
        if doctype == "trait":
            l, title = self.make_trait_link(parts, title)

        link += l
        node = nodes.reference(internal=False, refuri=link, text=title)
@@ -72,11 +74,23 @@
        return link, title

    def make_trait_link(self, parts, title):
        link = ""
        trait_name = parts[-1]
        path = parts[:-1]
        for p in path:
            link += f"/{p}"
        link += f"/trait.{trait_name}.html"
        return link, title
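    # Illustrative note (not part of the committed file): for a hypothetical
    # reference such as :rust:trait:`tokenizers::Normalizer`, and assuming the
    # role target is split on "::" into parts = ["tokenizers", "Normalizer"],
    # this method would produce "/tokenizers/trait.Normalizer.html", mirroring
    # how the struct/func/meth variants above build their rustdoc paths.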

def setup(app):
    app.add_role("rust:struct", RustRef())
    app.add_role("rust:func", RustRef())
    app.add_role("rust:meth", RustRef())
    app.add_role("rust:trait", RustRef())

    return {
        "version": "0.1",

View File

@@ -1,8 +1,8 @@
The tokenization pipeline
====================================================================================================

When calling :entity:`Tokenizer.encode` or :entity:`Tokenizer.encode_batch`, the input text(s) go
through the following pipeline:

- :ref:`normalization`
- :ref:`pre-tokenization`
@@ -14,14 +14,32 @@ We'll see in details what happens during each of those steps in detail, as well
each of those steps to your needs. If you're already familiar with those steps and want to learn by
seeing some code, jump to :ref:`our BERT from scratch example <example>`.

For the examples that require a :entity:`Tokenizer`, we will use the tokenizer we trained
in the :doc:`quicktour`, which you can load with:

.. only:: python

    .. literalinclude:: ../../bindings/python/tests/documentation/test_pipeline.py
        :language: python
        :start-after: START reload_tokenizer
        :end-before: END reload_tokenizer
        :dedent: 8

.. only:: rust

    .. literalinclude:: ../../tokenizers/tests/documentation.rs
        :language: rust
        :start-after: START pipeline_reload_tokenizer
        :end-before: END pipeline_reload_tokenizer
        :dedent: 4

.. only:: node

    .. literalinclude:: ../../bindings/node/examples/documentation/pipeline.test.ts
        :language: javascript
        :start-after: START reload_tokenizer
        :end-before: END reload_tokenizer
        :dedent: 8

.. _normalization:
@@ -36,31 +54,88 @@ or lowercasing all text. If you're familiar with `Unicode normalization
in most tokenizers.

Each normalization operation is represented in the 🤗 Tokenizers library by a
:entity:`Normalizer`, and you can combine several of those by using a
:entity:`normalizers.Sequence`. Here is a normalizer applying NFD Unicode normalization
and removing accents as an example:

.. only:: python

    .. literalinclude:: ../../bindings/python/tests/documentation/test_pipeline.py
        :language: python
        :start-after: START setup_normalizer
        :end-before: END setup_normalizer
        :dedent: 8

.. only:: rust

    .. literalinclude:: ../../tokenizers/tests/documentation.rs
        :language: rust
        :start-after: START pipeline_setup_normalizer
        :end-before: END pipeline_setup_normalizer
        :dedent: 4

.. only:: node

    .. literalinclude:: ../../bindings/node/examples/documentation/pipeline.test.ts
        :language: javascript
        :start-after: START setup_normalizer
        :end-before: END setup_normalizer
        :dedent: 8

You can manually test that normalizer by applying it to any string:

.. only:: python

    .. literalinclude:: ../../bindings/python/tests/documentation/test_pipeline.py
        :language: python
        :start-after: START test_normalizer
        :end-before: END test_normalizer
        :dedent: 8

.. only:: rust

    .. literalinclude:: ../../tokenizers/tests/documentation.rs
        :language: rust
        :start-after: START pipeline_test_normalizer
        :end-before: END pipeline_test_normalizer
        :dedent: 4

.. only:: node

    .. literalinclude:: ../../bindings/node/examples/documentation/pipeline.test.ts
        :language: javascript
        :start-after: START test_normalizer
        :end-before: END test_normalizer
        :dedent: 8

When building a :entity:`Tokenizer`, you can customize its normalizer by just changing
the corresponding attribute:

.. only:: python

    .. literalinclude:: ../../bindings/python/tests/documentation/test_pipeline.py
        :language: python
        :start-after: START replace_normalizer
        :end-before: END replace_normalizer
        :dedent: 8

.. only:: rust

    .. literalinclude:: ../../tokenizers/tests/documentation.rs
        :language: rust
        :start-after: START pipeline_replace_normalizer
        :end-before: END pipeline_replace_normalizer
        :dedent: 4

.. only:: node

    .. literalinclude:: ../../bindings/node/examples/documentation/pipeline.test.ts
        :language: javascript
        :start-after: START replace_normalizer
        :end-before: END replace_normalizer
        :dedent: 8

Of course, if you change the way a tokenizer applies normalization, you should probably retrain it
from scratch afterward.

View File

@@ -262,3 +262,34 @@ fn quicktour() -> tokenizers::Result<()> {
    assert_eq!(output[1].get_attention_mask(), [1, 1, 1, 1, 1, 1, 1, 0]);

    Ok(())
}

#[test]
fn pipeline() -> tokenizers::Result<()> {
    // START pipeline_reload_tokenizer
    use tokenizers::Tokenizer;

    let mut tokenizer = Tokenizer::from_file("data/tokenizer-wiki.json")?;
    // END pipeline_reload_tokenizer

    // START pipeline_setup_normalizer
    use tokenizers::normalizers::{
        strip::StripAccents, unicode::NFD, utils::Sequence as NormalizerSequence,
    };

    let normalizer = NormalizerSequence::new(vec![NFD.into(), StripAccents.into()]);
    // END pipeline_setup_normalizer

    // START pipeline_test_normalizer
    use tokenizers::{NormalizedString, Normalizer};

    let mut normalized = NormalizedString::from("Héllò hôw are ü?");
    normalizer.normalize(&mut normalized)?;

    println!("{}", normalized.get());
    // "Hello how are u?"
    // END pipeline_test_normalizer
    assert_eq!(normalized.get(), "Hello how are u?");

    // START pipeline_replace_normalizer
    tokenizer.with_normalizer(normalizer);
    // END pipeline_replace_normalizer

    Ok(())
}
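
Taken together, these changes wire the normalizer examples into the docs for all three bindings. As a quick way to see the flow end to end, here is a short, illustrative Python sketch (not part of the commit), assuming the tokenizer-wiki.json file fetched by the Makefile rule above is available locally:

    # Illustrative only: combines the documented snippets into one script.
    from tokenizers import Tokenizer, normalizers
    from tokenizers.normalizers import NFD, StripAccents

    # Load the tokenizer trained in the quicktour (the path is an assumption;
    # adjust it to wherever tokenizer-wiki.json was downloaded).
    tokenizer = Tokenizer.from_file("data/tokenizer-wiki.json")

    # Build and test the normalizer, then attach it to the tokenizer.
    normalizer = normalizers.Sequence([NFD(), StripAccents()])
    print(normalizer.normalize_str("Héllò hôw are ü?"))  # "Hello how are u?"
    tokenizer.normalizer = normalizer

    # Encoding now goes through the updated normalization step.
    print(tokenizer.encode("Héllò hôw are ü?").tokens)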