Doc - Update Normalizer part of the Pipeline page
@@ -12,7 +12,7 @@ style:
 check-style:
 	npm run lint-check
 
-TESTS_RESOURCES = $(DATA_DIR)/small.txt $(DATA_DIR)/roberta.json
+TESTS_RESOURCES = $(DATA_DIR)/small.txt $(DATA_DIR)/roberta.json $(DATA_DIR)/tokenizer-wiki.json
 
 # Launch the test suite
 test: $(TESTS_RESOURCES)
@@ -28,3 +28,7 @@ $(DATA_DIR)/small.txt : $(DATA_DIR)/big.txt
 $(DATA_DIR)/roberta.json :
 	$(dir_guard)
 	wget https://storage.googleapis.com/tokenizers/roberta.json -O $@
+
+$(DATA_DIR)/tokenizer-wiki.json :
+	$(dir_guard)
+	wget https://s3.amazonaws.com/models.huggingface.co/bert/anthony/doc-quicktour/tokenizer.json -O $@
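For anyone reproducing the test setup without make, here is a rough Python sketch of what the new rule does: it fetches the quicktour tokenizer JSON into the data directory when it is missing. The DATA_DIR value and the use of urllib are illustrative assumptions; the Makefile itself relies on wget and $(dir_guard).

import os
import urllib.request

# Hypothetical stand-in for the Makefile's $(DATA_DIR); adjust to your layout.
DATA_DIR = "data"
URL = "https://s3.amazonaws.com/models.huggingface.co/bert/anthony/doc-quicktour/tokenizer.json"
target = os.path.join(DATA_DIR, "tokenizer-wiki.json")

if not os.path.exists(target):             # mirrors the make dependency check
    os.makedirs(DATA_DIR, exist_ok=True)   # mirrors $(dir_guard)
    urllib.request.urlretrieve(URL, target)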
bindings/node/examples/documentation/pipeline.test.ts (new file, 30 lines)
@@ -0,0 +1,30 @@
+const globRequire = require;
+
+describe("pipelineExample", () => {
+    // This is a hack to let us require using path similar to what the user has to use
+    function require(mod: string) {
+        let path = mod.slice("tokenizers/".length);
+        return globRequire("../../lib/" + path);
+    }
+
+    it("", async () => {
+        // START reload_tokenizer
+        const { Tokenizer } = require("tokenizers/bindings/tokenizer");
+
+        const tokenizer = Tokenizer.fromFile("data/tokenizer-wiki.json");
+        // END reload_tokenizer
+        // START setup_normalizer
+        const { sequenceNormalizer, nfdNormalizer, stripAccentsNormalizer } = require("tokenizers/bindings/normalizers");
+
+        const normalizer = sequenceNormalizer([nfdNormalizer(), stripAccentsNormalizer()]);
+        // END setup_normalizer
+        // START test_normalizer
+        let normalized = normalizer.normalizeStr("Héllò hôw are ü?")
+        // "Hello how are u?"
+        // END test_normalizer
+        expect(normalized).toEqual("Hello how are u?");
+        // START replace_normalizer
+        tokenizer.setNormalizer(normalizer)
+        // END replace_normalizer
+    });
+});
bindings/python/tests/documentation/test_pipeline.py (new file, 32 lines)
@@ -0,0 +1,32 @@
+from ..utils import data_dir, doc_wiki_tokenizer
+from tokenizers import Tokenizer
+
+
+class TestPipeline:
+    def test_pipeline(self, doc_wiki_tokenizer):
+        def print(*args, **kwargs):
+            pass
+
+        try:
+            # START reload_tokenizer
+            from tokenizers import Tokenizer
+
+            tokenizer = Tokenizer.from_file("data/tokenizer-wiki.json")
+            # END reload_tokenizer
+        except Exception:
+            tokenizer = Tokenizer.from_file(doc_wiki_tokenizer)
+
+        # START setup_normalizer
+        from tokenizers import normalizers
+        from tokenizers.normalizers import NFD, StripAccents
+
+        normalizer = normalizers.Sequence([NFD(), StripAccents()])
+        # END setup_normalizer
+        # START test_normalizer
+        normalizer.normalize_str("Héllò hôw are ü?")
+        # "Hello how are u?"
+        # END test_normalizer
+        assert normalizer.normalize_str("Héllò hôw are ü?") == "Hello how are u?"
+        # START replace_normalizer
+        tokenizer.normalizer = normalizer
+        # END replace_normalizer
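These test files double as the source of the documentation snippets: Sphinx's literalinclude pulls out everything between a START/END comment pair and strips the indentation given by :dedent:. As a rough illustration of that mechanism (not the Sphinx implementation itself), a marker extractor could look like the sketch below; the function name and exact behaviour are illustrative assumptions.

def extract_snippet(path, name, dedent=8):
    # Illustrative only: mimics what literalinclude's :start-after:/:end-before:/:dedent:
    # options do with the "# START name" / "# END name" markers in the test files.
    lines, keep = [], False
    with open(path, encoding="utf-8") as f:
        for line in f:
            if f"END {name}" in line:
                keep = False
            if keep:
                lines.append(line[dedent:] if line.strip() else line)
            if f"START {name}" in line:
                keep = True
    return "".join(lines)

# Example call (hypothetical path, relative to the repository root):
# print(extract_snippet("bindings/python/tests/documentation/test_pipeline.py", "setup_normalizer"))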
@@ -26,6 +26,8 @@ class RustRef:
             l, title = self.make_func_link(parts, title)
         if doctype == "meth":
             l, title = self.make_meth_link(parts, title)
+        if doctype == "trait":
+            l, title = self.make_trait_link(parts, title)
         link += l
 
         node = nodes.reference(internal=False, refuri=link, text=title)
@@ -72,11 +74,23 @@ class RustRef:
 
         return link, title
 
+    def make_trait_link(self, parts, title):
+        link = ""
+        trait_name = parts[-1]
+
+        path = parts[:-1]
+        for p in path:
+            link += f"/{p}"
+        link += f"/trait.{trait_name}.html"
+
+        return link, title
+
 
 def setup(app):
     app.add_role("rust:struct", RustRef())
     app.add_role("rust:func", RustRef())
     app.add_role("rust:meth", RustRef())
+    app.add_role("rust:trait", RustRef())
 
     return {
         "version": "0.1",
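The new rust:trait role builds links of the form /<module path>/trait.<Name>.html, in the same spirit as the existing struct, func and meth helpers. A quick sketch of that link construction outside Sphinx, with a made-up parts list as input:

def make_trait_link(parts):
    # Same construction as RustRef.make_trait_link: path segments, then trait.<Name>.html
    *path, trait_name = parts
    link = "".join(f"/{p}" for p in path)
    return link + f"/trait.{trait_name}.html"

# e.g. a role target already split on "::" (hypothetical example):
print(make_trait_link(["tokenizers", "normalizers", "Normalizer"]))
# -> /tokenizers/normalizers/trait.Normalizer.html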
@@ -1,8 +1,8 @@
 The tokenization pipeline
 ====================================================================================================
 
-When calling :meth:`~tokenizers.Tokenizer.encode` or :meth:`~tokenizers.Tokenizer.encode_batch`, the
-input text(s) go through the following pipeline:
+When calling :entity:`Tokenizer.encode` or :entity:`Tokenizer.encode_batch`, the input text(s) go
+through the following pipeline:
 
 - :ref:`normalization`
 - :ref:`pre-tokenization`
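The paragraph above describes the stages that run inside encode. As a minimal Python sketch of what that looks like end to end, assuming the data/tokenizer-wiki.json file downloaded by the Makefile rule earlier:

from tokenizers import Tokenizer

# Normalization, pre-tokenization, the model and post-processing all run inside encode().
tokenizer = Tokenizer.from_file("data/tokenizer-wiki.json")
encoding = tokenizer.encode("Héllò hôw are ü?")
print(encoding.tokens)  # tokens produced after the full pipeline
print(encoding.ids)     # their ids in the vocabulary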
@@ -14,14 +14,32 @@ We'll see in details what happens during each of those steps in detail, as well
 each of those steps to your needs. If you're already familiar with those steps and want to learn by
 seeing some code, jump to :ref:`our BERT from scratch example <example>`.
 
-For the examples that require a :class:`~tokenizers.Tokenizer`, we will use the tokenizer we trained
+For the examples that require a :entity:`Tokenizer`, we will use the tokenizer we trained
 in the :doc:`quicktour`, which you can load with:
 
-.. code-block:: python
+.. only:: python
 
-    from tokenizers import Tokenizer
+    .. literalinclude:: ../../bindings/python/tests/documentation/test_pipeline.py
+        :language: python
+        :start-after: START reload_tokenizer
+        :end-before: END reload_tokenizer
+        :dedent: 8
 
-    tokenizer = Tokenizer.from_file("pretrained/wiki.json")
+.. only:: rust
+
+    .. literalinclude:: ../../tokenizers/tests/documentation.rs
+        :language: rust
+        :start-after: START pipeline_reload_tokenizer
+        :end-before: END pipeline_reload_tokenizer
+        :dedent: 4
+
+.. only:: node
+
+    .. literalinclude:: ../../bindings/node/examples/documentation/pipeline.test.ts
+        :language: javascript
+        :start-after: START reload_tokenizer
+        :end-before: END reload_tokenizer
+        :dedent: 8
 
 
 .. _normalization:
@@ -36,31 +54,88 @@ or lowercasing all text. If you're familiar with `Unicode normalization
 in most tokenizers.
 
 Each normalization operation is represented in the 🤗 Tokenizers library by a
-:class:`~tokenizers.normalizers.Normalizer`, and you can combine several of those by using a
-:class:`~tokenizers.normalizers.Sequence`. Here is a normalizer applying NFD Unicode normalization
+:entity:`Normalizer`, and you can combine several of those by using a
+:entity:`normalizers.Sequence`. Here is a normalizer applying NFD Unicode normalization
 and removing accents as an example:
 
-.. code-block:: python
+.. only:: python
 
-    import tokenizers
-    from tokenizers.normalizers import NFD, StripAccents
-
-    normalizer = tokenizers.normalizers.Sequence([NFD(), StripAccents()])
-
-You can apply that normalizer to any string with the
-:meth:`~tokenizers.normalizers.Normalizer.normalize_str` method:
-
-.. code-block:: python
-
-    normalizer.normalize_str("Héllò hôw are ü?")
-    # "Hello how are u?"
-
-When building a :class:`~tokenizers.Tokenizer`, you can customize its normalizer by just changing
+    .. literalinclude:: ../../bindings/python/tests/documentation/test_pipeline.py
+        :language: python
+        :start-after: START setup_normalizer
+        :end-before: END setup_normalizer
+        :dedent: 8
+
+.. only:: rust
+
+    .. literalinclude:: ../../tokenizers/tests/documentation.rs
+        :language: rust
+        :start-after: START pipeline_setup_normalizer
+        :end-before: END pipeline_setup_normalizer
+        :dedent: 4
+
+.. only:: node
+
+    .. literalinclude:: ../../bindings/node/examples/documentation/pipeline.test.ts
+        :language: javascript
+        :start-after: START setup_normalizer
+        :end-before: END setup_normalizer
+        :dedent: 8
+
+You can manually test that normalizer by applying it to any string:
+
+.. only:: python
+
+    .. literalinclude:: ../../bindings/python/tests/documentation/test_pipeline.py
+        :language: python
+        :start-after: START test_normalizer
+        :end-before: END test_normalizer
+        :dedent: 8
+
+.. only:: rust
+
+    .. literalinclude:: ../../tokenizers/tests/documentation.rs
+        :language: rust
+        :start-after: START pipeline_test_normalizer
+        :end-before: END pipeline_test_normalizer
+        :dedent: 4
+
+.. only:: node
+
+    .. literalinclude:: ../../bindings/node/examples/documentation/pipeline.test.ts
+        :language: javascript
+        :start-after: START test_normalizer
+        :end-before: END test_normalizer
+        :dedent: 8
+
+
+When building a :entity:`Tokenizer`, you can customize its normalizer by just changing
 the corresponding attribute:
 
-.. code-block:: python
+.. only:: python
 
-    tokenizer.normalizer = normalizer
+    .. literalinclude:: ../../bindings/python/tests/documentation/test_pipeline.py
+        :language: python
+        :start-after: START replace_normalizer
+        :end-before: END replace_normalizer
+        :dedent: 8
+
+.. only:: rust
+
+    .. literalinclude:: ../../tokenizers/tests/documentation.rs
+        :language: rust
+        :start-after: START pipeline_replace_normalizer
+        :end-before: END pipeline_replace_normalizer
+        :dedent: 4
+
+.. only:: node
+
+    .. literalinclude:: ../../bindings/node/examples/documentation/pipeline.test.ts
+        :language: javascript
+        :start-after: START replace_normalizer
+        :end-before: END replace_normalizer
+        :dedent: 8
 
 Of course, if you change the way a tokenizer applies normalization, you should probably retrain it
 from scratch afterward.
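Since the rendered page now pulls these snippets from the test files rather than showing them inline in the source, here is the Python variant gathered in one place. It is simply the calls that appear between the setup_normalizer, test_normalizer and replace_normalizer markers above; loading the tokenizer assumes the downloaded data/tokenizer-wiki.json.

from tokenizers import Tokenizer, normalizers
from tokenizers.normalizers import NFD, StripAccents

# Combine several normalization operations into a single Normalizer.
normalizer = normalizers.Sequence([NFD(), StripAccents()])

# Try it on a raw string.
assert normalizer.normalize_str("Héllò hôw are ü?") == "Hello how are u?"

# Attach it to a tokenizer by replacing the corresponding attribute.
tokenizer = Tokenizer.from_file("data/tokenizer-wiki.json")
tokenizer.normalizer = normalizer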
@@ -262,3 +262,34 @@ fn quicktour() -> tokenizers::Result<()> {
     assert_eq!(output[1].get_attention_mask(), [1, 1, 1, 1, 1, 1, 1, 0]);
     Ok(())
 }
+
+#[test]
+fn pipeline() -> tokenizers::Result<()> {
+    // START pipeline_reload_tokenizer
+    use tokenizers::Tokenizer;
+
+    let mut tokenizer = Tokenizer::from_file("data/tokenizer-wiki.json")?;
+    // END pipeline_reload_tokenizer
+    // START pipeline_setup_normalizer
+    use tokenizers::normalizers::{
+        strip::StripAccents, unicode::NFD, utils::Sequence as NormalizerSequence,
+    };
+
+    let normalizer = NormalizerSequence::new(vec![NFD.into(), StripAccents.into()]);
+    // END pipeline_setup_normalizer
+    // START pipeline_test_normalizer
+    use tokenizers::{NormalizedString, Normalizer};
+
+    let mut normalized = NormalizedString::from("Héllò hôw are ü?");
+    normalizer.normalize(&mut normalized)?;
+
+    println!("{}", normalized.get());
+    // "Hello how are u?"
+    // END pipeline_test_normalizer
+    assert_eq!(normalized.get(), "Hello how are u?");
+    // START pipeline_replace_normalizer
+    tokenizer.with_normalizer(normalizer);
+    // END pipeline_replace_normalizer
+
+    Ok(())
+}