Doc - Update Normalizer part of the Pipeline page
@@ -12,7 +12,7 @@ style:

 check-style:
	npm run lint-check

-TESTS_RESOURCES = $(DATA_DIR)/small.txt $(DATA_DIR)/roberta.json
+TESTS_RESOURCES = $(DATA_DIR)/small.txt $(DATA_DIR)/roberta.json $(DATA_DIR)/tokenizer-wiki.json

 # Launch the test suite
 test: $(TESTS_RESOURCES)
@@ -28,3 +28,7 @@ $(DATA_DIR)/small.txt : $(DATA_DIR)/big.txt
 $(DATA_DIR)/roberta.json :
	$(dir_guard)
	wget https://storage.googleapis.com/tokenizers/roberta.json -O $@
+
+$(DATA_DIR)/tokenizer-wiki.json :
+	$(dir_guard)
+	wget https://s3.amazonaws.com/models.huggingface.co/bert/anthony/doc-quicktour/tokenizer.json -O $@

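The new Makefile target fetches the tokenizer file the documentation tests load. As a rough Python equivalent (not part of the commit), this sketch downloads the same file into the data/ directory the tests expect:

import os
import urllib.request

# Mirrors what the $(DATA_DIR)/tokenizer-wiki.json target does ($(dir_guard) then wget),
# assuming DATA_DIR is the local "data" directory.
url = "https://s3.amazonaws.com/models.huggingface.co/bert/anthony/doc-quicktour/tokenizer.json"
os.makedirs("data", exist_ok=True)
urllib.request.urlretrieve(url, "data/tokenizer-wiki.json")
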
bindings/node/examples/documentation/pipeline.test.ts (new file, 30 lines)

@@ -0,0 +1,30 @@
+const globRequire = require;
+
+describe("pipelineExample", () => {
+  // This is a hack to let us require using path similar to what the user has to use
+  function require(mod: string) {
+    let path = mod.slice("tokenizers/".length);
+    return globRequire("../../lib/" + path);
+  }
+
+  it("", async () => {
+    // START reload_tokenizer
+    const { Tokenizer } = require("tokenizers/bindings/tokenizer");
+
+    const tokenizer = Tokenizer.fromFile("data/tokenizer-wiki.json");
+    // END reload_tokenizer
+    // START setup_normalizer
+    const { sequenceNormalizer, nfdNormalizer, stripAccentsNormalizer } = require("tokenizers/bindings/normalizers");
+
+    const normalizer = sequenceNormalizer([nfdNormalizer(), stripAccentsNormalizer()]);
+    // END setup_normalizer
+    // START test_normalizer
+    let normalized = normalizer.normalizeStr("Héllò hôw are ü?")
+    // "Hello how are u?"
+    // END test_normalizer
+    expect(normalized).toEqual("Hello how are u?");
+    // START replace_normalizer
+    tokenizer.setNormalizer(normalizer)
+    // END replace_normalizer
+  });
+});

bindings/python/tests/documentation/test_pipeline.py (new file, 32 lines)

@@ -0,0 +1,32 @@
+from ..utils import data_dir, doc_wiki_tokenizer
+from tokenizers import Tokenizer
+
+
+class TestPipeline:
+    def test_pipeline(self, doc_wiki_tokenizer):
+        def print(*args, **kwargs):
+            pass
+
+        try:
+            # START reload_tokenizer
+            from tokenizers import Tokenizer
+
+            tokenizer = Tokenizer.from_file("data/tokenizer-wiki.json")
+            # END reload_tokenizer
+        except Exception:
+            tokenizer = Tokenizer.from_file(doc_wiki_tokenizer)
+
+        # START setup_normalizer
+        from tokenizers import normalizers
+        from tokenizers.normalizers import NFD, StripAccents
+
+        normalizer = normalizers.Sequence([NFD(), StripAccents()])
+        # END setup_normalizer
+        # START test_normalizer
+        normalizer.normalize_str("Héllò hôw are ü?")
+        # "Hello how are u?"
+        # END test_normalizer
+        assert normalizer.normalize_str("Héllò hôw are ü?") == "Hello how are u?"
+        # START replace_normalizer
+        tokenizer.normalizer = normalizer
+        # END replace_normalizer

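Taken out of the pytest harness, the documented steps amount to a few lines. A minimal usage sketch, assuming the data/tokenizer-wiki.json file fetched by the new Makefile target is present; the exact tokens depend on the vocabulary trained in the quicktour:

from tokenizers import Tokenizer, normalizers
from tokenizers.normalizers import NFD, StripAccents

# Reload the tokenizer trained in the quicktour and swap in a custom normalizer.
tokenizer = Tokenizer.from_file("data/tokenizer-wiki.json")
tokenizer.normalizer = normalizers.Sequence([NFD(), StripAccents()])

# The normalizer now runs as the first step of every encode call.
output = tokenizer.encode("Héllò hôw are ü?")
print(output.tokens)  # accent-free tokens; exact values depend on the trained vocabulary
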
@@ -26,6 +26,8 @@ class RustRef:
             l, title = self.make_func_link(parts, title)
         if doctype == "meth":
             l, title = self.make_meth_link(parts, title)
+        if doctype == "trait":
+            l, title = self.make_trait_link(parts, title)
         link += l

         node = nodes.reference(internal=False, refuri=link, text=title)
@@ -72,11 +74,23 @@ class RustRef:

         return link, title

+    def make_trait_link(self, parts, title):
+        link = ""
+        trait_name = parts[-1]
+
+        path = parts[:-1]
+        for p in path:
+            link += f"/{p}"
+        link += f"/trait.{trait_name}.html"
+
+        return link, title
+

 def setup(app):
     app.add_role("rust:struct", RustRef())
     app.add_role("rust:func", RustRef())
     app.add_role("rust:meth", RustRef())
+    app.add_role("rust:trait", RustRef())

     return {
         "version": "0.1",

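The new make_trait_link helper follows the same pattern as the existing struct/func/meth helpers, so a rust:trait role resolves to a trait.<Name>.html page. A small standalone sketch of that logic (the example parts value is illustrative, not taken from the commit):

def make_trait_link(parts, title):
    # Same construction as in the RustRef extension above: every leading part
    # becomes a path segment, the last part names the trait page.
    link = ""
    trait_name = parts[-1]
    for p in parts[:-1]:
        link += f"/{p}"
    link += f"/trait.{trait_name}.html"
    return link, title

# e.g. a role like :rust:trait:`tokenizers::Normalizer` would yield:
print(make_trait_link(["tokenizers", "Normalizer"], "Normalizer")[0])
# /tokenizers/trait.Normalizer.html
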
@@ -1,8 +1,8 @@
 The tokenization pipeline
 ====================================================================================================

-When calling :meth:`~tokenizers.Tokenizer.encode` or :meth:`~tokenizers.Tokenizer.encode_batch`, the
-input text(s) go through the following pipeline:
+When calling :entity:`Tokenizer.encode` or :entity:`Tokenizer.encode_batch`, the input text(s) go
+through the following pipeline:

 - :ref:`normalization`
 - :ref:`pre-tokenization`
@@ -14,14 +14,32 @@ We'll see in details what happens during each of those steps in detail, as well
 each of those steps to your needs. If you're already familiar with those steps and want to learn by
 seeing some code, jump to :ref:`our BERT from scratch example <example>`.

-For the examples that require a :class:`~tokenizers.Tokenizer`, we will use the tokenizer we trained
+For the examples that require a :entity:`Tokenizer`, we will use the tokenizer we trained
 in the :doc:`quicktour`, which you can load with:

-.. code-block:: python
+.. only:: python

-    from tokenizers import Tokenizer
+    .. literalinclude:: ../../bindings/python/tests/documentation/test_pipeline.py
+        :language: python
+        :start-after: START reload_tokenizer
+        :end-before: END reload_tokenizer
+        :dedent: 8

-    tokenizer = Tokenizer.from_file("pretrained/wiki.json")
+.. only:: rust
+
+    .. literalinclude:: ../../tokenizers/tests/documentation.rs
+        :language: rust
+        :start-after: START pipeline_reload_tokenizer
+        :end-before: END pipeline_reload_tokenizer
+        :dedent: 4
+
+.. only:: node
+
+    .. literalinclude:: ../../bindings/node/examples/documentation/pipeline.test.ts
+        :language: javascript
+        :start-after: START reload_tokenizer
+        :end-before: END reload_tokenizer
+        :dedent: 8


 .. _normalization:
@@ -36,31 +54,88 @@ or lowercasing all text. If you're familiar with `Unicode normalization
 in most tokenizers.

 Each normalization operation is represented in the 🤗 Tokenizers library by a
-:class:`~tokenizers.normalizers.Normalizer`, and you can combine several of those by using a
-:class:`~tokenizers.normalizers.Sequence`. Here is a normalizer applying NFD Unicode normalization
+:entity:`Normalizer`, and you can combine several of those by using a
+:entity:`normalizers.Sequence`. Here is a normalizer applying NFD Unicode normalization
 and removing accents as an example:

-.. code-block:: python
+.. only:: python

-    import tokenizers
-    from tokenizers.normalizers import NFD, StripAccents
+    .. literalinclude:: ../../bindings/python/tests/documentation/test_pipeline.py
+        :language: python
+        :start-after: START setup_normalizer
+        :end-before: END setup_normalizer
+        :dedent: 8

-    normalizer = tokenizers.normalizers.Sequence([NFD(), StripAccents()])
+.. only:: rust

-You can apply that normalizer to any string with the
-:meth:`~tokenizers.normalizers.Normalizer.normalize_str` method:
+    .. literalinclude:: ../../tokenizers/tests/documentation.rs
+        :language: rust
+        :start-after: START pipeline_setup_normalizer
+        :end-before: END pipeline_setup_normalizer
+        :dedent: 4

-.. code-block:: python
+.. only:: node

-    normalizer.normalize_str("Héllò hôw are ü?")
-    # "Hello how are u?"
+    .. literalinclude:: ../../bindings/node/examples/documentation/pipeline.test.ts
+        :language: javascript
+        :start-after: START setup_normalizer
+        :end-before: END setup_normalizer
+        :dedent: 8

-When building a :class:`~tokenizers.Tokenizer`, you can customize its normalizer by just changing
+
+You can manually test that normalizer by applying it to any string:
+
+.. only:: python
+
+    .. literalinclude:: ../../bindings/python/tests/documentation/test_pipeline.py
+        :language: python
+        :start-after: START test_normalizer
+        :end-before: END test_normalizer
+        :dedent: 8
+
+.. only:: rust
+
+    .. literalinclude:: ../../tokenizers/tests/documentation.rs
+        :language: rust
+        :start-after: START pipeline_test_normalizer
+        :end-before: END pipeline_test_normalizer
+        :dedent: 4
+
+.. only:: node
+
+    .. literalinclude:: ../../bindings/node/examples/documentation/pipeline.test.ts
+        :language: javascript
+        :start-after: START test_normalizer
+        :end-before: END test_normalizer
+        :dedent: 8
+
+
+When building a :entity:`Tokenizer`, you can customize its normalizer by just changing
 the corresponding attribute:

-.. code-block:: python
+.. only:: python

-    tokenizer.normalizer = normalizer
+    .. literalinclude:: ../../bindings/python/tests/documentation/test_pipeline.py
+        :language: python
+        :start-after: START replace_normalizer
+        :end-before: END replace_normalizer
+        :dedent: 8
+
+.. only:: rust
+
+    .. literalinclude:: ../../tokenizers/tests/documentation.rs
+        :language: rust
+        :start-after: START pipeline_replace_normalizer
+        :end-before: END pipeline_replace_normalizer
+        :dedent: 4
+
+.. only:: node
+
+    .. literalinclude:: ../../bindings/node/examples/documentation/pipeline.test.ts
+        :language: javascript
+        :start-after: START replace_normalizer
+        :end-before: END replace_normalizer
+        :dedent: 8

 Of course, if you change the way a tokenizer applies normalization, you should probably retrain it
 from scratch afterward.

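The updated page documents the NFD + StripAccents sequence through the literalinclude blocks above. For intuition only, here is a pure standard-library Python sketch (not the 🤗 Tokenizers implementation) of what that normalization does to a string:

import unicodedata

def nfd_strip_accents(text: str) -> str:
    # NFD splits precomposed characters such as "é" into "e" plus a combining accent,
    # then the combining marks (Unicode category Mn) are dropped, mirroring StripAccents.
    decomposed = unicodedata.normalize("NFD", text)
    return "".join(ch for ch in decomposed if unicodedata.category(ch) != "Mn")

print(nfd_strip_accents("Héllò hôw are ü?"))  # Hello how are u?
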
@@ -262,3 +262,34 @@ fn quicktour() -> tokenizers::Result<()> {
     assert_eq!(output[1].get_attention_mask(), [1, 1, 1, 1, 1, 1, 1, 0]);
     Ok(())
 }
+
+#[test]
+fn pipeline() -> tokenizers::Result<()> {
+    // START pipeline_reload_tokenizer
+    use tokenizers::Tokenizer;
+
+    let mut tokenizer = Tokenizer::from_file("data/tokenizer-wiki.json")?;
+    // END pipeline_reload_tokenizer
+    // START pipeline_setup_normalizer
+    use tokenizers::normalizers::{
+        strip::StripAccents, unicode::NFD, utils::Sequence as NormalizerSequence,
+    };
+
+    let normalizer = NormalizerSequence::new(vec![NFD.into(), StripAccents.into()]);
+    // END pipeline_setup_normalizer
+    // START pipeline_test_normalizer
+    use tokenizers::{NormalizedString, Normalizer};
+
+    let mut normalized = NormalizedString::from("Héllò hôw are ü?");
+    normalizer.normalize(&mut normalized)?;
+
+    println!("{}", normalized.get());
+    // "Hello how are u?"
+    // END pipeline_test_normalizer
+    assert_eq!(normalized.get(), "Hello how are u?");
+    // START pipeline_replace_normalizer
+    tokenizer.with_normalizer(normalizer);
+    // END pipeline_replace_normalizer
+
+    Ok(())
+}