Doc - Update Decoder part of the Pipeline page

Anthony MOI
2020-10-28 16:35:26 -04:00
committed by Anthony MOI
parent 8b65c1f4bc
commit 9521603e08
5 changed files with 220 additions and 57 deletions


@@ -66,37 +66,20 @@ describe("pipelineExample", () => {
[["[CLS]", 1], ["[SEP]", 2]]
));
// END setup_processor
// START test_decoding
let output = tokenizer.encode("Hello, y'all! How are you 😁 ?");
console.log(output.getIds());
// [1, 27253, 16, 93, 11, 5097, 5, 7961, 5112, 6218, 0, 35, 2]
tokenizer.decode([1, 27253, 16, 93, 11, 5097, 5, 7961, 5112, 6218, 0, 35, 2]);
// "Hello , y ' all ! How are you ?"
// END test_decoding
});
it("shows a full bert example", async () => { var { Tokenizer } = require("tokenizers/bindings/tokenizer");
// START bert_setup_tokenizer const slow_bert_training = async (bertTokenizer: typeof Tokenizer) => {
let { Tokenizer } = require("tokenizers/bindings/tokenizer");
let { WordPiece } = require("tokenizers/bindings/models"); let { WordPiece } = require("tokenizers/bindings/models");
let bert_tokenizer = Tokenizer(WordPiece.empty());
// END bert_setup_tokenizer
// START bert_setup_normalizer
let { sequenceNormalizer, lowercaseNormalizer, nfdNormalizer, stripAccentsNormalizer }
= require("tokenizers/bindings/normalizers");
bert_tokenizer.setNormalizer(sequenceNormalizer([
nfdNormalizer(), lowercaseNormalizer(), stripAccentsNormalizer()
]))
// END bert_setup_normalizer
// START bert_setup_pre_tokenizer
let { whitespacePreTokenizer } = require("tokenizers/bindings/pre_tokenizers");
bert_tokenizer.setPreTokenizer = whitespacePreTokenizer();
// END bert_setup_pre_tokenizer
// START bert_setup_processor
let { templateProcessing } = require("tokenizers/bindings/processors");
bert_tokenizer.setPostProcessor(templateProcessing(
"[CLS] $A [SEP]",
"[CLS] $A [SEP] $B:1 [SEP]:1",
[["[CLS]", 1], ["[SEP]", 2]]
));
// END bert_setup_processor
// START bert_train_tokenizer
let { wordPieceTrainer } = require("tokenizers/bindings/trainers");
let { promisify } = require("utils");
@@ -106,15 +89,61 @@ describe("pipelineExample", () => {
specialTokens: ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]
});
let files = ["test", "train", "valid"].map(split => `data/wikitext-103-raw/wiki.${split}.raw`);
bert_tokenizer.train(trainer, files);
bertTokenizer.train(trainer, files);
let model_files = bert_tokenizer.getModel.save("data", "bert-wiki");
let modelFiles = bertTokenizer.getModel.save("data", "bert-wiki");
let fromFile = promisify(WordPiece.fromFile);
bert_tokenizer.setModel(await fromFile(model_files[0], {
bertTokenizer.setModel(await fromFile(modelFiles[0], {
unkToken: "[UNK]"
}));
bert_tokenizer.save("data/bert-wiki.json")
bertTokenizer.save("data/bert-wiki.json")
// END bert_train_tokenizer
};
console.log(slow_bert_training); // disable unused warning
it("shows a full bert example", async () => {
// START bert_setup_tokenizer
let { Tokenizer } = require("tokenizers/bindings/tokenizer");
let { WordPiece } = require("tokenizers/bindings/models");
let bertTokenizer = Tokenizer(WordPiece.empty());
// END bert_setup_tokenizer
// START bert_setup_normalizer
let { sequenceNormalizer, lowercaseNormalizer, nfdNormalizer, stripAccentsNormalizer }
= require("tokenizers/bindings/normalizers");
bertTokenizer.setNormalizer(sequenceNormalizer([
nfdNormalizer(), lowercaseNormalizer(), stripAccentsNormalizer()
]))
// END bert_setup_normalizer
// START bert_setup_pre_tokenizer
let { whitespacePreTokenizer } = require("tokenizers/bindings/pre_tokenizers");
bertTokenizer.setPreTokenizer(whitespacePreTokenizer());
// END bert_setup_pre_tokenizer
// START bert_setup_processor
let { templateProcessing } = require("tokenizers/bindings/processors");
bertTokenizer.setPostProcessor(templateProcessing(
"[CLS] $A [SEP]",
"[CLS] $A [SEP] $B:1 [SEP]:1",
[["[CLS]", 1], ["[SEP]", 2]]
));
// END bert_setup_processor
// START bert_test_decoding
let output = bertTokenizer.encode("Welcome to the 🤗 Tokenizers library.");
console.log(output.getTokens());
// ["[CLS]", "welcome", "to", "the", "[UNK]", "tok", "##eni", "##zer", "##s", "library", ".", "[SEP]"]
bertTokenizer.decode(output.getIds());
// "welcome to the tok ##eni ##zer ##s library ."
// END bert_test_decoding
// START bert_proper_decoding
let { wordPieceDecoder } = require("tokenizers/bindings/decoders");
bertTokenizer.setDecoder(wordPieceDecoder());
bertTokenizer.decode(output.getIds());
// "welcome to the tokenizers library."
// END bert_proper_decoding
});
});


@@ -76,14 +76,27 @@ class TestPipeline:
# START setup_processor
from tokenizers.processors import TemplateProcessing
tokenizer.post_processor = TemplateProcessing
tokenizer.post_processor = TemplateProcessing(
single="[CLS] $A [SEP]",
pair="[CLS] $A [SEP] $B:1 [SEP]:1",
special_tokens=[("[CLS]", 1), ("[SEP]", 2)],
)
# END setup_processor
# START test_decoding
output = tokenizer.encode("Hello, y'all! How are you 😁 ?")
print(output.ids)
# [1, 27253, 16, 93, 11, 5097, 5, 7961, 5112, 6218, 0, 35, 2]
tokenizer.decode([1, 27253, 16, 93, 11, 5097, 5, 7961, 5112, 6218, 0, 35, 2])
# "Hello , y ' all ! How are you ?"
# END test_decoding
assert output.ids == [1, 27253, 16, 93, 11, 5097, 5, 7961, 5112, 6218, 0, 35, 2]
assert (
tokenizer.decode([1, 27253, 16, 93, 11, 5097, 5, 7961, 5112, 6218, 0, 35, 2])
== "Hello , y ' all ! How are you ?"
)
def test_bert_example(self):
def bert_example(self):
# START bert_setup_tokenizer
from tokenizers import Tokenizer
from tokenizers.models import WordPiece
@@ -94,9 +107,7 @@ class TestPipeline:
from tokenizers import normalizers
from tokenizers.normalizers import Lowercase, NFD, StripAccents
bert_tokenizer.normalizer = normalizers.Sequence([
NFD(), Lowercase(), StripAccents()
])
bert_tokenizer.normalizer = normalizers.Sequence([NFD(), Lowercase(), StripAccents()])
# END bert_setup_normalizer
# START bert_setup_pre_tokenizer
from tokenizers.pre_tokenizers import Whitespace
@@ -112,7 +123,7 @@ class TestPipeline:
special_tokens=[
("[CLS]", 1),
("[SEP]", 2),
]
],
)
# END bert_setup_processor
# START bert_train_tokenizer
@@ -129,3 +140,16 @@ class TestPipeline:
bert_tokenizer.save("data/bert-wiki.json")
# END bert_train_tokenizer
# START bert_test_decoding
output = bert_tokenizer.encode("Welcome to the 🤗 Tokenizers library.")
print(output.tokens)
# ["[CLS]", "welcome", "to", "the", "[UNK]", "tok", "##eni", "##zer", "##s", "library", ".", "[SEP]"]
bert_tokenizer.decode(output.ids)
# "welcome to the tok ##eni ##zer ##s library ."
# END bert_test_decoding
# START bert_proper_decoding
from tokenizers import decoders
bert_tokenizer.decoder = decoders.WordPiece()
bert_tokenizer.decode(output.ids)
# "welcome to the tokenizers library."
# END bert_proper_decoding


@@ -18,6 +18,10 @@
:meth:`~tokenizers.Tokenizer.encode`
Tokenizer.encode_batch
:meth:`~tokenizers.Tokenizer.encode_batch`
Tokenizer.decode
:meth:`~tokenizers.Tokenizer.decode`
Tokenizer.decode_batch
:meth:`~tokenizers.Tokenizer.decode_batch`
Tokenizer.token_to_id
:meth:`~tokenizers.Tokenizer.token_to_id`
Tokenizer.enable_padding
@@ -42,6 +46,8 @@
:class:`~tokenizers.models.WordLevel`
models.WordPiece
:class:`~tokenizers.models.WordPiece`
Decoder
:class:`~tokenizers.decoders.Decoder`
.. entities:: rust
@@ -63,6 +69,10 @@
:rust:meth:`~tokenizers::tokenizer::Tokenizer::encode`
Tokenizer.encode_batch
:rust:meth:`~tokenizers::tokenizer::Tokenizer::encode_batch`
Tokenizer.decode
:rust:meth:`~tokenizers::tokenizer::Tokenizer::decode`
Tokenizer.decode_batch
:rust:meth:`~tokenizers::tokenizer::Tokenizer::decode_batch`
Tokenizer.token_to_id
:rust:meth:`~tokenizers::tokenizer::Tokenizer::token_to_id`
Tokenizer.enable_padding
@@ -87,6 +97,8 @@
:rust:struct:`~tokenizers::models::wordlevel::WordLevel`
models.WordPiece
:rust:struct:`~tokenizers::models::wordpiece::WordPiece`
Decoder
:rust:trait:`~tokenizers::tokenizer::Decoder`
.. entities:: node
@@ -108,6 +120,10 @@
:obj:`Tokenizer.encode()`
Tokenizer.encode_batch
:obj:`Tokenizer.encodeBatch()`
Tokenizer.decode
:obj:`Tokenizer.decode()`
Tokenizer.decode_batch
:obj:`Tokenizer.decodeBatch()`
Tokenizer.token_to_id
:obj:`Tokenizer.tokenToId()`
Tokenizer.enable_padding
@@ -132,3 +148,5 @@
:obj:`WordLevel`
models.WordPiece
:obj:`WordPiece`
Decoder
:obj:`Decoder`


@@ -447,40 +447,104 @@ We can use this tokenizer and train on it on wikitext like in the :doc:`quicktou
Decoding
----------------------------------------------------------------------------------------------------
On top of encoding the input texts, a :class:`~tokenizers.Tokenizer` also has an API for decoding,
.. entities:: python
bert_tokenizer
:obj:`bert_tokenizer`
.. entities:: rust
bert_tokenizer
:obj:`bert_tokenizer`
.. entities:: node
bert_tokenizer
:obj:`bertTokenizer`
On top of encoding the input texts, a :entity:`Tokenizer` also has an API for decoding,
that is converting IDs generated by your model back to a text. This is done by the methods
:meth:`~tokenizers.Tokenizer.decode` (for one predicted text) and
:meth:`~tokenizers.Tokenizer.decode_batch` (for a batch of predictions).
:entity:`Tokenizer.decode` (for one predicted text) and :entity:`Tokenizer.decode_batch` (for a
batch of predictions).
The `decoder` will first convert the IDs back to tokens (using the tokenizer's vocabulary) and
remove all special tokens, then join those tokens with spaces:
.. code-block:: python
output = tokenizer.encode("Hello, y'all! How are you 😁 ?")
print(output.ids)
# [27194, 16, 93, 11, 5068, 5, 7928, 5083, 6190, 0, 35]
tokenizer.decode([27194, 16, 93, 11, 5068, 5, 7928, 5083, 6190, 0, 35])
# "Hello , y ' all ! How are you ?"
.. only:: python
.. literalinclude:: ../../bindings/python/tests/documentation/test_pipeline.py
:language: python
:start-after: START test_decoding
:end-before: END test_decoding
:dedent: 8
.. only:: rust
.. literalinclude:: ../../tokenizers/tests/documentation.rs
:language: rust
:start-after: START pipeline_test_decoding
:end-before: END pipeline_test_decoding
:dedent: 4
.. only:: node
.. literalinclude:: ../../bindings/node/examples/documentation/pipeline.test.ts
:language: javascript
:start-after: START test_decoding
:end-before: END test_decoding
:dedent: 8
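
As a rough illustration of what the included snippets do, here is a minimal sketch (assuming the :obj:`tokenizer` trained earlier in these docs; the exact IDs depend on the vocabulary you trained):

.. code-block:: python

    output = tokenizer.encode("Hello, y'all! How are you 😁 ?")
    print(output.ids)
    # [1, 27253, 16, 93, 11, 5097, 5, 7961, 5112, 6218, 0, 35, 2]

    # decode() maps each ID back to its token, drops special tokens like [CLS]
    # and [SEP], and joins the remaining tokens with spaces
    tokenizer.decode(output.ids)
    # "Hello , y ' all ! How are you ?"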
If you used a model that added special characters to represent subtokens of a given "word" (like
the :obj:`"##"` in WordPiece), you will need to customize the `decoder` to treat them properly. If we
take our previous :obj:`bert_tokenizer` for instance the default decoing will give:
take our previous :entity:`bert_tokenizer` for instance, the default decoding will give:
.. code-block:: python
output = bert_tokenizer.encode("Welcome to the 🤗 Tokenizers library.")
print(output.tokens)
# ["[CLS]", "welcome", "to", "the", "[UNK]", "tok", "##eni", "##zer", "##s", "library", ".", "[SEP]"]
bert_tokenizer.decoder(output.ids)
# "welcome to the tok ##eni ##zer ##s library ."
.. only:: python
.. literalinclude:: ../../bindings/python/tests/documentation/test_pipeline.py
:language: python
:start-after: START bert_test_decoding
:end-before: END bert_test_decoding
:dedent: 8
.. only:: rust
.. literalinclude:: ../../tokenizers/tests/documentation.rs
:language: rust
:start-after: START bert_test_decoding
:end-before: END bert_test_decoding
:dedent: 4
.. only:: node
.. literalinclude:: ../../bindings/node/examples/documentation/pipeline.test.ts
:language: javascript
:start-after: START bert_test_decoding
:end-before: END bert_test_decoding
:dedent: 8
But by changing it to a proper decoder, we get:
.. code-block:: python
bert_tokenizer.decoder = tokenizers.decoders.WordPiece()
bert_tokenizer.decode(output.ids)
# "welcome to the tokenizers library."
.. only:: python
.. literalinclude:: ../../bindings/python/tests/documentation/test_pipeline.py
:language: python
:start-after: START bert_proper_decoding
:end-before: END bert_proper_decoding
:dedent: 8
.. only:: rust
.. literalinclude:: ../../tokenizers/tests/documentation.rs
:language: rust
:start-after: START bert_proper_decoding
:end-before: END bert_proper_decoding
:dedent: 4
.. only:: node
.. literalinclude:: ../../bindings/node/examples/documentation/pipeline.test.ts
:language: javascript
:start-after: START bert_proper_decoding
:end-before: END bert_proper_decoding
:dedent: 8
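
To make the round trip concrete, here is a compact Python sketch of the two behaviors shown above (a sketch only, assuming the :obj:`bert_tokenizer` built earlier on this page and the :obj:`decoders.WordPiece` class exposed by the Python bindings):

.. code-block:: python

    from tokenizers import decoders

    output = bert_tokenizer.encode("Welcome to the 🤗 Tokenizers library.")

    # Default decoding keeps the "##" continuation markers produced by WordPiece
    print(bert_tokenizer.decode(output.ids))
    # "welcome to the tok ##eni ##zer ##s library ."

    # A WordPiece decoder merges the subword pieces back into full words
    bert_tokenizer.decoder = decoders.WordPiece()
    print(bert_tokenizer.decode(output.ids))
    # "welcome to the tokenizers library."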


@@ -368,6 +368,18 @@ fn pipeline() -> tokenizers::Result<()> {
.unwrap(),
);
// END pipeline_setup_processor
// START pipeline_test_decoding
let output = tokenizer.encode("Hello, y'all! How are you 😁 ?", true)?;
println!("{:?}", output.get_ids());
// [1, 27253, 16, 93, 11, 5097, 5, 7961, 5112, 6218, 0, 35, 2]
let decoded = tokenizer.decode(
vec![1, 27253, 16, 93, 11, 5097, 5, 7961, 5112, 6218, 0, 35, 2],
true,
)?;
println!("{}", decoded);
// "Hello , y ' all ! How are you ?"
// END pipeline_test_decoding
Ok(())
}
@@ -444,6 +456,22 @@ fn pipeline_bert() -> tokenizers::Result<()> {
bert_tokenizer.save("data/bert-wiki.json", false)?;
// END bert_train_tokenizer
// START bert_test_decoding
let output = bert_tokenizer.encode("Welcome to the 🤗 Tokenizers library.", true)?;
println!("{:?}", output.get_tokens());
// ["[CLS]", "welcome", "to", "the", "[UNK]", "tok", "##eni", "##zer", "##s", "library", ".", "[SEP]"]
let decoded = bert_tokenizer.decode(output.get_ids().to_vec(), true)?;
println!("{}", decoded);
// "welcome to the tok ##eni ##zer ##s library ."
// END bert_test_decoding
// START bert_proper_decoding
use tokenizers::decoders::wordpiece::WordPiece as WordPieceDecoder;
bert_tokenizer.with_decoder(WordPieceDecoder::default());
let decoded = bert_tokenizer.decode(output.get_ids().to_vec(), true)?;
// "welcome to the tokenizers library."
// END bert_proper_decoding
println!("{}", decoded);
Ok(())
}