Doc - Update quicktour for Node

Anthony MOI
2020-10-29 11:56:26 -04:00
committed by Anthony MOI
parent 9521603e08
commit b23310b481
3 changed files with 371 additions and 5 deletions

View File

@@ -1,4 +1,4 @@
-const globRequire = require;
+var globRequire = require;
 describe("pipelineExample", () => {
   // This is a hack to let us require using path similar to what the user has to use
@@ -82,7 +82,7 @@ describe("pipelineExample", () => {
     // START bert_train_tokenizer
     let { wordPieceTrainer } = require("tokenizers/bindings/trainers");
-    let { promisify } = require("utils");
+    let { promisify } = require("util");
     let trainer = wordPieceTrainer({
       vocabSize: 30522,
@@ -107,7 +107,7 @@ describe("pipelineExample", () => {
     let { Tokenizer } = require("tokenizers/bindings/tokenizer");
     let { WordPiece } = require("tokenizers/bindings/models");
-    let bertTokenizer = Tokenizer(WordPiece.empty());
+    let bertTokenizer = new Tokenizer(WordPiece.empty());
     // END bert_setup_tokenizer
     // START bert_setup_normalizer
     let { sequenceNormalizer, lowercaseNormalizer, nfdNormalizer, stripAccentsNormalizer }
@@ -118,12 +118,12 @@ describe("pipelineExample", () => {
     ]))
     // END bert_setup_normalizer
     // START bert_setup_pre_tokenizer
-    let { whitespacePreTokenizer } = require("tokenizers/bindings/pre_tokenizers");
+    let { whitespacePreTokenizer } = require("tokenizers/bindings/pre-tokenizers");
     bertTokenizer.setPreTokenizer = whitespacePreTokenizer();
     // END bert_setup_pre_tokenizer
     // START bert_setup_processor
-    let { templateProcessing } = require("tokenizers/bindings/processors");
+    let { templateProcessing } = require("tokenizers/bindings/post-processors");
     bertTokenizer.setPostProcessor(templateProcessing(
       "[CLS] $A [SEP]",

View File

@@ -0,0 +1,190 @@
var globRequire = require;
describe("quicktourExample", () => {
  function require(mod: string) {
    if (mod.startsWith("tokenizers/")) {
      let path = mod.slice("tokenizers/".length);
      return globRequire("../../lib/" + path);
    } else {
      return globRequire(mod);
    }
  }
it.skip("trains the tokenizer", async () => {
// START init_tokenizer
let { Tokenizer } = require("tokenizers/bindings/tokenizer");
let { BPE } = require("tokenizers/bindings/models");
let tokenizer = new Tokenizer(BPE.empty());
// END init_tokenizer
// START init_trainer
let { bpeTrainer } = require("tokenizers/bindings/trainers");
let trainer = bpeTrainer({
specialTokens: ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]
});
// END init_trainer
// START init_pretok
let { whitespacePreTokenizer } = require("tokenizers/bindings/pre-tokenizers");
tokenizer.setPreTokenizer(whitespacePreTokenizer());
// END init_pretok
// START train
let files = ["test", "train", "valid"].map(split => `data/wikitext-103-raw/wiki.${split}.raw`);
tokenizer.train(trainer, files);
// END train
// START reload_model
let { promisify } = require("util");
let modelFiles = tokenizer.getModel().save("data", "wiki");
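    // BPE.fromFile takes a Node-style callback, so promisify it to let us await the reloaded model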
    let fromFile = promisify(BPE.fromFile);
    tokenizer.setModel(await fromFile(modelFiles[0], modelFiles[1], {
      unkToken: "[UNK]"
    }));
    // END reload_model
    // START save
    tokenizer.save("data/tokenizer-wiki.json");
    // END save
  });
it("shows a quicktour example", async () => {
let { Tokenizer } = require("tokenizers/bindings/tokenizer");
let console = {
log: (..._args: any[]) => {}
};
// START reload_tokenizer
let tokenizer = Tokenizer.fromFile("data/tokenizer-wiki.json");
// END reload_tokenizer
// START encode
let { promisify } = require('util');
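    // encode is callback-based too; bind it to the tokenizer and promisify so we can await the result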
    let encode = promisify(tokenizer.encode.bind(tokenizer));
    var output = await encode("Hello, y'all! How are you 😁 ?");
    // END encode
    // START print_tokens
    console.log(output.getTokens());
    // ["Hello", ",", "y", "'", "all", "!", "How", "are", "you", "[UNK]", "?"]
    // END print_tokens
    expect(output.getTokens()).toEqual([
      "Hello",
      ",",
      "y",
      "'",
      "all",
      "!",
      "How",
      "are",
      "you",
      "[UNK]",
      "?",
    ]);
    // START print_ids
    console.log(output.getIds());
    // [27253, 16, 93, 11, 5097, 5, 7961, 5112, 6218, 0, 35]
    // END print_ids
    expect(output.getIds()).toEqual([27253, 16, 93, 11, 5097, 5, 7961, 5112, 6218, 0, 35]);
    // START print_offsets
    let offsets = output.getOffsets();
    console.log(offsets[9]);
    // (26, 27)
    // END print_offsets
    expect(offsets[9]).toEqual([26, 27]);
    // START use_offsets
    let { slice } = require("tokenizers/bindings/utils");
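    // offsets count unicode characters, while JS strings are UTF-16 ("😁" is two
    // code units), so we use the unicode-aware slice from the bindings rather
    // than String.prototype.slice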
    let sentence = "Hello, y'all! How are you 😁 ?";
    let [start, end] = offsets[9];
    console.log(slice(sentence, start, end));
    // "😁"
    // END use_offsets
    expect(slice(sentence, start, end)).toEqual("😁");
    // START check_sep
    console.log(tokenizer.tokenToId("[SEP]"));
    // 2
    // END check_sep
    expect(tokenizer.tokenToId("[SEP]")).toEqual(2);
    // START init_template_processing
    let { templateProcessing } = require("tokenizers/bindings/post-processors");
    tokenizer.setPostProcessor(templateProcessing(
      "[CLS] $A [SEP]",
      "[CLS] $A [SEP] $B:1 [SEP]:1",
      [
        ["[CLS]", tokenizer.tokenToId("[CLS]")],
        ["[SEP]", tokenizer.tokenToId("[SEP]")],
      ],
    ));
    // END init_template_processing
    // START print_special_tokens
    var output = await encode("Hello, y'all! How are you 😁 ?");
    console.log(output.getTokens());
    // ["[CLS]", "Hello", ",", "y", "'", "all", "!", "How", "are", "you", "[UNK]", "?", "[SEP]"]
    // END print_special_tokens
    expect(output.getTokens()).toEqual([
      "[CLS]",
      "Hello",
      ",",
      "y",
      "'",
      "all",
      "!",
      "How",
      "are",
      "you",
      "[UNK]",
      "?",
      "[SEP]",
    ]);
    // START print_special_tokens_pair
    var output = await encode("Hello, y'all!", "How are you 😁 ?");
    console.log(output.getTokens());
    // ["[CLS]", "Hello", ",", "y", "'", "all", "!", "[SEP]", "How", "are", "you", "[UNK]", "?", "[SEP]"]
    // END print_special_tokens_pair
    expect(output.getTokens()).toEqual([
      "[CLS]",
      "Hello",
      ",",
      "y",
      "'",
      "all",
      "!",
      "[SEP]",
      "How",
      "are",
      "you",
      "[UNK]",
      "?",
      "[SEP]",
    ]);
    // START print_type_ids
    console.log(output.getTypeIds());
    // [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]
    // END print_type_ids
    expect(output.getTypeIds()).toEqual([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]);
    // START encode_batch
    let encodeBatch = promisify(tokenizer.encodeBatch.bind(tokenizer));
    var output = await encodeBatch(["Hello, y'all!", "How are you 😁 ?"]);
    // END encode_batch
    // START encode_batch_pair
    var output = await encodeBatch(
      [["Hello, y'all!", "How are you 😁 ?"], ["Hello to you too!", "I'm fine, thank you!"]]
    );
    // END encode_batch_pair
    // START enable_padding
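    // padId 3 matches "[PAD]" in the vocabulary trained above
    // (the specialTokens order gives [UNK]=0, [CLS]=1, [SEP]=2, [PAD]=3)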
    tokenizer.setPadding({ padId: 3, padToken: "[PAD]" });
    // END enable_padding
    // START print_batch_tokens
    var output = await encodeBatch(["Hello, y'all!", "How are you 😁 ?"]);
    console.log(output[1].getTokens());
    // ["[CLS]", "How", "are", "you", "[UNK]", "?", "[SEP]", "[PAD]"]
    // END print_batch_tokens
    expect(output[1].getTokens()).toEqual(["[CLS]", "How", "are", "you", "[UNK]", "?", "[SEP]", "[PAD]"]);
    // START print_attention_mask
    console.log(output[1].getAttentionMask());
    // [1, 1, 1, 1, 1, 1, 1, 0]
    // END print_attention_mask
    expect(output[1].getAttentionMask()).toEqual([1, 1, 1, 1, 1, 1, 1, 0]);
  });
});
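
As a rough standalone sketch (not part of this commit; the file name and setup are hypothetical),
the same flow in a user's own script would drop the globRequire hack and require the installed
`tokenizers` package directly, reusing the `data/tokenizer-wiki.json` file saved above:

  // quicktour.ts (hypothetical): reload the saved tokenizer and encode a sentence
  const { promisify } = require("util");
  const { Tokenizer } = require("tokenizers/bindings/tokenizer");

  async function main() {
    const tokenizer = Tokenizer.fromFile("data/tokenizer-wiki.json");
    // the bindings are callback-based, so wrap encode for async/await
    const encode = promisify(tokenizer.encode.bind(tokenizer));
    const output = await encode("Hello, y'all! How are you 😁 ?");
    console.log(output.getTokens()); // ["Hello", ",", "y", "'", "all", ...]
    console.log(output.getIds());
  }

  main().catch(console.error);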

View File

@@ -99,6 +99,14 @@ one with a BPE model:
         :end-before: END quicktour_init_tokenizer
         :dedent: 4
 
+.. only:: node
+
+    .. literalinclude:: ../../bindings/node/examples/documentation/quicktour.test.ts
+        :language: javascript
+        :start-after: START init_tokenizer
+        :end-before: END init_tokenizer
+        :dedent: 8
+
 To train our tokenizer on the wikitext files, we will need to instantiate a `trainer`, in this case
 a :entity:`BpeTrainer`
@@ -118,6 +126,14 @@ a :entity:`BpeTrainer`
         :end-before: END quicktour_init_trainer
         :dedent: 4
 
+.. only:: node
+
+    .. literalinclude:: ../../bindings/node/examples/documentation/quicktour.test.ts
+        :language: javascript
+        :start-after: START init_trainer
+        :end-before: END init_trainer
+        :dedent: 8
+
 We can set the training arguments like :entity:`vocab_size` or :entity:`min_frequency` (here left at
 their default values of 30,000 and 0) but the most important part is to give the
 :entity:`special_tokens` we plan to use later on (they are not used at all during training) so that
@@ -151,6 +167,14 @@ on whitespace.
         :end-before: END quicktour_init_pretok
         :dedent: 4
 
+.. only:: node
+
+    .. literalinclude:: ../../bindings/node/examples/documentation/quicktour.test.ts
+        :language: javascript
+        :start-after: START init_pretok
+        :end-before: END init_pretok
+        :dedent: 8
+
 Now, we can just call the :entity:`Tokenizer.train` method with any list of files we want
 to use:
@@ -170,6 +194,14 @@ to use:
         :end-before: END quicktour_train
         :dedent: 4
 
+.. only:: node
+
+    .. literalinclude:: ../../bindings/node/examples/documentation/quicktour.test.ts
+        :language: javascript
+        :start-after: START train
+        :end-before: END train
+        :dedent: 8
+
 This should only take a few seconds to train our tokenizer on the full wikitext dataset! Once this
 is done, we need to save the model and reinstantiate it with the unknown token, or this token won't
 be used. This will be simplified in a further release, to let you set the :entity:`unk_token` when
@@ -191,6 +223,14 @@ first instantiating the model.
         :end-before: END quicktour_reload_model
         :dedent: 4
 
+.. only:: node
+
+    .. literalinclude:: ../../bindings/node/examples/documentation/quicktour.test.ts
+        :language: javascript
+        :start-after: START reload_model
+        :end-before: END reload_model
+        :dedent: 8
+
 To save the tokenizer in one file that contains all its configuration and vocabulary, just use the
 :entity:`Tokenizer.save` method:
@@ -210,6 +250,14 @@ To save the tokenizer in one file that contains all its configuration and vocabu
         :end-before: END quicktour_save
         :dedent: 4
 
+.. only:: node
+
+    .. literalinclude:: ../../bindings/node/examples/documentation/quicktour.test.ts
+        :language: javascript
+        :start-after: START save
+        :end-before: END save
+        :dedent: 8
+
 and you can reload your tokenizer from that file with the :entity:`Tokenizer.from_file`
 :entity:`classmethod`:
@@ -229,6 +277,14 @@ and you can reload your tokenizer from that file with the :entity:`Tokenizer.fro
         :end-before: END quicktour_reload_tokenizer
         :dedent: 4
 
+.. only:: node
+
+    .. literalinclude:: ../../bindings/node/examples/documentation/quicktour.test.ts
+        :language: javascript
+        :start-after: START reload_tokenizer
+        :end-before: END reload_tokenizer
+        :dedent: 8
+
 Using the tokenizer
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -251,6 +307,14 @@ Now that we have trained a tokenizer, we can use it on any text we want with the
         :end-before: END quicktour_encode
         :dedent: 4
 
+.. only:: node
+
+    .. literalinclude:: ../../bindings/node/examples/documentation/quicktour.test.ts
+        :language: javascript
+        :start-after: START encode
+        :end-before: END encode
+        :dedent: 8
+
 This applied the full pipeline of the tokenizer on the text, returning an
 :entity:`Encoding` object. To learn more about this pipeline, and how to apply (or
 customize) parts of it, check out :doc:`this page <pipeline>`.
@@ -275,6 +339,14 @@ tokens:
         :end-before: END quicktour_print_tokens
         :dedent: 4
 
+.. only:: node
+
+    .. literalinclude:: ../../bindings/node/examples/documentation/quicktour.test.ts
+        :language: javascript
+        :start-after: START print_tokens
+        :end-before: END print_tokens
+        :dedent: 8
+
 Similarly, the :obj:`ids` attribute will contain the index of each of those tokens in the
 tokenizer's vocabulary:
@@ -294,6 +366,14 @@ tokenizer's vocabulary:
         :end-before: END quicktour_print_ids
         :dedent: 4
 
+.. only:: node
+
+    .. literalinclude:: ../../bindings/node/examples/documentation/quicktour.test.ts
+        :language: javascript
+        :start-after: START print_ids
+        :end-before: END print_ids
+        :dedent: 8
+
 An important feature of the 🤗 Tokenizers library is that it comes with full alignment tracking,
 meaning you can always get the part of your original sentence that corresponds to a given token.
 Those are stored in the :obj:`offsets` attribute of our :entity:`Encoding` object. For
@@ -316,6 +396,14 @@ which is the token at index 9 in the list, we can just ask for the offset at the
         :end-before: END quicktour_print_offsets
         :dedent: 4
 
+.. only:: node
+
+    .. literalinclude:: ../../bindings/node/examples/documentation/quicktour.test.ts
+        :language: javascript
+        :start-after: START print_offsets
+        :end-before: END print_offsets
+        :dedent: 8
+
 and those are the indices that correspond to the emoji in the original sentence:
 
 .. only:: python
@@ -334,6 +422,14 @@ and those are the indices that correspond to the emoji in the original sentence:
         :end-before: END quicktour_use_offsets
         :dedent: 4
 
+.. only:: node
+
+    .. literalinclude:: ../../bindings/node/examples/documentation/quicktour.test.ts
+        :language: javascript
+        :start-after: START use_offsets
+        :end-before: END use_offsets
+        :dedent: 8
+
 Post-processing
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -362,6 +458,14 @@ list of special tokens, so this should be their IDs. To double-check, we can use
         :end-before: END quicktour_check_sep
         :dedent: 4
 
+.. only:: node
+
+    .. literalinclude:: ../../bindings/node/examples/documentation/quicktour.test.ts
+        :language: javascript
+        :start-after: START check_sep
+        :end-before: END check_sep
+        :dedent: 8
+
 Here is how we can set the post-processing to give us the traditional BERT inputs:
 
 .. only:: python
@@ -380,6 +484,14 @@ Here is how we can set the post-processing to give us the traditional BERT input
         :end-before: END quicktour_init_template_processing
         :dedent: 4
 
+.. only:: node
+
+    .. literalinclude:: ../../bindings/node/examples/documentation/quicktour.test.ts
+        :language: javascript
+        :start-after: START init_template_processing
+        :end-before: END init_template_processing
+        :dedent: 8
+
 Let's go over this snippet of code in more details. First we specify the template for single
 sentences: those should have the form :obj:`"[CLS] $A [SEP]"` where :obj:`$A` represents our
 sentence.
@@ -410,6 +522,14 @@ To check out this worked properly, let's try to encode the same sentence as befo
         :end-before: END quicktour_print_special_tokens
         :dedent: 4
 
+.. only:: node
+
+    .. literalinclude:: ../../bindings/node/examples/documentation/quicktour.test.ts
+        :language: javascript
+        :start-after: START print_special_tokens
+        :end-before: END print_special_tokens
+        :dedent: 8
+
 To check the results on a pair of sentences, we just pass the two sentences to
 :entity:`Tokenizer.encode`:
@@ -429,6 +549,14 @@ To check the results on a pair of sentences, we just pass the two sentences to
         :end-before: END quicktour_print_special_tokens_pair
         :dedent: 4
 
+.. only:: node
+
+    .. literalinclude:: ../../bindings/node/examples/documentation/quicktour.test.ts
+        :language: javascript
+        :start-after: START print_special_tokens_pair
+        :end-before: END print_special_tokens_pair
+        :dedent: 8
+
 You can then check the type IDs attributed to each token is correct with
 
 .. only:: python
@@ -447,6 +575,14 @@ You can then check the type IDs attributed to each token is correct with
         :end-before: END quicktour_print_type_ids
         :dedent: 4
 
+.. only:: node
+
+    .. literalinclude:: ../../bindings/node/examples/documentation/quicktour.test.ts
+        :language: javascript
+        :start-after: START print_type_ids
+        :end-before: END print_type_ids
+        :dedent: 8
+
 If you save your tokenizer with :entity:`Tokenizer.save`, the post-processor will be saved along.
 
 Encoding multiple sentences in a batch
@@ -471,6 +607,14 @@ using the :entity:`Tokenizer.encode_batch` method:
         :end-before: END quicktour_encode_batch
         :dedent: 4
 
+.. only:: node
+
+    .. literalinclude:: ../../bindings/node/examples/documentation/quicktour.test.ts
+        :language: javascript
+        :start-after: START encode_batch
+        :end-before: END encode_batch
+        :dedent: 8
+
 The output is then a list of :entity:`Encoding` objects like the ones we saw before. You
 can process together as many texts as you like, as long as it fits in memory.
@@ -494,6 +638,14 @@ B:
         :end-before: END quicktour_encode_batch_pair
         :dedent: 4
 
+.. only:: node
+
+    .. literalinclude:: ../../bindings/node/examples/documentation/quicktour.test.ts
+        :language: javascript
+        :start-after: START encode_batch_pair
+        :end-before: END encode_batch_pair
+        :dedent: 8
+
 When encoding multiple sentences, you can automatically pad the outputs to the longest sentence
 present by using :entity:`Tokenizer.enable_padding`, with the :entity:`pad_token` and its ID
 (which we can double-check the id for the padding token with
@@ -515,6 +667,14 @@ present by using :entity:`Tokenizer.enable_padding`, with the :entity:`pad_token
         :end-before: END quicktour_enable_padding
         :dedent: 4
 
+.. only:: node
+
+    .. literalinclude:: ../../bindings/node/examples/documentation/quicktour.test.ts
+        :language: javascript
+        :start-after: START enable_padding
+        :end-before: END enable_padding
+        :dedent: 8
+
 We can set the :obj:`direction` of the padding (defaults to the right) or a given :obj:`length` if
 we want to pad every sample to that specific number (here we leave it unset to pad to the size of
 the longest text).
@@ -535,6 +695,14 @@ the longest text).
         :end-before: END quicktour_print_batch_tokens
         :dedent: 4
 
+.. only:: node
+
+    .. literalinclude:: ../../bindings/node/examples/documentation/quicktour.test.ts
+        :language: javascript
+        :start-after: START print_batch_tokens
+        :end-before: END print_batch_tokens
+        :dedent: 8
+
 In this case, the `attention mask` generated by the tokenizer takes the padding into account:
 
 .. only:: python
@@ -553,6 +721,14 @@ In this case, the `attention mask` generated by the tokenizer takes the padding
         :end-before: END quicktour_print_attention_mask
         :dedent: 4
 
+.. only:: node
+
+    .. literalinclude:: ../../bindings/node/examples/documentation/quicktour.test.ts
+        :language: javascript
+        :start-after: START print_attention_mask
+        :end-before: END print_attention_mask
+        :dedent: 8
+
 .. _pretrained:
 
 .. only:: python
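
The `.. only:: node` sections added here are rendered only when the matching Sphinx tag is defined
at build time (e.g. `sphinx-build -t node ...`); the same mechanism presumably selects between
these blocks and the existing `.. only:: python` ones when each flavor of the docs is built.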