Doc - Update quicktour for Node
@@ -1,4 +1,4 @@
-const globRequire = require;
+var globRequire = require;
 
 describe("pipelineExample", () => {
     // This is a hack to let us require using path similar to what the user has to use
@@ -82,7 +82,7 @@ describe("pipelineExample", () => {
 
     // START bert_train_tokenizer
     let { wordPieceTrainer } = require("tokenizers/bindings/trainers");
-    let { promisify } = require("utils");
+    let { promisify } = require("util");
 
     let trainer = wordPieceTrainer({
         vocabSize: 30522,
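The `utils` → `util` change is the substantive fix in this hunk: `promisify` lives in Node's built-in `util` module, so requiring the nonexistent `utils` fails at runtime. The surrounding examples all rely on the same pattern, since the bindings expose callback-style methods; a minimal sketch (using the `encode` signature that appears in the quicktour file below):

    let { promisify } = require("util"); // built-in module is "util", not "utils"

    // The Node bindings take Node-style callbacks; promisify turns
    // tokenizer.encode(text, cb) into a Promise-returning function.
    let encode = promisify(tokenizer.encode.bind(tokenizer));
    let output = await encode("Hello, y'all!");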
@@ -107,7 +107,7 @@ describe("pipelineExample", () => {
     let { Tokenizer } = require("tokenizers/bindings/tokenizer");
     let { WordPiece } = require("tokenizers/bindings/models");
 
-    let bertTokenizer = Tokenizer(WordPiece.empty());
+    let bertTokenizer = new Tokenizer(WordPiece.empty());
     // END bert_setup_tokenizer
     // START bert_setup_normalizer
     let { sequenceNormalizer, lowercaseNormalizer, nfdNormalizer, stripAccentsNormalizer }
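The fix here adds the missing `new`: the bindings export `Tokenizer` as a class, and invoking a class constructor without `new` throws a TypeError at runtime, so the old snippet could never have worked as written. A minimal sketch of the corrected usage:

    let { Tokenizer } = require("tokenizers/bindings/tokenizer");
    let { WordPiece } = require("tokenizers/bindings/models");

    // "Tokenizer(...)" throws: class constructors cannot be invoked without 'new'
    let bertTokenizer = new Tokenizer(WordPiece.empty());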
@@ -118,12 +118,12 @@ describe("pipelineExample", () => {
     ]))
     // END bert_setup_normalizer
     // START bert_setup_pre_tokenizer
-    let { whitespacePreTokenizer } = require("tokenizers/bindings/pre_tokenizers");
+    let { whitespacePreTokenizer } = require("tokenizers/bindings/pre-tokenizers");
 
     bertTokenizer.setPreTokenizer = whitespacePreTokenizer();
     // END bert_setup_pre_tokenizer
     // START bert_setup_processor
-    let { templateProcessing } = require("tokenizers/bindings/processors");
+    let { templateProcessing } = require("tokenizers/bindings/post-processors");
 
     bertTokenizer.setPostProcessor(templateProcessing(
         "[CLS] $A [SEP]",
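The two remaining changes correct the module paths to the dashed file names the bindings actually ship, matching the requires used in the new quicktour file below:

    let { whitespacePreTokenizer } = require("tokenizers/bindings/pre-tokenizers");
    let { templateProcessing } = require("tokenizers/bindings/post-processors");

Note that the untouched context line `bertTokenizer.setPreTokenizer = whitespacePreTokenizer();` assigns over the method rather than calling `bertTokenizer.setPreTokenizer(whitespacePreTokenizer())` as the quicktour file below does; this commit leaves that line as is.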
bindings/node/examples/documentation/quicktour.test.ts (new file, 190 lines)

@@ -0,0 +1,190 @@
+var globRequire = require;
+
+describe("quicktourExample", () => {
+    function require(mod: string) {
+        if (mod.startsWith("tokenizers/")) {
+            let path = mod.slice("tokenizers/".length);
+            return globRequire("../../lib/" + path);
+        } else {
+            return globRequire(mod);
+        }
+    }
+
+    it.skip("trains the tokenizer", async () => {
+        // START init_tokenizer
+        let { Tokenizer } = require("tokenizers/bindings/tokenizer");
+        let { BPE } = require("tokenizers/bindings/models");
+
+        let tokenizer = new Tokenizer(BPE.empty());
+        // END init_tokenizer
+        // START init_trainer
+        let { bpeTrainer } = require("tokenizers/bindings/trainers");
+
+        let trainer = bpeTrainer({
+            specialTokens: ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]
+        });
+        // END init_trainer
+        // START init_pretok
+        let { whitespacePreTokenizer } = require("tokenizers/bindings/pre-tokenizers");
+
+        tokenizer.setPreTokenizer(whitespacePreTokenizer());
+        // END init_pretok
+        // START train
+        let files = ["test", "train", "valid"].map(split => `data/wikitext-103-raw/wiki.${split}.raw`);
+        tokenizer.train(trainer, files);
+        // END train
+        // START reload_model
+        let { promisify } = require("util");
+
+        let modelFiles = tokenizer.getModel().save("data", "wiki");
+        let fromFile = promisify(BPE.fromFile);
+        tokenizer.setModel(await fromFile(modelFiles[0], modelFiles[1], {
+            unkToken: "[UNK]"
+        }));
+        // END reload_model
+        // START save
+        tokenizer.save("data/tokenizer-wiki.json");
+        // END save
+    });
+
+    it("shows a quicktour example", async () => {
+        let { Tokenizer } = require("tokenizers/bindings/tokenizer");
+        let console = {
+            log: (..._args: any[]) => {}
+        };
+
+        // START reload_tokenizer
+        let tokenizer = Tokenizer.fromFile("data/tokenizer-wiki.json");
+        // END reload_tokenizer
+        // START encode
+        let { promisify } = require('util');
+        let encode = promisify(tokenizer.encode.bind(tokenizer));
+
+        var output = await encode("Hello, y'all! How are you 😁 ?");
+        // END encode
+        // START print_tokens
+        console.log(output.getTokens());
+        // ["Hello", ",", "y", "'", "all", "!", "How", "are", "you", "[UNK]", "?"]
+        // END print_tokens
+        expect(output.getTokens()).toEqual([
+            "Hello",
+            ",",
+            "y",
+            "'",
+            "all",
+            "!",
+            "How",
+            "are",
+            "you",
+            "[UNK]",
+            "?",
+        ]);
+        // START print_ids
+        console.log(output.getIds());
+        // [27253, 16, 93, 11, 5097, 5, 7961, 5112, 6218, 0, 35]
+        // END print_ids
+        expect(output.getIds()).toEqual([27253, 16, 93, 11, 5097, 5, 7961, 5112, 6218, 0, 35]);
+        // START print_offsets
+        let offsets = output.getOffsets();
+        console.log(offsets[9]);
+        // (26, 27)
+        // END print_offsets
+        expect(offsets[9]).toEqual([26, 27]);
+        // START use_offsets
+        let { slice } = require("tokenizers/bindings/utils");
+
+        let sentence = "Hello, y'all! How are you 😁 ?"
+        let [start, end] = offsets[9];
+        console.log(slice(sentence, start, end));
+        // "😁"
+        // END use_offsets
+        expect(slice(sentence, start, end)).toEqual("😁");
+        // START check_sep
+        console.log(tokenizer.tokenToId("[SEP]"));
+        // 2
+        // END check_sep
+        expect(tokenizer.tokenToId("[SEP]")).toEqual(2);
+        // START init_template_processing
+        let { templateProcessing } = require("tokenizers/bindings/post-processors");
+
+        tokenizer.setPostProcessor(templateProcessing(
+            "[CLS] $A [SEP]",
+            "[CLS] $A [SEP] $B:1 [SEP]:1",
+            [
+                ["[CLS]", tokenizer.tokenToId("[CLS]")],
+                ["[SEP]", tokenizer.tokenToId("[SEP]")],
+            ],
+        ));
+        // END init_template_processing
+        // START print_special_tokens
+        var output = await encode("Hello, y'all! How are you 😁 ?");
+        console.log(output.getTokens());
+        // ["[CLS]", "Hello", ",", "y", "'", "all", "!", "How", "are", "you", "[UNK]", "?", "[SEP]"]
+        // END print_special_tokens
+        expect(output.getTokens()).toEqual([
+            "[CLS]",
+            "Hello",
+            ",",
+            "y",
+            "'",
+            "all",
+            "!",
+            "How",
+            "are",
+            "you",
+            "[UNK]",
+            "?",
+            "[SEP]",
+        ]);
+        // START print_special_tokens_pair
+        var output = await encode("Hello, y'all!", "How are you 😁 ?");
+        console.log(output.getTokens());
+        // ["[CLS]", "Hello", ",", "y", "'", "all", "!", "[SEP]", "How", "are", "you", "[UNK]", "?", "[SEP]"]
+        // END print_special_tokens_pair
+        expect(output.getTokens()).toEqual([
+            "[CLS]",
+            "Hello",
+            ",",
+            "y",
+            "'",
+            "all",
+            "!",
+            "[SEP]",
+            "How",
+            "are",
+            "you",
+            "[UNK]",
+            "?",
+            "[SEP]",
+        ]);
+        // START print_type_ids
+        console.log(output.getTypeIds());
+        // [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]
+        // END print_type_ids
+        expect(output.getTypeIds()).toEqual([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]);
+        // START encode_batch
+        let encodeBatch = promisify(tokenizer.encodeBatch.bind(tokenizer));
+
+        var output = await encodeBatch(["Hello, y'all!", "How are you 😁 ?"]);
+        // END encode_batch
+        // START encode_batch_pair
+        var output = await encodeBatch(
+            [["Hello, y'all!", "How are you 😁 ?"], ["Hello to you too!", "I'm fine, thank you!"]]
+        );
+        // END encode_batch_pair
+        // START enable_padding
+        tokenizer.setPadding({ padId: 3, padToken: "[PAD]" });
+        // END enable_padding
+        // START print_batch_tokens
+        var output = await encodeBatch(["Hello, y'all!", "How are you 😁 ?"]);
+        console.log(output[1].getTokens());
+        // ["[CLS]", "How", "are", "you", "[UNK]", "?", "[SEP]", "[PAD]"]
+        // END print_batch_tokens
+        expect(output[1].getTokens()).toEqual(["[CLS]", "How", "are", "you", "[UNK]", "?", "[SEP]", "[PAD]"]);
+        // START print_attention_mask
+        console.log(output[1].getAttentionMask());
+        // [1, 1, 1, 1, 1, 1, 1, 0]
+        // END print_attention_mask
+        expect(output[1].getAttentionMask()).toEqual([1, 1, 1, 1, 1, 1, 1, 0]);
+    });
+});
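The local `require(mod)` shim at the top of the new file rewrites documentation-style paths onto the locally built bindings, so the code between the START/END markers reads exactly like user code while the test exercises `../../lib`:

    // What the shim resolves (derived from the function above):
    // require("tokenizers/bindings/models") -> globRequire("../../lib/bindings/models")
    // require("util")                       -> globRequire("util")   (built-ins pass through)

The training test is marked `it.skip` because it expects the raw wikitext-103 files under `data/`; the second test only needs the `data/tokenizer-wiki.json` file that the first one produces.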
@@ -99,6 +99,14 @@ one with a BPE model:
         :end-before: END quicktour_init_tokenizer
         :dedent: 4
 
+.. only:: node
+
+    .. literalinclude:: ../../bindings/node/examples/documentation/quicktour.test.ts
+        :language: javascript
+        :start-after: START init_tokenizer
+        :end-before: END init_tokenizer
+        :dedent: 8
+
 To train our tokenizer on the wikitext files, we will need to instantiate a `trainer`, in this case
 a :entity:`BpeTrainer`
 
@@ -118,6 +126,14 @@ a :entity:`BpeTrainer`
         :end-before: END quicktour_init_trainer
         :dedent: 4
 
+.. only:: node
+
+    .. literalinclude:: ../../bindings/node/examples/documentation/quicktour.test.ts
+        :language: javascript
+        :start-after: START init_trainer
+        :end-before: END init_trainer
+        :dedent: 8
+
 We can set the training arguments like :entity:`vocab_size` or :entity:`min_frequency` (here left at
 their default values of 30,000 and 0) but the most important part is to give the
 :entity:`special_tokens` we plan to use later on (they are not used at all during training) so that
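For the Node snippet above, those training arguments go into the trainer's options object. A hedged sketch (`vocabSize` and `specialTokens` are spelled as in the tests in this commit; `minFrequency` is an assumed camelCase counterpart of `min_frequency` and does not appear in this diff):

    let { bpeTrainer } = require("tokenizers/bindings/trainers");

    let trainer = bpeTrainer({
        vocabSize: 30000,    // default value cited in the text
        minFrequency: 0,     // assumed option name; default value cited in the text
        specialTokens: ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]
    });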
@@ -151,6 +167,14 @@ on whitespace.
         :end-before: END quicktour_init_pretok
         :dedent: 4
 
+.. only:: node
+
+    .. literalinclude:: ../../bindings/node/examples/documentation/quicktour.test.ts
+        :language: javascript
+        :start-after: START init_pretok
+        :end-before: END init_pretok
+        :dedent: 8
+
 Now, we can just call the :entity:`Tokenizer.train` method with any list of files we want
 to use:
 
@@ -170,6 +194,14 @@ to use:
         :end-before: END quicktour_train
         :dedent: 4
 
+.. only:: node
+
+    .. literalinclude:: ../../bindings/node/examples/documentation/quicktour.test.ts
+        :language: javascript
+        :start-after: START train
+        :end-before: END train
+        :dedent: 8
+
 This should only take a few seconds to train our tokenizer on the full wikitext dataset! Once this
 is done, we need to save the model and reinstantiate it with the unknown token, or this token won't
 be used. This will be simplified in a further release, to let you set the :entity:`unk_token` when
@@ -191,6 +223,14 @@ first instantiating the model.
         :end-before: END quicktour_reload_model
         :dedent: 4
 
+.. only:: node
+
+    .. literalinclude:: ../../bindings/node/examples/documentation/quicktour.test.ts
+        :language: javascript
+        :start-after: START reload_model
+        :end-before: END reload_model
+        :dedent: 8
+
 To save the tokenizer in one file that contains all its configuration and vocabulary, just use the
 :entity:`Tokenizer.save` method:
 
@@ -210,6 +250,14 @@ To save the tokenizer in one file that contains all its configuration and vocabulary, just use the
         :end-before: END quicktour_save
         :dedent: 4
 
+.. only:: node
+
+    .. literalinclude:: ../../bindings/node/examples/documentation/quicktour.test.ts
+        :language: javascript
+        :start-after: START save
+        :end-before: END save
+        :dedent: 8
+
 and you can reload your tokenizer from that file with the :entity:`Tokenizer.from_file`
 :entity:`classmethod`:
 
@@ -229,6 +277,14 @@ and you can reload your tokenizer from that file with the :entity:`Tokenizer.from_file`
         :end-before: END quicktour_reload_tokenizer
         :dedent: 4
 
+.. only:: node
+
+    .. literalinclude:: ../../bindings/node/examples/documentation/quicktour.test.ts
+        :language: javascript
+        :start-after: START reload_tokenizer
+        :end-before: END reload_tokenizer
+        :dedent: 8
+
 Using the tokenizer
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
@@ -251,6 +307,14 @@ Now that we have trained a tokenizer, we can use it on any text we want with the
         :end-before: END quicktour_encode
         :dedent: 4
 
+.. only:: node
+
+    .. literalinclude:: ../../bindings/node/examples/documentation/quicktour.test.ts
+        :language: javascript
+        :start-after: START encode
+        :end-before: END encode
+        :dedent: 8
+
 This applied the full pipeline of the tokenizer on the text, returning an
 :entity:`Encoding` object. To learn more about this pipeline, and how to apply (or
 customize) parts of it, check out :doc:`this page <pipeline>`.
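On the Node side, the same information is read through the Encoding's accessor methods, all of which appear later in the quicktour test:

    output.getTokens();        // string[]
    output.getIds();           // number[]
    output.getOffsets();       // [number, number] character offsets, one pair per token
    output.getTypeIds();       // number[]
    output.getAttentionMask(); // number[]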
@@ -275,6 +339,14 @@ tokens:
         :end-before: END quicktour_print_tokens
         :dedent: 4
 
+.. only:: node
+
+    .. literalinclude:: ../../bindings/node/examples/documentation/quicktour.test.ts
+        :language: javascript
+        :start-after: START print_tokens
+        :end-before: END print_tokens
+        :dedent: 8
+
 Similarly, the :obj:`ids` attribute will contain the index of each of those tokens in the
 tokenizer's vocabulary:
 
@@ -294,6 +366,14 @@ tokenizer's vocabulary:
         :end-before: END quicktour_print_ids
         :dedent: 4
 
+.. only:: node
+
+    .. literalinclude:: ../../bindings/node/examples/documentation/quicktour.test.ts
+        :language: javascript
+        :start-after: START print_ids
+        :end-before: END print_ids
+        :dedent: 8
+
 An important feature of the 🤗 Tokenizers library is that it comes with full alignment tracking,
 meaning you can always get the part of your original sentence that corresponds to a given token.
 Those are stored in the :obj:`offsets` attribute of our :entity:`Encoding` object. For
@@ -316,6 +396,14 @@ which is the token at index 9 in the list, we can just ask for the offset at the
         :end-before: END quicktour_print_offsets
         :dedent: 4
 
+.. only:: node
+
+    .. literalinclude:: ../../bindings/node/examples/documentation/quicktour.test.ts
+        :language: javascript
+        :start-after: START print_offsets
+        :end-before: END print_offsets
+        :dedent: 8
+
 and those are the indices that correspond to the emoji in the original sentence:
 
 .. only:: python
@@ -334,6 +422,14 @@ and those are the indices that correspond to the emoji in the original sentence:
         :end-before: END quicktour_use_offsets
         :dedent: 4
 
+.. only:: node
+
+    .. literalinclude:: ../../bindings/node/examples/documentation/quicktour.test.ts
+        :language: javascript
+        :start-after: START use_offsets
+        :end-before: END use_offsets
+        :dedent: 8
+
 Post-processing
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
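The Node snippet pulls `slice` from `tokenizers/bindings/utils` instead of using `String.prototype.slice` because native JavaScript slicing counts UTF-16 code units while the offsets count characters; the emoji is a surrogate pair, so the two indexings disagree. A sketch of the difference, using the offsets from the test above:

    let { slice } = require("tokenizers/bindings/utils");

    let sentence = "Hello, y'all! How are you 😁 ?";
    slice(sentence, 26, 27);   // "😁" (character-based, agrees with the offsets)
    sentence.slice(26, 27);    // only half of the surrogate pair (UTF-16-based)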
@@ -362,6 +458,14 @@ list of special tokens, so this should be their IDs. To double-check, we can use
         :end-before: END quicktour_check_sep
         :dedent: 4
 
+.. only:: node
+
+    .. literalinclude:: ../../bindings/node/examples/documentation/quicktour.test.ts
+        :language: javascript
+        :start-after: START check_sep
+        :end-before: END check_sep
+        :dedent: 8
+
 Here is how we can set the post-processing to give us the traditional BERT inputs:
 
 .. only:: python
@@ -380,6 +484,14 @@ Here is how we can set the post-processing to give us the traditional BERT inputs:
         :end-before: END quicktour_init_template_processing
         :dedent: 4
 
+.. only:: node
+
+    .. literalinclude:: ../../bindings/node/examples/documentation/quicktour.test.ts
+        :language: javascript
+        :start-after: START init_template_processing
+        :end-before: END init_template_processing
+        :dedent: 8
+
 Let's go over this snippet of code in more detail. First we specify the template for single
 sentences: those should have the form :obj:`"[CLS] $A [SEP]"` where :obj:`$A` represents our
 sentence.
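For the Node reader, the same three arguments appear in the `templateProcessing` call from the test file, annotated here for reading (the code itself is copied from the diff above):

    tokenizer.setPostProcessor(templateProcessing(
        "[CLS] $A [SEP]",               // single sentences: $A stands for the sentence
        "[CLS] $A [SEP] $B:1 [SEP]:1",  // pairs: $B is the second sentence, ":1" marks type ID 1
        [                               // the special tokens used in the templates, with their IDs
            ["[CLS]", tokenizer.tokenToId("[CLS]")],
            ["[SEP]", tokenizer.tokenToId("[SEP]")],
        ],
    ));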
@@ -410,6 +522,14 @@ To check that this worked properly, let's try to encode the same sentence as before:
         :end-before: END quicktour_print_special_tokens
         :dedent: 4
 
+.. only:: node
+
+    .. literalinclude:: ../../bindings/node/examples/documentation/quicktour.test.ts
+        :language: javascript
+        :start-after: START print_special_tokens
+        :end-before: END print_special_tokens
+        :dedent: 8
+
 To check the results on a pair of sentences, we just pass the two sentences to
 :entity:`Tokenizer.encode`:
 
@@ -429,6 +549,14 @@ To check the results on a pair of sentences, we just pass the two sentences to
         :end-before: END quicktour_print_special_tokens_pair
         :dedent: 4
 
+.. only:: node
+
+    .. literalinclude:: ../../bindings/node/examples/documentation/quicktour.test.ts
+        :language: javascript
+        :start-after: START print_special_tokens_pair
+        :end-before: END print_special_tokens_pair
+        :dedent: 8
+
 You can then check that the type IDs attributed to each token are correct with
 
 .. only:: python
@@ -447,6 +575,14 @@ You can then check that the type IDs attributed to each token are correct with
         :end-before: END quicktour_print_type_ids
         :dedent: 4
 
+.. only:: node
+
+    .. literalinclude:: ../../bindings/node/examples/documentation/quicktour.test.ts
+        :language: javascript
+        :start-after: START print_type_ids
+        :end-before: END print_type_ids
+        :dedent: 8
+
 If you save your tokenizer with :entity:`Tokenizer.save`, the post-processor will be saved along.
 
 Encoding multiple sentences in a batch
@@ -471,6 +607,14 @@ using the :entity:`Tokenizer.encode_batch` method:
         :end-before: END quicktour_encode_batch
         :dedent: 4
 
+.. only:: node
+
+    .. literalinclude:: ../../bindings/node/examples/documentation/quicktour.test.ts
+        :language: javascript
+        :start-after: START encode_batch
+        :end-before: END encode_batch
+        :dedent: 8
+
 The output is then a list of :entity:`Encoding` objects like the ones we saw before. You
 can process together as many texts as you like, as long as they fit in memory.
 
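As with `encode`, the Node `encodeBatch` is callback-style, and its promisified form resolves to an array of Encoding objects, one per input (all calls shown in the test file above):

    let encodeBatch = promisify(tokenizer.encodeBatch.bind(tokenizer));

    let outputs = await encodeBatch(["Hello, y'all!", "How are you 😁 ?"]);
    outputs.map(o => o.getTokens()); // one token list per input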
@@ -494,6 +638,14 @@ B:
         :end-before: END quicktour_encode_batch_pair
         :dedent: 4
 
+.. only:: node
+
+    .. literalinclude:: ../../bindings/node/examples/documentation/quicktour.test.ts
+        :language: javascript
+        :start-after: START encode_batch_pair
+        :end-before: END encode_batch_pair
+        :dedent: 8
+
 When encoding multiple sentences, you can automatically pad the outputs to the longest sentence
 present by using :entity:`Tokenizer.enable_padding`, with the :entity:`pad_token` and its ID
 (which we can double-check the id for the padding token with
@@ -515,6 +667,14 @@ present by using :entity:`Tokenizer.enable_padding`, with the :entity:`pad_token` and its ID
         :end-before: END quicktour_enable_padding
         :dedent: 4
 
+.. only:: node
+
+    .. literalinclude:: ../../bindings/node/examples/documentation/quicktour.test.ts
+        :language: javascript
+        :start-after: START enable_padding
+        :end-before: END enable_padding
+        :dedent: 8
+
 We can set the :obj:`direction` of the padding (defaults to the right) or a given :obj:`length` if
 we want to pad every sample to that specific number (here we leave it unset to pad to the size of
 the longest text).
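The Node test passes only `padId` and `padToken`. A hedged sketch of a fuller options object; `direction` and `length` are assumed to mirror the Python argument names mentioned in the text and do not appear anywhere in this diff:

    tokenizer.setPadding({
        padId: 3,
        padToken: "[PAD]",
        direction: "right",  // assumed option name; the default per the text
        length: 128          // assumed option name; pad every sample to a fixed size
    });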
@@ -535,6 +695,14 @@ the longest text).
         :end-before: END quicktour_print_batch_tokens
         :dedent: 4
 
+.. only:: node
+
+    .. literalinclude:: ../../bindings/node/examples/documentation/quicktour.test.ts
+        :language: javascript
+        :start-after: START print_batch_tokens
+        :end-before: END print_batch_tokens
+        :dedent: 8
+
 In this case, the `attention mask` generated by the tokenizer takes the padding into account:
 
 .. only:: python
@@ -553,6 +721,14 @@ In this case, the `attention mask` generated by the tokenizer takes the padding into account:
         :end-before: END quicktour_print_attention_mask
         :dedent: 4
 
+.. only:: node
+
+    .. literalinclude:: ../../bindings/node/examples/documentation/quicktour.test.ts
+        :language: javascript
+        :start-after: START print_attention_mask
+        :end-before: END print_attention_mask
+        :dedent: 8
+
 .. _pretrained:
 
 .. only:: python