Doc - Update quicktour for Node
bindings/node/examples/documentation/pipeline.test.ts

@@ -1,4 +1,4 @@
-const globRequire = require;
+var globRequire = require;

 describe("pipelineExample", () => {
   // This is a hack to let us require using path similar to what the user has to use
@@ -82,7 +82,7 @@ describe("pipelineExample", () => {

     // START bert_train_tokenizer
     let { wordPieceTrainer } = require("tokenizers/bindings/trainers");
-    let { promisify } = require("utils");
+    let { promisify } = require("util");

     let trainer = wordPieceTrainer({
       vocabSize: 30522,
@@ -107,7 +107,7 @@ describe("pipelineExample", () => {
     let { Tokenizer } = require("tokenizers/bindings/tokenizer");
     let { WordPiece } = require("tokenizers/bindings/models");

-    let bertTokenizer = Tokenizer(WordPiece.empty());
+    let bertTokenizer = new Tokenizer(WordPiece.empty());
     // END bert_setup_tokenizer
     // START bert_setup_normalizer
     let { sequenceNormalizer, lowercaseNormalizer, nfdNormalizer, stripAccentsNormalizer }
@@ -118,12 +118,12 @@ describe("pipelineExample", () => {
     ]))
     // END bert_setup_normalizer
     // START bert_setup_pre_tokenizer
-    let { whitespacePreTokenizer } = require("tokenizers/bindings/pre_tokenizers");
+    let { whitespacePreTokenizer } = require("tokenizers/bindings/pre-tokenizers");

     bertTokenizer.setPreTokenizer = whitespacePreTokenizer();
     // END bert_setup_pre_tokenizer
     // START bert_setup_processor
-    let { templateProcessing } = require("tokenizers/bindings/processors");
+    let { templateProcessing } = require("tokenizers/bindings/post-processors");

     bertTokenizer.setPostProcessor(templateProcessing(
       "[CLS] $A [SEP]",
bindings/node/examples/documentation/quicktour.test.ts (new file, 190 lines)

@@ -0,0 +1,190 @@
var globRequire = require;

describe("quicktourExample", () => {
  function require(mod: string) {
    if (mod.startsWith("tokenizers/")) {
      let path = mod.slice("tokenizers/".length);
      return globRequire("../../lib/" + path);
    } else {
      return globRequire(mod);
    }
  }

  it.skip("trains the tokenizer", async () => {
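    // Training is skipped here, presumably because it needs the raw wikitext-103
    // files (downloaded separately into data/) and takes a while to run.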
    // START init_tokenizer
    let { Tokenizer } = require("tokenizers/bindings/tokenizer");
    let { BPE } = require("tokenizers/bindings/models");

    let tokenizer = new Tokenizer(BPE.empty());
    // END init_tokenizer
    // START init_trainer
    let { bpeTrainer } = require("tokenizers/bindings/trainers");

    let trainer = bpeTrainer({
      specialTokens: ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]
    });
    // END init_trainer
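    // Only specialTokens is overridden above; as with wordPieceTrainer in
    // pipeline.test.ts (vocabSize: 30522), the trainer can be tuned further.
    // Illustrative sketch only (the exact option names are assumed):
    // let trainer = bpeTrainer({ vocabSize: 30000, specialTokens: ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"] });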
    // START init_pretok
    let { whitespacePreTokenizer } = require("tokenizers/bindings/pre-tokenizers");

    tokenizer.setPreTokenizer(whitespacePreTokenizer());
    // END init_pretok
    // START train
    let files = ["test", "train", "valid"].map(split => `data/wikitext-103-raw/wiki.${split}.raw`);
    tokenizer.train(trainer, files);
    // END train
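    // train() reads the wikitext files from disk, so they must exist first.
    // A quick sanity check could look like this (sketch using Node's fs module):
    // let fs = require("fs");
    // files.forEach(file => { if (!fs.existsSync(file)) throw new Error(`missing ${file}`); });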
    // START reload_model
    let { promisify } = require("util");

    let modelFiles = tokenizer.getModel().save("data", "wiki");
    let fromFile = promisify(BPE.fromFile);
    tokenizer.setModel(await fromFile(modelFiles[0], modelFiles[1], {
      unkToken: "[UNK]"
    }));
    // END reload_model
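    // getModel().save() returns the paths of the files it wrote (for BPE, the
    // vocabulary and merges files), and BPE.fromFile takes a Node-style callback,
    // hence the promisify wrapper above.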
    // START save
    tokenizer.save("data/tokenizer-wiki.json");
    // END save
  });

  it("shows a quicktour example", async () => {
    let { Tokenizer } = require("tokenizers/bindings/tokenizer");
    let console = {
      log: (..._args: any[]) => {}
    };

    // START reload_tokenizer
    let tokenizer = Tokenizer.fromFile("data/tokenizer-wiki.json");
    // END reload_tokenizer
    // START encode
    let { promisify } = require('util');
    let encode = promisify(tokenizer.encode.bind(tokenizer));

    var output = await encode("Hello, y'all! How are you 😁 ?");
    // END encode
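    // encode() itself takes a Node-style callback; the promisified call above is
    // roughly equivalent to this sketch:
    // tokenizer.encode("Hello, y'all! How are you 😁 ?", (err, encoding) => { /* ... */ });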
    // START print_tokens
    console.log(output.getTokens());
    // ["Hello", ",", "y", "'", "all", "!", "How", "are", "you", "[UNK]", "?"]
    // END print_tokens
    expect(output.getTokens()).toEqual([
      "Hello",
      ",",
      "y",
      "'",
      "all",
      "!",
      "How",
      "are",
      "you",
      "[UNK]",
      "?",
    ]);
    // START print_ids
    console.log(output.getIds());
    // [27253, 16, 93, 11, 5097, 5, 7961, 5112, 6218, 0, 35]
    // END print_ids
    expect(output.getIds()).toEqual([27253, 16, 93, 11, 5097, 5, 7961, 5112, 6218, 0, 35]);
    // START print_offsets
    let offsets = output.getOffsets();
    console.log(offsets[9]);
    // (26, 27)
    // END print_offsets
    expect(offsets[9]).toEqual([26, 27]);
    // START use_offsets
    let { slice } = require("tokenizers/bindings/utils");

    let sentence = "Hello, y'all! How are you 😁 ?"
    let [start, end] = offsets[9];
    console.log(slice(sentence, start, end));
    // "😁"
    // END use_offsets
    expect(slice(sentence, start, end)).toEqual("😁");
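    // The slice helper from the bindings is used rather than String.prototype.slice
    // because the offsets count unicode characters (see (26, 27) above), while
    // JavaScript strings index UTF-16 code units ("😁".length === 2), so
    // sentence.slice(26, 27) would return only half of the emoji's surrogate pair.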
    // START check_sep
    console.log(tokenizer.tokenToId("[SEP]"));
    // 2
    // END check_sep
    expect(tokenizer.tokenToId("[SEP]")).toEqual(2);
    // START init_template_processing
    let { templateProcessing } = require("tokenizers/bindings/post-processors");

    tokenizer.setPostProcessor(templateProcessing(
      "[CLS] $A [SEP]",
      "[CLS] $A [SEP] $B:1 [SEP]:1",
      [
        ["[CLS]", tokenizer.tokenToId("[CLS]")],
        ["[SEP]", tokenizer.tokenToId("[SEP]")],
      ],
    ));
    // END init_template_processing
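    // templateProcessing takes the template for a single sequence, the template for
    // a pair, and the special tokens used in those templates with their ids.
    // "$A" and "$B" stand for the first and second sequence, and ":1" assigns type
    // id 1 to a piece (the default is 0), which is what getTypeIds() shows below.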
    // START print_special_tokens
    var output = await encode("Hello, y'all! How are you 😁 ?");
    console.log(output.getTokens());
    // ["[CLS]", "Hello", ",", "y", "'", "all", "!", "How", "are", "you", "[UNK]", "?", "[SEP]"]
    // END print_special_tokens
    expect(output.getTokens()).toEqual([
      "[CLS]",
      "Hello",
      ",",
      "y",
      "'",
      "all",
      "!",
      "How",
      "are",
      "you",
      "[UNK]",
      "?",
      "[SEP]",
    ]);
    // START print_special_tokens_pair
    var output = await encode("Hello, y'all!", "How are you 😁 ?");
    console.log(output.getTokens());
    // ["[CLS]", "Hello", ",", "y", "'", "all", "!", "[SEP]", "How", "are", "you", "[UNK]", "?", "[SEP]"]
    // END print_special_tokens_pair
    expect(output.getTokens()).toEqual([
      "[CLS]",
      "Hello",
      ",",
      "y",
      "'",
      "all",
      "!",
      "[SEP]",
      "How",
      "are",
      "you",
      "[UNK]",
      "?",
      "[SEP]",
    ]);
    // START print_type_ids
    console.log(output.getTypeIds());
    // [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]
    // END print_type_ids
    expect(output.getTypeIds()).toEqual([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]);
    // START encode_batch
    let encodeBatch = promisify(tokenizer.encodeBatch.bind(tokenizer));

    var output = await encodeBatch(["Hello, y'all!", "How are you 😁 ?"]);
    // END encode_batch
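    // encodeBatch returns one encoding object per input, which is why the padded
    // batch further down is inspected through output[1].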
    // START encode_batch_pair
    var output = await encodeBatch(
      [["Hello, y'all!", "How are you 😁 ?"], ["Hello to you too!", "I'm fine, thank you!"]]
    );
    // END encode_batch_pair
    // START enable_padding
    tokenizer.setPadding({ padId: 3, padToken: "[PAD]" });
    // END enable_padding
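    // padId 3 matches the position of "[PAD]" in the specialTokens list used for
    // training ("[UNK]" = 0, "[CLS]" = 1, "[SEP]" = 2, "[PAD]" = 3, consistent with
    // tokenToId("[SEP]") === 2 above). No fixed length is given, so each batch is
    // padded to its longest sequence.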
    // START print_batch_tokens
    var output = await encodeBatch(["Hello, y'all!", "How are you 😁 ?"]);
    console.log(output[1].getTokens());
    // ["[CLS]", "How", "are", "you", "[UNK]", "?", "[SEP]", "[PAD]"]
    // END print_batch_tokens
    expect(output[1].getTokens()).toEqual(["[CLS]", "How", "are", "you", "[UNK]", "?", "[SEP]", "[PAD]"]);
    // START print_attention_mask
    console.log(output[1].getAttentionMask());
    // [1, 1, 1, 1, 1, 1, 1, 0]
    // END print_attention_mask
    expect(output[1].getAttentionMask()).toEqual([1, 1, 1, 1, 1, 1, 1, 0]);
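    // The attention mask mirrors the padding: 1 for real tokens, 0 for the trailing
    // "[PAD]" added to the shorter sequence.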
  });
});