Node - Trainers train the Model in-place

Anthony MOI
2020-11-19 19:57:50 -05:00
committed by Anthony MOI
parent 387b8a1033
commit 7fc37a03e8
5 changed files with 69 additions and 48 deletions
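
What changed, as a minimal before/after sketch distilled from the diffs below (note that train() also swaps its argument order, from (trainer, files) to (files, trainer)):

// Before: train() left the Tokenizer's model untouched, so the trained model
// had to be saved to disk and re-attached by hand.
tokenizer.train(trainer, files);
let modelFiles = tokenizer.getModel().save("data", "wiki");
tokenizer.setModel(await promisify(BPE.fromFile)(modelFiles[0], modelFiles[1], { unkToken: "[UNK]" }));

// After: train(files, trainer) updates the Tokenizer's model in place, so the
// whole tokenizer can be serialized in one call.
tokenizer.train(files, trainer);
tokenizer.save("data/tokenizer-wiki.json");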

@@ -94,7 +94,7 @@ describe("pipelineExample", () => {
 let { Tokenizer } = require("tokenizers/bindings/tokenizer");
 let { WordPiece } = require("tokenizers/bindings/models");
-let bertTokenizer = new Tokenizer(WordPiece.empty());
+let bertTokenizer = new Tokenizer(WordPiece.init({}, { unkToken: "[UNK]" }));
 // END bert_setup_tokenizer
 // START bert_setup_normalizer
 let { sequenceNormalizer, lowercaseNormalizer, nfdNormalizer, stripAccentsNormalizer }
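
The models are now constructed via init instead of empty(), so the unknown token is set from the start. Reading the new calls (WordPiece here, BPE in the quicktour diff further down), the arguments appear to be a vocab map plus options for WordPiece, and vocab, merges, and options for BPE; that reading is an inference from the diff, not a documented signature:

// Inferred signatures (not confirmed by this diff alone):
//   WordPiece.init(vocab, options)
//   BPE.init(vocab, merges, options)
let model = WordPiece.init(
    {},                   // vocab: empty here, filled in by training
    { unkToken: "[UNK]" } // unknown token; also listed in the trainer's specialTokens below
);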
@@ -120,20 +120,13 @@ describe("pipelineExample", () => {
 // END bert_setup_processor
 // START bert_train_tokenizer
 let { wordPieceTrainer } = require("tokenizers/bindings/trainers");
-let { promisify } = require("util");
 let trainer = wordPieceTrainer({
     vocabSize: 30522,
     specialTokens: ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]
 });
 let files = ["test", "train", "valid"].map(split => `data/wikitext-103-raw/wiki.${split}.raw`);
-bertTokenizer.train(trainer, files);
-let modelFiles = bertTokenizer.getModel().save("data", "bert-wiki");
-let fromFile = promisify(WordPiece.fromFile);
-bertTokenizer.setModel(await fromFile(modelFiles[0], {
-    unkToken: "[UNK]"
-}));
+bertTokenizer.train(files, trainer);
+bertTokenizer.save("data/bert-wiki.json");
 // END bert_train_tokenizer
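
Since training now happens in place, the single JSON file written by save() contains the fully trained tokenizer. Reloading it later should be a one-liner, assuming the Node bindings expose Tokenizer.fromFile mirroring the Python library's Tokenizer.from_file (an assumption, not shown in this diff):

let { Tokenizer } = require("tokenizers/bindings/tokenizer");
// Assumed API: Tokenizer.fromFile, the counterpart of Python's Tokenizer.from_file
let reloaded = Tokenizer.fromFile("data/bert-wiki.json");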

@@ -16,7 +16,7 @@ describe("quicktourExample", () => {
 let { Tokenizer } = require("tokenizers/bindings/tokenizer");
 let { BPE } = require("tokenizers/bindings/models");
-let tokenizer = new Tokenizer(BPE.empty());
+let tokenizer = new Tokenizer(BPE.init({}, [], { unkToken: "[UNK]" }));
 // END init_tokenizer
 // START init_trainer
 let { bpeTrainer } = require("tokenizers/bindings/trainers");
@@ -32,17 +32,8 @@ describe("quicktourExample", () => {
 // END init_pretok
 // START train
 let files = ["test", "train", "valid"].map(split => `data/wikitext-103-raw/wiki.${split}.raw`);
-tokenizer.train(trainer, files);
+tokenizer.train(files, trainer);
 // END train
-// START reload_model
-let { promisify } = require("util");
-let modelFiles = tokenizer.getModel().save("data", "wiki");
-let fromFile = promisify(BPE.fromFile);
-tokenizer.setModel(await fromFile(modelFiles[0], modelFiles[1], {
-    unkToken: "[UNK]"
-}));
-// END reload_model
 // START save
 tokenizer.save("data/tokenizer-wiki.json");
 // END save
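
To sanity-check that the model really was trained in place, one could encode a sentence right after train() with no reload step in between. This sketch assumes encode is callback-style (hence promisify, matching the promisify usage this commit removes) and that encodings expose getTokens():

let { promisify } = require("util");
let encode = promisify(tokenizer.encode.bind(tokenizer));
// Assumed callback-style encode(sequence, pair, callback); pair is null here.
let output = await encode("Hello, y'all! How are you?", null);
console.log(output.getTokens()); // tokens come from the freshly trained model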