mirror of https://github.com/mii443/tokenizers.git (synced 2025-08-22 16:25:30 +00:00)
Node - Trainers train the Model in-place
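The Node bindings' `Tokenizer.train` now takes the files first and the trainer second, `train(files, trainer)`, and the Trainer updates the Model in-place. The documentation tests therefore drop the save-and-reload step (`getModel().save(...)` followed by a promisified `fromFile` and `setModel`) and simply serialize the trained tokenizer with `tokenizer.save(...)`.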
@@ -94,7 +94,7 @@ describe("pipelineExample", () => {
    let { Tokenizer } = require("tokenizers/bindings/tokenizer");
    let { WordPiece } = require("tokenizers/bindings/models");

-   let bertTokenizer = new Tokenizer(WordPiece.empty());
+   let bertTokenizer = new Tokenizer(WordPiece.init({}, { unkToken: "[UNK]" }));
    // END bert_setup_tokenizer
    // START bert_setup_normalizer
    let { sequenceNormalizer, lowercaseNormalizer, nfdNormalizer, stripAccentsNormalizer }
@@ -120,20 +120,13 @@ describe("pipelineExample", () => {
    // END bert_setup_processor
    // START bert_train_tokenizer
    let { wordPieceTrainer } = require("tokenizers/bindings/trainers");
-   let { promisify } = require("util");

    let trainer = wordPieceTrainer({
      vocabSize: 30522,
      specialTokens: ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]
    });
    let files = ["test", "train", "valid"].map(split => `data/wikitext-103-raw/wiki.${split}.raw`);
-   bertTokenizer.train(trainer, files);
-
-   let modelFiles = bertTokenizer.getModel().save("data", "bert-wiki");
-   let fromFile = promisify(WordPiece.fromFile);
-   bertTokenizer.setModel(await fromFile(modelFiles[0], {
-     unkToken: "[UNK]"
-   }));
+   bertTokenizer.train(files, trainer);
+
+   bertTokenizer.save("data/bert-wiki.json")
    // END bert_train_tokenizer

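For reference, the post-change bert_train_tokenizer flow assembled from the hunk above; a minimal sketch using the same paths and options as the test:

    let { Tokenizer } = require("tokenizers/bindings/tokenizer");
    let { WordPiece } = require("tokenizers/bindings/models");
    let { wordPieceTrainer } = require("tokenizers/bindings/trainers");

    // Empty WordPiece model with an unknown token; util.promisify is no longer needed.
    let bertTokenizer = new Tokenizer(WordPiece.init({}, { unkToken: "[UNK]" }));

    let trainer = wordPieceTrainer({
      vocabSize: 30522,
      specialTokens: ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]
    });
    let files = ["test", "train", "valid"].map(split => `data/wikitext-103-raw/wiki.${split}.raw`);

    // New argument order: files first, then the trainer; the Model is trained in-place.
    bertTokenizer.train(files, trainer);

    // Save the whole tokenizer; no getModel().save() / setModel() round-trip.
    bertTokenizer.save("data/bert-wiki.json");
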
@@ -16,7 +16,7 @@ describe("quicktourExample", () => {
    let { Tokenizer } = require("tokenizers/bindings/tokenizer");
    let { BPE } = require("tokenizers/bindings/models");

-   let tokenizer = new Tokenizer(BPE.empty());
+   let tokenizer = new Tokenizer(BPE.init({}, [], { unkToken: "[UNK]" }));
    // END init_tokenizer
    // START init_trainer
    let { bpeTrainer } = require("tokenizers/bindings/trainers");
@@ -32,17 +32,8 @@ describe("quicktourExample", () => {
    // END init_pretok
    // START train
    let files = ["test", "train", "valid"].map(split => `data/wikitext-103-raw/wiki.${split}.raw`);
-   tokenizer.train(trainer, files);
+   tokenizer.train(files, trainer);
    // END train
-   // START reload_model
-   let { promisify } = require("util");
-
-   let modelFiles = tokenizer.getModel().save("data", "wiki");
-   let fromFile = promisify(BPE.fromFile);
-   tokenizer.setModel(await fromFile(modelFiles[0], modelFiles[1], {
-     unkToken: "[UNK]"
-   }));
-   // END reload_model
    // START save
    tokenizer.save("data/tokenizer-wiki.json");
    // END save

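Likewise the quicktour flow after this change, assembled from the two hunks above. This is a sketch: the bpeTrainer options here are an assumption by analogy with wordPieceTrainer, since the actual init_trainer block is outside this diff:

    let { Tokenizer } = require("tokenizers/bindings/tokenizer");
    let { BPE } = require("tokenizers/bindings/models");
    let { bpeTrainer } = require("tokenizers/bindings/trainers");

    // Empty BPE model with an unknown token.
    let tokenizer = new Tokenizer(BPE.init({}, [], { unkToken: "[UNK]" }));

    // Assumed options; the real init_trainer block is not shown in this diff.
    let trainer = bpeTrainer({ specialTokens: ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"] });

    let files = ["test", "train", "valid"].map(split => `data/wikitext-103-raw/wiki.${split}.raw`);
    tokenizer.train(files, trainer);  // trains the BPE model in-place

    // Serialize the trained tokenizer as a single JSON file.
    tokenizer.save("data/tokenizer-wiki.json");
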