tokenizers/bindings/node/examples/train.test.js
/*eslint-disable no-undef*/
const {
  Tokenizer,
  models,
  normalizers,
  pre_tokenizers,
  post_processors,
  decoders,
  trainers,
  AddedToken,
} = require("..");

describe("trainExample", () => {
  it("", () => {
    // START train_tokenizer
    const vocabSize = 100;

    // Build a BPE tokenizer with strip + NFC normalization and byte-level
    // pre-tokenization, post-processing, and decoding.
    const tokenizer = new Tokenizer(models.BPE.empty());
    tokenizer.normalizer = normalizers.sequenceNormalizer([
      normalizers.stripNormalizer(),
      normalizers.nfcNormalizer(),
    ]);
    tokenizer.pre_tokenizer = pre_tokenizers.byteLevelPreTokenizer();
    tokenizer.post_processor = post_processors.byteLevelProcessing();
    tokenizer.decoder = decoders.byteLevelDecoder();

    // Configure the BPE trainer with the target vocabulary size and the special tokens.
    const trainer = trainers.bpeTrainer({
      vocabSize,
      minFrequency: 0,
      specialTokens: [
        new AddedToken("<s>", true),
        new AddedToken("<pad>", true),
        new AddedToken("</s>", true),
        new AddedToken("<unk>", true),
        new AddedToken("<mask>", true),
      ],
      showProgress: false,
    });

    // Train on the sample corpus and serialize the resulting tokenizer to disk.
    tokenizer.train(trainer, ["data/small.txt"]);
    tokenizer.save("data/tokenizer.json");
    // END train_tokenizer

    expect(1).toBe(1);
  });
});
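
// The follow-up sketch below is not part of the original example: it shows how the
// tokenizer saved above might be reloaded and used. It assumes this binding version
// exposes Tokenizer.fromFile, a callback-style encode (wrapped here with util.promisify),
// and getTokens() on the returned encoding; treat those names as assumptions rather
// than documented API.
describe("loadTrainedExample", () => {
  it("reloads the trained tokenizer and encodes a sentence", async () => {
    const { promisify } = require("util");

    // Assumes the previous test has already written data/tokenizer.json.
    const loaded = Tokenizer.fromFile("data/tokenizer.json");
    const encode = promisify(loaded.encode.bind(loaded));

    const encoding = await encode("a small test sentence");
    expect(encoding.getTokens().length).toBeGreaterThan(0);
  });
});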