/* eslint-disable no-undef */
const {
  Tokenizer,
  models,
  normalizers,
  pre_tokenizers,
  post_processors,
  decoders,
  trainers,
  AddedToken,
} = require("..");

describe("trainExample", () => {
  it("trains a byte-level BPE tokenizer and saves it", () => {
    // START train_tokenizer
    const vocabSize = 100;

    // Start from an empty BPE model and assemble the pipeline around it.
    const tokenizer = new Tokenizer(models.BPE.empty());
    tokenizer.normalizer = normalizers.sequenceNormalizer([
      normalizers.stripNormalizer(),
      normalizers.nfcNormalizer(),
    ]);
    tokenizer.pre_tokenizer = pre_tokenizers.byteLevelPreTokenizer();
    tokenizer.post_processor = post_processors.byteLevelProcessing();
    tokenizer.decoder = decoders.byteLevelDecoder();

    const trainer = trainers.bpeTrainer({
      vocabSize,
      minFrequency: 0,
      // The conventional RoBERTa-style special-token set for byte-level BPE.
      specialTokens: [
        new AddedToken("<s>", true),
        new AddedToken("<pad>", true),
        new AddedToken("</s>", true),
        new AddedToken("<unk>", true),
        new AddedToken("<mask>", true),
      ],
      showProgress: false,
    });

    // Train on the corpus file, then serialize the full tokenizer to JSON.
    tokenizer.train(trainer, ["data/small.txt"]);
    tokenizer.save("data/tokenizer.json");
    // END train_tokenizer
    expect(1).toBe(1);
  });
});
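
// A minimal follow-up sketch (not part of the original file): reload the
// serialized tokenizer and sanity-check it. `Tokenizer.fromFile` and
// `getVocabSize` are assumed to be exposed by these bindings, mirroring the
// Python API; adjust the calls if your bindings version differs.
describe("reloadExample", () => {
  it("reloads the trained tokenizer from disk", () => {
    const tokenizer = Tokenizer.fromFile("data/tokenizer.json");
    // Training stops at the requested vocabulary size, so the reloaded
    // vocabulary should be non-empty and no larger than `vocabSize` above.
    expect(tokenizer.getVocabSize()).toBeGreaterThan(0);
    expect(tokenizer.getVocabSize()).toBeLessThanOrEqual(100);
  });
});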