/* eslint-disable */
var globRequire = require

// Silence console.log so the documentation snippets below don't clutter the test output
console.log = (..._args: any[]) => {}

describe('quicktourExample', () => {
  // Shadow require so that require('tokenizers') resolves to the local build of the bindings
  function require(mod: string) {
    if (mod.startsWith('tokenizers')) {
      return globRequire('../../')
    } else {
      return globRequire(mod)
    }
  }

  // Skipped by default: training reads the wikitext-103-raw files from data/
  it.skip('trains the tokenizer', async () => {
    // START init_tokenizer
    let { Tokenizer } = require('tokenizers')
    let { BPE } = require('tokenizers')

    let tokenizer = new Tokenizer(BPE.init({}, [], { unkToken: '[UNK]' }))
    // END init_tokenizer

    // START init_trainer
    let { bpeTrainer } = require('tokenizers')

    let trainer = bpeTrainer({
      specialTokens: ['[UNK]', '[CLS]', '[SEP]', '[PAD]', '[MASK]'],
    })
    // END init_trainer

    // START init_pretok
    let { whitespacePreTokenizer } = require('tokenizers')

    tokenizer.setPreTokenizer(whitespacePreTokenizer())
    // END init_pretok

    // START train
    let files = ['test', 'train', 'valid'].map((split) => `data/wikitext-103-raw/wiki.${split}.raw`)
    tokenizer.train(files, trainer)
    // END train

    // START save
    tokenizer.save('data/tokenizer-wiki.json')
    // END save
  })

  it('shows a quicktour example', async () => {
    let { Tokenizer } = require('tokenizers')

    // START reload_tokenizer
    let tokenizer = Tokenizer.fromFile('data/tokenizer-wiki.json')
    // END reload_tokenizer

    // START encode
    var output = await tokenizer.encode("Hello, y'all! How are you 😁 ?")
    // END encode

    // START print_tokens
    console.log(output.getTokens())
    // ["Hello", ",", "y", "'", "all", "!", "How", "are", "you", "[UNK]", "?"]
    // END print_tokens
    expect(output.getTokens()).toEqual(['Hello', ',', 'y', "'", 'all', '!', 'How', 'are', 'you', '[UNK]', '?'])

    // START print_ids
    console.log(output.getIds())
    // [27253, 16, 93, 11, 5097, 5, 7961, 5112, 6218, 0, 35]
    // END print_ids
    expect(output.getIds()).toEqual([27253, 16, 93, 11, 5097, 5, 7961, 5112, 6218, 0, 35])

    // START print_offsets
    let offsets = output.getOffsets()
    console.log(offsets[9])
    // (26, 27)
    // END print_offsets
    expect(offsets[9]).toEqual([26, 27])

    // START use_offsets
    let { slice } = require('tokenizers')

    let sentence = "Hello, y'all! How are you 😁 ?"
    let [start, end] = offsets[9]
    console.log(slice(sentence, start, end))
    // "😁"
    // END use_offsets
    expect(slice(sentence, start, end)).toEqual('😁')

    // START check_sep
    console.log(tokenizer.tokenToId('[SEP]'))
    // 2
    // END check_sep
    expect(tokenizer.tokenToId('[SEP]')).toEqual(2)

    // START init_template_processing
    let { templateProcessing } = require('tokenizers')

    tokenizer.setPostProcessor(
      templateProcessing('[CLS] $A [SEP]', '[CLS] $A [SEP] $B:1 [SEP]:1', [
        ['[CLS]', tokenizer.tokenToId('[CLS]')],
        ['[SEP]', tokenizer.tokenToId('[SEP]')],
      ]),
    )
    // END init_template_processing

    // START print_special_tokens
    var output = await tokenizer.encode("Hello, y'all! How are you 😁 ?")
    console.log(output.getTokens())
    // ["[CLS]", "Hello", ",", "y", "'", "all", "!", "How", "are", "you", "[UNK]", "?", "[SEP]"]
    // END print_special_tokens
    expect(output.getTokens()).toEqual([
      '[CLS]',
      'Hello',
      ',',
      'y',
      "'",
      'all',
      '!',
      'How',
      'are',
      'you',
      '[UNK]',
      '?',
      '[SEP]',
    ])

    // START print_special_tokens_pair
    var output = await tokenizer.encode("Hello, y'all!", 'How are you 😁 ?')
    console.log(output.getTokens())
    // ["[CLS]", "Hello", ",", "y", "'", "all", "!", "[SEP]", "How", "are", "you", "[UNK]", "?", "[SEP]"]
    // END print_special_tokens_pair
    expect(output.getTokens()).toEqual([
      '[CLS]',
      'Hello',
      ',',
      'y',
      "'",
      'all',
      '!',
      '[SEP]',
      'How',
      'are',
      'you',
      '[UNK]',
      '?',
      '[SEP]',
    ])

    // START print_type_ids
    console.log(output.getTypeIds())
    // [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]
    // END print_type_ids
    expect(output.getTypeIds()).toEqual([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1])

    // START encode_batch
    var output = await tokenizer.encodeBatch(["Hello, y'all!", 'How are you 😁 ?'])
    // END encode_batch

    // START encode_batch_pair
    // var output = await tokenizer.encodeBatch(
    //   [["Hello, y'all!", "How are you 😁 ?"], ["Hello to you too!", "I'm fine, thank you!"]]
    // );
    // END encode_batch_pair

    // START enable_padding
    tokenizer.setPadding({ padId: 3, padToken: '[PAD]' })
    // END enable_padding

    // START print_batch_tokens
    var output = await tokenizer.encodeBatch(["Hello, y'all!", 'How are you 😁 ?'])
    console.log(output[1].getTokens())
    // ["[CLS]", "How", "are", "you", "[UNK]", "?", "[SEP]", "[PAD]"]
    // END print_batch_tokens
    expect(output[1].getTokens()).toEqual(['[CLS]', 'How', 'are', 'you', '[UNK]', '?', '[SEP]', '[PAD]'])

    // START print_attention_mask
    console.log(output[1].getAttentionMask())
    // [1, 1, 1, 1, 1, 1, 1, 0]
    // END print_attention_mask
    expect(output[1].getAttentionMask()).toEqual([1, 1, 1, 1, 1, 1, 1, 0])
  })
})