tokenizers/bindings/node/examples/documentation/quicktour.test.ts

/* eslint-disable */
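// Keep a handle on Node's real require, and replace console.log with a no-op
// so the documentation snippets below can log freely without cluttering test output.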
var globRequire = require
console.log = (..._args: any[]) => {}
describe('quicktourExample', () => {
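  // Scoped require shim: any 'tokenizers' import inside the snippets is
  // redirected to the locally built bindings at '../../'.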
function require(mod: string) {
if (mod.startsWith('tokenizers')) {
return globRequire('../../')
} else {
return globRequire(mod)
}
}
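  // Skipped by default: training reads the wikitext-103-raw files referenced
  // below, which must be downloaded separately and present locally.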
it.skip('trains the tokenizer', async () => {
// START init_tokenizer
let { Tokenizer } = require('tokenizers')
let { BPE } = require('tokenizers')
let tokenizer = new Tokenizer(BPE.init({}, [], { unkToken: '[UNK]' }))
// END init_tokenizer
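    // The BPE model starts with an empty vocabulary and no merges; training below
    // fills them in. '[UNK]' is emitted for pieces the model cannot represent.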
// START init_trainer
let { bpeTrainer } = require('tokenizers')
let trainer = bpeTrainer({
specialTokens: ['[UNK]', '[CLS]', '[SEP]', '[PAD]', '[MASK]'],
})
// END init_trainer
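    // Special tokens are added to the vocabulary first, in this order, so they
    // receive ids 0..4 ('[UNK]' = 0, '[CLS]' = 1, '[SEP]' = 2, '[PAD]' = 3, '[MASK]' = 4).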
// START init_pretok
let { whitespacePreTokenizer } = require('tokenizers')
tokenizer.setPreTokenizer(whitespacePreTokenizer())
// END init_pretok
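    // The Whitespace pre-tokenizer splits the input into words and punctuation
    // before the BPE model sees it.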
// START train
let files = ['test', 'train', 'valid'].map((split) => `data/wikitext-103-raw/wiki.${split}.raw`)
tokenizer.train(files, trainer)
// END train
// START save
tokenizer.save('data/tokenizer-wiki.json')
// END save
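    // The saved JSON holds the full tokenizer configuration (model, vocabulary,
    // pre-tokenizer, ...) and is reloaded below with Tokenizer.fromFile.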
})
it('shows a quicktour example', async () => {
let { Tokenizer } = require('tokenizers')
// START reload_tokenizer
let tokenizer = Tokenizer.fromFile('data/tokenizer-wiki.json')
// END reload_tokenizer
// START encode
var output = await tokenizer.encode("Hello, y'all! How are you 😁 ?")
// END encode
// START print_tokens
console.log(output.getTokens())
// ["Hello", ",", "y", "'", "all", "!", "How", "are", "you", "[UNK]", "?"]
// END print_tokens
expect(output.getTokens()).toEqual(['Hello', ',', 'y', "'", 'all', '!', 'How', 'are', 'you', '[UNK]', '?'])
// START print_ids
console.log(output.getIds())
// [27253, 16, 93, 11, 5097, 5, 7961, 5112, 6218, 0, 35]
// END print_ids
expect(output.getIds()).toEqual([27253, 16, 93, 11, 5097, 5, 7961, 5112, 6218, 0, 35])
// START print_offsets
let offsets = output.getOffsets()
console.log(offsets[9])
// (26, 27)
// END print_offsets
expect(offsets[9]).toEqual([26, 27])
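    // Offsets index into the original sentence, so they can recover the text behind
    // the '[UNK]' token. The bindings' slice helper is used below because the emoji
    // occupies two UTF-16 code units, and String.prototype.slice on these offsets
    // would cut it in half.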
// START use_offsets
let { slice } = require('tokenizers')
let sentence = "Hello, y'all! How are you 😁 ?"
let [start, end] = offsets[9]
console.log(slice(sentence, start, end))
// "😁"
// END use_offsets
expect(slice(sentence, start, end)).toEqual('😁')
// START check_sep
console.log(tokenizer.tokenToId('[SEP]'))
// 2
// END check_sep
expect(tokenizer.tokenToId('[SEP]')).toEqual(2)
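    // '[SEP]' resolves to id 2 because it was the third entry in the trainer's
    // specialTokens list above.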
// START init_template_processing
let { templateProcessing } = require('tokenizers')
tokenizer.setPostProcessor(
templateProcessing('[CLS] $A [SEP]', '[CLS] $A [SEP] $B:1 [SEP]:1', [
['[CLS]', tokenizer.tokenToId('[CLS]')],
['[SEP]', tokenizer.tokenToId('[SEP]')],
]),
)
// END init_template_processing
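    // Template syntax: $A is the first sequence and $B the second; ':1' assigns
    // type id 1 to a piece (the default is 0). The pairs map each special token
    // used in the templates to its id in the vocabulary.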
// START print_special_tokens
var output = await tokenizer.encode("Hello, y'all! How are you 😁 ?")
console.log(output.getTokens())
// ["[CLS]", "Hello", ",", "y", "'", "all", "!", "How", "are", "you", "[UNK]", "?", "[SEP]"]
// END print_special_tokens
expect(output.getTokens()).toEqual([
'[CLS]',
'Hello',
',',
'y',
"'",
'all',
'!',
'How',
'are',
'you',
'[UNK]',
'?',
'[SEP]',
])
// START print_special_tokens_pair
var output = await tokenizer.encode("Hello, y'all!", 'How are you 😁 ?')
console.log(output.getTokens())
// ["[CLS]", "Hello", ",", "y", "'", "all", "!", "[SEP]", "How", "are", "you", "[UNK]", "?", "[SEP]"]
// END print_special_tokens_pair
expect(output.getTokens()).toEqual([
'[CLS]',
'Hello',
',',
'y',
"'",
'all',
'!',
'[SEP]',
'How',
'are',
'you',
'[UNK]',
'?',
'[SEP]',
])
// START print_type_ids
console.log(output.getTypeIds())
// [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]
// END print_type_ids
expect(output.getTypeIds()).toEqual([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1])
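    // Type ids separate the two sequences: 0 for the first sentence and its special
    // tokens, 1 for the '$B:1' / '[SEP]:1' part of the template.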
// START encode_batch
var output = await tokenizer.encodeBatch(["Hello, y'all!", 'How are you 😁 ?'])
// END encode_batch
// START encode_batch_pair
// var output = await tokenizer.encodeBatch(
// [["Hello, y'all!", "How are you 😁 ?"], ["Hello to you too!", "I'm fine, thank you!"]]
// );
// END encode_batch_pair
// START enable_padding
tokenizer.setPadding({ padId: 3, padToken: '[PAD]' })
// END enable_padding
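    // padId 3 matches the id of '[PAD]' in the vocabulary trained above (it was the
    // fourth special token). With padding enabled, each batch entry is padded to the
    // length of the longest sequence in the batch.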
// START print_batch_tokens
var output = await tokenizer.encodeBatch(["Hello, y'all!", 'How are you 😁 ?'])
console.log(output[1].getTokens())
// ["[CLS]", "How", "are", "you", "[UNK]", "?", "[SEP]", "[PAD]"]
// END print_batch_tokens
expect(output[1].getTokens()).toEqual(['[CLS]', 'How', 'are', 'you', '[UNK]', '?', '[SEP]', '[PAD]'])
// START print_attention_mask
console.log(output[1].getAttentionMask())
// [1, 1, 1, 1, 1, 1, 1, 0]
// END print_attention_mask
expect(output[1].getAttentionMask()).toEqual([1, 1, 1, 1, 1, 1, 1, 0])
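    // The attention mask flags real tokens with 1 and the padded position with 0,
    // so the '[PAD]' token can be ignored downstream.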
})
})