/* eslint-disable */
var globRequire = require

console.log = (..._args: any[]) => {}

describe('quicktourExample', () => {
  function require(mod: string) {
    if (mod.startsWith('tokenizers')) {
      return globRequire('../../')
    } else {
      return globRequire(mod)
    }
  }

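  // The training test below is skipped by default: its `train` step expects the
  // raw wikitext-103 files under data/wikitext-103-raw/, which presumably need
  // to be downloaded separately before the test can run.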
  it.skip('trains the tokenizer', async () => {
    // START init_tokenizer
    let { Tokenizer } = require('tokenizers')
    let { BPE } = require('tokenizers')

    let tokenizer = new Tokenizer(BPE.init({}, [], { unkToken: '[UNK]' }))
    // END init_tokenizer
    // START init_trainer
    let { bpeTrainer } = require('tokenizers')

    let trainer = bpeTrainer({
      specialTokens: ['[UNK]', '[CLS]', '[SEP]', '[PAD]', '[MASK]'],
    })
    // END init_trainer
    // START init_pretok
    let { whitespacePreTokenizer } = require('tokenizers')

    tokenizer.setPreTokenizer(whitespacePreTokenizer())
    // END init_pretok
    // START train
    let files = ['test', 'train', 'valid'].map((split) => `data/wikitext-103-raw/wiki.${split}.raw`)
    tokenizer.train(files, trainer)
    // END train
    // START save
    tokenizer.save('data/tokenizer-wiki.json')
    // END save
  })

  it('shows a quicktour example', async () => {
    let { Tokenizer } = require('tokenizers')

    // START reload_tokenizer
    let tokenizer = Tokenizer.fromFile('data/tokenizer-wiki.json')
    // END reload_tokenizer
    // START encode

    var output = await tokenizer.encode("Hello, y'all! How are you 😁 ?")
    // END encode
    // START print_tokens
    console.log(output.getTokens())
    // ["Hello", ",", "y", "'", "all", "!", "How", "are", "you", "[UNK]", "?"]
    // END print_tokens
    expect(output.getTokens()).toEqual(['Hello', ',', 'y', "'", 'all', '!', 'How', 'are', 'you', '[UNK]', '?'])
    // START print_ids
    console.log(output.getIds())
    // [27253, 16, 93, 11, 5097, 5, 7961, 5112, 6218, 0, 35]
    // END print_ids
    expect(output.getIds()).toEqual([27253, 16, 93, 11, 5097, 5, 7961, 5112, 6218, 0, 35])
    // START print_offsets
    let offsets = output.getOffsets()
    console.log(offsets[9])
    // (26, 27)
    // END print_offsets
    expect(offsets[9]).toEqual([26, 27])
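    // Offsets index into the original input string. The quicktour uses the `slice`
    // helper exported by the bindings rather than String.prototype.slice, presumably
    // because offsets count characters while JS strings are UTF-16 (the 😁 emoji at
    // offsets (26, 27) spans two UTF-16 code units).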
    // START use_offsets
    let { slice } = require('tokenizers')

    let sentence = "Hello, y'all! How are you 😁 ?"
    let [start, end] = offsets[9]
    console.log(slice(sentence, start, end))
    // "😁"
    // END use_offsets
    expect(slice(sentence, start, end)).toEqual('😁')
    // START check_sep
    console.log(tokenizer.tokenToId('[SEP]'))
    // 2
    // END check_sep
    expect(tokenizer.tokenToId('[SEP]')).toEqual(2)
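    // templateProcessing takes a template for single sentences, one for sentence
    // pairs, and the special tokens with their ids. In the templates, $A is the
    // first sequence, $B the second, and a :1 suffix assigns type id 1 (type id 0
    // is the default).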
    // START init_template_processing
    let { templateProcessing } = require('tokenizers')

    tokenizer.setPostProcessor(
      templateProcessing('[CLS] $A [SEP]', '[CLS] $A [SEP] $B:1 [SEP]:1', [
        ['[CLS]', tokenizer.tokenToId('[CLS]')],
        ['[SEP]', tokenizer.tokenToId('[SEP]')],
      ]),
    )
    // END init_template_processing
    // START print_special_tokens
    var output = await tokenizer.encode("Hello, y'all! How are you 😁 ?")
    console.log(output.getTokens())
    // ["[CLS]", "Hello", ",", "y", "'", "all", "!", "How", "are", "you", "[UNK]", "?", "[SEP]"]
    // END print_special_tokens
    expect(output.getTokens()).toEqual([
      '[CLS]',
      'Hello',
      ',',
      'y',
      "'",
      'all',
      '!',
      'How',
      'are',
      'you',
      '[UNK]',
      '?',
      '[SEP]',
    ])
    // START print_special_tokens_pair
    var output = await tokenizer.encode("Hello, y'all!", 'How are you 😁 ?')
    console.log(output.getTokens())
    // ["[CLS]", "Hello", ",", "y", "'", "all", "!", "[SEP]", "How", "are", "you", "[UNK]", "?", "[SEP]"]
    // END print_special_tokens_pair
    expect(output.getTokens()).toEqual([
      '[CLS]',
      'Hello',
      ',',
      'y',
      "'",
      'all',
      '!',
      '[SEP]',
      'How',
      'are',
      'you',
      '[UNK]',
      '?',
      '[SEP]',
    ])
    // START print_type_ids
    console.log(output.getTypeIds())
    // [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]
    // END print_type_ids
    expect(output.getTypeIds()).toEqual([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1])
    // START encode_batch

    var output = await tokenizer.encodeBatch(["Hello, y'all!", 'How are you 😁 ?'])
    // END encode_batch
    // START encode_batch_pair
    // var output = await tokenizer.encodeBatch(
    //   [["Hello, y'all!", "How are you 😁 ?"], ["Hello to you too!", "I'm fine, thank you!"]]
    // );
    // END encode_batch_pair
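    // setPadding enables padding for subsequent encodings; padId and padToken set
    // the id and token used as padding. No fixed length is given here, so each
    // batch is padded to its longest sequence, as the [PAD] in the shorter
    // encoding below shows.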
    // START enable_padding
    tokenizer.setPadding({ padId: 3, padToken: '[PAD]' })
    // END enable_padding
    // START print_batch_tokens
    var output = await tokenizer.encodeBatch(["Hello, y'all!", 'How are you 😁 ?'])
    console.log(output[1].getTokens())
    // ["[CLS]", "How", "are", "you", "[UNK]", "?", "[SEP]", "[PAD]"]
    // END print_batch_tokens
    expect(output[1].getTokens()).toEqual(['[CLS]', 'How', 'are', 'you', '[UNK]', '?', '[SEP]', '[PAD]'])
    // START print_attention_mask
    console.log(output[1].getAttentionMask())
    // [1, 1, 1, 1, 1, 1, 1, 0]
    // END print_attention_mask
    expect(output[1].getAttentionMask()).toEqual([1, 1, 1, 1, 1, 1, 1, 0])
  })
})