Mirror of https://github.com/mii443/tokenizers.git, synced 2025-08-22 16:25:30 +00:00
Doc - Cleanup old tests & node lints
@@ -1,8 +0,0 @@
const tokenizers = require("..");

let bpe = tokenizers.models.BPE.fromFiles(
  "./data/gpt2-vocab.json",
  "./data/gpt2-merges.txt"
);
let tokenizer = new tokenizers.Tokenizer(bpe);
console.log(bpe, tokenizer);
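For context, a hedged sketch (not part of the diff) of what the removed snippet above would look like once it actually encodes something, assuming the pre-cleanup bindings API it was written against and reusing only calls that appear elsewhere in this commit (the promisify pattern and the getTokens/getIds accessors from the removed loadExample test):

const { promisify } = require("util");
const tokenizers = require("..");

// Build the BPE model and tokenizer exactly as the removed snippet did.
const bpe = tokenizers.models.BPE.fromFiles(
  "./data/gpt2-vocab.json",
  "./data/gpt2-merges.txt"
);
const tokenizer = new tokenizers.Tokenizer(bpe);

// encode is callback-based in these bindings, hence the promisify.
const encode = promisify(tokenizer.encode.bind(tokenizer));
encode("Hello world").then((encoding) => {
  console.log(encoding.getTokens(), encoding.getIds());
});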
@@ -1,3 +1,4 @@
/* eslint-disable */
var globRequire = require;

describe("pipelineExample", () => {
@@ -1,3 +1,4 @@
/* eslint-disable */
var globRequire = require;

describe("quicktourExample", () => {
@@ -1,27 +0,0 @@
/*eslint-disable no-undef*/
const tokenizers = require("..");
const { promisify } = require("util");

describe("loadExample", () => {
  beforeAll(async () => {});
  it("", async () => {
    const example = "This is an example";
    const ids = [713, 16, 41, 1246];
    const tokens = ["This", "Ġis", "Ġan", "Ġexample"];

    // START load_tokenizer
    const tokenizer = tokenizers.Tokenizer.fromFile("data/roberta.json");
    // END load_tokenizer

    // You could also use regular callbacks
    const encode = promisify(tokenizer.encode.bind(tokenizer));
    const decode = promisify(tokenizer.decode.bind(tokenizer));

    const encoded = await encode(example);
    expect(encoded.getIds()).toEqual(ids);
    expect(encoded.getTokens()).toEqual(tokens);

    const decoded = await decode(ids);
    expect(decoded).toEqual(example);
  });
});
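The removed test wraps the callback-based encode/decode in promisify; as its comment notes, regular callbacks work as well. A hedged sketch of that callback form (not part of the diff; the (err, result) argument order is an assumption implied by promisify working on these methods):

const tokenizers = require("..");

const tokenizer = tokenizers.Tokenizer.fromFile("data/roberta.json");
// Same round trip as the test above, written with plain callbacks.
tokenizer.encode("This is an example", (err, encoded) => {
  if (err) throw err;
  console.log(encoded.getTokens(), encoded.getIds());
  tokenizer.decode(encoded.getIds(), (decodeErr, decoded) => {
    if (decodeErr) throw decodeErr;
    console.log(decoded); // expected: "This is an example"
  });
});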
@@ -1,47 +0,0 @@
/*eslint-disable no-undef*/

const {
  Tokenizer,
  models,
  normalizers,
  pre_tokenizers,
  post_processors,
  decoders,
  trainers,
  AddedToken,
} = require("..");

describe("trainExample", () => {
  it("", () => {
    // START train_tokenizer
    const vocabSize = 100;

    const tokenizer = new Tokenizer(models.BPE.empty());
    tokenizer.normalizer = normalizers.sequenceNormalizer([
      normalizers.stripNormalizer(),
      normalizers.nfcNormalizer(),
    ]);
    tokenizer.pre_tokenizer = pre_tokenizers.byteLevelPreTokenizer();
    tokenizer.post_processor = post_processors.byteLevelProcessing();
    tokenizer.decoder = decoders.byteLevelDecoder();

    const trainer = trainers.bpeTrainer({
      vocabSize,
      minFrequency: 0,
      specialTokens: [
        new AddedToken("<s>", true),
        new AddedToken("<pad>", true),
        new AddedToken("</s>", true),
        new AddedToken("<unk>", true),
        new AddedToken("<mask>", true),
      ],
      showProgress: false,
    });

    tokenizer.train(trainer, ["data/small.txt"]);
    tokenizer.save("data/tokenizer.json");
    // END train_tokenizer

    expect(1).toBe(1);
  });
});
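A hedged follow-up sketch (not part of the diff): reload the tokenizer that the training snippet above saves and run a quick round trip, reusing Tokenizer.fromFile and the promisify pattern from the removed loadExample test. The exact tokens depend on whatever vocabulary was learned from data/small.txt:

const { promisify } = require("util");
const { Tokenizer } = require("..");

// Load the tokenizer written by tokenizer.save("data/tokenizer.json") above.
const trained = Tokenizer.fromFile("data/tokenizer.json");
const encode = promisify(trained.encode.bind(trained));
const decode = promisify(trained.decode.bind(trained));

encode("This is an example").then(async (encoding) => {
  console.log(encoding.getTokens());
  console.log(await decode(encoding.getIds()));
});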
@@ -1,19 +0,0 @@
from tokenizers import Tokenizer


def test_load_tokenizer():
    # START load_tokenizer
    tokenizer = Tokenizer.from_file("data/roberta.json")
    # END load_tokenizer

    example = "This is an example"
    ids = [713, 16, 41, 1246]
    tokens = ["This", "Ġis", "Ġan", "Ġexample"]

    encodings = tokenizer.encode(example)

    assert encodings.ids == ids
    assert encodings.tokens == tokens

    decoded = tokenizer.decode(ids)
    assert decoded == example
@@ -1,43 +0,0 @@
from tokenizers import (
    Tokenizer,
    normalizers,
    pre_tokenizers,
    models,
    decoders,
    processors,
    trainers,
    AddedToken,
)


def test_train_tokenizer():
    # START train_tokenizer
    vocab_size = 100

    tokenizer = Tokenizer(models.BPE())
    tokenizer.normalizer = normalizers.Sequence(
        [
            normalizers.Strip(),
            normalizers.NFC(),
        ]
    )
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
    tokenizer.post_processor = processors.ByteLevel()
    tokenizer.decoder = decoders.ByteLevel()

    trainer = trainers.BpeTrainer(
        vocab_size=vocab_size,
        min_frequency=0,
        special_tokens=[
            AddedToken("<s>"),
            AddedToken("<pad>"),
            AddedToken("</s>"),
            AddedToken("<unk>"),
            AddedToken("<mask>"),
        ],
        show_progress=False,
    )

    tokenizer.train(trainer, ["data/small.txt"])
    tokenizer.save("data/tokenizer.json")
    # END train_tokenizer