Doc - Cleanup old tests & node lints

Author: Anthony MOI
Date: 2020-10-29 12:29:46 -04:00
Committed by: Anthony MOI
Parent: b23310b481
Commit: b6ffd9cba0
7 changed files with 2 additions and 144 deletions


@@ -1,8 +0,0 @@
const tokenizers = require("..");

let bpe = tokenizers.models.BPE.fromFiles(
  "./data/gpt2-vocab.json",
  "./data/gpt2-merges.txt"
);
let tokenizer = new tokenizers.Tokenizer(bpe);
console.log(bpe, tokenizer);


@@ -1,3 +1,4 @@
/* eslint-disable */
var globRequire = require;
describe("pipelineExample", () => {


@@ -1,3 +1,4 @@
/* eslint-disable */
var globRequire = require;
describe("quicktourExample", () => {


@@ -1,27 +0,0 @@
/*eslint-disable no-undef*/
const tokenizers = require("..");
const { promisify } = require("util");

describe("loadExample", () => {
  beforeAll(async () => {});

  it("", async () => {
    const example = "This is an example";
    const ids = [713, 16, 41, 1246];
    const tokens = ["This", "Ġis", "Ġan", "Ġexample"];

    // START load_tokenizer
    const tokenizer = tokenizers.Tokenizer.fromFile("data/roberta.json");
    // END load_tokenizer
    // You could also use regular callbacks
    const encode = promisify(tokenizer.encode.bind(tokenizer));
    const decode = promisify(tokenizer.decode.bind(tokenizer));

    const encoded = await encode(example);
    expect(encoded.getIds()).toEqual(ids);
    expect(encoded.getTokens()).toEqual(tokens);

    const decoded = await decode(ids);
    expect(decoded).toEqual(example);
  });
});
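The removed test mentions that "regular callbacks" could be used instead of util.promisify but never shows that form. A minimal sketch of the callback variant, assuming the bindings expose the Node-style (err, result) callback that promisify relies on (implied by the deleted test, not verified against the current API):

// Sketch only: plain-callback variant of the removed load example above.
// The (err, encoded) signature is an assumption based on the util.promisify
// usage in the deleted test, not a documented contract of the bindings.
const tokenizers = require("..");

const tokenizer = tokenizers.Tokenizer.fromFile("data/roberta.json");
tokenizer.encode("This is an example", (err, encoded) => {
  if (err) throw err;
  console.log(encoded.getIds(), encoded.getTokens());
});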


@@ -1,47 +0,0 @@
/*eslint-disable no-undef*/
const {
  Tokenizer,
  models,
  normalizers,
  pre_tokenizers,
  post_processors,
  decoders,
  trainers,
  AddedToken,
} = require("..");

describe("trainExample", () => {
  it("", () => {
    // START train_tokenizer
    const vocabSize = 100;
    const tokenizer = new Tokenizer(models.BPE.empty());

    tokenizer.normalizer = normalizers.sequenceNormalizer([
      normalizers.stripNormalizer(),
      normalizers.nfcNormalizer(),
    ]);
    tokenizer.pre_tokenizer = pre_tokenizers.byteLevelPreTokenizer();
    tokenizer.post_processor = post_processors.byteLevelProcessing();
    tokenizer.decoder = decoders.byteLevelDecoder();

    const trainer = trainers.bpeTrainer({
      vocabSize,
      minFrequency: 0,
      specialTokens: [
        new AddedToken("<s>", true),
        new AddedToken("<pad>", true),
        new AddedToken("</s>", true),
        new AddedToken("<unk>", true),
        new AddedToken("<mask>", true),
      ],
      showProgress: false,
    });

    tokenizer.train(trainer, ["data/small.txt"]);
    tokenizer.save("data/tokenizer.json");
    // END train_tokenizer

    expect(1).toBe(1);
  });
});
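For context on how the artifact this removed test produced was meant to be consumed, here is a rough sketch that reloads the saved data/tokenizer.json, using only calls that appear in the deleted tests above (Tokenizer.fromFile plus a promisified encode); treat it as illustrative, not as the current API:

// Illustrative sketch: reload the tokenizer the deleted train test saved to
// data/tokenizer.json, reusing the fromFile/promisify pattern from the
// deleted load test. Paths and API usage mirror the removed code only.
const { promisify } = require("util");
const tokenizers = require("..");

const tokenizer = tokenizers.Tokenizer.fromFile("data/tokenizer.json");
const encode = promisify(tokenizer.encode.bind(tokenizer));

encode("This is an example").then((encoded) => {
  console.log(encoded.getTokens(), encoded.getIds());
});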


@@ -1,19 +0,0 @@
from tokenizers import Tokenizer


def test_load_tokenizer():
    # START load_tokenizer
    tokenizer = Tokenizer.from_file("data/roberta.json")
    # END load_tokenizer

    example = "This is an example"
    ids = [713, 16, 41, 1246]
    tokens = ["This", "Ġis", "Ġan", "Ġexample"]

    encodings = tokenizer.encode(example)
    assert encodings.ids == ids
    assert encodings.tokens == tokens

    decoded = tokenizer.decode(ids)
    assert decoded == example


@@ -1,43 +0,0 @@
from tokenizers import (
    Tokenizer,
    normalizers,
    pre_tokenizers,
    models,
    decoders,
    processors,
    trainers,
    AddedToken,
)


def test_train_tokenizer():
    # START train_tokenizer
    vocab_size = 100
    tokenizer = Tokenizer(models.BPE())
    tokenizer.normalizer = normalizers.Sequence(
        [
            normalizers.Strip(),
            normalizers.NFC(),
        ]
    )
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
    tokenizer.post_processor = processors.ByteLevel()
    tokenizer.decoder = decoders.ByteLevel()

    trainer = trainers.BpeTrainer(
        vocab_size=vocab_size,
        min_frequency=0,
        special_tokens=[
            AddedToken("<s>"),
            AddedToken("<pad>"),
            AddedToken("</s>"),
            AddedToken("<unk>"),
            AddedToken("<mask>"),
        ],
        show_progress=False,
    )

    tokenizer.train(trainer, ["data/small.txt"])
    tokenizer.save("data/tokenizer.json")
    # END train_tokenizer