Doc - Improve python and node tests

@@ -12,7 +12,7 @@ style:
 check-style:
 	npm run lint-check
 
-TESTS_RESOURCES = $(DATA_DIR)/small.txt $(DATA_DIR)/roberta.json $(DATA_DIR)/tokenizer-wiki.json
+TESTS_RESOURCES = $(DATA_DIR)/small.txt $(DATA_DIR)/roberta.json $(DATA_DIR)/tokenizer-wiki.json $(DATA_DIR)/bert-wiki.json
 
 # Launch the test suite
 test: $(TESTS_RESOURCES)
@@ -32,3 +32,7 @@ $(DATA_DIR)/roberta.json :
 $(DATA_DIR)/tokenizer-wiki.json :
 	$(dir_guard)
 	wget https://s3.amazonaws.com/models.huggingface.co/bert/anthony/doc-quicktour/tokenizer.json -O $@
+
+$(DATA_DIR)/bert-wiki.json :
+	$(dir_guard)
+	wget https://s3.amazonaws.com/models.huggingface.co/bert/anthony/doc-pipeline/tokenizer.json -O $@
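
The new $(DATA_DIR)/bert-wiki.json rule above fetches a pre-built WordPiece tokenizer so the documentation tests can run without the slow training step. A minimal Python sketch of what that downloaded file enables; the local path data/bert-wiki.json mirrors the Makefile target and is an assumption here:

# Minimal sketch: sanity-check the downloaded doc-pipeline tokenizer.
# Assumes data/bert-wiki.json exists (fetched by the Makefile rule above).
from tokenizers import Tokenizer

tokenizer = Tokenizer.from_file("data/bert-wiki.json")
encoding = tokenizer.encode("Welcome to the 🤗 Tokenizers library.")
print(encoding.tokens)  # WordPiece pieces such as "tok", "##eni", "##zer", "##s"
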
@@ -4,9 +4,16 @@ var globRequire = require;
 describe("pipelineExample", () => {
   // This is a hack to let us require using path similar to what the user has to use
   function require(mod: string) {
+    if (mod.startsWith("tokenizers/")) {
       let path = mod.slice("tokenizers/".length);
       return globRequire("../../lib/" + path);
+    } else {
+      return globRequire(mod);
     }
+  }
+  let console = {
+    log: (..._args: any[]) => {}
+  };
 
   it("shows pipeline parts", async () => {
     // START reload_tokenizer
@@ -20,7 +27,7 @@ describe("pipelineExample", () => {
     let normalizer = sequenceNormalizer([nfdNormalizer(), stripAccentsNormalizer()]);
     // END setup_normalizer
     // START test_normalizer
-    let normalized = normalizer.normalizeStr("Héllò hôw are ü?")
+    let normalized = normalizer.normalizeString("Héllò hôw are ü?")
     // "Hello how are u?"
     // END test_normalizer
     expect(normalized).toEqual("Hello how are u?");
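
The only change in the hunk above is the node-binding rename from normalizeStr to normalizeString. For comparison, a rough sketch of the same normalizer snippet with the Python bindings; the expected output comes from the comment above, the rest is an assumption about the equivalent Python usage:

# Sketch: the equivalent normalizer pipeline in the Python bindings.
from tokenizers import normalizers
from tokenizers.normalizers import NFD, StripAccents

normalizer = normalizers.Sequence([NFD(), StripAccents()])
assert normalizer.normalize_str("Héllò hôw are ü?") == "Hello how are u?"
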
@@ -28,10 +35,10 @@ describe("pipelineExample", () => {
     tokenizer.setNormalizer(normalizer)
     // END replace_normalizer
     // START setup_pre_tokenizer
-    let { whitespacePreTokenizer } = require("tokenizers/bindings/pre_tokenizers");
+    let { whitespacePreTokenizer } = require("tokenizers/bindings/pre-tokenizers");
 
     var preTokenizer = whitespacePreTokenizer();
-    var preTokenized = preTokenizer.preTokenizeStr("Hello! How are you? I'm fine, thank you.");
+    var preTokenized = preTokenizer.preTokenizeString("Hello! How are you? I'm fine, thank you.");
     // END setup_pre_tokenizer
     expect(preTokenized).toEqual([
       ["Hello", [0, 5]],
@@ -50,16 +57,16 @@ describe("pipelineExample", () => {
       [".", [39, 40]]
     ]);
     // START combine_pre_tokenizer
-    let { sequencePreTokenizer, digitsPreTokenizer } = require("tokenizers/bindings/pre_tokenizers");
+    let { sequencePreTokenizer, digitsPreTokenizer } = require("tokenizers/bindings/pre-tokenizers");
 
     var preTokenizer = sequencePreTokenizer([whitespacePreTokenizer(), digitsPreTokenizer(true)]);
-    var preTokenized = preTokenizer.preTokenizeStr("Call 911!");
+    var preTokenized = preTokenizer.preTokenizeString("Call 911!");
     // END combine_pre_tokenizer
     // START replace_pre_tokenizer
     tokenizer.setPreTokenizer(preTokenizer)
     // END replace_pre_tokenizer
     // START setup_processor
-    let { templateProcessing } = require("tokenizers/bindings/processors");
+    let { templateProcessing } = require("tokenizers/bindings/post-processors");
 
     tokenizer.setPostProcessor(templateProcessing(
       "[CLS] $A [SEP]",
@@ -68,42 +75,21 @@ describe("pipelineExample", () => {
     ));
     // END setup_processor
     // START test_decoding
-    let output = tokenizer.encode("Hello, y'all! How are you 😁 ?");
+    let { promisify } = require('util');
+    let encode = promisify(tokenizer.encode.bind(tokenizer));
+    let decode = promisify(tokenizer.decode.bind(tokenizer));
+
+    let output = await encode("Hello, y'all! How are you 😁 ?");
     console.log(output.getIds());
     // [1, 27253, 16, 93, 11, 5097, 5, 7961, 5112, 6218, 0, 35, 2]
 
-    tokenizer.decode([1, 27253, 16, 93, 11, 5097, 5, 7961, 5112, 6218, 0, 35, 2]);
+    let decoded = await decode([1, 27253, 16, 93, 11, 5097, 5, 7961, 5112, 6218, 0, 35, 2], true);
     // "Hello , y ' all ! How are you ?"
     // END test_decoding
+    expect(decoded).toEqual("Hello , y ' all ! How are you ?");
   });
 
-  var { Tokenizer } = require("tokenizers/bindings/tokenizer");
-  const slow_bert_training = async (bertTokenizer: typeof Tokenizer) => {
-    let { WordPiece } = require("tokenizers/bindings/models");
-
-    // START bert_train_tokenizer
-    let { wordPieceTrainer } = require("tokenizers/bindings/trainers");
-    let { promisify } = require("util");
-
-    let trainer = wordPieceTrainer({
-      vocabSize: 30522,
-      specialTokens: ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]
-    });
-    let files = ["test", "train", "valid"].map(split => `data/wikitext-103-raw/wiki.${split}.raw`);
-    bertTokenizer.train(trainer, files);
-
-    let modelFiles = bertTokenizer.getModel.save("data", "bert-wiki");
-    let fromFile = promisify(WordPiece.fromFile);
-    bertTokenizer.setModel(await fromFile(modelFiles[0], {
-      unkToken: "[UNK]"
-    }));
-
-    bertTokenizer.save("data/bert-wiki.json")
-    // END bert_train_tokenizer
-  };
-  console.log(slow_bert_training); // disable unused warning
-
-  it("shows a full bert example", async () => {
+  it.skip("trains the tokenizer", async () => {
     // START bert_setup_tokenizer
     let { Tokenizer } = require("tokenizers/bindings/tokenizer");
     let { WordPiece } = require("tokenizers/bindings/models");
@@ -121,7 +107,7 @@ describe("pipelineExample", () => {
     // START bert_setup_pre_tokenizer
     let { whitespacePreTokenizer } = require("tokenizers/bindings/pre-tokenizers");
 
-    bertTokenizer.setPreTokenizer = whitespacePreTokenizer();
+    bertTokenizer.setPreTokenizer(whitespacePreTokenizer());
     // END bert_setup_pre_tokenizer
     // START bert_setup_processor
     let { templateProcessing } = require("tokenizers/bindings/post-processors");
@@ -132,19 +118,50 @@ describe("pipelineExample", () => {
       [["[CLS]", 1], ["[SEP]", 2]]
     ));
     // END bert_setup_processor
+    // START bert_train_tokenizer
+    let { wordPieceTrainer } = require("tokenizers/bindings/trainers");
+    let { promisify } = require("util");
+
+    let trainer = wordPieceTrainer({
+      vocabSize: 30522,
+      specialTokens: ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]
+    });
+    let files = ["test", "train", "valid"].map(split => `data/wikitext-103-raw/wiki.${split}.raw`);
+    bertTokenizer.train(trainer, files);
+
+    let modelFiles = bertTokenizer.getModel().save("data", "bert-wiki");
+    let fromFile = promisify(WordPiece.fromFile);
+    bertTokenizer.setModel(await fromFile(modelFiles[0], {
+      unkToken: "[UNK]"
+    }));
+
+    bertTokenizer.save("data/bert-wiki.json")
+    // END bert_train_tokenizer
+  });
+
+  it("shows a full bert example", async () => {
+    let { Tokenizer } = require("tokenizers/bindings/tokenizer");
+    let bertTokenizer = await Tokenizer.fromFile("data/bert-wiki.json")
+
     // START bert_test_decoding
-    let output = bertTokenizer.encode("Welcome to the 🤗 Tokenizers library.");
+    let { promisify } = require("util");
+    let encode = promisify(bertTokenizer.encode.bind(bertTokenizer));
+    let decode = promisify(bertTokenizer.decode.bind(bertTokenizer));
+
+    let output = await encode("Welcome to the 🤗 Tokenizers library.");
     console.log(output.getTokens());
     // ["[CLS]", "welcome", "to", "the", "[UNK]", "tok", "##eni", "##zer", "##s", "library", ".", "[SEP]"]
 
-    bertTokenizer.decode(output.getIds());
+    var decoded = await decode(output.getIds(), true);
     // "welcome to the tok ##eni ##zer ##s library ."
     // END bert_test_decoding
+    expect(decoded).toEqual("welcome to the tok ##eni ##zer ##s library .");
     // START bert_proper_decoding
     let { wordPieceDecoder } = require("tokenizers/bindings/decoders");
     bertTokenizer.setDecoder(wordPieceDecoder());
-    bertTokenizer.decode(output.ids);
+    var decoded = await decode(output.getIds(), true);
     // "welcome to the tokenizers library."
     // END bert_proper_decoding
+    expect(decoded).toEqual("welcome to the tokenizers library.");
   });
 });
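
The templateProcessing(...) call configured earlier in this spec has a direct counterpart in the Python bindings. A hedged sketch; the pair template and the attachment line follow the usual BERT convention and are assumptions beyond what these hunks show:

# Sketch: the equivalent template post-processor in the Python bindings.
from tokenizers.processors import TemplateProcessing

post_processor = TemplateProcessing(
    single="[CLS] $A [SEP]",
    pair="[CLS] $A [SEP] $B:1 [SEP]:1",  # assumed pair template, not shown in the hunks
    special_tokens=[("[CLS]", 1), ("[SEP]", 2)],
)
# tokenizer.post_processor = post_processor  # plays the role of setPostProcessor above
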
@@ -1,4 +1,4 @@
-from ..utils import data_dir, doc_wiki_tokenizer
+from ..utils import data_dir, doc_wiki_tokenizer, doc_pipeline_bert_tokenizer
 from tokenizers import Tokenizer
 
 
@@ -96,7 +96,8 @@ class TestPipeline:
             == "Hello , y ' all ! How are you ?"
         )
 
-    def bert_example(self):
+    @staticmethod
+    def slow_train():
         # START bert_setup_tokenizer
         from tokenizers import Tokenizer
         from tokenizers.models import WordPiece
@@ -136,20 +137,49 @@ class TestPipeline:
         bert_tokenizer.train(trainer, files)
 
         model_files = bert_tokenizer.model.save("data", "bert-wiki")
-        bert_tokenizer.model = WordPiece(*model_files, unk_token="[UNK]")
+        bert_tokenizer.model = WordPiece.from_file(*model_files, unk_token="[UNK]")
 
         bert_tokenizer.save("data/bert-wiki.json")
         # END bert_train_tokenizer
+
+    def test_bert_example(self):
+        try:
+            bert_tokenizer = Tokenizer.from_file("data/bert-wiki.json")
+        except Exception:
+            bert_tokenizer = Tokenizer.from_file(doc_pipeline_bert_tokenizer)
+
         # START bert_test_decoding
         output = bert_tokenizer.encode("Welcome to the 🤗 Tokenizers library.")
         print(output.tokens)
         # ["[CLS]", "welcome", "to", "the", "[UNK]", "tok", "##eni", "##zer", "##s", "library", ".", "[SEP]"]
 
-        bert_tokenizer.decoder(output.ids)
+        bert_tokenizer.decode(output.ids)
         # "welcome to the tok ##eni ##zer ##s library ."
         # END bert_test_decoding
+        assert bert_tokenizer.decode(output.ids) == "welcome to the tok ##eni ##zer ##s library ."
         # START bert_proper_decoding
-        bert_tokenizer.decoder = tokenizers.decoders.WordPiece()
+        from tokenizers import decoders
+
+        bert_tokenizer.decoder = decoders.WordPiece()
         bert_tokenizer.decode(output.ids)
         # "welcome to the tokenizers library."
         # END bert_proper_decoding
+        assert bert_tokenizer.decode(output.ids) == "welcome to the tokenizers library."
+
+
+if __name__ == "__main__":
+    from urllib import request
+    from zipfile import ZipFile
+    import os
+
+    if not os.path.isdir("data/wikitext-103-raw"):
+        print("Downloading wikitext-103...")
+        wiki_text, _ = request.urlretrieve(
+            "https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-raw-v1.zip"
+        )
+        with ZipFile(wiki_text, "r") as z:
+            print("Unzipping in data...")
+            z.extractall("data")
+
+    print("Now training...")
+    TestPipeline.slow_train()
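
slow_train() relies on a trainer and a files list that are built outside the lines shown here. A sketch of how they are typically constructed, mirroring the values visible in the node test above; the exact Python statements are an assumption:

# Sketch: trainer/files setup that slow_train() depends on (mirrors the node test above).
from tokenizers.trainers import WordPieceTrainer

trainer = WordPieceTrainer(
    vocab_size=30522,
    special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
)
files = [f"data/wikitext-103-raw/wiki.{split}.raw" for split in ["test", "train", "valid"]]
# bert_tokenizer.train(trainer, files)  # as called in the hunk above
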
@@ -90,6 +90,14 @@ def doc_wiki_tokenizer(data_dir):
     )
 
 
+@pytest.fixture(scope="session")
+def doc_pipeline_bert_tokenizer(data_dir):
+    return download(
+        "https://s3.amazonaws.com/models.huggingface.co/bert/anthony/doc-pipeline/tokenizer.json",
+        "bert-wiki.json",
+    )
+
+
 def multiprocessing_with_parallelism(tokenizer, enabled: bool):
     """
     This helper can be used to test that disabling parallelism avoids dead locks when the
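
The new doc_pipeline_bert_tokenizer fixture returns whatever path the module's download helper produces. That helper is defined earlier in the file and is not part of this diff; a hypothetical stand-in, only to illustrate the shape of what the fixture yields:

# Hypothetical stand-in for the download(url, filename) helper used by the fixture above.
import os
from urllib import request

DATA_DIR = "tests/data"  # assumed location handled by the data_dir fixture


def download(url, filename):
    os.makedirs(DATA_DIR, exist_ok=True)
    path = os.path.join(DATA_DIR, filename)
    if not os.path.exists(path):
        request.urlretrieve(url, path)
    return path
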