Doc - Improve snippets testing

Anthony MOI
2020-10-02 15:52:33 -04:00
committed by Anthony MOI
parent f4e7754112
commit 000c19a7a5
12 changed files with 84 additions and 68 deletions

View File

@ -9,9 +9,9 @@ describe("loadExample", () => {
const ids = [713, 16, 41, 1246];
const tokens = ["This", "Ġis", "Ġan", "Ġexample"];
// START load
// START load_tokenizer
const tokenizer = tokenizers.Tokenizer.fromFile("data/roberta.json");
// END load
// END load_tokenizer
// You could also use regular callbacks
const encode = promisify(tokenizer.encode.bind(tokenizer));

View File

@ -13,6 +13,7 @@ const {
describe("trainExample", () => {
it("", () => {
// START train_tokenizer
const vocabSize = 100;
const tokenizer = new Tokenizer(models.BPE.empty());
@ -39,6 +40,7 @@ describe("trainExample", () => {
tokenizer.train(trainer, ["data/small.txt"]);
tokenizer.save("data/tokenizer.json");
// END train_tokenizer
expect(1).toBe(1);
});

bindings/python/.gitignore vendored Normal file
View File

@ -0,0 +1 @@
data

View File

@ -0,0 +1,19 @@
from tokenizers import Tokenizer


def test_load_tokenizer():
    # START load_tokenizer
    tokenizer = Tokenizer.from_file("data/roberta.json")
    # END load_tokenizer

    example = "This is an example"
    ids = [713, 16, 41, 1246]
    tokens = ["This", "Ġis", "Ġan", "Ġexample"]

    encodings = tokenizer.encode(example)
    assert encodings.ids == ids
    assert encodings.tokens == tokens

    decoded = tokenizer.decode(ids)
    assert decoded == example
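
The new Python test covers the full round trip: encode returns an Encoding whose ids and tokens match the expected byte-level pieces, and decode rebuilds the original sentence from those ids. As a side note (not part of the commit, and assuming data/roberta.json has already been fetched by the Makefile rule further down in this diff), the same loaded tokenizer also handles several inputs at once through encode_batch:

# Not part of the commit: quick sketch of the same tokenizer used on a batch.
from tokenizers import Tokenizer

tokenizer = Tokenizer.from_file("data/roberta.json")

# encode_batch mirrors encode for a list of inputs and returns one Encoding per input.
encodings = tokenizer.encode_batch(["This is an example", "This is another example"])
for encoding in encodings:
    print(encoding.tokens, encoding.ids)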

View File

@ -0,0 +1,43 @@
from tokenizers import (
    Tokenizer,
    normalizers,
    pre_tokenizers,
    models,
    decoders,
    processors,
    trainers,
    AddedToken,
)


def test_train_tokenizer():
    # START train_tokenizer
    vocab_size = 100

    tokenizer = Tokenizer(models.BPE())
    tokenizer.normalizer = normalizers.Sequence(
        [
            normalizers.Strip(),
            normalizers.NFC(),
        ]
    )
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
    tokenizer.post_processor = processors.ByteLevel()
    tokenizer.decoder = decoders.ByteLevel()

    trainer = trainers.BpeTrainer(
        vocab_size=vocab_size,
        min_frequency=0,
        special_tokens=[
            AddedToken("<s>"),
            AddedToken("<pad>"),
            AddedToken("</s>"),
            AddedToken("<unk>"),
            AddedToken("<mask>"),
        ],
        show_progress=False,
    )

    tokenizer.train(trainer, ["data/small.txt"])
    tokenizer.save("data/tokenizer.json")
    # END train_tokenizer
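
The trained tokenizer is saved to data/tokenizer.json, the same path the Rust test at the end of this diff now writes to. A minimal follow-up sketch (not part of the commit) that reloads the saved file and tokenizes a sentence, just to confirm the save/load cycle around the snippet above:

from tokenizers import Tokenizer

# Reload the file written by the training snippet above.
reloaded = Tokenizer.from_file("data/tokenizer.json")

# With a 100-token vocabulary the output will mostly be small merges and single bytes.
encoding = reloaded.encode("This is a small test")
print(encoding.tokens)
print(encoding.ids)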

View File

@ -1,17 +0,0 @@
from tokenizers import Tokenizer
# START load
tokenizer = Tokenizer.from_file("data/roberta.json")
# END load
example = "This is an example"
ids = [713, 16, 41, 1246]
tokens = ["This", "Ġis", "Ġan", "Ġexample"]
encodings = tokenizer.encode(example)
assert encodings.ids == ids
assert encodings.tokens == tokens
decoded = tokenizer.decode(ids)
assert decoded == example

View File

@ -1,40 +0,0 @@
from tokenizers import (
    Tokenizer,
    normalizers,
    pre_tokenizers,
    models,
    decoders,
    processors,
    trainers,
    AddedToken,
)

vocab_size = 100

tokenizer = Tokenizer(models.BPE())
tokenizer.normalizer = normalizers.Sequence(
    [
        normalizers.Strip(),
        normalizers.NFC(),
    ]
)
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
tokenizer.post_processor = processors.ByteLevel()
tokenizer.decoder = decoders.ByteLevel()

trainer = trainers.BpeTrainer(
    vocab_size=vocab_size,
    min_frequency=0,
    special_tokens=[
        AddedToken("<s>"),
        AddedToken("<pad>"),
        AddedToken("</s>"),
        AddedToken("<unk>"),
        AddedToken("<mask>"),
    ],
    show_progress=False,
)

tokenizer.train(trainer, ["data/small.txt"])
tokenizer.save("data/tokenizer.json")

View File

@ -9,8 +9,6 @@ class TestByteLevelBPE:
        tokenizer = ByteLevelBPETokenizer.from_file(roberta_files["vocab"], roberta_files["merges"])
        output = tokenizer.encode("The quick brown fox jumps over the lazy dog")
        tokenizer.save("roberta.json")

        assert output.ids == [133, 2119, 6219, 23602, 13855, 81, 5, 22414, 2335]
        assert output.tokens == [
            "The",

View File

@ -50,10 +50,11 @@ Loading a previously saved tokenizer is extremely simple and requires a single l
.. only:: Python

    .. literalinclude:: ../../bindings/python/tests/examples/test_load.py
    .. literalinclude:: ../../bindings/python/tests/documentation/test_load.py
        :language: python
        :start-after: START load
        :end-before: END load
        :start-after: START load_tokenizer
        :end-before: END load_tokenizer
        :dedent: 4
.. only:: Node
@ -79,10 +80,16 @@ Small guide of :ref:`how to create a Tokenizer options<tokenizer_blocks>`.
.. only:: Python

    .. literalinclude:: ../../bindings/python/tests/examples/test_train.py
    .. literalinclude:: ../../bindings/python/tests/documentation/test_train.py
        :language: python
        :start-after: START train_tokenizer
        :end-before: END train_tokenizer
        :dedent: 4

.. only:: Node

    .. literalinclude:: ../../bindings/node/examples/train.test.js
        :language: javascript
        :start-after: START train_tokenizer
        :end-before: END train_tokenizer
        :dedent: 4
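
These directives are what ties the docs to the new tests: :start-after: and :end-before: make Sphinx's literalinclude keep only the lines between the START/END comment markers, and :dedent: 4 strips the indentation the snippet picks up from living inside a test function. A simplified sketch of that extraction (illustrative only, not the actual Sphinx implementation; the path and marker name are just examples):

import textwrap

def extract_snippet(path, name):
    # Keep only the lines between "# START <name>" and "# END <name>",
    # then strip the shared indentation (the job of ":dedent: 4" in the docs).
    lines = []
    keep = False
    for line in open(path):
        if f"END {name}" in line:
            break
        if keep:
            lines.append(line)
        if f"START {name}" in line:
            keep = True
    return textwrap.dedent("".join(lines))

print(extract_snippet("tests/documentation/test_load.py", "load_tokenizer"))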

View File

@ -6,7 +6,7 @@ dir_guard=@mkdir -p $(@D)
SHARED_RESOURCES = $(DATA_DIR)/gpt2-vocab.json $(DATA_DIR)/gpt2-merges.txt $(DATA_DIR)/bert-base-uncased-vocab.txt $(DATA_DIR)/big.txt $(DATA_DIR)/small.txt
BENCHMARK_RESOURCES = $(SHARED_RESOURCES)
TESTS_RESOURCES = $(SHARED_RESOURCES) $(DATA_DIR)/unigram.json $(DATA_DIR)/unigram_wagahaiwa_nekodearu.txt $(DATA_DIR)/albert-base-v1-tokenizer.json
TESTS_RESOURCES = $(SHARED_RESOURCES) $(DATA_DIR)/unigram.json $(DATA_DIR)/unigram_wagahaiwa_nekodearu.txt $(DATA_DIR)/albert-base-v1-tokenizer.json $(DATA_DIR)/roberta.json
.PHONY : build
build :
@ -67,3 +67,7 @@ $(DATA_DIR)/big.txt :
$(DATA_DIR)/small.txt : $(DATA_DIR)/big.txt
	head -100 $(DATA_DIR)/big.txt > $@

$(DATA_DIR)/roberta.json :
	$(dir_guard)
	wget https://storage.googleapis.com/tokenizers/roberta.json -O $@
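
The new Makefile rule downloads the RoBERTa tokenizer file that the load tests above depend on. For a one-off fetch without make, an equivalent sketch using the same URL and destination as the rule above:

import os
import urllib.request

# Same URL and destination as the Makefile rule above.
os.makedirs("data", exist_ok=True)
urllib.request.urlretrieve(
    "https://storage.googleapis.com/tokenizers/roberta.json", "data/roberta.json"
)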

View File

@ -37,13 +37,12 @@ fn train_tokenizer() {
    tokenizer
        .train(&trainer, vec!["data/small.txt".to_string()])
        .unwrap()
        .save("data/trained-tokenizer-tests.json", pretty)
        .save("data/tokenizer.json", pretty)
        .unwrap();
    // END train_tokenizer
}

#[test]
#[ignore]
fn load_tokenizer() {
    // START load_tokenizer
    let tokenizer = Tokenizer::from_file("data/roberta.json").unwrap();