Doc - Improve snippets testing
@@ -9,9 +9,9 @@ describe("loadExample", () => {
     const ids = [713, 16, 41, 1246];
     const tokens = ["This", "Ġis", "Ġan", "Ġexample"];
 
-    // START load
+    // START load_tokenizer
     const tokenizer = tokenizers.Tokenizer.fromFile("data/roberta.json");
-    // END load
+    // END load_tokenizer
 
     // You could also use regular callbacks
     const encode = promisify(tokenizer.encode.bind(tokenizer));
@@ -13,6 +13,7 @@ const {
 
 describe("trainExample", () => {
   it("", () => {
+    // START train_tokenizer
     const vocabSize = 100;
 
     const tokenizer = new Tokenizer(models.BPE.empty());
@@ -39,6 +40,7 @@ describe("trainExample", () => {
 
     tokenizer.train(trainer, ["data/small.txt"]);
     tokenizer.save("data/tokenizer.json");
+    // END train_tokenizer
 
     expect(1).toBe(1);
   });
bindings/python/.gitignore (new file, vendored, 1 line)
@@ -0,0 +1 @@
+data
bindings/python/tests/documentation/test_load.py (new file, 19 lines)
@@ -0,0 +1,19 @@
+from tokenizers import Tokenizer
+
+
+def test_load_tokenizer():
+    # START load_tokenizer
+    tokenizer = Tokenizer.from_file("data/roberta.json")
+    # END load_tokenizer
+
+    example = "This is an example"
+    ids = [713, 16, 41, 1246]
+    tokens = ["This", "Ġis", "Ġan", "Ġexample"]
+
+    encodings = tokenizer.encode(example)
+
+    assert encodings.ids == ids
+    assert encodings.tokens == tokens
+
+    decoded = tokenizer.decode(ids)
+    assert decoded == example
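These snippets double as regular pytest tests, so the documented code is exercised on every test run. As a rough sketch (not part of this commit) of running only the documentation snippets programmatically, assuming the working directory is bindings/python and the data/ files have been fetched (see the Makefile targets further down):

# Sketch only: run just the documentation snippet tests.
# Assumes cwd is bindings/python and data/roberta.json / data/small.txt exist.
import sys

import pytest

sys.exit(pytest.main(["tests/documentation/", "-q"]))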
bindings/python/tests/documentation/test_train.py (new file, 43 lines)
@@ -0,0 +1,43 @@
+from tokenizers import (
+    Tokenizer,
+    normalizers,
+    pre_tokenizers,
+    models,
+    decoders,
+    processors,
+    trainers,
+    AddedToken,
+)
+
+
+def test_train_tokenizer():
+    # START train_tokenizer
+    vocab_size = 100
+
+    tokenizer = Tokenizer(models.BPE())
+    tokenizer.normalizer = normalizers.Sequence(
+        [
+            normalizers.Strip(),
+            normalizers.NFC(),
+        ]
+    )
+    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
+    tokenizer.post_processor = processors.ByteLevel()
+    tokenizer.decoder = decoders.ByteLevel()
+
+    trainer = trainers.BpeTrainer(
+        vocab_size=vocab_size,
+        min_frequency=0,
+        special_tokens=[
+            AddedToken("<s>"),
+            AddedToken("<pad>"),
+            AddedToken("</s>"),
+            AddedToken("<unk>"),
+            AddedToken("<mask>"),
+        ],
+        show_progress=False,
+    )
+
+    tokenizer.train(trainer, ["data/small.txt"])
+    tokenizer.save("data/tokenizer.json")
+    # END train_tokenizer
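The training snippet ends by writing the freshly trained tokenizer to data/tokenizer.json, matching the Node and Rust snippets in this commit. A minimal sketch (not part of the commit) of reloading that artifact afterwards, assuming test_train_tokenizer() has already run from bindings/python:

# Sketch only: reload the tokenizer written by the training snippet above.
from tokenizers import Tokenizer

tokenizer = Tokenizer.from_file("data/tokenizer.json")
encoding = tokenizer.encode("This is an example")
print(encoding.tokens)  # byte-level BPE tokens from the 100-entry vocabulary
print(encoding.ids)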
@@ -1,17 +0,0 @@
-from tokenizers import Tokenizer
-
-# START load
-tokenizer = Tokenizer.from_file("data/roberta.json")
-# END load
-
-example = "This is an example"
-ids = [713, 16, 41, 1246]
-tokens = ["This", "Ġis", "Ġan", "Ġexample"]
-
-encodings = tokenizer.encode(example)
-
-assert encodings.ids == ids
-assert encodings.tokens == tokens
-
-decoded = tokenizer.decode(ids)
-assert decoded == example
@@ -1,40 +0,0 @@
-from tokenizers import (
-    Tokenizer,
-    normalizers,
-    pre_tokenizers,
-    models,
-    decoders,
-    processors,
-    trainers,
-    AddedToken,
-)
-
-
-vocab_size = 100
-
-tokenizer = Tokenizer(models.BPE())
-tokenizer.normalizer = normalizers.Sequence(
-    [
-        normalizers.Strip(),
-        normalizers.NFC(),
-    ]
-)
-tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
-tokenizer.post_processor = processors.ByteLevel()
-tokenizer.decoder = decoders.ByteLevel()
-
-trainer = trainers.BpeTrainer(
-    vocab_size=vocab_size,
-    min_frequency=0,
-    special_tokens=[
-        AddedToken("<s>"),
-        AddedToken("<pad>"),
-        AddedToken("</s>"),
-        AddedToken("<unk>"),
-        AddedToken("<mask>"),
-    ],
-    show_progress=False,
-)
-
-tokenizer.train(trainer, ["data/small.txt"])
-tokenizer.save("data/tokenizer.json")
@@ -9,8 +9,6 @@ class TestByteLevelBPE:
         tokenizer = ByteLevelBPETokenizer.from_file(roberta_files["vocab"], roberta_files["merges"])
         output = tokenizer.encode("The quick brown fox jumps over the lazy dog")
 
-        tokenizer.save("roberta.json")
-
         assert output.ids == [133, 2119, 6219, 23602, 13855, 81, 5, 22414, 2335]
         assert output.tokens == [
             "The",
@@ -50,10 +50,11 @@ Loading a previously saved tokenizer is extremely simple and requires a single l
 
 .. only:: Python
 
-    .. literalinclude:: ../../bindings/python/tests/examples/test_load.py
+    .. literalinclude:: ../../bindings/python/tests/documentation/test_load.py
         :language: python
-        :start-after: START load
-        :end-before: END load
+        :start-after: START load_tokenizer
+        :end-before: END load_tokenizer
+        :dedent: 4
 
 .. only:: Node
 
@@ -79,10 +80,16 @@ Small guide of :ref:`how to create a Tokenizer options<tokenizer_blocks>`.
 
 .. only:: Python
 
-    .. literalinclude:: ../../bindings/python/tests/examples/test_train.py
+    .. literalinclude:: ../../bindings/python/tests/documentation/test_train.py
         :language: python
+        :start-after: START train_tokenizer
+        :end-before: END train_tokenizer
+        :dedent: 4
 
 .. only:: Node
 
     .. literalinclude:: ../../bindings/node/examples/train.test.js
         :language: javascript
+        :start-after: START train_tokenizer
+        :end-before: END train_tokenizer
+        :dedent: 4
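With the new :start-after:/:end-before: options, Sphinx includes only the region between the START/END marker comments from the test files, and :dedent: 4 strips the indentation added by wrapping each snippet in a test function so the rendered example reads as top-level code. A rough Python illustration of that extraction (the helper name and its exact behaviour are hypothetical, not part of Sphinx or this commit):

# Hypothetical helper mimicking literalinclude's start-after/end-before/dedent.
from pathlib import Path


def extract_snippet(path: str, name: str, dedent: int = 4) -> str:
    lines = Path(path).read_text().splitlines()
    start = next(i for i, line in enumerate(lines) if line.strip() == f"# START {name}") + 1
    end = next(i for i, line in enumerate(lines) if line.strip() == f"# END {name}")
    # Strip the indentation introduced by the enclosing test function.
    return "\n".join(line[dedent:] for line in lines[start:end])


print(extract_snippet("bindings/python/tests/documentation/test_load.py", "load_tokenizer"))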
@@ -6,7 +6,7 @@ dir_guard=@mkdir -p $(@D)
 
 SHARED_RESOURCES = $(DATA_DIR)/gpt2-vocab.json $(DATA_DIR)/gpt2-merges.txt $(DATA_DIR)/bert-base-uncased-vocab.txt $(DATA_DIR)/big.txt $(DATA_DIR)/small.txt
 BENCHMARK_RESOURCES = $(SHARED_RESOURCES)
-TESTS_RESOURCES = $(SHARED_RESOURCES) $(DATA_DIR)/unigram.json $(DATA_DIR)/unigram_wagahaiwa_nekodearu.txt $(DATA_DIR)/albert-base-v1-tokenizer.json
+TESTS_RESOURCES = $(SHARED_RESOURCES) $(DATA_DIR)/unigram.json $(DATA_DIR)/unigram_wagahaiwa_nekodearu.txt $(DATA_DIR)/albert-base-v1-tokenizer.json $(DATA_DIR)/roberta.json
 
 .PHONY : build
 build :
@@ -67,3 +67,7 @@ $(DATA_DIR)/big.txt :
 
 $(DATA_DIR)/small.txt : $(DATA_DIR)/big.txt
 	head -100 $(DATA_DIR)/big.txt > $@
+
+$(DATA_DIR)/roberta.json :
+	$(dir_guard)
+	wget https://storage.googleapis.com/tokenizers/roberta.json -O $@
@@ -37,13 +37,12 @@ fn train_tokenizer() {
     tokenizer
         .train(&trainer, vec!["data/small.txt".to_string()])
         .unwrap()
-        .save("data/trained-tokenizer-tests.json", pretty)
+        .save("data/tokenizer.json", pretty)
         .unwrap();
     // END train_tokenizer
 }
 
 #[test]
-#[ignore]
 fn load_tokenizer() {
     // START load_tokenizer
     let tokenizer = Tokenizer::from_file("data/roberta.json").unwrap();