Doc - Quicktour uses python tested code

Anthony MOI
2020-10-14 16:25:38 -04:00
committed by Anthony MOI
parent 108b2a6b9b
commit 4cf0a0b72c
3 changed files with 336 additions and 81 deletions

View File: bindings/python/tests/documentation/test_quicktour.py (new file)

@@ -0,0 +1,194 @@
from ..utils import data_dir, doc_wiki_tokenizer
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace


class TestQuicktour:
    # This method contains everything we don't want to run
    @staticmethod
    def slow_train():
        tokenizer, trainer = TestQuicktour.get_tokenizer_trainer()
        # START train
        files = [f"data/wikitext-103-raw/wiki.{split}.raw" for split in ["test", "train", "valid"]]
        tokenizer.train(trainer, files)
        # END train
        # START reload_model
        files = tokenizer.model.save("data", "wiki")
        tokenizer.model = BPE.from_files(*files, unk_token="[UNK]")
        # END reload_model
        # START save
        tokenizer.save("data/tokenizer-wiki.json")
        # END save

    @staticmethod
    def get_tokenizer_trainer():
        # START init_tokenizer
        from tokenizers import Tokenizer
        from tokenizers.models import BPE

        tokenizer = Tokenizer(BPE())
        # END init_tokenizer
        # START init_trainer
        from tokenizers.trainers import BpeTrainer

        trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
        # END init_trainer
        # START init_pretok
        from tokenizers.pre_tokenizers import Whitespace

        tokenizer.pre_tokenizer = Whitespace()
        # END init_pretok
        return tokenizer, trainer

    def test_quicktour(self, doc_wiki_tokenizer):
        def print(*args, **kwargs):
            pass

        try:
            # START reload_tokenizer
            tokenizer = Tokenizer.from_file("data/tokenizer-wiki.json")
            # END reload_tokenizer
        except Exception:
            tokenizer = Tokenizer.from_file(doc_wiki_tokenizer)

        # START encode
        output = tokenizer.encode("Hello, y'all! How are you 😁 ?")
        # END encode
        # START print_tokens
        print(output.tokens)
        # ["Hello", ",", "y", "'", "all", "!", "How", "are", "you", "[UNK]", "?"]
        # END print_tokens
        assert output.tokens == [
            "Hello",
            ",",
            "y",
            "'",
            "all",
            "!",
            "How",
            "are",
            "you",
            "[UNK]",
            "?",
        ]
        # START print_ids
        print(output.ids)
        # [27253, 16, 93, 11, 5097, 5, 7961, 5112, 6218, 0, 35]
        # END print_ids
        assert output.ids == [27253, 16, 93, 11, 5097, 5, 7961, 5112, 6218, 0, 35]
        # START print_offsets
        print(output.offsets[9])
        # (26, 27)
        # END print_offsets
        assert output.offsets[9] == (26, 27)
        # START use_offsets
        sentence = "Hello, y'all! How are you 😁 ?"
        sentence[26:27]
        # "😁"
        # END use_offsets
        assert sentence[26:27] == "😁"
        # START check_sep
        tokenizer.token_to_id("[SEP]")
        # 2
        # END check_sep
        assert tokenizer.token_to_id("[SEP]") == 2
        # START init_template_processing
        from tokenizers.processors import TemplateProcessing

        tokenizer.post_processor = TemplateProcessing(
            single="[CLS] $A [SEP]",
            pair="[CLS] $A [SEP] $B:1 [SEP]:1",
            special_tokens=[
                ("[CLS]", tokenizer.token_to_id("[CLS]")),
                ("[SEP]", tokenizer.token_to_id("[SEP]")),
            ],
        )
        # END init_template_processing
        # START print_special_tokens
        output = tokenizer.encode("Hello, y'all! How are you 😁 ?")
        print(output.tokens)
        # ["[CLS]", "Hello", ",", "y", "'", "all", "!", "How", "are", "you", "[UNK]", "?", "[SEP]"]
        # END print_special_tokens
        assert output.tokens == [
            "[CLS]",
            "Hello",
            ",",
            "y",
            "'",
            "all",
            "!",
            "How",
            "are",
            "you",
            "[UNK]",
            "?",
            "[SEP]",
        ]
        # START print_special_tokens_pair
        output = tokenizer.encode("Hello, y'all!", "How are you 😁 ?")
        print(output.tokens)
        # ["[CLS]", "Hello", ",", "y", "'", "all", "!", "[SEP]", "How", "are", "you", "[UNK]", "?", "[SEP]"]
        # END print_special_tokens_pair
        assert output.tokens == [
            "[CLS]",
            "Hello",
            ",",
            "y",
            "'",
            "all",
            "!",
            "[SEP]",
            "How",
            "are",
            "you",
            "[UNK]",
            "?",
            "[SEP]",
        ]
        # START print_type_ids
        print(output.type_ids)
        # [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]
        # END print_type_ids
        assert output.type_ids == [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]
        # START encode_batch
        output = tokenizer.encode_batch(["Hello, y'all!", "How are you 😁 ?"])
        # END encode_batch
        # START encode_batch_pair
        output = tokenizer.encode_batch(
            [["Hello, y'all!", "How are you 😁 ?"], ["Hello to you too!", "I'm fine, thank you!"]]
        )
        # END encode_batch_pair
        # START enable_padding
        tokenizer.enable_padding(pad_id=3, pad_token="[PAD]")
        # END enable_padding
        # START print_batch_tokens
        output = tokenizer.encode_batch(["Hello, y'all!", "How are you 😁 ?"])
        print(output[1].tokens)
        # ["[CLS]", "How", "are", "you", "[UNK]", "?", "[SEP]", "[PAD]"]
        # END print_batch_tokens
        assert output[1].tokens == ["[CLS]", "How", "are", "you", "[UNK]", "?", "[SEP]", "[PAD]"]
        # START print_attention_mask
        print(output[1].attention_mask)
        # [1, 1, 1, 1, 1, 1, 1, 0]
        # END print_attention_mask
        assert output[1].attention_mask == [1, 1, 1, 1, 1, 1, 1, 0]


if __name__ == "__main__":
    from urllib import request
    from zipfile import ZipFile
    import os

    if not os.path.isdir("data/wikitext-103-raw"):
        print("Downloading wikitext-103...")
        wiki_text, _ = request.urlretrieve(
            "https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-raw-v1.zip"
        )
        with ZipFile(wiki_text, "r") as z:
            print("Unzipping in data...")
            z.extractall("data")

    print("Now training...")
    TestQuicktour.slow_train()

View File: bindings/python/tests/utils.py

@@ -6,8 +6,8 @@ import pytest
 DATA_PATH = os.path.join("tests", "data")


-def download(url):
-    filename = url.rsplit("/")[-1]
+def download(url, with_filename=None):
+    filename = with_filename if with_filename is not None else url.rsplit("/")[-1]
     filepath = os.path.join(DATA_PATH, filename)
     if not os.path.exists(filepath):
         with open(filepath, "wb") as f:
@@ -82,6 +82,14 @@ def albert_base(data_dir):
     )


+@pytest.fixture(scope="session")
+def doc_wiki_tokenizer(data_dir):
+    return download(
+        "https://s3.amazonaws.com/models.huggingface.co/bert/anthony/doc-quicktour/tokenizer.json",
+        "tokenizer-wiki.json",
+    )
+
+
 def multiprocessing_with_parallelism(tokenizer, enabled: bool):
     """
     This helper can be used to test that disabling parallelism avoids dead locks when the
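
For context, here is a hedged sketch of the updated helper with a body filled in: the hunk above stops inside download, so the urllib-based fetch and the return of the local path below are assumptions rather than part of the diff (the new fixture and test_quicktour do rely on the helper returning a path).

    import os
    from urllib import request

    DATA_PATH = os.path.join("tests", "data")


    def download(url, with_filename=None):
        # New optional argument: a caller can pick the local file name instead of
        # deriving it from the URL (doc_wiki_tokenizer passes "tokenizer-wiki.json").
        filename = with_filename if with_filename is not None else url.rsplit("/")[-1]
        filepath = os.path.join(DATA_PATH, filename)
        if not os.path.exists(filepath):
            with open(filepath, "wb") as f:
                f.write(request.urlopen(url).read())  # assumed fetch, not shown in the hunk
        return filepath

The new doc_wiki_tokenizer fixture then simply calls this helper with an explicit file name, so test_quicktour can fall back to a pre-trained tokenizer instead of retraining on wikitext-103.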

View File: docs/source/quicktour.rst

@@ -36,21 +36,24 @@ documentation. Here, training the tokenizer means it will learn merge rules by:
 The main API of the library is the class :class:`~tokenizers.Tokenizer`, here is how we instantiate
 one with a BPE model:

-.. code-block:: python
+.. only:: python

-    from tokenizers import Tokenizer
-    from tokenizers.models import BPE
-
-    tokenizer = Tokenizer(BPE())
+    .. literalinclude:: ../../bindings/python/tests/documentation/test_quicktour.py
+        :language: python
+        :start-after: START init_tokenizer
+        :end-before: END init_tokenizer
+        :dedent: 8

 To train our tokenizer on the wikitext files, we will need to instantiate a `trainer`, in this case
 a :class:`~tokenizers.BpeTrainer`:

-.. code-block:: python
+.. only:: python

-    from tokenizers.trainers import BpeTrainer
-
-    trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
+    .. literalinclude:: ../../bindings/python/tests/documentation/test_quicktour.py
+        :language: python
+        :start-after: START init_trainer
+        :end-before: END init_trainer
+        :dedent: 8

 We can set the training arguments like :obj:`vocab_size` or :obj:`min_frequency` (here left at their
 default values of 30,000 and 0) but the most important part is to give the :obj:`special_tokens` we
@@ -69,43 +72,59 @@ pre-tokenizer will ensure no token is bigger than a word returned by the pre-tok
 to train a subword BPE tokenizer, and we will use the easiest pre-tokenizer possible by splitting
 on whitespace.

-.. code-block:: python
+.. only:: python

-    from tokenizers.pre_tokenizers import Whitespace
-
-    tokenizer.pre_tokenizer = Whitespace()
+    .. literalinclude:: ../../bindings/python/tests/documentation/test_quicktour.py
+        :language: python
+        :start-after: START init_pretok
+        :end-before: END init_pretok
+        :dedent: 8

 Now, we can just call the :meth:`~tokenizers.Tokenizer.train` method with any list of files we want
 to use:

-.. code-block:: python
+.. only:: python

-    files = [f"wikitext-103-raw/wiki.{split}.raw" for split in ["test", "train", "valid"]]
-    tokenizer.train(trainer, files)
+    .. literalinclude:: ../../bindings/python/tests/documentation/test_quicktour.py
+        :language: python
+        :start-after: START train
+        :end-before: END train
+        :dedent: 8

 This should only take a few seconds to train our tokenizer on the full wikitext dataset! Once this
 is done, we need to save the model and reinstantiate it with the unknown token, or this token won't
 be used. This will be simplified in a further release, to let you set the :obj:`unk_token` when
 first instantiating the model.

-.. code-block:: python
+.. only:: python

-    files = tokenizer.model.save("pretrained", "wiki")
-    tokenizer.model = BPE(*files, unk_token="[UNK]")
+    .. literalinclude:: ../../bindings/python/tests/documentation/test_quicktour.py
+        :language: python
+        :start-after: START reload_model
+        :end-before: END reload_model
+        :dedent: 8

 To save the tokenizer in one file that contains all its configuration and vocabulary, just use the
 :meth:`~tokenizers.Tokenizer.save` method:

-.. code-block:: python
+.. only:: python

-    tokenizer.save("pretrained/wiki.json")
+    .. literalinclude:: ../../bindings/python/tests/documentation/test_quicktour.py
+        :language: python
+        :start-after: START save
+        :end-before: END save
+        :dedent: 8

 and you can reload your tokenizer from that file with the :meth:`~tokenizers.Tokenizer.from_file`
 class method:

-.. code-block:: python
+.. only:: python

-    tokenizer = Tokenizer.from_file("pretrained/wiki.json")
+    .. literalinclude:: ../../bindings/python/tests/documentation/test_quicktour.py
+        :language: python
+        :start-after: START reload_tokenizer
+        :end-before: END reload_tokenizer
+        :dedent: 12

 Using the tokenizer
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -113,9 +132,13 @@ Using the tokenizer
 Now that we have trained a tokenizer, we can use it on any text we want with the
 :meth:`~tokenizers.Tokenizer.encode` method:

-.. code-block:: python
+.. only:: python

-    output = tokenizer.encode("Hello, y'all! How are you 😁 ?")
+    .. literalinclude:: ../../bindings/python/tests/documentation/test_quicktour.py
+        :language: python
+        :start-after: START encode
+        :end-before: END encode
+        :dedent: 8

 This applied the full pipeline of the tokenizer on the text, returning an
 :class:`~tokenizers.Encoding` object. To learn more about this pipeline, and how to apply (or
@@ -125,18 +148,24 @@ This :class:`~tokenizers.Encoding` object then has all the attributes you need f
 learning model (or other). The :obj:`tokens` attribute contains the segmentation of your text in
 tokens:

-.. code-block:: python
+.. only:: python

-    print(output.tokens)
-    # ["Hello", ",", "y", "'", "all", "!", "How", "are", "you", "[UNK]", "?"]
+    .. literalinclude:: ../../bindings/python/tests/documentation/test_quicktour.py
+        :language: python
+        :start-after: START print_tokens
+        :end-before: END print_tokens
+        :dedent: 8

 Similarly, the :obj:`ids` attribute will contain the index of each of those tokens in the
 tokenizer's vocabulary:

-.. code-block:: python
+.. only:: python

-    print(output.ids)
-    # [27194, 16, 93, 11, 5068, 5, 7928, 5083, 6190, 0, 35]
+    .. literalinclude:: ../../bindings/python/tests/documentation/test_quicktour.py
+        :language: python
+        :start-after: START print_ids
+        :end-before: END print_ids
+        :dedent: 8

 An important feature of the 🤗 Tokenizers library is that it comes with full alignment tracking,
 meaning you can always get the part of your original sentence that corresponds to a given token.
@@ -144,18 +173,23 @@ Those are stored in the :obj:`offsets` attribute of our :class:`~tokenizers.Enco
 instance, let's assume we would want to find back what caused the :obj:`"[UNK]"` token to appear,
 which is the token at index 9 in the list, we can just ask for the offset at the index:

-.. code-block:: python
+.. only:: python

-    print(output.offsets[9])
-    # (26, 27)
+    .. literalinclude:: ../../bindings/python/tests/documentation/test_quicktour.py
+        :language: python
+        :start-after: START print_offsets
+        :end-before: END print_offsets
+        :dedent: 8

 and those are the indices that correspond to the emoji in the original sentence:

-.. code-block:: python
+.. only:: python

-    sentence = "Hello, y'all! How are you 😁 ?"
-    sentence[26:27]
-    # "😁"
+    .. literalinclude:: ../../bindings/python/tests/documentation/test_quicktour.py
+        :language: python
+        :start-after: START use_offsets
+        :end-before: END use_offsets
+        :dedent: 8

 Post-processing
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -169,25 +203,23 @@ When we built our tokenizer, we set :obj:`"[CLS]"` and :obj:`"[SEP]"` in positio
 list of special tokens, so this should be their IDs. To double-check, we can use the
 :meth:`~tokenizers.Tokenizer.token_to_id` method:

-.. code-block:: python
+.. only:: python

-    tokenizer.token_to_id("[SEP]")
-    # 2
+    .. literalinclude:: ../../bindings/python/tests/documentation/test_quicktour.py
+        :language: python
+        :start-after: START check_sep
+        :end-before: END check_sep
+        :dedent: 8

 Here is how we can set the post-processing to give us the traditional BERT inputs:

-.. code-block:: python
+.. only:: python

-    from tokenizers.processors import TemplateProcessing
-
-    tokenizer.post_processor = TemplateProcessing(
-        single="[CLS] $A [SEP]",
-        pair="[CLS] $A [SEP] $B:1 [SEP]:1",
-        special_tokens=[
-            ("[CLS]", tokenizer.token_to_id("[CLS]")),
-            ("[SEP]", tokenizer.token_to_id("[SEP]"))
-        ],
-    )
+    .. literalinclude:: ../../bindings/python/tests/documentation/test_quicktour.py
+        :language: python
+        :start-after: START init_template_processing
+        :end-before: END init_template_processing
+        :dedent: 8

 Let's go over this snippet of code in more details. First we specify the template for single
 sentences: those should have the form :obj:`"[CLS] $A [SEP]"` where :obj:`$A` represents our
@@ -203,27 +235,34 @@ Lastly, we specify the special tokens we used and their IDs in our tokenizer's v

 To check out this worked properly, let's try to encode the same sentence as before:

-.. code-block:: python
+.. only:: python

-    output = tokenizer.encode("Hello, y'all! How are you 😁 ?")
-    print(output.tokens)
-    # ["[CLS]", "Hello", ",", "y", "'", "all", "!", "How", "are", "you", "[UNK]", "?", "[SEP]"]
+    .. literalinclude:: ../../bindings/python/tests/documentation/test_quicktour.py
+        :language: python
+        :start-after: START print_special_tokens
+        :end-before: END print_special_tokens
+        :dedent: 8

 To check the results on a pair of sentences, we just pass the two sentences to
 :meth:`~tokenizers.Tokenizer.encode`:

-.. code-block:: python
+.. only:: python

-    output = tokenizer.encode("Hello, y'all!", "How are you 😁 ?")
-    print(output.tokens)
-    # ["[CLS]", "Hello", ",", "y", "'", "all", "!", "[SEP]", "How", "are", "you", "[UNK]", "?", "[SEP]"]
+    .. literalinclude:: ../../bindings/python/tests/documentation/test_quicktour.py
+        :language: python
+        :start-after: START print_special_tokens_pair
+        :end-before: END print_special_tokens_pair
+        :dedent: 8

 You can then check the type IDs attributed to each token is correct with

-.. code-block:: python
+.. only:: python

-    print(output.type_ids)
-    # [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]
+    .. literalinclude:: ../../bindings/python/tests/documentation/test_quicktour.py
+        :language: python
+        :start-after: START print_type_ids
+        :end-before: END print_type_ids
+        :dedent: 8

 If you save your tokenizer with :meth:`~tokenizers.Tokenizer.save`, the post-processor will be saved
 along.
@@ -234,9 +273,13 @@ Encoding multiple sentences in a batch
 To get the full speed of the 🤗 Tokenizers library, it's best to process your texts by batches by
 using the :meth:`~tokenizers.Tokenizer.encode_batch` method:

-.. code-block:: python
+.. only:: python

-    output = tokenizer.encode_batch(["Hello, y'all!", "How are you 😁 ?"])
+    .. literalinclude:: ../../bindings/python/tests/documentation/test_quicktour.py
+        :language: python
+        :start-after: START encode_batch
+        :end-before: END encode_batch
+        :dedent: 8

 The output is then a list of :class:`~tokenizers.Encoding` objects like the ones we saw before. You
 can process together as many texts as you like, as long as it fits in memory.
@@ -245,38 +288,48 @@ To process a batch of sentences pairs, pass two lists to the
 :meth:`~tokenizers.Tokenizer.encode_batch` method: the list of sentences A and the list of sentences
 B:

-.. code-block:: python
+.. only:: python

-    output = tokenizer.encode_batch(
-        ["Hello, y'all!", "How are you 😁 ?"],
-        ["Hello to you too!", "I'm fine, thank you!"]
-    )
+    .. literalinclude:: ../../bindings/python/tests/documentation/test_quicktour.py
+        :language: python
+        :start-after: START encode_batch_pair
+        :end-before: END encode_batch_pair
+        :dedent: 8

 When encoding multiple sentences, you can automatically pad the outputs to the longest sentence
 present by using :meth:`~tokenizers.Tokenizer.enable_padding`, with the :obj:`pad_token` and its ID
 (which we can double-check the id for the padding token with
 :meth:`~tokenizers.Tokenizer.token_to_id` like before):

-.. code-block:: python
+.. only:: python

-    tokenizer.enable_padding(pad_id=3, pad_token="[PAD]")
+    .. literalinclude:: ../../bindings/python/tests/documentation/test_quicktour.py
+        :language: python
+        :start-after: START enable_padding
+        :end-before: END enable_padding
+        :dedent: 8

 We can set the :obj:`direction` of the padding (defaults to the right) or a given :obj:`length` if
 we want to pad every sample to that specific number (here we leave it unset to pad to the size of
 the longest text).

-.. code-block:: python
+.. only:: python

-    output = tokenizer.encode_batch(["Hello, y'all!", "How are you 😁 ?"])
-    print(output[1].tokens)
-    # ["[CLS]", "How", "are", "you", "[UNK]", "?", "[SEP]", "[PAD]"]
+    .. literalinclude:: ../../bindings/python/tests/documentation/test_quicktour.py
+        :language: python
+        :start-after: START print_batch_tokens
+        :end-before: END print_batch_tokens
+        :dedent: 8

 In this case, the `attention mask` generated by the tokenizer takes the padding into account:

-.. code-block:: python
+.. only:: python

-    print(output[1].attention_mask)
-    [1, 1, 1, 1, 1, 1, 1, 0]
+    .. literalinclude:: ../../bindings/python/tests/documentation/test_quicktour.py
+        :language: python
+        :start-after: START print_attention_mask
+        :end-before: END print_attention_mask
+        :dedent: 8


 .. _pretrained:
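
As a wrap-up, here is a condensed, hedged sketch of the flow that the directives above now include from test_quicktour.py. It is not part of the commit; it assumes data/tokenizer-wiki.json already exists (produced by the slow training path, or by reusing the downloaded tokenizer-wiki.json fixture), and every call and expected value is taken from the test file earlier in this commit.

    from tokenizers import Tokenizer
    from tokenizers.processors import TemplateProcessing

    # Assumes the tokenizer trained (or downloaded) above was saved to this path
    tokenizer = Tokenizer.from_file("data/tokenizer-wiki.json")

    # Encode a single sentence and inspect tokens and offsets
    output = tokenizer.encode("Hello, y'all! How are you 😁 ?")
    print(output.tokens)      # ["Hello", ",", "y", "'", "all", "!", "How", "are", "you", "[UNK]", "?"]
    print(output.offsets[9])  # (26, 27): the span of the emoji in the input

    # Post-process into BERT-style inputs
    tokenizer.post_processor = TemplateProcessing(
        single="[CLS] $A [SEP]",
        pair="[CLS] $A [SEP] $B:1 [SEP]:1",
        special_tokens=[
            ("[CLS]", tokenizer.token_to_id("[CLS]")),
            ("[SEP]", tokenizer.token_to_id("[SEP]")),
        ],
    )

    # Batch encoding with padding and the matching attention mask
    tokenizer.enable_padding(pad_id=3, pad_token="[PAD]")
    output = tokenizer.encode_batch(["Hello, y'all!", "How are you 😁 ?"])
    print(output[1].tokens)          # ["[CLS]", "How", "are", "you", "[UNK]", "?", "[SEP]", "[PAD]"]
    print(output[1].attention_mask)  # [1, 1, 1, 1, 1, 1, 1, 0]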