Mirror of https://github.com/mii443/tokenizers.git (synced 2025-08-22 16:25:30 +00:00)
Doc - Quicktour uses python tested code

bindings/python/tests/documentation/test_quicktour.py (new file, 194 lines)

@@ -0,0 +1,194 @@
from ..utils import data_dir, doc_wiki_tokenizer
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace


class TestQuicktour:
    # This method contains everything we don't want to run
    @staticmethod
    def slow_train():
        tokenizer, trainer = TestQuicktour.get_tokenizer_trainer()

        # START train
        files = [f"data/wikitext-103-raw/wiki.{split}.raw" for split in ["test", "train", "valid"]]
        tokenizer.train(trainer, files)
        # END train
        # START reload_model
        files = tokenizer.model.save("data", "wiki")
        tokenizer.model = BPE.from_files(*files, unk_token="[UNK]")
        # END reload_model
        # START save
        tokenizer.save("data/tokenizer-wiki.json")
        # END save

    @staticmethod
    def get_tokenizer_trainer():
        # START init_tokenizer
        from tokenizers import Tokenizer
        from tokenizers.models import BPE

        tokenizer = Tokenizer(BPE())
        # END init_tokenizer
        # START init_trainer
        from tokenizers.trainers import BpeTrainer

        trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
        # END init_trainer
        # START init_pretok
        from tokenizers.pre_tokenizers import Whitespace

        tokenizer.pre_tokenizer = Whitespace()
        # END init_pretok
        return tokenizer, trainer

    def test_quicktour(self, doc_wiki_tokenizer):
        def print(*args, **kwargs):
            pass

        try:
            # START reload_tokenizer
            tokenizer = Tokenizer.from_file("data/tokenizer-wiki.json")
            # END reload_tokenizer
        except Exception:
            tokenizer = Tokenizer.from_file(doc_wiki_tokenizer)
        # START encode
        output = tokenizer.encode("Hello, y'all! How are you 😁 ?")
        # END encode
        # START print_tokens
        print(output.tokens)
        # ["Hello", ",", "y", "'", "all", "!", "How", "are", "you", "[UNK]", "?"]
        # END print_tokens
        assert output.tokens == [
            "Hello",
            ",",
            "y",
            "'",
            "all",
            "!",
            "How",
            "are",
            "you",
            "[UNK]",
            "?",
        ]
        # START print_ids
        print(output.ids)
        # [27253, 16, 93, 11, 5097, 5, 7961, 5112, 6218, 0, 35]
        # END print_ids
        assert output.ids == [27253, 16, 93, 11, 5097, 5, 7961, 5112, 6218, 0, 35]
        # START print_offsets
        print(output.offsets[9])
        # (26, 27)
        # END print_offsets
        assert output.offsets[9] == (26, 27)
        # START use_offsets
        sentence = "Hello, y'all! How are you 😁 ?"
        sentence[26:27]
        # "😁"
        # END use_offsets
        assert sentence[26:27] == "😁"
        # START check_sep
        tokenizer.token_to_id("[SEP]")
        # 2
        # END check_sep
        assert tokenizer.token_to_id("[SEP]") == 2
        # START init_template_processing
        from tokenizers.processors import TemplateProcessing

        tokenizer.post_processor = TemplateProcessing(
            single="[CLS] $A [SEP]",
            pair="[CLS] $A [SEP] $B:1 [SEP]:1",
            special_tokens=[
                ("[CLS]", tokenizer.token_to_id("[CLS]")),
                ("[SEP]", tokenizer.token_to_id("[SEP]")),
            ],
        )
        # END init_template_processing
        # START print_special_tokens
        output = tokenizer.encode("Hello, y'all! How are you 😁 ?")
        print(output.tokens)
        # ["[CLS]", "Hello", ",", "y", "'", "all", "!", "How", "are", "you", "[UNK]", "?", "[SEP]"]
        # END print_special_tokens
        assert output.tokens == [
            "[CLS]",
            "Hello",
            ",",
            "y",
            "'",
            "all",
            "!",
            "How",
            "are",
            "you",
            "[UNK]",
            "?",
            "[SEP]",
        ]
        # START print_special_tokens_pair
        output = tokenizer.encode("Hello, y'all!", "How are you 😁 ?")
        print(output.tokens)
        # ["[CLS]", "Hello", ",", "y", "'", "all", "!", "[SEP]", "How", "are", "you", "[UNK]", "?", "[SEP]"]
        # END print_special_tokens_pair
        assert output.tokens == [
            "[CLS]",
            "Hello",
            ",",
            "y",
            "'",
            "all",
            "!",
            "[SEP]",
            "How",
            "are",
            "you",
            "[UNK]",
            "?",
            "[SEP]",
        ]
        # START print_type_ids
        print(output.type_ids)
        # [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]
        # END print_type_ids
        assert output.type_ids == [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]
        # START encode_batch
        output = tokenizer.encode_batch(["Hello, y'all!", "How are you 😁 ?"])
        # END encode_batch
        # START encode_batch_pair
        output = tokenizer.encode_batch(
            [["Hello, y'all!", "How are you 😁 ?"], ["Hello to you too!", "I'm fine, thank you!"]]
        )
        # END encode_batch_pair
        # START enable_padding
        tokenizer.enable_padding(pad_id=3, pad_token="[PAD]")
        # END enable_padding
        # START print_batch_tokens
        output = tokenizer.encode_batch(["Hello, y'all!", "How are you 😁 ?"])
        print(output[1].tokens)
        # ["[CLS]", "How", "are", "you", "[UNK]", "?", "[SEP]", "[PAD]"]
        # END print_batch_tokens
        assert output[1].tokens == ["[CLS]", "How", "are", "you", "[UNK]", "?", "[SEP]", "[PAD]"]
        # START print_attention_mask
        print(output[1].attention_mask)
        # [1, 1, 1, 1, 1, 1, 1, 0]
        # END print_attention_mask
        assert output[1].attention_mask == [1, 1, 1, 1, 1, 1, 1, 0]


if __name__ == "__main__":
    from urllib import request
    from zipfile import ZipFile
    import os

    if not os.path.isdir("data/wikitext-103-raw"):
        print("Downloading wikitext-103...")
        wiki_text, _ = request.urlretrieve(
            "https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-raw-v1.zip"
        )
        with ZipFile(wiki_text, "r") as z:
            print("Unzipping in data...")
            z.extractall("data")

    print("Now training...")
    TestQuicktour.slow_train()

bindings/python/tests/utils.py

@@ -6,8 +6,8 @@ import pytest
DATA_PATH = os.path.join("tests", "data")


-def download(url):
-    filename = url.rsplit("/")[-1]
+def download(url, with_filename=None):
+    filename = with_filename if with_filename is not None else url.rsplit("/")[-1]
    filepath = os.path.join(DATA_PATH, filename)
    if not os.path.exists(filepath):
        with open(filepath, "wb") as f:
@@ -82,6 +82,14 @@ def albert_base(data_dir):
    )


+@pytest.fixture(scope="session")
+def doc_wiki_tokenizer(data_dir):
+    return download(
+        "https://s3.amazonaws.com/models.huggingface.co/bert/anthony/doc-quicktour/tokenizer.json",
+        "tokenizer-wiki.json",
+    )
+
+
def multiprocessing_with_parallelism(tokenizer, enabled: bool):
    """
    This helper can be used to test that disabling parallelism avoids dead locks when the

docs/source/quicktour.rst

@@ -36,21 +36,24 @@ documentation. Here, training the tokenizer means it will learn merge rules by:
The main API of the library is the class :class:`~tokenizers.Tokenizer`, here is how we instantiate
one with a BPE model:

-.. code-block:: python
+.. only:: python

-    from tokenizers import Tokenizer
-    from tokenizers.models import BPE
-
-    tokenizer = Tokenizer(BPE())
+    .. literalinclude:: ../../bindings/python/tests/documentation/test_quicktour.py
+        :language: python
+        :start-after: START init_tokenizer
+        :end-before: END init_tokenizer
+        :dedent: 8

To train our tokenizer on the wikitext files, we will need to instantiate a `trainer`, in this case
a :class:`~tokenizers.BpeTrainer`:

-.. code-block:: python
+.. only:: python

-    from tokenizers.trainers import BpeTrainer
-
-    trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
+    .. literalinclude:: ../../bindings/python/tests/documentation/test_quicktour.py
+        :language: python
+        :start-after: START init_trainer
+        :end-before: END init_trainer
+        :dedent: 8

We can set the training arguments like :obj:`vocab_size` or :obj:`min_frequency` (here left at their
default values of 30,000 and 0) but the most important part is to give the :obj:`special_tokens` we
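
Note (illustration, not part of the patch): the paragraph above mentions :obj:`vocab_size` and :obj:`min_frequency` without showing them in code; a minimal sketch of the same trainer with those optional arguments written out, using the defaults quoted in the text, could look like:

    from tokenizers.trainers import BpeTrainer

    trainer = BpeTrainer(
        vocab_size=30000,  # default quoted in the text
        min_frequency=0,   # default quoted in the text
        special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
    )
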
@@ -69,43 +72,59 @@ pre-tokenizer will ensure no token is bigger than a word returned by the pre-tok
to train a subword BPE tokenizer, and we will use the easiest pre-tokenizer possible by splitting
on whitespace.

-.. code-block:: python
+.. only:: python

-    from tokenizers.pre_tokenizers import Whitespace
-
-    tokenizer.pre_tokenizer = Whitespace()
+    .. literalinclude:: ../../bindings/python/tests/documentation/test_quicktour.py
+        :language: python
+        :start-after: START init_pretok
+        :end-before: END init_pretok
+        :dedent: 8

Now, we can just call the :meth:`~tokenizers.Tokenizer.train` method with any list of files we want
to use:

-.. code-block:: python
+.. only:: python

-    files = [f"wikitext-103-raw/wiki.{split}.raw" for split in ["test", "train", "valid"]]
-    tokenizer.train(trainer, files)
+    .. literalinclude:: ../../bindings/python/tests/documentation/test_quicktour.py
+        :language: python
+        :start-after: START train
+        :end-before: END train
+        :dedent: 8

This should only take a few seconds to train our tokenizer on the full wikitext dataset! Once this
is done, we need to save the model and reinstantiate it with the unknown token, or this token won't
be used. This will be simplified in a further release, to let you set the :obj:`unk_token` when
first instantiating the model.

-.. code-block:: python
+.. only:: python

-    files = tokenizer.model.save("pretrained", "wiki")
-    tokenizer.model = BPE(*files, unk_token="[UNK]")
+    .. literalinclude:: ../../bindings/python/tests/documentation/test_quicktour.py
+        :language: python
+        :start-after: START reload_model
+        :end-before: END reload_model
+        :dedent: 8

To save the tokenizer in one file that contains all its configuration and vocabulary, just use the
:meth:`~tokenizers.Tokenizer.save` method:

-.. code-block:: python
+.. only:: python

-    tokenizer.save("pretrained/wiki.json")
+    .. literalinclude:: ../../bindings/python/tests/documentation/test_quicktour.py
+        :language: python
+        :start-after: START save
+        :end-before: END save
+        :dedent: 8

and you can reload your tokenizer from that file with the :meth:`~tokenizers.Tokenizer.from_file`
class method:

-.. code-block:: python
+.. only:: python

-    tokenizer = Tokenizer.from_file("pretrained/wiki.json")
+    .. literalinclude:: ../../bindings/python/tests/documentation/test_quicktour.py
+        :language: python
+        :start-after: START reload_tokenizer
+        :end-before: END reload_tokenizer
+        :dedent: 12

Using the tokenizer
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
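
Note (illustration, not part of the patch): stitched together from the snippets in test_quicktour.py above, the train / save model / reload model / save tokenizer / reload tokenizer flow documented by this section reads roughly as follows (paths follow the test file and are illustrative):

    from tokenizers import Tokenizer
    from tokenizers.models import BPE
    from tokenizers.trainers import BpeTrainer
    from tokenizers.pre_tokenizers import Whitespace

    tokenizer = Tokenizer(BPE())
    tokenizer.pre_tokenizer = Whitespace()
    trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])

    files = [f"data/wikitext-103-raw/wiki.{split}.raw" for split in ["test", "train", "valid"]]
    tokenizer.train(trainer, files)

    # Save and re-instantiate the model so the unknown token is actually used
    files = tokenizer.model.save("data", "wiki")
    tokenizer.model = BPE.from_files(*files, unk_token="[UNK]")

    # One-file serialization of the whole tokenizer, then reload
    tokenizer.save("data/tokenizer-wiki.json")
    tokenizer = Tokenizer.from_file("data/tokenizer-wiki.json")
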
@@ -113,9 +132,13 @@ Using the tokenizer
Now that we have trained a tokenizer, we can use it on any text we want with the
:meth:`~tokenizers.Tokenizer.encode` method:

-.. code-block:: python
+.. only:: python

-    output = tokenizer.encode("Hello, y'all! How are you 😁 ?")
+    .. literalinclude:: ../../bindings/python/tests/documentation/test_quicktour.py
+        :language: python
+        :start-after: START encode
+        :end-before: END encode
+        :dedent: 8

This applied the full pipeline of the tokenizer on the text, returning an
:class:`~tokenizers.Encoding` object. To learn more about this pipeline, and how to apply (or
@@ -125,18 +148,24 @@ This :class:`~tokenizers.Encoding` object then has all the attributes you need f
learning model (or other). The :obj:`tokens` attribute contains the segmentation of your text in
tokens:

-.. code-block:: python
+.. only:: python

-    print(output.tokens)
-    # ["Hello", ",", "y", "'", "all", "!", "How", "are", "you", "[UNK]", "?"]
+    .. literalinclude:: ../../bindings/python/tests/documentation/test_quicktour.py
+        :language: python
+        :start-after: START print_tokens
+        :end-before: END print_tokens
+        :dedent: 8

Similarly, the :obj:`ids` attribute will contain the index of each of those tokens in the
tokenizer's vocabulary:

-.. code-block:: python
+.. only:: python

-    print(output.ids)
-    # [27194, 16, 93, 11, 5068, 5, 7928, 5083, 6190, 0, 35]
+    .. literalinclude:: ../../bindings/python/tests/documentation/test_quicktour.py
+        :language: python
+        :start-after: START print_ids
+        :end-before: END print_ids
+        :dedent: 8

An important feature of the 🤗 Tokenizers library is that it comes with full alignment tracking,
meaning you can always get the part of your original sentence that corresponds to a given token.
@@ -144,18 +173,23 @@ Those are stored in the :obj:`offsets` attribute of our :class:`~tokenizers.Enco
instance, let's assume we would want to find back what caused the :obj:`"[UNK]"` token to appear,
which is the token at index 9 in the list, we can just ask for the offset at the index:

-.. code-block:: python
+.. only:: python

-    print(output.offsets[9])
-    # (26, 27)
+    .. literalinclude:: ../../bindings/python/tests/documentation/test_quicktour.py
+        :language: python
+        :start-after: START print_offsets
+        :end-before: END print_offsets
+        :dedent: 8

and those are the indices that correspond to the emoji in the original sentence:

-.. code-block:: python
+.. only:: python

-    sentence = "Hello, y'all! How are you 😁 ?"
-    sentence[26:27]
-    # "😁"
+    .. literalinclude:: ../../bindings/python/tests/documentation/test_quicktour.py
+        :language: python
+        :start-after: START use_offsets
+        :end-before: END use_offsets
+        :dedent: 8

Post-processing
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
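
Note (illustration, not part of the patch): offsets are plain (start, stop) index pairs into the original string, so the same lookup works for every token, not only the unknown one. A small sketch reusing output and sentence from the snippets above:

    spans = [sentence[start:stop] for start, stop in output.offsets]
    # For the quicktour sentence, spans[9] == "😁"
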
@@ -169,25 +203,23 @@ When we built our tokenizer, we set :obj:`"[CLS]"` and :obj:`"[SEP]"` in positio
list of special tokens, so this should be their IDs. To double-check, we can use the
:meth:`~tokenizers.Tokenizer.token_to_id` method:

-.. code-block:: python
+.. only:: python

-    tokenizer.token_to_id("[SEP]")
-    # 2
+    .. literalinclude:: ../../bindings/python/tests/documentation/test_quicktour.py
+        :language: python
+        :start-after: START check_sep
+        :end-before: END check_sep
+        :dedent: 8

Here is how we can set the post-processing to give us the traditional BERT inputs:

-.. code-block:: python
+.. only:: python

-    from tokenizers.processors import TemplateProcessing
-
-    tokenizer.post_processor = TemplateProcessing(
-        single="[CLS] $A [SEP]",
-        pair="[CLS] $A [SEP] $B:1 [SEP]:1",
-        special_tokens=[
-            ("[CLS]", tokenizer.token_to_id("[CLS]")),
-            ("[SEP]", tokenizer.token_to_id("[SEP]"))
-        ],
-    )
+    .. literalinclude:: ../../bindings/python/tests/documentation/test_quicktour.py
+        :language: python
+        :start-after: START init_template_processing
+        :end-before: END init_template_processing
+        :dedent: 8

Let's go over this snippet of code in more details. First we specify the template for single
sentences: those should have the form :obj:`"[CLS] $A [SEP]"` where :obj:`$A` represents our
@@ -203,27 +235,34 @@ Lastly, we specify the special tokens we used and their IDs in our tokenizer's v

To check out this worked properly, let's try to encode the same sentence as before:

-.. code-block:: python
+.. only:: python

-    output = tokenizer.encode("Hello, y'all! How are you 😁 ?")
-    print(output.tokens)
-    # ["[CLS]", "Hello", ",", "y", "'", "all", "!", "How", "are", "you", "[UNK]", "?", "[SEP]"]
+    .. literalinclude:: ../../bindings/python/tests/documentation/test_quicktour.py
+        :language: python
+        :start-after: START print_special_tokens
+        :end-before: END print_special_tokens
+        :dedent: 8

To check the results on a pair of sentences, we just pass the two sentences to
:meth:`~tokenizers.Tokenizer.encode`:

-.. code-block:: python
+.. only:: python

-    output = tokenizer.encode("Hello, y'all!", "How are you 😁 ?")
-    print(output.tokens)
-    # ["[CLS]", "Hello", ",", "y", "'", "all", "!", "[SEP]", "How", "are", "you", "[UNK]", "?", "[SEP]"]
+    .. literalinclude:: ../../bindings/python/tests/documentation/test_quicktour.py
+        :language: python
+        :start-after: START print_special_tokens_pair
+        :end-before: END print_special_tokens_pair
+        :dedent: 8

You can then check the type IDs attributed to each token is correct with

-.. code-block:: python
+.. only:: python

-    print(output.type_ids)
-    # [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]
+    .. literalinclude:: ../../bindings/python/tests/documentation/test_quicktour.py
+        :language: python
+        :start-after: START print_type_ids
+        :end-before: END print_type_ids
+        :dedent: 8

If you save your tokenizer with :meth:`~tokenizers.Tokenizer.save`, the post-processor will be saved
along.
@@ -234,9 +273,13 @@ Encoding multiple sentences in a batch
To get the full speed of the 🤗 Tokenizers library, it's best to process your texts by batches by
using the :meth:`~tokenizers.Tokenizer.encode_batch` method:

-.. code-block:: python
+.. only:: python

-    output = tokenizer.encode_batch(["Hello, y'all!", "How are you 😁 ?"])
+    .. literalinclude:: ../../bindings/python/tests/documentation/test_quicktour.py
+        :language: python
+        :start-after: START encode_batch
+        :end-before: END encode_batch
+        :dedent: 8

The output is then a list of :class:`~tokenizers.Encoding` objects like the ones we saw before. You
can process together as many texts as you like, as long as it fits in memory.
@@ -245,38 +288,48 @@ To process a batch of sentences pairs, pass two lists to the
:meth:`~tokenizers.Tokenizer.encode_batch` method: the list of sentences A and the list of sentences
B:

-.. code-block:: python
+.. only:: python

-    output = tokenizer.encode_batch(
-        ["Hello, y'all!", "How are you 😁 ?"],
-        ["Hello to you too!", "I'm fine, thank you!"]
-    )
+    .. literalinclude:: ../../bindings/python/tests/documentation/test_quicktour.py
+        :language: python
+        :start-after: START encode_batch_pair
+        :end-before: END encode_batch_pair
+        :dedent: 8

When encoding multiple sentences, you can automatically pad the outputs to the longest sentence
present by using :meth:`~tokenizers.Tokenizer.enable_padding`, with the :obj:`pad_token` and its ID
(which we can double-check the id for the padding token with
:meth:`~tokenizers.Tokenizer.token_to_id` like before):

-.. code-block:: python
+.. only:: python

-    tokenizer.enable_padding(pad_id=3, pad_token="[PAD]")
+    .. literalinclude:: ../../bindings/python/tests/documentation/test_quicktour.py
+        :language: python
+        :start-after: START enable_padding
+        :end-before: END enable_padding
+        :dedent: 8

We can set the :obj:`direction` of the padding (defaults to the right) or a given :obj:`length` if
we want to pad every sample to that specific number (here we leave it unset to pad to the size of
the longest text).

-.. code-block:: python
+.. only:: python

-    output = tokenizer.encode_batch(["Hello, y'all!", "How are you 😁 ?"])
-    print(output[1].tokens)
-    # ["[CLS]", "How", "are", "you", "[UNK]", "?", "[SEP]", "[PAD]"]
+    .. literalinclude:: ../../bindings/python/tests/documentation/test_quicktour.py
+        :language: python
+        :start-after: START print_batch_tokens
+        :end-before: END print_batch_tokens
+        :dedent: 8

In this case, the `attention mask` generated by the tokenizer takes the padding into account:

-.. code-block:: python
+.. only:: python

-    print(output[1].attention_mask)
-    [1, 1, 1, 1, 1, 1, 1, 0]
+    .. literalinclude:: ../../bindings/python/tests/documentation/test_quicktour.py
+        :language: python
+        :start-after: START print_attention_mask
+        :end-before: END print_attention_mask
+        :dedent: 8

.. _pretrained:
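
Note (illustration, not part of the patch): a hedged sketch of the optional padding arguments mentioned above; the values are made up for illustration:

    tokenizer.enable_padding(
        pad_id=3,
        pad_token="[PAD]",
        direction="right",  # the default mentioned in the text
        length=16,          # pad every sample to a fixed size instead of the longest in the batch
    )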