Doc - Update Bert example on the Pipeline page

Anthony MOI
2020-10-28 15:51:07 -04:00
committed by Anthony MOI
parent 5839348a46
commit 8b65c1f4bc
4 changed files with 333 additions and 44 deletions

bindings/node/examples/documentation/pipeline.test.ts

@@ -7,7 +7,7 @@ describe("pipelineExample", () => {
return globRequire("../../lib/" + path);
}
it("shows pipeline parts", async () => {
// START reload_tokenizer
let { Tokenizer } = require("tokenizers/bindings/tokenizer");
@@ -57,5 +57,64 @@ describe("pipelineExample", () => {
// START replace_pre_tokenizer
tokenizer.setPreTokenizer(preTokenizer)
// END replace_pre_tokenizer
// START setup_processor
let { templateProcessing } = require("tokenizers/bindings/processors");
tokenizer.setPostProcessor(templateProcessing(
"[CLS] $A [SEP]",
"[CLS] $A [SEP] $B:1 [SEP]:1",
[["[CLS]", 1], ["[SEP]", 2]]
));
// END setup_processor
});
it("shows a full bert example", async () => {
// START bert_setup_tokenizer
let { Tokenizer } = require("tokenizers/bindings/tokenizer");
let { WordPiece } = require("tokenizers/bindings/models");
let bert_tokenizer = new Tokenizer(WordPiece.empty());
// END bert_setup_tokenizer
// START bert_setup_normalizer
let { sequenceNormalizer, lowercaseNormalizer, nfdNormalizer, stripAccentsNormalizer }
= require("tokenizers/bindings/normalizers");
bert_tokenizer.setNormalizer(sequenceNormalizer([
nfdNormalizer(), lowercaseNormalizer(), stripAccentsNormalizer()
]))
// END bert_setup_normalizer
// START bert_setup_pre_tokenizer
let { whitespacePreTokenizer } = require("tokenizers/bindings/pre_tokenizers");
bert_tokenizer.setPreTokenizer(whitespacePreTokenizer());
// END bert_setup_pre_tokenizer
// START bert_setup_processor
let { templateProcessing } = require("tokenizers/bindings/processors");
bert_tokenizer.setPostProcessor(templateProcessing(
"[CLS] $A [SEP]",
"[CLS] $A [SEP] $B:1 [SEP]:1",
[["[CLS]", 1], ["[SEP]", 2]]
));
// END bert_setup_processor
// START bert_train_tokenizer
let { wordPieceTrainer } = require("tokenizers/bindings/trainers");
let { promisify } = require("util");
let trainer = wordPieceTrainer({
vocabSize: 30522,
specialTokens: ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]
});
let files = ["test", "train", "valid"].map(split => `data/wikitext-103-raw/wiki.${split}.raw`);
bert_tokenizer.train(trainer, files);
let model_files = bert_tokenizer.getModel().save("data", "bert-wiki");
let fromFile = promisify(WordPiece.fromFile);
bert_tokenizer.setModel(await fromFile(model_files[0], {
unkToken: "[UNK]"
}));
bert_tokenizer.save("data/bert-wiki.json")
// END bert_train_tokenizer
});
});

bindings/python/tests/documentation/test_pipeline.py

@@ -73,3 +73,59 @@ class TestPipeline:
# START replace_pre_tokenizer
tokenizer.pre_tokenizer = pre_tokenizer
# END replace_pre_tokenizer
# START setup_processor
from tokenizers.processors import TemplateProcessing
tokenizer.post_processor = TemplateProcessing(
single="[CLS] $A [SEP]",
pair="[CLS] $A [SEP] $B:1 [SEP]:1",
special_tokens=[("[CLS]", 1), ("[SEP]", 2)],
)
# END setup_processor
def test_bert_example(self):
# START bert_setup_tokenizer
from tokenizers import Tokenizer
from tokenizers.models import WordPiece
bert_tokenizer = Tokenizer(WordPiece())
# END bert_setup_tokenizer
# START bert_setup_normalizer
from tokenizers import normalizers
from tokenizers.normalizers import Lowercase, NFD, StripAccents
bert_tokenizer.normalizer = normalizers.Sequence([
NFD(), Lowercase(), StripAccents()
])
# END bert_setup_normalizer
# START bert_setup_pre_tokenizer
from tokenizers.pre_tokenizers import Whitespace
bert_tokenizer.pre_tokenizer = Whitespace()
# END bert_setup_pre_tokenizer
# START bert_setup_processor
from tokenizers.processors import TemplateProcessing
bert_tokenizer.post_processor = TemplateProcessing(
single="[CLS] $A [SEP]",
pair="[CLS] $A [SEP] $B:1 [SEP]:1",
special_tokens=[
("[CLS]", 1),
("[SEP]", 2),
]
)
# END bert_setup_processor
# START bert_train_tokenizer
from tokenizers.trainers import WordPieceTrainer
trainer = WordPieceTrainer(
vocab_size=30522, special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]
)
files = [f"data/wikitext-103-raw/wiki.{split}.raw" for split in ["test", "train", "valid"]]
bert_tokenizer.train(trainer, files)
model_files = bert_tokenizer.model.save("data", "bert-wiki")
bert_tokenizer.model = WordPiece(*model_files, unk_token="[UNK]")
bert_tokenizer.save("data/bert-wiki.json")
# END bert_train_tokenizer

docs/source/pipeline.rst

@@ -270,22 +270,36 @@ Post-Processing
----------------------------------------------------------------------------------------------------

Post-processing is the last step of the tokenization pipeline, to perform any additional
transformation to the :entity:`Encoding` before it's returned, like adding potential
special tokens.

As we saw in the quick tour, we can customize the post processor of a :entity:`Tokenizer`
by setting the corresponding attribute. For instance, here is how we can post-process to make the
inputs suitable for the BERT model:

.. only:: python

    .. literalinclude:: ../../bindings/python/tests/documentation/test_pipeline.py
        :language: python
        :start-after: START setup_processor
        :end-before: END setup_processor
        :dedent: 8

.. only:: rust

    .. literalinclude:: ../../tokenizers/tests/documentation.rs
        :language: rust
        :start-after: START pipeline_setup_processor
        :end-before: END pipeline_setup_processor
        :dedent: 4

.. only:: node

    .. literalinclude:: ../../bindings/node/examples/documentation/pipeline.test.ts
        :language: javascript
        :start-after: START setup_processor
        :end-before: END setup_processor
        :dedent: 8
Note that, unlike the pre-tokenizer or the normalizer, you don't need to retrain a tokenizer
after changing its post-processor.
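
To make the effect of this template concrete, here is a short Python sketch (an illustration, not
part of the included test snippets; it assumes the tokenizer loaded earlier on this page, whose
vocabulary maps ``[CLS]`` to id 1 and ``[SEP]`` to id 2, matching the values used above):

.. code-block:: python

    from tokenizers.processors import TemplateProcessing

    # Same template as above: wrap single sentences and pairs with [CLS]/[SEP],
    # and give the second sentence (and its closing [SEP]) type id 1.
    tokenizer.post_processor = TemplateProcessing(
        single="[CLS] $A [SEP]",
        pair="[CLS] $A [SEP] $B:1 [SEP]:1",
        special_tokens=[("[CLS]", 1), ("[SEP]", 2)],
    )

    output = tokenizer.encode("Hello, y'all!", "How are you?")
    print(output.tokens)    # ['[CLS]', ..., '[SEP]', ..., '[SEP]']
    print(output.type_ids)  # 0s for the first sentence, 1s for the second one and its final [SEP]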
@@ -296,66 +310,136 @@ All together: a BERT tokenizer from scratch
----------------------------------------------------------------------------------------------------

Let's put all those pieces together to build a BERT tokenizer. First, BERT relies on WordPiece, so
we instantiate a new :entity:`Tokenizer` with this model:

.. only:: python

    .. literalinclude:: ../../bindings/python/tests/documentation/test_pipeline.py
        :language: python
        :start-after: START bert_setup_tokenizer
        :end-before: END bert_setup_tokenizer
        :dedent: 8

.. only:: rust

    .. literalinclude:: ../../tokenizers/tests/documentation.rs
        :language: rust
        :start-after: START bert_setup_tokenizer
        :end-before: END bert_setup_tokenizer
        :dedent: 4

.. only:: node

    .. literalinclude:: ../../bindings/node/examples/documentation/pipeline.test.ts
        :language: javascript
        :start-after: START bert_setup_tokenizer
        :end-before: END bert_setup_tokenizer
        :dedent: 8
Then we know that BERT preprocesses texts by removing accents and lowercasing. We also use a unicode
normalizer:

.. only:: python

    .. literalinclude:: ../../bindings/python/tests/documentation/test_pipeline.py
        :language: python
        :start-after: START bert_setup_normalizer
        :end-before: END bert_setup_normalizer
        :dedent: 8

.. only:: rust

    .. literalinclude:: ../../tokenizers/tests/documentation.rs
        :language: rust
        :start-after: START bert_setup_normalizer
        :end-before: END bert_setup_normalizer
        :dedent: 4

.. only:: node

    .. literalinclude:: ../../bindings/node/examples/documentation/pipeline.test.ts
        :language: javascript
        :start-after: START bert_setup_normalizer
        :end-before: END bert_setup_normalizer
        :dedent: 8
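
As a quick check of what this normalizer does, here is a small Python sketch (illustrative only; it
uses the ``normalize_str`` helper, which recent versions of the library provide on normalizers):

.. code-block:: python

    from tokenizers import normalizers
    from tokenizers.normalizers import Lowercase, NFD, StripAccents

    # Decompose unicode, lowercase, then drop the combining accent marks.
    normalizer = normalizers.Sequence([NFD(), Lowercase(), StripAccents()])
    print(normalizer.normalize_str("Héllò hôw are ü?"))
    # "hello how are u?"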
The pre-tokenizer is just splitting on whitespace and punctuation:

.. only:: python

    .. literalinclude:: ../../bindings/python/tests/documentation/test_pipeline.py
        :language: python
        :start-after: START bert_setup_pre_tokenizer
        :end-before: END bert_setup_pre_tokenizer
        :dedent: 8

.. only:: rust

    .. literalinclude:: ../../tokenizers/tests/documentation.rs
        :language: rust
        :start-after: START bert_setup_pre_tokenizer
        :end-before: END bert_setup_pre_tokenizer
        :dedent: 4

.. only:: node

    .. literalinclude:: ../../bindings/node/examples/documentation/pipeline.test.ts
        :language: javascript
        :start-after: START bert_setup_pre_tokenizer
        :end-before: END bert_setup_pre_tokenizer
        :dedent: 8
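
To see what that splitting produces, here is an illustrative Python sketch (it relies on the
``pre_tokenize_str`` helper available in recent versions of the library):

.. code-block:: python

    from tokenizers.pre_tokenizers import Whitespace

    # Whitespace splits into runs of word characters or runs of punctuation.
    print(Whitespace().pre_tokenize_str("Hello, y'all!"))
    # [('Hello', (0, 5)), (',', (5, 6)), ('y', (7, 8)), ("'", (8, 9)), ('all', (9, 12)), ('!', (12, 13))]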
And the post-processing uses the template we saw in the previous section:

.. only:: python

    .. literalinclude:: ../../bindings/python/tests/documentation/test_pipeline.py
        :language: python
        :start-after: START bert_setup_processor
        :end-before: END bert_setup_processor
        :dedent: 8

.. only:: rust

    .. literalinclude:: ../../tokenizers/tests/documentation.rs
        :language: rust
        :start-after: START bert_setup_processor
        :end-before: END bert_setup_processor
        :dedent: 4

.. only:: node

    .. literalinclude:: ../../bindings/node/examples/documentation/pipeline.test.ts
        :language: javascript
        :start-after: START bert_setup_processor
        :end-before: END bert_setup_processor
        :dedent: 8
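
The ids given in ``special_tokens`` have to match the ids those tokens end up with in the trained
vocabulary: here 1 and 2, because the trainer used below inserts its special tokens first, in order.
As a hedged sketch, once the tokenizer is trained you can double-check them with ``token_to_id``:

.. code-block:: python

    # After training (see the next snippet), verify the ids used in the template.
    assert bert_tokenizer.token_to_id("[CLS]") == 1
    assert bert_tokenizer.token_to_id("[SEP]") == 2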
We can use this tokenizer and train it on wikitext like in the :doc:`quicktour`:

.. only:: python

    .. literalinclude:: ../../bindings/python/tests/documentation/test_pipeline.py
        :language: python
        :start-after: START bert_train_tokenizer
        :end-before: END bert_train_tokenizer
        :dedent: 8

.. only:: rust

    .. literalinclude:: ../../tokenizers/tests/documentation.rs
        :language: rust
        :start-after: START bert_train_tokenizer
        :end-before: END bert_train_tokenizer
        :dedent: 4

.. only:: node

    .. literalinclude:: ../../bindings/node/examples/documentation/pipeline.test.ts
        :language: javascript
        :start-after: START bert_train_tokenizer
        :end-before: END bert_train_tokenizer
        :dedent: 8
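
Once trained and saved, the whole pipeline can be reloaded and used on its own. A minimal Python
sketch (assuming the ``data/bert-wiki.json`` file produced above exists):

.. code-block:: python

    from tokenizers import Tokenizer

    # Reload the normalizer, pre-tokenizer, model and post-processor from disk.
    bert_tokenizer = Tokenizer.from_file("data/bert-wiki.json")

    output = bert_tokenizer.encode("Welcome to the Tokenizers library.")
    print(output.tokens)    # WordPiece tokens, wrapped in [CLS] ... [SEP] by the post-processor
    print(output.type_ids)  # all 0 for a single sentence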
.. _decoding:

tokenizers/tests/documentation.rs

@@ -354,6 +354,96 @@ fn pipeline() -> tokenizers::Result<()> {
// START pipeline_replace_pre_tokenizer
tokenizer.with_pre_tokenizer(pre_tokenizer);
// END pipeline_replace_pre_tokenizer
// START pipeline_setup_processor
use tokenizers::processors::template::TemplateProcessing;
tokenizer.with_post_processor(
TemplateProcessing::builder()
.try_single("[CLS] $A [SEP]")
.unwrap()
.try_pair("[CLS] $A [SEP] $B:1 [SEP]:1")
.unwrap()
.special_tokens(vec![("[CLS]", 1), ("[SEP]", 2)])
.build()
.unwrap(),
);
// END pipeline_setup_processor
Ok(())
}
#[test]
#[ignore]
fn pipeline_bert() -> tokenizers::Result<()> {
// START bert_setup_tokenizer
use tokenizers::models::wordpiece::WordPiece;
use tokenizers::Tokenizer;
let mut bert_tokenizer = Tokenizer::new(WordPiece::default());
// END bert_setup_tokenizer
// START bert_setup_normalizer
use tokenizers::normalizers::utils::Sequence as NormalizerSequence;
use tokenizers::normalizers::{strip::StripAccents, unicode::NFD, utils::Lowercase};
bert_tokenizer.with_normalizer(NormalizerSequence::new(vec![
NFD.into(),
Lowercase.into(),
StripAccents.into(),
]));
// END bert_setup_normalizer
// START bert_setup_pre_tokenizer
use tokenizers::pre_tokenizers::whitespace::Whitespace;
bert_tokenizer.with_pre_tokenizer(Whitespace::default());
// END bert_setup_pre_tokenizer
// START bert_setup_processor
use tokenizers::processors::template::TemplateProcessing;
bert_tokenizer.with_post_processor(
TemplateProcessing::builder()
.try_single("[CLS] $A [SEP]")
.unwrap()
.try_pair("[CLS] $A [SEP] $B:1 [SEP]:1")
.unwrap()
.special_tokens(vec![("[CLS]", 1), ("[SEP]", 2)])
.build()
.unwrap(),
);
// END bert_setup_processor
// START bert_train_tokenizer
use std::path::Path;
use tokenizers::models::{wordpiece::WordPieceTrainer, TrainerWrapper};
use tokenizers::{AddedToken, Model};
let trainer: TrainerWrapper = WordPieceTrainer::builder()
.vocab_size(30_522)
.special_tokens(vec![
AddedToken::from("[UNK]", true),
AddedToken::from("[CLS]", true),
AddedToken::from("[SEP]", true),
AddedToken::from("[PAD]", true),
AddedToken::from("[MASK]", true),
])
.build()
.into();
let files = ["test", "train", "valid"]
.iter()
.map(|split| format!("data/wikitext-103-raw/wiki.{}.raw", split))
.collect::<Vec<_>>();
bert_tokenizer.train_and_replace(&trainer, files)?;
let model_files = bert_tokenizer
.get_model()
.save(&Path::new("data"), Some("bert-wiki"))?;
bert_tokenizer.with_model(
WordPiece::from_file(model_files[0].to_str().unwrap())
.unk_token("[UNK]".to_string())
.build()
.unwrap(),
);
bert_tokenizer.save("data/bert-wiki.json", false)?;
// END bert_train_tokenizer
Ok(())
}