Doc - Update Bert example on the Pipeline page
bindings/node/examples/documentation/pipeline.test.ts

@@ -7,7 +7,7 @@ describe("pipelineExample", () => {
         return globRequire("../../lib/" + path);
     }

-    it("", async () => {
+    it("shows pipeline parts", async () => {
         // START reload_tokenizer
         let { Tokenizer } = require("tokenizers/bindings/tokenizer");

@@ -57,5 +57,64 @@ describe("pipelineExample", () => {
         // START replace_pre_tokenizer
         tokenizer.setPreTokenizer(preTokenizer)
         // END replace_pre_tokenizer
+        // START setup_processor
+        let { templateProcessing } = require("tokenizers/bindings/processors");
+
+        tokenizer.setPostProcessor(templateProcessing(
+            "[CLS] $A [SEP]",
+            "[CLS] $A [SEP] $B:1 [SEP]:1",
+            [["[CLS]", 1], ["[SEP]", 2]]
+        ));
+        // END setup_processor
+    });
+
+    it("shows a full bert example", async () => {
+        // START bert_setup_tokenizer
+        let { Tokenizer } = require("tokenizers/bindings/tokenizer");
+        let { WordPiece } = require("tokenizers/bindings/models");
+
+        let bert_tokenizer = Tokenizer(WordPiece.empty());
+        // END bert_setup_tokenizer
+        // START bert_setup_normalizer
+        let { sequenceNormalizer, lowercaseNormalizer, nfdNormalizer, stripAccentsNormalizer }
+            = require("tokenizers/bindings/normalizers");
+
+        bert_tokenizer.setNormalizer(sequenceNormalizer([
+            nfdNormalizer(), lowercaseNormalizer(), stripAccentsNormalizer()
+        ]))
+        // END bert_setup_normalizer
+        // START bert_setup_pre_tokenizer
+        let { whitespacePreTokenizer } = require("tokenizers/bindings/pre_tokenizers");
+
+        bert_tokenizer.setPreTokenizer(whitespacePreTokenizer());
+        // END bert_setup_pre_tokenizer
+        // START bert_setup_processor
+        let { templateProcessing } = require("tokenizers/bindings/processors");
+
+        bert_tokenizer.setPostProcessor(templateProcessing(
+            "[CLS] $A [SEP]",
+            "[CLS] $A [SEP] $B:1 [SEP]:1",
+            [["[CLS]", 1], ["[SEP]", 2]]
+        ));
+        // END bert_setup_processor
+        // START bert_train_tokenizer
+        let { wordPieceTrainer } = require("tokenizers/bindings/trainers");
+        let { promisify } = require("util");
+
+        let trainer = wordPieceTrainer({
+            vocabSize: 30522,
+            specialTokens: ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]
+        });
+        let files = ["test", "train", "valid"].map(split => `data/wikitext-103-raw/wiki.${split}.raw`);
+        bert_tokenizer.train(trainer, files);
+
+        let model_files = bert_tokenizer.getModel().save("data", "bert-wiki");
+        let fromFile = promisify(WordPiece.fromFile);
+        bert_tokenizer.setModel(await fromFile(model_files[0], {
+            unkToken: "[UNK]"
+        }));
+
+        bert_tokenizer.save("data/bert-wiki.json")
+        // END bert_train_tokenizer
     });
 });
bindings/python/tests/documentation/test_pipeline.py

@@ -73,3 +73,59 @@ class TestPipeline:
         # START replace_pre_tokenizer
         tokenizer.pre_tokenizer = pre_tokenizer
         # END replace_pre_tokenizer
+        # START setup_processor
+        from tokenizers.processors import TemplateProcessing
+
+        tokenizer.post_processor = TemplateProcessing(
+            single="[CLS] $A [SEP]",
+            pair="[CLS] $A [SEP] $B:1 [SEP]:1",
+            special_tokens=[("[CLS]", 1), ("[SEP]", 2)],
+        )
+        # END setup_processor
+
+    def test_bert_example(self):
+        # START bert_setup_tokenizer
+        from tokenizers import Tokenizer
+        from tokenizers.models import WordPiece
+
+        bert_tokenizer = Tokenizer(WordPiece())
+        # END bert_setup_tokenizer
+        # START bert_setup_normalizer
+        from tokenizers import normalizers
+        from tokenizers.normalizers import Lowercase, NFD, StripAccents
+
+        bert_tokenizer.normalizer = normalizers.Sequence([
+            NFD(), Lowercase(), StripAccents()
+        ])
+        # END bert_setup_normalizer
+        # START bert_setup_pre_tokenizer
+        from tokenizers.pre_tokenizers import Whitespace
+
+        bert_tokenizer.pre_tokenizer = Whitespace()
+        # END bert_setup_pre_tokenizer
+        # START bert_setup_processor
+        from tokenizers.processors import TemplateProcessing
+
+        bert_tokenizer.post_processor = TemplateProcessing(
+            single="[CLS] $A [SEP]",
+            pair="[CLS] $A [SEP] $B:1 [SEP]:1",
+            special_tokens=[
+                ("[CLS]", 1),
+                ("[SEP]", 2),
+            ]
+        )
+        # END bert_setup_processor
+        # START bert_train_tokenizer
+        from tokenizers.trainers import WordPieceTrainer
+
+        trainer = WordPieceTrainer(
+            vocab_size=30522, special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]
+        )
+        files = [f"data/wikitext-103-raw/wiki.{split}.raw" for split in ["test", "train", "valid"]]
+        bert_tokenizer.train(trainer, files)
+
+        model_files = bert_tokenizer.model.save("data", "bert-wiki")
+        bert_tokenizer.model = WordPiece(*model_files, unk_token="[UNK]")
+
+        bert_tokenizer.save("data/bert-wiki.json")
+        # END bert_train_tokenizer
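
A minimal sketch of how the tokenizer saved by the test above could be reloaded and used, assuming the training step has actually produced data/bert-wiki.json (the example sentences and the expected output described in the comments are illustrative, not taken from the commit):

    from tokenizers import Tokenizer

    # Reload the tokenizer serialized by the snippet above (assumes it was trained and saved).
    bert_tokenizer = Tokenizer.from_file("data/bert-wiki.json")

    # Encode a sentence pair; the TemplateProcessing post-processor inserts the special tokens.
    output = bert_tokenizer.encode("Welcome to the library.", "This is the second sentence.")
    print(output.tokens)    # starts with "[CLS]", with a "[SEP]" closing each sentence
    print(output.type_ids)  # 0 for the first sentence and its [SEP], 1 for the second
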
docs/source/pipeline.rst

@@ -270,22 +270,36 @@ Post-Processing
 ----------------------------------------------------------------------------------------------------

 Post-processing is the last step of the tokenization pipeline, to perform any additional
-transformation to the :class:`~tokenizers.Encoding` before it's returned, like adding potential
+transformation to the :entity:`Encoding` before it's returned, like adding potential
 special tokens.

-As we saw in the quick tour, we can customize the post processor of a :class:`~tokenizers.Tokenizer`
+As we saw in the quick tour, we can customize the post processor of a :entity:`Tokenizer`
 by setting the corresponding attribute. For instance, here is how we can post-process to make the
 inputs suitable for the BERT model:

-.. code-block:: python
+.. only:: python

-    from tokenizers.processors import TemplateProcessing
+    .. literalinclude:: ../../bindings/python/tests/documentation/test_pipeline.py
+        :language: python
+        :start-after: START setup_processor
+        :end-before: END setup_processor
+        :dedent: 8

-    tokenizer.post_processor = TemplateProcessing(
-        single="[CLS] $A [SEP]",
-        pair="[CLS] $A [SEP] $B:1 [SEP]:1",
-        special_tokens=[("[CLS]", 1), ("[SEP]", 2)],
-    )
+.. only:: rust
+
+    .. literalinclude:: ../../tokenizers/tests/documentation.rs
+        :language: rust
+        :start-after: START pipeline_setup_processor
+        :end-before: END pipeline_setup_processor
+        :dedent: 4
+
+.. only:: node
+
+    .. literalinclude:: ../../bindings/node/examples/documentation/pipeline.test.ts
+        :language: javascript
+        :start-after: START setup_processor
+        :end-before: END setup_processor
+        :dedent: 8

 Note that, contrary to the pre-tokenizer or the normalizer, you don't need to retrain a tokenizer
 after changing its post-processor.
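
The IDs given for the special tokens have to match the tokenizer's vocabulary. A minimal sketch of looking them up instead of hard-coding them, assuming an existing tokenizer object whose vocabulary already contains "[CLS]" and "[SEP]":

    from tokenizers.processors import TemplateProcessing

    # Look the IDs up so the template stays in sync with the tokenizer's actual vocabulary.
    tokenizer.post_processor = TemplateProcessing(
        single="[CLS] $A [SEP]",
        pair="[CLS] $A [SEP] $B:1 [SEP]:1",
        special_tokens=[
            ("[CLS]", tokenizer.token_to_id("[CLS]")),
            ("[SEP]", tokenizer.token_to_id("[SEP]")),
        ],
    )
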
@@ -296,66 +310,136 @@ All together: a BERT tokenizer from scratch
 ----------------------------------------------------------------------------------------------------

 Let's put all those pieces together to build a BERT tokenizer. First, BERT relies on WordPiece, so
-we instantiate a new :class:`~tokenizers.Tokenizer` with this model:
+we instantiate a new :entity:`Tokenizer` with this model:

-.. code-block:: python
+.. only:: python

-    from tokenizers import Tokenizer
-    from tokenizers.models import WordPiece
+    .. literalinclude:: ../../bindings/python/tests/documentation/test_pipeline.py
+        :language: python
+        :start-after: START bert_setup_tokenizer
+        :end-before: END bert_setup_tokenizer
+        :dedent: 8

-    bert_tokenizer = Tokenizer(WordPiece())
+.. only:: rust
+
+    .. literalinclude:: ../../tokenizers/tests/documentation.rs
+        :language: rust
+        :start-after: START bert_setup_tokenizer
+        :end-before: END bert_setup_tokenizer
+        :dedent: 4
+
+.. only:: node
+
+    .. literalinclude:: ../../bindings/node/examples/documentation/pipeline.test.ts
+        :language: javascript
+        :start-after: START bert_setup_tokenizer
+        :end-before: END bert_setup_tokenizer
+        :dedent: 8

 Then we know that BERT preprocesses texts by removing accents and lowercasing. We also use a unicode
 normalizer:

-.. code-block:: python
+.. only:: python

-    import tokenizers
-    from tokenizers.normalizers import Lowercase, NFD, StripAccents
+    .. literalinclude:: ../../bindings/python/tests/documentation/test_pipeline.py
+        :language: python
+        :start-after: START bert_setup_normalizer
+        :end-before: END bert_setup_normalizer
+        :dedent: 8

-    bert_tokenizer.normalizer = tokenizers.normalizers.Sequence([
-        NFD(), Lowercase(), StripAccents()
-    ])
+.. only:: rust
+
+    .. literalinclude:: ../../tokenizers/tests/documentation.rs
+        :language: rust
+        :start-after: START bert_setup_normalizer
+        :end-before: END bert_setup_normalizer
+        :dedent: 4
+
+.. only:: node
+
+    .. literalinclude:: ../../bindings/node/examples/documentation/pipeline.test.ts
+        :language: javascript
+        :start-after: START bert_setup_normalizer
+        :end-before: END bert_setup_normalizer
+        :dedent: 8

 The pre-tokenizer is just splitting on whitespace and punctuation:

-.. code-block:: python
+.. only:: python

-    from tokenizers.pre_tokenizers import Whitespace
+    .. literalinclude:: ../../bindings/python/tests/documentation/test_pipeline.py
+        :language: python
+        :start-after: START bert_setup_pre_tokenizer
+        :end-before: END bert_setup_pre_tokenizer
+        :dedent: 8

-    bert_tokenizer.pre_tokenizer = Whitespace()
+.. only:: rust
+
+    .. literalinclude:: ../../tokenizers/tests/documentation.rs
+        :language: rust
+        :start-after: START bert_setup_pre_tokenizer
+        :end-before: END bert_setup_pre_tokenizer
+        :dedent: 4
+
+.. only:: node
+
+    .. literalinclude:: ../../bindings/node/examples/documentation/pipeline.test.ts
+        :language: javascript
+        :start-after: START bert_setup_pre_tokenizer
+        :end-before: END bert_setup_pre_tokenizer
+        :dedent: 8

 And the post-processing uses the template we saw in the previous section:

-.. code-block:: python
+.. only:: python

-    from tokenizers.processors import TemplateProcessing
+    .. literalinclude:: ../../bindings/python/tests/documentation/test_pipeline.py
+        :language: python
+        :start-after: START bert_setup_processor
+        :end-before: END bert_setup_processor
+        :dedent: 8

-    bert_tokenizer.post_processor = TemplateProcessing(
-        single="[CLS] $A [SEP]",
-        pair="[CLS] $A [SEP] $B:1 [SEP]:1",
-        special_tokens=[
-            ("[CLS]", bert_tokenizer.token_to_id("[CLS]")),
-            ("[SEP]", bert_tokenizer.token_to_id("[SEP]"))
-        ],
-    )
+.. only:: rust
+
+    .. literalinclude:: ../../tokenizers/tests/documentation.rs
+        :language: rust
+        :start-after: START bert_setup_processor
+        :end-before: END bert_setup_processor
+        :dedent: 4
+
+.. only:: node
+
+    .. literalinclude:: ../../bindings/node/examples/documentation/pipeline.test.ts
+        :language: javascript
+        :start-after: START bert_setup_processor
+        :end-before: END bert_setup_processor
+        :dedent: 8

 We can use this tokenizer and train it on wikitext like in the :doc:`quicktour`:

-.. code-block:: python
+.. only:: python

-    from tokenizers.trainers import WordPieceTrainer
+    .. literalinclude:: ../../bindings/python/tests/documentation/test_pipeline.py
+        :language: python
+        :start-after: START bert_train_tokenizer
+        :end-before: END bert_train_tokenizer
+        :dedent: 8

-    trainer = WordPieceTrainer(
-        vocab_size=30522, special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]
-    )
-    files = [f"wikitext-103-raw/wiki.{split}.raw" for split in ["test", "train", "valid"]]
-    bert_tokenizer.train(trainer, files)
-
-    model_files = bert_tokenizer.model.save("pretrained", "bert-wiki")
-    bert_tokenizer.model = WordPiece(*model_files, unk_token="[UNK]")
-
-    bert_tokenizer.save("pretrained/bert-wiki.json")
+.. only:: rust
+
+    .. literalinclude:: ../../tokenizers/tests/documentation.rs
+        :language: rust
+        :start-after: START bert_train_tokenizer
+        :end-before: END bert_train_tokenizer
+        :dedent: 4
+
+.. only:: node
+
+    .. literalinclude:: ../../bindings/node/examples/documentation/pipeline.test.ts
+        :language: javascript
+        :start-after: START bert_train_tokenizer
+        :end-before: END bert_train_tokenizer
+        :dedent: 8


 .. _decoding:
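
As a quick sanity check on the assembled pipeline, a small sketch, assuming the bert_tokenizer built and trained above and the data/ paths used in these snippets: save the tokenizer, reload it, and confirm both produce the same encoding.

    from tokenizers import Tokenizer

    # Serialize the assembled tokenizer and load it back.
    bert_tokenizer.save("data/bert-wiki.json")
    reloaded = Tokenizer.from_file("data/bert-wiki.json")

    # The normalizer, pre-tokenizer, model and post-processor are all part of the
    # saved file, so both tokenizers should produce identical encodings.
    original = bert_tokenizer.encode("Welcome to the Tokenizers library.")
    restored = reloaded.encode("Welcome to the Tokenizers library.")
    assert original.tokens == restored.tokens
    assert original.ids == restored.ids
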
tokenizers/tests/documentation.rs

@@ -354,6 +354,96 @@ fn pipeline() -> tokenizers::Result<()> {
     // START pipeline_replace_pre_tokenizer
     tokenizer.with_pre_tokenizer(pre_tokenizer);
     // END pipeline_replace_pre_tokenizer
+    // START pipeline_setup_processor
+    use tokenizers::processors::template::TemplateProcessing;
+
+    tokenizer.with_post_processor(
+        TemplateProcessing::builder()
+            .try_single("[CLS] $A [SEP]")
+            .unwrap()
+            .try_pair("[CLS] $A [SEP] $B:1 [SEP]:1")
+            .unwrap()
+            .special_tokens(vec![("[CLS]", 1), ("[SEP]", 2)])
+            .build()
+            .unwrap(),
+    );
+    // END pipeline_setup_processor
+
+    Ok(())
+}
+
+#[test]
+#[ignore]
+fn pipeline_bert() -> tokenizers::Result<()> {
+    // START bert_setup_tokenizer
+    use tokenizers::models::wordpiece::WordPiece;
+    use tokenizers::Tokenizer;
+
+    let mut bert_tokenizer = Tokenizer::new(WordPiece::default());
+    // END bert_setup_tokenizer
+    // START bert_setup_normalizer
+    use tokenizers::normalizers::utils::Sequence as NormalizerSequence;
+    use tokenizers::normalizers::{strip::StripAccents, unicode::NFD, utils::Lowercase};
+
+    bert_tokenizer.with_normalizer(NormalizerSequence::new(vec![
+        NFD.into(),
+        Lowercase.into(),
+        StripAccents.into(),
+    ]));
+    // END bert_setup_normalizer
+    // START bert_setup_pre_tokenizer
+    use tokenizers::pre_tokenizers::whitespace::Whitespace;
+
+    bert_tokenizer.with_pre_tokenizer(Whitespace::default());
+    // END bert_setup_pre_tokenizer
+    // START bert_setup_processor
+    use tokenizers::processors::template::TemplateProcessing;
+
+    bert_tokenizer.with_post_processor(
+        TemplateProcessing::builder()
+            .try_single("[CLS] $A [SEP]")
+            .unwrap()
+            .try_pair("[CLS] $A [SEP] $B:1 [SEP]:1")
+            .unwrap()
+            .special_tokens(vec![("[CLS]", 1), ("[SEP]", 2)])
+            .build()
+            .unwrap(),
+    );
+    // END bert_setup_processor
+    // START bert_train_tokenizer
+    use std::path::Path;
+    use tokenizers::models::{wordpiece::WordPieceTrainer, TrainerWrapper};
+    use tokenizers::Model;
+
+    let trainer: TrainerWrapper = WordPieceTrainer::builder()
+        .vocab_size(30_522)
+        .special_tokens(vec![
+            AddedToken::from("[UNK]", true),
+            AddedToken::from("[CLS]", true),
+            AddedToken::from("[SEP]", true),
+            AddedToken::from("[PAD]", true),
+            AddedToken::from("[MASK]", true),
+        ])
+        .build()
+        .into();
+    let files = ["test", "train", "valid"]
+        .iter()
+        .map(|split| format!("data/wikitext-103-raw/wiki.{}.raw", split))
+        .collect::<Vec<_>>();
+    bert_tokenizer.train_and_replace(&trainer, files)?;
+
+    let model_files = bert_tokenizer
+        .get_model()
+        .save(&Path::new("data"), Some("bert-wiki"))?;
+    bert_tokenizer.with_model(
+        WordPiece::from_file(model_files[0].to_str().unwrap())
+            .unk_token("[UNK]".to_string())
+            .build()
+            .unwrap(),
+    );
+
+    bert_tokenizer.save("data/bert-wiki.json", false)?;
+    // END bert_train_tokenizer

     Ok(())
 }