Doc - Update Decoder part of the Pipeline page
bindings/node/examples/documentation/pipeline.test.ts

@@ -66,37 +66,20 @@ describe("pipelineExample", () => {
             [["[CLS]", 1], ["[SEP]", 2]]
         ));
         // END setup_processor
+        // START test_decoding
+        let output = tokenizer.encode("Hello, y'all! How are you 😁 ?");
+        console.log(output.getIds());
+        // [1, 27253, 16, 93, 11, 5097, 5, 7961, 5112, 6218, 0, 35, 2]
+
+        tokenizer.decode([1, 27253, 16, 93, 11, 5097, 5, 7961, 5112, 6218, 0, 35, 2]);
+        // "Hello , y ' all ! How are you ?"
+        // END test_decoding
     });

-    it("shows a full bert example", async () => {
-        // START bert_setup_tokenizer
-        let { Tokenizer } = require("tokenizers/bindings/tokenizer");
+    var { Tokenizer } = require("tokenizers/bindings/tokenizer");
+    const slow_bert_training = async (bertTokenizer: typeof Tokenizer) => {
         let { WordPiece } = require("tokenizers/bindings/models");

-        let bert_tokenizer = Tokenizer(WordPiece.empty());
-        // END bert_setup_tokenizer
-        // START bert_setup_normalizer
-        let { sequenceNormalizer, lowercaseNormalizer, nfdNormalizer, stripAccentsNormalizer }
-            = require("tokenizers/bindings/normalizers");
-
-        bert_tokenizer.setNormalizer(sequenceNormalizer([
-            nfdNormalizer(), lowercaseNormalizer(), stripAccentsNormalizer()
-        ]))
-        // END bert_setup_normalizer
-        // START bert_setup_pre_tokenizer
-        let { whitespacePreTokenizer } = require("tokenizers/bindings/pre_tokenizers");
-
-        bert_tokenizer.setPreTokenizer = whitespacePreTokenizer();
-        // END bert_setup_pre_tokenizer
-        // START bert_setup_processor
-        let { templateProcessing } = require("tokenizers/bindings/processors");
-
-        bert_tokenizer.setPostProcessor(templateProcessing(
-            "[CLS] $A [SEP]",
-            "[CLS] $A [SEP] $B:1 [SEP]:1",
-            [["[CLS]", 1], ["[SEP]", 2]]
-        ));
-        // END bert_setup_processor
         // START bert_train_tokenizer
         let { wordPieceTrainer } = require("tokenizers/bindings/trainers");
         let { promisify } = require("utils");
@@ -106,15 +89,61 @@ describe("pipelineExample", () => {
             specialTokens: ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]
         });
         let files = ["test", "train", "valid"].map(split => `data/wikitext-103-raw/wiki.${split}.raw`);
-        bert_tokenizer.train(trainer, files);
+        bertTokenizer.train(trainer, files);

-        let model_files = bert_tokenizer.getModel.save("data", "bert-wiki");
+        let modelFiles = bertTokenizer.getModel.save("data", "bert-wiki");
         let fromFile = promisify(WordPiece.fromFile);
-        bert_tokenizer.setModel(await fromFile(model_files[0], {
+        bertTokenizer.setModel(await fromFile(modelFiles[0], {
             unkToken: "[UNK]"
         }));

-        bert_tokenizer.save("data/bert-wiki.json")
+        bertTokenizer.save("data/bert-wiki.json")
         // END bert_train_tokenizer
+    };
+    console.log(slow_bert_training); // disable unused warning
+
+    it("shows a full bert example", async () => {
+        // START bert_setup_tokenizer
+        let { Tokenizer } = require("tokenizers/bindings/tokenizer");
+        let { WordPiece } = require("tokenizers/bindings/models");
+
+        let bertTokenizer = Tokenizer(WordPiece.empty());
+        // END bert_setup_tokenizer
+        // START bert_setup_normalizer
+        let { sequenceNormalizer, lowercaseNormalizer, nfdNormalizer, stripAccentsNormalizer }
+            = require("tokenizers/bindings/normalizers");
+
+        bertTokenizer.setNormalizer(sequenceNormalizer([
+            nfdNormalizer(), lowercaseNormalizer(), stripAccentsNormalizer()
+        ]))
+        // END bert_setup_normalizer
+        // START bert_setup_pre_tokenizer
+        let { whitespacePreTokenizer } = require("tokenizers/bindings/pre_tokenizers");
+
+        bertTokenizer.setPreTokenizer = whitespacePreTokenizer();
+        // END bert_setup_pre_tokenizer
+        // START bert_setup_processor
+        let { templateProcessing } = require("tokenizers/bindings/processors");
+
+        bertTokenizer.setPostProcessor(templateProcessing(
+            "[CLS] $A [SEP]",
+            "[CLS] $A [SEP] $B:1 [SEP]:1",
+            [["[CLS]", 1], ["[SEP]", 2]]
+        ));
+        // END bert_setup_processor
+        // START bert_test_decoding
+        let output = bertTokenizer.encode("Welcome to the 🤗 Tokenizers library.");
+        console.log(output.getTokens());
+        // ["[CLS]", "welcome", "to", "the", "[UNK]", "tok", "##eni", "##zer", "##s", "library", ".", "[SEP]"]
+
+        bertTokenizer.decode(output.getIds());
+        // "welcome to the tok ##eni ##zer ##s library ."
+        // END bert_test_decoding
+        // START bert_proper_decoding
+        let { wordPieceDecoder } = require("tokenizers/bindings/decoders");
+        bertTokenizer.setDecoder(wordPieceDecoder());
+        bertTokenizer.decode(output.ids);
+        // "welcome to the tokenizers library."
+        // END bert_proper_decoding
     });
 });
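The `test_decoding` block added above is a plain round trip: encode a sentence, print the IDs, then hand those IDs back to `decode`. For reference, a minimal Python sketch of the same flow, assuming a tokenizer has already been trained and saved (the `data/bert-wiki.json` path is only an example, taken from the training snippet in this diff):

.. code-block:: python

    from tokenizers import Tokenizer

    # Load a previously trained and saved tokenizer (example path).
    tokenizer = Tokenizer.from_file("data/bert-wiki.json")

    output = tokenizer.encode("Hello, y'all! How are you 😁 ?")
    print(output.ids)     # the IDs a model would consume and predict
    print(output.tokens)  # the tokens those IDs correspond to

    # decode() maps IDs back to text: special tokens are dropped and the
    # remaining tokens are joined with spaces by default.
    print(tokenizer.decode(output.ids))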
bindings/python/tests/documentation/test_pipeline.py

@@ -76,14 +76,27 @@ class TestPipeline:
         # START setup_processor
         from tokenizers.processors import TemplateProcessing

-        tokenizer.post_processor = TemplateProcessing
+        tokenizer.post_processor = TemplateProcessing(
             single="[CLS] $A [SEP]",
             pair="[CLS] $A [SEP] $B:1 [SEP]:1",
             special_tokens=[("[CLS]", 1), ("[SEP]", 2)],
         )
         # END setup_processor
+        # START test_decoding
+        output = tokenizer.encode("Hello, y'all! How are you 😁 ?")
+        print(output.ids)
+        # [1, 27253, 16, 93, 11, 5097, 5, 7961, 5112, 6218, 0, 35, 2]

-    def test_bert_example(self):
+        tokenizer.decode([1, 27253, 16, 93, 11, 5097, 5, 7961, 5112, 6218, 0, 35, 2])
+        # "Hello , y ' all ! How are you ?"
+        # END test_decoding
+        assert output.ids == [1, 27253, 16, 93, 11, 5097, 5, 7961, 5112, 6218, 0, 35, 2]
+        assert (
+            tokenizer.decode([1, 27253, 16, 93, 11, 5097, 5, 7961, 5112, 6218, 0, 35, 2])
+            == "Hello , y ' all ! How are you ?"
+        )
+
+    def bert_example(self):
         # START bert_setup_tokenizer
         from tokenizers import Tokenizer
         from tokenizers.models import WordPiece
@@ -94,9 +107,7 @@ class TestPipeline:
         from tokenizers import normalizers
         from tokenizers.normalizers import Lowercase, NFD, StripAccents

-        bert_tokenizer.normalizer = normalizers.Sequence([
-            NFD(), Lowercase(), StripAccents()
-        ])
+        bert_tokenizer.normalizer = normalizers.Sequence([NFD(), Lowercase(), StripAccents()])
         # END bert_setup_normalizer
         # START bert_setup_pre_tokenizer
         from tokenizers.pre_tokenizers import Whitespace
@@ -112,7 +123,7 @@ class TestPipeline:
             special_tokens=[
                 ("[CLS]", 1),
                 ("[SEP]", 2),
-            ]
+            ],
         )
         # END bert_setup_processor
         # START bert_train_tokenizer
@@ -129,3 +140,16 @@ class TestPipeline:

         bert_tokenizer.save("data/bert-wiki.json")
         # END bert_train_tokenizer
+        # START bert_test_decoding
+        output = bert_tokenizer.encode("Welcome to the 🤗 Tokenizers library.")
+        print(output.tokens)
+        # ["[CLS]", "welcome", "to", "the", "[UNK]", "tok", "##eni", "##zer", "##s", "library", ".", "[SEP]"]
+
+        bert_tokenizer.decoder(output.ids)
+        # "welcome to the tok ##eni ##zer ##s library ."
+        # END bert_test_decoding
+        # START bert_proper_decoding
+        bert_tokenizer.decoder = tokenizers.decoders.WordPiece()
+        bert_tokenizer.decode(output.ids)
+        # "welcome to the tokenizers library."
+        # END bert_proper_decoding
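The new `bert_test_decoding` / `bert_proper_decoding` sections capture the point of the commit: with the default behaviour, WordPiece continuation pieces (`##...`) stay visible in the decoded string, and attaching a WordPiece decoder merges them back into words. A short sketch of that contrast, assuming the `data/bert-wiki.json` file saved by `bert_train_tokenizer`:

.. code-block:: python

    from tokenizers import Tokenizer, decoders

    bert_tokenizer = Tokenizer.from_file("data/bert-wiki.json")
    output = bert_tokenizer.encode("Welcome to the 🤗 Tokenizers library.")

    # Default decoding only joins tokens with spaces, so the "##" markers
    # produced by the WordPiece model remain visible:
    print(bert_tokenizer.decode(output.ids))
    # e.g. "welcome to the tok ##eni ##zer ##s library ."

    # A WordPiece decoder folds the "##" pieces back into whole words:
    bert_tokenizer.decoder = decoders.WordPiece()
    print(bert_tokenizer.decode(output.ids))
    # e.g. "welcome to the tokenizers library."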
docs/source/entities.inc

@@ -18,6 +18,10 @@
         :meth:`~tokenizers.Tokenizer.encode`
     Tokenizer.encode_batch
         :meth:`~tokenizers.Tokenizer.encode_batch`
+    Tokenizer.decode
+        :meth:`~tokenizers.Tokenizer.decode`
+    Tokenizer.decode_batch
+        :meth:`~tokenizers.Tokenizer.decode_batch`
     Tokenizer.token_to_id
         :meth:`~tokenizers.Tokenizer.token_to_id`
     Tokenizer.enable_padding
@@ -42,6 +46,8 @@
         :class:`~tokenizers.models.WordLevel`
     models.WordPiece
         :class:`~tokenizers.models.WordPiece`
+    Decoder
+        :class:`~tokenizers.decoders.Decoder`

 .. entities:: rust

@@ -63,6 +69,10 @@
         :rust:meth:`~tokenizers::tokenizer::Tokenizer::encode`
     Tokenizer.encode_batch
         :rust:meth:`~tokenizers::tokenizer::Tokenizer::encode_batch`
+    Tokenizer.decode
+        :rust:meth:`~tokenizers::tokenizer::Tokenizer::decode`
+    Tokenizer.decode_batch
+        :rust:meth:`~tokenizers::tokenizer::Tokenizer::decode_batch`
     Tokenizer.token_to_id
         :rust:meth:`~tokenizers::tokenizer::Tokenizer::token_to_id`
     Tokenizer.enable_padding
@@ -87,6 +97,8 @@
         :rust:struct:`~tokenizers::models::wordlevel::WordLevel`
     models.WordPiece
         :rust:struct:`~tokenizers::models::wordpiece::WordPiece`
+    Decoder
+        :rust:trait:`~tokenizers::tokenizer::Decoder`

 .. entities:: node

@@ -108,6 +120,10 @@
         :obj:`Tokenizer.encode()`
     Tokenizer.encode_batch
         :obj:`Tokenizer.encodeBatch()`
+    Tokenizer.decode
+        :obj:`Tokenizer.decode()`
+    Tokenizer.decode_batch
+        :obj:`Tokenizer.decodeBatch()`
     Tokenizer.token_to_id
         :obj:`Tokenizer.tokenToId()`
     Tokenizer.enable_padding
@@ -132,3 +148,5 @@
         :obj:`WordLevel`
     models.WordPiece
         :obj:`WordPiece`
+    Decoder
+        :obj:`Decoder`
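These new entries let the page refer to `Tokenizer.decode` and `Tokenizer.decode_batch` by the right name in each binding. In the Python binding the two methods are used like this (a small sketch; the tokenizer file is only an example):

.. code-block:: python

    from tokenizers import Tokenizer

    tokenizer = Tokenizer.from_file("data/bert-wiki.json")  # example path

    # decode() takes the IDs of a single prediction...
    ids = tokenizer.encode("Hello, y'all!").ids
    print(tokenizer.decode(ids))

    # ...while decode_batch() takes a list of ID sequences.
    encodings = tokenizer.encode_batch(["Hello, y'all!", "How are you 😁 ?"])
    print(tokenizer.decode_batch([e.ids for e in encodings]))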
docs/source/pipeline.rst

@@ -447,40 +447,104 @@ We can use this tokenizer and train on it on wikitext like in the :doc:`quicktour`
 Decoding
 ----------------------------------------------------------------------------------------------------

-On top of encoding the input texts, a :class:`~tokenizers.Tokenizer` also has an API for decoding,
+.. entities:: python
+
+    bert_tokenizer
+        :obj:`bert_tokenizer`
+
+.. entities:: rust
+
+    bert_tokenizer
+        :obj:`bert_tokenizer`
+
+.. entities:: node
+
+    bert_tokenizer
+        :obj:`bertTokenizer`
+
+
+On top of encoding the input texts, a :entity:`Tokenizer` also has an API for decoding,
 that is converting IDs generated by your model back to a text. This is done by the methods
-:meth:`~tokenizers.Tokenizer.decode` (for one predicted text) and
-:meth:`~tokenizers.Tokenizer.decode_batch` (for a batch of predictions).
+:entity:`Tokenizer.decode` (for one predicted text) and :entity:`Tokenizer.decode_batch` (for a
+batch of predictions).

 The `decoder` will first convert the IDs back to tokens (using the tokenizer's vocabulary) and
 remove all special tokens, then join those tokens with spaces:

-.. code-block:: python
+.. only:: python

-    output = tokenizer.encode("Hello, y'all! How are you 😁 ?")
-    print(output.ids)
-    # [27194, 16, 93, 11, 5068, 5, 7928, 5083, 6190, 0, 35]
-
-    tokenizer.decode([27194, 16, 93, 11, 5068, 5, 7928, 5083, 6190, 0, 35])
-    # "Hello , y ' all ! How are you ?"
+    .. literalinclude:: ../../bindings/python/tests/documentation/test_pipeline.py
+        :language: python
+        :start-after: START test_decoding
+        :end-before: END test_decoding
+        :dedent: 8
+
+.. only:: rust
+
+    .. literalinclude:: ../../tokenizers/tests/documentation.rs
+        :language: rust
+        :start-after: START pipeline_test_decoding
+        :end-before: END pipeline_test_decoding
+        :dedent: 4
+
+.. only:: node
+
+    .. literalinclude:: ../../bindings/node/examples/documentation/pipeline.test.ts
+        :language: javascript
+        :start-after: START test_decoding
+        :end-before: END test_decoding
+        :dedent: 8

 If you used a model that added special characters to represent subtokens of a given "word" (like
 the :obj:`"##"` in WordPiece) you will need to customize the `decoder` to treat them properly. If we
-take our previous :obj:`bert_tokenizer` for instance the default decoing will give:
+take our previous :entity:`bert_tokenizer` for instance the default decoing will give:

-.. code-block:: python
+.. only:: python

-    output = bert_tokenizer.encode("Welcome to the 🤗 Tokenizers library.")
-    print(output.tokens)
-    # ["[CLS]", "welcome", "to", "the", "[UNK]", "tok", "##eni", "##zer", "##s", "library", ".", "[SEP]"]
-
-    bert_tokenizer.decoder(output.ids)
-    # "welcome to the tok ##eni ##zer ##s library ."
+    .. literalinclude:: ../../bindings/python/tests/documentation/test_pipeline.py
+        :language: python
+        :start-after: START bert_test_decoding
+        :end-before: END bert_test_decoding
+        :dedent: 8
+
+.. only:: rust
+
+    .. literalinclude:: ../../tokenizers/tests/documentation.rs
+        :language: rust
+        :start-after: START bert_test_decoding
+        :end-before: END bert_test_decoding
+        :dedent: 4
+
+.. only:: node
+
+    .. literalinclude:: ../../bindings/node/examples/documentation/pipeline.test.ts
+        :language: javascript
+        :start-after: START bert_test_decoding
+        :end-before: END bert_test_decoding
+        :dedent: 8

 But by changing it to a proper decoder, we get:

-.. code-block:: python
+.. only:: python

-    bert_tokenizer.decoder = tokenizers.decoders.WordPiece()
-    bert_tokenizer.decode(output.ids)
-    # "welcome to the tokenizers library."
+    .. literalinclude:: ../../bindings/python/tests/documentation/test_pipeline.py
+        :language: python
+        :start-after: START bert_proper_decoding
+        :end-before: END bert_proper_decoding
+        :dedent: 8
+
+.. only:: rust
+
+    .. literalinclude:: ../../tokenizers/tests/documentation.rs
+        :language: rust
+        :start-after: START bert_proper_decoding
+        :end-before: END bert_proper_decoding
+        :dedent: 4
+
+.. only:: node
+
+    .. literalinclude:: ../../bindings/node/examples/documentation/pipeline.test.ts
+        :language: javascript
+        :start-after: START bert_proper_decoding
+        :end-before: END bert_proper_decoding
+        :dedent: 8
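The page states that decoding first maps IDs back to tokens, drops the special tokens, then joins what is left with spaces. In the Python binding the `skip_special_tokens` flag of `decode` makes the removal step visible; a sketch, assuming a tokenizer set up with the `[CLS]`/`[SEP]` template processor from earlier on the page (the file path is only an example):

.. code-block:: python

    from tokenizers import Tokenizer

    tokenizer = Tokenizer.from_file("data/bert-wiki.json")  # example path
    output = tokenizer.encode("Hello, y'all! How are you 😁 ?")

    # Special tokens are stripped by default...
    print(tokenizer.decode(output.ids))

    # ...but can be kept to see exactly what the post-processor added.
    print(tokenizer.decode(output.ids, skip_special_tokens=False))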
tokenizers/tests/documentation.rs

@@ -368,6 +368,18 @@ fn pipeline() -> tokenizers::Result<()> {
         .unwrap(),
     );
     // END pipeline_setup_processor
+    // START pipeline_test_decoding
+    let output = tokenizer.encode("Hello, y'all! How are you 😁 ?", true)?;
+    println!("{:?}", output.get_ids());
+    // [1, 27253, 16, 93, 11, 5097, 5, 7961, 5112, 6218, 0, 35, 2]
+
+    let decoded = tokenizer.decode(
+        vec![1, 27253, 16, 93, 11, 5097, 5, 7961, 5112, 6218, 0, 35, 2],
+        true,
+    )?;
+    println!("{}", decoded);
+    // "Hello , y ' all ! How are you ?"
+    // END pipeline_test_decoding

     Ok(())
 }
@@ -444,6 +456,22 @@ fn pipeline_bert() -> tokenizers::Result<()> {

     bert_tokenizer.save("data/bert-wiki.json", false)?;
     // END bert_train_tokenizer
+    // START bert_test_decoding
+    let output = bert_tokenizer.encode("Welcome to the 🤗 Tokenizers library.", true)?;
+    println!("{:?}", output.get_tokens());
+    // ["[CLS]", "welcome", "to", "the", "[UNK]", "tok", "##eni", "##zer", "##s", "library", ".", "[SEP]"]
+
+    let decoded = bert_tokenizer.decode(output.get_ids().to_vec(), true)?;
+    println!("{}", decoded);
+    // "welcome to the tok ##eni ##zer ##s library ."
+    // END bert_test_decoding
+    // START bert_proper_decoding
+    use tokenizers::decoders::wordpiece::WordPiece as WordPieceDecoder;
+    bert_tokenizer.with_decoder(WordPieceDecoder::default());
+    let decoded = bert_tokenizer.decode(output.get_ids().to_vec(), true)?;
+    // "welcome to the tokenizers library."
+    // END bert_proper_decoding
+    println!("{}", decoded);

     Ok(())
 }
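A practical follow-up to the `bert_proper_decoding` snippets: the decoder is part of the serialized tokenizer, so saving again after attaching it means a reloaded tokenizer decodes cleanly too. A Python sketch; the re-save step is an extra illustration rather than part of this commit, and the paths and sentence are taken from the snippets above:

.. code-block:: python

    from tokenizers import Tokenizer, decoders

    bert_tokenizer = Tokenizer.from_file("data/bert-wiki.json")
    bert_tokenizer.decoder = decoders.WordPiece()
    bert_tokenizer.save("data/bert-wiki.json")

    # The reloaded tokenizer keeps the WordPiece decoder.
    reloaded = Tokenizer.from_file("data/bert-wiki.json")
    output = reloaded.encode("Welcome to the 🤗 Tokenizers library.")
    print(reloaded.decode(output.ids))  # "welcome to the tokenizers library."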