diff --git a/bindings/node/examples/documentation/pipeline.test.ts b/bindings/node/examples/documentation/pipeline.test.ts
index b17436f7..1cac6047 100644
--- a/bindings/node/examples/documentation/pipeline.test.ts
+++ b/bindings/node/examples/documentation/pipeline.test.ts
@@ -66,37 +66,20 @@ describe("pipelineExample", () => {
             [["[CLS]", 1], ["[SEP]", 2]]
         ));
         // END setup_processor
+        // START test_decoding
+        let output = tokenizer.encode("Hello, y'all! How are you 😁 ?");
+        console.log(output.getIds());
+        // [1, 27253, 16, 93, 11, 5097, 5, 7961, 5112, 6218, 0, 35, 2]
+
+        tokenizer.decode([1, 27253, 16, 93, 11, 5097, 5, 7961, 5112, 6218, 0, 35, 2]);
+        // "Hello , y ' all ! How are you ?"
+        // END test_decoding
     });
 
-    it("shows a full bert example", async () => {
-        // START bert_setup_tokenizer
-        let { Tokenizer } = require("tokenizers/bindings/tokenizer");
+    var { Tokenizer } = require("tokenizers/bindings/tokenizer");
+    const slow_bert_training = async (bertTokenizer: typeof Tokenizer) => {
         let { WordPiece } = require("tokenizers/bindings/models");
-        let bert_tokenizer = Tokenizer(WordPiece.empty());
-        // END bert_setup_tokenizer
-        // START bert_setup_normalizer
-        let { sequenceNormalizer, lowercaseNormalizer, nfdNormalizer, stripAccentsNormalizer }
-            = require("tokenizers/bindings/normalizers");
-
-        bert_tokenizer.setNormalizer(sequenceNormalizer([
-            nfdNormalizer(), lowercaseNormalizer(), stripAccentsNormalizer()
-        ]))
-        // END bert_setup_normalizer
-        // START bert_setup_pre_tokenizer
-        let { whitespacePreTokenizer } = require("tokenizers/bindings/pre_tokenizers");
-
-        bert_tokenizer.setPreTokenizer = whitespacePreTokenizer();
-        // END bert_setup_pre_tokenizer
-        // START bert_setup_processor
-        let { templateProcessing } = require("tokenizers/bindings/processors");
-
-        bert_tokenizer.setPostProcessor(templateProcessing(
-            "[CLS] $A [SEP]",
-            "[CLS] $A [SEP] $B:1 [SEP]:1",
-            [["[CLS]", 1], ["[SEP]", 2]]
-        ));
-        // END bert_setup_processor
 
         // START bert_train_tokenizer
         let { wordPieceTrainer } = require("tokenizers/bindings/trainers");
         let { promisify } = require("utils");
@@ -106,15 +89,61 @@ describe("pipelineExample", () => {
             specialTokens: ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]
         });
         let files = ["test", "train", "valid"].map(split => `data/wikitext-103-raw/wiki.${split}.raw`);
-        bert_tokenizer.train(trainer, files);
+        bertTokenizer.train(trainer, files);
 
-        let model_files = bert_tokenizer.getModel.save("data", "bert-wiki");
+        let modelFiles = bertTokenizer.getModel.save("data", "bert-wiki");
         let fromFile = promisify(WordPiece.fromFile);
-        bert_tokenizer.setModel(await fromFile(model_files[0], {
+        bertTokenizer.setModel(await fromFile(modelFiles[0], {
             unkToken: "[UNK]"
         }));
 
-        bert_tokenizer.save("data/bert-wiki.json")
+        bertTokenizer.save("data/bert-wiki.json")
         // END bert_train_tokenizer
+    };
+    console.log(slow_bert_training); // disable unused warning
+
+    it("shows a full bert example", async () => {
+        // START bert_setup_tokenizer
+        let { Tokenizer } = require("tokenizers/bindings/tokenizer");
+        let { WordPiece } = require("tokenizers/bindings/models");
+
+        let bertTokenizer = Tokenizer(WordPiece.empty());
+        // END bert_setup_tokenizer
+        // START bert_setup_normalizer
+        let { sequenceNormalizer, lowercaseNormalizer, nfdNormalizer, stripAccentsNormalizer }
+            = require("tokenizers/bindings/normalizers");
+
+        bertTokenizer.setNormalizer(sequenceNormalizer([
+            nfdNormalizer(), lowercaseNormalizer(), stripAccentsNormalizer()
+        ]))
+        // END bert_setup_normalizer
+        // START bert_setup_pre_tokenizer
+        let { whitespacePreTokenizer } = require("tokenizers/bindings/pre_tokenizers");
+
+        bertTokenizer.setPreTokenizer(whitespacePreTokenizer());
+        // END bert_setup_pre_tokenizer
+        // START bert_setup_processor
+        let { templateProcessing } = require("tokenizers/bindings/processors");
+
+        bertTokenizer.setPostProcessor(templateProcessing(
+            "[CLS] $A [SEP]",
+            "[CLS] $A [SEP] $B:1 [SEP]:1",
+            [["[CLS]", 1], ["[SEP]", 2]]
+        ));
+        // END bert_setup_processor
+        // START bert_test_decoding
+        let output = bertTokenizer.encode("Welcome to the 🤗 Tokenizers library.");
+        console.log(output.getTokens());
+        // ["[CLS]", "welcome", "to", "the", "[UNK]", "tok", "##eni", "##zer", "##s", "library", ".", "[SEP]"]
+
+        bertTokenizer.decode(output.getIds());
+        // "welcome to the tok ##eni ##zer ##s library ."
+        // END bert_test_decoding
+        // START bert_proper_decoding
+        let { wordPieceDecoder } = require("tokenizers/bindings/decoders");
+        bertTokenizer.setDecoder(wordPieceDecoder());
+        bertTokenizer.decode(output.getIds());
+        // "welcome to the tokenizers library."
+        // END bert_proper_decoding
     });
 });
diff --git a/bindings/python/tests/documentation/test_pipeline.py b/bindings/python/tests/documentation/test_pipeline.py
index 736f7df5..874c740e 100644
--- a/bindings/python/tests/documentation/test_pipeline.py
+++ b/bindings/python/tests/documentation/test_pipeline.py
@@ -76,14 +76,27 @@ class TestPipeline:
         # START setup_processor
         from tokenizers.processors import TemplateProcessing
 
-        tokenizer.post_processor = TemplateProcessing
+        tokenizer.post_processor = TemplateProcessing(
             single="[CLS] $A [SEP]",
             pair="[CLS] $A [SEP] $B:1 [SEP]:1",
             special_tokens=[("[CLS]", 1), ("[SEP]", 2)],
         )
         # END setup_processor
+        # START test_decoding
+        output = tokenizer.encode("Hello, y'all! How are you 😁 ?")
+        print(output.ids)
+        # [1, 27253, 16, 93, 11, 5097, 5, 7961, 5112, 6218, 0, 35, 2]
 
-    def test_bert_example(self):
+        tokenizer.decode([1, 27253, 16, 93, 11, 5097, 5, 7961, 5112, 6218, 0, 35, 2])
+        # "Hello , y ' all ! How are you ?"
+        # END test_decoding
+        assert output.ids == [1, 27253, 16, 93, 11, 5097, 5, 7961, 5112, 6218, 0, 35, 2]
+        assert (
+            tokenizer.decode([1, 27253, 16, 93, 11, 5097, 5, 7961, 5112, 6218, 0, 35, 2])
+            == "Hello , y ' all ! How are you ?"
+        )
+
+    def bert_example(self):
         # START bert_setup_tokenizer
         from tokenizers import Tokenizer
         from tokenizers.models import WordPiece
@@ -94,9 +107,7 @@ class TestPipeline:
         from tokenizers import normalizers
         from tokenizers.normalizers import Lowercase, NFD, StripAccents
 
-        bert_tokenizer.normalizer = normalizers.Sequence([
-            NFD(), Lowercase(), StripAccents()
-        ])
+        bert_tokenizer.normalizer = normalizers.Sequence([NFD(), Lowercase(), StripAccents()])
         # END bert_setup_normalizer
         # START bert_setup_pre_tokenizer
         from tokenizers.pre_tokenizers import Whitespace
@@ -112,7 +123,7 @@ class TestPipeline:
             special_tokens=[
                 ("[CLS]", 1),
                 ("[SEP]", 2),
-            ]
+            ],
         )
         # END bert_setup_processor
         # START bert_train_tokenizer
@@ -129,3 +140,16 @@ class TestPipeline:
 
         bert_tokenizer.save("data/bert-wiki.json")
         # END bert_train_tokenizer
+        # START bert_test_decoding
+        output = bert_tokenizer.encode("Welcome to the 🤗 Tokenizers library.")
+        print(output.tokens)
+        # ["[CLS]", "welcome", "to", "the", "[UNK]", "tok", "##eni", "##zer", "##s", "library", ".", "[SEP]"]
+
+        bert_tokenizer.decode(output.ids)
+        # "welcome to the tok ##eni ##zer ##s library ."
+        # END bert_test_decoding
+        # START bert_proper_decoding
+        bert_tokenizer.decoder = tokenizers.decoders.WordPiece()
+        bert_tokenizer.decode(output.ids)
+        # "welcome to the tokenizers library."
+        # END bert_proper_decoding
diff --git a/docs/source/entities.inc b/docs/source/entities.inc
index 604d3040..cd3f5048 100644
--- a/docs/source/entities.inc
+++ b/docs/source/entities.inc
@@ -18,6 +18,10 @@
         :meth:`~tokenizers.Tokenizer.encode`
     Tokenizer.encode_batch
         :meth:`~tokenizers.Tokenizer.encode_batch`
+    Tokenizer.decode
+        :meth:`~tokenizers.Tokenizer.decode`
+    Tokenizer.decode_batch
+        :meth:`~tokenizers.Tokenizer.decode_batch`
     Tokenizer.token_to_id
         :meth:`~tokenizers.Tokenizer.token_to_id`
     Tokenizer.enable_padding
@@ -42,6 +46,8 @@
         :class:`~tokenizers.models.WordLevel`
     models.WordPiece
        :class:`~tokenizers.models.WordPiece`
+    Decoder
+        :class:`~tokenizers.decoders.Decoder`
 
 .. entities:: rust
 
@@ -63,6 +69,10 @@
         :rust:meth:`~tokenizers::tokenizer::Tokenizer::encode`
     Tokenizer.encode_batch
         :rust:meth:`~tokenizers::tokenizer::Tokenizer::encode_batch`
+    Tokenizer.decode
+        :rust:meth:`~tokenizers::tokenizer::Tokenizer::decode`
+    Tokenizer.decode_batch
+        :rust:meth:`~tokenizers::tokenizer::Tokenizer::decode_batch`
     Tokenizer.token_to_id
         :rust:meth:`~tokenizers::tokenizer::Tokenizer::token_to_id`
     Tokenizer.enable_padding
@@ -87,6 +97,8 @@
         :rust:struct:`~tokenizers::models::wordlevel::WordLevel`
     models.WordPiece
         :rust:struct:`~tokenizers::models::wordpiece::WordPiece`
+    Decoder
+        :rust:trait:`~tokenizers::tokenizer::Decoder`
 
 .. entities:: node
 
@@ -108,6 +120,10 @@
         :obj:`Tokenizer.encode()`
     Tokenizer.encode_batch
         :obj:`Tokenizer.encodeBatch()`
+    Tokenizer.decode
+        :obj:`Tokenizer.decode()`
+    Tokenizer.decode_batch
+        :obj:`Tokenizer.decodeBatch()`
     Tokenizer.token_to_id
         :obj:`Tokenizer.tokenToId()`
     Tokenizer.enable_padding
@@ -132,3 +148,5 @@
         :obj:`WordLevel`
     models.WordPiece
         :obj:`WordPiece`
+    Decoder
+        :obj:`Decoder`
diff --git a/docs/source/pipeline.rst b/docs/source/pipeline.rst
index 1fe13a98..d843ee57 100644
--- a/docs/source/pipeline.rst
+++ b/docs/source/pipeline.rst
@@ -447,40 +447,104 @@ We can use this tokenizer and train on it on wikitext like in the :doc:`quicktou
 Decoding
 ----------------------------------------------------------------------------------------------------
 
-On top of encoding the input texts, a :class:`~tokenizers.Tokenizer` also has an API for decoding,
+.. entities:: python
+
+    bert_tokenizer
+        :obj:`bert_tokenizer`
+
+.. entities:: rust
+
+    bert_tokenizer
+        :obj:`bert_tokenizer`
+
+.. entities:: node
+
+    bert_tokenizer
+        :obj:`bertTokenizer`
+
+
+On top of encoding the input texts, a :entity:`Tokenizer` also has an API for decoding,
 that is converting IDs generated by your model back to a text. This is done by the methods
-:meth:`~tokenizers.Tokenizer.decode` (for one predicted text) and
-:meth:`~tokenizers.Tokenizer.decode_batch` (for a batch of predictions).
+:entity:`Tokenizer.decode` (for one predicted text) and :entity:`Tokenizer.decode_batch` (for a
+batch of predictions).
 
 The `decoder` will first convert the IDs back to tokens (using the tokenizer's vocabulary) and
 remove all special tokens, then join those tokens with spaces:
 
-.. code-block:: python
+.. only:: python
 
-    output = tokenizer.encode("Hello, y'all! How are you 😁 ?")
-    print(output.ids)
-    # [27194, 16, 93, 11, 5068, 5, 7928, 5083, 6190, 0, 35]
+    .. literalinclude:: ../../bindings/python/tests/documentation/test_pipeline.py
+        :language: python
+        :start-after: START test_decoding
+        :end-before: END test_decoding
+        :dedent: 8
 
-    tokenizer.decode([27194, 16, 93, 11, 5068, 5, 7928, 5083, 6190, 0, 35])
-    # "Hello , y ' all ! How are you ?"
+.. only:: rust
+
+    .. literalinclude:: ../../tokenizers/tests/documentation.rs
+        :language: rust
+        :start-after: START pipeline_test_decoding
+        :end-before: END pipeline_test_decoding
+        :dedent: 4
+
+.. only:: node
+
+    .. literalinclude:: ../../bindings/node/examples/documentation/pipeline.test.ts
+        :language: javascript
+        :start-after: START test_decoding
+        :end-before: END test_decoding
+        :dedent: 8
 
 If you used a model that added special characters to represent subtokens of a given "word" (like
 the :obj:`"##"` in WordPiece) you will need to customize the `decoder` to treat them properly. If we
-take our previous :obj:`bert_tokenizer` for instance the default decoing will give:
+take our previous :entity:`bert_tokenizer` for instance, the default decoding will give:
 
-.. code-block:: python
+.. only:: python
 
-    output = bert_tokenizer.encode("Welcome to the 🤗 Tokenizers library.")
-    print(output.tokens)
-    # ["[CLS]", "welcome", "to", "the", "[UNK]", "tok", "##eni", "##zer", "##s", "library", ".", "[SEP]"]
+    .. literalinclude:: ../../bindings/python/tests/documentation/test_pipeline.py
+        :language: python
+        :start-after: START bert_test_decoding
+        :end-before: END bert_test_decoding
+        :dedent: 8
 
-    bert_tokenizer.decoder(output.ids)
-    # "welcome to the tok ##eni ##zer ##s library ."
+.. only:: rust
+
+    .. literalinclude:: ../../tokenizers/tests/documentation.rs
+        :language: rust
+        :start-after: START bert_test_decoding
+        :end-before: END bert_test_decoding
+        :dedent: 4
+
+.. only:: node
+
+    .. literalinclude:: ../../bindings/node/examples/documentation/pipeline.test.ts
+        :language: javascript
+        :start-after: START bert_test_decoding
+        :end-before: END bert_test_decoding
+        :dedent: 8
 
 But by changing it to a proper decoder, we get:
 
-.. code-block:: python
+.. only:: python
 
-    bert_tokenizer.decoder = tokenizers.decoders.WordPiece()
-    bert_tokenizer.decode(output.ids)
-    # "welcome to the tokenizers library."
+    .. literalinclude:: ../../bindings/python/tests/documentation/test_pipeline.py
+        :language: python
+        :start-after: START bert_proper_decoding
+        :end-before: END bert_proper_decoding
+        :dedent: 8
+
+.. only:: rust
+
+    .. literalinclude:: ../../tokenizers/tests/documentation.rs
+        :language: rust
+        :start-after: START bert_proper_decoding
+        :end-before: END bert_proper_decoding
+        :dedent: 4
+
+.. only:: node
+
+    .. literalinclude:: ../../bindings/node/examples/documentation/pipeline.test.ts
+        :language: javascript
+        :start-after: START bert_proper_decoding
+        :end-before: END bert_proper_decoding
+        :dedent: 8
diff --git a/tokenizers/tests/documentation.rs b/tokenizers/tests/documentation.rs
index cc69050a..3a94b5c8 100644
--- a/tokenizers/tests/documentation.rs
+++ b/tokenizers/tests/documentation.rs
@@ -368,6 +368,18 @@ fn pipeline() -> tokenizers::Result<()> {
         .unwrap(),
     );
     // END pipeline_setup_processor
+    // START pipeline_test_decoding
+    let output = tokenizer.encode("Hello, y'all! How are you 😁 ?", true)?;
+    println!("{:?}", output.get_ids());
+    // [1, 27253, 16, 93, 11, 5097, 5, 7961, 5112, 6218, 0, 35, 2]
+
+    let decoded = tokenizer.decode(
+        vec![1, 27253, 16, 93, 11, 5097, 5, 7961, 5112, 6218, 0, 35, 2],
+        true,
+    )?;
+    println!("{}", decoded);
+    // "Hello , y ' all ! How are you ?"
+    // END pipeline_test_decoding
 
     Ok(())
 }
@@ -444,6 +456,22 @@ fn pipeline_bert() -> tokenizers::Result<()> {
 
     bert_tokenizer.save("data/bert-wiki.json", false)?;
     // END bert_train_tokenizer
+    // START bert_test_decoding
+    let output = bert_tokenizer.encode("Welcome to the 🤗 Tokenizers library.", true)?;
+    println!("{:?}", output.get_tokens());
+    // ["[CLS]", "welcome", "to", "the", "[UNK]", "tok", "##eni", "##zer", "##s", "library", ".", "[SEP]"]
+
+    let decoded = bert_tokenizer.decode(output.get_ids().to_vec(), true)?;
+    println!("{}", decoded);
+    // "welcome to the tok ##eni ##zer ##s library ."
+    // END bert_test_decoding
+    // START bert_proper_decoding
+    use tokenizers::decoders::wordpiece::WordPiece as WordPieceDecoder;
+    bert_tokenizer.with_decoder(WordPieceDecoder::default());
+    let decoded = bert_tokenizer.decode(output.get_ids().to_vec(), true)?;
+    // "welcome to the tokenizers library."
+    // END bert_proper_decoding
+    println!("{}", decoded);
 
     Ok(())
 }