Doc - Add code snippets for rust in Quicktour
docs/source/quicktour.rst

@@ -77,6 +77,14 @@ one with a BPE model:
         :end-before: END init_tokenizer
         :dedent: 8
 
+.. only:: rust
+
+    .. literalinclude:: ../../tokenizers/tests/documentation.rs
+        :language: rust
+        :start-after: START init_tokenizer
+        :end-before: END init_tokenizer
+        :dedent: 4
+
 To train our tokenizer on the wikitext files, we will need to instantiate a `trainer`, in this case
 a :entity:`BpeTrainer`
 
@@ -88,6 +96,14 @@ a :entity:`BpeTrainer`
         :end-before: END init_trainer
         :dedent: 8
 
+.. only:: rust
+
+    .. literalinclude:: ../../tokenizers/tests/documentation.rs
+        :language: rust
+        :start-after: START init_trainer
+        :end-before: END init_trainer
+        :dedent: 4
+
 We can set the training arguments like :entity:`vocab_size` or :entity:`min_frequency` (here left at
 their default values of 30,000 and 0) but the most important part is to give the
 :entity:`special_tokens` we plan to use later on (they are not used at all during training) so that
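For reference, the Rust region that this new directive pulls in (`START init_trainer` in tokenizers/tests/documentation.rs, added further down in this diff) builds the trainer roughly like so; the `AddedToken` import actually sits at the top of that file and is shown here only to keep the sketch self-contained::

    use tokenizers::models::bpe::BpeTrainer;
    use tokenizers::AddedToken;

    // vocab_size and min_frequency keep their defaults; only the special tokens are declared.
    let trainer = BpeTrainer::builder()
        .special_tokens(vec![
            AddedToken::from("[UNK]", true),
            AddedToken::from("[CLS]", true),
            AddedToken::from("[SEP]", true),
            AddedToken::from("[PAD]", true),
            AddedToken::from("[MASK]", true),
        ])
        .build();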
@@ -113,6 +129,14 @@ on whitespace.
         :end-before: END init_pretok
         :dedent: 8
 
+.. only:: rust
+
+    .. literalinclude:: ../../tokenizers/tests/documentation.rs
+        :language: rust
+        :start-after: START init_pretok
+        :end-before: END init_pretok
+        :dedent: 4
+
 Now, we can just call the :entity:`Tokenizer.train` method with any list of files we want
 to use:
 
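The matching Rust region (`START train` in the documentation test added below) builds the list of wikitext files and calls `train_and_replace`, the training entry point used by this version of the crate::

    let files = ["test", "train", "valid"]
        .iter()
        .map(|split| format!("data/wikitext-103-raw/wiki.{}.raw", split))
        .collect::<Vec<_>>();
    tokenizer.train_and_replace(&trainer, files)?;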
@@ -124,6 +148,14 @@ to use:
         :end-before: END train
         :dedent: 8
 
+.. only:: rust
+
+    .. literalinclude:: ../../tokenizers/tests/documentation.rs
+        :language: rust
+        :start-after: START train
+        :end-before: END train
+        :dedent: 4
+
 This should only take a few seconds to train our tokenizer on the full wikitext dataset! Once this
 is done, we need to save the model and reinstantiate it with the unknown token, or this token won't
 be used. This will be simplified in a further release, to let you set the :obj:`unk_token` when
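The Rust side of this save-and-reload step is the `START reload_model` region added below: the trained model's files are written to disk, then a fresh `BPE` is rebuilt from them with the unknown token set::

    use std::path::Path;
    use tokenizers::Model;

    let saved_files = tokenizer.get_model().save(&Path::new("data"), Some("wiki"))?;
    tokenizer.with_model(
        BPE::from_file(
            saved_files[0].to_str().unwrap(),
            &saved_files[1].to_str().unwrap(),
        )
        .unk_token("[UNK]".to_string())
        .build()?,
    );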
@@ -137,6 +169,14 @@ first instantiating the model.
         :end-before: END reload_model
         :dedent: 8
 
+.. only:: rust
+
+    .. literalinclude:: ../../tokenizers/tests/documentation.rs
+        :language: rust
+        :start-after: START reload_model
+        :end-before: END reload_model
+        :dedent: 4
+
 To save the tokenizer in one file that contains all its configuration and vocabulary, just use the
 :entity:`Tokenizer.save` method:
 
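In Rust (`START save` below) this is a single call; the second argument toggles pretty-printing of the JSON::

    tokenizer.save("data/tokenizer-wiki.json", false)?;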
@@ -148,6 +188,14 @@ To save the tokenizer in one file that contains all its configuration and vocabu
         :end-before: END save
         :dedent: 8
 
+.. only:: rust
+
+    .. literalinclude:: ../../tokenizers/tests/documentation.rs
+        :language: rust
+        :start-after: START save
+        :end-before: END save
+        :dedent: 4
+
 and you can reload your tokenizer from that file with the :entity:`Tokenizer.from_file`
 :entity:`classmethod`:
 
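The Rust counterpart (`START reload_tokenizer` below) uses `Tokenizer::from_file`::

    use tokenizers::Tokenizer;

    let mut tokenizer = Tokenizer::from_file("data/tokenizer-wiki.json")?;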
@@ -159,6 +207,14 @@ and you can reload your tokenizer from that file with the :entity:`Tokenizer.fro
         :end-before: END reload_tokenizer
         :dedent: 12
 
+.. only:: rust
+
+    .. literalinclude:: ../../tokenizers/tests/documentation.rs
+        :language: rust
+        :start-after: START reload_tokenizer
+        :end-before: END reload_tokenizer
+        :dedent: 4
+
 Using the tokenizer
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
@@ -173,6 +229,14 @@ Now that we have trained a tokenizer, we can use it on any text we want with the
         :end-before: END encode
         :dedent: 8
 
+.. only:: rust
+
+    .. literalinclude:: ../../tokenizers/tests/documentation.rs
+        :language: rust
+        :start-after: START encode
+        :end-before: END encode
+        :dedent: 4
+
 This applied the full pipeline of the tokenizer on the text, returning an
 :class:`~tokenizers.Encoding` object. To learn more about this pipeline, and how to apply (or
 customize) parts of it, check out :doc:`this page <pipeline>`.
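In Rust (`START encode` and `START print_tokens` below), encoding returns an `Encoding` whose tokens can be inspected with `get_tokens`::

    let output = tokenizer.encode("Hello, y'all! How are you 😁 ?", true)?;
    println!("{:?}", output.get_tokens());
    // ["Hello", ",", "y", "'", "all", "!", "How", "are", "you", "[UNK]", "?"]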
@@ -189,6 +253,14 @@ tokens:
         :end-before: END print_tokens
         :dedent: 8
 
+.. only:: rust
+
+    .. literalinclude:: ../../tokenizers/tests/documentation.rs
+        :language: rust
+        :start-after: START print_tokens
+        :end-before: END print_tokens
+        :dedent: 4
+
 Similarly, the :obj:`ids` attribute will contain the index of each of those tokens in the
 tokenizer's vocabulary:
 
@@ -200,6 +272,14 @@ tokenizer's vocabulary:
         :end-before: END print_ids
         :dedent: 8
 
+.. only:: rust
+
+    .. literalinclude:: ../../tokenizers/tests/documentation.rs
+        :language: rust
+        :start-after: START print_ids
+        :end-before: END print_ids
+        :dedent: 4
+
 An important feature of the 🤗 Tokenizers library is that it comes with full alignment tracking,
 meaning you can always get the part of your original sentence that corresponds to a given token.
 Those are stored in the :obj:`offsets` attribute of our :class:`~tokenizers.Encoding` object. For
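On the Rust side (`START print_offsets` and `START use_offsets` below), offsets come from `Encoding::get_offsets` and index straight into the original string::

    println!("{:?}", output.get_offsets()[9]);
    // (26, 30)
    let sentence = "Hello, y'all! How are you 😁 ?";
    println!("{}", &sentence[26..30]);
    // "😁"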
@@ -214,6 +294,14 @@ which is the token at index 9 in the list, we can just ask for the offset at the
         :end-before: END print_offsets
         :dedent: 8
 
+.. only:: rust
+
+    .. literalinclude:: ../../tokenizers/tests/documentation.rs
+        :language: rust
+        :start-after: START print_offsets
+        :end-before: END print_offsets
+        :dedent: 4
+
 and those are the indices that correspond to the emoji in the original sentence:
 
 .. only:: python
@@ -224,6 +312,14 @@ and those are the indices that correspond to the emoji in the original sentence:
         :end-before: END use_offsets
         :dedent: 8
 
+.. only:: rust
+
+    .. literalinclude:: ../../tokenizers/tests/documentation.rs
+        :language: rust
+        :start-after: START use_offsets
+        :end-before: END use_offsets
+        :dedent: 4
+
 Post-processing
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
@@ -244,6 +340,14 @@ list of special tokens, so this should be their IDs. To double-check, we can use
         :end-before: END check_sep
         :dedent: 8
 
+.. only:: rust
+
+    .. literalinclude:: ../../tokenizers/tests/documentation.rs
+        :language: rust
+        :start-after: START check_sep
+        :end-before: END check_sep
+        :dedent: 4
+
 Here is how we can set the post-processing to give us the traditional BERT inputs:
 
 .. only:: python
@@ -254,6 +358,14 @@ Here is how we can set the post-processing to give us the traditional BERT input
         :end-before: END init_template_processing
         :dedent: 8
 
+.. only:: rust
+
+    .. literalinclude:: ../../tokenizers/tests/documentation.rs
+        :language: rust
+        :start-after: START init_template_processing
+        :end-before: END init_template_processing
+        :dedent: 4
+
 Let's go over this snippet of code in more details. First we specify the template for single
 sentences: those should have the form :obj:`"[CLS] $A [SEP]"` where :obj:`$A` represents our
 sentence.
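The Rust version of this post-processor setup (`START init_template_processing` below) goes through the `TemplateProcessing` builder, looking up the special token IDs from the tokenizer first::

    use tokenizers::processors::template::TemplateProcessing;

    let special_tokens = vec![
        ("[CLS]", tokenizer.token_to_id("[CLS]").unwrap()),
        ("[SEP]", tokenizer.token_to_id("[SEP]").unwrap()),
    ];
    tokenizer.with_post_processor(
        TemplateProcessing::builder()
            .try_single("[CLS] $A [SEP]")
            .unwrap()
            .try_pair("[CLS] $A [SEP] $B:1 [SEP]:1")
            .unwrap()
            .special_tokens(special_tokens)
            .build()?,
    );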
@@ -276,6 +388,14 @@ To check out this worked properly, let's try to encode the same sentence as befo
         :end-before: END print_special_tokens
         :dedent: 8
 
+.. only:: rust
+
+    .. literalinclude:: ../../tokenizers/tests/documentation.rs
+        :language: rust
+        :start-after: START print_special_tokens
+        :end-before: END print_special_tokens
+        :dedent: 4
+
 To check the results on a pair of sentences, we just pass the two sentences to
 :meth:`~tokenizers.Tokenizer.encode`:
 
@@ -287,6 +407,14 @@ To check the results on a pair of sentences, we just pass the two sentences to
         :end-before: END print_special_tokens_pair
         :dedent: 8
 
+.. only:: rust
+
+    .. literalinclude:: ../../tokenizers/tests/documentation.rs
+        :language: rust
+        :start-after: START print_special_tokens_pair
+        :end-before: END print_special_tokens_pair
+        :dedent: 4
+
 You can then check the type IDs attributed to each token is correct with
 
 .. only:: python
@@ -297,6 +425,14 @@ You can then check the type IDs attributed to each token is correct with
         :end-before: END print_type_ids
         :dedent: 8
 
+.. only:: rust
+
+    .. literalinclude:: ../../tokenizers/tests/documentation.rs
+        :language: rust
+        :start-after: START print_type_ids
+        :end-before: END print_type_ids
+        :dedent: 4
+
 If you save your tokenizer with :meth:`~tokenizers.Tokenizer.save`, the post-processor will be saved
 along.
 
@@ -314,6 +450,14 @@ using the :meth:`~tokenizers.Tokenizer.encode_batch` method:
         :end-before: END encode_batch
         :dedent: 8
 
+.. only:: rust
+
+    .. literalinclude:: ../../tokenizers/tests/documentation.rs
+        :language: rust
+        :start-after: START encode_batch
+        :end-before: END encode_batch
+        :dedent: 4
+
 The output is then a list of :class:`~tokenizers.Encoding` objects like the ones we saw before. You
 can process together as many texts as you like, as long as it fits in memory.
 
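In Rust (`START encode_batch` below), batching is just a `Vec` of inputs::

    let output = tokenizer.encode_batch(vec!["Hello, y'all!", "How are you 😁 ?"], true)?;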
@@ -329,6 +473,14 @@ B:
         :end-before: END encode_batch_pair
         :dedent: 8
 
+.. only:: rust
+
+    .. literalinclude:: ../../tokenizers/tests/documentation.rs
+        :language: rust
+        :start-after: START encode_batch_pair
+        :end-before: END encode_batch_pair
+        :dedent: 4
+
 When encoding multiple sentences, you can automatically pad the outputs to the longest sentence
 present by using :meth:`~tokenizers.Tokenizer.enable_padding`, with the :obj:`pad_token` and its ID
 (which we can double-check the id for the padding token with
@@ -342,6 +494,14 @@ present by using :meth:`~tokenizers.Tokenizer.enable_padding`, with the :obj:`pa
         :end-before: END enable_padding
         :dedent: 8
 
+.. only:: rust
+
+    .. literalinclude:: ../../tokenizers/tests/documentation.rs
+        :language: rust
+        :start-after: START enable_padding
+        :end-before: END enable_padding
+        :dedent: 4
+
 We can set the :obj:`direction` of the padding (defaults to the right) or a given :obj:`length` if
 we want to pad every sample to that specific number (here we leave it unset to pad to the size of
 the longest text).
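The Rust configuration (`START enable_padding` below) fills in a `PaddingParams` struct, leaving the strategy at its default of padding to the longest entry in the batch::

    use tokenizers::PaddingParams;

    tokenizer.with_padding(Some(PaddingParams {
        pad_id: 3,
        pad_token: "[PAD]".to_string(),
        ..PaddingParams::default()
    }));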
@@ -354,6 +514,14 @@ the longest text).
         :end-before: END print_batch_tokens
         :dedent: 8
 
+.. only:: rust
+
+    .. literalinclude:: ../../tokenizers/tests/documentation.rs
+        :language: rust
+        :start-after: START print_batch_tokens
+        :end-before: END print_batch_tokens
+        :dedent: 4
+
 In this case, the `attention mask` generated by the tokenizer takes the padding into account:
 
 .. only:: python
@@ -364,6 +532,14 @@ In this case, the `attention mask` generated by the tokenizer takes the padding
         :end-before: END print_attention_mask
         :dedent: 8
 
+.. only:: rust
+
+    .. literalinclude:: ../../tokenizers/tests/documentation.rs
+        :language: rust
+        :start-after: START print_attention_mask
+        :end-before: END print_attention_mask
+        :dedent: 4
+
 .. _pretrained:
 
 Using a pretrained tokenizer
tokenizers/Makefile

@@ -6,7 +6,7 @@ dir_guard=@mkdir -p $(@D)
 
 SHARED_RESOURCES = $(DATA_DIR)/gpt2-vocab.json $(DATA_DIR)/gpt2-merges.txt $(DATA_DIR)/bert-base-uncased-vocab.txt $(DATA_DIR)/big.txt $(DATA_DIR)/small.txt
 BENCHMARK_RESOURCES = $(SHARED_RESOURCES)
-TESTS_RESOURCES = $(SHARED_RESOURCES) $(DATA_DIR)/unigram.json $(DATA_DIR)/unigram_wagahaiwa_nekodearu.txt $(DATA_DIR)/albert-base-v1-tokenizer.json $(DATA_DIR)/roberta.json
+TESTS_RESOURCES = $(SHARED_RESOURCES) $(DATA_DIR)/unigram.json $(DATA_DIR)/unigram_wagahaiwa_nekodearu.txt $(DATA_DIR)/albert-base-v1-tokenizer.json $(DATA_DIR)/roberta.json $(DATA_DIR)/tokenizer-wiki.json
 
 .PHONY : build
 build :
@@ -71,3 +71,7 @@ $(DATA_DIR)/small.txt : $(DATA_DIR)/big.txt
 $(DATA_DIR)/roberta.json :
 	$(dir_guard)
 	wget https://storage.googleapis.com/tokenizers/roberta.json -O $@
+
+$(DATA_DIR)/tokenizer-wiki.json :
+	$(dir_guard)
+	wget https://s3.amazonaws.com/models.huggingface.co/bert/anthony/doc-quicktour/tokenizer.json -O $@
tokenizers/src/tokenizer/mod.rs

@@ -24,11 +24,6 @@ use serde::de::DeserializeOwned;
 use serde::export::Formatter;
 use serde::{Deserialize, Serialize};
 
-use crate::decoders::DecoderWrapper;
-use crate::models::ModelWrapper;
-use crate::normalizers::NormalizerWrapper;
-use crate::pre_tokenizers::PreTokenizerWrapper;
-use crate::processors::PostProcessorWrapper;
 use crate::utils::parallelism::*;
 
 mod added_vocabulary;
@@ -38,6 +33,13 @@ pub mod pattern;
 pub mod pre_tokenizer;
 mod serialization;
 
+// Re-export wrappers
+pub use crate::decoders::DecoderWrapper;
+pub use crate::models::ModelWrapper;
+pub use crate::normalizers::NormalizerWrapper;
+pub use crate::pre_tokenizers::PreTokenizerWrapper;
+pub use crate::processors::PostProcessorWrapper;
+// And some other types
 pub use crate::utils::iter::LinesWithEnding;
 pub use crate::utils::padding::{pad_encodings, PaddingDirection, PaddingParams, PaddingStrategy};
 pub use crate::utils::truncation::{truncate_encodings, TruncationParams, TruncationStrategy};
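With these re-exports in place, downstream code (such as the documentation test below) can bring the wrapper types in straight from the crate root, for example::

    use tokenizers::{DecoderWrapper, NormalizerWrapper, PostProcessorWrapper, PreTokenizerWrapper};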
@@ -383,7 +385,6 @@ impl Tokenizer {
     > {
         self.0
     }
-
     pub fn from_file<P: AsRef<Path>>(file: P) -> Result<Self> {
         let content = read_to_string(file)?;
         Ok(serde_json::from_str(&content)?)
tokenizers/tests/documentation.rs

@@ -1,12 +1,12 @@
-use tokenizers::models::bpe::{BpeTrainerBuilder, BPE};
+use tokenizers::models::bpe::{BpeTrainer, BpeTrainerBuilder, BPE};
 use tokenizers::normalizers::{Sequence, Strip, NFC};
 use tokenizers::pre_tokenizers::byte_level::ByteLevel;
-use tokenizers::Tokenizer;
 use tokenizers::{AddedToken, TokenizerBuilder};
+use tokenizers::{DecoderWrapper, NormalizerWrapper, PostProcessorWrapper, PreTokenizerWrapper};
+use tokenizers::{Tokenizer, TokenizerImpl};
 
 #[test]
 fn train_tokenizer() {
-    // START train_tokenizer
     let vocab_size: usize = 100;
     let tokenizer = TokenizerBuilder::new()
         .with_model(BPE::default())
@@ -39,14 +39,11 @@ fn train_tokenizer() {
         .unwrap()
         .save("data/tokenizer.json", pretty)
         .unwrap();
-    // END train_tokenizer
 }
 
 #[test]
 fn load_tokenizer() {
-    // START load_tokenizer
     let tokenizer = Tokenizer::from_file("data/roberta.json").unwrap();
-    // END load_tokenizer
 
     let example = "This is an example";
     let ids = vec![713, 16, 41, 1246];
@@ -60,3 +57,208 @@ fn load_tokenizer() {
     let decoded = tokenizer.decode(ids, false).unwrap();
     assert_eq!(decoded, example);
 }
+
+#[test]
+#[ignore]
+fn quicktour_slow_train() -> tokenizers::Result<()> {
+    let (mut tokenizer, trainer) = quicktour_get_tokenizer_trainer()?;
+
+    // START train
+    let files = ["test", "train", "valid"]
+        .iter()
+        .map(|split| format!("data/wikitext-103-raw/wiki.{}.raw", split))
+        .collect::<Vec<_>>();
+    tokenizer.train_and_replace(&trainer, files)?;
+    // END train
+    // START reload_model
+    use std::path::Path;
+    use tokenizers::Model;
+
+    let saved_files = tokenizer
+        .get_model()
+        .save(&Path::new("data"), Some("wiki"))?;
+    tokenizer.with_model(
+        BPE::from_file(
+            saved_files[0].to_str().unwrap(),
+            &saved_files[1].to_str().unwrap(),
+        )
+        .unk_token("[UNK]".to_string())
+        .build()?,
+    );
+    // END reload_model
+    // START save
+    tokenizer.save("data/tokenizer-wiki.json", false)?;
+    // END save
+
+    Ok(())
+}
+
+#[allow(unused_imports)]
+fn quicktour_get_tokenizer_trainer() -> tokenizers::Result<(
+    TokenizerImpl<
+        BPE,
+        NormalizerWrapper,
+        PreTokenizerWrapper,
+        PostProcessorWrapper,
+        DecoderWrapper,
+    >,
+    BpeTrainer,
+)> {
+    // START init_tokenizer
+    use tokenizers::models::bpe::BPE;
+    use tokenizers::TokenizerBuilder;
+
+    let mut tokenizer: TokenizerImpl<
+        BPE,
+        NormalizerWrapper,
+        PreTokenizerWrapper,
+        PostProcessorWrapper,
+        DecoderWrapper,
+    > = TokenizerImpl::new(BPE::default());
+    // END init_tokenizer
+    // START init_trainer
+    use tokenizers::models::bpe::BpeTrainer;
+
+    let trainer = BpeTrainer::builder()
+        .special_tokens(vec![
+            AddedToken::from("[UNK]", true),
+            AddedToken::from("[CLS]", true),
+            AddedToken::from("[SEP]", true),
+            AddedToken::from("[PAD]", true),
+            AddedToken::from("[MASK]", true),
+        ])
+        .build();
+    // END init_trainer
+    // START init_pretok
+    use tokenizers::pre_tokenizers::whitespace::Whitespace;
+
+    tokenizer.with_pre_tokenizer(Whitespace::default());
+    // END init_pretok
+
+    Ok((tokenizer, trainer))
+}
+
+#[test]
+fn quicktour() -> tokenizers::Result<()> {
+    // START reload_tokenizer
+    let mut tokenizer = Tokenizer::from_file("data/tokenizer-wiki.json")?;
+    // END reload_tokenizer
+    // START encode
+    let output = tokenizer.encode("Hello, y'all! How are you 😁 ?", true)?;
+    // END encode
+    // START print_tokens
+    println!("{:?}", output.get_tokens());
+    // ["Hello", ",", "y", "'", "all", "!", "How", "are", "you", "[UNK]", "?",]
+    // END print_tokens
+    assert_eq!(
+        output.get_tokens(),
+        ["Hello", ",", "y", "'", "all", "!", "How", "are", "you", "[UNK]", "?",]
+    );
+    // START print_ids
+    println!("{:?}", output.get_ids());
+    // [27253, 16, 93, 11, 5097, 5, 7961, 5112, 6218, 0, 35]
+    // END print_ids
+    assert_eq!(
+        output.get_ids(),
+        [27253, 16, 93, 11, 5097, 5, 7961, 5112, 6218, 0, 35]
+    );
+    // START print_offsets
+    println!("{:?}", output.get_offsets()[9]);
+    // (26, 30)
+    // END print_offsets
+    assert_eq!(output.get_offsets()[9], (26, 30));
+    // START use_offsets
+    let sentence = "Hello, y'all! How are you 😁 ?";
+    println!("{}", &sentence[26..30]);
+    // "😁"
+    // END use_offsets
+    // START check_sep
+    println!("{}", tokenizer.token_to_id("[SEP]").unwrap());
+    // 2
+    // END check_sep
+    assert_eq!(tokenizer.token_to_id("[SEP]"), Some(2));
+    // START init_template_processing
+    use tokenizers::processors::template::TemplateProcessing;
+
+    let special_tokens = vec![
+        ("[CLS]", tokenizer.token_to_id("[CLS]").unwrap()),
+        ("[SEP]", tokenizer.token_to_id("[SEP]").unwrap()),
+    ];
+    tokenizer.with_post_processor(
+        TemplateProcessing::builder()
+            .try_single("[CLS] $A [SEP]")
+            .unwrap()
+            .try_pair("[CLS] $A [SEP] $B:1 [SEP]:1")
+            .unwrap()
+            .special_tokens(special_tokens)
+            .build()?,
+    );
+    // END init_template_processing
+    // START print_special_tokens
+    let output = tokenizer.encode("Hello, y'all! How are you 😁 ?", true)?;
+    println!("{:?}", output.get_tokens());
+    // ["[CLS]", "Hello", ",", "y", "'", "all", "!", "How", "are", "you", "[UNK]", "?", "[SEP]"]
+    // END print_special_tokens
+    assert_eq!(
+        output.get_tokens(),
+        ["[CLS]", "Hello", ",", "y", "'", "all", "!", "How", "are", "you", "[UNK]", "?", "[SEP]"]
+    );
+    // START print_special_tokens_pair
+    let output = tokenizer.encode(("Hello, y'all!", "How are you 😁 ?"), true)?;
+    println!("{:?}", output.get_tokens());
+    // ["[CLS]", "Hello", ",", "y", "'", "all", "!", "[SEP]", "How", "are", "you", "[UNK]", "?", "[SEP]"]
+    // END print_special_tokens_pair
+    assert_eq!(
+        output.get_tokens(),
+        [
+            "[CLS]", "Hello", ",", "y", "'", "all", "!", "[SEP]", "How", "are", "you", "[UNK]",
+            "?", "[SEP]"
+        ]
+    );
+    // START print_type_ids
+    println!("{:?}", output.get_type_ids());
+    // [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]
+    // END print_type_ids
+    assert_eq!(
+        output.get_type_ids(),
+        [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]
+    );
+    // START encode_batch
+    let output = tokenizer.encode_batch(vec!["Hello, y'all!", "How are you 😁 ?"], true)?;
+    // END encode_batch
+    println!("{:?}", output);
+    // START encode_batch_pair
+    let output = tokenizer.encode_batch(
+        vec![
+            ("Hello, y'all!", "How are you 😁 ?"),
+            ("Hello to you too!", "I'm fine, thank you!"),
+        ],
+        true,
+    )?;
+    // END encode_batch_pair
+    println!("{:?}", output);
+    // START enable_padding
+    use tokenizers::PaddingParams;
+
+    tokenizer.with_padding(Some(PaddingParams {
+        pad_id: 3,
+        pad_token: "[PAD]".to_string(),
+        ..PaddingParams::default()
+    }));
+    // END enable_padding
+    // START print_batch_tokens
+    let output = tokenizer.encode_batch(vec!["Hello, y'all!", "How are you 😁 ?"], true)?;
+    println!("{:?}", output[1].get_tokens());
+    // ["[CLS]", "How", "are", "you", "[UNK]", "?", "[SEP]", "[PAD]"]
+    // END print_batch_tokens
+    assert_eq!(
+        output[1].get_tokens(),
+        ["[CLS]", "How", "are", "you", "[UNK]", "?", "[SEP]", "[PAD]"]
+    );
+    // START print_attention_mask
+    println!("{:?}", output[1].get_attention_mask());
+    // [1, 1, 1, 1, 1, 1, 1, 0]
+    // END print_attention_mask
+    assert_eq!(output[1].get_attention_mask(), [1, 1, 1, 1, 1, 1, 1, 0]);
+    Ok(())
+}