Doc - Better namespace for rust tests
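
This commit renames the anchor comments that the Rust documentation tests expose to Sphinx, prefixing each one with the page it belongs to (`init_tokenizer` becomes `quicktour_init_tokenizer`, and so on), presumably so that anchors from different documentation pages cannot collide inside the shared `tokenizers/tests/documentation.rs` file. The first diff below updates the `literalinclude` directives on the docs side (the quicktour page, judging from the anchor names); the second updates the matching `// START ...` / `// END ...` markers in the Rust tests themselves.

For orientation, here is a minimal sketch of the convention, with a hypothetical anchor and test name (`quicktour_sketch`): the docs would point at it with `:start-after: START quicktour_sketch` and `:end-before: END quicktour_sketch`, and `:dedent: 4` strips the function-body indentation when the snippet is rendered.

    #[test]
    fn quicktour_sketch() -> tokenizers::Result<()> {
        // START quicktour_sketch
        use tokenizers::Tokenizer;

        // Load a previously saved tokenizer (the same data file the real
        // quicktour test uses) and run it on a sample sentence.
        let tokenizer = Tokenizer::from_file("data/tokenizer-wiki.json")?;
        let output = tokenizer.encode("Hello, y'all! How are you 😁 ?", true)?;
        println!("{:?}", output.get_tokens());
        // END quicktour_sketch
        Ok(())
    }
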
@@ -95,8 +95,8 @@ one with a BPE model:

 .. literalinclude:: ../../tokenizers/tests/documentation.rs
     :language: rust
-    :start-after: START init_tokenizer
-    :end-before: END init_tokenizer
+    :start-after: START quicktour_init_tokenizer
+    :end-before: END quicktour_init_tokenizer
     :dedent: 4

 To train our tokenizer on the wikitext files, we will need to instantiate a `trainer`, in this case
@@ -114,8 +114,8 @@ a :entity:`BpeTrainer`

 .. literalinclude:: ../../tokenizers/tests/documentation.rs
     :language: rust
-    :start-after: START init_trainer
-    :end-before: END init_trainer
+    :start-after: START quicktour_init_trainer
+    :end-before: END quicktour_init_trainer
     :dedent: 4

 We can set the training arguments like :entity:`vocab_size` or :entity:`min_frequency` (here left at
@@ -147,8 +147,8 @@ on whitespace.

 .. literalinclude:: ../../tokenizers/tests/documentation.rs
     :language: rust
-    :start-after: START init_pretok
-    :end-before: END init_pretok
+    :start-after: START quicktour_init_pretok
+    :end-before: END quicktour_init_pretok
     :dedent: 4

 Now, we can just call the :entity:`Tokenizer.train` method with any list of files we want
@@ -166,8 +166,8 @@ to use:

 .. literalinclude:: ../../tokenizers/tests/documentation.rs
     :language: rust
-    :start-after: START train
-    :end-before: END train
+    :start-after: START quicktour_train
+    :end-before: END quicktour_train
     :dedent: 4

 This should only take a few seconds to train our tokenizer on the full wikitext dataset! Once this
@@ -187,8 +187,8 @@ first instantiating the model.

 .. literalinclude:: ../../tokenizers/tests/documentation.rs
     :language: rust
-    :start-after: START reload_model
-    :end-before: END reload_model
+    :start-after: START quicktour_reload_model
+    :end-before: END quicktour_reload_model
     :dedent: 4

 To save the tokenizer in one file that contains all its configuration and vocabulary, just use the
@@ -206,8 +206,8 @@ To save the tokenizer in one file that contains all its configuration and vocabu

 .. literalinclude:: ../../tokenizers/tests/documentation.rs
     :language: rust
-    :start-after: START save
-    :end-before: END save
+    :start-after: START quicktour_save
+    :end-before: END quicktour_save
     :dedent: 4

 and you can reload your tokenizer from that file with the :entity:`Tokenizer.from_file`
@@ -225,8 +225,8 @@ and you can reload your tokenizer from that file with the :entity:`Tokenizer.fro

 .. literalinclude:: ../../tokenizers/tests/documentation.rs
     :language: rust
-    :start-after: START reload_tokenizer
-    :end-before: END reload_tokenizer
+    :start-after: START quicktour_reload_tokenizer
+    :end-before: END quicktour_reload_tokenizer
     :dedent: 4

 Using the tokenizer
@@ -247,8 +247,8 @@ Now that we have trained a tokenizer, we can use it on any text we want with the

 .. literalinclude:: ../../tokenizers/tests/documentation.rs
     :language: rust
-    :start-after: START encode
-    :end-before: END encode
+    :start-after: START quicktour_encode
+    :end-before: END quicktour_encode
     :dedent: 4

 This applied the full pipeline of the tokenizer on the text, returning an
@@ -271,8 +271,8 @@ tokens:

 .. literalinclude:: ../../tokenizers/tests/documentation.rs
     :language: rust
-    :start-after: START print_tokens
-    :end-before: END print_tokens
+    :start-after: START quicktour_print_tokens
+    :end-before: END quicktour_print_tokens
     :dedent: 4

 Similarly, the :obj:`ids` attribute will contain the index of each of those tokens in the
@@ -290,8 +290,8 @@ tokenizer's vocabulary:

 .. literalinclude:: ../../tokenizers/tests/documentation.rs
     :language: rust
-    :start-after: START print_ids
-    :end-before: END print_ids
+    :start-after: START quicktour_print_ids
+    :end-before: END quicktour_print_ids
     :dedent: 4

 An important feature of the 🤗 Tokenizers library is that it comes with full alignment tracking,
@@ -312,8 +312,8 @@ which is the token at index 9 in the list, we can just ask for the offset at the

 .. literalinclude:: ../../tokenizers/tests/documentation.rs
     :language: rust
-    :start-after: START print_offsets
-    :end-before: END print_offsets
+    :start-after: START quicktour_print_offsets
+    :end-before: END quicktour_print_offsets
     :dedent: 4

 and those are the indices that correspond to the emoji in the original sentence:
@@ -330,8 +330,8 @@ and those are the indices that correspond to the emoji in the original sentence:

 .. literalinclude:: ../../tokenizers/tests/documentation.rs
     :language: rust
-    :start-after: START use_offsets
-    :end-before: END use_offsets
+    :start-after: START quicktour_use_offsets
+    :end-before: END quicktour_use_offsets
     :dedent: 4

 Post-processing
@@ -358,8 +358,8 @@ list of special tokens, so this should be their IDs. To double-check, we can use

 .. literalinclude:: ../../tokenizers/tests/documentation.rs
     :language: rust
-    :start-after: START check_sep
-    :end-before: END check_sep
+    :start-after: START quicktour_check_sep
+    :end-before: END quicktour_check_sep
     :dedent: 4

 Here is how we can set the post-processing to give us the traditional BERT inputs:
@@ -376,8 +376,8 @@ Here is how we can set the post-processing to give us the traditional BERT input

 .. literalinclude:: ../../tokenizers/tests/documentation.rs
     :language: rust
-    :start-after: START init_template_processing
-    :end-before: END init_template_processing
+    :start-after: START quicktour_init_template_processing
+    :end-before: END quicktour_init_template_processing
     :dedent: 4

 Let's go over this snippet of code in more details. First we specify the template for single
@@ -406,8 +406,8 @@ To check out this worked properly, let's try to encode the same sentence as befo

 .. literalinclude:: ../../tokenizers/tests/documentation.rs
     :language: rust
-    :start-after: START print_special_tokens
-    :end-before: END print_special_tokens
+    :start-after: START quicktour_print_special_tokens
+    :end-before: END quicktour_print_special_tokens
     :dedent: 4

 To check the results on a pair of sentences, we just pass the two sentences to
@@ -425,8 +425,8 @@ To check the results on a pair of sentences, we just pass the two sentences to

 .. literalinclude:: ../../tokenizers/tests/documentation.rs
     :language: rust
-    :start-after: START print_special_tokens_pair
-    :end-before: END print_special_tokens_pair
+    :start-after: START quicktour_print_special_tokens_pair
+    :end-before: END quicktour_print_special_tokens_pair
     :dedent: 4

 You can then check the type IDs attributed to each token is correct with
@@ -443,8 +443,8 @@ You can then check the type IDs attributed to each token is correct with

 .. literalinclude:: ../../tokenizers/tests/documentation.rs
     :language: rust
-    :start-after: START print_type_ids
-    :end-before: END print_type_ids
+    :start-after: START quicktour_print_type_ids
+    :end-before: END quicktour_print_type_ids
     :dedent: 4

 If you save your tokenizer with :entity:`Tokenizer.save`, the post-processor will be saved along.
@@ -467,8 +467,8 @@ using the :entity:`Tokenizer.encode_batch` method:

 .. literalinclude:: ../../tokenizers/tests/documentation.rs
     :language: rust
-    :start-after: START encode_batch
-    :end-before: END encode_batch
+    :start-after: START quicktour_encode_batch
+    :end-before: END quicktour_encode_batch
     :dedent: 4

 The output is then a list of :entity:`Encoding` objects like the ones we saw before. You
@@ -490,8 +490,8 @@ B:

 .. literalinclude:: ../../tokenizers/tests/documentation.rs
     :language: rust
-    :start-after: START encode_batch_pair
-    :end-before: END encode_batch_pair
+    :start-after: START quicktour_encode_batch_pair
+    :end-before: END quicktour_encode_batch_pair
     :dedent: 4

 When encoding multiple sentences, you can automatically pad the outputs to the longest sentence
@@ -511,8 +511,8 @@ present by using :entity:`Tokenizer.enable_padding`, with the :entity:`pad_token

 .. literalinclude:: ../../tokenizers/tests/documentation.rs
     :language: rust
-    :start-after: START enable_padding
-    :end-before: END enable_padding
+    :start-after: START quicktour_enable_padding
+    :end-before: END quicktour_enable_padding
     :dedent: 4

 We can set the :obj:`direction` of the padding (defaults to the right) or a given :obj:`length` if
@@ -531,8 +531,8 @@ the longest text).

 .. literalinclude:: ../../tokenizers/tests/documentation.rs
     :language: rust
-    :start-after: START print_batch_tokens
-    :end-before: END print_batch_tokens
+    :start-after: START quicktour_print_batch_tokens
+    :end-before: END quicktour_print_batch_tokens
     :dedent: 4

 In this case, the `attention mask` generated by the tokenizer takes the padding into account:
@@ -549,8 +549,8 @@ In this case, the `attention mask` generated by the tokenizer takes the padding

 .. literalinclude:: ../../tokenizers/tests/documentation.rs
     :language: rust
-    :start-after: START print_attention_mask
-    :end-before: END print_attention_mask
+    :start-after: START quicktour_print_attention_mask
+    :end-before: END quicktour_print_attention_mask
     :dedent: 4

 .. _pretrained:
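
The matching renames on the Rust side, in tokenizers/tests/documentation.rs (the file the directives above include from):
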
@@ -63,14 +63,14 @@ fn load_tokenizer() {
 fn quicktour_slow_train() -> tokenizers::Result<()> {
     let (mut tokenizer, trainer) = quicktour_get_tokenizer_trainer()?;

-    // START train
+    // START quicktour_train
     let files = ["test", "train", "valid"]
         .iter()
         .map(|split| format!("data/wikitext-103-raw/wiki.{}.raw", split))
         .collect::<Vec<_>>();
     tokenizer.train_and_replace(&trainer, files)?;
-    // END train
-    // START reload_model
+    // END quicktour_train
+    // START quicktour_reload_model
     use std::path::Path;
     use tokenizers::Model;

@@ -85,10 +85,10 @@ fn quicktour_slow_train() -> tokenizers::Result<()> {
             .unk_token("[UNK]".to_string())
             .build()?,
     );
-    // END reload_model
-    // START save
+    // END quicktour_reload_model
+    // START quicktour_save
     tokenizer.save("data/tokenizer-wiki.json", false)?;
-    // END save
+    // END quicktour_save

     Ok(())
 }
@@ -104,7 +104,7 @@ fn quicktour_get_tokenizer_trainer() -> tokenizers::Result<(
     >,
     BpeTrainer,
 )> {
-    // START init_tokenizer
+    // START quicktour_init_tokenizer
     use tokenizers::models::bpe::BPE;
     use tokenizers::TokenizerBuilder;

@@ -115,8 +115,8 @@ fn quicktour_get_tokenizer_trainer() -> tokenizers::Result<(
         PostProcessorWrapper,
         DecoderWrapper,
     > = TokenizerImpl::new(BPE::default());
-    // END init_tokenizer
-    // START init_trainer
+    // END quicktour_init_tokenizer
+    // START quicktour_init_trainer
     use tokenizers::models::bpe::BpeTrainer;

     let trainer = BpeTrainer::builder()
@@ -128,56 +128,56 @@ fn quicktour_get_tokenizer_trainer() -> tokenizers::Result<(
             AddedToken::from("[MASK]", true),
         ])
         .build();
-    // END init_trainer
-    // START init_pretok
+    // END quicktour_init_trainer
+    // START quicktour_init_pretok
     use tokenizers::pre_tokenizers::whitespace::Whitespace;

     tokenizer.with_pre_tokenizer(Whitespace::default());
-    // END init_pretok
+    // END quicktour_init_pretok

     Ok((tokenizer, trainer))
 }

 #[test]
 fn quicktour() -> tokenizers::Result<()> {
-    // START reload_tokenizer
+    // START quicktour_reload_tokenizer
     let mut tokenizer = Tokenizer::from_file("data/tokenizer-wiki.json")?;
-    // END reload_tokenizer
-    // START encode
+    // END quicktour_reload_tokenizer
+    // START quicktour_encode
     let output = tokenizer.encode("Hello, y'all! How are you 😁 ?", true)?;
-    // END encode
-    // START print_tokens
+    // END quicktour_encode
+    // START quicktour_print_tokens
     println!("{:?}", output.get_tokens());
     // ["Hello", ",", "y", "'", "all", "!", "How", "are", "you", "[UNK]", "?",]
-    // END print_tokens
+    // END quicktour_print_tokens
     assert_eq!(
         output.get_tokens(),
         ["Hello", ",", "y", "'", "all", "!", "How", "are", "you", "[UNK]", "?",]
     );
-    // START print_ids
+    // START quicktour_print_ids
     println!("{:?}", output.get_ids());
     // [27253, 16, 93, 11, 5097, 5, 7961, 5112, 6218, 0, 35]
-    // END print_ids
+    // END quicktour_print_ids
     assert_eq!(
         output.get_ids(),
         [27253, 16, 93, 11, 5097, 5, 7961, 5112, 6218, 0, 35]
     );
-    // START print_offsets
+    // START quicktour_print_offsets
     println!("{:?}", output.get_offsets()[9]);
     // (26, 30)
-    // END print_offsets
+    // END quicktour_print_offsets
     assert_eq!(output.get_offsets()[9], (26, 30));
-    // START use_offsets
+    // START quicktour_use_offsets
     let sentence = "Hello, y'all! How are you 😁 ?";
     println!("{}", &sentence[26..30]);
     // "😁"
-    // END use_offsets
-    // START check_sep
+    // END quicktour_use_offsets
+    // START quicktour_check_sep
     println!("{}", tokenizer.token_to_id("[SEP]").unwrap());
     // 2
-    // END check_sep
+    // END quicktour_check_sep
     assert_eq!(tokenizer.token_to_id("[SEP]"), Some(2));
-    // START init_template_processing
+    // START quicktour_init_template_processing
     use tokenizers::processors::template::TemplateProcessing;

     let special_tokens = vec![
@@ -193,21 +193,21 @@ fn quicktour() -> tokenizers::Result<()> {
             .special_tokens(special_tokens)
             .build()?,
     );
-    // END init_template_processing
-    // START print_special_tokens
+    // END quicktour_init_template_processing
+    // START quicktour_print_special_tokens
     let output = tokenizer.encode("Hello, y'all! How are you 😁 ?", true)?;
     println!("{:?}", output.get_tokens());
     // ["[CLS]", "Hello", ",", "y", "'", "all", "!", "How", "are", "you", "[UNK]", "?", "[SEP]"]
-    // END print_special_tokens
+    // END quicktour_print_special_tokens
     assert_eq!(
         output.get_tokens(),
         ["[CLS]", "Hello", ",", "y", "'", "all", "!", "How", "are", "you", "[UNK]", "?", "[SEP]"]
     );
-    // START print_special_tokens_pair
+    // START quicktour_print_special_tokens_pair
     let output = tokenizer.encode(("Hello, y'all!", "How are you 😁 ?"), true)?;
     println!("{:?}", output.get_tokens());
     // ["[CLS]", "Hello", ",", "y", "'", "all", "!", "[SEP]", "How", "are", "you", "[UNK]", "?", "[SEP]"]
-    // END print_special_tokens_pair
+    // END quicktour_print_special_tokens_pair
     assert_eq!(
         output.get_tokens(),
         [
@@ -215,19 +215,19 @@ fn quicktour() -> tokenizers::Result<()> {
             "?", "[SEP]"
         ]
     );
-    // START print_type_ids
+    // START quicktour_print_type_ids
     println!("{:?}", output.get_type_ids());
     // [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]
-    // END print_type_ids
+    // END quicktour_print_type_ids
     assert_eq!(
         output.get_type_ids(),
         [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]
     );
-    // START encode_batch
+    // START quicktour_encode_batch
     let output = tokenizer.encode_batch(vec!["Hello, y'all!", "How are you 😁 ?"], true)?;
-    // END encode_batch
+    // END quicktour_encode_batch
     println!("{:?}", output);
-    // START encode_batch_pair
+    // START quicktour_encode_batch_pair
     let output = tokenizer.encode_batch(
         vec![
             ("Hello, y'all!", "How are you 😁 ?"),
@@ -235,9 +235,9 @@ fn quicktour() -> tokenizers::Result<()> {
         ],
         true,
     )?;
-    // END encode_batch_pair
+    // END quicktour_encode_batch_pair
     println!("{:?}", output);
-    // START enable_padding
+    // START quicktour_enable_padding
     use tokenizers::PaddingParams;

     tokenizer.with_padding(Some(PaddingParams {
@@ -245,20 +245,20 @@ fn quicktour() -> tokenizers::Result<()> {
         pad_token: "[PAD]".to_string(),
         ..PaddingParams::default()
     }));
-    // END enable_padding
-    // START print_batch_tokens
+    // END quicktour_enable_padding
+    // START quicktour_print_batch_tokens
     let output = tokenizer.encode_batch(vec!["Hello, y'all!", "How are you 😁 ?"], true)?;
     println!("{:?}", output[1].get_tokens());
     // ["[CLS]", "How", "are", "you", "[UNK]", "?", "[SEP]", "[PAD]"]
-    // END print_batch_tokens
+    // END quicktour_print_batch_tokens
     assert_eq!(
         output[1].get_tokens(),
         ["[CLS]", "How", "are", "you", "[UNK]", "?", "[SEP]", "[PAD]"]
     );
-    // START print_attention_mask
+    // START quicktour_print_attention_mask
     println!("{:?}", output[1].get_attention_mask());
     // [1, 1, 1, 1, 1, 1, 1, 0]
-    // END print_attention_mask
+    // END quicktour_print_attention_mask
     assert_eq!(output[1].get_attention_mask(), [1, 1, 1, 1, 1, 1, 1, 0]);
     Ok(())
 }