Doc - Better namespace for rust tests

Authored by Anthony MOI on 2020-10-27 17:31:22 -04:00; committed by Anthony MOI
parent cace6561d7
commit ab7bae466a
2 changed files with 88 additions and 88 deletions

docs/source/quicktour.rst

@@ -95,8 +95,8 @@ one with a BPE model:
 .. literalinclude:: ../../tokenizers/tests/documentation.rs
     :language: rust
-    :start-after: START init_tokenizer
-    :end-before: END init_tokenizer
+    :start-after: START quicktour_init_tokenizer
+    :end-before: END quicktour_init_tokenizer
     :dedent: 4

 To train our tokenizer on the wikitext files, we will need to instantiate a `trainer`, in this case
@@ -114,8 +114,8 @@ a :entity:`BpeTrainer`
 .. literalinclude:: ../../tokenizers/tests/documentation.rs
     :language: rust
-    :start-after: START init_trainer
-    :end-before: END init_trainer
+    :start-after: START quicktour_init_trainer
+    :end-before: END quicktour_init_trainer
     :dedent: 4

 We can set the training arguments like :entity:`vocab_size` or :entity:`min_frequency` (here left at
@@ -147,8 +147,8 @@ on whitespace.
 .. literalinclude:: ../../tokenizers/tests/documentation.rs
     :language: rust
-    :start-after: START init_pretok
-    :end-before: END init_pretok
+    :start-after: START quicktour_init_pretok
+    :end-before: END quicktour_init_pretok
     :dedent: 4

 Now, we can just call the :entity:`Tokenizer.train` method with any list of files we want
@@ -166,8 +166,8 @@ to use:
 .. literalinclude:: ../../tokenizers/tests/documentation.rs
     :language: rust
-    :start-after: START train
-    :end-before: END train
+    :start-after: START quicktour_train
+    :end-before: END quicktour_train
     :dedent: 4

 This should only take a few seconds to train our tokenizer on the full wikitext dataset! Once this
@@ -187,8 +187,8 @@ first instantiating the model.
 .. literalinclude:: ../../tokenizers/tests/documentation.rs
     :language: rust
-    :start-after: START reload_model
-    :end-before: END reload_model
+    :start-after: START quicktour_reload_model
+    :end-before: END quicktour_reload_model
     :dedent: 4

 To save the tokenizer in one file that contains all its configuration and vocabulary, just use the
@@ -206,8 +206,8 @@ To save the tokenizer in one file that contains all its configuration and vocabu
 .. literalinclude:: ../../tokenizers/tests/documentation.rs
     :language: rust
-    :start-after: START save
-    :end-before: END save
+    :start-after: START quicktour_save
+    :end-before: END quicktour_save
     :dedent: 4

 and you can reload your tokenizer from that file with the :entity:`Tokenizer.from_file`
@@ -225,8 +225,8 @@ and you can reload your tokenizer from that file with the :entity:`Tokenizer.fro
 .. literalinclude:: ../../tokenizers/tests/documentation.rs
     :language: rust
-    :start-after: START reload_tokenizer
-    :end-before: END reload_tokenizer
+    :start-after: START quicktour_reload_tokenizer
+    :end-before: END quicktour_reload_tokenizer
     :dedent: 4

 Using the tokenizer
@@ -247,8 +247,8 @@ Now that we have trained a tokenizer, we can use it on any text we want with the
 .. literalinclude:: ../../tokenizers/tests/documentation.rs
     :language: rust
-    :start-after: START encode
-    :end-before: END encode
+    :start-after: START quicktour_encode
+    :end-before: END quicktour_encode
     :dedent: 4

 This applied the full pipeline of the tokenizer on the text, returning an
@@ -271,8 +271,8 @@ tokens:
 .. literalinclude:: ../../tokenizers/tests/documentation.rs
     :language: rust
-    :start-after: START print_tokens
-    :end-before: END print_tokens
+    :start-after: START quicktour_print_tokens
+    :end-before: END quicktour_print_tokens
     :dedent: 4

 Similarly, the :obj:`ids` attribute will contain the index of each of those tokens in the
@@ -290,8 +290,8 @@ tokenizer's vocabulary:
 .. literalinclude:: ../../tokenizers/tests/documentation.rs
     :language: rust
-    :start-after: START print_ids
-    :end-before: END print_ids
+    :start-after: START quicktour_print_ids
+    :end-before: END quicktour_print_ids
     :dedent: 4

 An important feature of the 🤗 Tokenizers library is that it comes with full alignment tracking,
@@ -312,8 +312,8 @@ which is the token at index 9 in the list, we can just ask for the offset at the
 .. literalinclude:: ../../tokenizers/tests/documentation.rs
     :language: rust
-    :start-after: START print_offsets
-    :end-before: END print_offsets
+    :start-after: START quicktour_print_offsets
+    :end-before: END quicktour_print_offsets
     :dedent: 4

 and those are the indices that correspond to the emoji in the original sentence:
@@ -330,8 +330,8 @@ and those are the indices that correspond to the emoji in the original sentence:
 .. literalinclude:: ../../tokenizers/tests/documentation.rs
     :language: rust
-    :start-after: START use_offsets
-    :end-before: END use_offsets
+    :start-after: START quicktour_use_offsets
+    :end-before: END quicktour_use_offsets
     :dedent: 4

 Post-processing
@@ -358,8 +358,8 @@ list of special tokens, so this should be their IDs. To double-check, we can use
 .. literalinclude:: ../../tokenizers/tests/documentation.rs
     :language: rust
-    :start-after: START check_sep
-    :end-before: END check_sep
+    :start-after: START quicktour_check_sep
+    :end-before: END quicktour_check_sep
     :dedent: 4

 Here is how we can set the post-processing to give us the traditional BERT inputs:
@@ -376,8 +376,8 @@ Here is how we can set the post-processing to give us the traditional BERT input
 .. literalinclude:: ../../tokenizers/tests/documentation.rs
     :language: rust
-    :start-after: START init_template_processing
-    :end-before: END init_template_processing
+    :start-after: START quicktour_init_template_processing
+    :end-before: END quicktour_init_template_processing
     :dedent: 4

 Let's go over this snippet of code in more details. First we specify the template for single
@@ -406,8 +406,8 @@ To check out this worked properly, let's try to encode the same sentence as befo
 .. literalinclude:: ../../tokenizers/tests/documentation.rs
     :language: rust
-    :start-after: START print_special_tokens
-    :end-before: END print_special_tokens
+    :start-after: START quicktour_print_special_tokens
+    :end-before: END quicktour_print_special_tokens
     :dedent: 4

 To check the results on a pair of sentences, we just pass the two sentences to
@@ -425,8 +425,8 @@ To check the results on a pair of sentences, we just pass the two sentences to
 .. literalinclude:: ../../tokenizers/tests/documentation.rs
     :language: rust
-    :start-after: START print_special_tokens_pair
-    :end-before: END print_special_tokens_pair
+    :start-after: START quicktour_print_special_tokens_pair
+    :end-before: END quicktour_print_special_tokens_pair
     :dedent: 4

 You can then check the type IDs attributed to each token is correct with
@@ -443,8 +443,8 @@ You can then check the type IDs attributed to each token is correct with
 .. literalinclude:: ../../tokenizers/tests/documentation.rs
     :language: rust
-    :start-after: START print_type_ids
-    :end-before: END print_type_ids
+    :start-after: START quicktour_print_type_ids
+    :end-before: END quicktour_print_type_ids
     :dedent: 4

 If you save your tokenizer with :entity:`Tokenizer.save`, the post-processor will be saved along.
@@ -467,8 +467,8 @@ using the :entity:`Tokenizer.encode_batch` method:
 .. literalinclude:: ../../tokenizers/tests/documentation.rs
     :language: rust
-    :start-after: START encode_batch
-    :end-before: END encode_batch
+    :start-after: START quicktour_encode_batch
+    :end-before: END quicktour_encode_batch
     :dedent: 4

 The output is then a list of :entity:`Encoding` objects like the ones we saw before. You
@@ -490,8 +490,8 @@ B:
 .. literalinclude:: ../../tokenizers/tests/documentation.rs
     :language: rust
-    :start-after: START encode_batch_pair
-    :end-before: END encode_batch_pair
+    :start-after: START quicktour_encode_batch_pair
+    :end-before: END quicktour_encode_batch_pair
     :dedent: 4

 When encoding multiple sentences, you can automatically pad the outputs to the longest sentence
@@ -511,8 +511,8 @@ present by using :entity:`Tokenizer.enable_padding`, with the :entity:`pad_token
 .. literalinclude:: ../../tokenizers/tests/documentation.rs
     :language: rust
-    :start-after: START enable_padding
-    :end-before: END enable_padding
+    :start-after: START quicktour_enable_padding
+    :end-before: END quicktour_enable_padding
     :dedent: 4

 We can set the :obj:`direction` of the padding (defaults to the right) or a given :obj:`length` if
@@ -531,8 +531,8 @@ the longest text).
 .. literalinclude:: ../../tokenizers/tests/documentation.rs
     :language: rust
-    :start-after: START print_batch_tokens
-    :end-before: END print_batch_tokens
+    :start-after: START quicktour_print_batch_tokens
+    :end-before: END quicktour_print_batch_tokens
     :dedent: 4

 In this case, the `attention mask` generated by the tokenizer takes the padding into account:
@@ -549,8 +549,8 @@ In this case, the `attention mask` generated by the tokenizer takes the padding
 .. literalinclude:: ../../tokenizers/tests/documentation.rs
     :language: rust
-    :start-after: START print_attention_mask
-    :end-before: END print_attention_mask
+    :start-after: START quicktour_print_attention_mask
+    :end-before: END quicktour_print_attention_mask
     :dedent: 4

 .. _pretrained:

tokenizers/tests/documentation.rs

@@ -63,14 +63,14 @@ fn load_tokenizer() {
 fn quicktour_slow_train() -> tokenizers::Result<()> {
     let (mut tokenizer, trainer) = quicktour_get_tokenizer_trainer()?;
-    // START train
+    // START quicktour_train
     let files = ["test", "train", "valid"]
         .iter()
         .map(|split| format!("data/wikitext-103-raw/wiki.{}.raw", split))
         .collect::<Vec<_>>();
     tokenizer.train_and_replace(&trainer, files)?;
-    // END train
-    // START reload_model
+    // END quicktour_train
+    // START quicktour_reload_model
     use std::path::Path;
     use tokenizers::Model;
@@ -85,10 +85,10 @@ fn quicktour_slow_train() -> tokenizers::Result<()> {
         .unk_token("[UNK]".to_string())
         .build()?,
     );
-    // END reload_model
-    // START save
+    // END quicktour_reload_model
+    // START quicktour_save
     tokenizer.save("data/tokenizer-wiki.json", false)?;
-    // END save
+    // END quicktour_save
     Ok(())
 }
@@ -104,7 +104,7 @@ fn quicktour_get_tokenizer_trainer() -> tokenizers::Result<(
     >,
     BpeTrainer,
 )> {
-    // START init_tokenizer
+    // START quicktour_init_tokenizer
     use tokenizers::models::bpe::BPE;
     use tokenizers::TokenizerBuilder;
@@ -115,8 +115,8 @@ fn quicktour_get_tokenizer_trainer() -> tokenizers::Result<(
         PostProcessorWrapper,
         DecoderWrapper,
     > = TokenizerImpl::new(BPE::default());
-    // END init_tokenizer
-    // START init_trainer
+    // END quicktour_init_tokenizer
+    // START quicktour_init_trainer
     use tokenizers::models::bpe::BpeTrainer;
     let trainer = BpeTrainer::builder()
@@ -128,56 +128,56 @@ fn quicktour_get_tokenizer_trainer() -> tokenizers::Result<(
             AddedToken::from("[MASK]", true),
         ])
         .build();
-    // END init_trainer
-    // START init_pretok
+    // END quicktour_init_trainer
+    // START quicktour_init_pretok
     use tokenizers::pre_tokenizers::whitespace::Whitespace;
     tokenizer.with_pre_tokenizer(Whitespace::default());
-    // END init_pretok
+    // END quicktour_init_pretok
     Ok((tokenizer, trainer))
 }

 #[test]
 fn quicktour() -> tokenizers::Result<()> {
-    // START reload_tokenizer
+    // START quicktour_reload_tokenizer
     let mut tokenizer = Tokenizer::from_file("data/tokenizer-wiki.json")?;
-    // END reload_tokenizer
-    // START encode
+    // END quicktour_reload_tokenizer
+    // START quicktour_encode
     let output = tokenizer.encode("Hello, y'all! How are you 😁 ?", true)?;
-    // END encode
-    // START print_tokens
+    // END quicktour_encode
+    // START quicktour_print_tokens
     println!("{:?}", output.get_tokens());
     // ["Hello", ",", "y", "'", "all", "!", "How", "are", "you", "[UNK]", "?",]
-    // END print_tokens
+    // END quicktour_print_tokens
     assert_eq!(
         output.get_tokens(),
         ["Hello", ",", "y", "'", "all", "!", "How", "are", "you", "[UNK]", "?",]
     );
-    // START print_ids
+    // START quicktour_print_ids
     println!("{:?}", output.get_ids());
     // [27253, 16, 93, 11, 5097, 5, 7961, 5112, 6218, 0, 35]
-    // END print_ids
+    // END quicktour_print_ids
     assert_eq!(
         output.get_ids(),
         [27253, 16, 93, 11, 5097, 5, 7961, 5112, 6218, 0, 35]
     );
-    // START print_offsets
+    // START quicktour_print_offsets
     println!("{:?}", output.get_offsets()[9]);
     // (26, 30)
-    // END print_offsets
+    // END quicktour_print_offsets
     assert_eq!(output.get_offsets()[9], (26, 30));
-    // START use_offsets
+    // START quicktour_use_offsets
     let sentence = "Hello, y'all! How are you 😁 ?";
     println!("{}", &sentence[26..30]);
     // "😁"
-    // END use_offsets
-    // START check_sep
+    // END quicktour_use_offsets
+    // START quicktour_check_sep
     println!("{}", tokenizer.token_to_id("[SEP]").unwrap());
     // 2
-    // END check_sep
+    // END quicktour_check_sep
     assert_eq!(tokenizer.token_to_id("[SEP]"), Some(2));
-    // START init_template_processing
+    // START quicktour_init_template_processing
     use tokenizers::processors::template::TemplateProcessing;
     let special_tokens = vec![
@@ -193,21 +193,21 @@ fn quicktour() -> tokenizers::Result<()> {
         .special_tokens(special_tokens)
         .build()?,
     );
-    // END init_template_processing
-    // START print_special_tokens
+    // END quicktour_init_template_processing
+    // START quicktour_print_special_tokens
     let output = tokenizer.encode("Hello, y'all! How are you 😁 ?", true)?;
     println!("{:?}", output.get_tokens());
     // ["[CLS]", "Hello", ",", "y", "'", "all", "!", "How", "are", "you", "[UNK]", "?", "[SEP]"]
-    // END print_special_tokens
+    // END quicktour_print_special_tokens
     assert_eq!(
         output.get_tokens(),
         ["[CLS]", "Hello", ",", "y", "'", "all", "!", "How", "are", "you", "[UNK]", "?", "[SEP]"]
     );
-    // START print_special_tokens_pair
+    // START quicktour_print_special_tokens_pair
     let output = tokenizer.encode(("Hello, y'all!", "How are you 😁 ?"), true)?;
     println!("{:?}", output.get_tokens());
     // ["[CLS]", "Hello", ",", "y", "'", "all", "!", "[SEP]", "How", "are", "you", "[UNK]", "?", "[SEP]"]
-    // END print_special_tokens_pair
+    // END quicktour_print_special_tokens_pair
     assert_eq!(
         output.get_tokens(),
         [
@@ -215,19 +215,19 @@ fn quicktour() -> tokenizers::Result<()> {
             "?", "[SEP]"
         ]
     );
-    // START print_type_ids
+    // START quicktour_print_type_ids
     println!("{:?}", output.get_type_ids());
     // [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]
-    // END print_type_ids
+    // END quicktour_print_type_ids
     assert_eq!(
         output.get_type_ids(),
         [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]
     );
-    // START encode_batch
+    // START quicktour_encode_batch
     let output = tokenizer.encode_batch(vec!["Hello, y'all!", "How are you 😁 ?"], true)?;
-    // END encode_batch
+    // END quicktour_encode_batch
     println!("{:?}", output);
-    // START encode_batch_pair
+    // START quicktour_encode_batch_pair
     let output = tokenizer.encode_batch(
         vec![
             ("Hello, y'all!", "How are you 😁 ?"),
@@ -235,9 +235,9 @@ fn quicktour() -> tokenizers::Result<()> {
         ],
         true,
     )?;
-    // END encode_batch_pair
+    // END quicktour_encode_batch_pair
     println!("{:?}", output);
-    // START enable_padding
+    // START quicktour_enable_padding
     use tokenizers::PaddingParams;
     tokenizer.with_padding(Some(PaddingParams {
@@ -245,20 +245,20 @@ fn quicktour() -> tokenizers::Result<()> {
         pad_token: "[PAD]".to_string(),
         ..PaddingParams::default()
     }));
-    // END enable_padding
-    // START print_batch_tokens
+    // END quicktour_enable_padding
+    // START quicktour_print_batch_tokens
     let output = tokenizer.encode_batch(vec!["Hello, y'all!", "How are you 😁 ?"], true)?;
     println!("{:?}", output[1].get_tokens());
     // ["[CLS]", "How", "are", "you", "[UNK]", "?", "[SEP]", "[PAD]"]
-    // END print_batch_tokens
+    // END quicktour_print_batch_tokens
     assert_eq!(
         output[1].get_tokens(),
         ["[CLS]", "How", "are", "you", "[UNK]", "?", "[SEP]", "[PAD]"]
     );
-    // START print_attention_mask
+    // START quicktour_print_attention_mask
     println!("{:?}", output[1].get_attention_mask());
     // [1, 1, 1, 1, 1, 1, 1, 0]
-    // END print_attention_mask
+    // END quicktour_print_attention_mask
     assert_eq!(output[1].get_attention_mask(), [1, 1, 1, 1, 1, 1, 1, 0]);
     Ok(())
 }
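
The renamed anchors all follow one convention: each snippet in tokenizers/tests/documentation.rs is wrapped in `// START quicktour_<name>` / `// END quicktour_<name>` comments, and the `:start-after:` / `:end-before:` options of the matching `literalinclude` directive in the quicktour document reference those strings verbatim, which is why both files change in lockstep. Prefixing every marker with `quicktour_` keeps the anchors specific to this guide, so snippets for other documentation pages can share the same test file without name clashes. Below is a minimal sketch of the convention; the test name is hypothetical and not part of the diff, while the marker names, API calls, and the data/tokenizer-wiki.json path are taken from it (the file is produced earlier in the quicktour, so the assertion only holds once it exists):

    use tokenizers::Tokenizer;

    #[test]
    fn quicktour_markers_example() -> tokenizers::Result<()> {
        // The doc directive extracts exactly the lines between a START/END pair, e.g.
        //   :start-after: START quicktour_encode
        //   :end-before: END quicktour_encode
        // START quicktour_reload_tokenizer
        let tokenizer = Tokenizer::from_file("data/tokenizer-wiki.json")?;
        // END quicktour_reload_tokenizer
        // START quicktour_encode
        let output = tokenizer.encode("Hello, y'all! How are you 😁 ?", true)?;
        // END quicktour_encode
        // Same expectation as the quicktour test above.
        assert_eq!(output.get_tokens()[0], "Hello");
        Ok(())
    }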