From ab7bae466a8d11b085e0166c2eef448335054905 Mon Sep 17 00:00:00 2001
From: Anthony MOI
Date: Tue, 27 Oct 2020 17:31:22 -0400
Subject: [PATCH] Doc - Better namespace for rust tests

---
 docs/source/quicktour.rst         | 88 +++++++++++++++----------------
 tokenizers/tests/documentation.rs | 88 +++++++++++++++----------------
 2 files changed, 88 insertions(+), 88 deletions(-)

diff --git a/docs/source/quicktour.rst b/docs/source/quicktour.rst
index f46ca05d..3a3c700b 100644
--- a/docs/source/quicktour.rst
+++ b/docs/source/quicktour.rst
@@ -95,8 +95,8 @@ one with a BPE model:
 
     .. literalinclude:: ../../tokenizers/tests/documentation.rs
         :language: rust
-        :start-after: START init_tokenizer
-        :end-before: END init_tokenizer
+        :start-after: START quicktour_init_tokenizer
+        :end-before: END quicktour_init_tokenizer
         :dedent: 4
 
 To train our tokenizer on the wikitext files, we will need to instantiate a `trainer`, in this case
@@ -114,8 +114,8 @@ a :entity:`BpeTrainer`
 
     .. literalinclude:: ../../tokenizers/tests/documentation.rs
         :language: rust
-        :start-after: START init_trainer
-        :end-before: END init_trainer
+        :start-after: START quicktour_init_trainer
+        :end-before: END quicktour_init_trainer
         :dedent: 4
 
 We can set the training arguments like :entity:`vocab_size` or :entity:`min_frequency` (here left at
@@ -147,8 +147,8 @@ on whitespace.
 
     .. literalinclude:: ../../tokenizers/tests/documentation.rs
         :language: rust
-        :start-after: START init_pretok
-        :end-before: END init_pretok
+        :start-after: START quicktour_init_pretok
+        :end-before: END quicktour_init_pretok
         :dedent: 4
 
 Now, we can just call the :entity:`Tokenizer.train` method with any list of files we want
@@ -166,8 +166,8 @@ to use:
 
     .. literalinclude:: ../../tokenizers/tests/documentation.rs
         :language: rust
-        :start-after: START train
-        :end-before: END train
+        :start-after: START quicktour_train
+        :end-before: END quicktour_train
         :dedent: 4
 
 This should only take a few seconds to train our tokenizer on the full wikitext dataset! Once this
@@ -187,8 +187,8 @@ first instantiating the model.
 
     .. literalinclude:: ../../tokenizers/tests/documentation.rs
         :language: rust
-        :start-after: START reload_model
-        :end-before: END reload_model
+        :start-after: START quicktour_reload_model
+        :end-before: END quicktour_reload_model
         :dedent: 4
 
 To save the tokenizer in one file that contains all its configuration and vocabulary, just use the
@@ -206,8 +206,8 @@ To save the tokenizer in one file that contains all its configuration and vocabu
 
     .. literalinclude:: ../../tokenizers/tests/documentation.rs
         :language: rust
-        :start-after: START save
-        :end-before: END save
+        :start-after: START quicktour_save
+        :end-before: END quicktour_save
         :dedent: 4
 
 and you can reload your tokenizer from that file with the :entity:`Tokenizer.from_file`
@@ -225,8 +225,8 @@ and you can reload your tokenizer from that file with the :entity:`Tokenizer.fro
 
     .. literalinclude:: ../../tokenizers/tests/documentation.rs
         :language: rust
-        :start-after: START reload_tokenizer
-        :end-before: END reload_tokenizer
+        :start-after: START quicktour_reload_tokenizer
+        :end-before: END quicktour_reload_tokenizer
         :dedent: 4
 
 Using the tokenizer
@@ -247,8 +247,8 @@ Now that we have trained a tokenizer, we can use it on any text we want with the
 
     .. literalinclude:: ../../tokenizers/tests/documentation.rs
         :language: rust
-        :start-after: START encode
-        :end-before: END encode
+        :start-after: START quicktour_encode
+        :end-before: END quicktour_encode
         :dedent: 4
 
 This applied the full pipeline of the tokenizer on the text, returning an
@@ -271,8 +271,8 @@ tokens:
 
     .. literalinclude:: ../../tokenizers/tests/documentation.rs
         :language: rust
-        :start-after: START print_tokens
-        :end-before: END print_tokens
+        :start-after: START quicktour_print_tokens
+        :end-before: END quicktour_print_tokens
         :dedent: 4
 
 Similarly, the :obj:`ids` attribute will contain the index of each of those tokens in the
@@ -290,8 +290,8 @@ tokenizer's vocabulary:
 
     .. literalinclude:: ../../tokenizers/tests/documentation.rs
         :language: rust
-        :start-after: START print_ids
-        :end-before: END print_ids
+        :start-after: START quicktour_print_ids
+        :end-before: END quicktour_print_ids
         :dedent: 4
 
 An important feature of the 🤗 Tokenizers library is that it comes with full alignment tracking,
@@ -312,8 +312,8 @@ which is the token at index 9 in the list, we can just ask for the offset at the
 
     .. literalinclude:: ../../tokenizers/tests/documentation.rs
         :language: rust
-        :start-after: START print_offsets
-        :end-before: END print_offsets
+        :start-after: START quicktour_print_offsets
+        :end-before: END quicktour_print_offsets
         :dedent: 4
 
 and those are the indices that correspond to the emoji in the original sentence:
@@ -330,8 +330,8 @@ and those are the indices that correspond to the emoji in the original sentence:
 
     .. literalinclude:: ../../tokenizers/tests/documentation.rs
         :language: rust
-        :start-after: START use_offsets
-        :end-before: END use_offsets
+        :start-after: START quicktour_use_offsets
+        :end-before: END quicktour_use_offsets
         :dedent: 4
 
 Post-processing
@@ -358,8 +358,8 @@ list of special tokens, so this should be their IDs. To double-check, we can use
 
     .. literalinclude:: ../../tokenizers/tests/documentation.rs
         :language: rust
-        :start-after: START check_sep
-        :end-before: END check_sep
+        :start-after: START quicktour_check_sep
+        :end-before: END quicktour_check_sep
         :dedent: 4
 
 Here is how we can set the post-processing to give us the traditional BERT inputs:
@@ -376,8 +376,8 @@ Here is how we can set the post-processing to give us the traditional BERT input
 
     .. literalinclude:: ../../tokenizers/tests/documentation.rs
         :language: rust
-        :start-after: START init_template_processing
-        :end-before: END init_template_processing
+        :start-after: START quicktour_init_template_processing
+        :end-before: END quicktour_init_template_processing
         :dedent: 4
 
 Let's go over this snippet of code in more details. First we specify the template for single
@@ -406,8 +406,8 @@ To check out this worked properly, let's try to encode the same sentence as befo
 
     .. literalinclude:: ../../tokenizers/tests/documentation.rs
         :language: rust
-        :start-after: START print_special_tokens
-        :end-before: END print_special_tokens
+        :start-after: START quicktour_print_special_tokens
+        :end-before: END quicktour_print_special_tokens
         :dedent: 4
 
 To check the results on a pair of sentences, we just pass the two sentences to
@@ -425,8 +425,8 @@ To check the results on a pair of sentences, we just pass the two sentences to
 
     .. literalinclude:: ../../tokenizers/tests/documentation.rs
        :language: rust
-        :start-after: START print_special_tokens_pair
-        :end-before: END print_special_tokens_pair
+        :start-after: START quicktour_print_special_tokens_pair
+        :end-before: END quicktour_print_special_tokens_pair
         :dedent: 4
 
 You can then check the type IDs attributed to each token is correct with
@@ -443,8 +443,8 @@ You can then check the type IDs attributed to each token is correct with
 
    .. literalinclude:: ../../tokenizers/tests/documentation.rs
        :language: rust
-        :start-after: START print_type_ids
-        :end-before: END print_type_ids
+        :start-after: START quicktour_print_type_ids
+        :end-before: END quicktour_print_type_ids
         :dedent: 4
 
 If you save your tokenizer with :entity:`Tokenizer.save`, the post-processor will be saved along.
@@ -467,8 +467,8 @@ using the :entity:`Tokenizer.encode_batch` method:
 
    .. literalinclude:: ../../tokenizers/tests/documentation.rs
        :language: rust
-        :start-after: START encode_batch
-        :end-before: END encode_batch
+        :start-after: START quicktour_encode_batch
+        :end-before: END quicktour_encode_batch
         :dedent: 4
 
 The output is then a list of :entity:`Encoding` objects like the ones we saw before. You
@@ -490,8 +490,8 @@ B:
 
    .. literalinclude:: ../../tokenizers/tests/documentation.rs
        :language: rust
-        :start-after: START encode_batch_pair
-        :end-before: END encode_batch_pair
+        :start-after: START quicktour_encode_batch_pair
+        :end-before: END quicktour_encode_batch_pair
         :dedent: 4
 
 When encoding multiple sentences, you can automatically pad the outputs to the longest sentence
@@ -511,8 +511,8 @@ present by using :entity:`Tokenizer.enable_padding`, with the :entity:`pad_token
 
   .. literalinclude:: ../../tokenizers/tests/documentation.rs
       :language: rust
-        :start-after: START enable_padding
-        :end-before: END enable_padding
+        :start-after: START quicktour_enable_padding
+        :end-before: END quicktour_enable_padding
         :dedent: 4
 
 We can set the :obj:`direction` of the padding (defaults to the right) or a given :obj:`length` if
@@ -531,8 +531,8 @@ the longest text).
 
  .. literalinclude:: ../../tokenizers/tests/documentation.rs
      :language: rust
-        :start-after: START print_batch_tokens
-        :end-before: END print_batch_tokens
+        :start-after: START quicktour_print_batch_tokens
+        :end-before: END quicktour_print_batch_tokens
         :dedent: 4
 
 In this case, the `attention mask` generated by the tokenizer takes the padding into account:
@@ -549,8 +549,8 @@ In this case, the `attention mask` generated by the tokenizer takes the padding
 
  .. literalinclude:: ../../tokenizers/tests/documentation.rs
      :language: rust
-        :start-after: START print_attention_mask
-        :end-before: END print_attention_mask
+        :start-after: START quicktour_print_attention_mask
+        :end-before: END quicktour_print_attention_mask
         :dedent: 4
 
 .. _pretrained:
diff --git a/tokenizers/tests/documentation.rs b/tokenizers/tests/documentation.rs
index 95e331f9..8f1a8bf5 100644
--- a/tokenizers/tests/documentation.rs
+++ b/tokenizers/tests/documentation.rs
@@ -63,14 +63,14 @@ fn load_tokenizer() {
 fn quicktour_slow_train() -> tokenizers::Result<()> {
     let (mut tokenizer, trainer) = quicktour_get_tokenizer_trainer()?;
 
-    // START train
+    // START quicktour_train
     let files = ["test", "train", "valid"]
         .iter()
         .map(|split| format!("data/wikitext-103-raw/wiki.{}.raw", split))
         .collect::<Vec<_>>();
     tokenizer.train_and_replace(&trainer, files)?;
-    // END train
-    // START reload_model
+    // END quicktour_train
+    // START quicktour_reload_model
     use std::path::Path;
     use tokenizers::Model;
 
@@ -85,10 +85,10 @@ fn quicktour_slow_train() -> tokenizers::Result<()> {
             .unk_token("[UNK]".to_string())
             .build()?,
     );
-    // END reload_model
-    // START save
+    // END quicktour_reload_model
+    // START quicktour_save
     tokenizer.save("data/tokenizer-wiki.json", false)?;
-    // END save
+    // END quicktour_save
 
     Ok(())
 }
@@ -104,7 +104,7 @@ fn quicktour_get_tokenizer_trainer() -> tokenizers::Result<(
     >,
     BpeTrainer,
 )> {
-    // START init_tokenizer
+    // START quicktour_init_tokenizer
     use tokenizers::models::bpe::BPE;
     use tokenizers::TokenizerBuilder;
 
@@ -115,8 +115,8 @@ fn quicktour_get_tokenizer_trainer() -> tokenizers::Result<(
         PostProcessorWrapper,
         DecoderWrapper,
     > = TokenizerImpl::new(BPE::default());
-    // END init_tokenizer
-    // START init_trainer
+    // END quicktour_init_tokenizer
+    // START quicktour_init_trainer
     use tokenizers::models::bpe::BpeTrainer;
 
     let trainer = BpeTrainer::builder()
@@ -128,56 +128,56 @@ fn quicktour_get_tokenizer_trainer() -> tokenizers::Result<(
             AddedToken::from("[MASK]", true),
         ])
         .build();
-    // END init_trainer
-    // START init_pretok
+    // END quicktour_init_trainer
+    // START quicktour_init_pretok
     use tokenizers::pre_tokenizers::whitespace::Whitespace;
 
     tokenizer.with_pre_tokenizer(Whitespace::default());
-    // END init_pretok
+    // END quicktour_init_pretok
 
     Ok((tokenizer, trainer))
 }
 
 #[test]
 fn quicktour() -> tokenizers::Result<()> {
-    // START reload_tokenizer
+    // START quicktour_reload_tokenizer
     let mut tokenizer = Tokenizer::from_file("data/tokenizer-wiki.json")?;
-    // END reload_tokenizer
-    // START encode
+    // END quicktour_reload_tokenizer
+    // START quicktour_encode
     let output = tokenizer.encode("Hello, y'all! How are you 😁 ?", true)?;
-    // END encode
-    // START print_tokens
+    // END quicktour_encode
+    // START quicktour_print_tokens
     println!("{:?}", output.get_tokens());
     // ["Hello", ",", "y", "'", "all", "!", "How", "are", "you", "[UNK]", "?",]
-    // END print_tokens
+    // END quicktour_print_tokens
     assert_eq!(
         output.get_tokens(),
         ["Hello", ",", "y", "'", "all", "!", "How", "are", "you", "[UNK]", "?",]
     );
-    // START print_ids
+    // START quicktour_print_ids
     println!("{:?}", output.get_ids());
     // [27253, 16, 93, 11, 5097, 5, 7961, 5112, 6218, 0, 35]
-    // END print_ids
+    // END quicktour_print_ids
     assert_eq!(
         output.get_ids(),
         [27253, 16, 93, 11, 5097, 5, 7961, 5112, 6218, 0, 35]
    );
-    // START print_offsets
+    // START quicktour_print_offsets
     println!("{:?}", output.get_offsets()[9]);
     // (26, 30)
-    // END print_offsets
+    // END quicktour_print_offsets
     assert_eq!(output.get_offsets()[9], (26, 30));
-    // START use_offsets
+    // START quicktour_use_offsets
     let sentence = "Hello, y'all! How are you 😁 ?";
     println!("{}", &sentence[26..30]);
     // "😁"
-    // END use_offsets
-    // START check_sep
+    // END quicktour_use_offsets
+    // START quicktour_check_sep
     println!("{}", tokenizer.token_to_id("[SEP]").unwrap());
     // 2
-    // END check_sep
+    // END quicktour_check_sep
     assert_eq!(tokenizer.token_to_id("[SEP]"), Some(2));
-    // START init_template_processing
+    // START quicktour_init_template_processing
     use tokenizers::processors::template::TemplateProcessing;
 
     let special_tokens = vec![
@@ -193,21 +193,21 @@ fn quicktour() -> tokenizers::Result<()> {
             .special_tokens(special_tokens)
             .build()?,
     );
-    // END init_template_processing
-    // START print_special_tokens
+    // END quicktour_init_template_processing
+    // START quicktour_print_special_tokens
     let output = tokenizer.encode("Hello, y'all! How are you 😁 ?", true)?;
     println!("{:?}", output.get_tokens());
     // ["[CLS]", "Hello", ",", "y", "'", "all", "!", "How", "are", "you", "[UNK]", "?", "[SEP]"]
-    // END print_special_tokens
+    // END quicktour_print_special_tokens
     assert_eq!(
         output.get_tokens(),
         ["[CLS]", "Hello", ",", "y", "'", "all", "!", "How", "are", "you", "[UNK]", "?", "[SEP]"]
     );
-    // START print_special_tokens_pair
+    // START quicktour_print_special_tokens_pair
     let output = tokenizer.encode(("Hello, y'all!", "How are you 😁 ?"), true)?;
     println!("{:?}", output.get_tokens());
     // ["[CLS]", "Hello", ",", "y", "'", "all", "!", "[SEP]", "How", "are", "you", "[UNK]", "?", "[SEP]"]
-    // END print_special_tokens_pair
+    // END quicktour_print_special_tokens_pair
     assert_eq!(
         output.get_tokens(),
         [
@@ -215,19 +215,19 @@ fn quicktour() -> tokenizers::Result<()> {
             "?", "[SEP]"
         ]
     );
-    // START print_type_ids
+    // START quicktour_print_type_ids
     println!("{:?}", output.get_type_ids());
     // [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]
-    // END print_type_ids
+    // END quicktour_print_type_ids
     assert_eq!(
         output.get_type_ids(),
         [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]
     );
-    // START encode_batch
+    // START quicktour_encode_batch
     let output = tokenizer.encode_batch(vec!["Hello, y'all!", "How are you 😁 ?"], true)?;
-    // END encode_batch
+    // END quicktour_encode_batch
     println!("{:?}", output);
-    // START encode_batch_pair
+    // START quicktour_encode_batch_pair
     let output = tokenizer.encode_batch(
         vec![
             ("Hello, y'all!", "How are you 😁 ?"),
@@ -235,9 +235,9 @@ fn quicktour() -> tokenizers::Result<()> {
         ],
         true,
     )?;
-    // END encode_batch_pair
+    // END quicktour_encode_batch_pair
     println!("{:?}", output);
-    // START enable_padding
+    // START quicktour_enable_padding
     use tokenizers::PaddingParams;
 
     tokenizer.with_padding(Some(PaddingParams {
@@ -245,20 +245,20 @@ fn quicktour() -> tokenizers::Result<()> {
         pad_token: "[PAD]".to_string(),
         ..PaddingParams::default()
     }));
-    // END enable_padding
-    // START print_batch_tokens
+    // END quicktour_enable_padding
+    // START quicktour_print_batch_tokens
     let output = tokenizer.encode_batch(vec!["Hello, y'all!", "How are you 😁 ?"], true)?;
     println!("{:?}", output[1].get_tokens());
     // ["[CLS]", "How", "are", "you", "[UNK]", "?", "[SEP]", "[PAD]"]
-    // END print_batch_tokens
+    // END quicktour_print_batch_tokens
     assert_eq!(
         output[1].get_tokens(),
         ["[CLS]", "How", "are", "you", "[UNK]", "?", "[SEP]", "[PAD]"]
     );
-    // START print_attention_mask
+    // START quicktour_print_attention_mask
     println!("{:?}", output[1].get_attention_mask());
     // [1, 1, 1, 1, 1, 1, 1, 0]
-    // END print_attention_mask
+    // END quicktour_print_attention_mask
     assert_eq!(output[1].get_attention_mask(), [1, 1, 1, 1, 1, 1, 1, 0]);
 
     Ok(())
 }