From ab7bae466a8d11b085e0166c2eef448335054905 Mon Sep 17 00:00:00 2001
From: Anthony MOI
Date: Tue, 27 Oct 2020 17:31:22 -0400
Subject: [PATCH] Doc - Better namespace for rust tests

---
 docs/source/quicktour.rst         | 88 +++++++++++++++----------------
 tokenizers/tests/documentation.rs | 88 +++++++++++++++----------------
 2 files changed, 88 insertions(+), 88 deletions(-)

diff --git a/docs/source/quicktour.rst b/docs/source/quicktour.rst
index f46ca05d..3a3c700b 100644
--- a/docs/source/quicktour.rst
+++ b/docs/source/quicktour.rst
@@ -95,8 +95,8 @@ one with a BPE model:
 
     .. literalinclude:: ../../tokenizers/tests/documentation.rs
         :language: rust
-        :start-after: START init_tokenizer
-        :end-before: END init_tokenizer
+        :start-after: START quicktour_init_tokenizer
+        :end-before: END quicktour_init_tokenizer
         :dedent: 4
 
 To train our tokenizer on the wikitext files, we will need to instantiate a `trainer`, in this case
@@ -114,8 +114,8 @@ a :entity:`BpeTrainer`
 
     .. literalinclude:: ../../tokenizers/tests/documentation.rs
         :language: rust
-        :start-after: START init_trainer
-        :end-before: END init_trainer
+        :start-after: START quicktour_init_trainer
+        :end-before: END quicktour_init_trainer
         :dedent: 4
 
 We can set the training arguments like :entity:`vocab_size` or :entity:`min_frequency` (here left at
@@ -147,8 +147,8 @@ on whitespace.
 
     .. literalinclude:: ../../tokenizers/tests/documentation.rs
         :language: rust
-        :start-after: START init_pretok
-        :end-before: END init_pretok
+        :start-after: START quicktour_init_pretok
+        :end-before: END quicktour_init_pretok
         :dedent: 4
 
 Now, we can just call the :entity:`Tokenizer.train` method with any list of files we want
@@ -166,8 +166,8 @@ to use:
 
     .. literalinclude:: ../../tokenizers/tests/documentation.rs
         :language: rust
-        :start-after: START train
-        :end-before: END train
+        :start-after: START quicktour_train
+        :end-before: END quicktour_train
         :dedent: 4
 
 This should only take a few seconds to train our tokenizer on the full wikitext dataset! Once this
@@ -187,8 +187,8 @@ first instantiating the model.
 
     .. literalinclude:: ../../tokenizers/tests/documentation.rs
         :language: rust
-        :start-after: START reload_model
-        :end-before: END reload_model
+        :start-after: START quicktour_reload_model
+        :end-before: END quicktour_reload_model
         :dedent: 4
 
 To save the tokenizer in one file that contains all its configuration and vocabulary, just use the
@@ -206,8 +206,8 @@ To save the tokenizer in one file that contains all its configuration and vocabu
 
     .. literalinclude:: ../../tokenizers/tests/documentation.rs
         :language: rust
-        :start-after: START save
-        :end-before: END save
+        :start-after: START quicktour_save
+        :end-before: END quicktour_save
         :dedent: 4
 
 and you can reload your tokenizer from that file with the :entity:`Tokenizer.from_file`
@@ -225,8 +225,8 @@ and you can reload your tokenizer from that file with the :entity:`Tokenizer.fro
 
     .. literalinclude:: ../../tokenizers/tests/documentation.rs
         :language: rust
-        :start-after: START reload_tokenizer
-        :end-before: END reload_tokenizer
+        :start-after: START quicktour_reload_tokenizer
+        :end-before: END quicktour_reload_tokenizer
         :dedent: 4
 
 Using the tokenizer
@@ -247,8 +247,8 @@ Now that we have trained a tokenizer, we can use it on any text we want with the
 
     .. literalinclude:: ../../tokenizers/tests/documentation.rs
         :language: rust
-        :start-after: START encode
-        :end-before: END encode
+        :start-after: START quicktour_encode
+        :end-before: END quicktour_encode
         :dedent: 4
 
 This applied the full pipeline of the tokenizer on the text, returning an
@@ -271,8 +271,8 @@ tokens:
 
     .. literalinclude:: ../../tokenizers/tests/documentation.rs
         :language: rust
-        :start-after: START print_tokens
-        :end-before: END print_tokens
+        :start-after: START quicktour_print_tokens
+        :end-before: END quicktour_print_tokens
         :dedent: 4
 
 Similarly, the :obj:`ids` attribute will contain the index of each of those tokens in the
@@ -290,8 +290,8 @@ tokenizer's vocabulary:
 
     .. literalinclude:: ../../tokenizers/tests/documentation.rs
         :language: rust
-        :start-after: START print_ids
-        :end-before: END print_ids
+        :start-after: START quicktour_print_ids
+        :end-before: END quicktour_print_ids
         :dedent: 4
 
 An important feature of the 🤗 Tokenizers library is that it comes with full alignment tracking,
@@ -312,8 +312,8 @@ which is the token at index 9 in the list, we can just ask for the offset at the
 
     .. literalinclude:: ../../tokenizers/tests/documentation.rs
         :language: rust
-        :start-after: START print_offsets
-        :end-before: END print_offsets
+        :start-after: START quicktour_print_offsets
+        :end-before: END quicktour_print_offsets
         :dedent: 4
 
 and those are the indices that correspond to the emoji in the original sentence:
@@ -330,8 +330,8 @@ and those are the indices that correspond to the emoji in the original sentence:
 
     .. literalinclude:: ../../tokenizers/tests/documentation.rs
         :language: rust
-        :start-after: START use_offsets
-        :end-before: END use_offsets
+        :start-after: START quicktour_use_offsets
+        :end-before: END quicktour_use_offsets
         :dedent: 4
 
 Post-processing
@@ -358,8 +358,8 @@ list of special tokens, so this should be their IDs. To double-check, we can use
 
     .. literalinclude:: ../../tokenizers/tests/documentation.rs
         :language: rust
-        :start-after: START check_sep
-        :end-before: END check_sep
+        :start-after: START quicktour_check_sep
+        :end-before: END quicktour_check_sep
         :dedent: 4
 
 Here is how we can set the post-processing to give us the traditional BERT inputs:
@@ -376,8 +376,8 @@ Here is how we can set the post-processing to give us the traditional BERT input
 
     .. literalinclude:: ../../tokenizers/tests/documentation.rs
         :language: rust
-        :start-after: START init_template_processing
-        :end-before: END init_template_processing
+        :start-after: START quicktour_init_template_processing
+        :end-before: END quicktour_init_template_processing
         :dedent: 4
 
 Let's go over this snippet of code in more details. First we specify the template for single
@@ -406,8 +406,8 @@ To check out this worked properly, let's try to encode the same sentence as befo
 
     .. literalinclude:: ../../tokenizers/tests/documentation.rs
         :language: rust
-        :start-after: START print_special_tokens
-        :end-before: END print_special_tokens
+        :start-after: START quicktour_print_special_tokens
+        :end-before: END quicktour_print_special_tokens
         :dedent: 4
 
 To check the results on a pair of sentences, we just pass the two sentences to
@@ -425,8 +425,8 @@ To check the results on a pair of sentences, we just pass the two sentences to
 
     .. literalinclude:: ../../tokenizers/tests/documentation.rs
        :language: rust
-        :start-after: START print_special_tokens_pair
-        :end-before: END print_special_tokens_pair
+        :start-after: START quicktour_print_special_tokens_pair
+        :end-before: END quicktour_print_special_tokens_pair
         :dedent: 4
 
 You can then check the type IDs attributed to each token is correct with
@@ -443,8 +443,8 @@ You can then check the type IDs attributed to each token is correct with
 
    .. literalinclude:: ../../tokenizers/tests/documentation.rs
        :language: rust
-        :start-after: START print_type_ids
-        :end-before: END print_type_ids
+        :start-after: START quicktour_print_type_ids
+        :end-before: END quicktour_print_type_ids
         :dedent: 4
 
 If you save your tokenizer with :entity:`Tokenizer.save`, the post-processor will be saved along.
@@ -467,8 +467,8 @@ using the :entity:`Tokenizer.encode_batch` method:
 
    .. literalinclude:: ../../tokenizers/tests/documentation.rs
        :language: rust
-        :start-after: START encode_batch
-        :end-before: END encode_batch
+        :start-after: START quicktour_encode_batch
+        :end-before: END quicktour_encode_batch
         :dedent: 4
 
 The output is then a list of :entity:`Encoding` objects like the ones we saw before. You
@@ -490,8 +490,8 @@ B:
 
    .. literalinclude:: ../../tokenizers/tests/documentation.rs
        :language: rust
-        :start-after: START encode_batch_pair
-        :end-before: END encode_batch_pair
+        :start-after: START quicktour_encode_batch_pair
+        :end-before: END quicktour_encode_batch_pair
         :dedent: 4
 
 When encoding multiple sentences, you can automatically pad the outputs to the longest sentence
@@ -511,8 +511,8 @@ present by using :entity:`Tokenizer.enable_padding`, with the :entity:`pad_token
 
   .. literalinclude:: ../../tokenizers/tests/documentation.rs
       :language: rust
-        :start-after: START enable_padding
-        :end-before: END enable_padding
+        :start-after: START quicktour_enable_padding
+        :end-before: END quicktour_enable_padding
         :dedent: 4
 
 We can set the :obj:`direction` of the padding (defaults to the right) or a given :obj:`length` if
@@ -531,8 +531,8 @@ the longest text).
 
  .. literalinclude:: ../../tokenizers/tests/documentation.rs
      :language: rust
-        :start-after: START print_batch_tokens
-        :end-before: END print_batch_tokens
+        :start-after: START quicktour_print_batch_tokens
+        :end-before: END quicktour_print_batch_tokens
         :dedent: 4
 
 In this case, the `attention mask` generated by the tokenizer takes the padding into account:
@@ -549,8 +549,8 @@ In this case, the `attention mask` generated by the tokenizer takes the padding
 
  .. literalinclude:: ../../tokenizers/tests/documentation.rs
      :language: rust
-        :start-after: START print_attention_mask
-        :end-before: END print_attention_mask
+        :start-after: START quicktour_print_attention_mask
+        :end-before: END quicktour_print_attention_mask
         :dedent: 4
 
 .. _pretrained:
diff --git a/tokenizers/tests/documentation.rs b/tokenizers/tests/documentation.rs
index 95e331f9..8f1a8bf5 100644
--- a/tokenizers/tests/documentation.rs
+++ b/tokenizers/tests/documentation.rs
@@ -63,14 +63,14 @@ fn load_tokenizer() {
 fn quicktour_slow_train() -> tokenizers::Result<()> {
     let (mut tokenizer, trainer) = quicktour_get_tokenizer_trainer()?;
 
-    // START train
+    // START quicktour_train
     let files = ["test", "train", "valid"]
         .iter()
         .map(|split| format!("data/wikitext-103-raw/wiki.{}.raw", split))
         .collect::<Vec<_>>();
     tokenizer.train_and_replace(&trainer, files)?;
-    // END train
-    // START reload_model
+    // END quicktour_train
+    // START quicktour_reload_model
     use std::path::Path;
     use tokenizers::Model;
 
@@ -85,10 +85,10 @@ fn quicktour_slow_train() -> tokenizers::Result<()> {
             .unk_token("[UNK]".to_string())
             .build()?,
     );
-    // END reload_model
-    // START save
+    // END quicktour_reload_model
+    // START quicktour_save
     tokenizer.save("data/tokenizer-wiki.json", false)?;
-    // END save
+    // END quicktour_save
 
     Ok(())
 }
@@ -104,7 +104,7 @@ fn quicktour_get_tokenizer_trainer() -> tokenizers::Result<(
     >,
     BpeTrainer,
 )> {
-    // START init_tokenizer
+    // START quicktour_init_tokenizer
     use tokenizers::models::bpe::BPE;
     use tokenizers::TokenizerBuilder;
 
@@ -115,8 +115,8 @@ fn quicktour_get_tokenizer_trainer() -> tokenizers::Result<(
         PostProcessorWrapper,
         DecoderWrapper,
     > = TokenizerImpl::new(BPE::default());
-    // END init_tokenizer
-    // START init_trainer
+    // END quicktour_init_tokenizer
+    // START quicktour_init_trainer
     use tokenizers::models::bpe::BpeTrainer;
 
     let trainer = BpeTrainer::builder()
@@ -128,56 +128,56 @@ fn quicktour_get_tokenizer_trainer() -> tokenizers::Result<(
             AddedToken::from("[MASK]", true),
         ])
         .build();
-    // END init_trainer
-    // START init_pretok
+    // END quicktour_init_trainer
+    // START quicktour_init_pretok
     use tokenizers::pre_tokenizers::whitespace::Whitespace;
 
     tokenizer.with_pre_tokenizer(Whitespace::default());
-    // END init_pretok
+    // END quicktour_init_pretok
 
     Ok((tokenizer, trainer))
 }
 
 #[test]
 fn quicktour() -> tokenizers::Result<()> {
-    // START reload_tokenizer
+    // START quicktour_reload_tokenizer
     let mut tokenizer = Tokenizer::from_file("data/tokenizer-wiki.json")?;
-    // END reload_tokenizer
-    // START encode
+    // END quicktour_reload_tokenizer
+    // START quicktour_encode
     let output = tokenizer.encode("Hello, y'all! How are you 😁 ?", true)?;
-    // END encode
-    // START print_tokens
+    // END quicktour_encode
+    // START quicktour_print_tokens
     println!("{:?}", output.get_tokens());
     // ["Hello", ",", "y", "'", "all", "!", "How", "are", "you", "[UNK]", "?",]
-    // END print_tokens
+    // END quicktour_print_tokens
     assert_eq!(
         output.get_tokens(),
         ["Hello", ",", "y", "'", "all", "!", "How", "are", "you", "[UNK]", "?",]
     );
-    // START print_ids
+    // START quicktour_print_ids
     println!("{:?}", output.get_ids());
     // [27253, 16, 93, 11, 5097, 5, 7961, 5112, 6218, 0, 35]
-    // END print_ids
+    // END quicktour_print_ids
     assert_eq!(
         output.get_ids(),
         [27253, 16, 93, 11, 5097, 5, 7961, 5112, 6218, 0, 35]
    );
-    // START print_offsets
+    // START quicktour_print_offsets
     println!("{:?}", output.get_offsets()[9]);
     // (26, 30)
-    // END print_offsets
+    // END quicktour_print_offsets
     assert_eq!(output.get_offsets()[9], (26, 30));
-    // START use_offsets
+    // START quicktour_use_offsets
     let sentence = "Hello, y'all! How are you 😁 ?";
     println!("{}", &sentence[26..30]);
     // "😁"
-    // END use_offsets
-    // START check_sep
+    // END quicktour_use_offsets
+    // START quicktour_check_sep
     println!("{}", tokenizer.token_to_id("[SEP]").unwrap());
     // 2
-    // END check_sep
+    // END quicktour_check_sep
     assert_eq!(tokenizer.token_to_id("[SEP]"), Some(2));
-    // START init_template_processing
+    // START quicktour_init_template_processing
     use tokenizers::processors::template::TemplateProcessing;
 
     let special_tokens = vec![
@@ -193,21 +193,21 @@ fn quicktour() -> tokenizers::Result<()> {
             .special_tokens(special_tokens)
             .build()?,
     );
-    // END init_template_processing
-    // START print_special_tokens
+    // END quicktour_init_template_processing
+    // START quicktour_print_special_tokens
     let output = tokenizer.encode("Hello, y'all! How are you 😁 ?", true)?;
     println!("{:?}", output.get_tokens());
     // ["[CLS]", "Hello", ",", "y", "'", "all", "!", "How", "are", "you", "[UNK]", "?", "[SEP]"]
-    // END print_special_tokens
+    // END quicktour_print_special_tokens
     assert_eq!(
         output.get_tokens(),
         ["[CLS]", "Hello", ",", "y", "'", "all", "!", "How", "are", "you", "[UNK]", "?", "[SEP]"]
     );
-    // START print_special_tokens_pair
+    // START quicktour_print_special_tokens_pair
     let output = tokenizer.encode(("Hello, y'all!", "How are you 😁 ?"), true)?;
     println!("{:?}", output.get_tokens());
     // ["[CLS]", "Hello", ",", "y", "'", "all", "!", "[SEP]", "How", "are", "you", "[UNK]", "?", "[SEP]"]
-    // END print_special_tokens_pair
+    // END quicktour_print_special_tokens_pair
     assert_eq!(
         output.get_tokens(),
         [
@@ -215,19 +215,19 @@ fn quicktour() -> tokenizers::Result<()> {
             "?", "[SEP]"
         ]
     );
-    // START print_type_ids
+    // START quicktour_print_type_ids
     println!("{:?}", output.get_type_ids());
     // [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]
-    // END print_type_ids
+    // END quicktour_print_type_ids
     assert_eq!(
         output.get_type_ids(),
         [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]
     );
-    // START encode_batch
+    // START quicktour_encode_batch
     let output = tokenizer.encode_batch(vec!["Hello, y'all!", "How are you 😁 ?"], true)?;
-    // END encode_batch
+    // END quicktour_encode_batch
     println!("{:?}", output);
-    // START encode_batch_pair
+    // START quicktour_encode_batch_pair
     let output = tokenizer.encode_batch(
         vec![
             ("Hello, y'all!", "How are you 😁 ?"),
@@ -235,9 +235,9 @@ fn quicktour() -> tokenizers::Result<()> {
         ],
         true,
     )?;
-    // END encode_batch_pair
+    // END quicktour_encode_batch_pair
     println!("{:?}", output);
-    // START enable_padding
+    // START quicktour_enable_padding
     use tokenizers::PaddingParams;
 
     tokenizer.with_padding(Some(PaddingParams {
@@ -245,20 +245,20 @@ fn quicktour() -> tokenizers::Result<()> {
         pad_token: "[PAD]".to_string(),
         ..PaddingParams::default()
     }));
-    // END enable_padding
-    // START print_batch_tokens
+    // END quicktour_enable_padding
+    // START quicktour_print_batch_tokens
     let output = tokenizer.encode_batch(vec!["Hello, y'all!", "How are you 😁 ?"], true)?;
     println!("{:?}", output[1].get_tokens());
     // ["[CLS]", "How", "are", "you", "[UNK]", "?", "[SEP]", "[PAD]"]
-    // END print_batch_tokens
+    // END quicktour_print_batch_tokens
     assert_eq!(
         output[1].get_tokens(),
         ["[CLS]", "How", "are", "you", "[UNK]", "?", "[SEP]", "[PAD]"]
     );
-    // START print_attention_mask
+    // START quicktour_print_attention_mask
     println!("{:?}", output[1].get_attention_mask());
     // [1, 1, 1, 1, 1, 1, 1, 0]
-    // END print_attention_mask
+    // END quicktour_print_attention_mask
     assert_eq!(output[1].get_attention_mask(), [1, 1, 1, 1, 1, 1, 1, 0]);
 
     Ok(())
 }