From adfef0d906b28236874e3c6d99ac25cfdf24504b Mon Sep 17 00:00:00 2001
From: Anthony MOI
Date: Mon, 2 Nov 2020 12:19:44 -0500
Subject: [PATCH] Doc - Add @narsil suggestions

Co-Authored-By: Nicolas Patry
---
 README.md                         |   2 +-
 docs/source/components.rst        |   2 +-
 tokenizers/Makefile               |   6 +-
 tokenizers/tests/documentation.rs | 105 +++++++++++++++---------------
 4 files changed, 60 insertions(+), 55 deletions(-)

diff --git a/README.md b/README.md
index e733f1f8..761816c1 100644
--- a/README.md
+++ b/README.md
@@ -60,7 +60,7 @@ Then training your tokenizer on a set of files just takes two lines of codes:
 from tokenizers.trainers import BpeTrainer
 
 trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
-tokenizer.train(trainer, [wiki.train.raw, wiki.valid.raw, wiki.test.raw])
+tokenizer.train(trainer, ["wiki.train.raw", "wiki.valid.raw", "wiki.test.raw"])
 ```
 
 Once your tokenizer is trained, encode any text with just one line:
diff --git a/docs/source/components.rst b/docs/source/components.rst
index 1bf7bab5..f26d66ee 100644
--- a/docs/source/components.rst
+++ b/docs/source/components.rst
@@ -67,7 +67,7 @@ The ``Normalizer`` is optional.
    :header-rows: 1
 
    * - Name
-     - Desription
+     - Description
      - Example
 
    * - NFD
diff --git a/tokenizers/Makefile b/tokenizers/Makefile
index 3b826d3d..29173d75 100644
--- a/tokenizers/Makefile
+++ b/tokenizers/Makefile
@@ -6,7 +6,7 @@ dir_guard=@mkdir -p $(@D)
 
 SHARED_RESOURCES = $(DATA_DIR)/gpt2-vocab.json $(DATA_DIR)/gpt2-merges.txt $(DATA_DIR)/bert-base-uncased-vocab.txt $(DATA_DIR)/big.txt $(DATA_DIR)/small.txt
 BENCHMARK_RESOURCES = $(SHARED_RESOURCES)
-TESTS_RESOURCES = $(SHARED_RESOURCES) $(DATA_DIR)/unigram.json $(DATA_DIR)/unigram_wagahaiwa_nekodearu.txt $(DATA_DIR)/albert-base-v1-tokenizer.json $(DATA_DIR)/roberta.json $(DATA_DIR)/tokenizer-wiki.json
+TESTS_RESOURCES = $(SHARED_RESOURCES) $(DATA_DIR)/unigram.json $(DATA_DIR)/unigram_wagahaiwa_nekodearu.txt $(DATA_DIR)/albert-base-v1-tokenizer.json $(DATA_DIR)/roberta.json $(DATA_DIR)/tokenizer-wiki.json $(DATA_DIR)/bert-wiki.json
 
 .PHONY : build
 build :
@@ -75,3 +75,7 @@ $(DATA_DIR)/roberta.json :
 $(DATA_DIR)/tokenizer-wiki.json :
 	$(dir_guard)
 	wget https://s3.amazonaws.com/models.huggingface.co/bert/anthony/doc-quicktour/tokenizer.json -O $@
+
+$(DATA_DIR)/bert-wiki.json :
+	$(dir_guard)
+	wget https://s3.amazonaws.com/models.huggingface.co/bert/anthony/doc-pipeline/tokenizer.json -O $@
diff --git a/tokenizers/tests/documentation.rs b/tokenizers/tests/documentation.rs
index 90016a4c..9353d76c 100644
--- a/tokenizers/tests/documentation.rs
+++ b/tokenizers/tests/documentation.rs
@@ -1,4 +1,4 @@
-use tokenizers::models::bpe::{BpeTrainer, BpeTrainerBuilder, BPE};
+use tokenizers::models::bpe::{BpeTrainerBuilder, BPE};
 use tokenizers::normalizers::{Sequence, Strip, NFC};
 use tokenizers::pre_tokenizers::byte_level::ByteLevel;
 use tokenizers::{AddedToken, TokenizerBuilder};
@@ -61,52 +61,8 @@ fn load_tokenizer() {
 #[test]
 #[ignore]
 fn quicktour_slow_train() -> tokenizers::Result<()> {
-    let (mut tokenizer, trainer) = quicktour_get_tokenizer_trainer()?;
-
-    // START quicktour_train
-    let files = ["test", "train", "valid"]
-        .iter()
-        .map(|split| format!("data/wikitext-103-raw/wiki.{}.raw", split))
-        .collect::<Vec<_>>();
-    tokenizer.train_and_replace(&trainer, files)?;
-    // END quicktour_train
-    // START quicktour_reload_model
-    use std::path::Path;
-    use tokenizers::Model;
-
-    let saved_files = tokenizer
-        .get_model()
-        .save(&Path::new("data"), Some("wiki"))?;
-    tokenizer.with_model(
-        BPE::from_file(
-            saved_files[0].to_str().unwrap(),
-            &saved_files[1].to_str().unwrap(),
-        )
-        .unk_token("[UNK]".to_string())
-        .build()?,
-    );
-    // END quicktour_reload_model
-    // START quicktour_save
-    tokenizer.save("data/tokenizer-wiki.json", false)?;
-    // END quicktour_save
-
-    Ok(())
-}
-
-#[allow(unused_imports, clippy::type_complexity)]
-fn quicktour_get_tokenizer_trainer() -> tokenizers::Result<(
-    TokenizerImpl<
-        BPE,
-        NormalizerWrapper,
-        PreTokenizerWrapper,
-        PostProcessorWrapper,
-        DecoderWrapper,
-    >,
-    BpeTrainer,
-)> {
     // START quicktour_init_tokenizer
     use tokenizers::models::bpe::BPE;
-    use tokenizers::TokenizerBuilder;
 
     let mut tokenizer: TokenizerImpl<
         BPE,
@@ -135,7 +91,35 @@ fn quicktour_get_tokenizer_trainer() -> tokenizers::Result<(
     tokenizer.with_pre_tokenizer(Whitespace::default());
     // END quicktour_init_pretok
 
-    Ok((tokenizer, trainer))
+    // START quicktour_train
+    let files = vec![
+        "data/wikitext-103-raw/wiki.train.raw".into(),
+        "data/wikitext-103-raw/wiki.test.raw".into(),
+        "data/wikitext-103-raw/wiki.valid.raw".into(),
+    ];
+    tokenizer.train_and_replace(&trainer, files)?;
+    // END quicktour_train
+    // START quicktour_reload_model
+    use std::path::Path;
+    use tokenizers::Model;
+
+    let saved_files = tokenizer
+        .get_model()
+        .save(&Path::new("data"), Some("wiki"))?;
+    tokenizer.with_model(
+        BPE::from_file(
+            saved_files[0].to_str().unwrap(),
+            &saved_files[1].to_str().unwrap(),
+        )
+        .unk_token("[UNK]".to_string())
+        .build()?,
+    );
+    // END quicktour_reload_model
+    // START quicktour_save
+    tokenizer.save("data/tokenizer-wiki.json", false)?;
+    // END quicktour_save
+
+    Ok(())
 }
 
 #[test]
@@ -386,7 +370,7 @@ fn pipeline() -> tokenizers::Result<()> {
 
 #[test]
 #[ignore]
-fn pipeline_bert() -> tokenizers::Result<()> {
+fn train_pipeline_bert() -> tokenizers::Result<()> {
     // START bert_setup_tokenizer
     use tokenizers::models::wordpiece::WordPiece;
     use tokenizers::Tokenizer;
@@ -438,10 +422,11 @@ fn pipeline_bert() -> tokenizers::Result<()> {
         ])
         .build()
         .into();
-    let files = ["test", "train", "valid"]
-        .iter()
-        .map(|split| format!("data/wikitext-103-raw/wiki.{}.raw", split))
-        .collect::<Vec<_>>();
+    let files = vec![
+        "data/wikitext-103-raw/wiki.train.raw".into(),
+        "data/wikitext-103-raw/wiki.test.raw".into(),
+        "data/wikitext-103-raw/wiki.valid.raw".into(),
+    ];
     bert_tokenizer.train_and_replace(&trainer, files)?;
 
     let model_files = bert_tokenizer
@@ -456,6 +441,13 @@ fn pipeline_bert() -> tokenizers::Result<()> {
     bert_tokenizer.save("data/bert-wiki.json", false)?;
     // END bert_train_tokenizer
 
+    Ok(())
+}
+
+#[test]
+fn pipeline_bert() -> tokenizers::Result<()> {
+    let mut bert_tokenizer = Tokenizer::from_file("data/bert-wiki.json")?;
+
     // START bert_test_decoding
     let output = bert_tokenizer.encode("Welcome to the 🤗 Tokenizers library.", true)?;
     println!("{:?}", output.get_tokens());
@@ -465,13 +457,22 @@ fn pipeline_bert() -> tokenizers::Result<()> {
     println!("{}", decoded);
     // "welcome to the tok ##eni ##zer ##s library ."
     // END bert_test_decoding
+    assert_eq!(
+        output.get_tokens(),
+        &[
+            "[CLS]", "welcome", "to", "the", "[UNK]", "tok", "##eni", "##zer", "##s", "library",
+            ".", "[SEP]"
+        ]
+    );
+    assert_eq!(decoded, "welcome to the tok ##eni ##zer ##s library .");
     // START bert_proper_decoding
     use tokenizers::decoders::wordpiece::WordPiece as WordPieceDecoder;
+
     bert_tokenizer.with_decoder(WordPieceDecoder::default());
     let decoded = bert_tokenizer.decode(output.get_ids().to_vec(), true)?;
     // "welcome to the tokenizers library."
     // END bert_proper_decoding
-    println!("{}", decoded);
+    assert_eq!(decoded, "welcome to the tokenizers library.");
 
     Ok(())
 }
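
A note on the new test split: the slow, #[ignore]d train_pipeline_bert (or the new $(DATA_DIR)/bert-wiki.json Makefile rule) is what produces data/bert-wiki.json, and the fast pipeline_bert test only loads that file and checks the decoded output. The standalone Rust sketch below mirrors that fast path outside the test harness. It is an illustration, not part of the patch: it sticks to calls that already appear in the diff (Tokenizer::from_file, encode, decode, with_decoder), and the file name and sample sentence are taken from the diff; it assumes the same tokenizers crate version this patch targets.

use tokenizers::decoders::wordpiece::WordPiece as WordPieceDecoder;
use tokenizers::Tokenizer;

fn main() -> tokenizers::Result<()> {
    // Load the tokenizer trained by train_pipeline_bert (or downloaded by the
    // new bert-wiki.json Makefile rule).
    let mut bert_tokenizer = Tokenizer::from_file("data/bert-wiki.json")?;

    // Encode with special tokens added, as in the test.
    let output = bert_tokenizer.encode("Welcome to the 🤗 Tokenizers library.", true)?;
    println!("{:?}", output.get_tokens());
    // ["[CLS]", "welcome", "to", "the", "[UNK]", "tok", "##eni", "##zer", "##s", "library", ".", "[SEP]"]

    // Without a decoder, the WordPiece continuation markers ("##") are left in place.
    println!("{}", bert_tokenizer.decode(output.get_ids().to_vec(), true)?);
    // "welcome to the tok ##eni ##zer ##s library ."

    // Attaching the WordPiece decoder merges the pieces back into words.
    bert_tokenizer.with_decoder(WordPieceDecoder::default());
    println!("{}", bert_tokenizer.decode(output.get_ids().to_vec(), true)?);
    // "welcome to the tokenizers library."

    Ok(())
}

Splitting training out of the assertion test keeps the default test run independent of the wikitext-103 download: CI only needs the small pretrained JSON fetched by the Makefile, while the full training path stays available behind #[ignore].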