Doc - Add @narsil suggestions

Co-Authored-By: Nicolas Patry <patry.nicolas@protonmail.com>
Anthony MOI
2020-11-02 12:19:44 -05:00
committed by Anthony MOI
parent 56c507dcdd
commit adfef0d906
4 changed files with 60 additions and 55 deletions


@@ -60,7 +60,7 @@ Then training your tokenizer on a set of files just takes two lines of codes:
 from tokenizers.trainers import BpeTrainer
 trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
-tokenizer.train(trainer, [wiki.train.raw, wiki.valid.raw, wiki.test.raw])
+tokenizer.train(trainer, ["wiki.train.raw", "wiki.valid.raw", "wiki.test.raw"])
 ```
 Once your tokenizer is trained, encode any text with just one line:
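The corrected call above assumes the rest of the README quicktour has already set the tokenizer up. A minimal end-to-end sketch in Python, using the quicktour's BPE model and Whitespace pre-tokenizer and the `train(trainer, files)` argument order shown in the snippet above:

```python
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import BpeTrainer

# Quicktour-style setup: a BPE model with an unknown token,
# splitting on whitespace and punctuation before training.
tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
tokenizer.pre_tokenizer = Whitespace()

trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
# The training files are plain strings, which is exactly what the fix above restores.
tokenizer.train(trainer, ["wiki.train.raw", "wiki.valid.raw", "wiki.test.raw"])

# Once trained, encoding is a single call.
output = tokenizer.encode("Hello, y'all! How are you 😁 ?")
print(output.tokens)
```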


@@ -67,7 +67,7 @@ The ``Normalizer`` is optional.
    :header-rows: 1
    * - Name
-     - Desription
+     - Description
      - Example
    * - NFD


@@ -6,7 +6,7 @@ dir_guard=@mkdir -p $(@D)
 SHARED_RESOURCES = $(DATA_DIR)/gpt2-vocab.json $(DATA_DIR)/gpt2-merges.txt $(DATA_DIR)/bert-base-uncased-vocab.txt $(DATA_DIR)/big.txt $(DATA_DIR)/small.txt
 BENCHMARK_RESOURCES = $(SHARED_RESOURCES)
-TESTS_RESOURCES = $(SHARED_RESOURCES) $(DATA_DIR)/unigram.json $(DATA_DIR)/unigram_wagahaiwa_nekodearu.txt $(DATA_DIR)/albert-base-v1-tokenizer.json $(DATA_DIR)/roberta.json $(DATA_DIR)/tokenizer-wiki.json
+TESTS_RESOURCES = $(SHARED_RESOURCES) $(DATA_DIR)/unigram.json $(DATA_DIR)/unigram_wagahaiwa_nekodearu.txt $(DATA_DIR)/albert-base-v1-tokenizer.json $(DATA_DIR)/roberta.json $(DATA_DIR)/tokenizer-wiki.json $(DATA_DIR)/bert-wiki.json
 .PHONY : build
 build :
@@ -75,3 +75,7 @@ $(DATA_DIR)/roberta.json :
 $(DATA_DIR)/tokenizer-wiki.json :
 	$(dir_guard)
 	wget https://s3.amazonaws.com/models.huggingface.co/bert/anthony/doc-quicktour/tokenizer.json -O $@
+
+$(DATA_DIR)/bert-wiki.json :
+	$(dir_guard)
+	wget https://s3.amazonaws.com/models.huggingface.co/bert/anthony/doc-pipeline/tokenizer.json -O $@


@@ -1,4 +1,4 @@
-use tokenizers::models::bpe::{BpeTrainer, BpeTrainerBuilder, BPE};
+use tokenizers::models::bpe::{BpeTrainerBuilder, BPE};
 use tokenizers::normalizers::{Sequence, Strip, NFC};
 use tokenizers::pre_tokenizers::byte_level::ByteLevel;
 use tokenizers::{AddedToken, TokenizerBuilder};
@@ -61,52 +61,8 @@ fn load_tokenizer() {
 #[test]
 #[ignore]
 fn quicktour_slow_train() -> tokenizers::Result<()> {
-    let (mut tokenizer, trainer) = quicktour_get_tokenizer_trainer()?;
-    // START quicktour_train
-    let files = ["test", "train", "valid"]
-        .iter()
-        .map(|split| format!("data/wikitext-103-raw/wiki.{}.raw", split))
-        .collect::<Vec<_>>();
-    tokenizer.train_and_replace(&trainer, files)?;
-    // END quicktour_train
-    // START quicktour_reload_model
-    use std::path::Path;
-    use tokenizers::Model;
-    let saved_files = tokenizer
-        .get_model()
-        .save(&Path::new("data"), Some("wiki"))?;
-    tokenizer.with_model(
-        BPE::from_file(
-            saved_files[0].to_str().unwrap(),
-            &saved_files[1].to_str().unwrap(),
-        )
-        .unk_token("[UNK]".to_string())
-        .build()?,
-    );
-    // END quicktour_reload_model
-    // START quicktour_save
-    tokenizer.save("data/tokenizer-wiki.json", false)?;
-    // END quicktour_save
-    Ok(())
-}
-#[allow(unused_imports, clippy::type_complexity)]
-fn quicktour_get_tokenizer_trainer() -> tokenizers::Result<(
-    TokenizerImpl<
-        BPE,
-        NormalizerWrapper,
-        PreTokenizerWrapper,
-        PostProcessorWrapper,
-        DecoderWrapper,
-    >,
-    BpeTrainer,
-)> {
     // START quicktour_init_tokenizer
     use tokenizers::models::bpe::BPE;
     use tokenizers::TokenizerBuilder;
     let mut tokenizer: TokenizerImpl<
         BPE,
@@ -135,7 +91,35 @@ fn quicktour_get_tokenizer_trainer() -> tokenizers::Result<(
     tokenizer.with_pre_tokenizer(Whitespace::default());
     // END quicktour_init_pretok
-    Ok((tokenizer, trainer))
+    // START quicktour_train
+    let files = vec![
+        "data/wikitext-103-raw/wiki.train.raw".into(),
+        "data/wikitext-103-raw/wiki.test.raw".into(),
+        "data/wikitext-103-raw/wiki.valid.raw".into(),
+    ];
+    tokenizer.train_and_replace(&trainer, files)?;
+    // END quicktour_train
+    // START quicktour_reload_model
+    use std::path::Path;
+    use tokenizers::Model;
+    let saved_files = tokenizer
+        .get_model()
+        .save(&Path::new("data"), Some("wiki"))?;
+    tokenizer.with_model(
+        BPE::from_file(
+            saved_files[0].to_str().unwrap(),
+            &saved_files[1].to_str().unwrap(),
+        )
+        .unk_token("[UNK]".to_string())
+        .build()?,
+    );
+    // END quicktour_reload_model
+    // START quicktour_save
+    tokenizer.save("data/tokenizer-wiki.json", false)?;
+    // END quicktour_save
+    Ok(())
 }
 #[test]
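The two hunks above inline the old `quicktour_get_tokenizer_trainer` helper into `quicktour_slow_train` and replace the `iter().map()` construction with an explicit `vec!` of paths. For reference, the same train / save-model / reload / save-tokenizer sequence looks roughly like this on the Python side; `tokenizer.model.save` and `BPE.from_file` are assumed Python counterparts of the Rust `Model::save` and `BPE::from_file` used above:

```python
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import BpeTrainer

tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
tokenizer.pre_tokenizer = Whitespace()
trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])

# Explicit file list, mirroring the vec![...] in the updated Rust test.
files = [f"data/wikitext-103-raw/wiki.{split}.raw" for split in ["train", "test", "valid"]]
tokenizer.train(trainer, files)

# Save only the model (vocab + merges), then reload it with the unk token set
# (assumed equivalents of the Model::save and BPE::from_file calls above).
vocab_and_merges = tokenizer.model.save("data", "wiki")
tokenizer.model = BPE.from_file(*vocab_and_merges, unk_token="[UNK]")

# Save the full tokenizer (model, pre-tokenizer, ...) as a single JSON file.
tokenizer.save("data/tokenizer-wiki.json")
```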
@@ -386,7 +370,7 @@ fn pipeline() -> tokenizers::Result<()> {
 #[test]
 #[ignore]
-fn pipeline_bert() -> tokenizers::Result<()> {
+fn train_pipeline_bert() -> tokenizers::Result<()> {
     // START bert_setup_tokenizer
     use tokenizers::models::wordpiece::WordPiece;
     use tokenizers::Tokenizer;
@@ -438,10 +422,11 @@ fn pipeline_bert() -> tokenizers::Result<()> {
         ])
         .build()
         .into();
-    let files = ["test", "train", "valid"]
-        .iter()
-        .map(|split| format!("data/wikitext-103-raw/wiki.{}.raw", split))
-        .collect::<Vec<_>>();
+    let files = vec![
+        "data/wikitext-103-raw/wiki.train.raw".into(),
+        "data/wikitext-103-raw/wiki.test.raw".into(),
+        "data/wikitext-103-raw/wiki.valid.raw".into(),
+    ];
     bert_tokenizer.train_and_replace(&trainer, files)?;
     let model_files = bert_tokenizer
@@ -456,6 +441,13 @@ fn pipeline_bert() -> tokenizers::Result<()> {
     bert_tokenizer.save("data/bert-wiki.json", false)?;
     // END bert_train_tokenizer
+    Ok(())
+}
+#[test]
+fn pipeline_bert() -> tokenizers::Result<()> {
+    let mut bert_tokenizer = Tokenizer::from_file("data/bert-wiki.json")?;
     // START bert_test_decoding
     let output = bert_tokenizer.encode("Welcome to the 🤗 Tokenizers library.", true)?;
     println!("{:?}", output.get_tokens());
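Together with the Makefile change, these hunks split the old `pipeline_bert` test in two: an ignored `train_pipeline_bert` that trains the WordPiece tokenizer and writes `data/bert-wiki.json`, and a fast `pipeline_bert` that only reloads that file. A rough Python sketch of the training half, assuming the usual BERT pipeline setup from the docs (NFD + Lowercase + StripAccents normalizer, Whitespace pre-tokenizer, BERT template post-processor):

```python
from tokenizers import Tokenizer, normalizers
from tokenizers.models import WordPiece
from tokenizers.normalizers import NFD, Lowercase, StripAccents
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.processors import TemplateProcessing
from tokenizers.trainers import WordPieceTrainer

bert_tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
bert_tokenizer.normalizer = normalizers.Sequence([NFD(), Lowercase(), StripAccents()])
bert_tokenizer.pre_tokenizer = Whitespace()
# [CLS]/[SEP] ids follow the order of the special tokens given to the trainer below.
bert_tokenizer.post_processor = TemplateProcessing(
    single="[CLS] $A [SEP]",
    pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    special_tokens=[("[CLS]", 1), ("[SEP]", 2)],
)

trainer = WordPieceTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
files = [f"data/wikitext-103-raw/wiki.{split}.raw" for split in ["train", "test", "valid"]]
bert_tokenizer.train(trainer, files)

# The non-ignored test (and the Makefile download rule above) expect this file to exist.
bert_tokenizer.save("data/bert-wiki.json")
```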
@@ -465,13 +457,22 @@ fn pipeline_bert() -> tokenizers::Result<()> {
     println!("{}", decoded);
     // "welcome to the tok ##eni ##zer ##s library ."
     // END bert_test_decoding
+    assert_eq!(
+        output.get_tokens(),
+        &[
+            "[CLS]", "welcome", "to", "the", "[UNK]", "tok", "##eni", "##zer", "##s", "library",
+            ".", "[SEP]"
+        ]
+    );
+    assert_eq!(decoded, "welcome to the tok ##eni ##zer ##s library .");
     // START bert_proper_decoding
     use tokenizers::decoders::wordpiece::WordPiece as WordPieceDecoder;
     bert_tokenizer.with_decoder(WordPieceDecoder::default());
     let decoded = bert_tokenizer.decode(output.get_ids().to_vec(), true)?;
     // "welcome to the tokenizers library."
     // END bert_proper_decoding
     println!("{}", decoded);
+    assert_eq!(decoded, "welcome to the tokenizers library.");
     Ok(())
 }
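Finally, the decoding behavior that the new assertions pin down, again sketched in Python; `decoders.WordPiece()` plays the role of the Rust `WordPieceDecoder` and merges the `##` continuation pieces back into whole words:

```python
from tokenizers import Tokenizer, decoders

bert_tokenizer = Tokenizer.from_file("data/bert-wiki.json")

output = bert_tokenizer.encode("Welcome to the 🤗 Tokenizers library.")
print(output.tokens)
# ['[CLS]', 'welcome', 'to', 'the', '[UNK]', 'tok', '##eni', '##zer', '##s', 'library', '.', '[SEP]']

# Without a decoder, the wordpiece markers are left in place.
print(bert_tokenizer.decode(output.ids))
# "welcome to the tok ##eni ##zer ##s library ."

# With the WordPiece decoder, sub-words are merged back together.
bert_tokenizer.decoder = decoders.WordPiece()
print(bert_tokenizer.decode(output.ids))
# "welcome to the tokenizers library."
```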