Doc - Add @narsil suggestions
Co-Authored-By: Nicolas Patry <patry.nicolas@protonmail.com>
@@ -60,7 +60,7 @@ Then training your tokenizer on a set of files just takes two lines of codes:
from tokenizers.trainers import BpeTrainer

trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
tokenizer.train(trainer, [wiki.train.raw, wiki.valid.raw, wiki.test.raw])
tokenizer.train(trainer, ["wiki.train.raw", "wiki.valid.raw", "wiki.test.raw"])
```

Once your tokenizer is trained, encode any text with just one line:
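For context, here is a minimal sketch of the quicktour flow that the corrected line above slots into, assuming the Python API of this release; the `Tokenizer`/`BPE`/`Whitespace` setup and the wikitext file names come from the surrounding quicktour docs, not from this hunk.

```python
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import BpeTrainer

# Build a BPE tokenizer with an unknown token and whitespace pre-tokenization.
tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
tokenizer.pre_tokenizer = Whitespace()

# The file paths are plain strings, which is exactly what the hunk above fixes.
trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
tokenizer.train(trainer, ["wiki.train.raw", "wiki.valid.raw", "wiki.test.raw"])
```

Without the quotes, the old line would raise a `NameError`, since `wiki.train.raw` is attribute access on an undefined name rather than a file path.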
@@ -67,7 +67,7 @@ The ``Normalizer`` is optional.
:header-rows: 1

* - Name
- Desription
- Description
- Example

* - NFD
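As a concrete illustration of the table entry above, a hedged sketch of attaching an NFD-based normalizer in the Python bindings; combining it with `StripAccents` and `Lowercase` is an assumption borrowed from the general normalizer docs, not something this hunk shows.

```python
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.normalizers import NFD, Lowercase, Sequence, StripAccents

tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
# NFD-decompose, drop the accent marks that decomposition exposes, then lowercase.
tokenizer.normalizer = Sequence([NFD(), StripAccents(), Lowercase()])
```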
@@ -6,7 +6,7 @@ dir_guard=@mkdir -p $(@D)

SHARED_RESOURCES = $(DATA_DIR)/gpt2-vocab.json $(DATA_DIR)/gpt2-merges.txt $(DATA_DIR)/bert-base-uncased-vocab.txt $(DATA_DIR)/big.txt $(DATA_DIR)/small.txt
BENCHMARK_RESOURCES = $(SHARED_RESOURCES)
TESTS_RESOURCES = $(SHARED_RESOURCES) $(DATA_DIR)/unigram.json $(DATA_DIR)/unigram_wagahaiwa_nekodearu.txt $(DATA_DIR)/albert-base-v1-tokenizer.json $(DATA_DIR)/roberta.json $(DATA_DIR)/tokenizer-wiki.json
TESTS_RESOURCES = $(SHARED_RESOURCES) $(DATA_DIR)/unigram.json $(DATA_DIR)/unigram_wagahaiwa_nekodearu.txt $(DATA_DIR)/albert-base-v1-tokenizer.json $(DATA_DIR)/roberta.json $(DATA_DIR)/tokenizer-wiki.json $(DATA_DIR)/bert-wiki.json

.PHONY : build
build :
@@ -75,3 +75,7 @@ $(DATA_DIR)/roberta.json :
$(DATA_DIR)/tokenizer-wiki.json :
$(dir_guard)
wget https://s3.amazonaws.com/models.huggingface.co/bert/anthony/doc-quicktour/tokenizer.json -O $@

$(DATA_DIR)/bert-wiki.json :
$(dir_guard)
wget https://s3.amazonaws.com/models.huggingface.co/bert/anthony/doc-pipeline/tokenizer.json -O $@
@@ -1,4 +1,4 @@
use tokenizers::models::bpe::{BpeTrainer, BpeTrainerBuilder, BPE};
use tokenizers::models::bpe::{BpeTrainerBuilder, BPE};
use tokenizers::normalizers::{Sequence, Strip, NFC};
use tokenizers::pre_tokenizers::byte_level::ByteLevel;
use tokenizers::{AddedToken, TokenizerBuilder};
@@ -61,52 +61,8 @@ fn load_tokenizer() {
#[test]
#[ignore]
fn quicktour_slow_train() -> tokenizers::Result<()> {
let (mut tokenizer, trainer) = quicktour_get_tokenizer_trainer()?;

// START quicktour_train
let files = ["test", "train", "valid"]
.iter()
.map(|split| format!("data/wikitext-103-raw/wiki.{}.raw", split))
.collect::<Vec<_>>();
tokenizer.train_and_replace(&trainer, files)?;
// END quicktour_train
// START quicktour_reload_model
use std::path::Path;
use tokenizers::Model;

let saved_files = tokenizer
.get_model()
.save(&Path::new("data"), Some("wiki"))?;
tokenizer.with_model(
BPE::from_file(
saved_files[0].to_str().unwrap(),
&saved_files[1].to_str().unwrap(),
)
.unk_token("[UNK]".to_string())
.build()?,
);
// END quicktour_reload_model
// START quicktour_save
tokenizer.save("data/tokenizer-wiki.json", false)?;
// END quicktour_save

Ok(())
}

#[allow(unused_imports, clippy::type_complexity)]
fn quicktour_get_tokenizer_trainer() -> tokenizers::Result<(
TokenizerImpl<
BPE,
NormalizerWrapper,
PreTokenizerWrapper,
PostProcessorWrapper,
DecoderWrapper,
>,
BpeTrainer,
)> {
// START quicktour_init_tokenizer
use tokenizers::models::bpe::BPE;
use tokenizers::TokenizerBuilder;

let mut tokenizer: TokenizerImpl<
BPE,
@@ -135,7 +91,35 @@ fn quicktour_get_tokenizer_trainer() -> tokenizers::Result<(
tokenizer.with_pre_tokenizer(Whitespace::default());
// END quicktour_init_pretok

Ok((tokenizer, trainer))
// START quicktour_train
let files = vec![
"data/wikitext-103-raw/wiki.train.raw".into(),
"data/wikitext-103-raw/wiki.test.raw".into(),
"data/wikitext-103-raw/wiki.valid.raw".into(),
];
tokenizer.train_and_replace(&trainer, files)?;
// END quicktour_train
// START quicktour_reload_model
use std::path::Path;
use tokenizers::Model;

let saved_files = tokenizer
.get_model()
.save(&Path::new("data"), Some("wiki"))?;
tokenizer.with_model(
BPE::from_file(
saved_files[0].to_str().unwrap(),
&saved_files[1].to_str().unwrap(),
)
.unk_token("[UNK]".to_string())
.build()?,
);
// END quicktour_reload_model
// START quicktour_save
tokenizer.save("data/tokenizer-wiki.json", false)?;
// END quicktour_save

Ok(())
}

#[test]
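Once `data/tokenizer-wiki.json` exists (produced by this slow-train test, or fetched by the Makefile rule above), reloading it is a one-liner. Here is a hedged Python counterpart of the step the quicktour exercises next; the sample sentence is illustrative.

```python
from tokenizers import Tokenizer

tokenizer = Tokenizer.from_file("data/tokenizer-wiki.json")
output = tokenizer.encode("Hello, y'all! How are you 😁 ?")
print(output.tokens)  # the subword strings
print(output.ids)     # their vocabulary ids
```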
@@ -386,7 +370,7 @@ fn pipeline() -> tokenizers::Result<()> {

#[test]
#[ignore]
fn pipeline_bert() -> tokenizers::Result<()> {
fn train_pipeline_bert() -> tokenizers::Result<()> {
// START bert_setup_tokenizer
use tokenizers::models::wordpiece::WordPiece;
use tokenizers::Tokenizer;
@@ -438,10 +422,11 @@ fn pipeline_bert() -> tokenizers::Result<()> {
])
.build()
.into();
let files = ["test", "train", "valid"]
.iter()
.map(|split| format!("data/wikitext-103-raw/wiki.{}.raw", split))
.collect::<Vec<_>>();
let files = vec![
"data/wikitext-103-raw/wiki.train.raw".into(),
"data/wikitext-103-raw/wiki.test.raw".into(),
"data/wikitext-103-raw/wiki.valid.raw".into(),
];
bert_tokenizer.train_and_replace(&trainer, files)?;

let model_files = bert_tokenizer
@@ -456,6 +441,13 @@ fn pipeline_bert() -> tokenizers::Result<()> {

bert_tokenizer.save("data/bert-wiki.json", false)?;
// END bert_train_tokenizer
Ok(())
}

#[test]
fn pipeline_bert() -> tokenizers::Result<()> {
let mut bert_tokenizer = Tokenizer::from_file("data/bert-wiki.json")?;

// START bert_test_decoding
let output = bert_tokenizer.encode("Welcome to the 🤗 Tokenizers library.", true)?;
println!("{:?}", output.get_tokens());
@@ -465,13 +457,22 @@ fn pipeline_bert() -> tokenizers::Result<()> {
println!("{}", decoded);
// "welcome to the tok ##eni ##zer ##s library ."
// END bert_test_decoding
assert_eq!(
output.get_tokens(),
&[
"[CLS]", "welcome", "to", "the", "[UNK]", "tok", "##eni", "##zer", "##s", "library",
".", "[SEP]"
]
);
assert_eq!(decoded, "welcome to the tok ##eni ##zer ##s library .");
// START bert_proper_decoding
use tokenizers::decoders::wordpiece::WordPiece as WordPieceDecoder;

bert_tokenizer.with_decoder(WordPieceDecoder::default());
let decoded = bert_tokenizer.decode(output.get_ids().to_vec(), true)?;
// "welcome to the tokenizers library."
// END bert_proper_decoding
println!("{}", decoded);
assert_eq!(decoded, "welcome to the tokenizers library.");

Ok(())
}
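The decoding behaviour asserted above has a direct Python counterpart; a hedged sketch, assuming `decoders.WordPiece` in the Python bindings mirrors the Rust `WordPiece` decoder used by the test.

```python
from tokenizers import Tokenizer, decoders

bert_tokenizer = Tokenizer.from_file("data/bert-wiki.json")
output = bert_tokenizer.encode("Welcome to the 🤗 Tokenizers library.")

# Without a decoder the word-piece continuation markers leak through:
# "welcome to the tok ##eni ##zer ##s library ."
print(bert_tokenizer.decode(output.ids))

# Attaching the WordPiece decoder merges the "##" pieces back into words:
# "welcome to the tokenizers library."
bert_tokenizer.decoder = decoders.WordPiece()
print(bert_tokenizer.decode(output.ids))
```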