Doc - Add @narsil suggestions
Co-Authored-By: Nicolas Patry <patry.nicolas@protonmail.com>
@@ -60,7 +60,7 @@ Then training your tokenizer on a set of files just takes two lines of codes:
 from tokenizers.trainers import BpeTrainer
 
 trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
-tokenizer.train(trainer, [wiki.train.raw, wiki.valid.raw, wiki.test.raw])
+tokenizer.train(trainer, ["wiki.train.raw", "wiki.valid.raw", "wiki.test.raw"])
 ```
 
 Once your tokenizer is trained, encode any text with just one line:
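The snippet this hunk corrects comes from the README quicktour: the training file names must be passed as strings. As a hedged, self-contained sketch of the surrounding flow (the `Whitespace` pre-tokenizer and the `unk_token` wiring are assumptions taken from the library's usual quicktour, not from this hunk; the `train(trainer, files)` argument order follows the line shown above):

```python
# Hedged sketch around the corrected line above (not part of the diff).
# Assumes the wiki*.raw files exist locally.
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import BpeTrainer

tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
tokenizer.pre_tokenizer = Whitespace()

trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
# The file names must be plain strings, which is what this hunk fixes.
tokenizer.train(trainer, ["wiki.train.raw", "wiki.valid.raw", "wiki.test.raw"])

# "Once your tokenizer is trained, encode any text with just one line":
output = tokenizer.encode("Hello, y'all! How are you 😁 ?")
print(output.tokens)
```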
@@ -67,7 +67,7 @@ The ``Normalizer`` is optional.
    :header-rows: 1
 
    * - Name
-     - Desription
+     - Description
      - Example
 
    * - NFD
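This hunk only fixes a typo in the normalizers table (`Desription` → `Description`); the table lists normalizers such as NFD. As a hedged illustration of how a normalizer from that table is attached in the Python bindings (`StripAccents` and the sample string are standard library examples, not part of this diff):

```python
# Hedged illustration of the normalizers listed in the table (not part of the diff).
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.normalizers import NFD, Sequence, StripAccents

tokenizer = Tokenizer(BPE())
# NFD unicode decomposition followed by accent stripping.
tokenizer.normalizer = Sequence([NFD(), StripAccents()])

print(tokenizer.normalizer.normalize_str("Héllò hôw are ü?"))
# "Hello how are u?"
```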
@@ -6,7 +6,7 @@ dir_guard=@mkdir -p $(@D)
 
 SHARED_RESOURCES = $(DATA_DIR)/gpt2-vocab.json $(DATA_DIR)/gpt2-merges.txt $(DATA_DIR)/bert-base-uncased-vocab.txt $(DATA_DIR)/big.txt $(DATA_DIR)/small.txt
 BENCHMARK_RESOURCES = $(SHARED_RESOURCES)
-TESTS_RESOURCES = $(SHARED_RESOURCES) $(DATA_DIR)/unigram.json $(DATA_DIR)/unigram_wagahaiwa_nekodearu.txt $(DATA_DIR)/albert-base-v1-tokenizer.json $(DATA_DIR)/roberta.json $(DATA_DIR)/tokenizer-wiki.json
+TESTS_RESOURCES = $(SHARED_RESOURCES) $(DATA_DIR)/unigram.json $(DATA_DIR)/unigram_wagahaiwa_nekodearu.txt $(DATA_DIR)/albert-base-v1-tokenizer.json $(DATA_DIR)/roberta.json $(DATA_DIR)/tokenizer-wiki.json $(DATA_DIR)/bert-wiki.json
 
 .PHONY : build
 build :
@@ -75,3 +75,7 @@ $(DATA_DIR)/roberta.json :
 $(DATA_DIR)/tokenizer-wiki.json :
 	$(dir_guard)
 	wget https://s3.amazonaws.com/models.huggingface.co/bert/anthony/doc-quicktour/tokenizer.json -O $@
+
+$(DATA_DIR)/bert-wiki.json :
+	$(dir_guard)
+	wget https://s3.amazonaws.com/models.huggingface.co/bert/anthony/doc-pipeline/tokenizer.json -O $@
@@ -1,4 +1,4 @@
-use tokenizers::models::bpe::{BpeTrainer, BpeTrainerBuilder, BPE};
+use tokenizers::models::bpe::{BpeTrainerBuilder, BPE};
 use tokenizers::normalizers::{Sequence, Strip, NFC};
 use tokenizers::pre_tokenizers::byte_level::ByteLevel;
 use tokenizers::{AddedToken, TokenizerBuilder};
@@ -61,52 +61,8 @@ fn load_tokenizer() {
 #[test]
 #[ignore]
 fn quicktour_slow_train() -> tokenizers::Result<()> {
-    let (mut tokenizer, trainer) = quicktour_get_tokenizer_trainer()?;
-
-    // START quicktour_train
-    let files = ["test", "train", "valid"]
-        .iter()
-        .map(|split| format!("data/wikitext-103-raw/wiki.{}.raw", split))
-        .collect::<Vec<_>>();
-    tokenizer.train_and_replace(&trainer, files)?;
-    // END quicktour_train
-    // START quicktour_reload_model
-    use std::path::Path;
-    use tokenizers::Model;
-
-    let saved_files = tokenizer
-        .get_model()
-        .save(&Path::new("data"), Some("wiki"))?;
-    tokenizer.with_model(
-        BPE::from_file(
-            saved_files[0].to_str().unwrap(),
-            &saved_files[1].to_str().unwrap(),
-        )
-        .unk_token("[UNK]".to_string())
-        .build()?,
-    );
-    // END quicktour_reload_model
-    // START quicktour_save
-    tokenizer.save("data/tokenizer-wiki.json", false)?;
-    // END quicktour_save
-
-    Ok(())
-}
-
-#[allow(unused_imports, clippy::type_complexity)]
-fn quicktour_get_tokenizer_trainer() -> tokenizers::Result<(
-    TokenizerImpl<
-        BPE,
-        NormalizerWrapper,
-        PreTokenizerWrapper,
-        PostProcessorWrapper,
-        DecoderWrapper,
-    >,
-    BpeTrainer,
-)> {
     // START quicktour_init_tokenizer
     use tokenizers::models::bpe::BPE;
-    use tokenizers::TokenizerBuilder;
 
     let mut tokenizer: TokenizerImpl<
         BPE,
@@ -135,7 +91,35 @@ fn quicktour_get_tokenizer_trainer() -> tokenizers::Result<(
     tokenizer.with_pre_tokenizer(Whitespace::default());
     // END quicktour_init_pretok
 
-    Ok((tokenizer, trainer))
+    // START quicktour_train
+    let files = vec![
+        "data/wikitext-103-raw/wiki.train.raw".into(),
+        "data/wikitext-103-raw/wiki.test.raw".into(),
+        "data/wikitext-103-raw/wiki.valid.raw".into(),
+    ];
+    tokenizer.train_and_replace(&trainer, files)?;
+    // END quicktour_train
+    // START quicktour_reload_model
+    use std::path::Path;
+    use tokenizers::Model;
+
+    let saved_files = tokenizer
+        .get_model()
+        .save(&Path::new("data"), Some("wiki"))?;
+    tokenizer.with_model(
+        BPE::from_file(
+            saved_files[0].to_str().unwrap(),
+            &saved_files[1].to_str().unwrap(),
+        )
+        .unk_token("[UNK]".to_string())
+        .build()?,
+    );
+    // END quicktour_reload_model
+    // START quicktour_save
+    tokenizer.save("data/tokenizer-wiki.json", false)?;
+    // END quicktour_save
+
+    Ok(())
 }
 
 #[test]
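The train/save/reload block moved into `quicktour_slow_train` above has a close counterpart in the Python bindings. A hedged, self-contained sketch only, with the `wiki` prefix and file paths taken from the Rust code and the keyword arguments assumed from the Python API of that era:

```python
# Hedged Python sketch of the same train/save/reload round-trip (not part of the diff).
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import BpeTrainer

tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
tokenizer.pre_tokenizer = Whitespace()
trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])

files = [f"data/wikitext-103-raw/wiki.{split}.raw" for split in ["train", "test", "valid"]]
tokenizer.train(trainer, files)

# Save just the model (vocab + merges), then reload it with an unknown token,
# mirroring BPE::from_file(...).unk_token("[UNK]") in the Rust hunk.
saved_files = tokenizer.model.save("data", "wiki")
tokenizer.model = BPE.from_file(*saved_files, unk_token="[UNK]")

# Save the full tokenizer (model, normalizer, pre-tokenizer, ...) as a single JSON file.
tokenizer.save("data/tokenizer-wiki.json")
```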
@@ -386,7 +370,7 @@ fn pipeline() -> tokenizers::Result<()> {
 
 #[test]
 #[ignore]
-fn pipeline_bert() -> tokenizers::Result<()> {
+fn train_pipeline_bert() -> tokenizers::Result<()> {
     // START bert_setup_tokenizer
     use tokenizers::models::wordpiece::WordPiece;
     use tokenizers::Tokenizer;
@@ -438,10 +422,11 @@ fn pipeline_bert() -> tokenizers::Result<()> {
         ])
         .build()
         .into();
-    let files = ["test", "train", "valid"]
-        .iter()
-        .map(|split| format!("data/wikitext-103-raw/wiki.{}.raw", split))
-        .collect::<Vec<_>>();
+    let files = vec![
+        "data/wikitext-103-raw/wiki.train.raw".into(),
+        "data/wikitext-103-raw/wiki.test.raw".into(),
+        "data/wikitext-103-raw/wiki.valid.raw".into(),
+    ];
     bert_tokenizer.train_and_replace(&trainer, files)?;
 
     let model_files = bert_tokenizer
@@ -456,6 +441,13 @@ fn pipeline_bert() -> tokenizers::Result<()> {
 
     bert_tokenizer.save("data/bert-wiki.json", false)?;
     // END bert_train_tokenizer
+    Ok(())
+}
+
+#[test]
+fn pipeline_bert() -> tokenizers::Result<()> {
+    let mut bert_tokenizer = Tokenizer::from_file("data/bert-wiki.json")?;
+
     // START bert_test_decoding
     let output = bert_tokenizer.encode("Welcome to the 🤗 Tokenizers library.", true)?;
     println!("{:?}", output.get_tokens());
@@ -465,13 +457,22 @@ fn pipeline_bert() -> tokenizers::Result<()> {
     println!("{}", decoded);
     // "welcome to the tok ##eni ##zer ##s library ."
     // END bert_test_decoding
+    assert_eq!(
+        output.get_tokens(),
+        &[
+            "[CLS]", "welcome", "to", "the", "[UNK]", "tok", "##eni", "##zer", "##s", "library",
+            ".", "[SEP]"
+        ]
+    );
+    assert_eq!(decoded, "welcome to the tok ##eni ##zer ##s library .");
     // START bert_proper_decoding
     use tokenizers::decoders::wordpiece::WordPiece as WordPieceDecoder;
 
     bert_tokenizer.with_decoder(WordPieceDecoder::default());
     let decoded = bert_tokenizer.decode(output.get_ids().to_vec(), true)?;
     // "welcome to the tokenizers library."
     // END bert_proper_decoding
-    println!("{}", decoded);
+    assert_eq!(decoded, "welcome to the tokenizers library.");
+
     Ok(())
 }
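The assertions added in this last hunk pin down both decoding behaviours: raw wordpiece output and the cleaned-up form produced by a WordPiece decoder. A hedged Python counterpart, assuming `data/bert-wiki.json` exists (the Makefile rule added earlier in this commit fetches it):

```python
# Hedged Python counterpart of the decoding assertions above (not part of the diff).
from tokenizers import Tokenizer
from tokenizers.decoders import WordPiece as WordPieceDecoder

bert_tokenizer = Tokenizer.from_file("data/bert-wiki.json")
output = bert_tokenizer.encode("Welcome to the 🤗 Tokenizers library.")

# Without a decoder, the wordpiece continuation markers leak through:
print(bert_tokenizer.decode(output.ids))
# "welcome to the tok ##eni ##zer ##s library ."

# A WordPiece decoder merges the "##" pieces back into whole words:
bert_tokenizer.decoder = WordPieceDecoder()
print(bert_tokenizer.decode(output.ids))
# "welcome to the tokenizers library."
```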